[𝘀𝗽𝗿] changes introduced through rebaseupstream/users/Prabhuk/sprmain.callsiteinfocallgraphsection-extract-and-propagate-indirect-call-type-ids

Created using spr 1.3.6-beta.1 [skip ci]
author: Necip Fazil Yildiran <necip@google.com> 2024-05-01 15:41:58 -0700
committer: prabhukr <prabhukr@google.com> 2024-05-01 15:41:58 -0700
commit: c2f27b33d29096f69ebb3a6ab8db60ac71651865 (patch)
tree: 59fd099b1987744e0b00a0c57e46353cca3138c1
parent: 3065b9b275511f4bd1840187af7de8934078d9ed (diff)
parent: 56e4111f9da499072e5119ccf92693202430f894 (diff)
2340 files changed, 262506 insertions, 117941 deletions
diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge
index 81e9246de9b5..78a9cb77ff7d 100755
--- a/.ci/generate-buildkite-pipeline-premerge
+++ b/.ci/generate-buildkite-pipeline-premerge
@@ -91,7 +91,7 @@ function add-dependencies() {
     echo "${project}"
     case ${project} in
     bolt)
-      for p in lld llvm; do
+      for p in clang lld llvm; do
         echo $p
       done
     ;;
diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index b347c443da67..b00a4b984a1d 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -48,7 +48,6 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
       -D LLVM_LIT_ARGS="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --timeout=1200 --time-tests" \
       -D LLVM_ENABLE_LLD=ON \
       -D CMAKE_CXX_FLAGS=-gmlt \
-      -D BOLT_CLANG_EXE=/usr/bin/clang \
       -D LLVM_CCACHE_BUILD=ON \
       -D MLIR_ENABLE_BINDINGS_PYTHON=ON
 
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 45da8af51bb9..0f178df1d18f 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -35,6 +35,10 @@
 clang/lib/AST/Interp/ @tbaederr
 clang/test/AST/Interp/ @tbaederr
 
+/clang/include/clang/CIR @lanza @bcardosolopes
+/clang/lib/CIR @lanza @bcardosolopes
+/clang/tools/cir-* @lanza @bcardosolopes
+
 /lldb/ @JDevlieghere
 
 # MLIR Interfaces.
diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml
index 9cf64417d3cb..d608ea449f1d 100644
--- a/.github/new-prs-labeler.yml
+++ b/.github/new-prs-labeler.yml
@@ -1,3 +1,6 @@
+BOLT:
+  - bolt/**/*
+
 ClangIR:
   - clang/include/clang/CIR/**/*
   - clang/lib/CIR/**/*
@@ -467,6 +470,7 @@ backend:m68k:
 
 libc++:
   - libcxx/**
+  - .github/workflows/libcxx-*
 
 libc++abi:
   - libcxxabi/**
diff --git a/bolt/include/bolt/Passes/BinaryPasses.h b/bolt/include/bolt/Passes/BinaryPasses.h
index 8d89ef8b5484..5d7692559eda 100644
--- a/bolt/include/bolt/Passes/BinaryPasses.h
+++ b/bolt/include/bolt/Passes/BinaryPasses.h
@@ -400,8 +400,7 @@ public:
 /// dyno stats categories.
 class PrintProgramStats : public BinaryFunctionPass {
 public:
-  explicit PrintProgramStats(const cl::opt<bool> &PrintPass)
-      : BinaryFunctionPass(PrintPass) {}
+  explicit PrintProgramStats() : BinaryFunctionPass(false) {}
 
   const char *getName() const override { return "print-stats"; }
   bool shouldPrint(const BinaryFunction &) const override { return false; }
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 84f76caae9db..f2fa59bcaa1a 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -198,14 +198,8 @@ private:
   /// A trace is region of code executed between two LBR entries supplied in
   /// execution order.
   ///
-  /// Return true if the trace is valid, false otherwise.
-  bool
-  recordTrace(BinaryFunction &BF, const LBREntry &First, const LBREntry &Second,
-              uint64_t Count,
-              SmallVector<std::pair<uint64_t, uint64_t>, 16> &Branches) const;
-
   /// Return a vector of offsets corresponding to a trace in a function
-  /// (see recordTrace() above).
+  /// if the trace is valid, std::nullopt otherwise.
   std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
   getFallthroughsInTrace(BinaryFunction &BF, const LBREntry &First,
                          const LBREntry &Second, uint64_t Count = 1) const;
diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h
index af832b4c7c84..41a92e7ba01e 100644
--- a/bolt/include/bolt/Rewrite/RewriteInstance.h
+++ b/bolt/include/bolt/Rewrite/RewriteInstance.h
@@ -422,10 +422,18 @@ private:
   /// Section name used for extra BOLT code in addition to .text.
   static StringRef getBOLTTextSectionName() { return ".bolt.text"; }
 
+  /// Symbol markers for BOLT reserved area.
+  static StringRef getBOLTReservedStart() { return "__bolt_reserved_start"; }
+  static StringRef getBOLTReservedEnd() { return "__bolt_reserved_end"; }
+
   /// Common section names.
   static StringRef getEHFrameSectionName() { return ".eh_frame"; }
+  static StringRef getEHFrameHdrSectionName() { return ".eh_frame_hdr"; }
   static StringRef getRelaDynSectionName() { return ".rela.dyn"; }
 
+  /// FILE symbol name used for local fragments of global functions.
+  static StringRef getBOLTFileSymbolName() { return "bolt-pseudo.o"; }
+
   /// An instance of the input binary we are processing, externally owned.
   llvm::object::ELFObjectFileBase *InputFile;
 
@@ -490,6 +498,9 @@ private:
   /// Store all non-zero symbols in this map for a quick address lookup.
   std::map<uint64_t, llvm::object::SymbolRef> FileSymRefs;
 
+  /// FILE symbols used for disambiguating split function parents.
+  std::vector<ELFSymbolRef> FileSymbols;
+
   std::unique_ptr<DWARFRewriter> DebugInfoRewriter;
 
   std::unique_ptr<BoltAddressTranslation> BAT;
diff --git a/bolt/include/bolt/Utils/NameResolver.h b/bolt/include/bolt/Utils/NameResolver.h
index 2e3ac20a532d..ccffa5633245 100644
--- a/bolt/include/bolt/Utils/NameResolver.h
+++ b/bolt/include/bolt/Utils/NameResolver.h
@@ -28,10 +28,23 @@ class NameResolver {
   static constexpr char Sep = '/';
 
 public:
-  /// Return unique version of the \p Name in the form "Name<Sep><Number>".
+  /// Return the number of uniquified versions of a given \p Name.
+  uint64_t getUniquifiedNameCount(StringRef Name) const {
+    if (Counters.contains(Name))
+      return Counters.at(Name);
+    return 0;
+  }
+
+  /// Return unique version of the \p Name in the form "Name<Sep><ID>".
+  std::string getUniqueName(StringRef Name, const uint64_t ID) const {
+    return (Name + Twine(Sep) + Twine(ID)).str();
+  }
+
+  /// Register new version of \p Name and return unique version in the form
+  /// "Name<Sep><Number>".
   std::string uniquify(StringRef Name) {
     const uint64_t ID = ++Counters[Name];
-    return (Name + Twine(Sep) + Twine(ID)).str();
+    return getUniqueName(Name, ID);
   }
 
   /// For uniquified \p Name, return the original form (that may no longer be
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 0b2a4e86561f..5108392c824c 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -14,6 +14,7 @@
 #include "bolt/Profile/DataAggregator.h"
 #include "bolt/Core/BinaryContext.h"
 #include "bolt/Core/BinaryFunction.h"
+#include "bolt/Passes/BinaryPasses.h"
 #include "bolt/Profile/BoltAddressTranslation.h"
 #include "bolt/Profile/Heatmap.h"
 #include "bolt/Profile/YAMLProfileWriter.h"
@@ -611,6 +612,7 @@ Error DataAggregator::readProfile(BinaryContext &BC) {
         if (std::error_code EC = writeBATYAML(BC, opts::SaveProfile))
           report_error("cannot create output data file", EC);
     }
+    BC.logBOLTErrorsAndQuitOnFatal(PrintProgramStats().runOnFunctions(BC));
   }
 
   return Error::success();
@@ -859,14 +861,17 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
   return true;
 }
 
-bool DataAggregator::recordTrace(
-    BinaryFunction &BF, const LBREntry &FirstLBR, const LBREntry &SecondLBR,
-    uint64_t Count,
-    SmallVector<std::pair<uint64_t, uint64_t>, 16> &Branches) const {
+std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
+DataAggregator::getFallthroughsInTrace(BinaryFunction &BF,
+                                       const LBREntry &FirstLBR,
+                                       const LBREntry &SecondLBR,
+                                       uint64_t Count) const {
+  SmallVector<std::pair<uint64_t, uint64_t>, 16> Branches;
+
   BinaryContext &BC = BF.getBinaryContext();
 
   if (!BF.isSimple())
-    return false;
+    return std::nullopt;
 
   assert(BF.hasCFG() && "can only record traces in CFG state");
 
@@ -875,13 +880,13 @@ bool DataAggregator::recordTrace(
   const uint64_t To = SecondLBR.From - BF.getAddress();
 
   if (From > To)
-    return false;
+    return std::nullopt;
 
   const BinaryBasicBlock *FromBB = BF.getBasicBlockContainingOffset(From);
   const BinaryBasicBlock *ToBB = BF.getBasicBlockContainingOffset(To);
 
   if (!FromBB || !ToBB)
-    return false;
+    return std::nullopt;
 
   // Adjust FromBB if the first LBR is a return from the last instruction in
   // the previous block (that instruction should be a call).
@@ -905,7 +910,7 @@ bool DataAggregator::recordTrace(
   // within the same basic block, e.g. when two call instructions are in the
   // same block. In this case we skip the processing.
   if (FromBB == ToBB)
-    return true;
+    return Branches;
 
   // Process blocks in the original layout order.
   BinaryBasicBlock *BB = BF.getLayout().getBlock(FromBB->getIndex());
@@ -919,7 +924,7 @@ bool DataAggregator::recordTrace(
       LLVM_DEBUG(dbgs() << "no fall-through for the trace:\n"
                         << "  " << FirstLBR << '\n'
                         << "  " << SecondLBR << '\n');
-      return false;
+      return std::nullopt;
     }
 
     const MCInst *Instr = BB->getLastNonPseudoInstr();
@@ -943,20 +948,7 @@ bool DataAggregator::recordTrace(
     BI.Count += Count;
   }
 
-  return true;
-}
-
-std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
-DataAggregator::getFallthroughsInTrace(BinaryFunction &BF,
-                                       const LBREntry &FirstLBR,
-                                       const LBREntry &SecondLBR,
-                                       uint64_t Count) const {
-  SmallVector<std::pair<uint64_t, uint64_t>, 16> Res;
-
-  if (!recordTrace(BF, FirstLBR, SecondLBR, Count, Res))
-    return std::nullopt;
-
-  return Res;
+  return Branches;
 }
 
 bool DataAggregator::recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred,
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index be4888ccfa56..cbb7199a53dd 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -356,7 +356,7 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
   // order they're registered.
 
   // Run this pass first to use stats for the original functions.
-  Manager.registerPass(std::make_unique<PrintProgramStats>(NeverPrint));
+  Manager.registerPass(std::make_unique<PrintProgramStats>());
 
   if (opts::PrintProfileStats)
     Manager.registerPass(std::make_unique<PrintProfileStats>(NeverPrint));
diff --git a/bolt/lib/Rewrite/BoltDiff.cpp b/bolt/lib/Rewrite/BoltDiff.cpp
index fa43b7a2f92c..74b5ca18abce 100644
--- a/bolt/lib/Rewrite/BoltDiff.cpp
+++ b/bolt/lib/Rewrite/BoltDiff.cpp
@@ -292,7 +292,7 @@ class RewriteInstanceDiff {
         }
       }
     }
-    PrintProgramStats PPS(opts::NeverPrint);
+    PrintProgramStats PPS;
     outs() << "* BOLT-DIFF: Starting print program stats pass for binary 1\n";
     RI1.BC->logBOLTErrorsAndQuitOnFatal(PPS.runOnFunctions(*RI1.BC));
     outs() << "* BOLT-DIFF: Starting print program stats pass for binary 2\n";
diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
index d96199e020d3..17077b4fa248 100644
--- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
+++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
@@ -248,6 +248,9 @@ class LinuxKernelRewriter final : public MetadataRewriter {
   /// Update ORC data in the binary.
   Error rewriteORCTables();
 
+  /// Validate written ORC tables after binary emission.
+  Error validateORCTables();
+
   /// Static call table handling.
   Error readStaticCalls();
   Error rewriteStaticCalls();
@@ -358,6 +361,9 @@ public:
     if (Error E = updateStaticKeysJumpTablePostEmit())
       return E;
 
+    if (Error E = validateORCTables())
+      return E;
+
     return Error::success();
   }
 };
@@ -837,6 +843,32 @@ Error LinuxKernelRewriter::rewriteORCTables() {
   return Error::success();
 }
 
+Error LinuxKernelRewriter::validateORCTables() {
+  if (!ORCUnwindIPSection)
+    return Error::success();
+
+  const uint64_t IPSectionAddress = ORCUnwindIPSection->getAddress();
+  DataExtractor IPDE = DataExtractor(ORCUnwindIPSection->getOutputContents(),
+                                     BC.AsmInfo->isLittleEndian(),
+                                     BC.AsmInfo->getCodePointerSize());
+  DataExtractor::Cursor IPCursor(0);
+  uint64_t PrevIP = 0;
+  for (uint32_t Index = 0; Index < NumORCEntries; ++Index) {
+    const uint64_t IP =
+        IPSectionAddress + IPCursor.tell() + (int32_t)IPDE.getU32(IPCursor);
+    if (!IPCursor)
+      return createStringError(errc::executable_format_error,
+                               "out of bounds while reading ORC IP table: %s",
+                               toString(IPCursor.takeError()).c_str());
+
+    assert(IP >= PrevIP && "Unsorted ORC table detected");
+    (void)PrevIP;
+    PrevIP = IP;
+  }
+
+  return Error::success();
+}
+
 /// The static call site table is created by objtool and contains entries in the
 /// following format:
 ///
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 4e0096cf988a..23f79e3c135a 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -840,6 +840,7 @@ void RewriteInstance::discoverFileObjects() {
       continue;
 
     if (cantFail(Symbol.getType()) == SymbolRef::ST_File) {
+      FileSymbols.emplace_back(Symbol);
       StringRef Name =
           cantFail(std::move(NameOrError), "cannot get symbol name for file");
       // Ignore Clang LTO artificial FILE symbol as it is not always generated,
@@ -1062,6 +1063,11 @@ void RewriteInstance::discoverFileObjects() {
       continue;
     }
 
+    if (SymName == getBOLTReservedStart() || SymName == getBOLTReservedEnd()) {
+      registerName(SymbolSize);
+      continue;
+    }
+
     LLVM_DEBUG(dbgs() << "BOLT-DEBUG: considering symbol " << UniqueName
                       << " for function\n");
 
@@ -1340,6 +1346,7 @@ void RewriteInstance::discoverFileObjects() {
   }
 
   registerFragments();
+  FileSymbols.clear();
 }
 
 Error RewriteInstance::discoverRtFiniAddress() {
@@ -1417,50 +1424,134 @@ void RewriteInstance::registerFragments() {
   if (!BC->HasSplitFunctions)
     return;
 
+  // Process fragments with ambiguous parents separately as they are typically a
+  // vanishing minority of cases and require expensive symbol table lookups.
+  std::vector<std::pair<StringRef, BinaryFunction *>> AmbiguousFragments;
   for (auto &BFI : BC->getBinaryFunctions()) {
     BinaryFunction &Function = BFI.second;
     if (!Function.isFragment())
       continue;
-    unsigned ParentsFound = 0;
     for (StringRef Name : Function.getNames()) {
-      StringRef BaseName, Suffix;
-      std::tie(BaseName, Suffix) = Name.split('/');
+      StringRef BaseName = NR.restore(Name);
+      const bool IsGlobal = BaseName == Name;
       const size_t ColdSuffixPos = BaseName.find(".cold");
       if (ColdSuffixPos == StringRef::npos)
         continue;
-      // For cold function with local (foo.cold/1) symbol, prefer a parent with
-      // local symbol as well (foo/1) over global symbol (foo).
-      std::string ParentName = BaseName.substr(0, ColdSuffixPos).str();
+      StringRef ParentName = BaseName.substr(0, ColdSuffixPos);
       const BinaryData *BD = BC->getBinaryDataByName(ParentName);
-      if (Suffix != "") {
-        ParentName.append(Twine("/", Suffix).str());
-        const BinaryData *BDLocal = BC->getBinaryDataByName(ParentName);
-        if (BDLocal || !BD)
-          BD = BDLocal;
-      }
-      if (!BD) {
-        if (opts::Verbosity >= 1)
-          BC->outs() << "BOLT-INFO: parent function not found for " << Name
-                     << "\n";
+      const uint64_t NumPossibleLocalParents =
+          NR.getUniquifiedNameCount(ParentName);
+      // The most common case: single local parent fragment.
+      if (!BD && NumPossibleLocalParents == 1) {
+        BD = BC->getBinaryDataByName(NR.getUniqueName(ParentName, 1));
+      } else if (BD && (!NumPossibleLocalParents || IsGlobal)) {
+        // Global parent and either no local candidates (second most common), or
+        // the fragment is global as well (uncommon).
+      } else {
+        // Any other case: need to disambiguate using FILE symbols.
+        AmbiguousFragments.emplace_back(ParentName, &Function);
         continue;
       }
-      const uint64_t Address = BD->getAddress();
-      BinaryFunction *BF = BC->getBinaryFunctionAtAddress(Address);
-      if (!BF) {
-        if (opts::Verbosity >= 1)
-          BC->outs() << formatv(
-              "BOLT-INFO: parent function not found at {0:x}\n", Address);
-        continue;
+      if (BD) {
+        BinaryFunction *BF = BC->getFunctionForSymbol(BD->getSymbol());
+        if (BF) {
+          BC->registerFragment(Function, *BF);
+          continue;
+        }
       }
-      BC->registerFragment(Function, *BF);
-      ++ParentsFound;
-    }
-    if (!ParentsFound) {
       BC->errs() << "BOLT-ERROR: parent function not found for " << Function
                  << '\n';
       exit(1);
     }
   }
+
+  if (AmbiguousFragments.empty())
+    return;
+
+  if (!BC->hasSymbolsWithFileName()) {
+    BC->errs() << "BOLT-ERROR: input file has split functions but does not "
+                  "have FILE symbols. If the binary was stripped, preserve "
+                  "FILE symbols with --keep-file-symbols strip option";
+    exit(1);
+  }
+
+  // The first global symbol is identified by the symbol table sh_info value.
+  // Used as local symbol search stopping point.
+  auto *ELF64LEFile = cast<ELF64LEObjectFile>(InputFile);
+  const ELFFile<ELF64LE> &Obj = ELF64LEFile->getELFFile();
+  auto *SymTab = llvm::find_if(cantFail(Obj.sections()), [](const auto &Sec) {
+    return Sec.sh_type == ELF::SHT_SYMTAB;
+  });
+  assert(SymTab);
+  // Symtab sh_info contains the value one greater than the symbol table index
+  // of the last local symbol.
+  ELFSymbolRef LocalSymEnd = ELF64LEFile->toSymbolRef(SymTab, SymTab->sh_info);
+
+  for (auto &[ParentName, BF] : AmbiguousFragments) {
+    const uint64_t Address = BF->getAddress();
+
+    // Get fragment's own symbol
+    const auto SymIt = FileSymRefs.find(Address);
+    if (SymIt == FileSymRefs.end()) {
+      BC->errs()
+          << "BOLT-ERROR: symbol lookup failed for function at address 0x"
+          << Twine::utohexstr(Address) << '\n';
+      exit(1);
+    }
+
+    // Find containing FILE symbol
+    ELFSymbolRef Symbol = SymIt->second;
+    auto FSI = llvm::upper_bound(FileSymbols, Symbol);
+    if (FSI == FileSymbols.begin()) {
+      BC->errs() << "BOLT-ERROR: owning FILE symbol not found for symbol "
+                 << cantFail(Symbol.getName()) << '\n';
+      exit(1);
+    }
+
+    ELFSymbolRef StopSymbol = LocalSymEnd;
+    if (FSI != FileSymbols.end())
+      StopSymbol = *FSI;
+
+    uint64_t ParentAddress{0};
+
+    // BOLT split fragment symbols are emitted just before the main function
+    // symbol.
+    for (ELFSymbolRef NextSymbol = Symbol; NextSymbol < StopSymbol;
+         NextSymbol.moveNext()) {
+      StringRef Name = cantFail(NextSymbol.getName());
+      if (Name == ParentName) {
+        ParentAddress = cantFail(NextSymbol.getValue());
+        goto registerParent;
+      }
+      if (Name.starts_with(ParentName))
+        // With multi-way splitting, there are multiple fragments with different
+        // suffixes. Parent follows the last fragment.
+        continue;
+      break;
+    }
+
+    // Iterate over local file symbols and check symbol names to match parent.
+    for (ELFSymbolRef Symbol(FSI[-1]); Symbol < StopSymbol; Symbol.moveNext()) {
+      if (cantFail(Symbol.getName()) == ParentName) {
+        ParentAddress = cantFail(Symbol.getAddress());
+        break;
+      }
+    }
+
+registerParent:
+    // No local parent is found, use global parent function.
+    if (!ParentAddress)
+      if (BinaryData *ParentBD = BC->getBinaryDataByName(ParentName))
+        ParentAddress = ParentBD->getAddress();
+
+    if (BinaryFunction *ParentBF =
+            BC->getBinaryFunctionAtAddress(ParentAddress)) {
+      BC->registerFragment(*BF, *ParentBF);
+      continue;
+    }
+    BC->errs() << "BOLT-ERROR: parent function not found for " << *BF << '\n';
+    exit(1);
+  }
 }
 
 void RewriteInstance::createPLTBinaryFunction(uint64_t TargetAddress,
@@ -1725,12 +1816,6 @@ void RewriteInstance::adjustFunctionBoundaries() {
       if (!Function.isSymbolValidInScope(Symbol, SymbolSize))
         break;
 
-      // Ignore unnamed symbols. Used, for example, by debugging info on RISC-V.
-      if (BC->isRISCV() && cantFail(Symbol.getName()).empty()) {
-        ++NextSymRefI;
-        continue;
-      }
-
       // Skip basic block labels. This happens on RISC-V with linker relaxation
       // enabled because every branch needs a relocation and corresponding
       // symbol. We don't want to add such symbols as entry points.
@@ -3532,6 +3617,26 @@ void RewriteInstance::updateMetadata() {
 void RewriteInstance::mapFileSections(BOLTLinker::SectionMapper MapSection) {
   BC->deregisterUnusedSections();
 
+  // Check if the input has a space reserved for BOLT.
+  BinaryData *StartBD = BC->getBinaryDataByName(getBOLTReservedStart());
+  BinaryData *EndBD = BC->getBinaryDataByName(getBOLTReservedEnd());
+  if (!StartBD != !EndBD) {
+    BC->errs() << "BOLT-ERROR: one of the symbols is missing from the binary: "
+               << getBOLTReservedStart() << ", " << getBOLTReservedEnd()
+               << '\n';
+    exit(1);
+  }
+
+  if (StartBD) {
+    PHDRTableOffset = 0;
+    PHDRTableAddress = 0;
+    NewTextSegmentAddress = 0;
+    NewTextSegmentOffset = 0;
+    NextAvailableAddress = StartBD->getAddress();
+    BC->outs()
+        << "BOLT-INFO: using reserved space for allocating new sections\n";
+  }
+
   // If no new .eh_frame was written, remove relocated original .eh_frame.
   BinarySection *RelocatedEHFrameSection =
       getSection(".relocated" + getEHFrameSectionName());
@@ -3551,6 +3656,18 @@ void RewriteInstance::mapFileSections(BOLTLinker::SectionMapper MapSection) {
 
   // Map the rest of the sections.
   mapAllocatableSections(MapSection);
+
+  if (StartBD) {
+    const uint64_t ReservedSpace = EndBD->getAddress() - StartBD->getAddress();
+    const uint64_t AllocatedSize = NextAvailableAddress - StartBD->getAddress();
+    if (ReservedSpace < AllocatedSize) {
+      BC->errs() << "BOLT-ERROR: reserved space (" << ReservedSpace << " byte"
+                 << (ReservedSpace == 1 ? "" : "s")
+                 << ") is smaller than required for new allocations ("
+                 << AllocatedSize << " bytes)\n";
+      exit(1);
+    }
+  }
 }
 
 std::vector<BinarySection *> RewriteInstance::getCodeSections() {
@@ -3792,7 +3909,7 @@ void RewriteInstance::mapCodeSections(BOLTLinker::SectionMapper MapSection) {
   // Add the new text section aggregating all existing code sections.
   // This is pseudo-section that serves a purpose of creating a corresponding
   // entry in section header table.
-  int64_t NewTextSectionSize =
+  const uint64_t NewTextSectionSize =
       NextAvailableAddress - NewTextSectionStartAddress;
   if (NewTextSectionSize) {
     const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/true,
@@ -3875,7 +3992,7 @@ void RewriteInstance::mapAllocatableSections(
       if (PHDRTableAddress) {
         // Segment size includes the size of the PHDR area.
         NewTextSegmentSize = NextAvailableAddress - PHDRTableAddress;
-      } else {
+      } else if (NewTextSegmentAddress) {
         // Existing PHDR table would be updated.
         NewTextSegmentSize = NextAvailableAddress - NewTextSegmentAddress;
       }
@@ -3914,7 +4031,7 @@ void RewriteInstance::patchELFPHDRTable() {
     assert(!PHDRTableAddress && "unexpected address for program header table");
     PHDRTableOffset = Obj.getHeader().e_phoff;
     if (NewWritableSegmentSize) {
-      BC->errs() << "Unable to add writable segment with UseGnuStack option\n";
+      BC->errs() << "BOLT-ERROR: unable to add writable segment\n";
       exit(1);
     }
   }
@@ -3924,19 +4041,15 @@ void RewriteInstance::patchELFPHDRTable() {
   if (!NewWritableSegmentSize) {
     if (PHDRTableAddress)
       NewTextSegmentSize = NextAvailableAddress - PHDRTableAddress;
-    else
+    else if (NewTextSegmentAddress)
       NewTextSegmentSize = NextAvailableAddress - NewTextSegmentAddress;
   } else {
     NewWritableSegmentSize = NextAvailableAddress - NewWritableSegmentAddress;
   }
 
+  const uint64_t SavedPos = OS.tell();
   OS.seek(PHDRTableOffset);
 
-  bool ModdedGnuStack = false;
-  (void)ModdedGnuStack;
-  bool AddedSegment = false;
-  (void)AddedSegment;
-
   auto createNewTextPhdr = [&]() {
     ELF64LEPhdrTy NewPhdr;
     NewPhdr.p_type = ELF::PT_LOAD;
@@ -3952,40 +4065,55 @@ void RewriteInstance::patchELFPHDRTable() {
     NewPhdr.p_filesz = NewTextSegmentSize;
     NewPhdr.p_memsz = NewTextSegmentSize;
     NewPhdr.p_flags = ELF::PF_X | ELF::PF_R;
-    // FIXME: Currently instrumentation is experimental and the runtime data
-    // is emitted with code, thus everything needs to be writable
-    if (opts::Instrument)
+    if (opts::Instrument) {
+      // FIXME: Currently instrumentation is experimental and the runtime data
+      // is emitted with code, thus everything needs to be writable.
       NewPhdr.p_flags |= ELF::PF_W;
+    }
     NewPhdr.p_align = BC->PageAlign;
 
     return NewPhdr;
   };
 
-  auto createNewWritableSectionsPhdr = [&]() {
-    ELF64LEPhdrTy NewPhdr;
-    NewPhdr.p_type = ELF::PT_LOAD;
-    NewPhdr.p_offset = getFileOffsetForAddress(NewWritableSegmentAddress);
-    NewPhdr.p_vaddr = NewWritableSegmentAddress;
-    NewPhdr.p_paddr = NewWritableSegmentAddress;
-    NewPhdr.p_filesz = NewWritableSegmentSize;
-    NewPhdr.p_memsz = NewWritableSegmentSize;
-    NewPhdr.p_align = BC->RegularPageSize;
-    NewPhdr.p_flags = ELF::PF_R | ELF::PF_W;
-    return NewPhdr;
+  auto writeNewSegmentPhdrs = [&]() {
+    if (PHDRTableAddress || NewTextSegmentSize) {
+      ELF64LE::Phdr NewPhdr = createNewTextPhdr();
+      OS.write(reinterpret_cast<const char *>(&NewPhdr), sizeof(NewPhdr));
+    }
+
+    if (NewWritableSegmentSize) {
+      ELF64LEPhdrTy NewPhdr;
+      NewPhdr.p_type = ELF::PT_LOAD;
+      NewPhdr.p_offset = getFileOffsetForAddress(NewWritableSegmentAddress);
+      NewPhdr.p_vaddr = NewWritableSegmentAddress;
+      NewPhdr.p_paddr = NewWritableSegmentAddress;
+      NewPhdr.p_filesz = NewWritableSegmentSize;
+      NewPhdr.p_memsz = NewWritableSegmentSize;
+      NewPhdr.p_align = BC->RegularPageSize;
+      NewPhdr.p_flags = ELF::PF_R | ELF::PF_W;
+      OS.write(reinterpret_cast<const char *>(&NewPhdr), sizeof(NewPhdr));
+    }
   };
 
+  bool ModdedGnuStack = false;
+  bool AddedSegment = false;
+
   // Copy existing program headers with modifications.
   for (const ELF64LE::Phdr &Phdr : cantFail(Obj.program_headers())) {
     ELF64LE::Phdr NewPhdr = Phdr;
-    if (PHDRTableAddress && Phdr.p_type == ELF::PT_PHDR) {
-      NewPhdr.p_offset = PHDRTableOffset;
-      NewPhdr.p_vaddr = PHDRTableAddress;
-      NewPhdr.p_paddr = PHDRTableAddress;
-      NewPhdr.p_filesz = sizeof(NewPhdr) * Phnum;
-      NewPhdr.p_memsz = sizeof(NewPhdr) * Phnum;
-    } else if (Phdr.p_type == ELF::PT_GNU_EH_FRAME) {
-      ErrorOr<BinarySection &> EHFrameHdrSec =
-          BC->getUniqueSectionByName(getNewSecPrefix() + ".eh_frame_hdr");
+    switch (Phdr.p_type) {
+    case ELF::PT_PHDR:
+      if (PHDRTableAddress) {
+        NewPhdr.p_offset = PHDRTableOffset;
+        NewPhdr.p_vaddr = PHDRTableAddress;
+        NewPhdr.p_paddr = PHDRTableAddress;
+        NewPhdr.p_filesz = sizeof(NewPhdr) * Phnum;
+        NewPhdr.p_memsz = sizeof(NewPhdr) * Phnum;
+      }
+      break;
+    case ELF::PT_GNU_EH_FRAME: {
+      ErrorOr<BinarySection &> EHFrameHdrSec = BC->getUniqueSectionByName(
+          getNewSecPrefix() + getEHFrameHdrSectionName());
       if (EHFrameHdrSec && EHFrameHdrSec->isAllocatable() &&
           EHFrameHdrSec->isFinalized()) {
         NewPhdr.p_offset = EHFrameHdrSec->getOutputFileOffset();
@@ -3994,37 +4122,38 @@ void RewriteInstance::patchELFPHDRTable() {
         NewPhdr.p_filesz = EHFrameHdrSec->getOutputSize();
         NewPhdr.p_memsz = EHFrameHdrSec->getOutputSize();
       }
-    } else if (opts::UseGnuStack && Phdr.p_type == ELF::PT_GNU_STACK) {
-      NewPhdr = createNewTextPhdr();
-      ModdedGnuStack = true;
-    } else if (!opts::UseGnuStack && Phdr.p_type == ELF::PT_DYNAMIC) {
-      // Insert the new header before DYNAMIC.
-      ELF64LE::Phdr NewTextPhdr = createNewTextPhdr();
-      OS.write(reinterpret_cast<const char *>(&NewTextPhdr),
-               sizeof(NewTextPhdr));
-      if (NewWritableSegmentSize) {
-        ELF64LEPhdrTy NewWritablePhdr = createNewWritableSectionsPhdr();
-        OS.write(reinterpret_cast<const char *>(&NewWritablePhdr),
-                 sizeof(NewWritablePhdr));
+      break;
+    }
+    case ELF::PT_GNU_STACK:
+      if (opts::UseGnuStack) {
+        // Overwrite the header with the new text segment header.
+        NewPhdr = createNewTextPhdr();
+        ModdedGnuStack = true;
       }
-      AddedSegment = true;
+      break;
+    case ELF::PT_DYNAMIC:
+      if (!opts::UseGnuStack) {
+        // Insert new headers before DYNAMIC.
+        writeNewSegmentPhdrs();
+        AddedSegment = true;
+      }
+      break;
     }
     OS.write(reinterpret_cast<const char *>(&NewPhdr), sizeof(NewPhdr));
   }
 
   if (!opts::UseGnuStack && !AddedSegment) {
-    // Append the new header to the end of the table.
-    ELF64LE::Phdr NewTextPhdr = createNewTextPhdr();
-    OS.write(reinterpret_cast<const char *>(&NewTextPhdr), sizeof(NewTextPhdr));
-    if (NewWritableSegmentSize) {
-      ELF64LEPhdrTy NewWritablePhdr = createNewWritableSectionsPhdr();
-      OS.write(reinterpret_cast<const char *>(&NewWritablePhdr),
-               sizeof(NewWritablePhdr));
-    }
+    // Append new headers to the end of the table.
+    writeNewSegmentPhdrs();
+  }
+
+  if (opts::UseGnuStack && !ModdedGnuStack) {
+    BC->errs()
+        << "BOLT-ERROR: could not find PT_GNU_STACK program header to modify\n";
+    exit(1);
   }
 
-  assert((!opts::UseGnuStack || ModdedGnuStack) &&
-         "could not find GNU_STACK program header to modify");
+  OS.seek(SavedPos);
 }
 
 namespace {
@@ -4050,9 +4179,8 @@ void RewriteInstance::rewriteNoteSections() {
   const ELFFile<ELF64LE> &Obj = ELF64LEFile->getELFFile();
   raw_fd_ostream &OS = Out->os();
 
-  uint64_t NextAvailableOffset = getFileOffsetForAddress(NextAvailableAddress);
-  assert(NextAvailableOffset >= FirstNonAllocatableOffset &&
-         "next available offset calculation failure");
+  uint64_t NextAvailableOffset = std::max(
+      getFileOffsetForAddress(NextAvailableAddress), FirstNonAllocatableOffset);
   OS.seek(NextAvailableOffset);
 
   // Copy over non-allocatable section contents and update file offsets.
@@ -4493,6 +4621,8 @@ void RewriteInstance::updateELFSymbolTable(
   // Symbols for the new symbol table.
   std::vector<ELFSymTy> Symbols;
 
+  bool EmittedColdFileSymbol = false;
+
   auto getNewSectionIndex = [&](uint32_t OldIndex) {
     // For dynamic symbol table, the section index could be wrong on the input,
     // and its value is ignored by the runtime if it's different from
@@ -4551,6 +4681,20 @@ void RewriteInstance::updateELFSymbolTable(
       Symbols.emplace_back(ICFSymbol);
     }
     if (Function.isSplit()) {
+      // Prepend synthetic FILE symbol to prevent local cold fragments from
+      // colliding with existing symbols with the same name.
+      if (!EmittedColdFileSymbol &&
+          FunctionSymbol.getBinding() == ELF::STB_GLOBAL) {
+        ELFSymTy FileSymbol;
+        FileSymbol.st_shndx = ELF::SHN_ABS;
+        FileSymbol.st_name = AddToStrTab(getBOLTFileSymbolName());
+        FileSymbol.st_value = 0;
+        FileSymbol.st_size = 0;
+        FileSymbol.st_other = 0;
+        FileSymbol.setBindingAndType(ELF::STB_LOCAL, ELF::STT_FILE);
+        Symbols.emplace_back(FileSymbol);
+        EmittedColdFileSymbol = true;
+      }
       for (const FunctionFragment &FF :
            Function.getLayout().getSplitFragments()) {
         if (FF.getAddress()) {
@@ -4775,7 +4919,7 @@ void RewriteInstance::updateELFSymbolTable(
       ++NumHotDataSymsUpdated;
     }
 
-    if (*SymbolName == "_end")
+    if (*SymbolName == "_end" && NextAvailableAddress > Symbol.st_value)
       updateSymbolValue(*SymbolName, NextAvailableAddress);
 
     if (IsDynSym)
@@ -4889,13 +5033,6 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile<ELFT> *File) {
   std::vector<uint32_t> NewSectionIndex;
   getOutputSections(File, NewSectionIndex);
 
-  // Set pointer at the end of the output file, so we can pwrite old symbol
-  // tables if we need to.
-  uint64_t NextAvailableOffset = getFileOffsetForAddress(NextAvailableAddress);
-  assert(NextAvailableOffset >= FirstNonAllocatableOffset &&
-         "next available offset calculation failure");
-  Out->os().seek(NextAvailableOffset);
-
   // Update dynamic symbol table.
   const ELFShdrTy *DynSymSection = nullptr;
   for (const ELFShdrTy &Section : cantFail(Obj.sections())) {
@@ -5432,6 +5569,17 @@ uint64_t RewriteInstance::getNewFunctionOrDataAddress(uint64_t OldAddress) {
   if (BD && BD->isMoved())
     return BD->getOutputAddress();
 
+  if (const BinaryFunction *BF =
+          BC->getBinaryFunctionContainingAddress(OldAddress)) {
+    if (BF->isEmitted()) {
+      BC->errs() << "BOLT-ERROR: unable to get new address corresponding to "
+                    "input address 0x"
+                 << Twine::utohexstr(OldAddress) << " in function " << *BF
+                 << ". Consider adding this function to --skip-funcs=...\n";
+      exit(1);
+    }
+  }
+
   return 0;
 }
 
@@ -5449,10 +5597,10 @@ void RewriteInstance::rewriteFile() {
   auto Streamer = BC->createStreamer(OS);
   // Make sure output stream has enough reserved space, otherwise
   // pwrite() will fail.
-  uint64_t Offset = OS.seek(getFileOffsetForAddress(NextAvailableAddress));
-  (void)Offset;
-  assert(Offset == getFileOffsetForAddress(NextAvailableAddress) &&
-         "error resizing output file");
+  uint64_t Offset = std::max(getFileOffsetForAddress(NextAvailableAddress),
+                             FirstNonAllocatableOffset);
+  Offset = OS.seek(Offset);
+  assert((Offset != (uint64_t)-1) && "Error resizing output file");
 
   // Overwrite functions with fixed output address. This is mostly used by
   // non-relocation mode, with one exception: injected functions are covered
@@ -5671,7 +5819,8 @@ void RewriteInstance::writeEHFrameHeader() {
       BC->AsmInfo->getCodePointerSize()));
   check_error(std::move(Er), "failed to parse EH frame");
 
-  LLVM_DEBUG(dbgs() << "BOLT: writing a new .eh_frame_hdr\n");
+  LLVM_DEBUG(dbgs() << "BOLT: writing a new " << getEHFrameHdrSectionName()
+                    << '\n');
 
   NextAvailableAddress =
       appendPadding(Out->os(), NextAvailableAddress, EHFrameHdrAlign);
@@ -5683,25 +5832,35 @@ void RewriteInstance::writeEHFrameHeader() {
   std::vector<char> NewEHFrameHdr = CFIRdWrt->generateEHFrameHeader(
       RelocatedEHFrame, NewEHFrame, EHFrameHdrOutputAddress, FailedAddresses);
 
-  assert(Out->os().tell() == EHFrameHdrFileOffset && "offset mismatch");
+  Out->os().seek(EHFrameHdrFileOffset);
   Out->os().write(NewEHFrameHdr.data(), NewEHFrameHdr.size());
 
   const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/true,
                                                  /*IsText=*/false,
                                                  /*IsAllocatable=*/true);
-  BinarySection *OldEHFrameHdrSection = getSection(".eh_frame_hdr");
+  BinarySection *OldEHFrameHdrSection = getSection(getEHFrameHdrSectionName());
   if (OldEHFrameHdrSection)
-    OldEHFrameHdrSection->setOutputName(getOrgSecPrefix() + ".eh_frame_hdr");
+    OldEHFrameHdrSection->setOutputName(getOrgSecPrefix() +
+                                        getEHFrameHdrSectionName());
 
   BinarySection &EHFrameHdrSec = BC->registerOrUpdateSection(
-      getNewSecPrefix() + ".eh_frame_hdr", ELF::SHT_PROGBITS, Flags, nullptr,
-      NewEHFrameHdr.size(), /*Alignment=*/1);
+      getNewSecPrefix() + getEHFrameHdrSectionName(), ELF::SHT_PROGBITS, Flags,
+      nullptr, NewEHFrameHdr.size(), /*Alignment=*/1);
   EHFrameHdrSec.setOutputFileOffset(EHFrameHdrFileOffset);
   EHFrameHdrSec.setOutputAddress(EHFrameHdrOutputAddress);
-  EHFrameHdrSec.setOutputName(".eh_frame_hdr");
+  EHFrameHdrSec.setOutputName(getEHFrameHdrSectionName());
 
   NextAvailableAddress += EHFrameHdrSec.getOutputSize();
 
+  if (const BinaryData *ReservedEnd =
+          BC->getBinaryDataByName(getBOLTReservedEnd())) {
+    if (NextAvailableAddress > ReservedEnd->getAddress()) {
+      BC->errs() << "BOLT-ERROR: unable to fit " << getEHFrameHdrSectionName()
+                 << " into reserved space\n";
+      exit(1);
+    }
+  }
+
   // Merge new .eh_frame with the relocated original so that gdb can locate all
   // FDEs.
   if (RelocatedEHFrameSection) {
diff --git a/bolt/test/RISCV/unnamed-sym-no-entry.c b/bolt/test/RISCV/fake-label-no-entry.c
index 605bbc00aeec..bd125263101b 100644
--- a/bolt/test/RISCV/unnamed-sym-no-entry.c
+++ b/bolt/test/RISCV/fake-label-no-entry.c
@@ -5,12 +5,12 @@
 
 // RUN: %clang %cflags -g -Wl,-q -o %t %s
 
-/// Verify that the binary indeed contains an unnamed symbol at _start
+/// Verify that the binary indeed contains a fake label ".L0 " at _start.
 // RUN: llvm-readelf -s %t | FileCheck %s --check-prefix=CHECK-ELF
 // CHECK-ELF-DAG: [[#%x,START:]] {{.*}} FUNC GLOBAL DEFAULT [[#%d,SECTION:]] _start{{$}}
-// CHECK-ELF-DAG: [[#%x,START]] {{.*}} NOTYPE LOCAL DEFAULT [[#SECTION]] {{$}}
+// CHECK-ELF-DAG: [[#%x,START]] {{.*}} NOTYPE LOCAL DEFAULT [[#SECTION]] .L0 {{$}}
 
-/// Verify that BOLT did not create an extra entry point for the unnamed symbol
+/// Verify that BOLT did not create an extra entry point for the fake label.
 // RUN: llvm-bolt -o %t.bolt %t --print-cfg | FileCheck %s
 // CHECK: Binary Function "_start" after building cfg {
 // CHECK:  IsMultiEntry: 0
diff --git a/bolt/test/X86/cdsplit-symbol-names.s b/bolt/test/X86/cdsplit-symbol-names.s
index e2259276e255..e53863e22246 100644
--- a/bolt/test/X86/cdsplit-symbol-names.s
+++ b/bolt/test/X86/cdsplit-symbol-names.s
@@ -10,6 +10,7 @@
 # RUN:         --call-scale=2 --data=%t.fdata --reorder-blocks=ext-tsp
 # RUN: llvm-objdump --syms %t.bolt | FileCheck %s --check-prefix=CHECK-SYMS-WARM
 
+# CHECK-SYMS-WARM: 0000000000000000 l df *ABS* 0000000000000000 bolt-pseudo.o
 # CHECK-SYMS-WARM: .text.warm
 # CHECK-SYMS-WARM-SAME: chain.warm
 # CHECK-SYMS-WARM: .text.cold
diff --git a/bolt/test/X86/fragment-lite.s b/bolt/test/X86/fragment-lite.s
index 97069bf8096e..32d1f5a98b64 100644
--- a/bolt/test/X86/fragment-lite.s
+++ b/bolt/test/X86/fragment-lite.s
@@ -3,35 +3,42 @@
 # RUN: split-file %s %t
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/baz.s -o %t.baz.o
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/baz2.s -o %t.baz2.o
 # RUN: link_fdata %s %t.o %t.main.fdata
 # RUN: link_fdata %s %t.baz.o %t.baz.fdata
-# RUN: merge-fdata %t.main.fdata %t.baz.fdata > %t.fdata
-# RUN: %clang %cflags %t.o %t.baz.o -o %t.exe -Wl,-q
+# RUN: link_fdata %s %t.baz2.o %t.baz2.fdata
+# RUN: merge-fdata %t.main.fdata %t.baz.fdata %t.baz2.fdata > %t.fdata
+# RUN: %clang %cflags %t.o %t.baz.o %t.baz2.o -o %t.exe -Wl,-q
 # RUN: llvm-bolt %t.exe -o %t.out --lite=1 --data %t.fdata -v=1 -print-cfg \
 # RUN:   2>&1 | FileCheck %s
 
 # CHECK: BOLT-INFO: processing main.cold.1 as a sibling of non-ignored function
-# CHECK: BOLT-INFO: processing foo.cold.1/1 as a sibling of non-ignored function
-# CHECK: BOLT-INFO: processing bar.cold.1/1 as a sibling of non-ignored function
+# CHECK: BOLT-INFO: processing foo.cold.1/1(*2) as a sibling of non-ignored function
+# CHECK: BOLT-INFO: processing bar.cold.1/1(*2) as a sibling of non-ignored function
 # CHECK: BOLT-INFO: processing baz.cold.1 as a sibling of non-ignored function
-# CHECK: BOLT-INFO: processing baz.cold.1/1 as a sibling of non-ignored function
+# CHECK: BOLT-INFO: processing baz.cold.1/1(*2) as a sibling of non-ignored function
+# CHECK: BOLT-INFO: processing baz.cold.1/2(*2) as a sibling of non-ignored function
 
 # CHECK: Binary Function "main.cold.1" after building cfg
 # CHECK: Parent : main
 
-# CHECK: Binary Function "foo.cold.1/1" after building cfg
+# CHECK: Binary Function "foo.cold.1/1(*2)" after building cfg
 # CHECK: Parent : foo
 
-# CHECK: Binary Function "bar.cold.1/1" after building cfg
-# CHECK: Parent : bar/1
+# CHECK: Binary Function "bar.cold.1/1(*2)" after building cfg
+# CHECK: Parent : bar/1(*2)
 
 # CHECK: Binary Function "baz.cold.1" after building cfg
 # CHECK: Parent : baz{{$}}
 
-# CHECK: Binary Function "baz.cold.1/1" after building cfg
-# CHECK: Parent : baz/1
+# CHECK: Binary Function "baz.cold.1/1(*2)" after building cfg
+# CHECK: Parent : baz/1(*2)
+
+# CHECK: Binary Function "baz.cold.1/2(*2)" after building cfg
+# CHECK: Parent : baz/2(*2)
 
 #--- main.s
+.file "main.s"
   .globl main
   .type main, %function
 main:
@@ -126,6 +133,7 @@ baz.cold.1:
 .size baz.cold.1, .-baz.cold.1
 
 #--- baz.s
+.file "baz.s"
   .local baz
   .type baz, %function
 baz:
@@ -149,3 +157,29 @@ baz.cold.1:
   retq
   .cfi_endproc
 .size baz.cold.1, .-baz.cold.1
+
+#--- baz2.s
+.file "baz2.s"
+  .local baz
+  .type baz, %function
+baz:
+  .cfi_startproc
+# FDATA: 0 [unknown] 0 1 baz/2 0 1 0
+  cmpl	$0x0, %eax
+  je	baz.cold.1
+  retq
+  .cfi_endproc
+.size baz, .-baz
+
+  .section .text.cold
+  .local baz.cold.1
+  .type baz.cold.1, %function
+baz.cold.1:
+  .cfi_startproc
+  pushq	%rbp
+  movq	%rsp, %rbp
+  movl	$0x0, %eax
+  popq	%rbp
+  retq
+  .cfi_endproc
+.size baz.cold.1, .-baz.cold.1
diff --git a/bolt/test/X86/indirect-goto-pie.test b/bolt/test/X86/indirect-goto-pie.test
new file mode 100644
index 000000000000..039ff5c41d3d
--- /dev/null
+++ b/bolt/test/X86/indirect-goto-pie.test
@@ -0,0 +1,16 @@
+# Check that llvm-bolt fails to process PIC binaries with computed goto, as the
+# support is not there yet for correctly updating dynamic relocations
+# referencing code inside functions.
+
+REQUIRES: x86_64-linux
+
+RUN: %clang %S/Inputs/indirect_goto.c -o %t -fpic -pie -Wl,-q
+RUN: not llvm-bolt %t -o %t.bolt --relocs=1 --print-cfg --print-only=main \
+RUN:   |& FileCheck %s
+
+# Check that processing works if main() is skipped.
+RUN: llvm-bolt %t -o %t.bolt --relocs=1 --skip-funcs=main
+
+CHECK:  jmpq    *%rax # UNKNOWN CONTROL FLOW
+
+CHECK: BOLT-ERROR: unable to get new address
diff --git a/bolt/test/X86/pre-aggregated-perf.test b/bolt/test/X86/pre-aggregated-perf.test
index e8c3f64239a2..0bd44720f1b7 100644
--- a/bolt/test/X86/pre-aggregated-perf.test
+++ b/bolt/test/X86/pre-aggregated-perf.test
@@ -11,7 +11,14 @@ REQUIRES: system-linux
 
 RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe
 RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated.txt -w %t.new \
-RUN:   --profile-use-dfs
+RUN:   --profile-use-dfs | FileCheck %s
+
+RUN: llvm-bolt %t.exe -data %t -o %t.null | FileCheck %s
+RUN: llvm-bolt %t.exe -data %t.new -o %t.null | FileCheck %s
+RUN: llvm-bolt %t.exe -p %p/Inputs/pre-aggregated.txt --pa -o %t.null | FileCheck %s
+
+CHECK: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile
+
 RUN: cat %t | sort | FileCheck %s -check-prefix=PERF2BOLT
 RUN: cat %t.new | FileCheck %s -check-prefix=NEWFORMAT
 
diff --git a/bolt/test/X86/register-fragments-bolt-symbols.s b/bolt/test/X86/register-fragments-bolt-symbols.s
new file mode 100644
index 000000000000..fa9b70e0b2d8
--- /dev/null
+++ b/bolt/test/X86/register-fragments-bolt-symbols.s
@@ -0,0 +1,32 @@
+# Test the heuristics for matching BOLT-added split functions.
+
+# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %S/cdsplit-symbol-names.s -o %t.main.o
+# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.chain.o
+# RUN: link_fdata %S/cdsplit-symbol-names.s %t.main.o %t.fdata
+# RUN: sed -i 's|chain|chain/2|g' %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.main.o
+# RUN: llvm-objcopy --localize-symbol=chain %t.main.o
+# RUN: %clang %cflags %t.chain.o %t.main.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=randomN \
+# RUN:   --reorder-blocks=ext-tsp --enable-bat --bolt-seed=7 --data=%t.fdata
+# RUN: llvm-objdump --syms %t.bolt | FileCheck %s --check-prefix=CHECK-SYMS
+
+# RUN: link_fdata %s %t.bolt %t.preagg PREAGG
+# PREAGG: B X:0 #chain.cold.0# 1 0
+# RUN: perf2bolt %t.bolt -p %t.preagg --pa -o %t.bat.fdata -w %t.bat.yaml -v=1 \
+# RUN:   | FileCheck %s --check-prefix=CHECK-REGISTER
+
+# CHECK-SYMS: l df *ABS*          [[#]] chain.s
+# CHECK-SYMS: l  F .bolt.org.text [[#]] chain
+# CHECK-SYMS: l  F .text.cold     [[#]] chain.cold.0
+# CHECK-SYMS: l  F .text          [[#]] chain
+# CHECK-SYMS: l df *ABS*          [[#]] bolt-pseudo.o
+
+# CHECK-REGISTER: BOLT-INFO: marking chain.cold.0/1(*2) as a fragment of chain/2(*2)
+
+.file "chain.s"
+        .text
+        .type   chain, @function
+chain:
+        ret
+        .size   chain, .-chain
diff --git a/bolt/test/X86/shrinkwrapping-do-not-pessimize.s b/bolt/test/X86/shrinkwrapping-do-not-pessimize.s
index a57131131423..3fdd5f5e38fe 100644
--- a/bolt/test/X86/shrinkwrapping-do-not-pessimize.s
+++ b/bolt/test/X86/shrinkwrapping-do-not-pessimize.s
@@ -53,6 +53,6 @@ end_if_1:
   .size _start, .-_start
 
   .data
-rel:  .quad end_if_1
+rel:  .quad _start
 
 # CHECK:   BOLT-INFO: Shrink wrapping moved 0 spills inserting load/stores and 0 spills inserting push/pops
diff --git a/bolt/test/runtime/X86/Inputs/indirect_goto.c b/bolt/test/runtime/X86/Inputs/indirect_goto.c
deleted file mode 100644
index b781e9e03b6d..000000000000
--- a/bolt/test/runtime/X86/Inputs/indirect_goto.c
+++ /dev/null
@@ -1,18 +0,0 @@
-int main(int argc, char *argv[]) {
-  static const void *T1[] = { &&L1, &&L2 };
-  static const void *T2[] = { &&L2, &&L3 };
-
-  const void **T = (argc > 1) ? T1 : T2;
-
-  int i = 0;
-
-L0:
-  goto *T[argc];
-L1:
-  ++i;
-L2:
-  i++;
-L3:
-  i++;
-  return i;
-}
diff --git a/bolt/test/runtime/X86/indirect-goto-pie.test b/bolt/test/runtime/X86/indirect-goto-pie.test
deleted file mode 100644
index 76089fda3abf..000000000000
--- a/bolt/test/runtime/X86/indirect-goto-pie.test
+++ /dev/null
@@ -1,10 +0,0 @@
-# Check llvm-bolt processes binaries compiled from sources that use indirect goto.
-REQUIRES: x86_64-linux
-
-RUN: %clang %S/Inputs/indirect_goto.c -o %t -fpic -pie -Wl,-q
-RUN: llvm-bolt %t -o %t.bolt --relocs=1 --print-cfg --print-only=main \
-RUN:   |& FileCheck %s 
-# The test fails as we don't update corresponding dynamic relocations.
-RUN: not %t.bolt
-
-CHECK:  jmpq    *%rax # UNKNOWN CONTROL FLOW
diff --git a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp
index 84e99c7fafc7..10868129e76d 100644
--- a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp
@@ -967,7 +967,8 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
   // Get out the qualifiers of the original type. This will always be
   // re-applied to the WorkType to ensure it is the same qualification as the
   // original From was.
-  auto QualifiersToApply = From.split().Quals.getAsOpaqueValue();
+  auto FastQualifiersToApply = static_cast<unsigned>(
+      From.split().Quals.getAsOpaqueValue() & Qualifiers::FastMask);
 
   // LValue->RValue is irrelevant for the check, because it is a thing to be
   // done at a call site, and will be performed if need be performed.
@@ -993,7 +994,7 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
     // "const double -> double".
     LLVM_DEBUG(llvm::dbgs()
                << "--- approximateStdConv. Conversion between numerics.\n");
-    WorkType = QualType{ToBuiltin, QualifiersToApply};
+    WorkType = QualType{ToBuiltin, FastQualifiersToApply};
   }
 
   const auto *FromEnum = WorkType->getAs<EnumType>();
@@ -1002,7 +1003,7 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
     // Unscoped enumerations (or enumerations in C) convert to numerics.
     LLVM_DEBUG(llvm::dbgs()
                << "--- approximateStdConv. Unscoped enum to numeric.\n");
-    WorkType = QualType{ToBuiltin, QualifiersToApply};
+    WorkType = QualType{ToBuiltin, FastQualifiersToApply};
   } else if (FromNumeric && ToEnum && ToEnum->isUnscopedEnumerationType()) {
     // Numeric types convert to enumerations only in C.
     if (Ctx.getLangOpts().CPlusPlus) {
@@ -1013,7 +1014,7 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
 
     LLVM_DEBUG(llvm::dbgs()
                << "--- approximateStdConv. Numeric to unscoped enum.\n");
-    WorkType = QualType{ToEnum, QualifiersToApply};
+    WorkType = QualType{ToEnum, FastQualifiersToApply};
   }
 
   // Check for pointer conversions.
@@ -1022,14 +1023,14 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
   if (FromPtr && ToPtr) {
     if (ToPtr->isVoidPointerType()) {
       LLVM_DEBUG(llvm::dbgs() << "--- approximateStdConv. To void pointer.\n");
-      WorkType = QualType{ToPtr, QualifiersToApply};
+      WorkType = QualType{ToPtr, FastQualifiersToApply};
     }
 
     const auto *FromRecordPtr = FromPtr->getPointeeCXXRecordDecl();
     const auto *ToRecordPtr = ToPtr->getPointeeCXXRecordDecl();
     if (isDerivedToBase(FromRecordPtr, ToRecordPtr)) {
       LLVM_DEBUG(llvm::dbgs() << "--- approximateStdConv. Derived* to Base*\n");
-      WorkType = QualType{ToPtr, QualifiersToApply};
+      WorkType = QualType{ToPtr, FastQualifiersToApply};
     }
   }
 
@@ -1039,7 +1040,7 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
   const auto *ToRecord = To->getAsCXXRecordDecl();
   if (isDerivedToBase(FromRecord, ToRecord)) {
     LLVM_DEBUG(llvm::dbgs() << "--- approximateStdConv. Derived To Base.\n");
-    WorkType = QualType{ToRecord->getTypeForDecl(), QualifiersToApply};
+    WorkType = QualType{ToRecord->getTypeForDecl(), FastQualifiersToApply};
   }
 
   if (Ctx.getLangOpts().CPlusPlus17 && FromPtr && ToPtr) {
@@ -1054,7 +1055,7 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
         !ToFunctionPtr->hasNoexceptExceptionSpec()) {
       LLVM_DEBUG(llvm::dbgs() << "--- approximateStdConv. noexcept function "
                                  "pointer to non-noexcept.\n");
-      WorkType = QualType{ToPtr, QualifiersToApply};
+      WorkType = QualType{ToPtr, FastQualifiersToApply};
     }
   }
 
diff --git a/clang-tools-extra/clang-tidy/modernize/CMakeLists.txt b/clang-tools-extra/clang-tidy/modernize/CMakeLists.txt
index 6852db6c2ee3..8005d6e91c06 100644
--- a/clang-tools-extra/clang-tidy/modernize/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/modernize/CMakeLists.txt
@@ -16,6 +16,7 @@ add_clang_library(clangTidyModernizeModule
   MakeSharedCheck.cpp
   MakeSmartPtrCheck.cpp
   MakeUniqueCheck.cpp
+  MinMaxUseInitializerListCheck.cpp
   ModernizeTidyModule.cpp
   PassByValueCheck.cpp
   RawStringLiteralCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp
new file mode 100644
index 000000000000..45f7700463d5
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp
@@ -0,0 +1,271 @@
+//===--- MinMaxUseInitializerListCheck.cpp - clang-tidy -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MinMaxUseInitializerListCheck.h"
+#include "../utils/ASTUtils.h"
+#include "../utils/LexerUtils.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Lex/Lexer.h"
+
+using namespace clang;
+
+namespace {
+
+struct FindArgsResult {
+  const Expr *First;
+  const Expr *Last;
+  const Expr *Compare;
+  SmallVector<const clang::Expr *, 2> Args;
+};
+
+} // anonymous namespace
+
+using namespace clang::ast_matchers;
+
+namespace clang::tidy::modernize {
+
+static FindArgsResult findArgs(const CallExpr *Call) {
+  FindArgsResult Result;
+  Result.First = nullptr;
+  Result.Last = nullptr;
+  Result.Compare = nullptr;
+
+  //   check if the function has initializer list argument
+  if (Call->getNumArgs() < 3) {
+    auto ArgIterator = Call->arguments().begin();
+
+    const auto *InitListExpr =
+        dyn_cast<CXXStdInitializerListExpr>(*ArgIterator);
+    const auto *InitList =
+        InitListExpr != nullptr
+            ? dyn_cast<clang::InitListExpr>(
+                  InitListExpr->getSubExpr()->IgnoreImplicit())
+            : nullptr;
+
+    if (InitList) {
+      Result.Args.append(InitList->inits().begin(), InitList->inits().end());
+      Result.First = *ArgIterator;
+      Result.Last = *ArgIterator;
+
+      // check if there is a comparison argument
+      std::advance(ArgIterator, 1);
+      if (ArgIterator != Call->arguments().end())
+        Result.Compare = *ArgIterator;
+
+      return Result;
+    }
+    Result.Args = SmallVector<const Expr *>(Call->arguments());
+  } else {
+    // if it has 3 arguments then the last will be the comparison
+    Result.Compare = *(std::next(Call->arguments().begin(), 2));
+    Result.Args = SmallVector<const Expr *>(llvm::drop_end(Call->arguments()));
+  }
+  Result.First = Result.Args.front();
+  Result.Last = Result.Args.back();
+
+  return Result;
+}
+
+static SmallVector<FixItHint>
+generateReplacements(const MatchFinder::MatchResult &Match,
+                     const CallExpr *TopCall, const FindArgsResult &Result,
+                     const bool IgnoreNonTrivialTypes,
+                     const std::uint64_t IgnoreTrivialTypesOfSizeAbove) {
+  SmallVector<FixItHint> FixItHints;
+  const SourceManager &SourceMngr = *Match.SourceManager;
+  const LangOptions &LanguageOpts = Match.Context->getLangOpts();
+
+  const QualType ResultType = TopCall->getDirectCallee()
+                                  ->getReturnType()
+                                  .getCanonicalType()
+                                  .getNonReferenceType()
+                                  .getUnqualifiedType();
+
+  // check if the type is trivial
+  const bool IsResultTypeTrivial = ResultType.isTrivialType(*Match.Context);
+
+  if ((!IsResultTypeTrivial && IgnoreNonTrivialTypes))
+    return FixItHints;
+
+  if (IsResultTypeTrivial &&
+      static_cast<std::uint64_t>(
+          Match.Context->getTypeSizeInChars(ResultType).getQuantity()) >
+          IgnoreTrivialTypesOfSizeAbove)
+    return FixItHints;
+
+  for (const Expr *Arg : Result.Args) {
+    const auto *InnerCall = dyn_cast<CallExpr>(Arg->IgnoreParenImpCasts());
+
+    // If the argument is not a nested call
+    if (!InnerCall) {
+      // check if typecast is required
+      const QualType ArgType = Arg->IgnoreParenImpCasts()
+                                   ->getType()
+                                   .getCanonicalType()
+                                   .getUnqualifiedType();
+
+      if (ArgType == ResultType)
+        continue;
+
+      const StringRef ArgText = Lexer::getSourceText(
+          CharSourceRange::getTokenRange(Arg->getSourceRange()), SourceMngr,
+          LanguageOpts);
+
+      const auto Replacement = Twine("static_cast<")
+                                   .concat(ResultType.getAsString(LanguageOpts))
+                                   .concat(">(")
+                                   .concat(ArgText)
+                                   .concat(")")
+                                   .str();
+
+      FixItHints.push_back(
+          FixItHint::CreateReplacement(Arg->getSourceRange(), Replacement));
+      continue;
+    }
+
+    const FindArgsResult InnerResult = findArgs(InnerCall);
+
+    // if the nested call doesn't have arguments skip it
+    if (!InnerResult.First || !InnerResult.Last)
+      continue;
+
+    // if the nested call is not the same as the top call
+    if (InnerCall->getDirectCallee()->getQualifiedNameAsString() !=
+        TopCall->getDirectCallee()->getQualifiedNameAsString())
+      continue;
+
+    // if the nested call doesn't have the same compare function
+    if ((Result.Compare || InnerResult.Compare) &&
+        !utils::areStatementsIdentical(Result.Compare, InnerResult.Compare,
+                                       *Match.Context))
+      continue;
+
+    // remove the function call
+    FixItHints.push_back(
+        FixItHint::CreateRemoval(InnerCall->getCallee()->getSourceRange()));
+
+    // remove the parentheses
+    const auto LParen = utils::lexer::findNextTokenSkippingComments(
+        InnerCall->getCallee()->getEndLoc(), SourceMngr, LanguageOpts);
+    if (LParen.has_value() && LParen->is(tok::l_paren))
+      FixItHints.push_back(
+          FixItHint::CreateRemoval(SourceRange(LParen->getLocation())));
+    FixItHints.push_back(
+        FixItHint::CreateRemoval(SourceRange(InnerCall->getRParenLoc())));
+
+    // if the inner call has an initializer list arg
+    if (InnerResult.First == InnerResult.Last) {
+      // remove the initializer list braces
+      FixItHints.push_back(FixItHint::CreateRemoval(
+          CharSourceRange::getTokenRange(InnerResult.First->getBeginLoc())));
+      FixItHints.push_back(FixItHint::CreateRemoval(
+          CharSourceRange::getTokenRange(InnerResult.First->getEndLoc())));
+    }
+
+    const SmallVector<FixItHint> InnerReplacements = generateReplacements(
+        Match, InnerCall, InnerResult, IgnoreNonTrivialTypes,
+        IgnoreTrivialTypesOfSizeAbove);
+
+    FixItHints.append(InnerReplacements);
+
+    if (InnerResult.Compare) {
+      // find the comma after the value arguments
+      const auto Comma = utils::lexer::findNextTokenSkippingComments(
+          InnerResult.Last->getEndLoc(), SourceMngr, LanguageOpts);
+
+      // remove the comma and the comparison
+      if (Comma.has_value() && Comma->is(tok::comma))
+        FixItHints.push_back(
+            FixItHint::CreateRemoval(SourceRange(Comma->getLocation())));
+
+      FixItHints.push_back(
+          FixItHint::CreateRemoval(InnerResult.Compare->getSourceRange()));
+    }
+  }
+
+  return FixItHints;
+}
+
+MinMaxUseInitializerListCheck::MinMaxUseInitializerListCheck(
+    StringRef Name, ClangTidyContext *Context)
+    : ClangTidyCheck(Name, Context),
+      IgnoreNonTrivialTypes(Options.get("IgnoreNonTrivialTypes", true)),
+      IgnoreTrivialTypesOfSizeAbove(
+          Options.get("IgnoreTrivialTypesOfSizeAbove", 32L)),
+      Inserter(Options.getLocalOrGlobal("IncludeStyle",
+                                        utils::IncludeSorter::IS_LLVM),
+               areDiagsSelfContained()) {}
+
+void MinMaxUseInitializerListCheck::storeOptions(
+    ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, "IgnoreNonTrivialTypes", IgnoreNonTrivialTypes);
+  Options.store(Opts, "IgnoreTrivialTypesOfSizeAbove",
+                IgnoreTrivialTypesOfSizeAbove);
+  Options.store(Opts, "IncludeStyle", Inserter.getStyle());
+}
+
+void MinMaxUseInitializerListCheck::registerMatchers(MatchFinder *Finder) {
+  auto CreateMatcher = [](const StringRef FunctionName) {
+    auto FuncDecl = functionDecl(hasName(FunctionName));
+    auto Expression = callExpr(callee(FuncDecl));
+
+    return callExpr(callee(FuncDecl),
+                    anyOf(hasArgument(0, Expression),
+                          hasArgument(1, Expression),
+                          hasArgument(0, cxxStdInitializerListExpr())),
+                    unless(hasParent(Expression)))
+        .bind("topCall");
+  };
+
+  Finder->addMatcher(CreateMatcher("::std::max"), this);
+  Finder->addMatcher(CreateMatcher("::std::min"), this);
+}
+
+void MinMaxUseInitializerListCheck::registerPPCallbacks(
+    const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) {
+  Inserter.registerPreprocessor(PP);
+}
+
+void MinMaxUseInitializerListCheck::check(
+    const MatchFinder::MatchResult &Match) {
+
+  const auto *TopCall = Match.Nodes.getNodeAs<CallExpr>("topCall");
+
+  const FindArgsResult Result = findArgs(TopCall);
+  const SmallVector<FixItHint> Replacements =
+      generateReplacements(Match, TopCall, Result, IgnoreNonTrivialTypes,
+                           IgnoreTrivialTypesOfSizeAbove);
+
+  if (Replacements.empty())
+    return;
+
+  const DiagnosticBuilder Diagnostic =
+      diag(TopCall->getBeginLoc(),
+           "do not use nested 'std::%0' calls, use an initializer list instead")
+      << TopCall->getDirectCallee()->getName()
+      << Inserter.createIncludeInsertion(
+             Match.SourceManager->getFileID(TopCall->getBeginLoc()),
+             "<algorithm>");
+
+  // if the top call doesn't have an initializer list argument
+  if (Result.First != Result.Last) {
+    // add { and } insertions
+    Diagnostic << FixItHint::CreateInsertion(Result.First->getBeginLoc(), "{");
+
+    Diagnostic << FixItHint::CreateInsertion(
+        Lexer::getLocForEndOfToken(Result.Last->getEndLoc(), 0,
+                                   *Match.SourceManager,
+                                   Match.Context->getLangOpts()),
+        "}");
+  }
+
+  Diagnostic << Replacements;
+}
+
+} // namespace clang::tidy::modernize
diff --git a/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.h b/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.h
new file mode 100644
index 000000000000..577d12653076
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.h
@@ -0,0 +1,56 @@
+//===--- MinMaxUseInitializerListCheck.h - clang-tidy -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MINMAXUSEINITIALIZERLISTCHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MINMAXUSEINITIALIZERLISTCHECK_H
+
+#include "../ClangTidyCheck.h"
+#include "../utils/IncludeInserter.h"
+
+namespace clang::tidy::modernize {
+
+/// Replaces nested ``std::min`` and ``std::max`` calls with an initializer list
+/// where applicable.
+///
+/// For example:
+///
+/// \code
+///   int a = std::max(std::max(i, j), k);
+/// \endcode
+///
+/// This code is transformed to:
+///
+/// \code
+///   int a = std::max({i, j, k});
+/// \endcode
+class MinMaxUseInitializerListCheck : public ClangTidyCheck {
+public:
+  MinMaxUseInitializerListCheck(StringRef Name, ClangTidyContext *Context);
+
+  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+  void registerMatchers(ast_matchers::MatchFinder *Finder) override;
+  void registerPPCallbacks(const SourceManager &SM, Preprocessor *PP,
+                           Preprocessor *ModuleExpanderPP) override;
+  void check(const ast_matchers::MatchFinder::MatchResult &Match) override;
+
+  bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
+    return LangOpts.CPlusPlus11;
+  }
+  std::optional<TraversalKind> getCheckTraversalKind() const override {
+    return TK_IgnoreUnlessSpelledInSource;
+  }
+
+private:
+  bool IgnoreNonTrivialTypes;
+  std::uint64_t IgnoreTrivialTypesOfSizeAbove;
+  utils::IncludeInserter Inserter;
+};
+
+} // namespace clang::tidy::modernize
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MODERNIZE_MINMAXUSEINITIALIZERLISTCHECK_H
diff --git a/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp b/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp
index e96cf274f58c..776558433c5b 100644
--- a/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp
@@ -18,6 +18,7 @@
 #include "MacroToEnumCheck.h"
 #include "MakeSharedCheck.h"
 #include "MakeUniqueCheck.h"
+#include "MinMaxUseInitializerListCheck.h"
 #include "PassByValueCheck.h"
 #include "RawStringLiteralCheck.h"
 #include "RedundantVoidArgCheck.h"
@@ -68,6 +69,8 @@ public:
     CheckFactories.registerCheck<MacroToEnumCheck>("modernize-macro-to-enum");
     CheckFactories.registerCheck<MakeSharedCheck>("modernize-make-shared");
     CheckFactories.registerCheck<MakeUniqueCheck>("modernize-make-unique");
+    CheckFactories.registerCheck<MinMaxUseInitializerListCheck>(
+        "modernize-min-max-use-initializer-list");
     CheckFactories.registerCheck<PassByValueCheck>("modernize-pass-by-value");
     CheckFactories.registerCheck<UseDesignatedInitializersCheck>(
         "modernize-use-designated-initializers");
diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h
index 6c32a4edb4ff..f1591bae4465 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h
+++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h
@@ -19,7 +19,7 @@ public:
   bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
     // FIXME this should be CPlusPlus11 but that causes test cases to
     // erroneously fail.
-    return LangOpts.CPlusPlus;
+    return LangOpts.CPlusPlus || LangOpts.C23;
   }
   void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
   void registerMatchers(ast_matchers::MatchFinder *Finder) override;
diff --git a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp
index 48bca41f4a3b..f077040a3529 100644
--- a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp
@@ -64,8 +64,11 @@ void AvoidReturnWithVoidValueCheck::check(
            << BraceInsertionHints.closingBraceFixIt();
   }
   Diag << FixItHint::CreateRemoval(VoidReturn->getReturnLoc());
-  if (!Result.Nodes.getNodeAs<FunctionDecl>("function_parent") ||
-      SurroundingBlock->body_back() != VoidReturn)
+  const auto *FunctionParent =
+      Result.Nodes.getNodeAs<FunctionDecl>("function_parent");
+  if (!FunctionParent ||
+      (SurroundingBlock && SurroundingBlock->body_back() != VoidReturn))
+    // If this is not the last statement in a function body, we add a `return`.
     Diag << FixItHint::CreateInsertion(SemicolonPos.getLocWithOffset(1),
                                        " return;", true);
 }
diff --git a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt
index dd772d692025..41065fc8e878 100644
--- a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt
@@ -28,6 +28,7 @@ add_clang_library(clangTidyReadabilityModule
   IsolateDeclarationCheck.cpp
   MagicNumbersCheck.cpp
   MakeMemberFunctionConstCheck.cpp
+  MathMissingParenthesesCheck.cpp
   MisleadingIndentationCheck.cpp
   MisplacedArrayIndexCheck.cpp
   NamedParameterCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp
new file mode 100644
index 000000000000..65fd29609491
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.cpp
@@ -0,0 +1,99 @@
+//===--- MathMissingParenthesesCheck.cpp - clang-tidy ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MathMissingParenthesesCheck.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/Lex/Lexer.h"
+
+using namespace clang::ast_matchers;
+
+namespace clang::tidy::readability {
+
+void MathMissingParenthesesCheck::registerMatchers(MatchFinder *Finder) {
+  Finder->addMatcher(binaryOperator(unless(hasParent(binaryOperator())),
+                                    unless(isAssignmentOperator()),
+                                    unless(isComparisonOperator()),
+                                    unless(hasAnyOperatorName("&&", "||")),
+                                    hasDescendant(binaryOperator()))
+                         .bind("binOp"),
+                     this);
+}
+
+static int getPrecedence(const BinaryOperator *BinOp) {
+  if (!BinOp)
+    return 0;
+  switch (BinOp->getOpcode()) {
+  case BO_Mul:
+  case BO_Div:
+  case BO_Rem:
+    return 5;
+  case BO_Add:
+  case BO_Sub:
+    return 4;
+  case BO_And:
+    return 3;
+  case BO_Xor:
+    return 2;
+  case BO_Or:
+    return 1;
+  default:
+    return 0;
+  }
+}
+static void addParantheses(const BinaryOperator *BinOp,
+                           const BinaryOperator *ParentBinOp,
+                           ClangTidyCheck *Check,
+                           const clang::SourceManager &SM,
+                           const clang::LangOptions &LangOpts) {
+  if (!BinOp)
+    return;
+
+  int Precedence1 = getPrecedence(BinOp);
+  int Precedence2 = getPrecedence(ParentBinOp);
+
+  if (ParentBinOp != nullptr && Precedence1 != Precedence2) {
+    const clang::SourceLocation StartLoc = BinOp->getBeginLoc();
+    const clang::SourceLocation EndLoc =
+        clang::Lexer::getLocForEndOfToken(BinOp->getEndLoc(), 0, SM, LangOpts);
+
+    auto Diag =
+        Check->diag(StartLoc,
+                    "'%0' has higher precedence than '%1'; add parentheses to "
+                    "explicitly specify the order of operations")
+        << (Precedence1 > Precedence2 ? BinOp->getOpcodeStr()
+                                      : ParentBinOp->getOpcodeStr())
+        << (Precedence1 > Precedence2 ? ParentBinOp->getOpcodeStr()
+                                      : BinOp->getOpcodeStr())
+        << SourceRange(StartLoc, EndLoc);
+
+    if (EndLoc.isValid()) {
+      Diag << FixItHint::CreateInsertion(StartLoc, "(")
+           << FixItHint::CreateInsertion(EndLoc, ")");
+    }
+  }
+
+  addParantheses(dyn_cast<BinaryOperator>(BinOp->getLHS()->IgnoreImpCasts()),
+                 BinOp, Check, SM, LangOpts);
+  addParantheses(dyn_cast<BinaryOperator>(BinOp->getRHS()->IgnoreImpCasts()),
+                 BinOp, Check, SM, LangOpts);
+}
+
+void MathMissingParenthesesCheck::check(
+    const MatchFinder::MatchResult &Result) {
+  const auto *BinOp = Result.Nodes.getNodeAs<BinaryOperator>("binOp");
+  std::vector<
+      std::pair<clang::SourceRange, std::pair<const clang::BinaryOperator *,
+                                              const clang::BinaryOperator *>>>
+      Insertions;
+  const SourceManager &SM = *Result.SourceManager;
+  const clang::LangOptions &LO = Result.Context->getLangOpts();
+  addParantheses(BinOp, nullptr, this, SM, LO);
+}
+
+} // namespace clang::tidy::readability
diff --git a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.h b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.h
new file mode 100644
index 000000000000..9a9d2b3cfaab
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.h
@@ -0,0 +1,34 @@
+//===--- MathMissingParenthesesCheck.h - clang-tidy -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MATHMISSINGPARENTHESESCHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MATHMISSINGPARENTHESESCHECK_H
+
+#include "../ClangTidyCheck.h"
+
+namespace clang::tidy::readability {
+
+/// Check for mising parantheses in mathematical expressions that involve
+/// operators of different priorities.
+///
+/// For the user-facing documentation see:
+/// http://clang.llvm.org/extra/clang-tidy/checks/readability/math-missing-parentheses.html
+class MathMissingParenthesesCheck : public ClangTidyCheck {
+public:
+  MathMissingParenthesesCheck(StringRef Name, ClangTidyContext *Context)
+      : ClangTidyCheck(Name, Context) {}
+  void registerMatchers(ast_matchers::MatchFinder *Finder) override;
+  void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+  std::optional<TraversalKind> getCheckTraversalKind() const override {
+    return TK_IgnoreUnlessSpelledInSource;
+  }
+};
+
+} // namespace clang::tidy::readability
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_MATHMISSINGPARENTHESESCHECK_H
diff --git a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
index 376b84683df7..d61c0ba39658 100644
--- a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
@@ -32,6 +32,7 @@
 #include "IsolateDeclarationCheck.h"
 #include "MagicNumbersCheck.h"
 #include "MakeMemberFunctionConstCheck.h"
+#include "MathMissingParenthesesCheck.h"
 #include "MisleadingIndentationCheck.h"
 #include "MisplacedArrayIndexCheck.h"
 #include "NamedParameterCheck.h"
@@ -105,6 +106,8 @@ public:
         "readability-identifier-naming");
     CheckFactories.registerCheck<ImplicitBoolConversionCheck>(
         "readability-implicit-bool-conversion");
+    CheckFactories.registerCheck<MathMissingParenthesesCheck>(
+        "readability-math-missing-parentheses");
     CheckFactories.registerCheck<RedundantInlineSpecifierCheck>(
         "readability-redundant-inline-specifier");
     CheckFactories.registerCheck<InconsistentDeclarationParameterNameCheck>(
diff --git a/clang-tools-extra/clangd/CodeCompletionStrings.cpp b/clang-tools-extra/clangd/CodeCompletionStrings.cpp
index 2075e5965f18..9b4442b0bb76 100644
--- a/clang-tools-extra/clangd/CodeCompletionStrings.cpp
+++ b/clang-tools-extra/clangd/CodeCompletionStrings.cpp
@@ -253,7 +253,7 @@ void getSignature(const CodeCompletionString &CCS, std::string *Signature,
       if (!IncludeFunctionArguments &&
           ResultKind == CodeCompletionResult::RK_Declaration)
         TruncateSnippetAt.emplace(Snippet->size());
-      LLVM_FALLTHROUGH;
+      [[fallthrough]];
     case CodeCompletionString::CK_RightParen:
     case CodeCompletionString::CK_LeftBracket:
     case CodeCompletionString::CK_RightBracket:
diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
index 799a549ff081..94437857cecc 100644
--- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
@@ -854,7 +854,7 @@ TEST_F(TargetDeclTest, DependentExprs) {
           }
         };
       )cpp";
-  EXPECT_DECLS("CXXDependentScopeMemberExpr", "void foo()");
+  EXPECT_DECLS("MemberExpr", "void foo()");
 
   // Similar to above but base expression involves a function call.
   Code = R"cpp(
@@ -872,7 +872,7 @@ TEST_F(TargetDeclTest, DependentExprs) {
           }
         };
       )cpp";
-  EXPECT_DECLS("CXXDependentScopeMemberExpr", "void foo()");
+  EXPECT_DECLS("MemberExpr", "void foo()");
 
   // Similar to above but uses a function pointer.
   Code = R"cpp(
@@ -891,7 +891,7 @@ TEST_F(TargetDeclTest, DependentExprs) {
           }
         };
       )cpp";
-  EXPECT_DECLS("CXXDependentScopeMemberExpr", "void foo()");
+  EXPECT_DECLS("MemberExpr", "void foo()");
 
   // Base expression involves a member access into this.
   Code = R"cpp(
@@ -962,7 +962,7 @@ TEST_F(TargetDeclTest, DependentExprs) {
           void Foo() { this->[[find]](); }
         };
   )cpp";
-  EXPECT_DECLS("CXXDependentScopeMemberExpr", "void find()");
+  EXPECT_DECLS("MemberExpr", "void find()");
 }
 
 TEST_F(TargetDeclTest, DependentTypes) {
diff --git a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp
index 4156921d83ed..30b9b1902aa9 100644
--- a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp
@@ -621,7 +621,7 @@ sizeof...($TemplateParameter[[Elements]]);
       struct $Class_def[[Foo]] {
         int $Field_decl[[Waldo]];
         void $Method_def[[bar]]() {
-          $Class[[Foo]]().$Field_dependentName[[Waldo]];
+          $Class[[Foo]]().$Field[[Waldo]];
         }
         template $Bracket[[<]]typename $TemplateParameter_def[[U]]$Bracket[[>]]
         void $Method_def[[bar1]]() {
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 5b1feffb89ea..5956ccb92548 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -100,13 +100,15 @@ Improvements to clang-tidy
 - Improved :program:`run-clang-tidy.py` script. Added argument `-source-filter`
   to filter source files from the compilation database, via a RegEx. In a
   similar fashion to what `-header-filter` does for header files.
+
 - Improved :program:`check_clang_tidy.py` script. Added argument `-export-fixes`
   to aid in clang-tidy and test development.
+
 - Fixed bug where big values for unsigned check options overflowed into negative values
-  when being printed with ``--dump-config``.
+  when being printed with `--dump-config`.
 
-- Fixed ``--verify-config`` option not properly parsing checks when using the 
-  literal operator in the ``.clang-tidy`` config.
+- Fixed `--verify-config` option not properly parsing checks when using the
+  literal operator in the `.clang-tidy` config.
 
 New checks
 ^^^^^^^^^^
@@ -131,6 +133,12 @@ New checks
   to reading out-of-bounds data due to inadequate or incorrect string null
   termination.
 
+- New :doc:`modernize-min-max-use-initializer-list
+  <clang-tidy/checks/modernize/min-max-use-initializer-list>` check.
+
+  Replaces nested ``std::min`` and ``std::max`` calls with an initializer list
+  where applicable.
+
 - New :doc:`modernize-use-designated-initializers
   <clang-tidy/checks/modernize/use-designated-initializers>` check.
 
@@ -143,6 +151,12 @@ New checks
   Enforces consistent style for enumerators' initialization, covering three
   styles: none, first only, or all initialized explicitly.
 
+- New :doc:`readability-math-missing-parentheses
+  <clang-tidy/checks/readability/math-missing-parentheses>` check.
+
+  Check for missing parentheses in mathematical expressions that involve
+  operators of different priorities.
+
 - New :doc:`readability-use-std-min-max
   <clang-tidy/checks/readability/use-std-min-max>` check.
 
@@ -224,7 +238,7 @@ Changes in existing checks
 
 - Improved :doc:`google-explicit-constructor
   <clang-tidy/checks/google/explicit-constructor>` check to better handle
-  ``C++-20`` `explicit(bool)`.
+  C++20 `explicit(bool)`.
 
 - Improved :doc:`google-global-names-in-headers
   <clang-tidy/checks/google/global-names-in-headers>` check by replacing the local
@@ -237,13 +251,18 @@ Changes in existing checks
   check by ignoring other functions with same prefixes as the target specific
   functions.
 
+- Improved :doc:`linuxkernel-must-check-errs
+  <clang-tidy/checks/linuxkernel/must-check-errs>` check documentation to
+  consistently use the check's proper name.
+
 - Improved :doc:`llvm-header-guard
   <clang-tidy/checks/llvm/header-guard>` check by replacing the local
   option `HeaderFileExtensions` by the global option of the same name.
 
 - Improved :doc:`misc-const-correctness
   <clang-tidy/checks/misc/const-correctness>` check by avoiding infinite recursion
-  for recursive forwarding reference.
+  for recursive functions with forwarding reference parameters and reference
+  variables which refer to themselves.
 
 - Improved :doc:`misc-definitions-in-headers
   <clang-tidy/checks/misc/definitions-in-headers>` check by replacing the local
@@ -269,6 +288,10 @@ Changes in existing checks
   don't remove parentheses used in ``sizeof`` calls when they have array index
   accesses as arguments.
 
+- Improved :doc:`modernize-use-nullptr
+  <clang-tidy/checks/modernize/use-nullptr>` check to include support for C23,
+  which also has introduced the ``nullptr`` keyword.
+
 - Improved :doc:`modernize-use-override
   <clang-tidy/checks/modernize/use-override>` check to also remove any trailing
   whitespace when deleting the ``virtual`` keyword.
@@ -324,13 +347,9 @@ Miscellaneous
 ^^^^^^^^^^^^^
 
 - Fixed incorrect formatting in :program:`clang-apply-replacements` when no
-  ``--format`` option is specified. Now :program:`clang-apply-replacements`
+  `--format` option is specified. Now :program:`clang-apply-replacements`
   applies formatting only with the option.
 
-- Fixed the :doc:`linuxkernel-must-check-errs
-  <clang-tidy/checks/linuxkernel/must-check-errs>` documentation to consistently
-  use the check's proper name.
-
 Improvements to include-fixer
 -----------------------------
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/env33-c.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/env33-c.rst
index c5321b07f7f8..9271c9ecccc0 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cert/env33-c.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cert/env33-c.rst
@@ -10,4 +10,4 @@ but does not actually attempt to execute a command.
 
 This check corresponds to the CERT C Coding Standard rule
 `ENV33-C. Do not call system()
-<https://www.securecoding.cert.org/confluence/pages/viewpage.action?pageId=2130132>`_.
+<https://www.securecoding.cert.org/confluence/display/c/ENV33-C.+Do+not+call+system()>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index 5d9d487f75f9..49747ff896ba 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -276,6 +276,7 @@ Clang-Tidy Checks
    :doc:`modernize-macro-to-enum <modernize/macro-to-enum>`, "Yes"
    :doc:`modernize-make-shared <modernize/make-shared>`, "Yes"
    :doc:`modernize-make-unique <modernize/make-unique>`, "Yes"
+   :doc:`modernize-min-max-use-initializer-list <modernize/min-max-use-initializer-list>`, "Yes"
    :doc:`modernize-pass-by-value <modernize/pass-by-value>`, "Yes"
    :doc:`modernize-raw-string-literal <modernize/raw-string-literal>`, "Yes"
    :doc:`modernize-redundant-void-arg <modernize/redundant-void-arg>`, "Yes"
@@ -363,6 +364,7 @@ Clang-Tidy Checks
    :doc:`readability-isolate-declaration <readability/isolate-declaration>`, "Yes"
    :doc:`readability-magic-numbers <readability/magic-numbers>`,
    :doc:`readability-make-member-function-const <readability/make-member-function-const>`, "Yes"
+   :doc:`readability-math-missing-parentheses <readability/math-missing-parentheses>`, "Yes"
    :doc:`readability-misleading-indentation <readability/misleading-indentation>`,
    :doc:`readability-misplaced-array-index <readability/misplaced-array-index>`, "Yes"
    :doc:`readability-named-parameter <readability/named-parameter>`, "Yes"
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/min-max-use-initializer-list.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/min-max-use-initializer-list.rst
new file mode 100644
index 000000000000..d6721a25629b
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/min-max-use-initializer-list.rst
@@ -0,0 +1,50 @@
+.. title:: clang-tidy - modernize-min-max-use-initializer-list
+
+modernize-min-max-use-initializer-list
+======================================
+
+Replaces nested ``std::min`` and ``std::max`` calls with an initializer list 
+where applicable.
+
+For instance, consider the following code:
+
+.. code-block:: cpp
+
+  int a = std::max(std::max(i, j), k);
+
+The check will transform the above code to:
+
+.. code-block:: cpp
+
+  int a = std::max({i, j, k});
+
+Performance Considerations
+==========================
+
+While this check simplifies the code and makes it more readable, it may cause 
+performance degradation for non-trivial types due to the need to copy objects 
+into the initializer list.
+
+To avoid this, it is recommended to use `std::ref` or `std::cref` for
+non-trivial types:
+
+.. code-block:: cpp
+
+  std::string b = std::max({std::ref(i), std::ref(j), std::ref(k)});
+
+Options
+=======
+
+.. option:: IncludeStyle
+
+  A string specifying which include-style is used, `llvm` or `google`. Default
+  is `llvm`.
+
+.. option:: IgnoreNonTrivialTypes
+
+  A boolean specifying whether to ignore non-trivial types. Default is `true`.
+
+.. option:: IgnoreTrivialTypesOfSizeAbove
+
+  An integer specifying the size (in bytes) above which trivial types are
+  ignored. Default is `32`.
+\ No newline at end of file
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-nullptr.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-nullptr.rst
index 5e1ba858adf3..25e17fee0a3d 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-nullptr.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-nullptr.rst
@@ -4,7 +4,7 @@ modernize-use-nullptr
 =====================
 
 The check converts the usage of null pointer constants (e.g. ``NULL``, ``0``)
-to use the new C++11 ``nullptr`` keyword.
+to use the new C++11 and C23 ``nullptr`` keyword.
 
 Example
 -------
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/math-missing-parentheses.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/math-missing-parentheses.rst
new file mode 100644
index 000000000000..21d66daab334
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/math-missing-parentheses.rst
@@ -0,0 +1,27 @@
+.. title:: clang-tidy - readability-math-missing-parentheses
+
+readability-math-missing-parentheses
+====================================
+
+Check for missing parentheses in mathematical expressions that involve operators
+of different priorities.
+
+Parentheses in mathematical expressions clarify the order
+of operations, especially with different-priority operators. Lengthy or multiline
+expressions can obscure this order, leading to coding errors. IDEs can aid clarity
+by highlighting parentheses. Explicitly using parentheses also clarifies what the 
+developer had in mind when writing the expression. Ensuring their presence reduces
+ambiguity and errors, promoting clearer and more maintainable code.
+
+Before:
+
+.. code-block:: c++
+
+  int x = 1 + 2 * 3 - 4 / 5;
+
+
+After:
+
+.. code-block:: c++
+
+  int x = 1 + (2 * 3) - (4 / 5);
+\ No newline at end of file
diff --git a/clang-tools-extra/test/CMakeLists.txt b/clang-tools-extra/test/CMakeLists.txt
index f4c529ee8af2..7a1c168e22f9 100644
--- a/clang-tools-extra/test/CMakeLists.txt
+++ b/clang-tools-extra/test/CMakeLists.txt
@@ -67,33 +67,30 @@ foreach(dep ${LLVM_UTILS_DEPS})
   endif()
 endforeach()
 
-if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
-  if (NOT WIN32 OR NOT LLVM_LINK_LLVM_DYLIB)
-    llvm_add_library(
-        CTTestTidyModule
-        MODULE clang-tidy/CTTestTidyModule.cpp
-        PLUGIN_TOOL clang-tidy
-        DEPENDS clang-tidy-headers)
-  endif()
+if (NOT WIN32 OR NOT LLVM_LINK_LLVM_DYLIB)
+  llvm_add_library(
+      CTTestTidyModule
+      MODULE clang-tidy/CTTestTidyModule.cpp
+      PLUGIN_TOOL clang-tidy)
+endif()
 
-  if(CLANG_BUILT_STANDALONE)
-    # LLVMHello library is needed below
-    if (EXISTS ${LLVM_MAIN_SRC_DIR}/lib/Transforms/Hello
-       AND NOT TARGET LLVMHello)
-      add_subdirectory(${LLVM_MAIN_SRC_DIR}/lib/Transforms/Hello
-        lib/Transforms/Hello)
-    endif()
+if(CLANG_BUILT_STANDALONE)
+  # LLVMHello library is needed below
+  if (EXISTS ${LLVM_MAIN_SRC_DIR}/lib/Transforms/Hello
+      AND NOT TARGET LLVMHello)
+    add_subdirectory(${LLVM_MAIN_SRC_DIR}/lib/Transforms/Hello
+      lib/Transforms/Hello)
   endif()
+endif()
 
-  if(TARGET CTTestTidyModule)
-      list(APPEND CLANG_TOOLS_TEST_DEPS CTTestTidyModule LLVMHello)
-      target_include_directories(CTTestTidyModule PUBLIC BEFORE "${CLANG_TOOLS_SOURCE_DIR}")
-      if(CLANG_PLUGIN_SUPPORT AND (WIN32 OR CYGWIN))
-        set(LLVM_LINK_COMPONENTS
-          Support
-        )
-      endif()
-  endif()
+if(TARGET CTTestTidyModule)
+    list(APPEND CLANG_TOOLS_TEST_DEPS CTTestTidyModule LLVMHello)
+    target_include_directories(CTTestTidyModule PUBLIC BEFORE "${CLANG_TOOLS_SOURCE_DIR}")
+    if(CLANG_PLUGIN_SUPPORT AND (WIN32 OR CYGWIN))
+      set(LLVM_LINK_COMPONENTS
+        Support
+      )
+    endif()
 endif()
 
 add_lit_testsuite(check-clang-extra "Running clang-tools-extra/test"
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp
index 574efe7bd914..ae61b17ca14d 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp
@@ -309,6 +309,8 @@ struct HeapArray {                                          // Ok, since destruc
 
   HeapArray(HeapArray &&other) : _data(other._data), size(other.size) { // Ok
     other._data = nullptr;                                              // Ok
+    // CHECK-NOTES: [[@LINE-1]]:5: warning: expected assignment source to be of type 'gsl::owner<>'; got 'std::nullptr_t'
+    // FIXME: This warning is emitted because an ImplicitCastExpr for the NullToPointer conversion isn't created for dependent types.
     other.size = 0;
   }
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp
new file mode 100644
index 000000000000..51ab9bda975f
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp
@@ -0,0 +1,305 @@
+// RUN: %check_clang_tidy %s modernize-min-max-use-initializer-list %t
+
+// CHECK-FIXES: #include <algorithm>
+namespace utils {
+template <typename T>
+T max(T a, T b) {
+  return (a < b) ? b : a;
+}
+} // namespace utils
+
+namespace std {
+template< class T >
+struct initializer_list {
+  initializer_list()=default;
+  initializer_list(T*,int){}
+  const T* begin() const {return nullptr;}
+  const T* end() const {return nullptr;}
+};
+
+template<class ForwardIt>
+ForwardIt min_element(ForwardIt first, ForwardIt last)
+{
+    if (first == last)
+        return last;
+
+    ForwardIt smallest = first;
+
+    while (++first != last)
+        if (*first < *smallest)
+            smallest = first;
+
+    return smallest;
+}
+
+template<class ForwardIt, class Compare>
+ForwardIt min_element(ForwardIt first, ForwardIt last, Compare comp)
+{
+    if (first == last)
+        return last;
+
+    ForwardIt smallest = first;
+
+    while (++first != last)
+        if (comp(*first, *smallest))
+            smallest = first;
+
+    return smallest;
+}
+
+template<class ForwardIt>
+ForwardIt max_element(ForwardIt first, ForwardIt last)
+{
+    if (first == last)
+        return last;
+
+    ForwardIt largest = first;
+
+    while (++first != last)
+        if (*largest < *first)
+            largest = first;
+
+    return largest;
+}
+
+template<class ForwardIt, class Compare>
+ForwardIt max_element(ForwardIt first, ForwardIt last, Compare comp)
+{
+    if (first == last)
+        return last;
+
+    ForwardIt largest = first;
+
+    while(++first != last)
+        if (comp(*largest, *first))
+            largest = first;
+
+    return largest;
+}
+
+template< class T >
+const T& max( const T& a, const T& b ) {
+  return (a < b) ? b : a;
+};
+
+template< class T >
+T max(std::initializer_list<T> ilist)
+{
+    return *std::max_element(ilist.begin(), ilist.end());
+}
+
+template< class T, class Compare >
+const T& max( const T& a, const T& b, Compare comp ) {
+  return (comp(a, b)) ? b : a;
+};
+
+template< class T, class Compare >
+T max(std::initializer_list<T> ilist, Compare comp) {
+    return *std::max_element(ilist.begin(), ilist.end(), comp);
+};
+
+template< class T >
+const T& min( const T& a, const T& b ) {
+  return (b < a) ? b : a;
+};
+
+template< class T >
+T min(std::initializer_list<T> ilist)
+{
+    return *std::min_element(ilist.begin(), ilist.end());
+}
+
+
+template< class T, class Compare >
+const T& min( const T& a, const T& b, Compare comp ) {
+  return (comp(b, a)) ? b : a;
+};
+
+template< class T, class Compare >
+T min(std::initializer_list<T> ilist, Compare comp) {
+    return *std::min_element(ilist.begin(), ilist.end(), comp);
+};
+
+} // namespace std
+
+using namespace std;
+
+namespace {
+bool fless_than(int a, int b) {
+return a < b;
+}
+
+bool fgreater_than(int a, int b) {
+return a > b;
+}
+auto less_than = [](int a, int b) { return a < b; };
+auto greater_than = [](int a, int b) { return a > b; };
+
+int max1 = std::max(1, std::max(2, 3));
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int max1 = std::max({1, 2, 3});
+
+int min1 = std::min(1, std::min(2, 3));
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: do not use nested 'std::min' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int min1 = std::min({1, 2, 3});
+
+int max2 = std::max(1, std::max(2, std::max(3, 4)));
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int max2 = std::max({1, 2, 3, 4});
+
+int max2b = std::max(std::max(std::max(1, 2), std::max(3, 4)), std::max(std::max(5, 6), std::max(7, 8)));
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int max2b = std::max({1, 2, 3, 4, 5, 6, 7, 8});
+
+int max2c = std::max(std::max(1, std::max(2, 3)), 4);
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int max2c = std::max({1, 2, 3, 4});
+
+int max2d = std::max(std::max({1, 2, 3}), 4);
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int max2d = std::max({1, 2, 3, 4});
+
+
+int max2e = std::max(1, max(2, max(3, 4)));
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int max2e = std::max({1, 2, 3, 4});
+
+int min2 = std::min(1, std::min(2, std::min(3, 4)));
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: do not use nested 'std::min' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int min2 = std::min({1, 2, 3, 4});
+
+int max3 = std::max(std::max(4, 5), std::min(2, std::min(3, 1)));
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-MESSAGES: :[[@LINE-2]]:37: warning: do not use nested 'std::min' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int max3 = std::max({4, 5, std::min({2, 3, 1})});
+
+int min3 = std::min(std::min(4, 5), std::max(2, std::max(3, 1)));
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: do not use nested 'std::min' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-MESSAGES: :[[@LINE-2]]:37: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int min3 = std::min({4, 5, std::max({2, 3, 1})});
+
+int max4 = std::max(1, std::max(2, 3, greater_than), less_than);
+// CHECK-FIXES: int max4 = std::max(1, std::max(2, 3, greater_than), less_than);
+
+int min4 = std::min(1, std::min(2, 3, greater_than), less_than);
+// CHECK-FIXES: int min4 = std::min(1, std::min(2, 3, greater_than), less_than);
+
+int max5 = std::max(1, std::max(2, 3), less_than);
+// CHECK-FIXES: int max5 = std::max(1, std::max(2, 3), less_than);
+
+int min5 = std::min(1, std::min(2, 3), less_than);
+// CHECK-FIXES: int min5 = std::min(1, std::min(2, 3), less_than);
+
+int max6 = std::max(1, std::max(2, 3, greater_than), greater_than);
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int max6 = std::max({1, 2, 3 }, greater_than);
+
+int min6 = std::min(1, std::min(2, 3, greater_than), greater_than);
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: do not use nested 'std::min' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int min6 = std::min({1, 2, 3 }, greater_than);
+
+int max7 = std::max(1, std::max(2, 3, fless_than), fgreater_than);
+// CHECK-FIXES: int max7 = std::max(1, std::max(2, 3, fless_than), fgreater_than);
+
+int min7 = std::min(1, std::min(2, 3, fless_than), fgreater_than);
+// CHECK-FIXES: int min7 = std::min(1, std::min(2, 3, fless_than), fgreater_than);
+
+int max8 = std::max(1, std::max(2, 3, fless_than), less_than);
+// CHECK-FIXES: int max8 = std::max(1, std::max(2, 3, fless_than), less_than)
+
+int min8 = std::min(1, std::min(2, 3, fless_than), less_than);
+// CHECK-FIXES: int min8 = std::min(1, std::min(2, 3, fless_than), less_than);
+
+int max9 = std::max(1, std::max(2, 3, fless_than), fless_than);
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int max9 = std::max({1, 2, 3 }, fless_than);
+
+int min9 = std::min(1, std::min(2, 3, fless_than), fless_than);
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: do not use nested 'std::min' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int min9 = std::min({1, 2, 3 }, fless_than);
+
+int min10 = std::min(std::min(4, 5), std::max(2, utils::max(3, 1)));
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: do not use nested 'std::min' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int min10 = std::min({4, 5, std::max(2, utils::max(3, 1))});
+
+int max10 = std::max({std::max(1, 2), std::max({5, 6, 1}), 2, std::min({1, 2, 4})});
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int max10 = std::max({1, 2, 5, 6, 1, 2, std::min({1, 2, 4})});
+
+int typecastTest = std::max(std::max<int>(0U, 0.0f), 0);
+// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int typecastTest = std::max({static_cast<int>(0U), static_cast<int>(0.0f), 0});
+
+int typecastTest1 = std::max(std::max<long>(0U, 0.0f), 0L);
+// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int typecastTest1 = std::max({static_cast<long>(0U), static_cast<long>(0.0f), 0L});
+
+int typecastTest2 = std::max(std::max<int>(10U, 20.0f), 30);
+// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int typecastTest2 = std::max({static_cast<int>(10U), static_cast<int>(20.0f), 30});
+
+int typecastTest3 = std::max(std::max<int>(0U, std::max<int>(0.0f, 1.0f)), 0);
+// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int typecastTest3 = std::max({static_cast<int>(0U), static_cast<int>(0.0f), static_cast<int>(1.0f), 0});
+
+#define max3f(a, b, c) std::max(a, std::max(b, c))
+// CHECK-FIXES: #define max3f(a, b, c) std::max(a, std::max(b, c))
+
+#define value 4545
+int macroVarMax = std::max(value, std::max(1, 2));
+// CHECK-MESSAGES: :[[@LINE-1]]:19: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int macroVarMax = std::max({value, 1, 2});
+
+#define value2 45U
+int macroVarMax2 = std::max(1, std::max<int>(value2, 2.0f));
+// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: int macroVarMax2 = std::max({1, static_cast<int>(value2), static_cast<int>(2.0f)});
+
+// True-negative tests
+int maxTN1 = std::max(1, 2);
+// CHECK-FIXES: int maxTN1 = std::max(1, 2);
+
+int maxTN2 = std::max({1, 2, 3});
+// CHECK-FIXES: int maxTN2 = std::max({1, 2, 3});
+
+int maxTN3 = std::max({1, 2, 3}, less_than);
+// CHECK-FIXES: int maxTN3 = std::max({1, 2, 3}, less_than);
+
+// non-trivial types
+struct A {
+  int a;
+  A(int a) : a(a) {}
+  bool operator<(const A &rhs) const { return a < rhs.a; }
+};
+
+A maxNT1 = std::max(A(1), A(2));
+// CHECK-FIXES: A maxNT1 = std::max(A(1), A(2));
+
+A maxNT2 = std::max(A(1), std::max(A(2), A(3)));
+// CHECK-FIXES: A maxNT2 = std::max(A(1), std::max(A(2), A(3)));
+
+A maxNT3 = std::max(A(1), std::max(A(2), A(3)), [](const A &lhs, const A &rhs) { return lhs.a < rhs.a; });
+// CHECK-FIXES: A maxNT3 = std::max(A(1), std::max(A(2), A(3)), [](const A &lhs, const A &rhs) { return lhs.a < rhs.a; });
+
+// Trivial type with size greater than 32
+struct B {
+  // 9*4 = 36 bytes > 32 bytes
+  int a[9];
+
+  bool operator<(const B& rhs) const {
+    return a[0] < rhs.a[0];
+  }
+};
+
+B maxTT1 = std::max(B(), B());
+// CHECK-FIXES: B maxTT1 = std::max(B(), B());
+
+B maxTT2 = std::max(B(), std::max(B(), B()));
+// CHECK-FIXES: B maxTT2 = std::max(B(), std::max(B(), B()));
+
+B maxTT3 = std::max(B(), std::max(B(), B()), [](const B &lhs, const B &rhs) { return lhs.a[0] < rhs.a[0]; });
+// CHECK-FIXES: B maxTT3 = std::max(B(), std::max(B(), B()), [](const B &lhs, const B &rhs) { return lhs.a[0] < rhs.a[0]; });
+
+
+} // namespace
+
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp
index 559031cf4d9b..4abb9c855597 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp
@@ -260,6 +260,8 @@ template <class T>
 struct Template {
   Template() = default;
   Template(const Template &Other) : Field(Other.Field) {}
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use '= default'
+  // CHECK-FIXES: Template(const Template &Other)  = default;
   Template &operator=(const Template &Other);
   void foo(const T &t);
   int Field;
@@ -269,8 +271,12 @@ Template<T> &Template<T>::operator=(const Template<T> &Other) {
   Field = Other.Field;
   return *this;
 }
+// CHECK-MESSAGES: :[[@LINE-4]]:27: warning: use '= default'
+// CHECK-FIXES: Template<T> &Template<T>::operator=(const Template<T> &Other) = default;
+
 Template<int> T1;
 
+
 // Dependent types.
 template <class T>
 struct DT1 {
@@ -284,6 +290,9 @@ DT1<T> &DT1<T>::operator=(const DT1<T> &Other) {
   Field = Other.Field;
   return *this;
 }
+// CHECK-MESSAGES: :[[@LINE-4]]:17: warning: use '= default'
+// CHECK-FIXES: DT1<T> &DT1<T>::operator=(const DT1<T> &Other) = default;
+
 DT1<int> Dt1;
 
 template <class T>
@@ -303,6 +312,9 @@ DT2<T> &DT2<T>::operator=(const DT2<T> &Other) {
 struct T {
   typedef int TT;
 };
+// CHECK-MESSAGES: :[[@LINE-8]]:17: warning: use '= default'
+// CHECK-FIXES: DT2<T> &DT2<T>::operator=(const DT2<T> &Other) = default;
+
 DT2<T> Dt2;
 
 // Default arguments.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr-c23.c b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr-c23.c
new file mode 100644
index 000000000000..6fb879b91e41
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr-c23.c
@@ -0,0 +1,139 @@
+// RUN: %check_clang_tidy %s modernize-use-nullptr %t -- -- -std=c23
+
+#define NULL 0
+
+void test_assignment() {
+  int *p1 = 0;
+  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: use nullptr [modernize-use-nullptr]
+  // CHECK-FIXES: int *p1 = nullptr;
+  p1 = 0;
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: use nullptr
+  // CHECK-FIXES: p1 = nullptr;
+
+  int *p2 = NULL;
+  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: use nullptr
+  // CHECK-FIXES: int *p2 = nullptr;
+
+  p2 = p1;
+  // CHECK-FIXES: p2 = p1;
+
+  const int null = 0;
+  int *p3 = &null;
+
+  p3 = NULL;
+  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: use nullptr
+  // CHECK-FIXES: p3 = nullptr;
+
+  int *p4 = p3;
+
+  int i1 = 0;
+
+  int i2 = NULL;
+
+  int i3 = null;
+
+  int *p5, *p6, *p7;
+  p5 = p6 = p7 = NULL;
+  // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: use nullptr
+  // CHECK-FIXES: p5 = p6 = p7 = nullptr;
+}
+
+void test_function(int *p) {}
+
+void test_function_no_ptr_param(int i) {}
+
+void test_function_call() {
+  test_function(0);
+  // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: use nullptr
+  // CHECK-FIXES: test_function(nullptr);
+
+  test_function(NULL);
+  // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: use nullptr
+  // CHECK-FIXES: test_function(nullptr);
+
+  test_function_no_ptr_param(0);
+}
+
+char *test_function_return1() {
+  return 0;
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: use nullptr
+  // CHECK-FIXES: return nullptr;
+}
+
+void *test_function_return2() {
+  return NULL;
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: use nullptr
+  // CHECK-FIXES: return nullptr;
+}
+
+int test_function_return4() {
+  return 0;
+}
+
+int test_function_return5() {
+  return NULL;
+}
+
+int *test_function_return_cast1() {
+  return(int)0;
+  // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: use nullptr
+  // CHECK-FIXES: return nullptr;
+}
+
+int *test_function_return_cast2() {
+#define RET return
+  RET(int)0;
+  // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: use nullptr
+  // CHECK-FIXES: RET nullptr;
+#undef RET
+}
+
+// Test parentheses expressions resulting in a nullptr.
+int *test_parentheses_expression1() {
+  return(0);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: use nullptr
+  // CHECK-FIXES: return(nullptr);
+}
+
+int *test_parentheses_expression2() {
+  return((int)(0.0f));
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: use nullptr
+  // CHECK-FIXES: return(nullptr);
+}
+
+int *test_nested_parentheses_expression() {
+  return((((0))));
+  // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: use nullptr
+  // CHECK-FIXES: return((((nullptr))));
+}
+
+void test_const_pointers() {
+  const int *const_p1 = 0;
+  // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: use nullptr
+  // CHECK-FIXES: const int *const_p1 = nullptr;
+  const int *const_p2 = NULL;
+  // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: use nullptr
+  // CHECK-FIXES: const int *const_p2 = nullptr;
+  const int *const_p3 = (int)0;
+  // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: use nullptr
+  // CHECK-FIXES: const int *const_p3 = nullptr;
+  const int *const_p4 = (int)0.0f;
+  // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: use nullptr
+  // CHECK-FIXES: const int *const_p4 = nullptr;
+}
+
+void test_nested_implicit_cast_expr() {
+  int func0(void*, void*);
+  int func1(int, void*, void*);
+
+  (double)func1(0, 0, 0);
+  // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: use nullptr
+  // CHECK-MESSAGES: :[[@LINE-2]]:23: warning: use nullptr
+  // CHECK-FIXES: (double)func1(0, nullptr, nullptr);
+  (double)func1(func0(0, 0), 0, 0);
+  // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: use nullptr
+  // CHECK-MESSAGES: :[[@LINE-2]]:26: warning: use nullptr
+  // CHECK-MESSAGES: :[[@LINE-3]]:30: warning: use nullptr
+  // CHECK-MESSAGES: :[[@LINE-4]]:33: warning: use nullptr
+  // CHECK-FIXES: (double)func1(func0(nullptr, nullptr), nullptr, nullptr);
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.c b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.c
index c2ccbbd81171..1218b837199c 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.c
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.c
@@ -1,4 +1,4 @@
-// RUN: clang-tidy %s -checks=-*,modernize-use-nullptr -- | count 0
+// RUN: clang-tidy %s -checks=-*,modernize-use-nullptr -- -std=c17 | count 0
 
 // Note: this test expects no diagnostics, but FileCheck cannot handle that,
 // hence the use of | count 0.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/math-missing-parentheses.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/math-missing-parentheses.cpp
new file mode 100644
index 000000000000..a6045c079a48
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/math-missing-parentheses.cpp
@@ -0,0 +1,142 @@
+// RUN: %check_clang_tidy %s readability-math-missing-parentheses %t
+
+#define MACRO_AND &
+#define MACRO_ADD +
+#define MACRO_OR |
+#define MACRO_MULTIPLY *
+#define MACRO_XOR ^
+#define MACRO_SUBTRACT -
+#define MACRO_DIVIDE /
+
+int foo(){
+    return 5;
+}
+
+int bar(){
+    return 4;
+}
+
+int sink(int);
+#define FUN(ARG) (sink(ARG))
+#define FUN2(ARG) sink((ARG))
+#define FUN3(ARG) sink(ARG)
+#define FUN4(ARG) sink(1 + ARG)
+#define FUN5(ARG) sink(4 * ARG)
+
+class fun{
+public:
+    int A;
+    double B;
+    fun(){
+        A = 5;
+        B = 5.4;
+    }
+};
+
+void f(){
+    //CHECK-MESSAGES: :[[@LINE+2]]:17: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-FIXES: int a = 1 + (2 * 3);
+    int a = 1 + 2 * 3;
+
+    int a_negative = 1 + (2 * 3); // No warning
+
+    int b = 1 + 2 + 3; // No warning
+
+    int c = 1 * 2 * 3; // No warning
+
+    //CHECK-MESSAGES: :[[@LINE+3]]:17: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-MESSAGES: :[[@LINE+2]]:25: warning: '/' has higher precedence than '-'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-FIXES: int d = 1 + (2 * 3) - (4 / 5);
+    int d = 1 + 2 * 3 - 4 / 5;
+
+    int d_negative = 1 + (2 * 3) - (4 / 5); // No warning
+
+    //CHECK-MESSAGES: :[[@LINE+4]]:13: warning: '&' has higher precedence than '|'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-MESSAGES: :[[@LINE+3]]:17: warning: '+' has higher precedence than '&'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-MESSAGES: :[[@LINE+2]]:25: warning: '*' has higher precedence than '|'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-FIXES: int e = (1 & (2 + 3)) | (4 * 5);
+    int e = 1 & 2 + 3 | 4 * 5;
+
+    int e_negative = (1 & (2 + 3)) | (4 * 5); // No warning
+
+    //CHECK-MESSAGES: :[[@LINE+2]]:13: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-FIXES: int f = (1 * -2) + 4;
+    int f = 1 * -2 + 4;
+
+    int f_negative = (1 * -2) + 4; // No warning
+
+    //CHECK-MESSAGES: :[[@LINE+2]]:13: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-FIXES: int g = (1 * 2 * 3) + 4 + 5;
+    int g = 1 * 2 * 3 + 4 + 5;
+
+    int g_negative = (1 * 2 * 3) + 4 + 5; // No warning
+
+    //CHECK-MESSAGES: :[[@LINE+4]]:13: warning: '&' has higher precedence than '|'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-MESSAGES: :[[@LINE+3]]:19: warning: '+' has higher precedence than '&'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-MESSAGES: :[[@LINE+2]]:27: warning: '*' has higher precedence than '|'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-FIXES: int h = (120 & (2 + 3)) | (22 * 5);
+    int h = 120 & 2 + 3 | 22 * 5;
+
+    int h_negative = (120 & (2 + 3)) | (22 * 5); // No warning
+
+    int i = 1 & 2 & 3; // No warning
+
+    int j = 1 | 2 | 3; // No warning
+
+    int k = 1 ^ 2 ^ 3; // No warning
+
+    //CHECK-MESSAGES: :[[@LINE+2]]:13: warning: '+' has higher precedence than '^'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-FIXES: int l = (1 + 2) ^ 3;
+    int l = 1 + 2 ^ 3;
+
+    int l_negative = (1 + 2) ^ 3; // No warning
+
+    //CHECK-MESSAGES: :[[@LINE+2]]:13: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-FIXES: int m = (2 * foo()) + bar();
+    int m = 2 * foo() + bar();
+
+    int m_negative = (2 * foo()) + bar(); // No warning
+
+    //CHECK-MESSAGES: :[[@LINE+2]]:13: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-FIXES: int n = (1.05 * foo()) + double(bar());
+    int n = 1.05 * foo() + double(bar());
+
+    int n_negative = (1.05 * foo()) + double(bar()); // No warning
+
+    //CHECK-MESSAGES: :[[@LINE+3]]:17: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-FIXES: int o = 1 + (obj.A * 3) + obj.B;
+    fun obj;
+    int o = 1 + obj.A * 3 + obj.B;
+
+    int o_negative = 1 + (obj.A * 3) + obj.B; // No warning
+
+    //CHECK-MESSAGES: :[[@LINE+2]]:18: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-FIXES: int p = 1U + (2 * 3);
+    int p = 1U + 2 * 3;
+
+    int p_negative = 1U + (2 * 3); // No warning
+
+    //CHECK-MESSAGES: :[[@LINE+7]]:13: warning: '+' has higher precedence than '|'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-MESSAGES: :[[@LINE+6]]:25: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-MESSAGES: :[[@LINE+5]]:53: warning: '&' has higher precedence than '^'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-MESSAGES: :[[@LINE+4]]:53: warning: '^' has higher precedence than '|'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-MESSAGES: :[[@LINE+3]]:77: warning: '-' has higher precedence than '^'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-MESSAGES: :[[@LINE+2]]:94: warning: '/' has higher precedence than '-'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    //CHECK-FIXES: int q = (1 MACRO_ADD (2 MACRO_MULTIPLY 3)) MACRO_OR ((4 MACRO_AND 5) MACRO_XOR (6 MACRO_SUBTRACT (7 MACRO_DIVIDE 8)));
+    int q = 1 MACRO_ADD 2 MACRO_MULTIPLY 3 MACRO_OR 4 MACRO_AND 5 MACRO_XOR 6 MACRO_SUBTRACT 7 MACRO_DIVIDE 8; // No warning
+
+    //CHECK-MESSAGES: :[[@LINE+1]]:21: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    int r = FUN(0 + 1 * 2);
+
+    //CHECK-MESSAGES: :[[@LINE+1]]:22: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    int s = FUN2(0 + 1 * 2);
+
+    //CHECK-MESSAGES: :[[@LINE+1]]:22: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    int t = FUN3(0 + 1 * 2);
+
+    //CHECK-MESSAGES: :[[@LINE+1]]:18: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    int u = FUN4(1 * 2);
+
+    //CHECK-MESSAGES: :[[@LINE+1]]:13: warning: '*' has higher precedence than '+'; add parentheses to explicitly specify the order of operations [readability-math-missing-parentheses]
+    int v = FUN5(0 + 1);
+}
diff --git a/clang-tools-extra/test/lit.site.cfg.py.in b/clang-tools-extra/test/lit.site.cfg.py.in
index 4eb830a1baf1..e6503a4c097c 100644
--- a/clang-tools-extra/test/lit.site.cfg.py.in
+++ b/clang-tools-extra/test/lit.site.cfg.py.in
@@ -10,7 +10,7 @@ config.python_executable = "@Python3_EXECUTABLE@"
 config.target_triple = "@LLVM_TARGET_TRIPLE@"
 config.host_triple = "@LLVM_HOST_TRIPLE@"
 config.clang_tidy_staticanalyzer = @CLANG_TIDY_ENABLE_STATIC_ANALYZER@
-config.has_plugins = @CLANG_PLUGIN_SUPPORT@ & ~@LLVM_INSTALL_TOOLCHAIN_ONLY@
+config.has_plugins = @CLANG_PLUGIN_SUPPORT@
 # Support substitution of the tools and libs dirs with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
 config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
diff --git a/clang-tools-extra/test/pp-trace/pp-trace-pragma-general.cpp b/clang-tools-extra/test/pp-trace/pp-trace-pragma-general.cpp
index f01ebd1ec67d..b16ec56e321b 100644
--- a/clang-tools-extra/test/pp-trace/pp-trace-pragma-general.cpp
+++ b/clang-tools-extra/test/pp-trace/pp-trace-pragma-general.cpp
@@ -21,6 +21,12 @@ void foo() {
 
 // CHECK: ---
 // CHECK-NEXT: - Callback: PragmaDirective
+// CHECK-NEXT:   Loc: "<built-in>:{{.+}}:1"
+// CHECK-NEXT:   Introducer: PIK_HashPragma
+// CHECK-NEXT: - Callback: PragmaDirective
+// CHECK-NEXT:   Loc: "<built-in>:{{.+}}:1"
+// CHECK-NEXT:   Introducer: PIK_HashPragma
+// CHECK-NEXT: - Callback: PragmaDirective
 // CHECK-NEXT:   Loc: "{{.*}}{{[/\\]}}pp-trace-pragma-general.cpp:3:1"
 // CHECK-NEXT:   Introducer: PIK_HashPragma
 // CHECK-NEXT: - Callback: PragmaDiagnosticPush
diff --git a/clang-tools-extra/test/pp-trace/pp-trace-pragma-ms.cpp b/clang-tools-extra/test/pp-trace/pp-trace-pragma-ms.cpp
index 932b0eb93c90..f5bf9ac2b955 100644
--- a/clang-tools-extra/test/pp-trace/pp-trace-pragma-ms.cpp
+++ b/clang-tools-extra/test/pp-trace/pp-trace-pragma-ms.cpp
@@ -18,6 +18,12 @@
 
 // CHECK: ---
 // CHECK-NEXT: - Callback: PragmaDirective
+// CHECK-NEXT:   Loc: "<built-in>:{{.+}}:1"
+// CHECK-NEXT:   Introducer: PIK_HashPragma
+// CHECK-NEXT: - Callback: PragmaDirective
+// CHECK-NEXT:   Loc: "<built-in>:{{.+}}:1"
+// CHECK-NEXT:   Introducer: PIK_HashPragma
+// CHECK-NEXT: - Callback: PragmaDirective
 // CHECK-NEXT:   Loc: "{{.*}}{{[/\\]}}pp-trace-pragma-ms.cpp:3:1"
 // CHECK-NEXT:   Introducer: PIK_HashPragma
 // CHECK-NEXT: - Callback: PragmaComment
@@ -67,7 +73,7 @@
 // CHECK-NEXT:   Introducer: PIK_HashPragma
 // CHECK-NEXT: - Callback: PragmaMessage
 // CHECK-NEXT:   Loc: "{{.*}}{{[/\\]}}pp-trace-pragma-ms.cpp:13:9"
-// CHECK-NEXT:   Namespace: 
+// CHECK-NEXT:   Namespace:
 // CHECK-NEXT:   Kind: PMK_Message
 // CHECK-NEXT:   Str: message argument
 // CHECK-NEXT: - Callback: PragmaDirective
diff --git a/clang-tools-extra/test/pp-trace/pp-trace-pragma-opencl.cpp b/clang-tools-extra/test/pp-trace/pp-trace-pragma-opencl.cpp
index 31f61027994f..ed33d37eb3d5 100644
--- a/clang-tools-extra/test/pp-trace/pp-trace-pragma-opencl.cpp
+++ b/clang-tools-extra/test/pp-trace/pp-trace-pragma-opencl.cpp
@@ -6,6 +6,12 @@
 
 // CHECK: ---
 // CHECK-NEXT: - Callback: PragmaDirective
+// CHECK-NEXT:   Loc: "<built-in>:{{.+}}:1"
+// CHECK-NEXT:   Introducer: PIK_HashPragma
+// CHECK-NEXT: - Callback: PragmaDirective
+// CHECK-NEXT:   Loc: "<built-in>:{{.+}}:1"
+// CHECK-NEXT:   Introducer: PIK_HashPragma
+// CHECK-NEXT: - Callback: PragmaDirective
 // CHECK-NEXT:   Loc: "{{.*}}{{[/\\]}}pp-trace-pragma-opencl.cpp:3:1"
 // CHECK-NEXT:   Introducer: PIK_HashPragma
 // CHECK-NEXT: - Callback: PragmaOpenCLExtension
diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index f092766fa19f..cf97e3c6e851 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -166,6 +166,10 @@ if(CLANG_ENABLE_LIBXML2)
 endif()
 
 if(CLANG_ENABLE_CIR)
+  if (CLANG_BUILT_STANDALONE)
+    message(FATAL_ERROR
+      "ClangIR is not yet supported in the standalone build.")
+  endif()
   if (NOT "${LLVM_ENABLE_PROJECTS}" MATCHES "MLIR|mlir")
     message(FATAL_ERROR
       "Cannot build ClangIR without MLIR in LLVM_ENABLE_PROJECTS")
diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake
index fa972636553f..c0bfcbdfc1c2 100644
--- a/clang/cmake/caches/Release.cmake
+++ b/clang/cmake/caches/Release.cmake
@@ -1,95 +1,94 @@
 # Plain options configure the first build.
 # BOOTSTRAP_* options configure the second build.
 # BOOTSTRAP_BOOTSTRAP_* options configure the third build.
+# PGO Builds have 3 stages (stage1, stage2-instrumented, stage2)
+# non-PGO Builds have 2 stages (stage1, stage2)
 
-# General Options
+
+function (set_final_stage_var name value type)
+  if (LLVM_RELEASE_ENABLE_PGO)
+    set(BOOTSTRAP_BOOTSTRAP_${name} ${value} CACHE ${type} "")
+  else()
+    set(BOOTSTRAP_${name} ${value} CACHE ${type} "")
+  endif()
+endfunction()
+
+function (set_instrument_and_final_stage_var name value type)
+  # This sets the varaible for the final stage in non-PGO builds and in
+  # the stage2-instrumented stage for PGO builds.
+  set(BOOTSTRAP_${name} ${value} CACHE ${type} "")
+  if (LLVM_RELEASE_ENABLE_PGO)
+    # Set the variable in the final stage for PGO builds.
+    set(BOOTSTRAP_BOOTSTRAP_${name} ${value} CACHE ${type} "")
+  endif()
+endfunction()
+
+# General Options:
+# If you want to override any of the LLVM_RELEASE_* variables you can set them
+# on the command line via -D, but you need to do this before you pass this
+# cache file to CMake via -C. e.g.
+#
+# cmake -D LLVM_RELEASE_ENABLE_PGO=ON -C Release.cmake
 set(LLVM_RELEASE_ENABLE_LTO THIN CACHE STRING "")
 set(LLVM_RELEASE_ENABLE_PGO OFF CACHE BOOL "")
-
+set(LLVM_RELEASE_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "")
+set(LLVM_RELEASE_ENABLE_PROJECTS "clang;lld;lldb;clang-tools-extra;bolt;polly;mlir;flang" CACHE STRING "")
+# Note we don't need to add install here, since it is one of the pre-defined
+# steps.
+set(LLVM_RELEASE_FINAL_STAGE_TARGETS "clang;package;check-all;check-llvm;check-clang" CACHE STRING "")
 set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "")
 
-# Stage 1 Bootstrap Setup
+# Stage 1 Options
+set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "")
 set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "")
+
+set(STAGE1_PROJECTS "clang")
+set(STAGE1_RUNTIMES "")
+
 if (LLVM_RELEASE_ENABLE_PGO)
+  list(APPEND STAGE1_PROJECTS "lld")
+  list(APPEND STAGE1_RUNTIMES "compiler-rt")
   set(CLANG_BOOTSTRAP_TARGETS
     generate-profdata
-    stage2
     stage2-package
     stage2-clang
-    stage2-distribution
     stage2-install
-    stage2-install-distribution
-    stage2-install-distribution-toolchain
     stage2-check-all
     stage2-check-llvm
-    stage2-check-clang
-    stage2-test-suite CACHE STRING "")
-else()
-  set(CLANG_BOOTSTRAP_TARGETS
-    clang
-    check-all
-    check-llvm
-    check-clang
-    test-suite
-    stage3
-    stage3-clang
-    stage3-check-all
-    stage3-check-llvm
-    stage3-check-clang
-    stage3-install
-    stage3-test-suite CACHE STRING "")
-endif()
+    stage2-check-clang CACHE STRING "")
 
-# Stage 1 Options
-set(STAGE1_PROJECTS "clang")
-set(STAGE1_RUNTIMES "")
+  # Configuration for stage2-instrumented
+  set(BOOTSTRAP_CLANG_ENABLE_BOOTSTRAP ON CACHE STRING "")
+  # This enables the build targets for the final stage which is called stage2.
+  set(BOOTSTRAP_CLANG_BOOTSTRAP_TARGETS ${LLVM_RELEASE_FINAL_STAGE_TARGETS} CACHE STRING "")
+  set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED IR CACHE STRING "")
+  set(BOOTSTRAP_LLVM_ENABLE_RUNTIMES "compiler-rt" CACHE STRING "")
+  set(BOOTSTRAP_LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "")
 
-if (LLVM_RELEASE_ENABLE_PGO)
-  list(APPEND STAGE1_PROJECTS "lld")
-  list(APPEND STAGE1_RUNTIMES "compiler-rt")
+else()
+  if (LLVM_RELEASE_ENABLE_LTO)
+    list(APPEND STAGE1_PROJECTS "lld")
+  endif()
+  # Any targets added here will be given the target name stage2-${target}, so
+  # if you want to run them you can just use:
+  # ninja -C $BUILDDIR stage2-${target}
+  set(CLANG_BOOTSTRAP_TARGETS ${LLVM_RELEASE_FINAL_STAGE_TARGETS} CACHE STRING "")
 endif()
 
+# Stage 1 Common Config
 set(LLVM_ENABLE_RUNTIMES ${STAGE1_RUNTIMES} CACHE STRING "")
 set(LLVM_ENABLE_PROJECTS ${STAGE1_PROJECTS} CACHE STRING "")
 
-set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "")
-
-# Stage 2 Bootstrap Setup
-set(BOOTSTRAP_CLANG_ENABLE_BOOTSTRAP ON CACHE STRING "")
-set(BOOTSTRAP_CLANG_BOOTSTRAP_TARGETS
-  clang
-  package
-  check-all
-  check-llvm
-  check-clang CACHE STRING "")
-
-# Stage 2 Options
-set(STAGE2_PROJECTS "clang")
-set(STAGE2_RUNTIMES "")
-
-if (LLVM_RELEASE_ENABLE_LTO OR LLVM_RELEASE_ENABLE_PGO)
- list(APPEND STAGE2_PROJECTS "lld")
-endif()
-
-if (LLVM_RELEASE_ENABLE_PGO)
-  set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED IR CACHE STRING "")
-  list(APPEND STAGE2_RUNTIMES "compiler-rt")
-  set(BOOTSTRAP_LLVM_ENABLE_LTO ${LLVM_RELEASE_ENABLE_LTO})
-  if (LLVM_RELEASE_ENABLE_LTO)
-    set(BOOTSTRAP_LLVM_ENABLE_LLD ON CACHE BOOL "")
-  endif()
+# stage2-instrumented and Final Stage Config:
+# Options that need to be set in both the instrumented stage (if we are doing
+# a pgo build) and the final stage.
+set_instrument_and_final_stage_var(CMAKE_POSITION_INDEPENDENT_CODE "ON" STRING)
+set_instrument_and_final_stage_var(LLVM_ENABLE_LTO "${LLVM_RELEASE_ENABLE_LTO}" STRING)
+if (LLVM_RELEASE_ENABLE_LTO)
+  set_instrument_and_final_stage_var(LLVM_ENABLE_LLD "ON" BOOL)
 endif()
 
-set(BOOTSTRAP_LLVM_ENABLE_PROJECTS ${STAGE2_PROJECTS} CACHE STRING "")
-set(BOOTSTRAP_LLVM_ENABLE_RUNTIMES ${STAGE2_RUNTIMES} CACHE STRING "")
-if (NOT LLVM_RELEASE_ENABLE_PGO)
-  set(BOOTSTRAP_LLVM_TARGETS_TO_BUILD Native CACHE STRING "")
-endif()
+# Final Stage Config (stage2)
+set_final_stage_var(LLVM_ENABLE_RUNTIMES "${LLVM_RELEASE_ENABLE_RUNTIMES}" STRING)
+set_final_stage_var(LLVM_ENABLE_PROJECTS "${LLVM_RELEASE_ENABLE_PROJECTS}" STRING)
 
-# Stage 3 Options
-set(BOOTSTRAP_BOOTSTRAP_LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "")
-set(BOOTSTRAP_BOOTSTRAP_LLVM_ENABLE_PROJECTS "clang;lld;lldb;clang-tools-extra;bolt;polly;mlir;flang" CACHE STRING "")
-set(BOOTSTRAP_BOOTSTRAP_LLVM_ENABLE_LTO ${LLVM_RELEASE_ENABLE_LTO} CACHE STRING "")
-if (LLVM_RELEASE_ENABLE_LTO)
-  set(BOOTSTRAP_BOOTSTRAP_LLVM_ENABLE_LLD ON CACHE BOOL "")
-endif()
diff --git a/clang/cmake/caches/VectorEngine.cmake b/clang/cmake/caches/VectorEngine.cmake
index e3976f3206db..2f968a21cc40 100644
--- a/clang/cmake/caches/VectorEngine.cmake
+++ b/clang/cmake/caches/VectorEngine.cmake
@@ -40,6 +40,7 @@ set(RUNTIMES_x86_64-unknown-linux-gnu_COMPILER_RT_BUILD_CRT OFF CACHE BOOL "")
 set(RUNTIMES_x86_64-unknown-linux-gnu_COMPILER_RT_BUILD_SANITIZERS OFF CACHE BOOL "")
 set(RUNTIMES_x86_64-unknown-linux-gnu_COMPILER_RT_BUILD_XRAY OFF CACHE BOOL "")
 set(RUNTIMES_x86_64-unknown-linux-gnu_COMPILER_RT_BUILD_LIBFUZZER OFF CACHE BOOL "")
+set(RUNTIMES_x86_64-unknown-linux-gnu_COMPILER_RT_BUILD_CTX_PROFILE OFF CACHE BOOL "")
 set(RUNTIMES_x86_64-unknown-linux-gnu_COMPILER_RT_BUILD_PROFILE OFF CACHE BOOL "")
 set(RUNTIMES_x86_64-unknown-linux-gnu_COMPILER_RT_BUILD_MEMPROF OFF CACHE BOOL "")
 set(RUNTIMES_x86_64-unknown-linux-gnu_COMPILER_RT_BUILD_ORC OFF CACHE BOOL "")
@@ -52,6 +53,7 @@ set(RUNTIMES_ve-unknown-linux-gnu_COMPILER_RT_BUILD_SANITIZERS OFF CACHE BOOL ""
 set(RUNTIMES_ve-unknown-linux-gnu_COMPILER_RT_BUILD_XRAY OFF CACHE BOOL "")
 set(RUNTIMES_ve-unknown-linux-gnu_COMPILER_RT_BUILD_LIBFUZZER OFF CACHE BOOL "")
 set(RUNTIMES_ve-unknown-linux-gnu_COMPILER_RT_BUILD_PROFILE ON CACHE BOOL "")
+set(RUNTIMES_ve-unknown-linux-gnu_COMPILER_RT_BUILD_CTX_PROFILE OFF CACHE BOOL "")
 set(RUNTIMES_ve-unknown-linux-gnu_COMPILER_RT_BUILD_MEMPROF OFF CACHE BOOL "")
 set(RUNTIMES_ve-unknown-linux-gnu_COMPILER_RT_BUILD_ORC OFF CACHE BOOL "")
 set(RUNTIMES_ve-unknown-linux-gnu_COMPILER_RT_BUILD_GWP_ASAN OFF CACHE BOOL "")
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 84fc4dee02fa..c2e90f4e7d58 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -711,6 +711,8 @@ even-odd element pair with indices ``i * 2`` and ``i * 2 + 1`` with
 power of 2, the vector is widened with neutral elements for the reduction
 at the end to the next power of 2.
 
+These reductions support both fixed-sized and scalable vector types.
+
 Example:
 
 .. code-block:: c++
@@ -1493,6 +1495,7 @@ Conditional ``explicit``                     __cpp_conditional_explicit       C+
 ``if consteval``                             __cpp_if_consteval               C++23         C++20
 ``static operator()``                        __cpp_static_call_operator       C++23         C++03
 Attributes on Lambda-Expressions                                              C++23         C++11
+Attributes on Structured Bindings            __cpp_structured_bindings        C++26         C++03
 ``= delete ("should have a reason");``       __cpp_deleted_function           C++26         C++03
 -------------------------------------------- -------------------------------- ------------- -------------
 Designated initializers (N494)                                                C99           C89
@@ -2928,7 +2931,7 @@ Query for this feature with ``__has_builtin(__builtin_dump_struct)``
 ``__builtin_shufflevector`` is used to express generic vector
 permutation/shuffle/swizzle operations.  This builtin is also very important
 for the implementation of various target-specific header files like
-``<xmmintrin.h>``.
+``<xmmintrin.h>``. This builtin can be used within constant expressions.
 
 **Syntax**:
 
@@ -2955,7 +2958,7 @@ for the implementation of various target-specific header files like
   // Concatenate every other element of 8-element vectors V1 and V2.
   __builtin_shufflevector(V1, V2, 0, 2, 4, 6, 8, 10, 12, 14)
 
-  // Shuffle v1 with some elements being undefined
+  // Shuffle v1 with some elements being undefined. Not allowed in constexpr.
   __builtin_shufflevector(v1, v1, 3, -1, 1, -1)
 
 **Description**:
@@ -2968,6 +2971,7 @@ starting with the first vector, continuing into the second vector.  Thus, if
 ``vec1`` is a 4-element vector, index 5 would refer to the second element of
 ``vec2``. An index of -1 can be used to indicate that the corresponding element
 in the returned vector is a don't care and can be optimized by the backend.
+Values of -1 are not supported in constant expressions.
 
 The result of ``__builtin_shufflevector`` is a vector with the same element
 type as ``vec1``/``vec2`` but that has an element count equal to the number of
@@ -2982,7 +2986,8 @@ Query for this feature with ``__has_builtin(__builtin_shufflevector)``.
 
 ``__builtin_convertvector`` is used to express generic vector
 type-conversion operations. The input vector and the output vector
-type must have the same number of elements.
+type must have the same number of elements. This builtin can be used within
+constant expressions.
 
 **Syntax**:
 
@@ -5572,3 +5577,25 @@ but the expression has no runtime effects.
 Type- and value-dependent expressions are not supported yet.
 
 This facility is designed to aid with testing name lookup machinery.
+
+Predefined Macros
+=================
+
+`__GCC_DESTRUCTIVE_SIZE` and `__GCC_CONSTRUCTIVE_SIZE`
+------------------------------------------------------
+Specify the mimum offset between two objects to avoid false sharing and the
+maximum size of contiguous memory to promote true sharing, respectively. These
+macros are predefined in all C and C++ language modes, but can be redefined on
+the command line with ``-D`` to specify different values as needed or can be
+undefined on the command line with ``-U`` to disable support for the feature.
+
+**Note: the values the macros expand to are not guaranteed to be stable. They
+are are affected by architectures and CPU tuning flags, can change between
+releases of Clang and will not match the values defined by other compilers such
+as GCC.**
+
+Compiling different TUs depending on these flags (including use of
+``std::hardware_constructive_interference`` or
+``std::hardware_destructive_interference``)  with different compilers, macro
+definitions, or architecture flags will lead to ODR violations and should be
+avoided.
+\ No newline at end of file
diff --git a/clang/docs/LibTooling.rst b/clang/docs/LibTooling.rst
index df50dcebf9b8..87d84321ab28 100644
--- a/clang/docs/LibTooling.rst
+++ b/clang/docs/LibTooling.rst
@@ -63,15 +63,22 @@ and automatic location of the compilation database using source files paths.
   #include "llvm/Support/CommandLine.h"
 
   using namespace clang::tooling;
+  using namespace llvm;
 
   // Apply a custom category to all command-line options so that they are the
   // only ones displayed.
-  static llvm::cl::OptionCategory MyToolCategory("my-tool options");
+  static cl::OptionCategory MyToolCategory("my-tool options");
 
   int main(int argc, const char **argv) {
-    // CommonOptionsParser constructor will parse arguments and create a
-    // CompilationDatabase.  In case of error it will terminate the program.
-    CommonOptionsParser OptionsParser(argc, argv, MyToolCategory);
+    // CommonOptionsParser::create will parse arguments and create a
+    // CompilationDatabase.
+    auto ExpectedParser = CommonOptionsParser::create(argc, argv, MyToolCategory);
+    if (!ExpectedParser) {
+      // Fail gracefully for unsupported options.
+      llvm::errs() << ExpectedParser.takeError();
+      return 1;
+    }
+    CommonOptionsParser& OptionsParser = ExpectedParser.get();
 
     // Use OptionsParser.getCompilations() and OptionsParser.getSourcePathList()
     // to retrieve CompilationDatabase and the list of input file paths.
@@ -133,7 +140,12 @@ version of this example tool is also checked into the clang tree at
   static cl::extrahelp MoreHelp("\nMore help text...\n");
 
   int main(int argc, const char **argv) {
-    CommonOptionsParser OptionsParser(argc, argv, MyToolCategory);
+    auto ExpectedParser = CommonOptionsParser::create(argc, argv, MyToolCategory);
+    if (!ExpectedParser) {
+      llvm::errs() << ExpectedParser.takeError();
+      return 1;
+    }
+    CommonOptionsParser& OptionsParser = ExpectedParser.get();
     ClangTool Tool(OptionsParser.getCompilations(),
                    OptionsParser.getSourcePathList());
     return Tool.run(newFrontendActionFactory<clang::SyntaxOnlyAction>().get());
diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index f8146bc365e8..5e63b2c0f0be 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -310,7 +310,9 @@ implementation.
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
 | misc                         | dispatch construct and function variant argument adjustment  | :part:`worked on`        | D99537, D99679                                                        |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc                         | assume and assumes directives                                | :part:`worked on`        |                                                                       |
+| misc                         | assumes directives                                           | :part:`worked on`        |                                                                       |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc                         | assume directive                                             | :part:`worked on`        |                                                                       |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
 | misc                         | nothing directive                                            | :good:`done`             | D123286                                                               |
 +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7bea43ec64f0..2c5308fbcb31 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -63,6 +63,12 @@ ABI Changes in This Version
   MSVC uses a different mangling for these objects, compatibility is not affected.
   (#GH85423).
 
+- Fixed Microsoft calling convention for returning certain classes with a
+  templated constructor. If a class has a templated constructor, it should
+  be returned indirectly even if it meets all the other requirements for
+  returning a class in a register. This affects some uses of std::pair.
+  (#GH86384).
+
 AST Dumping Potentially Breaking Changes
 ----------------------------------------
 
@@ -90,6 +96,18 @@ C++ Language Changes
 --------------------
 - Implemented ``_BitInt`` literal suffixes ``__wb`` or ``__WB`` as a Clang extension with ``unsigned`` modifiers also allowed. (#GH85223).
 
+C++17 Feature Support
+^^^^^^^^^^^^^^^^^^^^^
+- Clang now exposes ``__GCC_DESTRUCTIVE_SIZE`` and ``__GCC_CONSTRUCTIVE_SIZE``
+  predefined macros to support standard library implementations of
+  ``std::hardware_destructive_interference_size`` and
+  ``std::hardware_constructive_interference_size``, respectively. These macros
+  are predefined in all C and C++ language modes. The values the macros
+  expand to are not stable between releases of Clang and do not need to match
+  the values produced by GCC, so these macros should not be used from header
+  files because they may not be stable across multiple TUs (the values may vary
+  based on compiler version as well as CPU tuning). #GH60174
+
 C++20 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -131,6 +149,9 @@ C++2c Feature Support
 
 - Implemented `P2573R2: = delete("should have a reason"); <https://wg21.link/P2573R2>`_
 
+- Implemented `P0609R3: Attributes for Structured Bindings <https://wg21.link/P0609R3>`_
+
+- Implemented `P2748R5 Disallow Binding a Returned Glvalue to a Temporary <https://wg21.link/P2748R5>`_.
 
 Resolutions to C++ Defect Reports
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -207,6 +228,20 @@ Non-comprehensive list of changes in this release
 - ``__typeof_unqual__`` is available in all C modes as an extension, which behaves
   like ``typeof_unqual`` from C23, similar to ``__typeof__`` and ``typeof``.
 
+- ``__builtin_reduce_{add|mul|xor|or|and|min|max}`` builtins now support scalable vectors.
+
+* Shared libraries linked with either the ``-ffast-math``, ``-Ofast``, or
+  ``-funsafe-math-optimizations`` flags will no longer enable flush-to-zero
+  floating-point mode by default. This decision can be overridden with use of
+  ``-mdaz-ftz``. This behavior now matches GCC's behavior.
+  (`#57589 <https://github.com/llvm/llvm-project/issues/57589>`_)
+
+* ``-fdenormal-fp-math=preserve-sign`` is no longer implied by ``-ffast-math``
+  on x86 systems.
+
+- Builtins ``__builtin_shufflevector()`` and ``__builtin_convertvector()`` may
+  now be used within constant expressions.
+
 New Compiler Flags
 ------------------
 - ``-fsanitize=implicit-bitfield-conversion`` checks implicit truncation and
@@ -221,6 +256,10 @@ New Compiler Flags
 - ``-fexperimental-modules-reduced-bmi`` enables the Reduced BMI for C++20 named modules.
   See the document of standard C++ modules for details.
 
+- ``-fexperimental-late-parse-attributes`` enables an experimental feature to
+  allow late parsing certain attributes in specific contexts where they would
+  not normally be late parsed.
+
 Deprecated Compiler Flags
 -------------------------
 
@@ -375,6 +414,18 @@ Improvements to Clang's diagnostics
 
 - Clang now diagnoses requires expressions with explicit object parameters.
 
+- Clang now looks up members of the current instantiation in the template definition context
+  if the current instantiation has no dependent base classes.
+
+  .. code-block:: c++
+
+     template<typename T>
+     struct A {
+       int f() {
+         return this->x; // error: no member named 'x' in 'A<T>'
+       }
+     };
+
 Improvements to Clang's time-trace
 ----------------------------------
 
@@ -421,6 +472,9 @@ Bug Fixes in This Version
 - Clang now correctly generates overloads for bit-precise integer types for
   builtin operators in C++. Fixes #GH82998.
 
+- Fix crash when destructor definition is preceded with an equals sign.
+  Fixes (#GH89544).
+
 - When performing mixed arithmetic between ``_Complex`` floating-point types and integers,
   Clang now correctly promotes the integer to its corresponding real floating-point
   type only rather than to the complex type (e.g. ``_Complex float / int`` is now evaluated
@@ -436,6 +490,10 @@ Bug Fixes in This Version
 
 - Fixed an assertion failure on invalid InitListExpr in C89 mode (#GH88008).
 
+- Fixed missing destructor calls when we branch from middle of an expression.
+  This could happen through a branch in stmt-expr or in an expression containing a coroutine
+  suspension. Fixes (#GH63818) (#GH88478).
+
 - Clang will no longer diagnose an erroneous non-dependent ``switch`` condition
   during instantiation, and instead will only diagnose it once, during checking
   of the function template.
@@ -564,6 +622,14 @@ Bug Fixes to C++ Support
 - Fixed a crash when trying to evaluate a user-defined ``static_assert`` message whose ``size()``
   function returns a large or negative value. Fixes (#GH89407).
 - Fixed a use-after-free bug in parsing of type constraints with default arguments that involve lambdas. (#GH67235)
+- Fixed bug in which the body of a consteval lambda within a template was not parsed as within an
+  immediate function context.
+- Fix CTAD for ``std::initializer_list``. This allows ``std::initializer_list{1, 2, 3}`` to be deduced as
+  ``std::initializer_list<int>`` as intended.
+- Fix a bug on template partial specialization whose template parameter is `decltype(auto)`.
+- Fix a bug on template partial specialization with issue on deduction of nontype template parameter
+  whose type is `decltype(auto)`. Fixes (#GH68885).
+- Clang now correctly treats the noexcept-specifier of a friend function to be a complete-class context.
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -616,6 +682,10 @@ Arm and AArch64 Support
     * Arm Cortex-A78AE (cortex-a78ae).
     * Arm Cortex-A520AE (cortex-a520ae).
     * Arm Cortex-A720AE (cortex-a720ae).
+    * Arm Cortex-R82AE (cortex-r82ae).
+    * Arm Neoverse-N3 (neoverse-n3).
+    * Arm Neoverse-V3 (neoverse-v3).
+    * Arm Neoverse-V3AE (neoverse-v3ae).
 
 Android Support
 ^^^^^^^^^^^^^^^
@@ -672,6 +742,11 @@ AIX Support
 WebAssembly Support
 ^^^^^^^^^^^^^^^^^^^
 
+The -mcpu=generic configuration now enables multivalue and reference-types.These
+proposals are standardized and available in all major engines. Enabling
+multivalue here only enables the language feature but does not turn on the
+multivalue ABI (this enables non-ABI uses of multivalue, like exnref).
+
 AVR Support
 ^^^^^^^^^^^
 
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index 8df40566fcba..370de7d9c769 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -1452,8 +1452,6 @@ floating point semantic models: precise (the default), strict, and fast.
   "fenv_access", "off", "on", "off"
   "rounding_mode", "tonearest", "dynamic", "tonearest"
   "contract", "on", "off", "fast"
-  "denormal_fp_math", "IEEE", "IEEE", "IEEE"
-  "denormal_fp32_math", "IEEE","IEEE", "IEEE"
   "support_math_errno", "on", "on", "off"
   "no_honor_nans", "off", "off", "on"
   "no_honor_infinities", "off", "off", "on"
@@ -1462,6 +1460,14 @@ floating point semantic models: precise (the default), strict, and fast.
   "allow_approximate_fns", "off", "off", "on"
   "allow_reassociation", "off", "off", "on"
 
+The ``-ffp-model`` option does not modify the ``fdenormal-fp-math``
+setting, but it does have an impact on whether ``crtfastmath.o`` is
+linked. Because linking ``crtfastmath.o`` has a global effect on the
+program, and because the global denormal handling can be changed in
+other ways, the state of ``fdenormal-fp-math`` handling cannot
+be assumed in any function based on fp-model. See :ref:`crtfastmath.o`
+for more details.
+
 .. option:: -ffast-math
 
    Enable fast-math mode.  This option lets the
@@ -1506,7 +1512,8 @@ floating point semantic models: precise (the default), strict, and fast.
 
    * ``-ffp-contract=fast``
 
-   Note: ``-ffast-math`` causes ``crtfastmath.o`` to be linked with code. See
+   Note: ``-ffast-math`` causes ``crtfastmath.o`` to be linked with code unless
+   ``-shared`` or ``-mno-daz-ftz`` is present. See
    :ref:`crtfastmath.o` for more details.
 
 .. option:: -fno-fast-math
@@ -1536,7 +1543,6 @@ floating point semantic models: precise (the default), strict, and fast.
    Also, this option resets following options to their target-dependent defaults.
 
    * ``-f[no-]math-errno``
-   * ``-fdenormal-fp-math=<value>``
 
    There is ambiguity about how ``-ffp-contract``, ``-ffast-math``,
    and ``-fno-fast-math`` behave when combined. To keep the value of
@@ -1559,8 +1565,8 @@ floating point semantic models: precise (the default), strict, and fast.
      ``-ffp-contract`` setting is determined by the default value of
      ``-ffp-contract``.
 
-   Note: ``-fno-fast-math`` implies ``-fdenormal-fp-math=ieee``.
-   ``-fno-fast-math`` causes ``crtfastmath.o`` to not be linked with code.
+   Note: ``-fno-fast-math`` causes ``crtfastmath.o`` to not be linked with code
+   unless ``-mdaz-ftz`` is present.
 
 .. option:: -fdenormal-fp-math=<value>
 
@@ -1690,9 +1696,7 @@ floating point semantic models: precise (the default), strict, and fast.
    * ``-fno-associative-math``
    * ``-fno-reciprocal-math``
    * ``-fsigned-zeros``
-   * ``-ftrapping-math``
    * ``-ffp-contract=on``
-   * ``-fdenormal-fp-math=ieee``
 
    There is ambiguity about how ``-ffp-contract``,
    ``-funsafe-math-optimizations``, and ``-fno-unsafe-math-optimizations``
@@ -1938,10 +1942,13 @@ by using ``#pragma STDC FENV_ROUND`` with a value other than ``FE_DYNAMIC``.
 
 A note about ``crtfastmath.o``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-``-ffast-math`` and ``-funsafe-math-optimizations`` cause ``crtfastmath.o`` to be
-automatically linked,  which adds a static constructor that sets the FTZ/DAZ
+``-ffast-math`` and ``-funsafe-math-optimizations`` without the ``-shared``
+option cause ``crtfastmath.o`` to be
+automatically linked, which adds a static constructor that sets the FTZ/DAZ
 bits in MXCSR, affecting not only the current compilation unit but all static
-and shared libraries included in the program.
+and shared libraries included in the program. This decision can be overridden
+by using either the flag ``-mdaz-ftz`` or ``-mno-daz-ftz`` to respectively
+link or not link ``crtfastmath.o``.
 
 .. _FLT_EVAL_METHOD:
 
@@ -2314,6 +2321,8 @@ are listed below.
    on ELF targets when using the integrated assembler. This flag currently
    only has an effect on ELF targets.
 
+.. _funique_internal_linkage_names:
+
 .. option:: -f[no]-unique-internal-linkage-names
 
    Controls whether Clang emits a unique (best-effort) symbol name for internal
@@ -2443,27 +2452,41 @@ usual build cycle when using sample profilers for optimization:
    usual build flags that you always build your application with. The only
    requirement is that DWARF debug info including source line information is
    generated. This DWARF information is important for the profiler to be able
-   to map instructions back to source line locations.
+   to map instructions back to source line locations. The usefulness of this
+   DWARF information can be improved with the ``-fdebug-info-for-profiling``
+   and ``-funique-internal-linkage-names`` options.
 
-   On Linux, ``-g`` or just ``-gline-tables-only`` is sufficient:
+   On Linux:
 
    .. code-block:: console
 
-     $ clang++ -O2 -gline-tables-only code.cc -o code
+     $ clang++ -O2 -gline-tables-only \
+       -fdebug-info-for-profiling -funique-internal-linkage-names \
+       code.cc -o code
 
    While MSVC-style targets default to CodeView debug information, DWARF debug
    information is required to generate source-level LLVM profiles. Use
    ``-gdwarf`` to include DWARF debug information:
 
-   .. code-block:: console
+   .. code-block:: winbatch
+
+     > clang-cl /O2 -gdwarf -gline-tables-only ^
+       /clang:-fdebug-info-for-profiling /clang:-funique-internal-linkage-names ^
+       code.cc /Fe:code /fuse-ld=lld /link /debug:dwarf
 
-     $ clang-cl -O2 -gdwarf -gline-tables-only coff-profile.cpp -fuse-ld=lld -link -debug:dwarf
+.. note::
+
+   :ref:`-funique-internal-linkage-names <funique_internal_linkage_names>`
+   generates unique names based on given command-line source file paths. If
+   your build system uses absolute source paths and these paths may change
+   between steps 1 and 4, then the uniqued function names may change and result
+   in unused profile data. Consider omitting this option in such cases.
 
 2. Run the executable under a sampling profiler. The specific profiler
    you use does not really matter, as long as its output can be converted
    into the format that the LLVM optimizer understands.
 
-   Two such profilers are the the Linux Perf profiler
+   Two such profilers are the Linux Perf profiler
    (https://perf.wiki.kernel.org/) and Intel's Sampling Enabling Product (SEP),
    available as part of `Intel VTune
    <https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/vtune-profiler.html>`_.
@@ -2477,7 +2500,9 @@ usual build cycle when using sample profilers for optimization:
 
    .. code-block:: console
 
-     $ perf record -b ./code
+     $ perf record -b -e BR_INST_RETIRED.NEAR_TAKEN:uppp ./code
+
+   If the event above is unavailable, ``branches:u`` is probably next-best.
 
    Note the use of the ``-b`` flag. This tells Perf to use the Last Branch
    Record (LBR) to record call chains. While this is not strictly required,
@@ -2527,21 +2552,42 @@ usual build cycle when using sample profilers for optimization:
    that executes faster than the original one. Note that you are not
    required to build the code with the exact same arguments that you
    used in the first step. The only requirement is that you build the code
-   with ``-gline-tables-only`` and ``-fprofile-sample-use``.
+   with the same debug info options and ``-fprofile-sample-use``.
+
+   On Linux:
 
    .. code-block:: console
 
-     $ clang++ -O2 -gline-tables-only -fprofile-sample-use=code.prof code.cc -o code
+     $ clang++ -O2 -gline-tables-only \
+       -fdebug-info-for-profiling -funique-internal-linkage-names \
+       -fprofile-sample-use=code.prof code.cc -o code
 
-  [OPTIONAL] Sampling-based profiles can have inaccuracies or missing block/
-  edge counters. The profile inference algorithm (profi) can be used to infer
-  missing blocks and edge counts, and improve the quality of profile data.
-  Enable it with ``-fsample-profile-use-profi``.
+   On Windows:
 
-  .. code-block:: console
+   .. code-block:: winbatch
+
+     > clang-cl /O2 -gdwarf -gline-tables-only ^
+       /clang:-fdebug-info-for-profiling /clang:-funique-internal-linkage-names ^
+       /fprofile-sample-use=code.prof code.cc /Fe:code /fuse-ld=lld /link /debug:dwarf
+
+   [OPTIONAL] Sampling-based profiles can have inaccuracies or missing block/
+   edge counters. The profile inference algorithm (profi) can be used to infer
+   missing blocks and edge counts, and improve the quality of profile data.
+   Enable it with ``-fsample-profile-use-profi``. For example, on Linux:
+
+   .. code-block:: console
+
+     $ clang++ -fsample-profile-use-profi -O2 -gline-tables-only \
+       -fdebug-info-for-profiling -funique-internal-linkage-names \
+       -fprofile-sample-use=code.prof code.cc -o code
+
+   On Windows:
+
+   .. code-block:: winbatch
 
-    $ clang++ -O2 -gline-tables-only -fprofile-sample-use=code.prof \
-      -fsample-profile-use-profi code.cc -o code
+     > clang-cl /clang:-fsample-profile-use-profi /O2 -gdwarf -gline-tables-only ^
+       /clang:-fdebug-info-for-profiling /clang:-funique-internal-linkage-names ^
+       /fprofile-sample-use=code.prof code.cc /Fe:code /fuse-ld=lld /link /debug:dwarf
 
 Sample Profile Formats
 """"""""""""""""""""""
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index fb748d23a53d..0d87df36ced0 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -1462,6 +1462,99 @@ checker).
 
 Default value of the option is ``true``.
 
+.. _unix-Stream:
+
+unix.Stream (C)
+"""""""""""""""
+Check C stream handling functions:
+``fopen, fdopen, freopen, tmpfile, fclose, fread, fwrite, fgetc, fgets, fputc, fputs, fprintf, fscanf, ungetc, getdelim, getline, fseek, fseeko, ftell, ftello, fflush, rewind, fgetpos, fsetpos, clearerr, feof, ferror, fileno``.
+
+The checker maintains information about the C stream objects (``FILE *``) and
+can detect error conditions related to use of streams. The following conditions
+are detected:
+
+* The ``FILE *`` pointer passed to the function is NULL (the single exception is
+  ``fflush`` where NULL is allowed).
+* Use of stream after close.
+* Opened stream is not closed.
+* Read from a stream after end-of-file. (This is not a fatal error but reported
+  by the checker. Stream remains in EOF state and the read operation fails.)
+* Use of stream when the file position is indeterminate after a previous failed
+  operation. Some functions (like ``ferror``, ``clearerr``, ``fseek``) are
+  allowed in this state.
+* Invalid 3rd ("``whence``") argument to ``fseek``.
+
+The stream operations are by this checker usually split into two cases, a success
+and a failure case. However, in the case of write operations (like ``fwrite``,
+``fprintf`` and even ``fsetpos``) this behavior could produce a large amount of
+unwanted reports on projects that don't have error checks around the write
+operations, so by default the checker assumes that write operations always succeed.
+This behavior can be controlled by the ``Pedantic`` flag: With
+``-analyzer-config unix.Stream:Pedantic=true`` the checker will model the
+cases where a write operation fails and report situations where this leads to
+erroneous behavior. (The default is ``Pedantic=false``, where write operations
+are assumed to succeed.)
+
+.. code-block:: c
+
+ void test1() {
+   FILE *p = fopen("foo", "r");
+ } // warn: opened file is never closed
+
+ void test2() {
+   FILE *p = fopen("foo", "r");
+   fseek(p, 1, SEEK_SET); // warn: stream pointer might be NULL
+   fclose(p);
+ }
+
+ void test3() {
+   FILE *p = fopen("foo", "r");
+   if (p) {
+     fseek(p, 1, 3); // warn: third arg should be SEEK_SET, SEEK_END, or SEEK_CUR
+     fclose(p);
+   }
+ }
+
+ void test4() {
+   FILE *p = fopen("foo", "r");
+   if (!p)
+     return;
+
+   fclose(p);
+   fclose(p); // warn: stream already closed
+ }
+
+ void test5() {
+   FILE *p = fopen("foo", "r");
+   if (!p)
+     return;
+
+   fgetc(p);
+   if (!ferror(p))
+     fgetc(p); // warn: possible read after end-of-file
+
+   fclose(p);
+ }
+
+ void test6() {
+   FILE *p = fopen("foo", "r");
+   if (!p)
+     return;
+
+   fgetc(p);
+   if (!feof(p))
+     fgetc(p); // warn: file position may be indeterminate after I/O error
+
+   fclose(p);
+ }
+
+**Limitations**
+
+The checker does not track the correspondence between integer file descriptors
+and ``FILE *`` pointers. Operations on standard streams like ``stdin`` are not
+treated specially and are therefore often not recognized (because these streams
+are usually not opened explicitly by the program, and are global variables).
+
 .. _osx-checkers:
 
 osx
@@ -3116,99 +3209,6 @@ Check for misuses of stream APIs. Check for misuses of stream APIs: ``fopen, fcl
    fclose(F); // warn: closing a previously closed file stream
  }
 
-.. _alpha-unix-Stream:
-
-alpha.unix.Stream (C)
-"""""""""""""""""""""
-Check C stream handling functions:
-``fopen, fdopen, freopen, tmpfile, fclose, fread, fwrite, fgetc, fgets, fputc, fputs, fprintf, fscanf, ungetc, getdelim, getline, fseek, fseeko, ftell, ftello, fflush, rewind, fgetpos, fsetpos, clearerr, feof, ferror, fileno``.
-
-The checker maintains information about the C stream objects (``FILE *``) and
-can detect error conditions related to use of streams. The following conditions
-are detected:
-
-* The ``FILE *`` pointer passed to the function is NULL (the single exception is
-  ``fflush`` where NULL is allowed).
-* Use of stream after close.
-* Opened stream is not closed.
-* Read from a stream after end-of-file. (This is not a fatal error but reported
-  by the checker. Stream remains in EOF state and the read operation fails.)
-* Use of stream when the file position is indeterminate after a previous failed
-  operation. Some functions (like ``ferror``, ``clearerr``, ``fseek``) are
-  allowed in this state.
-* Invalid 3rd ("``whence``") argument to ``fseek``.
-
-The stream operations are by this checker usually split into two cases, a success
-and a failure case. However, in the case of write operations (like ``fwrite``,
-``fprintf`` and even ``fsetpos``) this behavior could produce a large amount of
-unwanted reports on projects that don't have error checks around the write
-operations, so by default the checker assumes that write operations always succeed.
-This behavior can be controlled by the ``Pedantic`` flag: With
-``-analyzer-config alpha.unix.Stream:Pedantic=true`` the checker will model the
-cases where a write operation fails and report situations where this leads to
-erroneous behavior. (The default is ``Pedantic=false``, where write operations
-are assumed to succeed.)
-
-.. code-block:: c
-
- void test1() {
-   FILE *p = fopen("foo", "r");
- } // warn: opened file is never closed
-
- void test2() {
-   FILE *p = fopen("foo", "r");
-   fseek(p, 1, SEEK_SET); // warn: stream pointer might be NULL
-   fclose(p);
- }
-
- void test3() {
-   FILE *p = fopen("foo", "r");
-   if (p) {
-     fseek(p, 1, 3); // warn: third arg should be SEEK_SET, SEEK_END, or SEEK_CUR
-     fclose(p);
-   }
- }
-
- void test4() {
-   FILE *p = fopen("foo", "r");
-   if (!p)
-     return;
-
-   fclose(p);
-   fclose(p); // warn: stream already closed
- }
-
- void test5() {
-   FILE *p = fopen("foo", "r");
-   if (!p)
-     return;
-
-   fgetc(p);
-   if (!ferror(p))
-     fgetc(p); // warn: possible read after end-of-file
-
-   fclose(p);
- }
-
- void test6() {
-   FILE *p = fopen("foo", "r");
-   if (!p)
-     return;
-
-   fgetc(p);
-   if (!feof(p))
-     fgetc(p); // warn: file position may be indeterminate after I/O error
-
-   fclose(p);
- }
-
-**Limitations**
-
-The checker does not track the correspondence between integer file descriptors
-and ``FILE *`` pointers. Operations on standard streams like ``stdin`` are not
-treated specially and are therefore often not recognized (because these streams
-are usually not opened explicitly by the program, and are global variables).
-
 .. _alpha-unix-cstring-BufferOverlap:
 
 alpha.unix.cstring.BufferOverlap (C)
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index 7a8bd985a91f..365b607c7411 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -1644,8 +1644,9 @@ enum CXCursorKind {
   CXCursor_ObjCSelfExpr = 146,
 
   /** OpenMP 5.0 [2.1.5, Array Section].
+   * OpenACC 3.3 [2.7.1, Data Specification for Data Clauses (Sub Arrays)]
    */
-  CXCursor_OMPArraySectionExpr = 147,
+  CXCursor_ArraySectionExpr = 147,
 
   /** Represents an @available(...) check.
    */
diff --git a/clang/include/clang/APINotes/Types.h b/clang/include/clang/APINotes/Types.h
index 93bb045d6a66..026a4a431e73 100644
--- a/clang/include/clang/APINotes/Types.h
+++ b/clang/include/clang/APINotes/Types.h
@@ -675,6 +675,11 @@ class TagInfo : public CommonTypeInfo {
   LLVM_PREFERRED_TYPE(bool)
   unsigned IsFlagEnum : 1;
 
+  LLVM_PREFERRED_TYPE(bool)
+  unsigned SwiftCopyableSpecified : 1;
+  LLVM_PREFERRED_TYPE(bool)
+  unsigned SwiftCopyable : 1;
+
 public:
   std::optional<std::string> SwiftImportAs;
   std::optional<std::string> SwiftRetainOp;
@@ -682,7 +687,9 @@ public:
 
   std::optional<EnumExtensibilityKind> EnumExtensibility;
 
-  TagInfo() : HasFlagEnum(0), IsFlagEnum(0) {}
+  TagInfo()
+      : HasFlagEnum(0), IsFlagEnum(0), SwiftCopyableSpecified(false),
+        SwiftCopyable(false) {}
 
   std::optional<bool> isFlagEnum() const {
     if (HasFlagEnum)
@@ -694,6 +701,15 @@ public:
     IsFlagEnum = Value.value_or(false);
   }
 
+  std::optional<bool> isSwiftCopyable() const {
+    return SwiftCopyableSpecified ? std::optional<bool>(SwiftCopyable)
+                                  : std::nullopt;
+  }
+  void setSwiftCopyable(std::optional<bool> Value) {
+    SwiftCopyableSpecified = Value.has_value();
+    SwiftCopyable = Value.value_or(false);
+  }
+
   TagInfo &operator|=(const TagInfo &RHS) {
     static_cast<CommonTypeInfo &>(*this) |= RHS;
 
@@ -710,6 +726,9 @@ public:
     if (!EnumExtensibility)
       EnumExtensibility = RHS.EnumExtensibility;
 
+    if (!SwiftCopyableSpecified)
+      setSwiftCopyable(RHS.isSwiftCopyable());
+
     return *this;
   }
 
@@ -724,6 +743,7 @@ inline bool operator==(const TagInfo &LHS, const TagInfo &RHS) {
          LHS.SwiftRetainOp == RHS.SwiftRetainOp &&
          LHS.SwiftReleaseOp == RHS.SwiftReleaseOp &&
          LHS.isFlagEnum() == RHS.isFlagEnum() &&
+         LHS.isSwiftCopyable() == RHS.isSwiftCopyable() &&
          LHS.EnumExtensibility == RHS.EnumExtensibility;
 }
 
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index d5ed20ff5015..6dbd06251dda 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -455,7 +455,7 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// initialization of another module).
   struct PerModuleInitializers {
     llvm::SmallVector<Decl*, 4> Initializers;
-    llvm::SmallVector<Decl::DeclID, 4> LazyInitializers;
+    llvm::SmallVector<GlobalDeclID, 4> LazyInitializers;
 
     void resolve(ASTContext &Ctx);
   };
@@ -1059,7 +1059,7 @@ public:
   /// or an ImportDecl nominating another module that has initializers.
   void addModuleInitializer(Module *M, Decl *Init);
 
-  void addLazyModuleInitializers(Module *M, ArrayRef<Decl::DeclID> IDs);
+  void addLazyModuleInitializers(Module *M, ArrayRef<GlobalDeclID> IDs);
 
   /// Get the initializations to perform when importing a module, if any.
   ArrayRef<Decl*> getModuleInitializers(Module *M);
@@ -1127,7 +1127,8 @@ public:
   CanQualType OCLSamplerTy, OCLEventTy, OCLClkEventTy;
   CanQualType OCLQueueTy, OCLReserveIDTy;
   CanQualType IncompleteMatrixIdxTy;
-  CanQualType OMPArraySectionTy, OMPArrayShapingTy, OMPIteratorTy;
+  CanQualType ArraySectionTy;
+  CanQualType OMPArrayShapingTy, OMPIteratorTy;
 #define EXT_OPAQUE_TYPE(ExtType, Id, Ext) \
   CanQualType Id##Ty;
 #include "clang/Basic/OpenCLExtensionTypes.def"
@@ -2196,6 +2197,16 @@ public:
     return getQualifiedType(type.getUnqualifiedType(), Qs);
   }
 
+  /// \brief Return a type with the given __ptrauth qualifier.
+  QualType getPointerAuthType(QualType Ty, PointerAuthQualifier PointerAuth) {
+    assert(!Ty.getPointerAuth());
+    assert(PointerAuth);
+
+    Qualifiers Qs;
+    Qs.setPointerAuth(PointerAuth);
+    return getQualifiedType(Ty, Qs);
+  }
+
   unsigned char getFixedPointScale(QualType Ty) const;
   unsigned char getFixedPointIBits(QualType Ty) const;
   llvm::FixedPointSemantics getFixedPointSemantics(QualType Ty) const;
diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h
index 216dc9eef08b..bf7c204e4ad7 100644
--- a/clang/include/clang/AST/ASTNodeTraverser.h
+++ b/clang/include/clang/AST/ASTNodeTraverser.h
@@ -844,6 +844,12 @@ public:
     }
   }
 
+  void VisitUnresolvedLookupExpr(const UnresolvedLookupExpr *E) {
+    if (E->hasExplicitTemplateArgs())
+      for (auto Arg : E->template_arguments())
+        Visit(Arg.getArgument());
+  }
+
   void VisitRequiresExpr(const RequiresExpr *E) {
     for (auto *D : E->getLocalParameters())
       Visit(D);
diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h
index 1f2797cc7014..ab036f1d445a 100644
--- a/clang/include/clang/AST/AbstractBasicReader.h
+++ b/clang/include/clang/AST/AbstractBasicReader.h
@@ -213,9 +213,9 @@ public:
   }
 
   Qualifiers readQualifiers() {
-    static_assert(sizeof(Qualifiers().getAsOpaqueValue()) <= sizeof(uint32_t),
+    static_assert(sizeof(Qualifiers().getAsOpaqueValue()) <= sizeof(uint64_t),
                   "update this if the value size changes");
-    uint32_t value = asImpl().readUInt32();
+    uint64_t value = asImpl().readUInt64();
     return Qualifiers::fromOpaqueValue(value);
   }
 
diff --git a/clang/include/clang/AST/AbstractBasicWriter.h b/clang/include/clang/AST/AbstractBasicWriter.h
index 07afa388de2c..8e42fcaad1d3 100644
--- a/clang/include/clang/AST/AbstractBasicWriter.h
+++ b/clang/include/clang/AST/AbstractBasicWriter.h
@@ -196,9 +196,9 @@ public:
   }
 
   void writeQualifiers(Qualifiers value) {
-    static_assert(sizeof(value.getAsOpaqueValue()) <= sizeof(uint32_t),
+    static_assert(sizeof(value.getAsOpaqueValue()) <= sizeof(uint64_t),
                   "update this if the value size changes");
-    asImpl().writeUInt32(value.getAsOpaqueValue());
+    asImpl().writeUInt64(value.getAsOpaqueValue());
   }
 
   void writeExceptionSpecInfo(
diff --git a/clang/include/clang/AST/BuiltinTypes.def b/clang/include/clang/AST/BuiltinTypes.def
index c04f6f6f1271..0a36fdc5d9c0 100644
--- a/clang/include/clang/AST/BuiltinTypes.def
+++ b/clang/include/clang/AST/BuiltinTypes.def
@@ -320,7 +320,7 @@ PLACEHOLDER_TYPE(ARCUnbridgedCast, ARCUnbridgedCastTy)
 PLACEHOLDER_TYPE(IncompleteMatrixIdx, IncompleteMatrixIdxTy)
 
 // A placeholder type for OpenMP array sections.
-PLACEHOLDER_TYPE(OMPArraySection, OMPArraySectionTy)
+PLACEHOLDER_TYPE(ArraySection, ArraySectionTy)
 
 // A placeholder type for OpenMP array shaping operation.
 PLACEHOLDER_TYPE(OMPArrayShaping, OMPArrayShapingTy)
diff --git a/clang/include/clang/AST/ComputeDependence.h b/clang/include/clang/AST/ComputeDependence.h
index 7abf9141237d..6d3a51c379f9 100644
--- a/clang/include/clang/AST/ComputeDependence.h
+++ b/clang/include/clang/AST/ComputeDependence.h
@@ -94,7 +94,7 @@ class DesignatedInitExpr;
 class ParenListExpr;
 class PseudoObjectExpr;
 class AtomicExpr;
-class OMPArraySectionExpr;
+class ArraySectionExpr;
 class OMPArrayShapingExpr;
 class OMPIteratorExpr;
 class ObjCArrayLiteral;
@@ -189,7 +189,7 @@ ExprDependence computeDependence(ParenListExpr *E);
 ExprDependence computeDependence(PseudoObjectExpr *E);
 ExprDependence computeDependence(AtomicExpr *E);
 
-ExprDependence computeDependence(OMPArraySectionExpr *E);
+ExprDependence computeDependence(ArraySectionExpr *E);
 ExprDependence computeDependence(OMPArrayShapingExpr *E);
 ExprDependence computeDependence(OMPIteratorExpr *E);
 
diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index 8b121896d66d..a53c27a99a8c 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -157,7 +157,7 @@ public:
                                    SourceLocation CommentLoc,
                                    PragmaMSCommentKind CommentKind,
                                    StringRef Arg);
-  static PragmaCommentDecl *CreateDeserialized(ASTContext &C, DeclID ID,
+  static PragmaCommentDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                                unsigned ArgSize);
 
   PragmaMSCommentKind getCommentKind() const { return CommentKind; }
@@ -192,7 +192,7 @@ public:
                                           SourceLocation Loc, StringRef Name,
                                           StringRef Value);
   static PragmaDetectMismatchDecl *
-  CreateDeserialized(ASTContext &C, DeclID ID, unsigned NameValueSize);
+  CreateDeserialized(ASTContext &C, GlobalDeclID ID, unsigned NameValueSize);
 
   StringRef getName() const { return getTrailingObjects<char>(); }
   StringRef getValue() const { return getTrailingObjects<char>() + ValueStart; }
@@ -518,7 +518,7 @@ public:
   static LabelDecl *Create(ASTContext &C, DeclContext *DC,
                            SourceLocation IdentL, IdentifierInfo *II,
                            SourceLocation GnuLabelL);
-  static LabelDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static LabelDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   LabelStmt *getStmt() const { return TheStmt; }
   void setStmt(LabelStmt *T) { TheStmt = T; }
@@ -581,7 +581,7 @@ public:
                                IdentifierInfo *Id, NamespaceDecl *PrevDecl,
                                bool Nested);
 
-  static NamespaceDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static NamespaceDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   using redecl_range = redeclarable_base::redecl_range;
   using redecl_iterator = redeclarable_base::redecl_iterator;
@@ -1146,7 +1146,7 @@ public:
                          const IdentifierInfo *Id, QualType T,
                          TypeSourceInfo *TInfo, StorageClass S);
 
-  static VarDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static VarDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   SourceRange getSourceRange() const override LLVM_READONLY;
 
@@ -1728,7 +1728,7 @@ public:
   static ImplicitParamDecl *Create(ASTContext &C, QualType T,
                                    ImplicitParamKind ParamKind);
 
-  static ImplicitParamDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ImplicitParamDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   ImplicitParamDecl(ASTContext &C, DeclContext *DC, SourceLocation IdLoc,
                     const IdentifierInfo *Id, QualType Type,
@@ -1782,7 +1782,7 @@ public:
                              TypeSourceInfo *TInfo, StorageClass S,
                              Expr *DefArg);
 
-  static ParmVarDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ParmVarDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   SourceRange getSourceRange() const override LLVM_READONLY;
 
@@ -2178,7 +2178,7 @@ public:
          bool hasWrittenPrototype, ConstexprSpecKind ConstexprKind,
          Expr *TrailingRequiresClause);
 
-  static FunctionDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static FunctionDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   DeclarationNameInfo getNameInfo() const {
     return DeclarationNameInfo(getDeclName(), getLocation(), DNLoc);
@@ -3136,7 +3136,7 @@ public:
                            TypeSourceInfo *TInfo, Expr *BW, bool Mutable,
                            InClassInitStyle InitStyle);
 
-  static FieldDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static FieldDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   /// Returns the index of this field within its record,
   /// as appropriate for passing to ASTRecordLayout::getFieldOffset.
@@ -3311,7 +3311,7 @@ public:
                                   SourceLocation L, IdentifierInfo *Id,
                                   QualType T, Expr *E,
                                   const llvm::APSInt &V);
-  static EnumConstantDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static EnumConstantDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   const Expr *getInitExpr() const { return (const Expr*) Init; }
   Expr *getInitExpr() { return (Expr*) Init; }
@@ -3357,7 +3357,7 @@ public:
                                    QualType T,
                                    llvm::MutableArrayRef<NamedDecl *> CH);
 
-  static IndirectFieldDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static IndirectFieldDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   using chain_iterator = ArrayRef<NamedDecl *>::const_iterator;
 
@@ -3542,7 +3542,7 @@ public:
   static TypedefDecl *Create(ASTContext &C, DeclContext *DC,
                              SourceLocation StartLoc, SourceLocation IdLoc,
                              const IdentifierInfo *Id, TypeSourceInfo *TInfo);
-  static TypedefDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static TypedefDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   SourceRange getSourceRange() const override LLVM_READONLY;
 
@@ -3567,7 +3567,7 @@ public:
   static TypeAliasDecl *Create(ASTContext &C, DeclContext *DC,
                                SourceLocation StartLoc, SourceLocation IdLoc,
                                const IdentifierInfo *Id, TypeSourceInfo *TInfo);
-  static TypeAliasDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static TypeAliasDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   SourceRange getSourceRange() const override LLVM_READONLY;
 
@@ -3977,7 +3977,7 @@ public:
                           IdentifierInfo *Id, EnumDecl *PrevDecl,
                           bool IsScoped, bool IsScopedUsingClassTag,
                           bool IsFixed);
-  static EnumDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static EnumDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   /// Overrides to provide correct range when there's an enum-base specifier
   /// with forward declarations.
@@ -4182,7 +4182,7 @@ public:
   static RecordDecl *Create(const ASTContext &C, TagKind TK, DeclContext *DC,
                             SourceLocation StartLoc, SourceLocation IdLoc,
                             IdentifierInfo *Id, RecordDecl* PrevDecl = nullptr);
-  static RecordDecl *CreateDeserialized(const ASTContext &C, DeclID ID);
+  static RecordDecl *CreateDeserialized(const ASTContext &C, GlobalDeclID ID);
 
   RecordDecl *getPreviousDecl() {
     return cast_or_null<RecordDecl>(
@@ -4433,7 +4433,7 @@ public:
                                   StringLiteral *Str, SourceLocation AsmLoc,
                                   SourceLocation RParenLoc);
 
-  static FileScopeAsmDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static FileScopeAsmDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   SourceLocation getAsmLoc() const { return getLocation(); }
   SourceLocation getRParenLoc() const { return RParenLoc; }
@@ -4469,7 +4469,7 @@ class TopLevelStmtDecl : public Decl, public DeclContext {
 
 public:
   static TopLevelStmtDecl *Create(ASTContext &C, Stmt *Statement);
-  static TopLevelStmtDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static TopLevelStmtDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   SourceRange getSourceRange() const override LLVM_READONLY;
   Stmt *getStmt() { return Statement; }
@@ -4563,7 +4563,7 @@ protected:
 
 public:
   static BlockDecl *Create(ASTContext &C, DeclContext *DC, SourceLocation L);
-  static BlockDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static BlockDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   SourceLocation getCaretLocation() const { return getLocation(); }
 
@@ -4717,7 +4717,7 @@ public:
 
   static CapturedDecl *Create(ASTContext &C, DeclContext *DC,
                               unsigned NumParams);
-  static CapturedDecl *CreateDeserialized(ASTContext &C, DeclID ID,
+  static CapturedDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                           unsigned NumParams);
 
   Stmt *getBody() const override;
@@ -4851,7 +4851,7 @@ public:
                                     SourceLocation EndLoc);
 
   /// Create a new, deserialized module import declaration.
-  static ImportDecl *CreateDeserialized(ASTContext &C, DeclID ID,
+  static ImportDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                         unsigned NumLocations);
 
   /// Retrieve the module that was imported by the import declaration.
@@ -4892,7 +4892,7 @@ private:
 public:
   static ExportDecl *Create(ASTContext &C, DeclContext *DC,
                             SourceLocation ExportLoc);
-  static ExportDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ExportDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   SourceLocation getExportLoc() const { return getLocation(); }
   SourceLocation getRBraceLoc() const { return RBraceLoc; }
@@ -4931,7 +4931,7 @@ class EmptyDecl : public Decl {
 public:
   static EmptyDecl *Create(ASTContext &C, DeclContext *DC,
                            SourceLocation L);
-  static EmptyDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static EmptyDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
   static bool classofKind(Kind K) { return K == Empty; }
@@ -4957,7 +4957,7 @@ public:
                                 bool CBuffer, SourceLocation KwLoc,
                                 IdentifierInfo *ID, SourceLocation IDLoc,
                                 SourceLocation LBrace);
-  static HLSLBufferDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static HLSLBufferDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   SourceRange getSourceRange() const override LLVM_READONLY {
     return SourceRange(getLocStart(), RBraceLoc);
diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index d8cafc3d8152..e43e812cd945 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -15,6 +15,7 @@
 
 #include "clang/AST/ASTDumperUtils.h"
 #include "clang/AST/AttrIterator.h"
+#include "clang/AST/DeclID.h"
 #include "clang/AST/DeclarationName.h"
 #include "clang/AST/SelectorLocationsKind.h"
 #include "clang/Basic/IdentifierTable.h"
@@ -239,9 +240,6 @@ public:
     ModulePrivate
   };
 
-  /// An ID number that refers to a declaration in an AST file.
-  using DeclID = uint32_t;
-
 protected:
   /// The next declaration within the same lexical
   /// DeclContext. These pointers form the linked list that is
@@ -361,7 +359,7 @@ protected:
   /// \param Ctx The context in which we will allocate memory.
   /// \param ID The global ID of the deserialized declaration.
   /// \param Extra The amount of extra space to allocate after the object.
-  void *operator new(std::size_t Size, const ASTContext &Ctx, DeclID ID,
+  void *operator new(std::size_t Size, const ASTContext &Ctx, GlobalDeclID ID,
                      std::size_t Extra = 0);
 
   /// Allocate memory for a non-deserialized declaration.
@@ -779,10 +777,10 @@ public:
 
   /// Retrieve the global declaration ID associated with this
   /// declaration, which specifies where this Decl was loaded from.
-  DeclID getGlobalID() const {
+  GlobalDeclID getGlobalID() const {
     if (isFromASTFile())
-      return *((const DeclID *)this - 1);
-    return 0;
+      return (*((const GlobalDeclID *)this - 1));
+    return GlobalDeclID();
   }
 
   /// Retrieve the global ID of the module that owns this particular
diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h
index a7644d2a19d2..fb52ac804849 100644
--- a/clang/include/clang/AST/DeclCXX.h
+++ b/clang/include/clang/AST/DeclCXX.h
@@ -120,7 +120,7 @@ public:
     return new (C, DC) AccessSpecDecl(AS, DC, ASLoc, ColonLoc);
   }
 
-  static AccessSpecDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static AccessSpecDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   // Implement isa/cast/dyncast/etc.
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
@@ -579,7 +579,8 @@ public:
                                      TypeSourceInfo *Info, SourceLocation Loc,
                                      unsigned DependencyKind, bool IsGeneric,
                                      LambdaCaptureDefault CaptureDefault);
-  static CXXRecordDecl *CreateDeserialized(const ASTContext &C, DeclID ID);
+  static CXXRecordDecl *CreateDeserialized(const ASTContext &C,
+                                           GlobalDeclID ID);
 
   bool isDynamicClass() const {
     return data().Polymorphic || data().NumVBases != 0;
@@ -1980,7 +1981,8 @@ public:
          CXXConstructorDecl *Ctor = nullptr,
          DeductionCandidate Kind = DeductionCandidate::Normal);
 
-  static CXXDeductionGuideDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static CXXDeductionGuideDecl *CreateDeserialized(ASTContext &C,
+                                                   GlobalDeclID ID);
 
   ExplicitSpecifier getExplicitSpecifier() { return ExplicitSpec; }
   const ExplicitSpecifier getExplicitSpecifier() const { return ExplicitSpec; }
@@ -2035,7 +2037,8 @@ public:
   static RequiresExprBodyDecl *Create(ASTContext &C, DeclContext *DC,
                                       SourceLocation StartLoc);
 
-  static RequiresExprBodyDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static RequiresExprBodyDecl *CreateDeserialized(ASTContext &C,
+                                                  GlobalDeclID ID);
 
   // Implement isa/cast/dyncast/etc.
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
@@ -2078,7 +2081,7 @@ public:
          ConstexprSpecKind ConstexprKind, SourceLocation EndLocation,
          Expr *TrailingRequiresClause = nullptr);
 
-  static CXXMethodDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static CXXMethodDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   bool isStatic() const;
   bool isInstance() const { return !isStatic(); }
@@ -2579,7 +2582,7 @@ public:
   friend class ASTDeclWriter;
   friend TrailingObjects;
 
-  static CXXConstructorDecl *CreateDeserialized(ASTContext &C, DeclID ID,
+  static CXXConstructorDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                                 uint64_t AllocKind);
   static CXXConstructorDecl *
   Create(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc,
@@ -2822,7 +2825,7 @@ public:
          bool UsesFPIntrin, bool isInline, bool isImplicitlyDeclared,
          ConstexprSpecKind ConstexprKind,
          Expr *TrailingRequiresClause = nullptr);
-  static CXXDestructorDecl *CreateDeserialized(ASTContext & C, DeclID ID);
+  static CXXDestructorDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   void setOperatorDelete(FunctionDecl *OD, Expr *ThisArg);
 
@@ -2881,7 +2884,7 @@ public:
          bool UsesFPIntrin, bool isInline, ExplicitSpecifier ES,
          ConstexprSpecKind ConstexprKind, SourceLocation EndLocation,
          Expr *TrailingRequiresClause = nullptr);
-  static CXXConversionDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static CXXConversionDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   ExplicitSpecifier getExplicitSpecifier() {
     return getCanonicalDecl()->ExplicitSpec;
@@ -2948,7 +2951,7 @@ public:
                                  SourceLocation ExternLoc,
                                  SourceLocation LangLoc,
                                  LinkageSpecLanguageIDs Lang, bool HasBraces);
-  static LinkageSpecDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static LinkageSpecDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   /// Return the language specified by this linkage specification.
   LinkageSpecLanguageIDs getLanguage() const {
@@ -3096,7 +3099,7 @@ public:
                                     SourceLocation IdentLoc,
                                     NamedDecl *Nominated,
                                     DeclContext *CommonAncestor);
-  static UsingDirectiveDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static UsingDirectiveDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   SourceRange getSourceRange() const override LLVM_READONLY {
     return SourceRange(UsingLoc, getLocation());
@@ -3157,7 +3160,7 @@ public:
                                     SourceLocation IdentLoc,
                                     NamedDecl *Namespace);
 
-  static NamespaceAliasDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static NamespaceAliasDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   using redecl_range = redeclarable_base::redecl_range;
   using redecl_iterator = redeclarable_base::redecl_iterator;
@@ -3254,7 +3257,7 @@ public:
         LifetimeExtendedTemporaryDecl(Temp, EDec, Mangling);
   }
   static LifetimeExtendedTemporaryDecl *CreateDeserialized(ASTContext &C,
-                                                           DeclID ID) {
+                                                           GlobalDeclID ID) {
     return new (C, ID) LifetimeExtendedTemporaryDecl(EmptyShell{});
   }
 
@@ -3357,7 +3360,7 @@ public:
         UsingShadowDecl(UsingShadow, C, DC, Loc, Name, Introducer, Target);
   }
 
-  static UsingShadowDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static UsingShadowDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   using redecl_range = redeclarable_base::redecl_range;
   using redecl_iterator = redeclarable_base::redecl_iterator;
@@ -3566,7 +3569,7 @@ public:
                            const DeclarationNameInfo &NameInfo,
                            bool HasTypenameKeyword);
 
-  static UsingDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static UsingDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   SourceRange getSourceRange() const override LLVM_READONLY;
 
@@ -3645,7 +3648,7 @@ public:
                                             UsingDecl *Using, NamedDecl *Target,
                                             bool IsVirtual);
   static ConstructorUsingShadowDecl *CreateDeserialized(ASTContext &C,
-                                                        DeclID ID);
+                                                        GlobalDeclID ID);
 
   /// Override the UsingShadowDecl's getIntroducer, returning the UsingDecl that
   /// introduced this.
@@ -3757,7 +3760,7 @@ public:
                                SourceLocation UsingL, SourceLocation EnumL,
                                SourceLocation NameL, TypeSourceInfo *EnumType);
 
-  static UsingEnumDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static UsingEnumDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   SourceRange getSourceRange() const override LLVM_READONLY;
 
@@ -3830,7 +3833,7 @@ public:
                                NamedDecl *InstantiatedFrom,
                                ArrayRef<NamedDecl *> UsingDecls);
 
-  static UsingPackDecl *CreateDeserialized(ASTContext &C, DeclID ID,
+  static UsingPackDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                            unsigned NumExpansions);
 
   SourceRange getSourceRange() const override LLVM_READONLY {
@@ -3923,8 +3926,8 @@ public:
            NestedNameSpecifierLoc QualifierLoc,
            const DeclarationNameInfo &NameInfo, SourceLocation EllipsisLoc);
 
-  static UnresolvedUsingValueDecl *
-  CreateDeserialized(ASTContext &C, DeclID ID);
+  static UnresolvedUsingValueDecl *CreateDeserialized(ASTContext &C,
+                                                      GlobalDeclID ID);
 
   SourceRange getSourceRange() const override LLVM_READONLY;
 
@@ -4014,8 +4017,8 @@ public:
            SourceLocation TargetNameLoc, DeclarationName TargetName,
            SourceLocation EllipsisLoc);
 
-  static UnresolvedUsingTypenameDecl *
-  CreateDeserialized(ASTContext &C, DeclID ID);
+  static UnresolvedUsingTypenameDecl *CreateDeserialized(ASTContext &C,
+                                                         GlobalDeclID ID);
 
   /// Retrieves the canonical declaration of this declaration.
   UnresolvedUsingTypenameDecl *getCanonicalDecl() override {
@@ -4045,7 +4048,7 @@ public:
                                              SourceLocation Loc,
                                              DeclarationName Name);
   static UnresolvedUsingIfExistsDecl *CreateDeserialized(ASTContext &Ctx,
-                                                         DeclID ID);
+                                                         GlobalDeclID ID);
 
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
   static bool classofKind(Kind K) { return K == Decl::UnresolvedUsingIfExists; }
@@ -4073,7 +4076,7 @@ public:
                                   SourceLocation StaticAssertLoc,
                                   Expr *AssertExpr, Expr *Message,
                                   SourceLocation RParenLoc, bool Failed);
-  static StaticAssertDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static StaticAssertDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   Expr *getAssertExpr() { return AssertExprAndFailed.getPointer(); }
   const Expr *getAssertExpr() const { return AssertExprAndFailed.getPointer(); }
@@ -4120,7 +4123,7 @@ public:
 
   static BindingDecl *Create(ASTContext &C, DeclContext *DC,
                              SourceLocation IdLoc, IdentifierInfo *Id);
-  static BindingDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static BindingDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   /// Get the expression to which this declaration is bound. This may be null
   /// in two different cases: while parsing the initializer for the
@@ -4189,7 +4192,7 @@ public:
                                    QualType T, TypeSourceInfo *TInfo,
                                    StorageClass S,
                                    ArrayRef<BindingDecl *> Bindings);
-  static DecompositionDecl *CreateDeserialized(ASTContext &C, DeclID ID,
+  static DecompositionDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                                unsigned NumBindings);
 
   ArrayRef<BindingDecl *> bindings() const {
@@ -4246,7 +4249,7 @@ public:
                                 SourceLocation L, DeclarationName N, QualType T,
                                 TypeSourceInfo *TInfo, SourceLocation StartL,
                                 IdentifierInfo *Getter, IdentifierInfo *Setter);
-  static MSPropertyDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static MSPropertyDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   static bool classof(const Decl *D) { return D->getKind() == MSProperty; }
 
@@ -4300,7 +4303,7 @@ private:
   MSGuidDecl(DeclContext *DC, QualType T, Parts P);
 
   static MSGuidDecl *Create(const ASTContext &C, QualType T, Parts P);
-  static MSGuidDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static MSGuidDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   // Only ASTContext::getMSGuidDecl and deserialization create these.
   friend class ASTContext;
@@ -4353,7 +4356,7 @@ class UnnamedGlobalConstantDecl : public ValueDecl,
   static UnnamedGlobalConstantDecl *Create(const ASTContext &C, QualType T,
                                            const APValue &APVal);
   static UnnamedGlobalConstantDecl *CreateDeserialized(ASTContext &C,
-                                                       DeclID ID);
+                                                       GlobalDeclID ID);
 
   // Only ASTContext::getUnnamedGlobalConstantDecl and deserialization create
   // these.
diff --git a/clang/include/clang/AST/DeclContextInternals.h b/clang/include/clang/AST/DeclContextInternals.h
index c4734ab57895..e169c4859219 100644
--- a/clang/include/clang/AST/DeclContextInternals.h
+++ b/clang/include/clang/AST/DeclContextInternals.h
@@ -42,11 +42,12 @@ class StoredDeclsList {
   /// external declarations.
   DeclsAndHasExternalTy Data;
 
-  template<typename Fn>
-  void erase_if(Fn ShouldErase) {
+  template <typename Fn> DeclListNode::Decls *erase_if(Fn ShouldErase) {
     Decls List = Data.getPointer();
+
     if (!List)
-      return;
+      return nullptr;
+
     ASTContext &C = getASTContext();
     DeclListNode::Decls NewHead = nullptr;
     DeclListNode::Decls *NewLast = nullptr;
@@ -79,6 +80,17 @@ class StoredDeclsList {
     Data.setPointer(NewHead);
 
     assert(llvm::none_of(getLookupResult(), ShouldErase) && "Still exists!");
+
+    if (!Data.getPointer())
+      // All declarations are erased.
+      return nullptr;
+    else if (NewHead.is<NamedDecl *>())
+      // The list only contains a declaration, the header itself.
+      return (DeclListNode::Decls *)&Data;
+    else {
+      assert(NewLast && NewLast->is<NamedDecl *>() && "Not the tail?");
+      return NewLast;
+    }
   }
 
   void erase(NamedDecl *ND) {
@@ -160,12 +172,16 @@ public:
 
   void replaceExternalDecls(ArrayRef<NamedDecl*> Decls) {
     // Remove all declarations that are either external or are replaced with
-    // external declarations.
-    erase_if([Decls](NamedDecl *ND) {
+    // external declarations with higher visibilities.
+    DeclListNode::Decls *Tail = erase_if([Decls](NamedDecl *ND) {
       if (ND->isFromASTFile())
         return true;
+      // FIXME: Can we get rid of this loop completely?
       for (NamedDecl *D : Decls)
-        if (D->declarationReplaces(ND, /*IsKnownNewer=*/false))
+        // Only replace the local declaration if the external declaration has
+        // higher visibilities.
+        if (D->getModuleOwnershipKind() <= ND->getModuleOwnershipKind() &&
+            D->declarationReplaces(ND, /*IsKnownNewer=*/false))
           return true;
       return false;
     });
@@ -185,24 +201,15 @@ public:
       DeclsAsList = Node;
     }
 
-    DeclListNode::Decls Head = Data.getPointer();
-    if (Head.isNull()) {
+    if (!Data.getPointer()) {
       Data.setPointer(DeclsAsList);
       return;
     }
 
-    // Find the end of the existing list.
-    // FIXME: It would be possible to preserve information from erase_if to
-    // avoid this rescan looking for the end of the list.
-    DeclListNode::Decls *Tail = &Head;
-    while (DeclListNode *Node = Tail->dyn_cast<DeclListNode *>())
-      Tail = &Node->Rest;
-
     // Append the Decls.
     DeclListNode *Node = C.AllocateDeclListNode(Tail->get<NamedDecl *>());
     Node->Rest = DeclsAsList;
     *Tail = Node;
-    Data.setPointer(Head);
   }
 
   /// Return the list of all the decls.
diff --git a/clang/include/clang/AST/DeclFriend.h b/clang/include/clang/AST/DeclFriend.h
index b56627a5337d..9789282f351a 100644
--- a/clang/include/clang/AST/DeclFriend.h
+++ b/clang/include/clang/AST/DeclFriend.h
@@ -112,7 +112,7 @@ public:
   Create(ASTContext &C, DeclContext *DC, SourceLocation L, FriendUnion Friend_,
          SourceLocation FriendL,
          ArrayRef<TemplateParameterList *> FriendTypeTPLists = std::nullopt);
-  static FriendDecl *CreateDeserialized(ASTContext &C, DeclID ID,
+  static FriendDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                         unsigned FriendTypeNumTPLists);
 
   /// If this friend declaration names an (untemplated but possibly
diff --git a/clang/include/clang/AST/DeclID.h b/clang/include/clang/AST/DeclID.h
new file mode 100644
index 000000000000..614ba06b6386
--- /dev/null
+++ b/clang/include/clang/AST/DeclID.h
@@ -0,0 +1,227 @@
+//===--- DeclID.h - ID number for deserialized declarations  ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines DeclID class family to describe the deserialized
+// declarations. The DeclID is widely used in AST via LazyDeclPtr, or calls to
+// `ExternalASTSource::getExternalDecl`. It will be helpful for type safety to
+// require the use of `DeclID` to explicit.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_AST_DECLID_H
+#define LLVM_CLANG_AST_DECLID_H
+
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/iterator.h"
+
+namespace clang {
+
+/// Predefined declaration IDs.
+///
+/// These declaration IDs correspond to predefined declarations in the AST
+/// context, such as the NULL declaration ID. Such declarations are never
+/// actually serialized, since they will be built by the AST context when
+/// it is created.
+enum PredefinedDeclIDs {
+  /// The NULL declaration.
+  PREDEF_DECL_NULL_ID = 0,
+
+  /// The translation unit.
+  PREDEF_DECL_TRANSLATION_UNIT_ID = 1,
+
+  /// The Objective-C 'id' type.
+  PREDEF_DECL_OBJC_ID_ID = 2,
+
+  /// The Objective-C 'SEL' type.
+  PREDEF_DECL_OBJC_SEL_ID = 3,
+
+  /// The Objective-C 'Class' type.
+  PREDEF_DECL_OBJC_CLASS_ID = 4,
+
+  /// The Objective-C 'Protocol' type.
+  PREDEF_DECL_OBJC_PROTOCOL_ID = 5,
+
+  /// The signed 128-bit integer type.
+  PREDEF_DECL_INT_128_ID = 6,
+
+  /// The unsigned 128-bit integer type.
+  PREDEF_DECL_UNSIGNED_INT_128_ID = 7,
+
+  /// The internal 'instancetype' typedef.
+  PREDEF_DECL_OBJC_INSTANCETYPE_ID = 8,
+
+  /// The internal '__builtin_va_list' typedef.
+  PREDEF_DECL_BUILTIN_VA_LIST_ID = 9,
+
+  /// The internal '__va_list_tag' struct, if any.
+  PREDEF_DECL_VA_LIST_TAG = 10,
+
+  /// The internal '__builtin_ms_va_list' typedef.
+  PREDEF_DECL_BUILTIN_MS_VA_LIST_ID = 11,
+
+  /// The predeclared '_GUID' struct.
+  PREDEF_DECL_BUILTIN_MS_GUID_ID = 12,
+
+  /// The extern "C" context.
+  PREDEF_DECL_EXTERN_C_CONTEXT_ID = 13,
+
+  /// The internal '__make_integer_seq' template.
+  PREDEF_DECL_MAKE_INTEGER_SEQ_ID = 14,
+
+  /// The internal '__NSConstantString' typedef.
+  PREDEF_DECL_CF_CONSTANT_STRING_ID = 15,
+
+  /// The internal '__NSConstantString' tag type.
+  PREDEF_DECL_CF_CONSTANT_STRING_TAG_ID = 16,
+
+  /// The internal '__type_pack_element' template.
+  PREDEF_DECL_TYPE_PACK_ELEMENT_ID = 17,
+};
+
+/// The number of declaration IDs that are predefined.
+///
+/// For more information about predefined declarations, see the
+/// \c PredefinedDeclIDs type and the PREDEF_DECL_*_ID constants.
+const unsigned int NUM_PREDEF_DECL_IDS = 18;
+
+/// GlobalDeclID means DeclID in the current ASTContext and LocalDeclID means
+/// DeclID specific to a certain ModuleFile. Specially, in ASTWriter, the
+/// LocalDeclID to the ModuleFile been writting is equal to the GlobalDeclID.
+/// Outside the serializer, all the DeclID been used should be GlobalDeclID.
+/// We can translate a LocalDeclID to the GlobalDeclID by
+/// `ASTReader::getGlobalDeclID()`.
+
+class DeclIDBase {
+public:
+  /// An ID number that refers to a declaration in an AST file.
+  ///
+  /// The ID numbers of declarations are consecutive (in order of
+  /// discovery), with values below NUM_PREDEF_DECL_IDS being reserved.
+  /// At the start of a chain of precompiled headers, declaration ID 1 is
+  /// used for the translation unit declaration.
+  ///
+  /// DeclID should only be used directly in serialization. All other users
+  /// should use LocalDeclID or GlobalDeclID.
+  using DeclID = uint32_t;
+
+protected:
+  DeclIDBase() : ID(PREDEF_DECL_NULL_ID) {}
+  explicit DeclIDBase(DeclID ID) : ID(ID) {}
+
+public:
+  DeclID get() const { return ID; }
+
+  explicit operator DeclID() const { return ID; }
+
+  explicit operator PredefinedDeclIDs() const { return (PredefinedDeclIDs)ID; }
+
+  bool isValid() const { return ID != PREDEF_DECL_NULL_ID; }
+
+  bool isInvalid() const { return ID == PREDEF_DECL_NULL_ID; }
+
+  friend bool operator==(const DeclIDBase &LHS, const DeclIDBase &RHS) {
+    return LHS.ID == RHS.ID;
+  }
+  friend bool operator!=(const DeclIDBase &LHS, const DeclIDBase &RHS) {
+    return LHS.ID != RHS.ID;
+  }
+  // We may sort the decl ID.
+  friend bool operator<(const DeclIDBase &LHS, const DeclIDBase &RHS) {
+    return LHS.ID < RHS.ID;
+  }
+  friend bool operator>(const DeclIDBase &LHS, const DeclIDBase &RHS) {
+    return LHS.ID > RHS.ID;
+  }
+  friend bool operator<=(const DeclIDBase &LHS, const DeclIDBase &RHS) {
+    return LHS.ID <= RHS.ID;
+  }
+  friend bool operator>=(const DeclIDBase &LHS, const DeclIDBase &RHS) {
+    return LHS.ID >= RHS.ID;
+  }
+
+protected:
+  DeclID ID;
+};
+
+class LocalDeclID : public DeclIDBase {
+  using Base = DeclIDBase;
+
+public:
+  LocalDeclID() : Base() {}
+  LocalDeclID(PredefinedDeclIDs ID) : Base(ID) {}
+  explicit LocalDeclID(DeclID ID) : Base(ID) {}
+
+  LocalDeclID &operator++() {
+    ++ID;
+    return *this;
+  }
+
+  LocalDeclID operator++(int) {
+    LocalDeclID Ret = *this;
+    ++(*this);
+    return Ret;
+  }
+};
+
+class GlobalDeclID : public DeclIDBase {
+  using Base = DeclIDBase;
+
+public:
+  GlobalDeclID() : Base() {}
+  explicit GlobalDeclID(DeclID ID) : Base(ID) {}
+
+  // For DeclIDIterator<GlobalDeclID> to be able to convert a GlobalDeclID
+  // to a LocalDeclID.
+  explicit operator LocalDeclID() const { return LocalDeclID(this->ID); }
+};
+
+/// A helper iterator adaptor to convert the iterators to
+/// `SmallVector<SomeDeclID>` to the iterators to `SmallVector<OtherDeclID>`.
+template <class FromTy, class ToTy>
+class DeclIDIterator
+    : public llvm::iterator_adaptor_base<DeclIDIterator<FromTy, ToTy>,
+                                         const FromTy *,
+                                         std::forward_iterator_tag, ToTy> {
+public:
+  DeclIDIterator() : DeclIDIterator::iterator_adaptor_base(nullptr) {}
+
+  DeclIDIterator(const FromTy *ID)
+      : DeclIDIterator::iterator_adaptor_base(ID) {}
+
+  ToTy operator*() const { return ToTy(*this->I); }
+
+  bool operator==(const DeclIDIterator &RHS) const { return this->I == RHS.I; }
+};
+
+} // namespace clang
+
+namespace llvm {
+template <> struct DenseMapInfo<clang::GlobalDeclID> {
+  using GlobalDeclID = clang::GlobalDeclID;
+  using DeclID = GlobalDeclID::DeclID;
+
+  static GlobalDeclID getEmptyKey() {
+    return GlobalDeclID(DenseMapInfo<DeclID>::getEmptyKey());
+  }
+
+  static GlobalDeclID getTombstoneKey() {
+    return GlobalDeclID(DenseMapInfo<DeclID>::getTombstoneKey());
+  }
+
+  static unsigned getHashValue(const GlobalDeclID &Key) {
+    return DenseMapInfo<DeclID>::getHashValue(Key.get());
+  }
+
+  static bool isEqual(const GlobalDeclID &L, const GlobalDeclID &R) {
+    return L == R;
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/clang/include/clang/AST/DeclObjC.h b/clang/include/clang/AST/DeclObjC.h
index 7780afa6f1cf..d2cc61ca19f8 100644
--- a/clang/include/clang/AST/DeclObjC.h
+++ b/clang/include/clang/AST/DeclObjC.h
@@ -236,7 +236,7 @@ public:
          ObjCImplementationControl impControl = ObjCImplementationControl::None,
          bool HasRelatedResultType = false);
 
-  static ObjCMethodDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ObjCMethodDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   ObjCMethodDecl *getCanonicalDecl() override;
   const ObjCMethodDecl *getCanonicalDecl() const {
@@ -614,7 +614,8 @@ public:
                                    IdentifierInfo *name,
                                    SourceLocation colonLoc,
                                    TypeSourceInfo *boundInfo);
-  static ObjCTypeParamDecl *CreateDeserialized(ASTContext &ctx, DeclID ID);
+  static ObjCTypeParamDecl *CreateDeserialized(ASTContext &ctx,
+                                               GlobalDeclID ID);
 
   SourceRange getSourceRange() const override LLVM_READONLY;
 
@@ -789,7 +790,7 @@ public:
                                   TypeSourceInfo *TSI,
                                   PropertyControl propControl = None);
 
-  static ObjCPropertyDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ObjCPropertyDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   SourceLocation getAtLoc() const { return AtLoc; }
   void setAtLoc(SourceLocation L) { AtLoc = L; }
@@ -1279,7 +1280,8 @@ public:
          ObjCInterfaceDecl *PrevDecl,
          SourceLocation ClassLoc = SourceLocation(), bool isInternal = false);
 
-  static ObjCInterfaceDecl *CreateDeserialized(const ASTContext &C, DeclID ID);
+  static ObjCInterfaceDecl *CreateDeserialized(const ASTContext &C,
+                                               GlobalDeclID ID);
 
   /// Retrieve the type parameters of this class.
   ///
@@ -1969,7 +1971,7 @@ public:
                               TypeSourceInfo *TInfo, AccessControl ac,
                               Expr *BW = nullptr, bool synthesized = false);
 
-  static ObjCIvarDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ObjCIvarDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   /// Return the class interface that this ivar is logically contained
   /// in; this is either the interface where the ivar was declared, or the
@@ -2039,7 +2041,8 @@ public:
                                      SourceLocation IdLoc, IdentifierInfo *Id,
                                      QualType T, Expr *BW);
 
-  static ObjCAtDefsFieldDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ObjCAtDefsFieldDecl *CreateDeserialized(ASTContext &C,
+                                                 GlobalDeclID ID);
 
   // Implement isa/cast/dyncast/etc.
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
@@ -2142,7 +2145,7 @@ public:
                                   SourceLocation atStartLoc,
                                   ObjCProtocolDecl *PrevDecl);
 
-  static ObjCProtocolDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ObjCProtocolDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   const ObjCProtocolList &getReferencedProtocols() const {
     assert(hasDefinition() && "No definition available!");
@@ -2361,7 +2364,7 @@ public:
          ObjCTypeParamList *typeParamList,
          SourceLocation IvarLBraceLoc = SourceLocation(),
          SourceLocation IvarRBraceLoc = SourceLocation());
-  static ObjCCategoryDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ObjCCategoryDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   ObjCInterfaceDecl *getClassInterface() { return ClassInterface; }
   const ObjCInterfaceDecl *getClassInterface() const { return ClassInterface; }
@@ -2558,7 +2561,8 @@ public:
   Create(ASTContext &C, DeclContext *DC, const IdentifierInfo *Id,
          ObjCInterfaceDecl *classInterface, SourceLocation nameLoc,
          SourceLocation atStartLoc, SourceLocation CategoryNameLoc);
-  static ObjCCategoryImplDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ObjCCategoryImplDecl *CreateDeserialized(ASTContext &C,
+                                                  GlobalDeclID ID);
 
   ObjCCategoryDecl *getCategoryDecl() const;
 
@@ -2640,7 +2644,8 @@ public:
                                         SourceLocation IvarLBraceLoc=SourceLocation(),
                                         SourceLocation IvarRBraceLoc=SourceLocation());
 
-  static ObjCImplementationDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ObjCImplementationDecl *CreateDeserialized(ASTContext &C,
+                                                    GlobalDeclID ID);
 
   /// init_iterator - Iterates through the ivar initializer list.
   using init_iterator = CXXCtorInitializer **;
@@ -2780,7 +2785,7 @@ public:
                                          ObjCInterfaceDecl* aliasedClass);
 
   static ObjCCompatibleAliasDecl *CreateDeserialized(ASTContext &C,
-                                                     DeclID ID);
+                                                     GlobalDeclID ID);
 
   const ObjCInterfaceDecl *getClassInterface() const { return AliasedClass; }
   ObjCInterfaceDecl *getClassInterface() { return AliasedClass; }
@@ -2851,7 +2856,8 @@ public:
                                       ObjCIvarDecl *ivarDecl,
                                       SourceLocation ivarLoc);
 
-  static ObjCPropertyImplDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ObjCPropertyImplDecl *CreateDeserialized(ASTContext &C,
+                                                  GlobalDeclID ID);
 
   SourceRange getSourceRange() const override LLVM_READONLY;
 
diff --git a/clang/include/clang/AST/DeclOpenMP.h b/clang/include/clang/AST/DeclOpenMP.h
index c7ede7f2157f..e542c3c8e66b 100644
--- a/clang/include/clang/AST/DeclOpenMP.h
+++ b/clang/include/clang/AST/DeclOpenMP.h
@@ -59,7 +59,7 @@ protected:
   }
 
   template <typename T, typename... Params>
-  static T *createEmptyDirective(const ASTContext &C, unsigned ID,
+  static T *createEmptyDirective(const ASTContext &C, GlobalDeclID ID,
                                  unsigned NumClauses, unsigned NumChildren,
                                  Params &&... P) {
     auto *Inst = new (C, ID, size(NumClauses, NumChildren))
@@ -133,7 +133,7 @@ public:
                                       SourceLocation L,
                                       ArrayRef<Expr *> VL);
   static OMPThreadPrivateDecl *CreateDeserialized(ASTContext &C,
-                                                  DeclID ID, unsigned N);
+                                                  GlobalDeclID ID, unsigned N);
 
   typedef MutableArrayRef<Expr *>::iterator varlist_iterator;
   typedef ArrayRef<const Expr *>::iterator varlist_const_iterator;
@@ -214,7 +214,7 @@ public:
          QualType T, OMPDeclareReductionDecl *PrevDeclInScope);
   /// Create deserialized declare reduction node.
   static OMPDeclareReductionDecl *CreateDeserialized(ASTContext &C,
-                                                     DeclID ID);
+                                                     GlobalDeclID ID);
 
   /// Get combiner expression of the declare reduction construct.
   Expr *getCombiner() { return Combiner; }
@@ -318,8 +318,8 @@ public:
                                       ArrayRef<OMPClause *> Clauses,
                                       OMPDeclareMapperDecl *PrevDeclInScope);
   /// Creates deserialized declare mapper node.
-  static OMPDeclareMapperDecl *CreateDeserialized(ASTContext &C, DeclID ID,
-                                                  unsigned N);
+  static OMPDeclareMapperDecl *CreateDeserialized(ASTContext &C,
+                                                  GlobalDeclID ID, unsigned N);
 
   using clauselist_iterator = MutableArrayRef<OMPClause *>::iterator;
   using clauselist_const_iterator = ArrayRef<const OMPClause *>::iterator;
@@ -397,7 +397,8 @@ public:
                                      IdentifierInfo *Id, QualType T,
                                      SourceLocation StartLoc);
 
-  static OMPCapturedExprDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static OMPCapturedExprDecl *CreateDeserialized(ASTContext &C,
+                                                 GlobalDeclID ID);
 
   SourceRange getSourceRange() const override LLVM_READONLY;
 
@@ -427,7 +428,7 @@ public:
   static OMPRequiresDecl *Create(ASTContext &C, DeclContext *DC,
                                  SourceLocation L, ArrayRef<OMPClause *> CL);
   /// Create deserialized requires node.
-  static OMPRequiresDecl *CreateDeserialized(ASTContext &C, DeclID ID,
+  static OMPRequiresDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                              unsigned N);
 
   using clauselist_iterator = MutableArrayRef<OMPClause *>::iterator;
@@ -495,7 +496,7 @@ public:
   static OMPAllocateDecl *Create(ASTContext &C, DeclContext *DC,
                                  SourceLocation L, ArrayRef<Expr *> VL,
                                  ArrayRef<OMPClause *> CL);
-  static OMPAllocateDecl *CreateDeserialized(ASTContext &C, DeclID ID,
+  static OMPAllocateDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                              unsigned NVars, unsigned NClauses);
 
   typedef MutableArrayRef<Expr *>::iterator varlist_iterator;
diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h
index 231bda44a9fc..3ee03eebdb8c 100644
--- a/clang/include/clang/AST/DeclTemplate.h
+++ b/clang/include/clang/AST/DeclTemplate.h
@@ -797,7 +797,7 @@ protected:
     ///
     /// The first value in the array is the number of specializations/partial
     /// specializations that follow.
-    Decl::DeclID *LazySpecializations = nullptr;
+    GlobalDeclID *LazySpecializations = nullptr;
 
     /// The set of "injected" template arguments used within this
     /// template.
@@ -1087,7 +1087,8 @@ public:
                                       NamedDecl *Decl);
 
   /// Create an empty function template node.
-  static FunctionTemplateDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static FunctionTemplateDecl *CreateDeserialized(ASTContext &C,
+                                                  GlobalDeclID ID);
 
   // Implement isa/cast/dyncast support
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
@@ -1204,9 +1205,9 @@ public:
          bool Typename, bool ParameterPack, bool HasTypeConstraint = false,
          std::optional<unsigned> NumExpanded = std::nullopt);
   static TemplateTypeParmDecl *CreateDeserialized(const ASTContext &C,
-                                                  DeclID ID);
+                                                  GlobalDeclID ID);
   static TemplateTypeParmDecl *CreateDeserialized(const ASTContext &C,
-                                                  DeclID ID,
+                                                  GlobalDeclID ID,
                                                   bool HasTypeConstraint);
 
   /// Whether this template type parameter was declared with
@@ -1413,11 +1414,10 @@ public:
          QualType T, TypeSourceInfo *TInfo, ArrayRef<QualType> ExpandedTypes,
          ArrayRef<TypeSourceInfo *> ExpandedTInfos);
 
+  static NonTypeTemplateParmDecl *
+  CreateDeserialized(ASTContext &C, GlobalDeclID ID, bool HasTypeConstraint);
   static NonTypeTemplateParmDecl *CreateDeserialized(ASTContext &C,
-                                                     DeclID ID,
-                                                     bool HasTypeConstraint);
-  static NonTypeTemplateParmDecl *CreateDeserialized(ASTContext &C,
-                                                     DeclID ID,
+                                                     GlobalDeclID ID,
                                                      unsigned NumExpandedTypes,
                                                      bool HasTypeConstraint);
 
@@ -1632,10 +1632,9 @@ public:
          ArrayRef<TemplateParameterList *> Expansions);
 
   static TemplateTemplateParmDecl *CreateDeserialized(ASTContext &C,
-                                                      DeclID ID);
-  static TemplateTemplateParmDecl *CreateDeserialized(ASTContext &C,
-                                                      DeclID ID,
-                                                      unsigned NumExpansions);
+                                                      GlobalDeclID ID);
+  static TemplateTemplateParmDecl *
+  CreateDeserialized(ASTContext &C, GlobalDeclID ID, unsigned NumExpansions);
 
   using TemplateParmPosition::getDepth;
   using TemplateParmPosition::setDepth;
@@ -1857,8 +1856,8 @@ public:
          ClassTemplateDecl *SpecializedTemplate,
          ArrayRef<TemplateArgument> Args,
          ClassTemplateSpecializationDecl *PrevDecl);
-  static ClassTemplateSpecializationDecl *
-  CreateDeserialized(ASTContext &C, DeclID ID);
+  static ClassTemplateSpecializationDecl *CreateDeserialized(ASTContext &C,
+                                                             GlobalDeclID ID);
 
   void getNameForDiagnostic(raw_ostream &OS, const PrintingPolicy &Policy,
                             bool Qualified) const override;
@@ -2110,7 +2109,7 @@ public:
          ClassTemplatePartialSpecializationDecl *PrevDecl);
 
   static ClassTemplatePartialSpecializationDecl *
-  CreateDeserialized(ASTContext &C, DeclID ID);
+  CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   ClassTemplatePartialSpecializationDecl *getMostRecentDecl() {
     return cast<ClassTemplatePartialSpecializationDecl>(
@@ -2306,7 +2305,7 @@ public:
                                    NamedDecl *Decl);
 
   /// Create an empty class template node.
-  static ClassTemplateDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ClassTemplateDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   /// Return the specialization with the provided arguments if it exists,
   /// otherwise return the insertion point.
@@ -2472,7 +2471,7 @@ public:
          MutableArrayRef<TemplateParameterList *> Params, FriendUnion Friend,
          SourceLocation FriendLoc);
 
-  static FriendTemplateDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static FriendTemplateDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   /// If this friend declaration names a templated type (or
   /// a dependent member type of a templated type), return that
@@ -2573,7 +2572,8 @@ public:
                                        NamedDecl *Decl);
 
   /// Create an empty alias template node.
-  static TypeAliasTemplateDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static TypeAliasTemplateDecl *CreateDeserialized(ASTContext &C,
+                                                   GlobalDeclID ID);
 
   // Implement isa/cast/dyncast support
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
@@ -2670,7 +2670,7 @@ public:
          TypeSourceInfo *TInfo, StorageClass S,
          ArrayRef<TemplateArgument> Args);
   static VarTemplateSpecializationDecl *CreateDeserialized(ASTContext &C,
-                                                           DeclID ID);
+                                                           GlobalDeclID ID);
 
   void getNameForDiagnostic(raw_ostream &OS, const PrintingPolicy &Policy,
                             bool Qualified) const override;
@@ -2900,8 +2900,8 @@ public:
          TypeSourceInfo *TInfo, StorageClass S, ArrayRef<TemplateArgument> Args,
          const TemplateArgumentListInfo &ArgInfos);
 
-  static VarTemplatePartialSpecializationDecl *CreateDeserialized(ASTContext &C,
-                                                                  DeclID ID);
+  static VarTemplatePartialSpecializationDecl *
+  CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   VarTemplatePartialSpecializationDecl *getMostRecentDecl() {
     return cast<VarTemplatePartialSpecializationDecl>(
@@ -3078,7 +3078,7 @@ public:
                                  VarDecl *Decl);
 
   /// Create an empty variable template node.
-  static VarTemplateDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static VarTemplateDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   /// Return the specialization with the provided arguments if it exists,
   /// otherwise return the insertion point.
@@ -3183,7 +3183,7 @@ public:
                              SourceLocation L, DeclarationName Name,
                              TemplateParameterList *Params,
                              Expr *ConstraintExpr);
-  static ConceptDecl *CreateDeserialized(ASTContext &C, DeclID ID);
+  static ConceptDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   Expr *getConstraintExpr() const {
     return ConstraintExpr;
@@ -3232,7 +3232,7 @@ public:
   Create(const ASTContext &C, DeclContext *DC, SourceLocation SL,
          ArrayRef<TemplateArgument> ConvertedArgs);
   static ImplicitConceptSpecializationDecl *
-  CreateDeserialized(const ASTContext &C, DeclID ID,
+  CreateDeserialized(const ASTContext &C, GlobalDeclID ID,
                      unsigned NumTemplateArgs);
 
   ArrayRef<TemplateArgument> getTemplateArguments() const {
@@ -3275,7 +3275,7 @@ private:
   static TemplateParamObjectDecl *Create(const ASTContext &C, QualType T,
                                          const APValue &V);
   static TemplateParamObjectDecl *CreateDeserialized(ASTContext &C,
-                                                     DeclID ID);
+                                                     GlobalDeclID ID);
 
   /// Only ASTContext::getTemplateParamObjectDecl and deserialization
   /// create these.
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index 2bfefeabc348..f2bf667636dc 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -6610,6 +6610,275 @@ public:
 
 };
 
+/// This class represents BOTH the OpenMP Array Section and OpenACC 'subarray',
+/// with a boolean differentiator.
+/// OpenMP 5.0 [2.1.5, Array Sections].
+/// To specify an array section in an OpenMP construct, array subscript
+/// expressions are extended with the following syntax:
+/// \code
+/// [ lower-bound : length : stride ]
+/// [ lower-bound : length : ]
+/// [ lower-bound : length ]
+/// [ lower-bound : : stride ]
+/// [ lower-bound : : ]
+/// [ lower-bound : ]
+/// [ : length : stride ]
+/// [ : length : ]
+/// [ : length ]
+/// [ : : stride ]
+/// [ : : ]
+/// [ : ]
+/// \endcode
+/// The array section must be a subset of the original array.
+/// Array sections are allowed on multidimensional arrays. Base language array
+/// subscript expressions can be used to specify length-one dimensions of
+/// multidimensional array sections.
+/// Each of the lower-bound, length, and stride expressions if specified must be
+/// an integral type expressions of the base language. When evaluated
+/// they represent a set of integer values as follows:
+/// \code
+/// { lower-bound, lower-bound + stride, lower-bound + 2 * stride,... ,
+/// lower-bound + ((length - 1) * stride) }
+/// \endcode
+/// The lower-bound and length must evaluate to non-negative integers.
+/// The stride must evaluate to a positive integer.
+/// When the size of the array dimension is not known, the length must be
+/// specified explicitly.
+/// When the stride is absent it defaults to 1.
+/// When the length is absent it defaults to ⌈(size − lower-bound)/stride⌉,
+/// where size is the size of the array dimension. When the lower-bound is
+/// absent it defaults to 0.
+///
+///
+/// OpenACC 3.3 [2.7.1 Data Specification in Data Clauses]
+/// In C and C++, a subarray is an array name followed by an extended array
+/// range specification in brackets, with start and length, such as
+///
+/// AA[2:n]
+///
+/// If the lower bound is missing, zero is used. If the length is missing and
+/// the array has known size, the size of the array is used; otherwise the
+/// length is required. The subarray AA[2:n] means elements AA[2], AA[3], . . .
+/// , AA[2+n-1]. In C and C++, a two dimensional array may be declared in at
+/// least four ways:
+///
+/// -Statically-sized array: float AA[100][200];
+/// -Pointer to statically sized rows: typedef float row[200]; row* BB;
+/// -Statically-sized array of pointers: float* CC[200];
+/// -Pointer to pointers: float** DD;
+///
+/// Each dimension may be statically sized, or a pointer to dynamically
+/// allocated memory. Each of these may be included in a data clause using
+/// subarray notation to specify a rectangular array:
+///
+/// -AA[2:n][0:200]
+/// -BB[2:n][0:m]
+/// -CC[2:n][0:m]
+/// -DD[2:n][0:m]
+///
+/// Multidimensional rectangular subarrays in C and C++ may be specified for any
+/// array with any combination of statically-sized or dynamically-allocated
+/// dimensions. For statically sized dimensions, all dimensions except the first
+/// must specify the whole extent to preserve the contiguous data restriction,
+/// discussed below. For dynamically allocated dimensions, the implementation
+/// will allocate pointers in device memory corresponding to the pointers in
+/// local memory and will fill in those pointers as appropriate.
+///
+/// In Fortran, a subarray is an array name followed by a comma-separated list
+/// of range specifications in parentheses, with lower and upper bound
+/// subscripts, such as
+///
+/// arr(1:high,low:100)
+///
+/// If either the lower or upper bounds are missing, the declared or allocated
+/// bounds of the array, if known, are used. All dimensions except the last must
+/// specify the whole extent, to preserve the contiguous data restriction,
+/// discussed below.
+///
+/// Restrictions
+///
+/// -In Fortran, the upper bound for the last dimension of an assumed-size dummy
+/// array must be specified.
+///
+/// -In C and C++, the length for dynamically allocated dimensions of an array
+/// must be explicitly specified.
+///
+/// -In C and C++, modifying pointers in pointer arrays during the data
+/// lifetime, either on the host or on the device, may result in undefined
+/// behavior.
+///
+/// -If a subarray  appears in a data clause, the implementation may choose to
+/// allocate memory for only that subarray on the accelerator.
+///
+/// -In Fortran, array pointers may appear, but pointer association is not
+/// preserved in device memory.
+///
+/// -Any array or subarray in a data clause, including Fortran array pointers,
+/// must be a contiguous section of memory, except for dynamic multidimensional
+/// C arrays.
+///
+/// -In C and C++, if a variable or array of composite type appears, all the
+/// data members of the struct or class are allocated and copied, as
+/// appropriate. If a composite member is a pointer type, the data addressed by
+/// that pointer are not implicitly copied.
+///
+/// -In Fortran, if a variable or array of composite type appears, all the
+/// members of that derived type are allocated and copied, as appropriate. If
+/// any member has the allocatable or pointer attribute, the data accessed
+/// through that member are not copied.
+///
+/// -If an expression is used in a subscript or subarray expression in a clause
+/// on a data construct, the same value is used when copying data at the end of
+/// the data region, even if the values of variables in the expression change
+/// during the data region.
+class ArraySectionExpr : public Expr {
+  friend class ASTStmtReader;
+  friend class ASTStmtWriter;
+
+public:
+  enum ArraySectionType { OMPArraySection, OpenACCArraySection };
+
+private:
+  enum {
+    BASE,
+    LOWER_BOUND,
+    LENGTH,
+    STRIDE,
+    END_EXPR,
+    OPENACC_END_EXPR = STRIDE
+  };
+
+  ArraySectionType ASType = OMPArraySection;
+  Stmt *SubExprs[END_EXPR] = {nullptr};
+  SourceLocation ColonLocFirst;
+  SourceLocation ColonLocSecond;
+  SourceLocation RBracketLoc;
+
+public:
+  // Constructor for OMP array sections, which include a 'stride'.
+  ArraySectionExpr(Expr *Base, Expr *LowerBound, Expr *Length, Expr *Stride,
+                   QualType Type, ExprValueKind VK, ExprObjectKind OK,
+                   SourceLocation ColonLocFirst, SourceLocation ColonLocSecond,
+                   SourceLocation RBracketLoc)
+      : Expr(ArraySectionExprClass, Type, VK, OK), ASType(OMPArraySection),
+        ColonLocFirst(ColonLocFirst), ColonLocSecond(ColonLocSecond),
+        RBracketLoc(RBracketLoc) {
+    setBase(Base);
+    setLowerBound(LowerBound);
+    setLength(Length);
+    setStride(Stride);
+    setDependence(computeDependence(this));
+  }
+
+  // Constructor for OpenACC sub-arrays, which do not permit a 'stride'.
+  ArraySectionExpr(Expr *Base, Expr *LowerBound, Expr *Length, QualType Type,
+                   ExprValueKind VK, ExprObjectKind OK, SourceLocation ColonLoc,
+                   SourceLocation RBracketLoc)
+      : Expr(ArraySectionExprClass, Type, VK, OK), ASType(OpenACCArraySection),
+        ColonLocFirst(ColonLoc), RBracketLoc(RBracketLoc) {
+    setBase(Base);
+    setLowerBound(LowerBound);
+    setLength(Length);
+    setDependence(computeDependence(this));
+  }
+
+  /// Create an empty array section expression.
+  explicit ArraySectionExpr(EmptyShell Shell)
+      : Expr(ArraySectionExprClass, Shell) {}
+
+  /// Return original type of the base expression for array section.
+  static QualType getBaseOriginalType(const Expr *Base);
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == ArraySectionExprClass;
+  }
+
+  bool isOMPArraySection() const { return ASType == OMPArraySection; }
+  bool isOpenACCArraySection() const { return ASType == OpenACCArraySection; }
+
+  /// Get base of the array section.
+  Expr *getBase() { return cast<Expr>(SubExprs[BASE]); }
+  const Expr *getBase() const { return cast<Expr>(SubExprs[BASE]); }
+
+  /// Get lower bound of array section.
+  Expr *getLowerBound() { return cast_or_null<Expr>(SubExprs[LOWER_BOUND]); }
+  const Expr *getLowerBound() const {
+    return cast_or_null<Expr>(SubExprs[LOWER_BOUND]);
+  }
+
+  /// Get length of array section.
+  Expr *getLength() { return cast_or_null<Expr>(SubExprs[LENGTH]); }
+  const Expr *getLength() const { return cast_or_null<Expr>(SubExprs[LENGTH]); }
+
+  /// Get stride of array section.
+  Expr *getStride() {
+    assert(ASType != OpenACCArraySection &&
+           "Stride not valid in OpenACC subarrays");
+    return cast_or_null<Expr>(SubExprs[STRIDE]);
+  }
+
+  const Expr *getStride() const {
+    assert(ASType != OpenACCArraySection &&
+           "Stride not valid in OpenACC subarrays");
+    return cast_or_null<Expr>(SubExprs[STRIDE]);
+  }
+
+  SourceLocation getBeginLoc() const LLVM_READONLY {
+    return getBase()->getBeginLoc();
+  }
+  SourceLocation getEndLoc() const LLVM_READONLY { return RBracketLoc; }
+
+  SourceLocation getColonLocFirst() const { return ColonLocFirst; }
+  SourceLocation getColonLocSecond() const {
+    assert(ASType != OpenACCArraySection &&
+           "second colon for stride not valid in OpenACC subarrays");
+    return ColonLocSecond;
+  }
+  SourceLocation getRBracketLoc() const { return RBracketLoc; }
+
+  SourceLocation getExprLoc() const LLVM_READONLY {
+    return getBase()->getExprLoc();
+  }
+
+  child_range children() {
+    return child_range(
+        &SubExprs[BASE],
+        &SubExprs[ASType == OMPArraySection ? END_EXPR : OPENACC_END_EXPR]);
+  }
+
+  const_child_range children() const {
+    return const_child_range(
+        &SubExprs[BASE],
+        &SubExprs[ASType == OMPArraySection ? END_EXPR : OPENACC_END_EXPR]);
+  }
+
+private:
+  /// Set base of the array section.
+  void setBase(Expr *E) { SubExprs[BASE] = E; }
+
+  /// Set lower bound of the array section.
+  void setLowerBound(Expr *E) { SubExprs[LOWER_BOUND] = E; }
+
+  /// Set length of the array section.
+  void setLength(Expr *E) { SubExprs[LENGTH] = E; }
+
+  /// Set length of the array section.
+  void setStride(Expr *E) {
+    assert(ASType != OpenACCArraySection &&
+           "Stride not valid in OpenACC subarrays");
+    SubExprs[STRIDE] = E;
+  }
+
+  void setColonLocFirst(SourceLocation L) { ColonLocFirst = L; }
+
+  void setColonLocSecond(SourceLocation L) {
+    assert(ASType != OpenACCArraySection &&
+           "second colon for stride not valid in OpenACC subarrays");
+    ColonLocSecond = L;
+  }
+  void setRBracketLoc(SourceLocation L) { RBracketLoc = L; }
+};
+
 /// Frontend produces RecoveryExprs on semantic errors that prevent creating
 /// other well-formed expressions. E.g. when type-checking of a binary operator
 /// fails, we cannot produce a BinaryOperator expression. Instead, we can choose
diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h
index a915745d2d73..ab3f810b4519 100644
--- a/clang/include/clang/AST/ExprCXX.h
+++ b/clang/include/clang/AST/ExprCXX.h
@@ -1482,6 +1482,8 @@ public:
 ///     const S &s_ref = S(); // Requires a CXXBindTemporaryExpr.
 ///   }
 /// \endcode
+///
+/// Destructor might be null if destructor declaration is not valid.
 class CXXBindTemporaryExpr : public Expr {
   CXXTemporary *Temp = nullptr;
   Stmt *SubExpr = nullptr;
diff --git a/clang/include/clang/AST/ExprOpenMP.h b/clang/include/clang/AST/ExprOpenMP.h
index be5b1f3fdd11..54a0c203f656 100644
--- a/clang/include/clang/AST/ExprOpenMP.h
+++ b/clang/include/clang/AST/ExprOpenMP.h
@@ -17,130 +17,6 @@
 #include "clang/AST/Expr.h"
 
 namespace clang {
-/// OpenMP 5.0 [2.1.5, Array Sections].
-/// To specify an array section in an OpenMP construct, array subscript
-/// expressions are extended with the following syntax:
-/// \code
-/// [ lower-bound : length : stride ]
-/// [ lower-bound : length : ]
-/// [ lower-bound : length ]
-/// [ lower-bound : : stride ]
-/// [ lower-bound : : ]
-/// [ lower-bound : ]
-/// [ : length : stride ]
-/// [ : length : ]
-/// [ : length ]
-/// [ : : stride ]
-/// [ : : ]
-/// [ : ]
-/// \endcode
-/// The array section must be a subset of the original array.
-/// Array sections are allowed on multidimensional arrays. Base language array
-/// subscript expressions can be used to specify length-one dimensions of
-/// multidimensional array sections.
-/// Each of the lower-bound, length, and stride expressions if specified must be
-/// an integral type expressions of the base language. When evaluated
-/// they represent a set of integer values as follows:
-/// \code
-/// { lower-bound, lower-bound + stride, lower-bound + 2 * stride,... ,
-/// lower-bound + ((length - 1) * stride) }
-/// \endcode
-/// The lower-bound and length must evaluate to non-negative integers.
-/// The stride must evaluate to a positive integer.
-/// When the size of the array dimension is not known, the length must be
-/// specified explicitly.
-/// When the stride is absent it defaults to 1.
-/// When the length is absent it defaults to ⌈(size − lower-bound)/stride⌉,
-/// where size is the size of the array dimension. When the lower-bound is
-/// absent it defaults to 0.
-class OMPArraySectionExpr : public Expr {
-  enum { BASE, LOWER_BOUND, LENGTH, STRIDE, END_EXPR };
-  Stmt *SubExprs[END_EXPR];
-  SourceLocation ColonLocFirst;
-  SourceLocation ColonLocSecond;
-  SourceLocation RBracketLoc;
-
-public:
-  OMPArraySectionExpr(Expr *Base, Expr *LowerBound, Expr *Length, Expr *Stride,
-                      QualType Type, ExprValueKind VK, ExprObjectKind OK,
-                      SourceLocation ColonLocFirst,
-                      SourceLocation ColonLocSecond, SourceLocation RBracketLoc)
-      : Expr(OMPArraySectionExprClass, Type, VK, OK),
-        ColonLocFirst(ColonLocFirst), ColonLocSecond(ColonLocSecond),
-        RBracketLoc(RBracketLoc) {
-    SubExprs[BASE] = Base;
-    SubExprs[LOWER_BOUND] = LowerBound;
-    SubExprs[LENGTH] = Length;
-    SubExprs[STRIDE] = Stride;
-    setDependence(computeDependence(this));
-  }
-
-  /// Create an empty array section expression.
-  explicit OMPArraySectionExpr(EmptyShell Shell)
-      : Expr(OMPArraySectionExprClass, Shell) {}
-
-  /// An array section can be written only as Base[LowerBound:Length].
-
-  /// Get base of the array section.
-  Expr *getBase() { return cast<Expr>(SubExprs[BASE]); }
-  const Expr *getBase() const { return cast<Expr>(SubExprs[BASE]); }
-  /// Set base of the array section.
-  void setBase(Expr *E) { SubExprs[BASE] = E; }
-
-  /// Return original type of the base expression for array section.
-  static QualType getBaseOriginalType(const Expr *Base);
-
-  /// Get lower bound of array section.
-  Expr *getLowerBound() { return cast_or_null<Expr>(SubExprs[LOWER_BOUND]); }
-  const Expr *getLowerBound() const {
-    return cast_or_null<Expr>(SubExprs[LOWER_BOUND]);
-  }
-  /// Set lower bound of the array section.
-  void setLowerBound(Expr *E) { SubExprs[LOWER_BOUND] = E; }
-
-  /// Get length of array section.
-  Expr *getLength() { return cast_or_null<Expr>(SubExprs[LENGTH]); }
-  const Expr *getLength() const { return cast_or_null<Expr>(SubExprs[LENGTH]); }
-  /// Set length of the array section.
-  void setLength(Expr *E) { SubExprs[LENGTH] = E; }
-
-  /// Get stride of array section.
-  Expr *getStride() { return cast_or_null<Expr>(SubExprs[STRIDE]); }
-  const Expr *getStride() const { return cast_or_null<Expr>(SubExprs[STRIDE]); }
-  /// Set length of the array section.
-  void setStride(Expr *E) { SubExprs[STRIDE] = E; }
-
-  SourceLocation getBeginLoc() const LLVM_READONLY {
-    return getBase()->getBeginLoc();
-  }
-  SourceLocation getEndLoc() const LLVM_READONLY { return RBracketLoc; }
-
-  SourceLocation getColonLocFirst() const { return ColonLocFirst; }
-  void setColonLocFirst(SourceLocation L) { ColonLocFirst = L; }
-
-  SourceLocation getColonLocSecond() const { return ColonLocSecond; }
-  void setColonLocSecond(SourceLocation L) { ColonLocSecond = L; }
-
-  SourceLocation getRBracketLoc() const { return RBracketLoc; }
-  void setRBracketLoc(SourceLocation L) { RBracketLoc = L; }
-
-  SourceLocation getExprLoc() const LLVM_READONLY {
-    return getBase()->getExprLoc();
-  }
-
-  static bool classof(const Stmt *T) {
-    return T->getStmtClass() == OMPArraySectionExprClass;
-  }
-
-  child_range children() {
-    return child_range(&SubExprs[BASE], &SubExprs[END_EXPR]);
-  }
-
-  const_child_range children() const {
-    return const_child_range(&SubExprs[BASE], &SubExprs[END_EXPR]);
-  }
-};
-
 /// An explicit cast in C or a C-style cast in C++, which uses the syntax
 /// ([s1][s2]...[sn])expr. For example: @c ([3][3])f.
 class OMPArrayShapingExpr final
diff --git a/clang/include/clang/AST/ExternalASTSource.h b/clang/include/clang/AST/ExternalASTSource.h
index eee8d6b6c6ef..385c32edbae0 100644
--- a/clang/include/clang/AST/ExternalASTSource.h
+++ b/clang/include/clang/AST/ExternalASTSource.h
@@ -99,7 +99,7 @@ public:
   /// passes back decl sets as VisibleDeclaration objects.
   ///
   /// The default implementation of this method is a no-op.
-  virtual Decl *GetExternalDecl(Decl::DeclID ID);
+  virtual Decl *GetExternalDecl(GlobalDeclID ID);
 
   /// Resolve a selector ID into a selector.
   ///
@@ -375,7 +375,7 @@ public:
     if (isOffset()) {
       assert(Source &&
              "Cannot deserialize a lazy pointer without an AST source");
-      Ptr = reinterpret_cast<uint64_t>((Source->*Get)(Ptr >> 1));
+      Ptr = reinterpret_cast<uint64_t>((Source->*Get)(OffsT(Ptr >> 1)));
     }
     return reinterpret_cast<T*>(Ptr);
   }
@@ -579,7 +579,7 @@ using LazyDeclStmtPtr =
 
 /// A lazy pointer to a declaration.
 using LazyDeclPtr =
-    LazyOffsetPtr<Decl, Decl::DeclID, &ExternalASTSource::GetExternalDecl>;
+    LazyOffsetPtr<Decl, GlobalDeclID, &ExternalASTSource::GetExternalDecl>;
 
 /// A lazy pointer to a set of CXXCtorInitializers.
 using LazyCXXCtorInitializersPtr =
diff --git a/clang/include/clang/AST/NestedNameSpecifier.h b/clang/include/clang/AST/NestedNameSpecifier.h
index 7b0c21b9e7cf..a1d9e30e660d 100644
--- a/clang/include/clang/AST/NestedNameSpecifier.h
+++ b/clang/include/clang/AST/NestedNameSpecifier.h
@@ -266,7 +266,7 @@ public:
   explicit operator bool() const { return Qualifier; }
 
   /// Evaluates true when this nested-name-specifier location is
-  /// empty.
+  /// non-empty.
   bool hasQualifier() const { return Qualifier; }
 
   /// Retrieve the nested-name-specifier to which this instance
diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h
index 277a351c49fc..bb4cb1f5d508 100644
--- a/clang/include/clang/AST/OpenACCClause.h
+++ b/clang/include/clang/AST/OpenACCClause.h
@@ -156,51 +156,50 @@ public:
                                    Expr *ConditionExpr, SourceLocation EndLoc);
 };
 
-/// Represents a clause that has one or more IntExprs.  It does not own the
-/// IntExprs, but provides 'children' and other accessors.
-class OpenACCClauseWithIntExprs : public OpenACCClauseWithParams {
-  MutableArrayRef<Expr *> IntExprs;
+/// Represents a clause that has one or more expressions associated with it.
+class OpenACCClauseWithExprs : public OpenACCClauseWithParams {
+  MutableArrayRef<Expr *> Exprs;
 
 protected:
-  OpenACCClauseWithIntExprs(OpenACCClauseKind K, SourceLocation BeginLoc,
-                            SourceLocation LParenLoc, SourceLocation EndLoc)
+  OpenACCClauseWithExprs(OpenACCClauseKind K, SourceLocation BeginLoc,
+                         SourceLocation LParenLoc, SourceLocation EndLoc)
       : OpenACCClauseWithParams(K, BeginLoc, LParenLoc, EndLoc) {}
 
   /// Used only for initialization, the leaf class can initialize this to
   /// trailing storage.
-  void setIntExprs(MutableArrayRef<Expr *> NewIntExprs) {
-    assert(IntExprs.empty() && "Cannot change IntExprs list");
-    IntExprs = NewIntExprs;
+  void setExprs(MutableArrayRef<Expr *> NewExprs) {
+    assert(Exprs.empty() && "Cannot change Exprs list");
+    Exprs = NewExprs;
   }
 
-  /// Gets the entire list of integer expressions, but leave it to the
+  /// Gets the entire list of expressions, but leave it to the
   /// individual clauses to expose this how they'd like.
-  llvm::ArrayRef<Expr *> getIntExprs() const { return IntExprs; }
+  llvm::ArrayRef<Expr *> getExprs() const { return Exprs; }
 
 public:
   child_range children() {
-    return child_range(reinterpret_cast<Stmt **>(IntExprs.begin()),
-                       reinterpret_cast<Stmt **>(IntExprs.end()));
+    return child_range(reinterpret_cast<Stmt **>(Exprs.begin()),
+                       reinterpret_cast<Stmt **>(Exprs.end()));
   }
 
   const_child_range children() const {
     child_range Children =
-        const_cast<OpenACCClauseWithIntExprs *>(this)->children();
+        const_cast<OpenACCClauseWithExprs *>(this)->children();
     return const_child_range(Children.begin(), Children.end());
   }
 };
 
 class OpenACCNumGangsClause final
-    : public OpenACCClauseWithIntExprs,
+    : public OpenACCClauseWithExprs,
       public llvm::TrailingObjects<OpenACCNumGangsClause, Expr *> {
 
   OpenACCNumGangsClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
                         ArrayRef<Expr *> IntExprs, SourceLocation EndLoc)
-      : OpenACCClauseWithIntExprs(OpenACCClauseKind::NumGangs, BeginLoc,
-                                  LParenLoc, EndLoc) {
+      : OpenACCClauseWithExprs(OpenACCClauseKind::NumGangs, BeginLoc, LParenLoc,
+                               EndLoc) {
     std::uninitialized_copy(IntExprs.begin(), IntExprs.end(),
                             getTrailingObjects<Expr *>());
-    setIntExprs(MutableArrayRef(getTrailingObjects<Expr *>(), IntExprs.size()));
+    setExprs(MutableArrayRef(getTrailingObjects<Expr *>(), IntExprs.size()));
   }
 
 public:
@@ -209,35 +208,35 @@ public:
          ArrayRef<Expr *> IntExprs, SourceLocation EndLoc);
 
   llvm::ArrayRef<Expr *> getIntExprs() {
-    return OpenACCClauseWithIntExprs::getIntExprs();
+    return OpenACCClauseWithExprs::getExprs();
   }
 
   llvm::ArrayRef<Expr *> getIntExprs() const {
-    return OpenACCClauseWithIntExprs::getIntExprs();
+    return OpenACCClauseWithExprs::getExprs();
   }
 };
 
 /// Represents one of a handful of clauses that have a single integer
 /// expression.
-class OpenACCClauseWithSingleIntExpr : public OpenACCClauseWithIntExprs {
+class OpenACCClauseWithSingleIntExpr : public OpenACCClauseWithExprs {
   Expr *IntExpr;
 
 protected:
   OpenACCClauseWithSingleIntExpr(OpenACCClauseKind K, SourceLocation BeginLoc,
                                  SourceLocation LParenLoc, Expr *IntExpr,
                                  SourceLocation EndLoc)
-      : OpenACCClauseWithIntExprs(K, BeginLoc, LParenLoc, EndLoc),
+      : OpenACCClauseWithExprs(K, BeginLoc, LParenLoc, EndLoc),
         IntExpr(IntExpr) {
-    setIntExprs(MutableArrayRef<Expr *>{&this->IntExpr, 1});
+    setExprs(MutableArrayRef<Expr *>{&this->IntExpr, 1});
   }
 
 public:
-  bool hasIntExpr() const { return !getIntExprs().empty(); }
+  bool hasIntExpr() const { return !getExprs().empty(); }
   const Expr *getIntExpr() const {
-    return hasIntExpr() ? getIntExprs()[0] : nullptr;
+    return hasIntExpr() ? getExprs()[0] : nullptr;
   }
 
-  Expr *getIntExpr() { return hasIntExpr() ? getIntExprs()[0] : nullptr; };
+  Expr *getIntExpr() { return hasIntExpr() ? getExprs()[0] : nullptr; };
 };
 
 class OpenACCNumWorkersClause : public OpenACCClauseWithSingleIntExpr {
@@ -261,6 +260,40 @@ public:
          Expr *IntExpr, SourceLocation EndLoc);
 };
 
+/// Represents a clause with one or more 'var' objects, represented as an expr,
+/// as its arguments. Var-list is expected to be stored in trailing storage.
+/// For now, we're just storing the original expression in its entirety, unlike
+/// OMP which has to do a bunch of work to create a private.
+class OpenACCClauseWithVarList : public OpenACCClauseWithExprs {
+protected:
+  OpenACCClauseWithVarList(OpenACCClauseKind K, SourceLocation BeginLoc,
+                           SourceLocation LParenLoc, SourceLocation EndLoc)
+      : OpenACCClauseWithExprs(K, BeginLoc, LParenLoc, EndLoc) {}
+
+public:
+  ArrayRef<Expr *> getVarList() { return getExprs(); }
+  ArrayRef<Expr *> getVarList() const { return getExprs(); }
+};
+
+class OpenACCPrivateClause final
+    : public OpenACCClauseWithVarList,
+      public llvm::TrailingObjects<OpenACCPrivateClause, Expr *> {
+
+  OpenACCPrivateClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
+                       ArrayRef<Expr *> VarList, SourceLocation EndLoc)
+      : OpenACCClauseWithVarList(OpenACCClauseKind::Private, BeginLoc,
+                                 LParenLoc, EndLoc) {
+    std::uninitialized_copy(VarList.begin(), VarList.end(),
+                            getTrailingObjects<Expr *>());
+    setExprs(MutableArrayRef(getTrailingObjects<Expr *>(), VarList.size()));
+  }
+
+public:
+  static OpenACCPrivateClause *
+  Create(const ASTContext &C, SourceLocation BeginLoc, SourceLocation LParenLoc,
+         ArrayRef<Expr *> VarList, SourceLocation EndLoc);
+};
+
 template <class Impl> class OpenACCClauseVisitor {
   Impl &getDerived() { return static_cast<Impl &>(*this); }
 
@@ -299,6 +332,9 @@ public:
 class OpenACCClausePrinter final
     : public OpenACCClauseVisitor<OpenACCClausePrinter> {
   raw_ostream &OS;
+  const PrintingPolicy &Policy;
+
+  void printExpr(const Expr *E);
 
 public:
   void VisitClauseList(ArrayRef<const OpenACCClause *> List) {
@@ -309,7 +345,8 @@ public:
         OS << ' ';
     }
   }
-  OpenACCClausePrinter(raw_ostream &OS) : OS(OS) {}
+  OpenACCClausePrinter(raw_ostream &OS, const PrintingPolicy &Policy)
+      : OS(OS), Policy(Policy) {}
 
 #define VISIT_CLAUSE(CLAUSE_NAME)                                              \
   void Visit##CLAUSE_NAME##Clause(const OpenACC##CLAUSE_NAME##Clause &Clause);
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 7eb92e304a38..f9b145b4e86a 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -2740,7 +2740,7 @@ DEF_TRAVERSE_STMT(CXXMemberCallExpr, {})
 DEF_TRAVERSE_STMT(AddrLabelExpr, {})
 DEF_TRAVERSE_STMT(ArraySubscriptExpr, {})
 DEF_TRAVERSE_STMT(MatrixSubscriptExpr, {})
-DEF_TRAVERSE_STMT(OMPArraySectionExpr, {})
+DEF_TRAVERSE_STMT(ArraySectionExpr, {})
 DEF_TRAVERSE_STMT(OMPArrayShapingExpr, {})
 DEF_TRAVERSE_STMT(OMPIteratorExpr, {})
 
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 99f45d518c79..e6643469e0b3 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -25,8 +25,10 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/ExceptionSpecificationType.h"
 #include "clang/Basic/LLVM.h"
+#include "clang/Basic/LangOptions.h"
 #include "clang/Basic/Linkage.h"
 #include "clang/Basic/PartialDiagnostic.h"
+#include "clang/Basic/PointerAuthOptions.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/Specifiers.h"
 #include "clang/Basic/Visibility.h"
@@ -139,6 +141,174 @@ using CanQualType = CanQual<Type>;
 #define TYPE(Class, Base) class Class##Type;
 #include "clang/AST/TypeNodes.inc"
 
+/// Pointer-authentication qualifiers.
+class PointerAuthQualifier {
+  enum : uint32_t {
+    EnabledShift = 0,
+    EnabledBits = 1,
+    EnabledMask = 1 << EnabledShift,
+    AddressDiscriminatedShift = EnabledShift + EnabledBits,
+    AddressDiscriminatedBits = 1,
+    AddressDiscriminatedMask = 1 << AddressDiscriminatedShift,
+    AuthenticationModeShift =
+        AddressDiscriminatedShift + AddressDiscriminatedBits,
+    AuthenticationModeBits = 2,
+    AuthenticationModeMask = ((1 << AuthenticationModeBits) - 1)
+                             << AuthenticationModeShift,
+    IsaPointerShift = AuthenticationModeShift + AuthenticationModeBits,
+    IsaPointerBits = 1,
+    IsaPointerMask = ((1 << IsaPointerBits) - 1) << IsaPointerShift,
+    AuthenticatesNullValuesShift = IsaPointerShift + IsaPointerBits,
+    AuthenticatesNullValuesBits = 1,
+    AuthenticatesNullValuesMask = ((1 << AuthenticatesNullValuesBits) - 1)
+                                  << AuthenticatesNullValuesShift,
+    KeyShift = AuthenticatesNullValuesShift + AuthenticatesNullValuesBits,
+    KeyBits = 10,
+    KeyMask = ((1 << KeyBits) - 1) << KeyShift,
+    DiscriminatorShift = KeyShift + KeyBits,
+    DiscriminatorBits = 16,
+    DiscriminatorMask = ((1u << DiscriminatorBits) - 1) << DiscriminatorShift,
+  };
+
+  // bits:     |0      |1      |2..3              |4          |
+  //           |Enabled|Address|AuthenticationMode|ISA pointer|
+  // bits:     |5                |6..15|   16...31   |
+  //           |AuthenticatesNull|Key  |Discriminator|
+  uint32_t Data = 0;
+
+  // The following static assertions check that each of the 32 bits is present
+  // exactly in one of the constants.
+  static_assert((EnabledBits + AddressDiscriminatedBits +
+                 AuthenticationModeBits + IsaPointerBits +
+                 AuthenticatesNullValuesBits + KeyBits + DiscriminatorBits) ==
+                    32,
+                "PointerAuthQualifier should be exactly 32 bits");
+  static_assert((EnabledMask + AddressDiscriminatedMask +
+                 AuthenticationModeMask + IsaPointerMask +
+                 AuthenticatesNullValuesMask + KeyMask + DiscriminatorMask) ==
+                    0xFFFFFFFF,
+                "All masks should cover the entire bits");
+  static_assert((EnabledMask ^ AddressDiscriminatedMask ^
+                 AuthenticationModeMask ^ IsaPointerMask ^
+                 AuthenticatesNullValuesMask ^ KeyMask ^ DiscriminatorMask) ==
+                    0xFFFFFFFF,
+                "All masks should cover the entire bits");
+
+  PointerAuthQualifier(unsigned Key, bool IsAddressDiscriminated,
+                       unsigned ExtraDiscriminator,
+                       PointerAuthenticationMode AuthenticationMode,
+                       bool IsIsaPointer, bool AuthenticatesNullValues)
+      : Data(EnabledMask |
+             (IsAddressDiscriminated
+                  ? llvm::to_underlying(AddressDiscriminatedMask)
+                  : 0) |
+             (Key << KeyShift) |
+             (llvm::to_underlying(AuthenticationMode)
+              << AuthenticationModeShift) |
+             (ExtraDiscriminator << DiscriminatorShift) |
+             (IsIsaPointer << IsaPointerShift) |
+             (AuthenticatesNullValues << AuthenticatesNullValuesShift)) {
+    assert(Key <= KeyNoneInternal);
+    assert(ExtraDiscriminator <= MaxDiscriminator);
+    assert((Data == 0) ==
+           (getAuthenticationMode() == PointerAuthenticationMode::None));
+  }
+
+public:
+  enum {
+    KeyNoneInternal = (1u << KeyBits) - 1,
+
+    /// The maximum supported pointer-authentication key.
+    MaxKey = KeyNoneInternal - 1,
+
+    /// The maximum supported pointer-authentication discriminator.
+    MaxDiscriminator = (1u << DiscriminatorBits) - 1
+  };
+
+public:
+  PointerAuthQualifier() = default;
+
+  static PointerAuthQualifier
+  Create(unsigned Key, bool IsAddressDiscriminated, unsigned ExtraDiscriminator,
+         PointerAuthenticationMode AuthenticationMode, bool IsIsaPointer,
+         bool AuthenticatesNullValues) {
+    if (Key == PointerAuthKeyNone)
+      Key = KeyNoneInternal;
+    assert(Key <= KeyNoneInternal && "out-of-range key value");
+    return PointerAuthQualifier(Key, IsAddressDiscriminated, ExtraDiscriminator,
+                                AuthenticationMode, IsIsaPointer,
+                                AuthenticatesNullValues);
+  }
+
+  bool isPresent() const {
+    assert((Data == 0) ==
+           (getAuthenticationMode() == PointerAuthenticationMode::None));
+    return Data != 0;
+  }
+
+  explicit operator bool() const { return isPresent(); }
+
+  unsigned getKey() const {
+    assert(isPresent());
+    return (Data & KeyMask) >> KeyShift;
+  }
+
+  bool hasKeyNone() const { return isPresent() && getKey() == KeyNoneInternal; }
+
+  bool isAddressDiscriminated() const {
+    assert(isPresent());
+    return (Data & AddressDiscriminatedMask) >> AddressDiscriminatedShift;
+  }
+
+  unsigned getExtraDiscriminator() const {
+    assert(isPresent());
+    return (Data >> DiscriminatorShift);
+  }
+
+  PointerAuthenticationMode getAuthenticationMode() const {
+    return PointerAuthenticationMode((Data & AuthenticationModeMask) >>
+                                     AuthenticationModeShift);
+  }
+
+  bool isIsaPointer() const {
+    assert(isPresent());
+    return (Data & IsaPointerMask) >> IsaPointerShift;
+  }
+
+  bool authenticatesNullValues() const {
+    assert(isPresent());
+    return (Data & AuthenticatesNullValuesMask) >> AuthenticatesNullValuesShift;
+  }
+
+  PointerAuthQualifier withoutKeyNone() const {
+    return hasKeyNone() ? PointerAuthQualifier() : *this;
+  }
+
+  friend bool operator==(PointerAuthQualifier Lhs, PointerAuthQualifier Rhs) {
+    return Lhs.Data == Rhs.Data;
+  }
+  friend bool operator!=(PointerAuthQualifier Lhs, PointerAuthQualifier Rhs) {
+    return Lhs.Data != Rhs.Data;
+  }
+
+  bool isEquivalent(PointerAuthQualifier Other) const {
+    return withoutKeyNone() == Other.withoutKeyNone();
+  }
+
+  uint32_t getAsOpaqueValue() const { return Data; }
+
+  // Deserialize pointer-auth qualifiers from an opaque representation.
+  static PointerAuthQualifier fromOpaqueValue(uint32_t Opaque) {
+    PointerAuthQualifier Result;
+    Result.Data = Opaque;
+    assert((Result.Data == 0) ==
+           (Result.getAuthenticationMode() == PointerAuthenticationMode::None));
+    return Result;
+  }
+
+  void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddInteger(Data); }
+};
+
 /// The collection of all-type qualifiers we support.
 /// Clang supports five independent qualifiers:
 /// * C99: const, volatile, and restrict
@@ -147,8 +317,9 @@ using CanQualType = CanQual<Type>;
 /// * Objective C: the GC attributes (none, weak, or strong)
 class Qualifiers {
 public:
-  enum TQ { // NOTE: These flags must be kept in sync with DeclSpec::TQ.
-    Const    = 0x1,
+  enum TQ : uint64_t {
+    // NOTE: These flags must be kept in sync with DeclSpec::TQ.
+    Const = 0x1,
     Restrict = 0x2,
     Volatile = 0x4,
     CVRMask = Const | Volatile | Restrict
@@ -182,7 +353,7 @@ public:
     OCL_Autoreleasing
   };
 
-  enum {
+  enum : uint64_t {
     /// The maximum supported address space number.
     /// 23 bits should be enough for anyone.
     MaxAddressSpace = 0x7fffffu,
@@ -197,16 +368,25 @@ public:
   /// Returns the common set of qualifiers while removing them from
   /// the given sets.
   static Qualifiers removeCommonQualifiers(Qualifiers &L, Qualifiers &R) {
+    Qualifiers Q;
+    PointerAuthQualifier LPtrAuth = L.getPointerAuth();
+    if (LPtrAuth.isPresent() &&
+        LPtrAuth.getKey() != PointerAuthQualifier::KeyNoneInternal &&
+        LPtrAuth == R.getPointerAuth()) {
+      Q.setPointerAuth(LPtrAuth);
+      PointerAuthQualifier Empty;
+      L.setPointerAuth(Empty);
+      R.setPointerAuth(Empty);
+    }
+
     // If both are only CVR-qualified, bit operations are sufficient.
     if (!(L.Mask & ~CVRMask) && !(R.Mask & ~CVRMask)) {
-      Qualifiers Q;
       Q.Mask = L.Mask & R.Mask;
       L.Mask &= ~Q.Mask;
       R.Mask &= ~Q.Mask;
       return Q;
     }
 
-    Qualifiers Q;
     unsigned CommonCRV = L.getCVRQualifiers() & R.getCVRQualifiers();
     Q.addCVRQualifiers(CommonCRV);
     L.removeCVRQualifiers(CommonCRV);
@@ -251,16 +431,14 @@ public:
   }
 
   // Deserialize qualifiers from an opaque representation.
-  static Qualifiers fromOpaqueValue(unsigned opaque) {
+  static Qualifiers fromOpaqueValue(uint64_t opaque) {
     Qualifiers Qs;
     Qs.Mask = opaque;
     return Qs;
   }
 
   // Serialize these qualifiers into an opaque representation.
-  unsigned getAsOpaqueValue() const {
-    return Mask;
-  }
+  uint64_t getAsOpaqueValue() const { return Mask; }
 
   bool hasConst() const { return Mask & Const; }
   bool hasOnlyConst() const { return Mask == Const; }
@@ -302,7 +480,7 @@ public:
   }
   void removeCVRQualifiers(unsigned mask) {
     assert(!(mask & ~CVRMask) && "bitmask contains non-CVR bits");
-    Mask &= ~mask;
+    Mask &= ~static_cast<uint64_t>(mask);
   }
   void removeCVRQualifiers() {
     removeCVRQualifiers(CVRMask);
@@ -407,6 +585,20 @@ public:
     setAddressSpace(space);
   }
 
+  bool hasPointerAuth() const { return Mask & PtrAuthMask; }
+  PointerAuthQualifier getPointerAuth() const {
+    return PointerAuthQualifier::fromOpaqueValue(Mask >> PtrAuthShift);
+  }
+  void setPointerAuth(PointerAuthQualifier Q) {
+    Mask = (Mask & ~PtrAuthMask) |
+           (uint64_t(Q.getAsOpaqueValue()) << PtrAuthShift);
+  }
+  void removePointerAuth() { Mask &= ~PtrAuthMask; }
+  void addPointerAuth(PointerAuthQualifier Q) {
+    assert(Q.isPresent());
+    setPointerAuth(Q);
+  }
+
   // Fast qualifiers are those that can be allocated directly
   // on a QualType object.
   bool hasFastQualifiers() const { return getFastQualifiers(); }
@@ -417,7 +609,7 @@ public:
   }
   void removeFastQualifiers(unsigned mask) {
     assert(!(mask & ~FastMask) && "bitmask contains non-fast qualifier bits");
-    Mask &= ~mask;
+    Mask &= ~static_cast<uint64_t>(mask);
   }
   void removeFastQualifiers() {
     removeFastQualifiers(FastMask);
@@ -454,6 +646,8 @@ public:
         addObjCGCAttr(Q.getObjCGCAttr());
       if (Q.hasObjCLifetime())
         addObjCLifetime(Q.getObjCLifetime());
+      if (Q.hasPointerAuth())
+        addPointerAuth(Q.getPointerAuth());
     }
   }
 
@@ -471,6 +665,8 @@ public:
         removeObjCLifetime();
       if (getAddressSpace() == Q.getAddressSpace())
         removeAddressSpace();
+      if (getPointerAuth() == Q.getPointerAuth())
+        removePointerAuth();
     }
   }
 
@@ -483,6 +679,8 @@ public:
            !hasObjCGCAttr() || !qs.hasObjCGCAttr());
     assert(getObjCLifetime() == qs.getObjCLifetime() ||
            !hasObjCLifetime() || !qs.hasObjCLifetime());
+    assert(!hasPointerAuth() || !qs.hasPointerAuth() ||
+           getPointerAuth() == qs.getPointerAuth());
     Mask |= qs.Mask;
   }
 
@@ -536,6 +734,8 @@ public:
            // be changed.
            (getObjCGCAttr() == other.getObjCGCAttr() || !hasObjCGCAttr() ||
             !other.hasObjCGCAttr()) &&
+           // Pointer-auth qualifiers must match exactly.
+           getPointerAuth() == other.getPointerAuth() &&
            // ObjC lifetime qualifiers must match exactly.
            getObjCLifetime() == other.getObjCLifetime() &&
            // CVR qualifiers may subset.
@@ -605,24 +805,26 @@ public:
   void print(raw_ostream &OS, const PrintingPolicy &Policy,
              bool appendSpaceIfNonEmpty = false) const;
 
-  void Profile(llvm::FoldingSetNodeID &ID) const {
-    ID.AddInteger(Mask);
-  }
+  void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddInteger(Mask); }
 
 private:
-  // bits:     |0 1 2|3|4 .. 5|6  ..  8|9   ...   31|
-  //           |C R V|U|GCAttr|Lifetime|AddressSpace|
-  uint32_t Mask = 0;
-
-  static const uint32_t UMask = 0x8;
-  static const uint32_t UShift = 3;
-  static const uint32_t GCAttrMask = 0x30;
-  static const uint32_t GCAttrShift = 4;
-  static const uint32_t LifetimeMask = 0x1C0;
-  static const uint32_t LifetimeShift = 6;
-  static const uint32_t AddressSpaceMask =
+  // bits:     |0 1 2|3|4 .. 5|6  ..  8|9   ...   31|32 ... 63|
+  //           |C R V|U|GCAttr|Lifetime|AddressSpace| PtrAuth |
+  uint64_t Mask = 0;
+  static_assert(sizeof(PointerAuthQualifier) == sizeof(uint32_t),
+                "PointerAuthQualifier must be 32 bits");
+
+  static constexpr uint64_t UMask = 0x8;
+  static constexpr uint64_t UShift = 3;
+  static constexpr uint64_t GCAttrMask = 0x30;
+  static constexpr uint64_t GCAttrShift = 4;
+  static constexpr uint64_t LifetimeMask = 0x1C0;
+  static constexpr uint64_t LifetimeShift = 6;
+  static constexpr uint64_t AddressSpaceMask =
       ~(CVRMask | UMask | GCAttrMask | LifetimeMask);
-  static const uint32_t AddressSpaceShift = 9;
+  static constexpr uint64_t AddressSpaceShift = 9;
+  static constexpr uint64_t PtrAuthShift = 32;
+  static constexpr uint64_t PtrAuthMask = uint64_t(0xffffffff) << PtrAuthShift;
 };
 
 class QualifiersAndAtomic {
@@ -1242,6 +1444,10 @@ public:
   // true when Type is objc's weak and weak is enabled but ARC isn't.
   bool isNonWeakInMRRWithObjCWeak(const ASTContext &Context) const;
 
+  PointerAuthQualifier getPointerAuth() const {
+    return getQualifiers().getPointerAuth();
+  }
+
   enum PrimitiveDefaultInitializeKind {
     /// The type does not fall into any of the following categories. Note that
     /// this case is zero-valued so that values of this enum can be used as a
@@ -2172,6 +2378,10 @@ public:
   /// 'riscv_rvv_vector_bits' type attribute as VectorType.
   QualType getRVVEltType(const ASTContext &Ctx) const;
 
+  /// Returns the representative type for the element of a sizeless vector
+  /// builtin type.
+  QualType getSizelessVectorEltType(const ASTContext &Ctx) const;
+
   /// Types are partitioned into 3 broad categories (C99 6.2.5p1):
   /// object types, function types, and incomplete types.
 
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 4408d517e70e..0225598cbbe8 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -592,6 +592,49 @@ class AttrSubjectMatcherAggregateRule<AttrSubject subject> {
 
 def SubjectMatcherForNamed : AttrSubjectMatcherAggregateRule<Named>;
 
+// Enumeration specifying what kind of behavior should be used for late
+// parsing of attributes.
+class LateAttrParseKind <int val> {
+  int Kind = val;
+}
+
+// Never late parsed
+def LateAttrParseNever : LateAttrParseKind<0>;
+
+// Standard late attribute parsing
+//
+// This is language dependent. For example:
+//
+// * For C++ enables late parsing of a declaration attributes
+// * For C does not enable late parsing of attributes
+//
+def LateAttrParseStandard : LateAttrParseKind<1>;
+
+// Experimental extension to standard late attribute parsing
+//
+// This extension behaves like `LateAttrParseStandard` but allows
+// late parsing attributes in more contexts.
+//
+// In contexts where `LateAttrParseStandard` attributes are late
+// parsed, `LateAttrParseExperimentalExt` attributes will also
+// be late parsed.
+//
+// In contexts that only late parse `LateAttrParseExperimentalExt` attributes
+// (see `LateParsedAttrList::lateAttrParseExperimentalExtOnly()`)
+//
+// * If `-fexperimental-late-parse-attributes`
+//   (`LangOpts.ExperimentalLateParseAttributes`) is enabled the attribute
+//   will be late parsed.
+// * If `-fexperimental-late-parse-attributes`
+//   (`LangOpts.ExperimentalLateParseAttributes`) is disabled the attribute
+//   will **not** be late parsed (i.e parsed immediately).
+//
+// The following contexts are supported:
+//
+// * TODO: Add contexts here when they are implemented.
+//
+def LateAttrParseExperimentalExt : LateAttrParseKind<2>;
+
 class Attr {
   // The various ways in which an attribute can be spelled in source
   list<Spelling> Spellings;
@@ -603,8 +646,8 @@ class Attr {
   list<Accessor> Accessors = [];
   // Specify targets for spellings.
   list<TargetSpecificSpelling> TargetSpecificSpellings = [];
-  // Set to true for attributes with arguments which require delayed parsing.
-  bit LateParsed = 0;
+  // Specifies the late parsing kind.
+  LateAttrParseKind LateParsed = LateAttrParseNever;
   // Set to false to prevent an attribute from being propagated from a template
   // to the instantiation.
   bit Clone = 1;
@@ -3173,7 +3216,7 @@ def DiagnoseIf : InheritableAttr {
               BoolArgument<"ArgDependent", 0, /*fake*/ 1>,
               DeclArgument<Named, "Parent", 0, /*fake*/ 1>];
   let InheritEvenIfAlreadyPresent = 1;
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let AdditionalMembers = [{
     bool isError() const { return diagnosticType == DT_Error; }
     bool isWarning() const { return diagnosticType == DT_Warning; }
@@ -3211,7 +3254,7 @@ def ObjCRequiresPropertyDefs : InheritableAttr {
 def Unused : InheritableAttr {
   let Spellings = [CXX11<"", "maybe_unused", 201603>, GCC<"unused">,
                    C23<"", "maybe_unused", 202106>];
-  let Subjects = SubjectList<[Var, ObjCIvar, Type, Enum, EnumConstant, Label,
+  let Subjects = SubjectList<[Var, Binding, ObjCIvar, Type, Enum, EnumConstant, Label,
                               Field, ObjCMethod, FunctionLike]>;
   let Documentation = [WarnMaybeUnusedDocs];
 }
@@ -3472,7 +3515,7 @@ def AssertCapability : InheritableAttr {
   let Spellings = [Clang<"assert_capability", 0>,
                    Clang<"assert_shared_capability", 0>];
   let Subjects = SubjectList<[Function]>;
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
@@ -3488,7 +3531,7 @@ def AcquireCapability : InheritableAttr {
                    GNU<"exclusive_lock_function">,
                    GNU<"shared_lock_function">];
   let Subjects = SubjectList<[Function]>;
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
@@ -3504,7 +3547,7 @@ def TryAcquireCapability : InheritableAttr {
                    Clang<"try_acquire_shared_capability", 0>];
   let Subjects = SubjectList<[Function],
                              ErrorDiag>;
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
@@ -3520,7 +3563,7 @@ def ReleaseCapability : InheritableAttr {
                    Clang<"release_generic_capability", 0>,
                    Clang<"unlock_function", 0>];
   let Subjects = SubjectList<[Function]>;
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
@@ -3539,7 +3582,7 @@ def RequiresCapability : InheritableAttr {
                    Clang<"requires_shared_capability", 0>,
                    Clang<"shared_locks_required", 0>];
   let Args = [VariadicExprArgument<"Args">];
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
@@ -3559,7 +3602,7 @@ def NoThreadSafetyAnalysis : InheritableAttr {
 def GuardedBy : InheritableAttr {
   let Spellings = [GNU<"guarded_by">];
   let Args = [ExprArgument<"Arg">];
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
@@ -3570,7 +3613,7 @@ def GuardedBy : InheritableAttr {
 def PtGuardedBy : InheritableAttr {
   let Spellings = [GNU<"pt_guarded_by">];
   let Args = [ExprArgument<"Arg">];
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
@@ -3581,7 +3624,7 @@ def PtGuardedBy : InheritableAttr {
 def AcquiredAfter : InheritableAttr {
   let Spellings = [GNU<"acquired_after">];
   let Args = [VariadicExprArgument<"Args">];
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
@@ -3592,7 +3635,7 @@ def AcquiredAfter : InheritableAttr {
 def AcquiredBefore : InheritableAttr {
   let Spellings = [GNU<"acquired_before">];
   let Args = [VariadicExprArgument<"Args">];
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
@@ -3603,7 +3646,7 @@ def AcquiredBefore : InheritableAttr {
 def AssertExclusiveLock : InheritableAttr {
   let Spellings = [GNU<"assert_exclusive_lock">];
   let Args = [VariadicExprArgument<"Args">];
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
@@ -3614,7 +3657,7 @@ def AssertExclusiveLock : InheritableAttr {
 def AssertSharedLock : InheritableAttr {
   let Spellings = [GNU<"assert_shared_lock">];
   let Args = [VariadicExprArgument<"Args">];
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
@@ -3627,7 +3670,7 @@ def AssertSharedLock : InheritableAttr {
 def ExclusiveTrylockFunction : InheritableAttr {
   let Spellings = [GNU<"exclusive_trylock_function">];
   let Args = [ExprArgument<"SuccessValue">, VariadicExprArgument<"Args">];
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
@@ -3640,7 +3683,7 @@ def ExclusiveTrylockFunction : InheritableAttr {
 def SharedTrylockFunction : InheritableAttr {
   let Spellings = [GNU<"shared_trylock_function">];
   let Args = [ExprArgument<"SuccessValue">, VariadicExprArgument<"Args">];
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
@@ -3651,7 +3694,7 @@ def SharedTrylockFunction : InheritableAttr {
 def LockReturned : InheritableAttr {
   let Spellings = [GNU<"lock_returned">];
   let Args = [ExprArgument<"Arg">];
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let Subjects = SubjectList<[Function]>;
@@ -3661,7 +3704,7 @@ def LockReturned : InheritableAttr {
 def LocksExcluded : InheritableAttr {
   let Spellings = [GNU<"locks_excluded">];
   let Args = [VariadicExprArgument<"Args">];
-  let LateParsed = 1;
+  let LateParsed = LateAttrParseStandard;
   let TemplateDependent = 1;
   let ParseArgumentsAsUnevaluated = 1;
   let InheritEvenIfAlreadyPresent = 1;
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index 14b08d4927ec..fcffadacc8e6 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -370,4 +370,7 @@ def warn_missing_symbol_graph_dir : Warning<
   "Missing symbol graph output directory, defaulting to working directory">,
   InGroup<ExtractAPIMisuse>;
 
+def err_ast_action_on_llvm_ir : Error<
+  "cannot apply AST actions to LLVM IR file '%0'">,
+  DefaultFatal;
 }
diff --git a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
index 91a40cd589b3..6896e0f5aa59 100644
--- a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
+++ b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
@@ -24,7 +24,7 @@ def err_no_matching_target : Error<"no matching target found for target variant
 def err_unsupported_vendor : Error<"vendor '%0' is not supported: '%1'">;
 def err_unsupported_environment : Error<"environment '%0' is not supported: '%1'">;
 def err_unsupported_os : Error<"os '%0' is not supported: '%1'">;
-def err_cannot_read_alias_list : Error<"could not read alias list '%0': %1">;
+def err_cannot_read_input_list : Error<"could not read %select{alias list|filelist}0 '%1': %2">;
 } // end of command line category.
 
 let CategoryName = "Verification" in {
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 38174cf3549f..44bc4e0e130d 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -478,6 +478,15 @@ def ext_decomp_decl_empty : ExtWarn<
   "ISO C++17 does not allow a decomposition group to be empty">,
   InGroup<DiagGroup<"empty-decomposition">>;
 
+// C++26 structured bindings
+def ext_decl_attrs_on_binding : ExtWarn<
+  "an attribute specifier sequence attached to a structured binding declaration "
+  "is a C++2c extension">, InGroup<CXX26>;
+def warn_cxx23_compat_decl_attrs_on_binding : Warning<
+  "an attribute specifier sequence attached to a structured binding declaration "
+  "is incompatible with C++ standards before C++2c">,
+  InGroup<CXXPre26Compat>, DefaultIgnore;
+
 /// Objective-C parser diagnostics
 def err_expected_minus_or_plus : Error<
   "method type specifier must start with '-' or '+'">;
@@ -1429,6 +1438,9 @@ def err_omp_decl_in_declare_simd_variant : Error<
 def err_omp_sink_and_source_iteration_not_allowd: Error<" '%0 %select{sink:|source:}1' must be with '%select{omp_cur_iteration - 1|omp_cur_iteration}1'">;
 def err_omp_unknown_map_type : Error<
   "incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'">;
+def err_omp_more_one_map_type : Error<"map type is already specified">;
+def note_previous_map_type_specified_here
+    : Note<"map type '%0' is previous specified here">;
 def err_omp_unknown_map_type_modifier : Error<
   "incorrect map type modifier, expected one of: 'always', 'close', 'mapper'"
   "%select{|, 'present'|, 'present', 'iterator'}0%select{|, 'ompx_hold'}1">;
@@ -1436,6 +1448,8 @@ def err_omp_map_type_missing : Error<
   "missing map type">;
 def err_omp_map_type_modifier_missing : Error<
   "missing map type modifier">;
+def err_omp_map_modifier_specification_list : Error<
+  "empty modifier-specification-list is not allowed">;
 def err_omp_declare_simd_inbranch_notinbranch : Error<
   "unexpected '%0' clause, '%1' is specified already">;
 def err_omp_expected_clause_argument
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 6732a1a98452..4b074b853bfe 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -307,6 +307,8 @@ def err_invalid_vector_long_long_decl_spec : Error <
   "POWER7 or later) to be enabled">;
 def err_invalid_vector_long_double_decl_spec : Error<
   "cannot use 'long double' with '__vector'">;
+def err_invalid_vector_complex_decl_spec : Error<
+  "cannot use '_Complex' with '__vector'">;
 def warn_vector_long_decl_spec_combination : Warning<
   "Use of 'long' with '__vector' is deprecated">, InGroup<Deprecated>;
 
@@ -9901,6 +9903,9 @@ def warn_format_invalid_annotation : Warning<
 def warn_format_P_no_precision : Warning<
   "using '%%P' format specifier without precision">,
   InGroup<Format>;
+def warn_format_P_with_objc_pointer : Warning<
+  "using '%%P' format specifier with an Objective-C pointer results in dumping runtime object structure, not object value">,
+  InGroup<Format>;
 def warn_printf_ignored_flag: Warning<
   "flag '%0' is ignored when flag '%1' is present">,
   InGroup<Format>;
@@ -9950,6 +9955,8 @@ def warn_ret_stack_addr_ref : Warning<
 def warn_ret_local_temp_addr_ref : Warning<
   "returning %select{address of|reference to}0 local temporary object">,
   InGroup<ReturnStackAddress>;
+def err_ret_local_temp_ref : Error<
+  "returning reference to local temporary object">;
 def warn_ret_addr_label : Warning<
   "returning address of label, which is local">,
   InGroup<ReturnStackAddress>;
@@ -10328,9 +10335,13 @@ def err_shufflevector_nonconstant_argument : Error<
 def err_shufflevector_argument_too_large : Error<
   "index for __builtin_shufflevector must be less than the total number "
   "of vector elements">;
+def err_shufflevector_minus_one_is_undefined_behavior_constexpr : Error<
+  "index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position %0 not permitted in a constexpr context.">;
 
 def err_convertvector_non_vector : Error<
   "first argument to __builtin_convertvector must be a vector">;
+def err_convertvector_constexpr_unsupported_vector_cast : Error<
+  "unsupported vector cast from %0 to %1 in a constant expression.">;
 def err_builtin_non_vector_type : Error<
   "%0 argument to %1 must be of vector type">;
 def err_convertvector_incompatible_vector : Error<
@@ -11161,7 +11172,7 @@ def err_omp_declare_mapper_redefinition : Error<
   "redefinition of user-defined mapper for type %0 with name %1">;
 def err_omp_invalid_mapper: Error<
   "cannot find a valid user-defined mapper for type %0 with name %1">;
-def err_omp_array_section_use : Error<"OpenMP array section is not allowed here">;
+def err_array_section_use : Error<"%select{OpenACC sub-array|OpenMP array section}0 is not allowed here">;
 def err_omp_array_shaping_use : Error<"OpenMP array shaping operation is not allowed here">;
 def err_omp_iterator_use : Error<"OpenMP iterator is not allowed here">;
 def err_omp_typecheck_section_value : Error<
@@ -12296,4 +12307,7 @@ def err_acc_num_gangs_num_args
             "OpenACC 'num_gangs' "
             "%select{|clause: '%1' directive expects maximum of %2, %3 were "
             "provided}0">;
+def err_acc_not_a_var_ref
+    : Error<"OpenACC variable is not a valid variable name, sub-array, array "
+            "element, or composite variable member">;
 } // end of sema component.
diff --git a/clang/include/clang/Basic/FileManager.h b/clang/include/clang/Basic/FileManager.h
index 2245fd78bfc9..8b4206e52cd4 100644
--- a/clang/include/clang/Basic/FileManager.h
+++ b/clang/include/clang/Basic/FileManager.h
@@ -114,6 +114,12 @@ class FileManager : public RefCountedBase<FileManager> {
   ///
   unsigned NextFileUID;
 
+  /// Statistics gathered during the lifetime of the FileManager.
+  unsigned NumDirLookups = 0;
+  unsigned NumFileLookups = 0;
+  unsigned NumDirCacheMisses = 0;
+  unsigned NumFileCacheMisses = 0;
+
   // Caching.
   std::unique_ptr<FileSystemStatCache> StatCache;
 
@@ -341,6 +347,10 @@ private:
 
 public:
   void PrintStats() const;
+
+  /// Import statistics from a child FileManager and add them to this current
+  /// FileManager.
+  void AddStats(const FileManager &Other);
 };
 
 } // end namespace clang
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 8ef6700ecdc7..55c81eab1ec1 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -164,6 +164,7 @@ LANGOPT(ExperimentalLibrary, 1, 0, "enable unstable and experimental library fea
 LANGOPT(PointerAuthIntrinsics, 1, 0, "pointer authentication intrinsics")
 
 LANGOPT(DoubleSquareBracketAttributes, 1, 0, "'[[]]' attributes extension for all language standard modes")
+LANGOPT(ExperimentalLateParseAttributes, 1, 0, "experimental late parsing of attributes")
 
 COMPATIBLE_LANGOPT(RecoveryAST, 1, 1, "Preserve expressions in AST when encountering errors")
 COMPATIBLE_LANGOPT(RecoveryASTType, 1, 1, "Preserve the type in recovery expressions")
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index ae4715921d16..e2a2aa71b880 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -57,6 +57,13 @@ enum class ShaderStage {
   Invalid,
 };
 
+enum class PointerAuthenticationMode : unsigned {
+  None,
+  Strip,
+  SignAndStrip,
+  SignAndAuth
+};
+
 /// Bitfields of LangOptions, split out from LangOptions in order to ensure that
 /// this large collection of bitfields is a trivial class type.
 class LangOptionsBase {
diff --git a/clang/include/clang/Basic/OpenACCClauses.def b/clang/include/clang/Basic/OpenACCClauses.def
index dd5792e7ca8c..6c3c2db66ef0 100644
--- a/clang/include/clang/Basic/OpenACCClauses.def
+++ b/clang/include/clang/Basic/OpenACCClauses.def
@@ -20,6 +20,7 @@ VISIT_CLAUSE(If)
 VISIT_CLAUSE(Self)
 VISIT_CLAUSE(NumGangs)
 VISIT_CLAUSE(NumWorkers)
+VISIT_CLAUSE(Private)
 VISIT_CLAUSE(VectorLength)
 
 #undef VISIT_CLAUSE
diff --git a/clang/include/clang/Basic/PointerAuthOptions.h b/clang/include/clang/Basic/PointerAuthOptions.h
new file mode 100644
index 000000000000..e5cdcc31ebfb
--- /dev/null
+++ b/clang/include/clang/Basic/PointerAuthOptions.h
@@ -0,0 +1,23 @@
+//===--- PointerAuthOptions.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines options for configuring pointer-auth technologies
+//  like ARMv8.3.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_BASIC_POINTERAUTHOPTIONS_H
+#define LLVM_CLANG_BASIC_POINTERAUTHOPTIONS_H
+
+namespace clang {
+
+constexpr unsigned PointerAuthKeyNone = -1;
+
+} // end namespace clang
+
+#endif
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
index b4e3ae573b95..305f19daa4a9 100644
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -71,7 +71,7 @@ def OffsetOfExpr : StmtNode<Expr>;
 def UnaryExprOrTypeTraitExpr : StmtNode<Expr>;
 def ArraySubscriptExpr : StmtNode<Expr>;
 def MatrixSubscriptExpr : StmtNode<Expr>;
-def OMPArraySectionExpr : StmtNode<Expr>;
+def ArraySectionExpr : StmtNode<Expr>;
 def OMPIteratorExpr : StmtNode<Expr>;
 def CallExpr : StmtNode<Expr>;
 def MemberExpr : StmtNode<Expr>;
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index e1ef7454f016..3ced2e7397a7 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -40,6 +40,7 @@
 #include <cassert>
 #include <optional>
 #include <string>
+#include <utility>
 #include <vector>
 
 namespace llvm {
@@ -1792,6 +1793,15 @@ public:
   /// Whether to support HIP image/texture API's.
   virtual bool hasHIPImageSupport() const { return true; }
 
+  /// The first value in the pair is the minimum offset between two objects to
+  /// avoid false sharing (destructive interference). The second value in the
+  /// pair is maximum size of contiguous memory to promote true sharing
+  /// (constructive interference). Neither of these values are considered part
+  /// of the ABI and can be changed by targets at any time.
+  virtual std::pair<unsigned, unsigned> hardwareInterferenceSizes() const {
+    return std::make_pair(64, 64);
+  }
+
 protected:
   /// Copy type and layout related info.
   void copyAuxTarget(const TargetInfo *Aux);
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 6d655c39360d..6390ba3f9fe5 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -275,7 +275,7 @@ def OP_VCVT_BF16_F32_HI_A32
                            (call "vget_low", $p0))>;
 
 def OP_CVT_F32_BF16
-    : Op<(bitcast "R", (op "<<", (bitcast "int32_t", $p0),
+    : Op<(bitcast "R", (op "<<", (cast "int32_t", (bitcast "int16_t", $p0)),
                                  (literal "int32_t", "16")))>;
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 6cc249837d3f..15340ebb62b3 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1961,19 +1961,20 @@ def SVPSEL_D : SInst<"svpsel_lane_b64", "PPPm", "Pl", MergeNone, "", [IsStreamin
 
 // Standalone sve2.1 builtins
 let TargetGuard = "sve2p1" in {
-def SVORQV   : SInst<"svorqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orqv", [IsReductionQV]>;
-def SVEORQV  : SInst<"sveorqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorqv", [IsReductionQV]>;
-def SVADDQV  : SInst<"svaddqv[_{d}]", "{Pd", "hfdcsilUcUsUiUl", MergeNone, "aarch64_sve_addqv", [IsReductionQV]>;
-def SVANDQV  : SInst<"svandqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andqv", [IsReductionQV]>;
-def SVSMAXQV : SInst<"svmaxqv[_{d}]", "{Pd", "csil", MergeNone, "aarch64_sve_smaxqv", [IsReductionQV]>;
-def SVUMAXQV : SInst<"svmaxqv[_{d}]", "{Pd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxqv", [IsReductionQV]>;
-def SVSMINQV : SInst<"svminqv[_{d}]", "{Pd", "csil", MergeNone, "aarch64_sve_sminqv", [IsReductionQV]>;
-def SVUMINQV : SInst<"svminqv[_{d}]", "{Pd", "UcUsUiUl", MergeNone, "aarch64_sve_uminqv", [IsReductionQV]>;
-
-def SVFMAXNMQV: SInst<"svmaxnmqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fmaxnmqv", [IsReductionQV]>;
-def SVFMINNMQV: SInst<"svminnmqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fminnmqv", [IsReductionQV]>;
-def SVFMAXQV: SInst<"svmaxqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fmaxqv", [IsReductionQV]>;
-def SVFMINQV: SInst<"svminqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fminqv", [IsReductionQV]>;
+def SVORQV   : SInst<"svorqv[_{d}]",  "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orqv",   [IsReductionQV]>;
+def SVEORQV  : SInst<"sveorqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorqv",  [IsReductionQV]>;
+def SVADDQV  : SInst<"svaddqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_addqv",  [IsReductionQV]>;
+def SVANDQV  : SInst<"svandqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andqv",  [IsReductionQV]>;
+def SVSMAXQV : SInst<"svmaxqv[_{d}]", "{Pd", "csil",         MergeNone, "aarch64_sve_smaxqv", [IsReductionQV]>;
+def SVUMAXQV : SInst<"svmaxqv[_{d}]", "{Pd", "UcUsUiUl",     MergeNone, "aarch64_sve_umaxqv", [IsReductionQV]>;
+def SVSMINQV : SInst<"svminqv[_{d}]", "{Pd", "csil",         MergeNone, "aarch64_sve_sminqv", [IsReductionQV]>;
+def SVUMINQV : SInst<"svminqv[_{d}]", "{Pd", "UcUsUiUl",     MergeNone, "aarch64_sve_uminqv", [IsReductionQV]>;
+
+def SVFADDQV   : SInst<"svaddqv[_{d}]",   "{Pd", "hfd", MergeNone, "aarch64_sve_faddqv",   [IsReductionQV]>;
+def SVFMAXNMQV : SInst<"svmaxnmqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fmaxnmqv", [IsReductionQV]>;
+def SVFMINNMQV : SInst<"svminnmqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fminnmqv", [IsReductionQV]>;
+def SVFMAXQV   : SInst<"svmaxqv[_{d}]",   "{Pd", "hfd", MergeNone, "aarch64_sve_fmaxqv",   [IsReductionQV]>;
+def SVFMINQV   : SInst<"svminqv[_{d}]",   "{Pd", "hfd", MergeNone, "aarch64_sve_fminqv",   [IsReductionQV]>;
 }
 
 let TargetGuard = "sve2p1|sme2" in {
diff --git a/clang/include/clang/CIR/CMakeLists.txt b/clang/include/clang/CIR/CMakeLists.txt
index e69de29bb2d1..f8d6f407a03d 100644
--- a/clang/include/clang/CIR/CMakeLists.txt
+++ b/clang/include/clang/CIR/CMakeLists.txt
@@ -0,0 +1,6 @@
+set(MLIR_INCLUDE_DIR ${LLVM_MAIN_SRC_DIR}/../mlir/include ) # --includedir
+set(MLIR_TABLEGEN_OUTPUT_DIR ${CMAKE_BINARY_DIR}/tools/mlir/include)
+include_directories(${MLIR_INCLUDE_DIR})
+include_directories(${MLIR_TABLEGEN_OUTPUT_DIR})
+
+add_subdirectory(Dialect)
diff --git a/clang/include/clang/CIR/Dialect/CMakeLists.txt b/clang/include/clang/CIR/Dialect/CMakeLists.txt
new file mode 100644
index 000000000000..f33061b2d87c
--- /dev/null
+++ b/clang/include/clang/CIR/Dialect/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(IR)
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.h b/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
new file mode 100644
index 000000000000..d53e5d1663d6
--- /dev/null
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
@@ -0,0 +1,16 @@
+//===- CIRDialect.h - CIR dialect -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the CIR dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_CIR_DIALECT_IR_CIRDIALECT_H
+#define LLVM_CLANG_CIR_DIALECT_IR_CIRDIALECT_H
+
+#endif // LLVM_CLANG_CIR_DIALECT_IR_CIRDIALECT_H
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
new file mode 100644
index 000000000000..69d6e9774942
--- /dev/null
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
@@ -0,0 +1,44 @@
+//===- CIRDialect.td - CIR dialect -------------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the CIR dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_CIR_DIALECT_IR_CIRDIALECT
+#define LLVM_CLANG_CIR_DIALECT_IR_CIRDIALECT
+
+include "mlir/IR/OpBase.td"
+
+def CIR_Dialect : Dialect {
+  let name = "cir";
+
+  // A short one-line summary of our dialect.
+  let summary = "A high-level dialect for analyzing and optimizing Clang "
+                "supported languages";
+
+  let cppNamespace = "::mlir::cir";
+
+  let useDefaultAttributePrinterParser = 0;
+  let useDefaultTypePrinterParser = 0;
+
+  let extraClassDeclaration = [{
+    void registerAttributes();
+    void registerTypes();
+
+    Type parseType(DialectAsmParser &parser) const override;
+    void printType(Type type, DialectAsmPrinter &printer) const override;
+
+    Attribute parseAttribute(DialectAsmParser &parser,
+                             Type type) const override;
+
+    void printAttribute(Attribute attr, DialectAsmPrinter &os) const override;
+  }];
+}
+
+#endif // LLVM_CLANG_CIR_DIALECT_IR_CIRDIALECT
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
new file mode 100644
index 000000000000..7311c8db783e
--- /dev/null
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -0,0 +1,19 @@
+//===-- CIROps.td - CIR dialect definition -----------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Definition of the CIR dialect
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_CIR_DIALECT_IR_CIROPS
+#define LLVM_CLANG_CIR_DIALECT_IR_CIROPS
+
+include "clang/CIR/Dialect/IR/CIRDialect.td"
+
+#endif // LLVM_CLANG_CIR_DIALECT_IR_CIROPS
diff --git a/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
new file mode 100644
index 000000000000..28ae30dab8df
--- /dev/null
+++ b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
@@ -0,0 +1,16 @@
+# This replicates part of the add_mlir_dialect cmake function from MLIR that
+# cannot be used here. This happens because it expects to be run inside MLIR
+# directory which is not the case for CIR (and also FIR, both have similar
+# workarounds).
+
+# Equivalent to add_mlir_dialect(CIROps cir)
+set(LLVM_TARGET_DEFINITIONS CIROps.td)
+mlir_tablegen(CIROps.h.inc -gen-op-decls)
+mlir_tablegen(CIROps.cpp.inc -gen-op-defs)
+mlir_tablegen(CIROpsTypes.h.inc -gen-typedef-decls)
+mlir_tablegen(CIROpsTypes.cpp.inc -gen-typedef-defs)
+mlir_tablegen(CIROpsDialect.h.inc -gen-dialect-decls)
+mlir_tablegen(CIROpsDialect.cpp.inc -gen-dialect-defs)
+add_public_tablegen_target(MLIRCIROpsIncGen)
+add_dependencies(mlir-headers MLIRCIROpsIncGen)
+
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 91f48b6da6d5..1302519c1f9e 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1608,6 +1608,13 @@ defm double_square_bracket_attributes : BoolFOption<"double-square-bracket-attri
   LangOpts<"DoubleSquareBracketAttributes">, DefaultTrue, PosFlag<SetTrue>,
  NegFlag<SetFalse>>;
 
+defm experimental_late_parse_attributes : BoolFOption<"experimental-late-parse-attributes",
+  LangOpts<"ExperimentalLateParseAttributes">, DefaultFalse,
+  PosFlag<SetTrue, [], [ClangOption], "Enable">,
+  NegFlag<SetFalse, [], [ClangOption], "Disable">,
+  BothFlags<[], [ClangOption, CC1Option],
+          " experimental late parsing of attributes">>;
+
 defm autolink : BoolFOption<"autolink",
   CodeGenOpts<"Autolink">, DefaultTrue,
   NegFlag<SetFalse, [], [ClangOption, CC1Option],
@@ -2615,6 +2622,11 @@ defm protect_parens : BoolFOption<"protect-parens",
           "floating-point expressions are evaluated">,
   NegFlag<SetFalse>>;
 
+defm daz_ftz : SimpleMFlag<"daz-ftz",
+  "Globally set", "Do not globally set",
+  " the denormals-are-zero (DAZ) and flush-to-zero (FTZ) bits in the "
+  "floating-point control register on program startup">;
+
 def ffor_scope : Flag<["-"], "ffor-scope">, Group<f_Group>;
 def fno_for_scope : Flag<["-"], "fno-for-scope">, Group<f_Group>;
 
@@ -2881,6 +2893,17 @@ def flax_vector_conversions : Flag<["-"], "flax-vector-conversions">, Group<f_Gr
 def flimited_precision_EQ : Joined<["-"], "flimited-precision=">, Group<f_Group>;
 def fapple_link_rtlib : Flag<["-"], "fapple-link-rtlib">, Group<f_Group>,
   HelpText<"Force linking the clang builtins runtime library">;
+
+/// ClangIR-specific options - BEGIN
+defm clangir : BoolFOption<"clangir",
+  FrontendOpts<"UseClangIRPipeline">, DefaultFalse,
+  PosFlag<SetTrue, [], [ClangOption, CC1Option], "Use the ClangIR pipeline to compile">,
+  NegFlag<SetFalse, [], [ClangOption, CC1Option], "Use the AST -> LLVM pipeline to compile">,
+  BothFlags<[], [ClangOption, CC1Option], "">>;
+def emit_cir : Flag<["-"], "emit-cir">, Visibility<[CC1Option]>,
+  Group<Action_Group>, HelpText<"Build ASTs and then lower to ClangIR">;
+/// ClangIR-specific options - END
+
 def flto_EQ : Joined<["-"], "flto=">,
   Visibility<[ClangOption, CLOption, CC1Option, FC1Option, FlangOption]>,
   Group<f_Group>,
@@ -4880,6 +4903,8 @@ def msimd128 : Flag<["-"], "msimd128">, Group<m_wasm_Features_Group>;
 def mno_simd128 : Flag<["-"], "mno-simd128">, Group<m_wasm_Features_Group>;
 def mrelaxed_simd : Flag<["-"], "mrelaxed-simd">, Group<m_wasm_Features_Group>;
 def mno_relaxed_simd : Flag<["-"], "mno-relaxed-simd">, Group<m_wasm_Features_Group>;
+def mhalf_precision : Flag<["-"], "mhalf-precision">, Group<m_wasm_Features_Group>;
+def mno_half_precision : Flag<["-"], "mno-half-precision">, Group<m_wasm_Features_Group>;
 def mnontrapping_fptoint : Flag<["-"], "mnontrapping-fptoint">, Group<m_wasm_Features_Group>;
 def mno_nontrapping_fptoint : Flag<["-"], "mno-nontrapping-fptoint">, Group<m_wasm_Features_Group>;
 def msign_ext : Flag<["-"], "msign-ext">, Group<m_wasm_Features_Group>;
@@ -6586,12 +6611,6 @@ def J : JoinedOrSeparate<["-"], "J">,
   Group<gfortran_Group>,
   Alias<module_dir>;
 
-let Visibility = [FlangOption] in {
-def no_fortran_main : Flag<["-"], "fno-fortran-main">,
-  Visibility<[FlangOption]>, Group<f_Group>,
-  HelpText<"Do not include Fortran_main.a (provided by Flang) when linking">;
-} // let Visibility = [ FlangOption ]
-
 //===----------------------------------------------------------------------===//
 // FC1 Options
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Frontend/ASTUnit.h b/clang/include/clang/Frontend/ASTUnit.h
index a2c1b25dd224..080844893c13 100644
--- a/clang/include/clang/Frontend/ASTUnit.h
+++ b/clang/include/clang/Frontend/ASTUnit.h
@@ -241,7 +241,7 @@ private:
 
   /// A list of the serialization ID numbers for each of the top-level
   /// declarations parsed within the precompiled preamble.
-  std::vector<serialization::DeclID> TopLevelDeclsInPreamble;
+  std::vector<LocalDeclID> TopLevelDeclsInPreamble;
 
   /// Whether we should be caching code-completion results.
   bool ShouldCacheCodeCompletionResults : 1;
diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h
index a738c1f37576..bd4981ca0ac0 100644
--- a/clang/include/clang/Frontend/FrontendOptions.h
+++ b/clang/include/clang/Frontend/FrontendOptions.h
@@ -65,6 +65,9 @@ enum ActionKind {
   /// Translate input source into HTML.
   EmitHTML,
 
+  /// Emit a .cir file
+  EmitCIR,
+
   /// Emit a .ll file.
   EmitLLVM,
 
@@ -408,6 +411,10 @@ public:
   LLVM_PREFERRED_TYPE(bool)
   unsigned GenReducedBMI : 1;
 
+  /// Use Clang IR pipeline to emit code
+  LLVM_PREFERRED_TYPE(bool)
+  unsigned UseClangIRPipeline : 1;
+
   CodeCompleteOptions CodeCompleteOpts;
 
   /// Specifies the output format of the AST.
@@ -590,7 +597,7 @@ public:
         EmitSymbolGraph(false), EmitExtensionSymbolGraphs(false),
         EmitSymbolGraphSymbolLabelsForTesting(false),
         EmitPrettySymbolGraphs(false), GenReducedBMI(false),
-        TimeTraceGranularity(500) {}
+        UseClangIRPipeline(false), TimeTraceGranularity(500) {}
 
   /// getInputKindForExtension - Return the appropriate input kind for a file
   /// extension. For example, "c" would return Language::C.
diff --git a/clang/include/clang/Frontend/MultiplexConsumer.h b/clang/include/clang/Frontend/MultiplexConsumer.h
index 7f8d2858b386..f29c8e92fded 100644
--- a/clang/include/clang/Frontend/MultiplexConsumer.h
+++ b/clang/include/clang/Frontend/MultiplexConsumer.h
@@ -35,7 +35,7 @@ public:
   void IdentifierRead(serialization::IdentID ID, IdentifierInfo *II) override;
   void MacroRead(serialization::MacroID ID, MacroInfo *MI) override;
   void TypeRead(serialization::TypeIdx Idx, QualType T) override;
-  void DeclRead(serialization::DeclID ID, const Decl *D) override;
+  void DeclRead(GlobalDeclID ID, const Decl *D) override;
   void SelectorRead(serialization::SelectorID iD, Selector Sel) override;
   void MacroDefinitionRead(serialization::PreprocessedEntityID,
                            MacroDefinitionRecord *MD) override;
diff --git a/clang/include/clang/InstallAPI/FileList.h b/clang/include/clang/InstallAPI/FileList.h
index 460af003b6a0..913734b8dc7c 100644
--- a/clang/include/clang/InstallAPI/FileList.h
+++ b/clang/include/clang/InstallAPI/FileList.h
@@ -29,9 +29,10 @@ public:
   ///
   /// \param InputBuffer JSON input data.
   /// \param Destination Container to load headers into.
+  /// \param FM Optional File Manager to validate input files exist.
   static llvm::Error
   loadHeaders(std::unique_ptr<llvm::MemoryBuffer> InputBuffer,
-              HeaderSeq &Destination);
+              HeaderSeq &Destination, clang::FileManager *FM = nullptr);
 
   FileListReader() = delete;
 };
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index fb117bf04087..daefd4f28f01 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -1398,12 +1398,21 @@ private:
   // A list of late-parsed attributes.  Used by ParseGNUAttributes.
   class LateParsedAttrList: public SmallVector<LateParsedAttribute *, 2> {
   public:
-    LateParsedAttrList(bool PSoon = false) : ParseSoon(PSoon) { }
+    LateParsedAttrList(bool PSoon = false,
+                       bool LateAttrParseExperimentalExtOnly = false)
+        : ParseSoon(PSoon),
+          LateAttrParseExperimentalExtOnly(LateAttrParseExperimentalExtOnly) {}
 
     bool parseSoon() { return ParseSoon; }
+    /// returns true iff the attribute to be parsed should only be late parsed
+    /// if it is annotated with `LateAttrParseExperimentalExt`
+    bool lateAttrParseExperimentalExtOnly() {
+      return LateAttrParseExperimentalExtOnly;
+    }
 
   private:
-    bool ParseSoon;  // Are we planning to parse these shortly after creation?
+    bool ParseSoon; // Are we planning to parse these shortly after creation?
+    bool LateAttrParseExperimentalExtOnly;
   };
 
   /// Contains the lexed tokens of a member function definition
@@ -3645,11 +3654,12 @@ private:
   ExprResult ParseOpenACCIDExpression();
   /// Parses the variable list for the `cache` construct.
   void ParseOpenACCCacheVarList();
+
+  using OpenACCVarParseResult = std::pair<ExprResult, OpenACCParseCanContinue>;
   /// Parses a single variable in a variable list for OpenACC.
-  bool ParseOpenACCVar();
-  /// Parses the variable list for the variety of clauses that take a var-list,
-  /// including the optional Special Token listed for some,based on clause type.
-  bool ParseOpenACCClauseVarList(OpenACCClauseKind Kind);
+  OpenACCVarParseResult ParseOpenACCVar();
+  /// Parses the variable list for the variety of places that take a var-list.
+  llvm::SmallVector<Expr *> ParseOpenACCVarList();
   /// Parses any parameters for an OpenACC Clause, including required/optional
   /// parens.
   OpenACCClauseParseResult
diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h
index c9eecdafe62c..23bc780e0497 100644
--- a/clang/include/clang/Sema/DeclSpec.h
+++ b/clang/include/clang/Sema/DeclSpec.h
@@ -36,6 +36,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <optional>
 
 namespace clang {
   class ASTContext;
@@ -1790,6 +1791,7 @@ public:
   struct Binding {
     IdentifierInfo *Name;
     SourceLocation NameLoc;
+    std::optional<ParsedAttributes> Attrs;
   };
 
 private:
@@ -1809,15 +1811,15 @@ public:
       : Bindings(nullptr), NumBindings(0), DeleteBindings(false) {}
   DecompositionDeclarator(const DecompositionDeclarator &G) = delete;
   DecompositionDeclarator &operator=(const DecompositionDeclarator &G) = delete;
-  ~DecompositionDeclarator() {
-    if (DeleteBindings)
-      delete[] Bindings;
-  }
+  ~DecompositionDeclarator() { clear(); }
 
   void clear() {
     LSquareLoc = RSquareLoc = SourceLocation();
     if (DeleteBindings)
       delete[] Bindings;
+    else
+      llvm::for_each(llvm::MutableArrayRef(Bindings, NumBindings),
+                     [](Binding &B) { B.Attrs.reset(); });
     Bindings = nullptr;
     NumBindings = 0;
     DeleteBindings = false;
@@ -2339,10 +2341,10 @@ public:
   }
 
   /// Set the decomposition bindings for this declarator.
-  void
-  setDecompositionBindings(SourceLocation LSquareLoc,
-                           ArrayRef<DecompositionDeclarator::Binding> Bindings,
-                           SourceLocation RSquareLoc);
+  void setDecompositionBindings(
+      SourceLocation LSquareLoc,
+      MutableArrayRef<DecompositionDeclarator::Binding> Bindings,
+      SourceLocation RSquareLoc);
 
   /// AddTypeInfo - Add a chunk to this declarator. Also extend the range to
   /// EndLoc, which should be the last token of the chunk.
diff --git a/clang/include/clang/Sema/Lookup.h b/clang/include/clang/Sema/Lookup.h
index 0db5b847038f..b0a08a05ac6a 100644
--- a/clang/include/clang/Sema/Lookup.h
+++ b/clang/include/clang/Sema/Lookup.h
@@ -499,7 +499,9 @@ public:
   /// Note that while no result was found in the current instantiation,
   /// there were dependent base classes that could not be searched.
   void setNotFoundInCurrentInstantiation() {
-    assert(ResultKind == NotFound && Decls.empty());
+    assert((ResultKind == NotFound ||
+            ResultKind == NotFoundInCurrentInstantiation) &&
+           Decls.empty());
     ResultKind = NotFoundInCurrentInstantiation;
   }
 
diff --git a/clang/include/clang/Sema/MultiplexExternalSemaSource.h b/clang/include/clang/Sema/MultiplexExternalSemaSource.h
index 993c9b1daa30..238fb398b7d1 100644
--- a/clang/include/clang/Sema/MultiplexExternalSemaSource.h
+++ b/clang/include/clang/Sema/MultiplexExternalSemaSource.h
@@ -65,7 +65,7 @@ public:
 
   /// Resolve a declaration ID into a declaration, potentially
   /// building a new declaration.
-  Decl *GetExternalDecl(Decl::DeclID ID) override;
+  Decl *GetExternalDecl(GlobalDeclID ID) override;
 
   /// Complete the redeclaration chain if it's been extended since the
   /// previous generation of the AST source.
diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h
index 25a5fa05b21c..8368d9ce6146 100644
--- a/clang/include/clang/Sema/ParsedAttr.h
+++ b/clang/include/clang/Sema/ParsedAttr.h
@@ -948,6 +948,7 @@ public:
   ParsedAttributes(AttributeFactory &factory) : pool(factory) {}
   ParsedAttributes(const ParsedAttributes &) = delete;
   ParsedAttributes &operator=(const ParsedAttributes &) = delete;
+  ParsedAttributes(ParsedAttributes &&G) = default;
 
   AttributePool &getPool() const { return pool; }
 
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 1ca523ec88c2..a80ac6dbc761 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -4097,12 +4097,12 @@ public:
                                    SmallVectorImpl<QualType> &Exceptions,
                                    FunctionProtoType::ExceptionSpecInfo &ESI);
 
-  /// Add an exception-specification to the given member function
-  /// (or member function template). The exception-specification was parsed
-  /// after the method itself was declared.
+  /// Add an exception-specification to the given member or friend function
+  /// (or function template). The exception-specification was parsed
+  /// after the function itself was declared.
   void actOnDelayedExceptionSpecification(
-      Decl *Method, ExceptionSpecificationType EST,
-      SourceRange SpecificationRange, ArrayRef<ParsedType> DynamicExceptions,
+      Decl *D, ExceptionSpecificationType EST, SourceRange SpecificationRange,
+      ArrayRef<ParsedType> DynamicExceptions,
       ArrayRef<SourceRange> DynamicExceptionRanges, Expr *NoexceptExpr);
 
   class InheritedConstructorInfo;
@@ -6978,12 +6978,6 @@ public:
                                    SourceLocation TemplateKWLoc,
                                    UnqualifiedId &Member, Decl *ObjCImpDecl);
 
-  MemberExpr *BuildMemberExpr(
-      Expr *Base, bool IsArrow, SourceLocation OpLoc, const CXXScopeSpec *SS,
-      SourceLocation TemplateKWLoc, ValueDecl *Member, DeclAccessPair FoundDecl,
-      bool HadMultipleCandidates, const DeclarationNameInfo &MemberNameInfo,
-      QualType Ty, ExprValueKind VK, ExprObjectKind OK,
-      const TemplateArgumentListInfo *TemplateArgs = nullptr);
   MemberExpr *
   BuildMemberExpr(Expr *Base, bool IsArrow, SourceLocation OpLoc,
                   NestedNameSpecifierLoc NNS, SourceLocation TemplateKWLoc,
@@ -7472,7 +7466,7 @@ public:
   bool LookupQualifiedName(LookupResult &R, DeclContext *LookupCtx,
                            CXXScopeSpec &SS);
   bool LookupParsedName(LookupResult &R, Scope *S, CXXScopeSpec *SS,
-                        bool AllowBuiltinCreation = false,
+                        QualType ObjectType, bool AllowBuiltinCreation = false,
                         bool EnteringContext = false);
   ObjCProtocolDecl *LookupProtocol(
       IdentifierInfo *II, SourceLocation IdLoc,
@@ -8881,11 +8875,13 @@ public:
     /// functions (but no function templates).
     FoundFunctions,
   };
-  bool LookupTemplateName(
-      LookupResult &R, Scope *S, CXXScopeSpec &SS, QualType ObjectType,
-      bool EnteringContext, bool &MemberOfUnknownSpecialization,
-      RequiredTemplateKind RequiredTemplate = SourceLocation(),
-      AssumedTemplateKind *ATK = nullptr, bool AllowTypoCorrection = true);
+
+  bool
+  LookupTemplateName(LookupResult &R, Scope *S, CXXScopeSpec &SS,
+                     QualType ObjectType, bool EnteringContext,
+                     RequiredTemplateKind RequiredTemplate = SourceLocation(),
+                     AssumedTemplateKind *ATK = nullptr,
+                     bool AllowTypoCorrection = true);
 
   TemplateNameKind isTemplateName(Scope *S, CXXScopeSpec &SS,
                                   bool hasTemplateKeyword,
@@ -9249,7 +9245,8 @@ public:
   void NoteTemplateParameterLocation(const NamedDecl &Decl);
 
   ExprResult BuildExpressionFromDeclTemplateArgument(
-      const TemplateArgument &Arg, QualType ParamType, SourceLocation Loc);
+      const TemplateArgument &Arg, QualType ParamType, SourceLocation Loc,
+      NamedDecl *TemplateParam = nullptr);
   ExprResult
   BuildExpressionFromNonTypeTemplateArgument(const TemplateArgument &Arg,
                                              SourceLocation Loc);
@@ -9572,9 +9569,10 @@ public:
 
   bool isSameOrCompatibleFunctionType(QualType Param, QualType Arg);
 
-  TemplateArgumentLoc getTrivialTemplateArgumentLoc(const TemplateArgument &Arg,
-                                                    QualType NTTPType,
-                                                    SourceLocation Loc);
+  TemplateArgumentLoc
+  getTrivialTemplateArgumentLoc(const TemplateArgument &Arg, QualType NTTPType,
+                                SourceLocation Loc,
+                                NamedDecl *TemplateParam = nullptr);
 
   /// Get a template argument mapping the given template parameter to itself,
   /// e.g. for X in \c template<int X>, this would return an expression template
diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h
index ea28617f79b8..edb0cbb7c5d5 100644
--- a/clang/include/clang/Sema/SemaOpenACC.h
+++ b/clang/include/clang/Sema/SemaOpenACC.h
@@ -48,8 +48,12 @@ public:
       SmallVector<Expr *> IntExprs;
     };
 
+    struct VarListDetails {
+      SmallVector<Expr *> VarList;
+    };
+
     std::variant<std::monostate, DefaultDetails, ConditionDetails,
-                 IntExprDetails>
+                 IntExprDetails, VarListDetails>
         Details = std::monostate{};
 
   public:
@@ -112,6 +116,16 @@ public:
       return const_cast<OpenACCParsedClause *>(this)->getIntExprs();
     }
 
+    ArrayRef<Expr *> getVarList() {
+      assert(ClauseKind == OpenACCClauseKind::Private &&
+             "Parsed clause kind does not have a var-list");
+      return std::get<VarListDetails>(Details).VarList;
+    }
+
+    ArrayRef<Expr *> getVarList() const {
+      return const_cast<OpenACCParsedClause *>(this)->getVarList();
+    }
+
     void setLParenLoc(SourceLocation EndLoc) { LParenLoc = EndLoc; }
     void setEndLoc(SourceLocation EndLoc) { ClauseRange.setEnd(EndLoc); }
 
@@ -147,7 +161,19 @@ public:
               ClauseKind == OpenACCClauseKind::NumWorkers ||
               ClauseKind == OpenACCClauseKind::VectorLength) &&
              "Parsed clause kind does not have a int exprs");
-      Details = IntExprDetails{IntExprs};
+      Details = IntExprDetails{std::move(IntExprs)};
+    }
+
+    void setVarListDetails(ArrayRef<Expr *> VarList) {
+      assert(ClauseKind == OpenACCClauseKind::Private &&
+             "Parsed clause kind does not have a var-list");
+      Details = VarListDetails{{VarList.begin(), VarList.end()}};
+    }
+
+    void setVarListDetails(llvm::SmallVector<Expr *> &&VarList) {
+      assert(ClauseKind == OpenACCClauseKind::Private &&
+             "Parsed clause kind does not have a var-list");
+      Details = VarListDetails{std::move(VarList)};
     }
   };
 
@@ -193,6 +219,16 @@ public:
   /// conversions and diagnostics to 'int'.
   ExprResult ActOnIntExpr(OpenACCDirectiveKind DK, OpenACCClauseKind CK,
                           SourceLocation Loc, Expr *IntExpr);
+
+  /// Called when encountering a 'var' for OpenACC, ensures it is actually a
+  /// declaration reference to a variable of the correct type.
+  ExprResult ActOnVar(Expr *VarExpr);
+
+  /// Checks and creates an Array Section used in an OpenACC construct/clause.
+  ExprResult ActOnArraySectionExpr(Expr *Base, SourceLocation LBLoc,
+                                   Expr *LowerBound,
+                                   SourceLocation ColonLocFirst, Expr *Length,
+                                   SourceLocation RBLoc);
 };
 
 } // namespace clang
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index dcfa4ac0c196..a8df5a0bda08 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -17,6 +17,7 @@
 #ifndef LLVM_CLANG_SERIALIZATION_ASTBITCODES_H
 #define LLVM_CLANG_SERIALIZATION_ASTBITCODES_H
 
+#include "clang/AST/DeclID.h"
 #include "clang/AST/DeclarationName.h"
 #include "clang/AST/Type.h"
 #include "clang/Basic/IdentifierTable.h"
@@ -59,91 +60,9 @@ const unsigned VERSION_MINOR = 1;
 /// and start at 1. 0 is reserved for NULL.
 using IdentifierID = uint32_t;
 
-/// An ID number that refers to a declaration in an AST file.
-///
-/// The ID numbers of declarations are consecutive (in order of
-/// discovery), with values below NUM_PREDEF_DECL_IDS being reserved.
-/// At the start of a chain of precompiled headers, declaration ID 1 is
-/// used for the translation unit declaration.
-///
-/// FIXME: Merge with Decl::DeclID
-using DeclID = uint32_t;
-
-class LocalDeclID {
-public:
-  explicit LocalDeclID(DeclID ID) : ID(ID) {}
-
-  DeclID get() const { return ID; }
-
-private:
-  DeclID ID;
-};
-
-/// Wrapper class for DeclID. This is helpful to not mix the use of LocalDeclID
-/// and GlobalDeclID to improve the type safety.
-class GlobalDeclID {
-public:
-  GlobalDeclID() : ID(0) {}
-  explicit GlobalDeclID(DeclID ID) : ID(ID) {}
-
-  DeclID get() const { return ID; }
-
-  explicit operator DeclID() const { return ID; }
-
-  friend bool operator==(const GlobalDeclID &LHS, const GlobalDeclID &RHS) {
-    return LHS.ID == RHS.ID;
-  }
-  friend bool operator!=(const GlobalDeclID &LHS, const GlobalDeclID &RHS) {
-    return LHS.ID != RHS.ID;
-  }
-  // We may sort the global decl ID.
-  friend bool operator<(const GlobalDeclID &LHS, const GlobalDeclID &RHS) {
-    return LHS.ID < RHS.ID;
-  }
-  friend bool operator>(const GlobalDeclID &LHS, const GlobalDeclID &RHS) {
-    return LHS.ID > RHS.ID;
-  }
-  friend bool operator<=(const GlobalDeclID &LHS, const GlobalDeclID &RHS) {
-    return LHS.ID <= RHS.ID;
-  }
-  friend bool operator>=(const GlobalDeclID &LHS, const GlobalDeclID &RHS) {
-    return LHS.ID >= RHS.ID;
-  }
-
-private:
-  DeclID ID;
-};
-
-/// A helper iterator adaptor to convert the iterators to `SmallVector<DeclID>`
-/// to the iterators to `SmallVector<GlobalDeclID>`.
-class GlobalDeclIDIterator
-    : public llvm::iterator_adaptor_base<GlobalDeclIDIterator, const DeclID *,
-                                         std::forward_iterator_tag,
-                                         GlobalDeclID> {
-public:
-  GlobalDeclIDIterator() : iterator_adaptor_base(nullptr) {}
-
-  GlobalDeclIDIterator(const DeclID *ID) : iterator_adaptor_base(ID) {}
-
-  value_type operator*() const { return GlobalDeclID(*I); }
-
-  bool operator==(const GlobalDeclIDIterator &RHS) const { return I == RHS.I; }
-};
-
-/// A helper iterator adaptor to convert the iterators to
-/// `SmallVector<GlobalDeclID>` to the iterators to `SmallVector<DeclID>`.
-class DeclIDIterator
-    : public llvm::iterator_adaptor_base<DeclIDIterator, const GlobalDeclID *,
-                                         std::forward_iterator_tag, DeclID> {
-public:
-  DeclIDIterator() : iterator_adaptor_base(nullptr) {}
-
-  DeclIDIterator(const GlobalDeclID *ID) : iterator_adaptor_base(ID) {}
-
-  value_type operator*() const { return DeclID(*I); }
-
-  bool operator==(const DeclIDIterator &RHS) const { return I == RHS.I; }
-};
+/// An ID number that refers to a declaration in an AST file. See the comments
+/// in DeclIDBase for details.
+using DeclID = DeclIDBase::DeclID;
 
 /// An ID number that refers to a type in an AST file.
 ///
@@ -1054,8 +973,8 @@ enum PredefinedTypeIDs {
   /// OpenCL reserve_id type.
   PREDEF_TYPE_RESERVE_ID_ID = 41,
 
-  /// The placeholder type for OpenMP array section.
-  PREDEF_TYPE_OMP_ARRAY_SECTION = 42,
+  /// The placeholder type for an array section.
+  PREDEF_TYPE_ARRAY_SECTION = 42,
 
   /// The '__float128' type
   PREDEF_TYPE_FLOAT128_ID = 43,
@@ -1238,74 +1157,6 @@ enum SpecialTypeIDs {
 /// The number of special type IDs.
 const unsigned NumSpecialTypeIDs = 8;
 
-/// Predefined declaration IDs.
-///
-/// These declaration IDs correspond to predefined declarations in the AST
-/// context, such as the NULL declaration ID. Such declarations are never
-/// actually serialized, since they will be built by the AST context when
-/// it is created.
-enum PredefinedDeclIDs {
-  /// The NULL declaration.
-  PREDEF_DECL_NULL_ID = 0,
-
-  /// The translation unit.
-  PREDEF_DECL_TRANSLATION_UNIT_ID = 1,
-
-  /// The Objective-C 'id' type.
-  PREDEF_DECL_OBJC_ID_ID = 2,
-
-  /// The Objective-C 'SEL' type.
-  PREDEF_DECL_OBJC_SEL_ID = 3,
-
-  /// The Objective-C 'Class' type.
-  PREDEF_DECL_OBJC_CLASS_ID = 4,
-
-  /// The Objective-C 'Protocol' type.
-  PREDEF_DECL_OBJC_PROTOCOL_ID = 5,
-
-  /// The signed 128-bit integer type.
-  PREDEF_DECL_INT_128_ID = 6,
-
-  /// The unsigned 128-bit integer type.
-  PREDEF_DECL_UNSIGNED_INT_128_ID = 7,
-
-  /// The internal 'instancetype' typedef.
-  PREDEF_DECL_OBJC_INSTANCETYPE_ID = 8,
-
-  /// The internal '__builtin_va_list' typedef.
-  PREDEF_DECL_BUILTIN_VA_LIST_ID = 9,
-
-  /// The internal '__va_list_tag' struct, if any.
-  PREDEF_DECL_VA_LIST_TAG = 10,
-
-  /// The internal '__builtin_ms_va_list' typedef.
-  PREDEF_DECL_BUILTIN_MS_VA_LIST_ID = 11,
-
-  /// The predeclared '_GUID' struct.
-  PREDEF_DECL_BUILTIN_MS_GUID_ID = 12,
-
-  /// The extern "C" context.
-  PREDEF_DECL_EXTERN_C_CONTEXT_ID = 13,
-
-  /// The internal '__make_integer_seq' template.
-  PREDEF_DECL_MAKE_INTEGER_SEQ_ID = 14,
-
-  /// The internal '__NSConstantString' typedef.
-  PREDEF_DECL_CF_CONSTANT_STRING_ID = 15,
-
-  /// The internal '__NSConstantString' tag type.
-  PREDEF_DECL_CF_CONSTANT_STRING_TAG_ID = 16,
-
-  /// The internal '__type_pack_element' template.
-  PREDEF_DECL_TYPE_PACK_ELEMENT_ID = 17,
-};
-
-/// The number of declaration IDs that are predefined.
-///
-/// For more information about predefined declarations, see the
-/// \c PredefinedDeclIDs type and the PREDEF_DECL_*_ID constants.
-const unsigned int NUM_PREDEF_DECL_IDS = 18;
-
 /// Record of updates for a declaration that was modified after
 /// being deserialized. This can occur within DECLTYPES_BLOCK_ID.
 const unsigned int DECL_UPDATES = 49;
@@ -2075,7 +1926,7 @@ enum StmtCode {
   STMT_OMP_TARGET_TEAMS_GENERIC_LOOP_DIRECTIVE,
   STMT_OMP_PARALLEL_GENERIC_LOOP_DIRECTIVE,
   STMT_OMP_TARGET_PARALLEL_GENERIC_LOOP_DIRECTIVE,
-  EXPR_OMP_ARRAY_SECTION,
+  EXPR_ARRAY_SECTION,
   EXPR_OMP_ARRAY_SHAPING,
   EXPR_OMP_ITERATOR,
 
@@ -2132,7 +1983,7 @@ enum CleanupObjectKind { COK_Block, COK_CompoundLiteral };
 /// Describes the categories of an Objective-C class.
 struct ObjCCategoriesInfo {
   // The ID of the definition
-  DeclID DefinitionID;
+  LocalDeclID DefinitionID;
 
   // Offset into the array of category lists.
   unsigned Offset;
@@ -2231,27 +2082,6 @@ template <> struct DenseMapInfo<clang::serialization::DeclarationNameKey> {
   }
 };
 
-template <> struct DenseMapInfo<clang::serialization::GlobalDeclID> {
-  using DeclID = clang::serialization::DeclID;
-  using GlobalDeclID = clang::serialization::GlobalDeclID;
-
-  static GlobalDeclID getEmptyKey() {
-    return GlobalDeclID(DenseMapInfo<DeclID>::getEmptyKey());
-  }
-
-  static GlobalDeclID getTombstoneKey() {
-    return GlobalDeclID(DenseMapInfo<DeclID>::getTombstoneKey());
-  }
-
-  static unsigned getHashValue(const GlobalDeclID &Key) {
-    return DenseMapInfo<DeclID>::getHashValue(Key.get());
-  }
-
-  static bool isEqual(const GlobalDeclID &L, const GlobalDeclID &R) {
-    return L == R;
-  }
-};
-
 } // namespace llvm
 
 #endif // LLVM_CLANG_SERIALIZATION_ASTBITCODES_H
diff --git a/clang/include/clang/Serialization/ASTDeserializationListener.h b/clang/include/clang/Serialization/ASTDeserializationListener.h
index f3a01a4b9731..3ab7f1a91843 100644
--- a/clang/include/clang/Serialization/ASTDeserializationListener.h
+++ b/clang/include/clang/Serialization/ASTDeserializationListener.h
@@ -44,7 +44,7 @@ public:
   ///        unqualified.
   virtual void TypeRead(serialization::TypeIdx Idx, QualType T) { }
   /// A decl was deserialized from the AST file.
-  virtual void DeclRead(serialization::DeclID ID, const Decl *D) { }
+  virtual void DeclRead(GlobalDeclID ID, const Decl *D) {}
   /// A selector was read from the AST file.
   virtual void SelectorRead(serialization::SelectorID iD, Selector Sel) {}
   /// A macro definition was read from the AST file.
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index ed917aa16422..64f1ebc117b3 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -501,10 +501,7 @@ private:
   /// = I + 1 has already been loaded.
   llvm::PagedVector<Decl *> DeclsLoaded;
 
-  static_assert(std::is_same_v<serialization::DeclID, Decl::DeclID>);
-
-  using GlobalDeclMapType =
-      ContinuousRangeMap<serialization::GlobalDeclID, ModuleFile *, 4>;
+  using GlobalDeclMapType = ContinuousRangeMap<GlobalDeclID, ModuleFile *, 4>;
 
   /// Mapping from global declaration IDs to the module in which the
   /// declaration resides.
@@ -512,16 +509,15 @@ private:
 
   using FileOffset = std::pair<ModuleFile *, uint64_t>;
   using FileOffsetsTy = SmallVector<FileOffset, 2>;
-  using DeclUpdateOffsetsMap =
-      llvm::DenseMap<serialization::GlobalDeclID, FileOffsetsTy>;
+  using DeclUpdateOffsetsMap = llvm::DenseMap<GlobalDeclID, FileOffsetsTy>;
 
   /// Declarations that have modifications residing in a later file
   /// in the chain.
   DeclUpdateOffsetsMap DeclUpdateOffsets;
 
-  using DelayedNamespaceOffsetMapTy = llvm::DenseMap<
-      serialization::GlobalDeclID,
-      std::pair</*LexicalOffset*/ uint64_t, /*VisibleOffset*/ uint64_t>>;
+  using DelayedNamespaceOffsetMapTy =
+      llvm::DenseMap<GlobalDeclID, std::pair</*LexicalOffset*/ uint64_t,
+                                             /*VisibleOffset*/ uint64_t>>;
 
   /// Mapping from global declaration IDs to the lexical and visible block
   /// offset for delayed namespace in reduced BMI.
@@ -535,13 +531,12 @@ private:
 
   struct PendingUpdateRecord {
     Decl *D;
-    serialization::GlobalDeclID ID;
+    GlobalDeclID ID;
 
     // Whether the declaration was just deserialized.
     bool JustLoaded;
 
-    PendingUpdateRecord(serialization::GlobalDeclID ID, Decl *D,
-                        bool JustLoaded)
+    PendingUpdateRecord(GlobalDeclID ID, Decl *D, bool JustLoaded)
         : D(D), ID(ID), JustLoaded(JustLoaded) {}
   };
 
@@ -594,10 +589,10 @@ private:
 
   struct FileDeclsInfo {
     ModuleFile *Mod = nullptr;
-    ArrayRef<serialization::LocalDeclID> Decls;
+    ArrayRef<LocalDeclID> Decls;
 
     FileDeclsInfo() = default;
-    FileDeclsInfo(ModuleFile *Mod, ArrayRef<serialization::LocalDeclID> Decls)
+    FileDeclsInfo(ModuleFile *Mod, ArrayRef<LocalDeclID> Decls)
         : Mod(Mod), Decls(Decls) {}
   };
 
@@ -635,8 +630,7 @@ private:
 
   /// Updates to the visible declarations of declaration contexts that
   /// haven't been loaded yet.
-  llvm::DenseMap<serialization::GlobalDeclID, DeclContextVisibleUpdates>
-      PendingVisibleUpdates;
+  llvm::DenseMap<GlobalDeclID, DeclContextVisibleUpdates> PendingVisibleUpdates;
 
   /// The set of C++ or Objective-C classes that have forward
   /// declarations that have not yet been linked to their definitions.
@@ -662,8 +656,7 @@ private:
   /// Read the record that describes the visible contents of a DC.
   bool ReadVisibleDeclContextStorage(ModuleFile &M,
                                      llvm::BitstreamCursor &Cursor,
-                                     uint64_t Offset,
-                                     serialization::GlobalDeclID ID);
+                                     uint64_t Offset, GlobalDeclID ID);
 
   /// A vector containing identifiers that have already been
   /// loaded.
@@ -816,14 +809,14 @@ private:
   /// This contains the data loaded from all EAGERLY_DESERIALIZED_DECLS blocks
   /// in the chain. The referenced declarations are deserialized and passed to
   /// the consumer eagerly.
-  SmallVector<serialization::GlobalDeclID, 16> EagerlyDeserializedDecls;
+  SmallVector<GlobalDeclID, 16> EagerlyDeserializedDecls;
 
   /// The IDs of all tentative definitions stored in the chain.
   ///
   /// Sema keeps track of all tentative definitions in a TU because it has to
   /// complete them and pass them on to CodeGen. Thus, tentative definitions in
   /// the PCH chain must be eagerly deserialized.
-  SmallVector<serialization::GlobalDeclID, 16> TentativeDefinitions;
+  SmallVector<GlobalDeclID, 16> TentativeDefinitions;
 
   /// The IDs of all CXXRecordDecls stored in the chain whose VTables are
   /// used.
@@ -831,7 +824,7 @@ private:
   /// CodeGen has to emit VTables for these records, so they have to be eagerly
   /// deserialized.
   struct VTableUse {
-    serialization::GlobalDeclID ID;
+    GlobalDeclID ID;
     SourceLocation::UIntTy RawLoc;
     bool Used;
   };
@@ -844,7 +837,7 @@ private:
   /// instantiation where the first value is the ID of the decl and the second
   /// is the instantiation location.
   struct PendingInstantiation {
-    serialization::GlobalDeclID ID;
+    GlobalDeclID ID;
     SourceLocation::UIntTy RawLoc;
   };
   SmallVector<PendingInstantiation, 64> PendingInstantiations;
@@ -857,11 +850,11 @@ private:
 
   /// A snapshot of Sema's unused file-scoped variable tracking, for
   /// generating warnings.
-  SmallVector<serialization::GlobalDeclID, 16> UnusedFileScopedDecls;
+  SmallVector<GlobalDeclID, 16> UnusedFileScopedDecls;
 
   /// A list of all the delegating constructors we've seen, to diagnose
   /// cycles.
-  SmallVector<serialization::GlobalDeclID, 4> DelegatingCtorDecls;
+  SmallVector<GlobalDeclID, 4> DelegatingCtorDecls;
 
   /// Method selectors used in a @selector expression. Used for
   /// implementation of -Wselector.
@@ -874,7 +867,7 @@ private:
   /// The IDs of type aliases for ext_vectors that exist in the chain.
   ///
   /// Used by Sema for finding sugared names for ext_vectors in diagnostics.
-  SmallVector<serialization::GlobalDeclID, 4> ExtVectorDecls;
+  SmallVector<GlobalDeclID, 4> ExtVectorDecls;
 
   //@}
 
@@ -885,7 +878,7 @@ private:
   /// The IDs of all potentially unused typedef names in the chain.
   ///
   /// Sema tracks these to emit warnings.
-  SmallVector<serialization::GlobalDeclID, 16> UnusedLocalTypedefNameCandidates;
+  SmallVector<GlobalDeclID, 16> UnusedLocalTypedefNameCandidates;
 
   /// Our current depth in #pragma cuda force_host_device begin/end
   /// macros.
@@ -894,7 +887,7 @@ private:
   /// The IDs of the declarations Sema stores directly.
   ///
   /// Sema tracks a few important decls, such as namespace std, directly.
-  SmallVector<serialization::GlobalDeclID, 4> SemaDeclRefs;
+  SmallVector<GlobalDeclID, 4> SemaDeclRefs;
 
   /// The IDs of the types ASTContext stores directly.
   ///
@@ -905,7 +898,7 @@ private:
   ///
   /// The AST context tracks a few important decls, currently cudaConfigureCall,
   /// directly.
-  SmallVector<serialization::GlobalDeclID, 2> CUDASpecialDeclRefs;
+  SmallVector<GlobalDeclID, 2> CUDASpecialDeclRefs;
 
   /// The floating point pragma option settings.
   SmallVector<uint64_t, 1> FPPragmaOptions;
@@ -954,12 +947,12 @@ private:
   llvm::DenseMap<const Decl *, std::set<std::string>> OpenCLDeclExtMap;
 
   /// A list of the namespaces we've seen.
-  SmallVector<serialization::GlobalDeclID, 4> KnownNamespaces;
+  SmallVector<GlobalDeclID, 4> KnownNamespaces;
 
   /// A list of undefined decls with internal linkage followed by the
   /// SourceLocation of a matching ODR-use.
   struct UndefinedButUsedDecl {
-    serialization::GlobalDeclID ID;
+    GlobalDeclID ID;
     SourceLocation::UIntTy RawLoc;
   };
   SmallVector<UndefinedButUsedDecl, 8> UndefinedButUsed;
@@ -974,8 +967,7 @@ private:
   /// The IDs of all decls to be checked for deferred diags.
   ///
   /// Sema tracks these to emit deferred diags.
-  llvm::SmallSetVector<serialization::GlobalDeclID, 4>
-      DeclsToCheckForDeferredDiags;
+  llvm::SmallSetVector<GlobalDeclID, 4> DeclsToCheckForDeferredDiags;
 
 private:
   struct ImportedSubmodule {
@@ -1112,7 +1104,7 @@ private:
   ///
   /// The declarations on the identifier chain for these identifiers will be
   /// loaded once the recursive loading has completed.
-  llvm::MapVector<IdentifierInfo *, SmallVector<serialization::GlobalDeclID, 4>>
+  llvm::MapVector<IdentifierInfo *, SmallVector<GlobalDeclID, 4>>
       PendingIdentifierInfos;
 
   /// The set of lookup results that we have faked in order to support
@@ -1157,8 +1149,8 @@ private:
   /// been loaded but its DeclContext was not set yet.
   struct PendingDeclContextInfo {
     Decl *D;
-    serialization::GlobalDeclID SemaDC;
-    serialization::GlobalDeclID LexicalDC;
+    GlobalDeclID SemaDC;
+    GlobalDeclID LexicalDC;
   };
 
   /// The set of Decls that have been loaded but their DeclContexts are
@@ -1239,8 +1231,7 @@ private:
   /// module is loaded.
   SmallVector<ObjCInterfaceDecl *, 16> ObjCClassesLoaded;
 
-  using KeyDeclsMap =
-      llvm::DenseMap<Decl *, SmallVector<serialization::GlobalDeclID, 2>>;
+  using KeyDeclsMap = llvm::DenseMap<Decl *, SmallVector<GlobalDeclID, 2>>;
 
   /// A mapping from canonical declarations to the set of global
   /// declaration IDs for key declaration that have been merged with that
@@ -1449,7 +1440,7 @@ private:
   QualType readTypeRecord(unsigned Index);
   RecordLocation TypeCursorForIndex(unsigned Index);
   void LoadedDecl(unsigned Index, Decl *D);
-  Decl *ReadDeclRecord(serialization::GlobalDeclID ID);
+  Decl *ReadDeclRecord(GlobalDeclID ID);
   void markIncompleteDeclChain(Decl *D);
 
   /// Returns the most recent declaration of a declaration (which must be
@@ -1457,11 +1448,10 @@ private:
   /// merged into its redecl chain.
   Decl *getMostRecentExistingDecl(Decl *D);
 
-  RecordLocation DeclCursorForID(serialization::GlobalDeclID ID,
-                                 SourceLocation &Location);
+  RecordLocation DeclCursorForID(GlobalDeclID ID, SourceLocation &Location);
   void loadDeclUpdateRecords(PendingUpdateRecord &Record);
   void loadPendingDeclChain(Decl *D, uint64_t LocalOffset);
-  void loadObjCCategories(serialization::GlobalDeclID ID, ObjCInterfaceDecl *D,
+  void loadObjCCategories(GlobalDeclID ID, ObjCInterfaceDecl *D,
                           unsigned PreviousGeneration = 0);
 
   RecordLocation getLocalBitOffset(uint64_t GlobalOffset);
@@ -1496,11 +1486,10 @@ private:
                                unsigned ClientLoadCapabilities);
 
 public:
-  class ModuleDeclIterator
-      : public llvm::iterator_adaptor_base<
-            ModuleDeclIterator, const serialization::LocalDeclID *,
-            std::random_access_iterator_tag, const Decl *, ptrdiff_t,
-            const Decl *, const Decl *> {
+  class ModuleDeclIterator : public llvm::iterator_adaptor_base<
+                                 ModuleDeclIterator, const LocalDeclID *,
+                                 std::random_access_iterator_tag, const Decl *,
+                                 ptrdiff_t, const Decl *, const Decl *> {
     ASTReader *Reader = nullptr;
     ModuleFile *Mod = nullptr;
 
@@ -1508,7 +1497,7 @@ public:
     ModuleDeclIterator() : iterator_adaptor_base(nullptr) {}
 
     ModuleDeclIterator(ASTReader *Reader, ModuleFile *Mod,
-                       const serialization::LocalDeclID *Pos)
+                       const LocalDeclID *Pos)
         : iterator_adaptor_base(Pos), Reader(Reader), Mod(Mod) {}
 
     value_type operator*() const {
@@ -1536,9 +1525,8 @@ private:
 
   void pushExternalDeclIntoScope(NamedDecl *D, DeclarationName Name);
 
-  void addPendingDeclContextInfo(Decl *D,
-                                 serialization::GlobalDeclID SemaDC,
-                                 serialization::GlobalDeclID LexicalDC) {
+  void addPendingDeclContextInfo(Decl *D, GlobalDeclID SemaDC,
+                                 GlobalDeclID LexicalDC) {
     assert(D);
     PendingDeclContextInfo Info = { D, SemaDC, LexicalDC };
     PendingDeclContextInfos.push_back(Info);
@@ -1916,38 +1904,36 @@ public:
 
   /// Map from a local declaration ID within a given module to a
   /// global declaration ID.
-  serialization::GlobalDeclID
-  getGlobalDeclID(ModuleFile &F, serialization::LocalDeclID LocalID) const;
+  GlobalDeclID getGlobalDeclID(ModuleFile &F, LocalDeclID LocalID) const;
 
   /// Returns true if global DeclID \p ID originated from module \p M.
-  bool isDeclIDFromModule(serialization::GlobalDeclID ID, ModuleFile &M) const;
+  bool isDeclIDFromModule(GlobalDeclID ID, ModuleFile &M) const;
 
   /// Retrieve the module file that owns the given declaration, or NULL
   /// if the declaration is not from a module file.
   ModuleFile *getOwningModuleFile(const Decl *D);
 
   /// Returns the source location for the decl \p ID.
-  SourceLocation getSourceLocationForDeclID(serialization::GlobalDeclID ID);
+  SourceLocation getSourceLocationForDeclID(GlobalDeclID ID);
 
   /// Resolve a declaration ID into a declaration, potentially
   /// building a new declaration.
-  Decl *GetDecl(serialization::GlobalDeclID ID);
-  Decl *GetExternalDecl(Decl::DeclID ID) override;
+  Decl *GetDecl(GlobalDeclID ID);
+  Decl *GetExternalDecl(GlobalDeclID ID) override;
 
   /// Resolve a declaration ID into a declaration. Return 0 if it's not
   /// been loaded yet.
-  Decl *GetExistingDecl(serialization::GlobalDeclID ID);
+  Decl *GetExistingDecl(GlobalDeclID ID);
 
   /// Reads a declaration with the given local ID in the given module.
-  Decl *GetLocalDecl(ModuleFile &F, serialization::LocalDeclID LocalID) {
+  Decl *GetLocalDecl(ModuleFile &F, LocalDeclID LocalID) {
     return GetDecl(getGlobalDeclID(F, LocalID));
   }
 
   /// Reads a declaration with the given local ID in the given module.
   ///
   /// \returns The requested declaration, casted to the given return type.
-  template <typename T>
-  T *GetLocalDeclAs(ModuleFile &F, serialization::LocalDeclID LocalID) {
+  template <typename T> T *GetLocalDeclAs(ModuleFile &F, LocalDeclID LocalID) {
     return cast_or_null<T>(GetLocalDecl(F, LocalID));
   }
 
@@ -1956,16 +1942,15 @@ public:
   ///
   /// \returns the global ID of the given declaration as known in the given
   /// module file.
-  serialization::DeclID
-  mapGlobalIDToModuleFileGlobalID(ModuleFile &M,
-                                  serialization::GlobalDeclID GlobalID);
+  LocalDeclID mapGlobalIDToModuleFileGlobalID(ModuleFile &M,
+                                              GlobalDeclID GlobalID);
 
   /// Reads a declaration ID from the given position in a record in the
   /// given module.
   ///
   /// \returns The declaration ID read from the record, adjusted to a global ID.
-  serialization::GlobalDeclID
-  ReadDeclID(ModuleFile &F, const RecordData &Record, unsigned &Idx);
+  GlobalDeclID ReadDeclID(ModuleFile &F, const RecordData &Record,
+                          unsigned &Idx);
 
   /// Reads a declaration from the given position in a record in the
   /// given module.
@@ -2139,10 +2124,9 @@ public:
   void LoadSelector(Selector Sel);
 
   void SetIdentifierInfo(unsigned ID, IdentifierInfo *II);
-  void SetGloballyVisibleDecls(
-      IdentifierInfo *II,
-      const SmallVectorImpl<serialization::GlobalDeclID> &DeclIDs,
-      SmallVectorImpl<Decl *> *Decls = nullptr);
+  void SetGloballyVisibleDecls(IdentifierInfo *II,
+                               const SmallVectorImpl<GlobalDeclID> &DeclIDs,
+                               SmallVectorImpl<Decl *> *Decls = nullptr);
 
   /// Report a diagnostic.
   DiagnosticBuilder Diag(unsigned DiagID) const;
@@ -2383,7 +2367,7 @@ public:
 
   // Contains the IDs for declarations that were requested before we have
   // access to a Sema object.
-  SmallVector<serialization::GlobalDeclID, 16> PreloadedDeclIDs;
+  SmallVector<GlobalDeclID, 16> PreloadedDeclIDs;
 
   /// Retrieve the semantic analysis object used to analyze the
   /// translation unit in which the precompiled header is being
diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h
index 9eaf50a76d52..1e11d2d5e42f 100644
--- a/clang/include/clang/Serialization/ASTRecordReader.h
+++ b/clang/include/clang/Serialization/ASTRecordReader.h
@@ -136,7 +136,7 @@ public:
   /// Reads a declaration with the given local ID in the given module.
   ///
   /// \returns The requested declaration, casted to the given return type.
-  template <typename T> T *GetLocalDeclAs(serialization::LocalDeclID LocalID) {
+  template <typename T> T *GetLocalDeclAs(LocalDeclID LocalID) {
     return cast_or_null<T>(Reader->GetLocalDecl(*F, LocalID));
   }
 
@@ -182,9 +182,7 @@ public:
   /// Reads a declaration ID from the given position in this record.
   ///
   /// \returns The declaration ID read from the record, adjusted to a global ID.
-  serialization::GlobalDeclID readDeclID() {
-    return Reader->ReadDeclID(*F, Record, Idx);
-  }
+  GlobalDeclID readDeclID() { return Reader->ReadDeclID(*F, Record, Idx); }
 
   /// Reads a declaration from the given position in a record in the
   /// given module, advancing Idx.
@@ -271,6 +269,9 @@ public:
   /// Read an OpenMP children, advancing Idx.
   void readOMPChildren(OMPChildren *Data);
 
+  /// Read a list of Exprs used for a var-list.
+  llvm::SmallVector<Expr *> readOpenACCVarList();
+
   /// Read an OpenACC clause, advancing Idx.
   OpenACCClause *readOpenACCClause();
 
diff --git a/clang/include/clang/Serialization/ASTRecordWriter.h b/clang/include/clang/Serialization/ASTRecordWriter.h
index 1feb8fcbacf7..8b1da49bd4c5 100644
--- a/clang/include/clang/Serialization/ASTRecordWriter.h
+++ b/clang/include/clang/Serialization/ASTRecordWriter.h
@@ -15,6 +15,7 @@
 #define LLVM_CLANG_SERIALIZATION_ASTRECORDWRITER_H
 
 #include "clang/AST/AbstractBasicWriter.h"
+#include "clang/AST/OpenACCClause.h"
 #include "clang/AST/OpenMPClause.h"
 #include "clang/Serialization/ASTWriter.h"
 #include "clang/Serialization/SourceLocationEncoding.h"
@@ -293,6 +294,8 @@ public:
   /// Writes data related to the OpenMP directives.
   void writeOMPChildren(OMPChildren *Data);
 
+  void writeOpenACCVarList(const OpenACCClauseWithVarList *C);
+
   /// Writes out a single OpenACC Clause.
   void writeOpenACCClause(const OpenACCClause *C);
 
diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index 13b4ad4ad295..a55dfd327670 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -76,6 +76,10 @@ class StoredDeclsList;
 class SwitchCase;
 class Token;
 
+namespace SrcMgr {
+class FileInfo;
+} // namespace SrcMgr
+
 /// Writes an AST file containing the contents of a translation unit.
 ///
 /// The ASTWriter class produces a bitstream containing the serialized
@@ -212,10 +216,10 @@ private:
   llvm::SmallVector<NamespaceDecl *, 16> DelayedNamespace;
 
   /// The first ID number we can use for our own declarations.
-  serialization::DeclID FirstDeclID = serialization::NUM_PREDEF_DECL_IDS;
+  LocalDeclID FirstDeclID = LocalDeclID(clang::NUM_PREDEF_DECL_IDS);
 
   /// The decl ID that will be assigned to the next new decl.
-  serialization::DeclID NextDeclID = FirstDeclID;
+  LocalDeclID NextDeclID = FirstDeclID;
 
   /// Map that provides the ID numbers of each declaration within
   /// the output stream, as well as those deserialized from a chained PCH.
@@ -223,7 +227,7 @@ private:
   /// The ID numbers of declarations are consecutive (in order of
   /// discovery) and start at 2. 1 is reserved for the translation
   /// unit, while 0 is reserved for NULL.
-  llvm::DenseMap<const Decl *, serialization::DeclID> DeclIDs;
+  llvm::DenseMap<const Decl *, LocalDeclID> DeclIDs;
 
   /// Offset of each declaration in the bitstream, indexed by
   /// the declaration's ID.
@@ -233,9 +237,8 @@ private:
   /// are relative to this value.
   uint64_t DeclTypesBlockStartOffset = 0;
 
-  /// Sorted (by file offset) vector of pairs of file offset/DeclID.
-  using LocDeclIDsTy =
-      SmallVector<std::pair<unsigned, serialization::DeclID>, 64>;
+  /// Sorted (by file offset) vector of pairs of file offset/LocalDeclID.
+  using LocDeclIDsTy = SmallVector<std::pair<unsigned, LocalDeclID>, 64>;
   struct DeclIDInFileInfo {
     LocDeclIDsTy DeclIDs;
 
@@ -250,7 +253,7 @@ private:
   /// that it contains.
   FileDeclIDsTy FileDeclIDs;
 
-  void associateDeclWithFile(const Decl *D, serialization::DeclID);
+  void associateDeclWithFile(const Decl *D, LocalDeclID);
 
   /// The first ID number we can use for our own types.
   serialization::TypeID FirstTypeID = serialization::NUM_PREDEF_TYPE_IDS;
@@ -421,8 +424,8 @@ private:
   /// headers. The declarations themselves are stored as declaration
   /// IDs, since they will be written out to an EAGERLY_DESERIALIZED_DECLS
   /// record.
-  SmallVector<serialization::DeclID, 16> EagerlyDeserializedDecls;
-  SmallVector<serialization::DeclID, 16> ModularCodegenDecls;
+  RecordData EagerlyDeserializedDecls;
+  RecordData ModularCodegenDecls;
 
   /// DeclContexts that have received extensions since their serialized
   /// form.
@@ -491,6 +494,11 @@ private:
   /// during \c SourceManager serialization.
   void computeNonAffectingInputFiles();
 
+  /// Some affecting files can be included from files that are not affecting.
+  /// This function erases source locations pointing into such files.
+  SourceLocation getAffectingIncludeLoc(const SourceManager &SourceMgr,
+                                        const SrcMgr::FileInfo &File);
+
   /// Returns an adjusted \c FileID, accounting for any non-affecting input
   /// files.
   FileID getAdjustedFileID(FileID FID) const;
@@ -526,6 +534,7 @@ private:
 
   /// Calculate hash of the pcm content.
   std::pair<ASTFileSignature, ASTFileSignature> createSignature() const;
+  ASTFileSignature createSignatureForNamedModule() const;
 
   void WriteInputFiles(SourceManager &SourceMgr, HeaderSearchOptions &HSOpts);
   void WriteSourceManagerBlock(SourceManager &SourceMgr,
@@ -709,7 +718,7 @@ public:
       return false;
     auto I = DeclIDs.find(D);
     return (I == DeclIDs.end() ||
-            I->second >= serialization::NUM_PREDEF_DECL_IDS);
+            I->second.get() >= clang::NUM_PREDEF_DECL_IDS);
   };
 
   /// Emit a reference to a declaration.
@@ -717,12 +726,13 @@ public:
   // Emit a reference to a declaration if the declaration was emitted.
   void AddEmittedDeclRef(const Decl *D, RecordDataImpl &Record);
 
-  /// Force a declaration to be emitted and get its ID.
-  serialization::DeclID GetDeclRef(const Decl *D);
+  /// Force a declaration to be emitted and get its local ID to the module file
+  /// been writing.
+  LocalDeclID GetDeclRef(const Decl *D);
 
-  /// Determine the declaration ID of an already-emitted
+  /// Determine the local declaration ID of an already-emitted
   /// declaration.
-  serialization::DeclID getDeclID(const Decl *D);
+  LocalDeclID getDeclID(const Decl *D);
 
   /// Whether or not the declaration got emitted. If not, it wouldn't be
   /// emitted.
@@ -885,6 +895,8 @@ private:
 /// AST and semantic-analysis consumer that generates a
 /// precompiled header from the parsed source code.
 class PCHGenerator : public SemaConsumer {
+  void anchor() override;
+
   Preprocessor &PP;
   std::string OutputFile;
   std::string isysroot;
@@ -928,17 +940,34 @@ public:
   bool hasEmittedPCH() const { return Buffer->IsComplete; }
 };
 
-class ReducedBMIGenerator : public PCHGenerator {
+class CXX20ModulesGenerator : public PCHGenerator {
+  void anchor() override;
+
 protected:
   virtual Module *getEmittingModule(ASTContext &Ctx) override;
 
+  CXX20ModulesGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache,
+                        StringRef OutputFile, bool GeneratingReducedBMI);
+
 public:
-  ReducedBMIGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache,
-                      StringRef OutputFile);
+  CXX20ModulesGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache,
+                        StringRef OutputFile)
+      : CXX20ModulesGenerator(PP, ModuleCache, OutputFile,
+                              /*GeneratingReducedBMI=*/false) {}
 
   void HandleTranslationUnit(ASTContext &Ctx) override;
 };
 
+class ReducedBMIGenerator : public CXX20ModulesGenerator {
+  void anchor() override;
+
+public:
+  ReducedBMIGenerator(Preprocessor &PP, InMemoryModuleCache &ModuleCache,
+                      StringRef OutputFile)
+      : CXX20ModulesGenerator(PP, ModuleCache, OutputFile,
+                              /*GeneratingReducedBMI=*/true) {}
+};
+
 /// If we can elide the definition of \param D in reduced BMI.
 ///
 /// Generally, we can elide the definition of a declaration if it won't affect
diff --git a/clang/include/clang/Serialization/ModuleFile.h b/clang/include/clang/Serialization/ModuleFile.h
index 492c35dceb08..25f644e76edb 100644
--- a/clang/include/clang/Serialization/ModuleFile.h
+++ b/clang/include/clang/Serialization/ModuleFile.h
@@ -474,7 +474,7 @@ public:
   llvm::DenseMap<ModuleFile *, serialization::DeclID> GlobalToLocalDeclIDs;
 
   /// Array of file-level DeclIDs sorted by file.
-  const serialization::LocalDeclID *FileSortedDecls = nullptr;
+  const LocalDeclID *FileSortedDecls = nullptr;
   unsigned NumFileSortedDecls = 0;
 
   /// Array of category list location information within this
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 9aa1c6ddfe44..520286b57c9f 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -563,6 +563,20 @@ def MismatchedDeallocatorChecker : Checker<"MismatchedDeallocator">,
   Dependencies<[DynamicMemoryModeling]>,
   Documentation<HasDocumentation>;
 
+// This must appear before StdCLibraryFunctionsChecker because a dependency.
+def StreamChecker : Checker<"Stream">,
+  HelpText<"Check stream handling functions">,
+  WeakDependencies<[NonNullParamChecker]>,
+  CheckerOptions<[
+    CmdLineOption<Boolean,
+                  "Pedantic",
+                  "If false, assume that stream operations which are often not "
+                  "checked for error do not fail.",
+                  "false",
+                  InAlpha>
+  ]>,
+  Documentation<HasDocumentation>;
+
 def StdCLibraryFunctionsChecker : Checker<"StdCLibraryFunctions">,
   HelpText<"Check for invalid arguments of C standard library functions, "
            "and apply relations between arguments and return value">,
@@ -581,7 +595,7 @@ def StdCLibraryFunctionsChecker : Checker<"StdCLibraryFunctions">,
                   "true",
                   InAlpha>
   ]>,
-  WeakDependencies<[CallAndMessageChecker, NonNullParamChecker]>,
+  WeakDependencies<[CallAndMessageChecker, NonNullParamChecker, StreamChecker]>,
   Documentation<HasDocumentation>;
 
 def VforkChecker : Checker<"Vfork">,
@@ -601,20 +615,6 @@ def PthreadLockChecker : Checker<"PthreadLock">,
   Dependencies<[PthreadLockBase]>,
   Documentation<HasDocumentation>;
 
-def StreamChecker : Checker<"Stream">,
-  HelpText<"Check stream handling functions">,
-  WeakDependencies<[NonNullParamChecker]>,
-  CheckerOptions<[
-    CmdLineOption<Boolean,
-                  "Pedantic",
-                  "If false, assume that stream operations which are often not "
-                  "checked for error do not fail."
-                  "fail.",
-                  "false",
-                  InAlpha>
-  ]>,
-  Documentation<HasDocumentation>;
-
 def SimpleStreamChecker : Checker<"SimpleStream">,
   HelpText<"Check for misuses of stream APIs">,
   Documentation<HasDocumentation>;
@@ -1628,6 +1628,7 @@ def TaintTesterChecker : Checker<"TaintTest">,
 def StreamTesterChecker : Checker<"StreamTester">,
   HelpText<"Add test functions to StreamChecker for test and debugging "
            "purposes.">,
+  WeakDependencies<[StreamChecker]>,
   Documentation<NotDocumented>;
 
 def ErrnoTesterChecker : Checker<"ErrnoTest">,
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
index fac0c04ae2ca..ef23b160a3c0 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
@@ -225,15 +225,11 @@ public:
   ///   invalidated. This should include any regions explicitly invalidated
   ///   even if they do not currently have bindings. Pass \c NULL if this
   ///   information will not be used.
-  virtual StoreRef invalidateRegions(Store store,
-                                  ArrayRef<SVal> Values,
-                                  const Expr *E, unsigned Count,
-                                  const LocationContext *LCtx,
-                                  const CallEvent *Call,
-                                  InvalidatedSymbols &IS,
-                                  RegionAndSymbolInvalidationTraits &ITraits,
-                                  InvalidatedRegions *InvalidatedTopLevel,
-                                  InvalidatedRegions *Invalidated) = 0;
+  virtual StoreRef invalidateRegions(
+      Store store, ArrayRef<SVal> Values, const Expr *Ex, unsigned Count,
+      const LocationContext *LCtx, const CallEvent *Call,
+      InvalidatedSymbols &IS, RegionAndSymbolInvalidationTraits &ITraits,
+      InvalidatedRegions *TopLevelRegions, InvalidatedRegions *Invalidated) = 0;
 
   /// enterStackFrame - Let the StoreManager to do something when execution
   /// engine is about to execute into a callee.
diff --git a/clang/include/clang/Tooling/CommonOptionsParser.h b/clang/include/clang/Tooling/CommonOptionsParser.h
index 3c0480af3779..5e2cdc6ac458 100644
--- a/clang/include/clang/Tooling/CommonOptionsParser.h
+++ b/clang/include/clang/Tooling/CommonOptionsParser.h
@@ -49,17 +49,22 @@ namespace tooling {
 /// using namespace clang::tooling;
 /// using namespace llvm;
 ///
-/// static cl::OptionCategory MyToolCategory("My tool options");
+/// static cl::OptionCategory MyToolCategory("my-tool options");
 /// static cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage);
 /// static cl::extrahelp MoreHelp("\nMore help text...\n");
-/// static cl::opt<bool> YourOwnOption(...);
-/// ...
 ///
 /// int main(int argc, const char **argv) {
-///   CommonOptionsParser OptionsParser(argc, argv, MyToolCategory);
+///   auto ExpectedParser =
+///       CommonOptionsParser::create(argc, argv, MyToolCategory);
+///   if (!ExpectedParser) {
+///     llvm::errs() << ExpectedParser.takeError();
+///     return 1;
+///   }
+///   CommonOptionsParser& OptionsParser = ExpectedParser.get();
 ///   ClangTool Tool(OptionsParser.getCompilations(),
 ///                  OptionsParser.getSourcePathList());
-///   return Tool.run(newFrontendActionFactory<SyntaxOnlyAction>().get());
+///   return Tool.run(
+///       newFrontendActionFactory<clang::SyntaxOnlyAction>().get());
 /// }
 /// \endcode
 class CommonOptionsParser {
diff --git a/clang/lib/APINotes/APINotesFormat.h b/clang/lib/APINotes/APINotesFormat.h
index 615314c46f09..97e630e97fdc 100644
--- a/clang/lib/APINotes/APINotesFormat.h
+++ b/clang/lib/APINotes/APINotesFormat.h
@@ -24,7 +24,10 @@ const uint16_t VERSION_MAJOR = 0;
 /// API notes file minor version number.
 ///
 /// When the format changes IN ANY WAY, this number should be incremented.
-const uint16_t VERSION_MINOR = 25; // SwiftImportAs
+const uint16_t VERSION_MINOR = 26; // SwiftCopyable
+
+const uint8_t kSwiftCopyable = 1;
+const uint8_t kSwiftNonCopyable = 2;
 
 using IdentifierID = llvm::PointerEmbeddedInt<unsigned, 31>;
 using IdentifierIDField = llvm::BCVBR<16>;
diff --git a/clang/lib/APINotes/APINotesReader.cpp b/clang/lib/APINotes/APINotesReader.cpp
index dfc3beb6fa13..b60ca685f62c 100644
--- a/clang/lib/APINotes/APINotesReader.cpp
+++ b/clang/lib/APINotes/APINotesReader.cpp
@@ -527,6 +527,13 @@ public:
       Info.EnumExtensibility =
           static_cast<EnumExtensibilityKind>((Payload & 0x3) - 1);
 
+    uint8_t Copyable =
+        endian::readNext<uint8_t, llvm::endianness::little>(Data);
+    if (Copyable == kSwiftNonCopyable)
+      Info.setSwiftCopyable(std::optional(false));
+    else if (Copyable == kSwiftCopyable)
+      Info.setSwiftCopyable(std::optional(true));
+
     unsigned ImportAsLength =
         endian::readNext<uint16_t, llvm::endianness::little>(Data);
     if (ImportAsLength > 0) {
diff --git a/clang/lib/APINotes/APINotesWriter.cpp b/clang/lib/APINotes/APINotesWriter.cpp
index e3f5d102fcd0..3e6159763150 100644
--- a/clang/lib/APINotes/APINotesWriter.cpp
+++ b/clang/lib/APINotes/APINotesWriter.cpp
@@ -1128,7 +1128,7 @@ public:
     return 2 + (TI.SwiftImportAs ? TI.SwiftImportAs->size() : 0) +
            2 + (TI.SwiftRetainOp ? TI.SwiftRetainOp->size() : 0) +
            2 + (TI.SwiftReleaseOp ? TI.SwiftReleaseOp->size() : 0) +
-           1 + getCommonTypeInfoSize(TI);
+           2 + getCommonTypeInfoSize(TI);
   }
 
   void emitUnversionedInfo(raw_ostream &OS, const TagInfo &TI) {
@@ -1146,6 +1146,11 @@ public:
 
     writer.write<uint8_t>(Flags);
 
+    if (auto Copyable = TI.isSwiftCopyable())
+      writer.write<uint8_t>(*Copyable ? kSwiftCopyable : kSwiftNonCopyable);
+    else
+      writer.write<uint8_t>(0);
+
     if (auto ImportAs = TI.SwiftImportAs) {
       writer.write<uint16_t>(ImportAs->size() + 1);
       OS.write(ImportAs->c_str(), ImportAs->size());
diff --git a/clang/lib/APINotes/APINotesYAMLCompiler.cpp b/clang/lib/APINotes/APINotesYAMLCompiler.cpp
index 57d6da7a1775..2295d769d344 100644
--- a/clang/lib/APINotes/APINotesYAMLCompiler.cpp
+++ b/clang/lib/APINotes/APINotesYAMLCompiler.cpp
@@ -419,6 +419,7 @@ struct Tag {
   std::optional<EnumExtensibilityKind> EnumExtensibility;
   std::optional<bool> FlagEnum;
   std::optional<EnumConvenienceAliasKind> EnumConvenienceKind;
+  std::optional<bool> SwiftCopyable;
 };
 
 typedef std::vector<Tag> TagsSeq;
@@ -452,6 +453,7 @@ template <> struct MappingTraits<Tag> {
     IO.mapOptional("EnumExtensibility", T.EnumExtensibility);
     IO.mapOptional("FlagEnum", T.FlagEnum);
     IO.mapOptional("EnumKind", T.EnumConvenienceKind);
+    IO.mapOptional("SwiftCopyable", T.SwiftCopyable);
   }
 };
 } // namespace yaml
@@ -1009,6 +1011,9 @@ public:
       if (Tag.SwiftReleaseOp)
         TI.SwiftReleaseOp = Tag.SwiftReleaseOp;
 
+      if (Tag.SwiftCopyable)
+        TI.setSwiftCopyable(Tag.SwiftCopyable);
+
       if (Tag.EnumConvenienceKind) {
         if (Tag.EnumExtensibility) {
           emitError(
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 0f894c623bee..cbf4932aff9a 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1084,7 +1084,7 @@ void ASTContext::addModuleInitializer(Module *M, Decl *D) {
 }
 
 void ASTContext::addLazyModuleInitializers(Module *M,
-                                           ArrayRef<Decl::DeclID> IDs) {
+                                           ArrayRef<GlobalDeclID> IDs) {
   auto *&Inits = ModuleInitializers[M];
   if (!Inits)
     Inits = new (*this) PerModuleInitializers;
@@ -1321,16 +1321,14 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target,
 
   // Placeholder type for OMP array sections.
   if (LangOpts.OpenMP) {
-    InitBuiltinType(OMPArraySectionTy, BuiltinType::OMPArraySection);
+    InitBuiltinType(ArraySectionTy, BuiltinType::ArraySection);
     InitBuiltinType(OMPArrayShapingTy, BuiltinType::OMPArrayShaping);
     InitBuiltinType(OMPIteratorTy, BuiltinType::OMPIterator);
   }
-  // Placeholder type for OpenACC array sections.
-  if (LangOpts.OpenACC) {
-    // FIXME: Once we implement OpenACC array sections in Sema, this will either
-    // be combined with the OpenMP type, or given its own type. In the meantime,
-    // just use the OpenMP type so that parsing can work.
-    InitBuiltinType(OMPArraySectionTy, BuiltinType::OMPArraySection);
+  // Placeholder type for OpenACC array sections, if we are ALSO in OMP mode,
+  // don't bother, as we're just using the same type as OMP.
+  if (LangOpts.OpenACC && !LangOpts.OpenMP) {
+    InitBuiltinType(ArraySectionTy, BuiltinType::ArraySection);
   }
   if (LangOpts.MatrixTypes)
     InitBuiltinType(IncompleteMatrixIdxTy, BuiltinType::IncompleteMatrixIdx);
diff --git a/clang/lib/AST/ComputeDependence.cpp b/clang/lib/AST/ComputeDependence.cpp
index 5ec3013fabba..bad8e75b2f87 100644
--- a/clang/lib/AST/ComputeDependence.cpp
+++ b/clang/lib/AST/ComputeDependence.cpp
@@ -443,12 +443,17 @@ ExprDependence clang::computeDependence(ObjCIndirectCopyRestoreExpr *E) {
   return E->getSubExpr()->getDependence();
 }
 
-ExprDependence clang::computeDependence(OMPArraySectionExpr *E) {
+ExprDependence clang::computeDependence(ArraySectionExpr *E) {
   auto D = E->getBase()->getDependence();
   if (auto *LB = E->getLowerBound())
     D |= LB->getDependence();
   if (auto *Len = E->getLength())
     D |= Len->getDependence();
+
+  if (E->isOMPArraySection()) {
+    if (auto *Stride = E->getStride())
+      D |= Stride->getDependence();
+  }
   return D;
 }
 
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index 474e0ccde5bb..e7e95c16b697 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -2151,7 +2151,7 @@ VarDecl *VarDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation StartL,
   return new (C, DC) VarDecl(Var, C, DC, StartL, IdL, Id, T, TInfo, S);
 }
 
-VarDecl *VarDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+VarDecl *VarDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID)
       VarDecl(Var, C, nullptr, SourceLocation(), SourceLocation(), nullptr,
               QualType(), nullptr, SC_None);
@@ -2929,7 +2929,7 @@ QualType ParmVarDecl::getOriginalType() const {
   return T;
 }
 
-ParmVarDecl *ParmVarDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+ParmVarDecl *ParmVarDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID)
       ParmVarDecl(ParmVar, C, nullptr, SourceLocation(), SourceLocation(),
                   nullptr, QualType(), nullptr, SC_None, nullptr);
@@ -4553,7 +4553,7 @@ FieldDecl *FieldDecl::Create(const ASTContext &C, DeclContext *DC,
                                BW, Mutable, InitStyle);
 }
 
-FieldDecl *FieldDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+FieldDecl *FieldDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) FieldDecl(Field, nullptr, SourceLocation(),
                                SourceLocation(), nullptr, QualType(), nullptr,
                                nullptr, false, ICIS_NoInit);
@@ -4863,7 +4863,7 @@ EnumDecl *EnumDecl::Create(ASTContext &C, DeclContext *DC,
   return Enum;
 }
 
-EnumDecl *EnumDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+EnumDecl *EnumDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   EnumDecl *Enum =
       new (C, ID) EnumDecl(C, nullptr, SourceLocation(), SourceLocation(),
                            nullptr, nullptr, false, false, false);
@@ -5025,7 +5025,8 @@ RecordDecl *RecordDecl::Create(const ASTContext &C, TagKind TK, DeclContext *DC,
   return R;
 }
 
-RecordDecl *RecordDecl::CreateDeserialized(const ASTContext &C, Decl::DeclID ID) {
+RecordDecl *RecordDecl::CreateDeserialized(const ASTContext &C,
+                                           GlobalDeclID ID) {
   RecordDecl *R = new (C, ID)
       RecordDecl(Record, TagTypeKind::Struct, C, nullptr, SourceLocation(),
                  SourceLocation(), nullptr, nullptr);
@@ -5297,7 +5298,7 @@ PragmaCommentDecl *PragmaCommentDecl::Create(const ASTContext &C,
 }
 
 PragmaCommentDecl *PragmaCommentDecl::CreateDeserialized(ASTContext &C,
-                                                         Decl::DeclID ID,
+                                                         GlobalDeclID ID,
                                                          unsigned ArgSize) {
   return new (C, ID, additionalSizeToAlloc<char>(ArgSize + 1))
       PragmaCommentDecl(nullptr, SourceLocation(), PCK_Unknown);
@@ -5322,7 +5323,7 @@ PragmaDetectMismatchDecl::Create(const ASTContext &C, TranslationUnitDecl *DC,
 }
 
 PragmaDetectMismatchDecl *
-PragmaDetectMismatchDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID,
+PragmaDetectMismatchDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                              unsigned NameValueSize) {
   return new (C, ID, additionalSizeToAlloc<char>(NameValueSize + 1))
       PragmaDetectMismatchDecl(nullptr, SourceLocation(), 0);
@@ -5349,7 +5350,7 @@ LabelDecl *LabelDecl::Create(ASTContext &C, DeclContext *DC,
   return new (C, DC) LabelDecl(DC, IdentL, II, nullptr, GnuLabelL);
 }
 
-LabelDecl *LabelDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+LabelDecl *LabelDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) LabelDecl(nullptr, SourceLocation(), nullptr, nullptr,
                                SourceLocation());
 }
@@ -5390,7 +5391,7 @@ ImplicitParamDecl *ImplicitParamDecl::Create(ASTContext &C, QualType Type,
 }
 
 ImplicitParamDecl *ImplicitParamDecl::CreateDeserialized(ASTContext &C,
-                                                         Decl::DeclID ID) {
+                                                         GlobalDeclID ID) {
   return new (C, ID) ImplicitParamDecl(C, QualType(), ImplicitParamKind::Other);
 }
 
@@ -5408,7 +5409,7 @@ FunctionDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc,
   return New;
 }
 
-FunctionDecl *FunctionDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+FunctionDecl *FunctionDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) FunctionDecl(
       Function, C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(),
       nullptr, SC_None, false, false, ConstexprSpecKind::Unspecified, nullptr);
@@ -5418,7 +5419,7 @@ BlockDecl *BlockDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L) {
   return new (C, DC) BlockDecl(DC, L);
 }
 
-BlockDecl *BlockDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+BlockDecl *BlockDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) BlockDecl(nullptr, SourceLocation());
 }
 
@@ -5432,7 +5433,7 @@ CapturedDecl *CapturedDecl::Create(ASTContext &C, DeclContext *DC,
       CapturedDecl(DC, NumParams);
 }
 
-CapturedDecl *CapturedDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID,
+CapturedDecl *CapturedDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                                unsigned NumParams) {
   return new (C, ID, additionalSizeToAlloc<ImplicitParamDecl *>(NumParams))
       CapturedDecl(nullptr, NumParams);
@@ -5458,8 +5459,8 @@ EnumConstantDecl *EnumConstantDecl::Create(ASTContext &C, EnumDecl *CD,
   return new (C, CD) EnumConstantDecl(C, CD, L, Id, T, E, V);
 }
 
-EnumConstantDecl *
-EnumConstantDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+EnumConstantDecl *EnumConstantDecl::CreateDeserialized(ASTContext &C,
+                                                       GlobalDeclID ID) {
   return new (C, ID) EnumConstantDecl(C, nullptr, SourceLocation(), nullptr,
                                       QualType(), nullptr, llvm::APSInt());
 }
@@ -5486,7 +5487,7 @@ IndirectFieldDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L,
 }
 
 IndirectFieldDecl *IndirectFieldDecl::CreateDeserialized(ASTContext &C,
-                                                         Decl::DeclID ID) {
+                                                         GlobalDeclID ID) {
   return new (C, ID)
       IndirectFieldDecl(C, nullptr, SourceLocation(), DeclarationName(),
                         QualType(), std::nullopt);
@@ -5547,7 +5548,7 @@ bool TypedefNameDecl::isTransparentTagSlow() const {
   return isTransparent;
 }
 
-TypedefDecl *TypedefDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+TypedefDecl *TypedefDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) TypedefDecl(C, nullptr, SourceLocation(), SourceLocation(),
                                  nullptr, nullptr);
 }
@@ -5560,7 +5561,8 @@ TypeAliasDecl *TypeAliasDecl::Create(ASTContext &C, DeclContext *DC,
   return new (C, DC) TypeAliasDecl(C, DC, StartLoc, IdLoc, Id, TInfo);
 }
 
-TypeAliasDecl *TypeAliasDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+TypeAliasDecl *TypeAliasDecl::CreateDeserialized(ASTContext &C,
+                                                 GlobalDeclID ID) {
   return new (C, ID) TypeAliasDecl(C, nullptr, SourceLocation(),
                                    SourceLocation(), nullptr, nullptr);
 }
@@ -5591,7 +5593,7 @@ FileScopeAsmDecl *FileScopeAsmDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 FileScopeAsmDecl *FileScopeAsmDecl::CreateDeserialized(ASTContext &C,
-                                                       Decl::DeclID ID) {
+                                                       GlobalDeclID ID) {
   return new (C, ID) FileScopeAsmDecl(nullptr, nullptr, SourceLocation(),
                                       SourceLocation());
 }
@@ -5609,7 +5611,7 @@ TopLevelStmtDecl *TopLevelStmtDecl::Create(ASTContext &C, Stmt *Statement) {
 }
 
 TopLevelStmtDecl *TopLevelStmtDecl::CreateDeserialized(ASTContext &C,
-                                                       Decl::DeclID ID) {
+                                                       GlobalDeclID ID) {
   return new (C, ID)
       TopLevelStmtDecl(/*DC=*/nullptr, SourceLocation(), /*S=*/nullptr);
 }
@@ -5630,7 +5632,7 @@ EmptyDecl *EmptyDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L) {
   return new (C, DC) EmptyDecl(DC, L);
 }
 
-EmptyDecl *EmptyDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+EmptyDecl *EmptyDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) EmptyDecl(nullptr, SourceLocation());
 }
 
@@ -5663,7 +5665,8 @@ HLSLBufferDecl *HLSLBufferDecl::Create(ASTContext &C,
   return Result;
 }
 
-HLSLBufferDecl *HLSLBufferDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+HLSLBufferDecl *HLSLBufferDecl::CreateDeserialized(ASTContext &C,
+                                                   GlobalDeclID ID) {
   return new (C, ID) HLSLBufferDecl(nullptr, false, SourceLocation(), nullptr,
                                     SourceLocation(), SourceLocation());
 }
@@ -5719,7 +5722,7 @@ ImportDecl *ImportDecl::CreateImplicit(ASTContext &C, DeclContext *DC,
   return Import;
 }
 
-ImportDecl *ImportDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID,
+ImportDecl *ImportDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                            unsigned NumLocations) {
   return new (C, ID, additionalSizeToAlloc<SourceLocation>(NumLocations))
       ImportDecl(EmptyShell());
@@ -5752,6 +5755,6 @@ ExportDecl *ExportDecl::Create(ASTContext &C, DeclContext *DC,
   return new (C, DC) ExportDecl(DC, ExportLoc);
 }
 
-ExportDecl *ExportDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+ExportDecl *ExportDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) ExportDecl(nullptr, SourceLocation());
 }
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index 7cb6b31c541f..03e1055251c2 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -71,7 +71,7 @@ void Decl::updateOutOfDate(IdentifierInfo &II) const {
 #include "clang/AST/DeclNodes.inc"
 
 void *Decl::operator new(std::size_t Size, const ASTContext &Context,
-                         Decl::DeclID ID, std::size_t Extra) {
+                         GlobalDeclID ID, std::size_t Extra) {
   // Allocate an extra 8 bytes worth of storage, which ensures that the
   // resulting pointer will still be 8-byte aligned.
   static_assert(sizeof(unsigned) * 2 >= alignof(Decl),
@@ -85,7 +85,7 @@ void *Decl::operator new(std::size_t Size, const ASTContext &Context,
   PrefixPtr[0] = 0;
 
   // Store the global declaration ID in the second 4 bytes.
-  PrefixPtr[1] = ID;
+  PrefixPtr[1] = ID.get();
 
   return Result;
 }
@@ -1115,7 +1115,9 @@ int64_t Decl::getID() const {
 
 const FunctionType *Decl::getFunctionType(bool BlocksToo) const {
   QualType Ty;
-  if (const auto *D = dyn_cast<ValueDecl>(this))
+  if (isa<BindingDecl>(this))
+    return nullptr;
+  else if (const auto *D = dyn_cast<ValueDecl>(this))
     Ty = D->getType();
   else if (const auto *D = dyn_cast<TypedefNameDecl>(this))
     Ty = D->getUnderlyingType();
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index 426c52620510..75c441293d62 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -57,7 +57,8 @@ using namespace clang;
 
 void AccessSpecDecl::anchor() {}
 
-AccessSpecDecl *AccessSpecDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+AccessSpecDecl *AccessSpecDecl::CreateDeserialized(ASTContext &C,
+                                                   GlobalDeclID ID) {
   return new (C, ID) AccessSpecDecl(EmptyShell());
 }
 
@@ -68,7 +69,7 @@ void LazyASTUnresolvedSet::getFromExternalSource(ASTContext &C) const {
 
   for (ASTUnresolvedSet::iterator I = Impl.begin(); I != Impl.end(); ++I)
     I.setDecl(cast<NamedDecl>(Source->GetExternalDecl(
-        reinterpret_cast<uintptr_t>(I.getDecl()) >> 2)));
+        GlobalDeclID(reinterpret_cast<uintptr_t>(I.getDecl()) >> 2))));
   Impl.Decls.setLazy(false);
 }
 
@@ -160,8 +161,8 @@ CXXRecordDecl::CreateLambda(const ASTContext &C, DeclContext *DC,
   return R;
 }
 
-CXXRecordDecl *
-CXXRecordDecl::CreateDeserialized(const ASTContext &C, Decl::DeclID ID) {
+CXXRecordDecl *CXXRecordDecl::CreateDeserialized(const ASTContext &C,
+                                                 GlobalDeclID ID) {
   auto *R = new (C, ID)
       CXXRecordDecl(CXXRecord, TagTypeKind::Struct, C, nullptr,
                     SourceLocation(), SourceLocation(), nullptr, nullptr);
@@ -2162,8 +2163,8 @@ CXXDeductionGuideDecl *CXXDeductionGuideDecl::Create(
                                            TInfo, EndLocation, Ctor, Kind);
 }
 
-CXXDeductionGuideDecl *CXXDeductionGuideDecl::CreateDeserialized(ASTContext &C,
-                                                                 Decl::DeclID ID) {
+CXXDeductionGuideDecl *
+CXXDeductionGuideDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) CXXDeductionGuideDecl(
       C, nullptr, SourceLocation(), ExplicitSpecifier(), DeclarationNameInfo(),
       QualType(), nullptr, SourceLocation(), nullptr,
@@ -2175,8 +2176,8 @@ RequiresExprBodyDecl *RequiresExprBodyDecl::Create(
   return new (C, DC) RequiresExprBodyDecl(C, DC, StartLoc);
 }
 
-RequiresExprBodyDecl *RequiresExprBodyDecl::CreateDeserialized(ASTContext &C,
-                                                               Decl::DeclID ID) {
+RequiresExprBodyDecl *
+RequiresExprBodyDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) RequiresExprBodyDecl(C, nullptr, SourceLocation());
 }
 
@@ -2281,7 +2282,8 @@ CXXMethodDecl::Create(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc,
       isInline, ConstexprKind, EndLocation, TrailingRequiresClause);
 }
 
-CXXMethodDecl *CXXMethodDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+CXXMethodDecl *CXXMethodDecl::CreateDeserialized(ASTContext &C,
+                                                 GlobalDeclID ID) {
   return new (C, ID) CXXMethodDecl(
       CXXMethod, C, nullptr, SourceLocation(), DeclarationNameInfo(),
       QualType(), nullptr, SC_None, false, false,
@@ -2699,7 +2701,7 @@ CXXConstructorDecl::CXXConstructorDecl(
 void CXXConstructorDecl::anchor() {}
 
 CXXConstructorDecl *CXXConstructorDecl::CreateDeserialized(ASTContext &C,
-                                                           Decl::DeclID ID,
+                                                           GlobalDeclID ID,
                                                            uint64_t AllocKind) {
   bool hasTrailingExplicit = static_cast<bool>(AllocKind & TAKHasTailExplicit);
   bool isInheritingConstructor =
@@ -2845,8 +2847,8 @@ bool CXXConstructorDecl::isSpecializationCopyingObject() const {
 
 void CXXDestructorDecl::anchor() {}
 
-CXXDestructorDecl *
-CXXDestructorDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+CXXDestructorDecl *CXXDestructorDecl::CreateDeserialized(ASTContext &C,
+                                                         GlobalDeclID ID) {
   return new (C, ID) CXXDestructorDecl(
       C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr,
       false, false, false, ConstexprSpecKind::Unspecified, nullptr);
@@ -2877,8 +2879,8 @@ void CXXDestructorDecl::setOperatorDelete(FunctionDecl *OD, Expr *ThisArg) {
 
 void CXXConversionDecl::anchor() {}
 
-CXXConversionDecl *
-CXXConversionDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+CXXConversionDecl *CXXConversionDecl::CreateDeserialized(ASTContext &C,
+                                                         GlobalDeclID ID) {
   return new (C, ID) CXXConversionDecl(
       C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr,
       false, false, ExplicitSpecifier(), ConstexprSpecKind::Unspecified,
@@ -2924,7 +2926,7 @@ LinkageSpecDecl *LinkageSpecDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 LinkageSpecDecl *LinkageSpecDecl::CreateDeserialized(ASTContext &C,
-                                                     Decl::DeclID ID) {
+                                                     GlobalDeclID ID) {
   return new (C, ID)
       LinkageSpecDecl(nullptr, SourceLocation(), SourceLocation(),
                       LinkageSpecLanguageIDs::C, false);
@@ -2946,7 +2948,7 @@ UsingDirectiveDecl *UsingDirectiveDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 UsingDirectiveDecl *UsingDirectiveDecl::CreateDeserialized(ASTContext &C,
-                                                           Decl::DeclID ID) {
+                                                           GlobalDeclID ID) {
   return new (C, ID) UsingDirectiveDecl(nullptr, SourceLocation(),
                                         SourceLocation(),
                                         NestedNameSpecifierLoc(),
@@ -2985,7 +2987,8 @@ NamespaceDecl *NamespaceDecl::Create(ASTContext &C, DeclContext *DC,
       NamespaceDecl(C, DC, Inline, StartLoc, IdLoc, Id, PrevDecl, Nested);
 }
 
-NamespaceDecl *NamespaceDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+NamespaceDecl *NamespaceDecl::CreateDeserialized(ASTContext &C,
+                                                 GlobalDeclID ID) {
   return new (C, ID) NamespaceDecl(C, nullptr, false, SourceLocation(),
                                    SourceLocation(), nullptr, nullptr, false);
 }
@@ -3046,8 +3049,8 @@ NamespaceAliasDecl *NamespaceAliasDecl::Create(ASTContext &C, DeclContext *DC,
                                         QualifierLoc, IdentLoc, Namespace);
 }
 
-NamespaceAliasDecl *
-NamespaceAliasDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+NamespaceAliasDecl *NamespaceAliasDecl::CreateDeserialized(ASTContext &C,
+                                                           GlobalDeclID ID) {
   return new (C, ID) NamespaceAliasDecl(C, nullptr, SourceLocation(),
                                         SourceLocation(), nullptr,
                                         NestedNameSpecifierLoc(),
@@ -3102,8 +3105,8 @@ UsingShadowDecl::UsingShadowDecl(Kind K, ASTContext &C, EmptyShell Empty)
     : NamedDecl(K, nullptr, SourceLocation(), DeclarationName()),
       redeclarable_base(C) {}
 
-UsingShadowDecl *
-UsingShadowDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+UsingShadowDecl *UsingShadowDecl::CreateDeserialized(ASTContext &C,
+                                                     GlobalDeclID ID) {
   return new (C, ID) UsingShadowDecl(UsingShadow, C, EmptyShell());
 }
 
@@ -3126,7 +3129,7 @@ ConstructorUsingShadowDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 ConstructorUsingShadowDecl *
-ConstructorUsingShadowDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+ConstructorUsingShadowDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) ConstructorUsingShadowDecl(C, EmptyShell());
 }
 
@@ -3174,7 +3177,7 @@ UsingDecl *UsingDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation UL,
   return new (C, DC) UsingDecl(DC, UL, QualifierLoc, NameInfo, HasTypename);
 }
 
-UsingDecl *UsingDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+UsingDecl *UsingDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) UsingDecl(nullptr, SourceLocation(),
                                NestedNameSpecifierLoc(), DeclarationNameInfo(),
                                false);
@@ -3198,7 +3201,8 @@ UsingEnumDecl *UsingEnumDecl::Create(ASTContext &C, DeclContext *DC,
       UsingEnumDecl(DC, EnumType->getType()->getAsTagDecl()->getDeclName(), UL, EL, NL, EnumType);
 }
 
-UsingEnumDecl *UsingEnumDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+UsingEnumDecl *UsingEnumDecl::CreateDeserialized(ASTContext &C,
+                                                 GlobalDeclID ID) {
   return new (C, ID)
       UsingEnumDecl(nullptr, DeclarationName(), SourceLocation(),
                     SourceLocation(), SourceLocation(), nullptr);
@@ -3217,7 +3221,7 @@ UsingPackDecl *UsingPackDecl::Create(ASTContext &C, DeclContext *DC,
   return new (C, DC, Extra) UsingPackDecl(DC, InstantiatedFrom, UsingDecls);
 }
 
-UsingPackDecl *UsingPackDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID,
+UsingPackDecl *UsingPackDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                                  unsigned NumExpansions) {
   size_t Extra = additionalSizeToAlloc<NamedDecl *>(NumExpansions);
   auto *Result =
@@ -3243,7 +3247,7 @@ UnresolvedUsingValueDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 UnresolvedUsingValueDecl *
-UnresolvedUsingValueDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+UnresolvedUsingValueDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) UnresolvedUsingValueDecl(nullptr, QualType(),
                                               SourceLocation(),
                                               NestedNameSpecifierLoc(),
@@ -3273,7 +3277,8 @@ UnresolvedUsingTypenameDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 UnresolvedUsingTypenameDecl *
-UnresolvedUsingTypenameDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+UnresolvedUsingTypenameDecl::CreateDeserialized(ASTContext &C,
+                                                GlobalDeclID ID) {
   return new (C, ID) UnresolvedUsingTypenameDecl(
       nullptr, SourceLocation(), SourceLocation(), NestedNameSpecifierLoc(),
       SourceLocation(), nullptr, SourceLocation());
@@ -3286,7 +3291,8 @@ UnresolvedUsingIfExistsDecl::Create(ASTContext &Ctx, DeclContext *DC,
 }
 
 UnresolvedUsingIfExistsDecl *
-UnresolvedUsingIfExistsDecl::CreateDeserialized(ASTContext &Ctx, Decl::DeclID ID) {
+UnresolvedUsingIfExistsDecl::CreateDeserialized(ASTContext &Ctx,
+                                                GlobalDeclID ID) {
   return new (Ctx, ID)
       UnresolvedUsingIfExistsDecl(nullptr, SourceLocation(), DeclarationName());
 }
@@ -3310,7 +3316,7 @@ StaticAssertDecl *StaticAssertDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 StaticAssertDecl *StaticAssertDecl::CreateDeserialized(ASTContext &C,
-                                                       Decl::DeclID ID) {
+                                                       GlobalDeclID ID) {
   return new (C, ID) StaticAssertDecl(nullptr, SourceLocation(), nullptr,
                                       nullptr, SourceLocation(), false);
 }
@@ -3332,7 +3338,7 @@ BindingDecl *BindingDecl::Create(ASTContext &C, DeclContext *DC,
   return new (C, DC) BindingDecl(DC, IdLoc, Id);
 }
 
-BindingDecl *BindingDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+BindingDecl *BindingDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) BindingDecl(nullptr, SourceLocation(), nullptr);
 }
 
@@ -3363,7 +3369,7 @@ DecompositionDecl *DecompositionDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 DecompositionDecl *DecompositionDecl::CreateDeserialized(ASTContext &C,
-                                                         Decl::DeclID ID,
+                                                         GlobalDeclID ID,
                                                          unsigned NumBindings) {
   size_t Extra = additionalSizeToAlloc<BindingDecl *>(NumBindings);
   auto *Result = new (C, ID, Extra)
@@ -3402,7 +3408,7 @@ MSPropertyDecl *MSPropertyDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 MSPropertyDecl *MSPropertyDecl::CreateDeserialized(ASTContext &C,
-                                                   Decl::DeclID ID) {
+                                                   GlobalDeclID ID) {
   return new (C, ID) MSPropertyDecl(nullptr, SourceLocation(),
                                     DeclarationName(), QualType(), nullptr,
                                     SourceLocation(), nullptr, nullptr);
@@ -3419,7 +3425,7 @@ MSGuidDecl *MSGuidDecl::Create(const ASTContext &C, QualType T, Parts P) {
   return new (C, DC) MSGuidDecl(DC, T, P);
 }
 
-MSGuidDecl *MSGuidDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+MSGuidDecl *MSGuidDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) MSGuidDecl(nullptr, QualType(), Parts());
 }
 
@@ -3529,7 +3535,7 @@ UnnamedGlobalConstantDecl::Create(const ASTContext &C, QualType T,
 }
 
 UnnamedGlobalConstantDecl *
-UnnamedGlobalConstantDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+UnnamedGlobalConstantDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID)
       UnnamedGlobalConstantDecl(C, nullptr, QualType(), APValue());
 }
diff --git a/clang/lib/AST/DeclFriend.cpp b/clang/lib/AST/DeclFriend.cpp
index 1fabf8aa80c2..04b9b93699f3 100644
--- a/clang/lib/AST/DeclFriend.cpp
+++ b/clang/lib/AST/DeclFriend.cpp
@@ -62,7 +62,7 @@ FriendDecl *FriendDecl::Create(ASTContext &C, DeclContext *DC,
   return FD;
 }
 
-FriendDecl *FriendDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID,
+FriendDecl *FriendDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                            unsigned FriendTypeNumTPLists) {
   std::size_t Extra =
       additionalSizeToAlloc<TemplateParameterList *>(FriendTypeNumTPLists);
diff --git a/clang/lib/AST/DeclObjC.cpp b/clang/lib/AST/DeclObjC.cpp
index d4275eea0582..83062b0e6887 100644
--- a/clang/lib/AST/DeclObjC.cpp
+++ b/clang/lib/AST/DeclObjC.cpp
@@ -862,7 +862,8 @@ ObjCMethodDecl *ObjCMethodDecl::Create(
       isImplicitlyDeclared, isDefined, impControl, HasRelatedResultType);
 }
 
-ObjCMethodDecl *ObjCMethodDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+ObjCMethodDecl *ObjCMethodDecl::CreateDeserialized(ASTContext &C,
+                                                   GlobalDeclID ID) {
   return new (C, ID) ObjCMethodDecl(SourceLocation(), SourceLocation(),
                                     Selector(), QualType(), nullptr, nullptr);
 }
@@ -1486,7 +1487,7 @@ ObjCTypeParamDecl *ObjCTypeParamDecl::Create(ASTContext &ctx, DeclContext *dc,
 }
 
 ObjCTypeParamDecl *ObjCTypeParamDecl::CreateDeserialized(ASTContext &ctx,
-                                                         Decl::DeclID ID) {
+                                                         GlobalDeclID ID) {
   return new (ctx, ID) ObjCTypeParamDecl(ctx, nullptr,
                                          ObjCTypeParamVariance::Invariant,
                                          SourceLocation(), 0, SourceLocation(),
@@ -1551,7 +1552,7 @@ ObjCInterfaceDecl *ObjCInterfaceDecl::Create(
 }
 
 ObjCInterfaceDecl *ObjCInterfaceDecl::CreateDeserialized(const ASTContext &C,
-                                                         Decl::DeclID ID) {
+                                                         GlobalDeclID ID) {
   auto *Result = new (C, ID)
       ObjCInterfaceDecl(C, nullptr, SourceLocation(), nullptr, nullptr,
                         SourceLocation(), nullptr, false);
@@ -1865,7 +1866,7 @@ ObjCIvarDecl *ObjCIvarDecl::Create(ASTContext &C, ObjCContainerDecl *DC,
                                   synthesized);
 }
 
-ObjCIvarDecl *ObjCIvarDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+ObjCIvarDecl *ObjCIvarDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) ObjCIvarDecl(nullptr, SourceLocation(), SourceLocation(),
                                   nullptr, QualType(), nullptr,
                                   ObjCIvarDecl::None, nullptr, false);
@@ -1914,7 +1915,7 @@ ObjCAtDefsFieldDecl
 }
 
 ObjCAtDefsFieldDecl *ObjCAtDefsFieldDecl::CreateDeserialized(ASTContext &C,
-                                                             Decl::DeclID ID) {
+                                                             GlobalDeclID ID) {
   return new (C, ID) ObjCAtDefsFieldDecl(nullptr, SourceLocation(),
                                          SourceLocation(), nullptr, QualType(),
                                          nullptr);
@@ -1949,7 +1950,7 @@ ObjCProtocolDecl *ObjCProtocolDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 ObjCProtocolDecl *ObjCProtocolDecl::CreateDeserialized(ASTContext &C,
-                                                       Decl::DeclID ID) {
+                                                       GlobalDeclID ID) {
   ObjCProtocolDecl *Result =
       new (C, ID) ObjCProtocolDecl(C, nullptr, nullptr, SourceLocation(),
                                    SourceLocation(), nullptr);
@@ -2148,7 +2149,7 @@ ObjCCategoryDecl *ObjCCategoryDecl::Create(
 }
 
 ObjCCategoryDecl *ObjCCategoryDecl::CreateDeserialized(ASTContext &C,
-                                                       Decl::DeclID ID) {
+                                                       GlobalDeclID ID) {
   return new (C, ID) ObjCCategoryDecl(nullptr, SourceLocation(),
                                       SourceLocation(), SourceLocation(),
                                       nullptr, nullptr, nullptr);
@@ -2188,8 +2189,8 @@ ObjCCategoryImplDecl *ObjCCategoryImplDecl::Create(
                                           atStartLoc, CategoryNameLoc);
 }
 
-ObjCCategoryImplDecl *ObjCCategoryImplDecl::CreateDeserialized(ASTContext &C,
-                                                               Decl::DeclID ID) {
+ObjCCategoryImplDecl *
+ObjCCategoryImplDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) ObjCCategoryImplDecl(nullptr, nullptr, nullptr,
                                           SourceLocation(), SourceLocation(),
                                           SourceLocation());
@@ -2296,7 +2297,7 @@ ObjCImplementationDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 ObjCImplementationDecl *
-ObjCImplementationDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+ObjCImplementationDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) ObjCImplementationDecl(nullptr, nullptr, nullptr,
                                             SourceLocation(), SourceLocation());
 }
@@ -2339,7 +2340,7 @@ ObjCCompatibleAliasDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 ObjCCompatibleAliasDecl *
-ObjCCompatibleAliasDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+ObjCCompatibleAliasDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) ObjCCompatibleAliasDecl(nullptr, SourceLocation(),
                                              nullptr, nullptr);
 }
@@ -2360,7 +2361,7 @@ ObjCPropertyDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L,
 }
 
 ObjCPropertyDecl *ObjCPropertyDecl::CreateDeserialized(ASTContext &C,
-                                                       Decl::DeclID ID) {
+                                                       GlobalDeclID ID) {
   return new (C, ID) ObjCPropertyDecl(nullptr, SourceLocation(), nullptr,
                                       SourceLocation(), SourceLocation(),
                                       QualType(), nullptr, None);
@@ -2392,8 +2393,8 @@ ObjCPropertyImplDecl *ObjCPropertyImplDecl::Create(ASTContext &C,
                                           ivarLoc);
 }
 
-ObjCPropertyImplDecl *ObjCPropertyImplDecl::CreateDeserialized(ASTContext &C,
-                                                               Decl::DeclID ID) {
+ObjCPropertyImplDecl *
+ObjCPropertyImplDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) ObjCPropertyImplDecl(nullptr, SourceLocation(),
                                           SourceLocation(), nullptr, Dynamic,
                                           nullptr, SourceLocation());
diff --git a/clang/lib/AST/DeclOpenMP.cpp b/clang/lib/AST/DeclOpenMP.cpp
index b178a15aab5f..81ca48e60942 100644
--- a/clang/lib/AST/DeclOpenMP.cpp
+++ b/clang/lib/AST/DeclOpenMP.cpp
@@ -36,7 +36,7 @@ OMPThreadPrivateDecl *OMPThreadPrivateDecl::Create(ASTContext &C,
 }
 
 OMPThreadPrivateDecl *OMPThreadPrivateDecl::CreateDeserialized(ASTContext &C,
-                                                               Decl::DeclID ID,
+                                                               GlobalDeclID ID,
                                                                unsigned N) {
   return OMPDeclarativeDirective::createEmptyDirective<OMPThreadPrivateDecl>(
       C, ID, 0, N);
@@ -63,7 +63,8 @@ OMPAllocateDecl *OMPAllocateDecl::Create(ASTContext &C, DeclContext *DC,
   return D;
 }
 
-OMPAllocateDecl *OMPAllocateDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID,
+OMPAllocateDecl *OMPAllocateDecl::CreateDeserialized(ASTContext &C,
+                                                     GlobalDeclID ID,
                                                      unsigned NVars,
                                                      unsigned NClauses) {
   return OMPDeclarativeDirective::createEmptyDirective<OMPAllocateDecl>(
@@ -89,7 +90,8 @@ OMPRequiresDecl *OMPRequiresDecl::Create(ASTContext &C, DeclContext *DC,
                                                                    L);
 }
 
-OMPRequiresDecl *OMPRequiresDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID,
+OMPRequiresDecl *OMPRequiresDecl::CreateDeserialized(ASTContext &C,
+                                                     GlobalDeclID ID,
                                                      unsigned N) {
   return OMPDeclarativeDirective::createEmptyDirective<OMPRequiresDecl>(
       C, ID, N, 0, SourceLocation());
@@ -117,7 +119,7 @@ OMPDeclareReductionDecl *OMPDeclareReductionDecl::Create(
 }
 
 OMPDeclareReductionDecl *
-OMPDeclareReductionDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+OMPDeclareReductionDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) OMPDeclareReductionDecl(
       OMPDeclareReduction, /*DC=*/nullptr, SourceLocation(), DeclarationName(),
       QualType(), /*PrevDeclInScope=*/nullptr);
@@ -148,7 +150,7 @@ OMPDeclareMapperDecl *OMPDeclareMapperDecl::Create(
 }
 
 OMPDeclareMapperDecl *OMPDeclareMapperDecl::CreateDeserialized(ASTContext &C,
-                                                               Decl::DeclID ID,
+                                                               GlobalDeclID ID,
                                                                unsigned N) {
   return OMPDeclarativeDirective::createEmptyDirective<OMPDeclareMapperDecl>(
       C, ID, N, 1, SourceLocation(), DeclarationName(), QualType(),
@@ -179,7 +181,7 @@ OMPCapturedExprDecl *OMPCapturedExprDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 OMPCapturedExprDecl *OMPCapturedExprDecl::CreateDeserialized(ASTContext &C,
-                                                             Decl::DeclID ID) {
+                                                             GlobalDeclID ID) {
   return new (C, ID) OMPCapturedExprDecl(C, nullptr, nullptr, QualType(),
                                          /*TInfo=*/nullptr, SourceLocation());
 }
diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp
index 67bb9e41e3e6..d27a30e0c5fc 100644
--- a/clang/lib/AST/DeclTemplate.cpp
+++ b/clang/lib/AST/DeclTemplate.cpp
@@ -337,9 +337,10 @@ void RedeclarableTemplateDecl::loadLazySpecializationsImpl() const {
   CommonBase *CommonBasePtr = getMostRecentDecl()->getCommonPtr();
   if (CommonBasePtr->LazySpecializations) {
     ASTContext &Context = getASTContext();
-    Decl::DeclID *Specs = CommonBasePtr->LazySpecializations;
+    GlobalDeclID *Specs = CommonBasePtr->LazySpecializations;
     CommonBasePtr->LazySpecializations = nullptr;
-    for (uint32_t I = 0, N = *Specs++; I != N; ++I)
+    unsigned SpecSize = (*Specs++).get();
+    for (unsigned I = 0; I != SpecSize; ++I)
       (void)Context.getExternalSource()->GetExternalDecl(Specs[I]);
   }
 }
@@ -417,8 +418,8 @@ FunctionTemplateDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L,
   return TD;
 }
 
-FunctionTemplateDecl *FunctionTemplateDecl::CreateDeserialized(ASTContext &C,
-                                                               Decl::DeclID ID) {
+FunctionTemplateDecl *
+FunctionTemplateDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) FunctionTemplateDecl(C, nullptr, SourceLocation(),
                                           DeclarationName(), nullptr, nullptr);
 }
@@ -503,7 +504,7 @@ ClassTemplateDecl *ClassTemplateDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 ClassTemplateDecl *ClassTemplateDecl::CreateDeserialized(ASTContext &C,
-                                                         Decl::DeclID ID) {
+                                                         GlobalDeclID ID) {
   return new (C, ID) ClassTemplateDecl(C, nullptr, SourceLocation(),
                                        DeclarationName(), nullptr, nullptr);
 }
@@ -652,14 +653,14 @@ TemplateTypeParmDecl *TemplateTypeParmDecl::Create(
 }
 
 TemplateTypeParmDecl *
-TemplateTypeParmDecl::CreateDeserialized(const ASTContext &C, Decl::DeclID ID) {
+TemplateTypeParmDecl::CreateDeserialized(const ASTContext &C, GlobalDeclID ID) {
   return new (C, ID)
       TemplateTypeParmDecl(nullptr, SourceLocation(), SourceLocation(), nullptr,
                            false, false, std::nullopt);
 }
 
 TemplateTypeParmDecl *
-TemplateTypeParmDecl::CreateDeserialized(const ASTContext &C, Decl::DeclID ID,
+TemplateTypeParmDecl::CreateDeserialized(const ASTContext &C, GlobalDeclID ID,
                                          bool HasTypeConstraint) {
   return new (C, ID,
               additionalSizeToAlloc<TypeConstraint>(HasTypeConstraint ? 1 : 0))
@@ -759,7 +760,7 @@ NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create(
 }
 
 NonTypeTemplateParmDecl *
-NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID,
+NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                             bool HasTypeConstraint) {
   return new (C, ID, additionalSizeToAlloc<std::pair<QualType,
                                                      TypeSourceInfo *>,
@@ -770,7 +771,7 @@ NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID,
 }
 
 NonTypeTemplateParmDecl *
-NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID,
+NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                             unsigned NumExpandedTypes,
                                             bool HasTypeConstraint) {
   auto *NTTP =
@@ -836,13 +837,13 @@ TemplateTemplateParmDecl::Create(const ASTContext &C, DeclContext *DC,
 }
 
 TemplateTemplateParmDecl *
-TemplateTemplateParmDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+TemplateTemplateParmDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) TemplateTemplateParmDecl(nullptr, SourceLocation(), 0, 0,
                                               false, nullptr, false, nullptr);
 }
 
 TemplateTemplateParmDecl *
-TemplateTemplateParmDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID,
+TemplateTemplateParmDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID,
                                              unsigned NumExpansions) {
   auto *TTP =
       new (C, ID, additionalSizeToAlloc<TemplateParameterList *>(NumExpansions))
@@ -949,7 +950,7 @@ ClassTemplateSpecializationDecl::Create(ASTContext &Context, TagKind TK,
 
 ClassTemplateSpecializationDecl *
 ClassTemplateSpecializationDecl::CreateDeserialized(ASTContext &C,
-                                                    Decl::DeclID ID) {
+                                                    GlobalDeclID ID) {
   auto *Result =
     new (C, ID) ClassTemplateSpecializationDecl(C, ClassTemplateSpecialization);
   Result->setMayHaveOutOfDateDef(false);
@@ -1035,8 +1036,7 @@ ConceptDecl *ConceptDecl::Create(ASTContext &C, DeclContext *DC,
   return TD;
 }
 
-ConceptDecl *ConceptDecl::CreateDeserialized(ASTContext &C,
-                                             Decl::DeclID ID) {
+ConceptDecl *ConceptDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   ConceptDecl *Result = new (C, ID) ConceptDecl(nullptr, SourceLocation(),
                                                 DeclarationName(),
                                                 nullptr, nullptr);
@@ -1070,7 +1070,7 @@ ImplicitConceptSpecializationDecl *ImplicitConceptSpecializationDecl::Create(
 
 ImplicitConceptSpecializationDecl *
 ImplicitConceptSpecializationDecl::CreateDeserialized(
-    const ASTContext &C, Decl::DeclID ID, unsigned NumTemplateArgs) {
+    const ASTContext &C, GlobalDeclID ID, unsigned NumTemplateArgs) {
   return new (C, ID, additionalSizeToAlloc<TemplateArgument>(NumTemplateArgs))
       ImplicitConceptSpecializationDecl(EmptyShell{}, NumTemplateArgs);
 }
@@ -1133,7 +1133,7 @@ Create(ASTContext &Context, TagKind TK,DeclContext *DC,
 
 ClassTemplatePartialSpecializationDecl *
 ClassTemplatePartialSpecializationDecl::CreateDeserialized(ASTContext &C,
-                                                           Decl::DeclID ID) {
+                                                           GlobalDeclID ID) {
   auto *Result = new (C, ID) ClassTemplatePartialSpecializationDecl(C);
   Result->setMayHaveOutOfDateDef(false);
   return Result;
@@ -1160,7 +1160,7 @@ FriendTemplateDecl::Create(ASTContext &Context, DeclContext *DC,
 }
 
 FriendTemplateDecl *FriendTemplateDecl::CreateDeserialized(ASTContext &C,
-                                                           Decl::DeclID ID) {
+                                                           GlobalDeclID ID) {
   return new (C, ID) FriendTemplateDecl(EmptyShell());
 }
 
@@ -1179,8 +1179,8 @@ TypeAliasTemplateDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L,
   return TD;
 }
 
-TypeAliasTemplateDecl *TypeAliasTemplateDecl::CreateDeserialized(ASTContext &C,
-                                                                 Decl::DeclID ID) {
+TypeAliasTemplateDecl *
+TypeAliasTemplateDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   return new (C, ID) TypeAliasTemplateDecl(C, nullptr, SourceLocation(),
                                            DeclarationName(), nullptr, nullptr);
 }
@@ -1218,7 +1218,7 @@ VarTemplateDecl *VarTemplateDecl::Create(ASTContext &C, DeclContext *DC,
 }
 
 VarTemplateDecl *VarTemplateDecl::CreateDeserialized(ASTContext &C,
-                                                     Decl::DeclID ID) {
+                                                     GlobalDeclID ID) {
   return new (C, ID) VarTemplateDecl(C, nullptr, SourceLocation(),
                                      DeclarationName(), nullptr, nullptr);
 }
@@ -1340,7 +1340,8 @@ VarTemplateSpecializationDecl *VarTemplateSpecializationDecl::Create(
 }
 
 VarTemplateSpecializationDecl *
-VarTemplateSpecializationDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+VarTemplateSpecializationDecl::CreateDeserialized(ASTContext &C,
+                                                  GlobalDeclID ID) {
   return new (C, ID)
       VarTemplateSpecializationDecl(VarTemplateSpecialization, C);
 }
@@ -1432,7 +1433,7 @@ VarTemplatePartialSpecializationDecl::Create(
 
 VarTemplatePartialSpecializationDecl *
 VarTemplatePartialSpecializationDecl::CreateDeserialized(ASTContext &C,
-                                                         Decl::DeclID ID) {
+                                                         GlobalDeclID ID) {
   return new (C, ID) VarTemplatePartialSpecializationDecl(C);
 }
 
@@ -1546,7 +1547,7 @@ TemplateParamObjectDecl *TemplateParamObjectDecl::Create(const ASTContext &C,
 }
 
 TemplateParamObjectDecl *
-TemplateParamObjectDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
+TemplateParamObjectDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) {
   auto *TPOD = new (C, ID) TemplateParamObjectDecl(nullptr, QualType(), APValue());
   C.addDestruction(&TPOD->Value);
   return TPOD;
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 9eec7edc9d1a..ac0b1b38f016 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -103,7 +103,7 @@ const Expr *Expr::skipRValueSubobjectAdjustments(
       }
     } else if (const auto *ME = dyn_cast<MemberExpr>(E)) {
       if (!ME->isArrow()) {
-        assert(ME->getBase()->getType()->isRecordType());
+        assert(ME->getBase()->getType()->getAsRecordDecl());
         if (const auto *Field = dyn_cast<FieldDecl>(ME->getMemberDecl())) {
           if (!Field->isBitField() && !Field->getType()->isReferenceType()) {
             E = ME->getBase();
@@ -3680,7 +3680,7 @@ bool Expr::HasSideEffects(const ASTContext &Ctx,
   case ParenExprClass:
   case ArraySubscriptExprClass:
   case MatrixSubscriptExprClass:
-  case OMPArraySectionExprClass:
+  case ArraySectionExprClass:
   case OMPArrayShapingExprClass:
   case OMPIteratorExprClass:
   case MemberExprClass:
@@ -3893,9 +3893,14 @@ namespace {
     }
 
     void VisitCXXBindTemporaryExpr(const CXXBindTemporaryExpr *E) {
-      if (E->getTemporary()->getDestructor()->isTrivial()) {
-        Inherited::VisitStmt(E);
-        return;
+      // Destructor of the temporary might be null if destructor declaration
+      // is not valid.
+      if (const CXXDestructorDecl *DtorDecl =
+              E->getTemporary()->getDestructor()) {
+        if (DtorDecl->isTrivial()) {
+          Inherited::VisitStmt(E);
+          return;
+        }
       }
 
       NonTrivial = true;
@@ -5060,9 +5065,9 @@ QualType AtomicExpr::getValueType() const {
   return T;
 }
 
-QualType OMPArraySectionExpr::getBaseOriginalType(const Expr *Base) {
+QualType ArraySectionExpr::getBaseOriginalType(const Expr *Base) {
   unsigned ArraySectionCount = 0;
-  while (auto *OASE = dyn_cast<OMPArraySectionExpr>(Base->IgnoreParens())) {
+  while (auto *OASE = dyn_cast<ArraySectionExpr>(Base->IgnoreParens())) {
     Base = OASE->getBase();
     ++ArraySectionCount;
   }
diff --git a/clang/lib/AST/ExprClassification.cpp b/clang/lib/AST/ExprClassification.cpp
index 7026fca8554c..390000e3ed38 100644
--- a/clang/lib/AST/ExprClassification.cpp
+++ b/clang/lib/AST/ExprClassification.cpp
@@ -145,7 +145,7 @@ static Cl::Kinds ClassifyInternal(ASTContext &Ctx, const Expr *E) {
   case Expr::FunctionParmPackExprClass:
   case Expr::MSPropertyRefExprClass:
   case Expr::MSPropertySubscriptExprClass:
-  case Expr::OMPArraySectionExprClass:
+  case Expr::ArraySectionExprClass:
   case Expr::OMPArrayShapingExprClass:
   case Expr::OMPIteratorExprClass:
     return Cl::CL_LValue;
@@ -216,8 +216,13 @@ static Cl::Kinds ClassifyInternal(ASTContext &Ctx, const Expr *E) {
     return ClassifyInternal(Ctx,
                  cast<SubstNonTypeTemplateParmExpr>(E)->getReplacement());
 
-  case Expr::PackIndexingExprClass:
+  case Expr::PackIndexingExprClass: {
+    // A pack-index-expression always expands to an id-expression.
+    // Consider it as an LValue expression.
+    if (cast<PackIndexingExpr>(E)->isInstantiationDependent())
+      return Cl::CL_LValue;
     return ClassifyInternal(Ctx, cast<PackIndexingExpr>(E)->getSelectedExpr());
+  }
 
     // C, C++98 [expr.sub]p1: The result is an lvalue of type "T".
     // C++11 (DR1213): in the case of an array operand, the result is an lvalue
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index de3c2a63913e..f1aa19e4409e 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -2706,7 +2706,11 @@ static bool checkFloatingPointResult(EvalInfo &Info, const Expr *E,
 static bool HandleFloatToFloatCast(EvalInfo &Info, const Expr *E,
                                    QualType SrcType, QualType DestType,
                                    APFloat &Result) {
-  assert(isa<CastExpr>(E) || isa<CompoundAssignOperator>(E));
+  assert((isa<CastExpr>(E) || isa<CompoundAssignOperator>(E) ||
+          isa<ConvertVectorExpr>(E)) &&
+         "HandleFloatToFloatCast has been checked with only CastExpr, "
+         "CompoundAssignOperator and ConvertVectorExpr. Please either validate "
+         "the new expression or address the root cause of this usage.");
   llvm::RoundingMode RM = getActiveRoundingMode(Info, E);
   APFloat::opStatus St;
   APFloat Value = Result;
@@ -9237,9 +9241,10 @@ bool PointerExprEvaluator::VisitCastExpr(const CastExpr *E) {
       bool HasValidResult = !Result.InvalidBase && !Result.Designator.Invalid &&
                             !Result.IsNullPtr;
       bool VoidPtrCastMaybeOK =
-          HasValidResult &&
-          Info.Ctx.hasSameUnqualifiedType(Result.Designator.getType(Info.Ctx),
-                                          E->getType()->getPointeeType());
+          Result.IsNullPtr ||
+          (HasValidResult &&
+           Info.Ctx.hasSimilarType(Result.Designator.getType(Info.Ctx),
+                                   E->getType()->getPointeeType()));
       // 1. We'll allow it in std::allocator::allocate, and anything which that
       //    calls.
       // 2. HACK 2022-03-28: Work around an issue with libstdc++'s
@@ -10709,8 +10714,11 @@ namespace {
     bool VisitUnaryImag(const UnaryOperator *E);
     bool VisitBinaryOperator(const BinaryOperator *E);
     bool VisitUnaryOperator(const UnaryOperator *E);
+    bool VisitConvertVectorExpr(const ConvertVectorExpr *E);
+    bool VisitShuffleVectorExpr(const ShuffleVectorExpr *E);
+
     // FIXME: Missing: conditional operator (for GNU
-    //                 conditional select), shufflevector, ExtVectorElementExpr
+    //                 conditional select), ExtVectorElementExpr
   };
 } // end anonymous namespace
 
@@ -10961,6 +10969,122 @@ bool VectorExprEvaluator::VisitUnaryOperator(const UnaryOperator *E) {
   return Success(APValue(ResultElements.data(), ResultElements.size()), E);
 }
 
+static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO,
+                                    const Expr *E, QualType SourceTy,
+                                    QualType DestTy, APValue const &Original,
+                                    APValue &Result) {
+  if (SourceTy->isIntegerType()) {
+    if (DestTy->isRealFloatingType()) {
+      Result = APValue(APFloat(0.0));
+      return HandleIntToFloatCast(Info, E, FPO, SourceTy, Original.getInt(),
+                                  DestTy, Result.getFloat());
+    }
+    if (DestTy->isIntegerType()) {
+      Result = APValue(
+          HandleIntToIntCast(Info, E, DestTy, SourceTy, Original.getInt()));
+      return true;
+    }
+  } else if (SourceTy->isRealFloatingType()) {
+    if (DestTy->isRealFloatingType()) {
+      Result = Original;
+      return HandleFloatToFloatCast(Info, E, SourceTy, DestTy,
+                                    Result.getFloat());
+    }
+    if (DestTy->isIntegerType()) {
+      Result = APValue(APSInt());
+      return HandleFloatToIntCast(Info, E, SourceTy, Original.getFloat(),
+                                  DestTy, Result.getInt());
+    }
+  }
+
+  Info.FFDiag(E, diag::err_convertvector_constexpr_unsupported_vector_cast)
+      << SourceTy << DestTy;
+  return false;
+}
+
+bool VectorExprEvaluator::VisitConvertVectorExpr(const ConvertVectorExpr *E) {
+  APValue Source;
+  QualType SourceVecType = E->getSrcExpr()->getType();
+  if (!EvaluateAsRValue(Info, E->getSrcExpr(), Source))
+    return false;
+
+  QualType DestTy = E->getType()->castAs<VectorType>()->getElementType();
+  QualType SourceTy = SourceVecType->castAs<VectorType>()->getElementType();
+
+  const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
+
+  auto SourceLen = Source.getVectorLength();
+  SmallVector<APValue, 4> ResultElements;
+  ResultElements.reserve(SourceLen);
+  for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) {
+    APValue Elt;
+    if (!handleVectorElementCast(Info, FPO, E, SourceTy, DestTy,
+                                 Source.getVectorElt(EltNum), Elt))
+      return false;
+    ResultElements.push_back(std::move(Elt));
+  }
+
+  return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+}
+
+static bool handleVectorShuffle(EvalInfo &Info, const ShuffleVectorExpr *E,
+                                QualType ElemType, APValue const &VecVal1,
+                                APValue const &VecVal2, unsigned EltNum,
+                                APValue &Result) {
+  unsigned const TotalElementsInInputVector1 = VecVal1.getVectorLength();
+  unsigned const TotalElementsInInputVector2 = VecVal2.getVectorLength();
+
+  APSInt IndexVal = E->getShuffleMaskIdx(Info.Ctx, EltNum);
+  int64_t index = IndexVal.getExtValue();
+  // The spec says that -1 should be treated as undef for optimizations,
+  // but in constexpr we'd have to produce an APValue::Indeterminate,
+  // which is prohibited from being a top-level constant value. Emit a
+  // diagnostic instead.
+  if (index == -1) {
+    Info.FFDiag(
+        E, diag::err_shufflevector_minus_one_is_undefined_behavior_constexpr)
+        << EltNum;
+    return false;
+  }
+
+  if (index < 0 ||
+      index >= TotalElementsInInputVector1 + TotalElementsInInputVector2)
+    llvm_unreachable("Out of bounds shuffle index");
+
+  if (index >= TotalElementsInInputVector1)
+    Result = VecVal2.getVectorElt(index - TotalElementsInInputVector1);
+  else
+    Result = VecVal1.getVectorElt(index);
+  return true;
+}
+
+bool VectorExprEvaluator::VisitShuffleVectorExpr(const ShuffleVectorExpr *E) {
+  APValue VecVal1;
+  const Expr *Vec1 = E->getExpr(0);
+  if (!EvaluateAsRValue(Info, Vec1, VecVal1))
+    return false;
+  APValue VecVal2;
+  const Expr *Vec2 = E->getExpr(1);
+  if (!EvaluateAsRValue(Info, Vec2, VecVal2))
+    return false;
+
+  VectorType const *DestVecTy = E->getType()->castAs<VectorType>();
+  QualType DestElTy = DestVecTy->getElementType();
+
+  auto TotalElementsInOutputVector = DestVecTy->getNumElements();
+
+  SmallVector<APValue, 4> ResultElements;
+  ResultElements.reserve(TotalElementsInOutputVector);
+  for (unsigned EltNum = 0; EltNum < TotalElementsInOutputVector; ++EltNum) {
+    APValue Elt;
+    if (!handleVectorShuffle(Info, E, DestElTy, VecVal1, VecVal2, EltNum, Elt))
+      return false;
+    ResultElements.push_back(std::move(Elt));
+  }
+
+  return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+}
+
 //===----------------------------------------------------------------------===//
 // Array Evaluation
 //===----------------------------------------------------------------------===//
@@ -16130,7 +16254,7 @@ static ICEDiag CheckICE(const Expr* E, const ASTContext &Ctx) {
   case Expr::StringLiteralClass:
   case Expr::ArraySubscriptExprClass:
   case Expr::MatrixSubscriptExprClass:
-  case Expr::OMPArraySectionExprClass:
+  case Expr::ArraySectionExprClass:
   case Expr::OMPArrayShapingExprClass:
   case Expr::OMPIteratorExprClass:
   case Expr::MemberExprClass:
diff --git a/clang/lib/AST/ExternalASTSource.cpp b/clang/lib/AST/ExternalASTSource.cpp
index 2e54d9f9af1c..e96a47496851 100644
--- a/clang/lib/AST/ExternalASTSource.cpp
+++ b/clang/lib/AST/ExternalASTSource.cpp
@@ -68,7 +68,7 @@ bool ExternalASTSource::layoutRecordType(
   return false;
 }
 
-Decl *ExternalASTSource::GetExternalDecl(Decl::DeclID ID) { return nullptr; }
+Decl *ExternalASTSource::GetExternalDecl(GlobalDeclID ID) { return nullptr; }
 
 Selector ExternalASTSource::GetExternalSelector(uint32_t ID) {
   return Selector();
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 8cd0c198d9a8..f1a51e81a92c 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -110,18 +110,37 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
     if (!this->visit(SubExpr))
       return false;
 
-    unsigned DerivedOffset = collectBaseOffset(getRecordTy(CE->getType()),
-                                               getRecordTy(SubExpr->getType()));
+    const auto extractRecordDecl = [](QualType Ty) -> const CXXRecordDecl * {
+      if (const auto *PT = dyn_cast<PointerType>(Ty))
+        return PT->getPointeeType()->getAsCXXRecordDecl();
+      return Ty->getAsCXXRecordDecl();
+    };
 
-    return this->emitGetPtrBasePop(DerivedOffset, CE);
+    // FIXME: We can express a series of non-virtual casts as a single
+    // GetPtrBasePop op.
+    QualType CurType = SubExpr->getType();
+    for (const CXXBaseSpecifier *B : CE->path()) {
+      if (B->isVirtual()) {
+        if (!this->emitGetPtrVirtBasePop(extractRecordDecl(B->getType()), CE))
+          return false;
+        CurType = B->getType();
+      } else {
+        unsigned DerivedOffset = collectBaseOffset(B->getType(), CurType);
+        if (!this->emitGetPtrBasePop(DerivedOffset, CE))
+          return false;
+        CurType = B->getType();
+      }
+    }
+
+    return true;
   }
 
   case CK_BaseToDerived: {
     if (!this->visit(SubExpr))
       return false;
 
-    unsigned DerivedOffset = collectBaseOffset(getRecordTy(SubExpr->getType()),
-                                               getRecordTy(CE->getType()));
+    unsigned DerivedOffset =
+        collectBaseOffset(SubExpr->getType(), CE->getType());
 
     return this->emitGetPtrDerivedPop(DerivedOffset, CE);
   }
@@ -193,6 +212,13 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
     if (!this->visit(SubExpr))
       return false;
 
+    // If SubExpr doesn't result in a pointer, make it one.
+    if (PrimType FromT = classifyPrim(SubExpr->getType()); FromT != PT_Ptr) {
+      assert(isPtrType(FromT));
+      if (!this->emitDecayPtr(FromT, PT_Ptr, CE))
+        return false;
+    }
+
     PrimType T = classifyPrim(CE->getType());
     if (T == PT_IntAP)
       return this->emitCastPointerIntegralAP(Ctx.getBitWidth(CE->getType()),
@@ -905,8 +931,31 @@ bool ByteCodeExprGen<Emitter>::VisitImplicitValueInitExpr(const ImplicitValueIni
   if (std::optional<PrimType> T = classify(QT))
     return this->visitZeroInitializer(*T, QT, E);
 
-  if (QT->isRecordType())
-    return false;
+  if (QT->isRecordType()) {
+    const RecordDecl *RD = QT->getAsRecordDecl();
+    assert(RD);
+    if (RD->isInvalidDecl())
+      return false;
+    if (RD->isUnion()) {
+      // C++11 [dcl.init]p5: If T is a (possibly cv-qualified) union type, the
+      // object's first non-static named data member is zero-initialized
+      // FIXME
+      return false;
+    }
+
+    if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD);
+        CXXRD && CXXRD->getNumVBases() > 0) {
+      // TODO: Diagnose.
+      return false;
+    }
+
+    const Record *R = getRecord(QT);
+    if (!R)
+      return false;
+
+    assert(Initializing);
+    return this->visitZeroRecordInitializer(R, E);
+  }
 
   if (QT->isIncompleteArrayType())
     return true;
@@ -981,122 +1030,98 @@ bool ByteCodeExprGen<Emitter>::VisitArraySubscriptExpr(
 
 template <class Emitter>
 bool ByteCodeExprGen<Emitter>::visitInitList(ArrayRef<const Expr *> Inits,
+                                             const Expr *ArrayFiller,
                                              const Expr *E) {
-  assert(E->getType()->isRecordType());
-  const Record *R = getRecord(E->getType());
+  if (E->getType()->isVoidType())
+    return this->emitInvalid(E);
 
-  if (Inits.size() == 1 && E->getType() == Inits[0]->getType()) {
-    return this->visitInitializer(Inits[0]);
+  // Handle discarding first.
+  if (DiscardResult) {
+    for (const Expr *Init : Inits) {
+      if (!this->discard(Init))
+        return false;
+    }
+    return true;
   }
 
-  unsigned InitIndex = 0;
-  for (const Expr *Init : Inits) {
-    // Skip unnamed bitfields.
-    while (InitIndex < R->getNumFields() &&
-           R->getField(InitIndex)->Decl->isUnnamedBitField())
-      ++InitIndex;
+  // Primitive values.
+  if (std::optional<PrimType> T = classify(E->getType())) {
+    assert(!DiscardResult);
+    if (Inits.size() == 0)
+      return this->visitZeroInitializer(*T, E->getType(), E);
+    assert(Inits.size() == 1);
+    return this->delegate(Inits[0]);
+  }
 
-    if (!this->emitDupPtr(E))
-      return false;
+  QualType T = E->getType();
+  if (T->isRecordType()) {
+    const Record *R = getRecord(E->getType());
 
-    if (std::optional<PrimType> T = classify(Init)) {
-      const Record::Field *FieldToInit = R->getField(InitIndex);
-      if (!this->visit(Init))
-        return false;
+    if (Inits.size() == 1 && E->getType() == Inits[0]->getType()) {
+      return this->visitInitializer(Inits[0]);
+    }
 
-      if (FieldToInit->isBitField()) {
-        if (!this->emitInitBitField(*T, FieldToInit, E))
-          return false;
-      } else {
-        if (!this->emitInitField(*T, FieldToInit->Offset, E))
-          return false;
-      }
+    unsigned InitIndex = 0;
+    for (const Expr *Init : Inits) {
+      // Skip unnamed bitfields.
+      while (InitIndex < R->getNumFields() &&
+             R->getField(InitIndex)->Decl->isUnnamedBitField())
+        ++InitIndex;
 
-      if (!this->emitPopPtr(E))
+      if (!this->emitDupPtr(E))
         return false;
-      ++InitIndex;
-    } else {
-      // Initializer for a direct base class.
-      if (const Record::Base *B = R->getBase(Init->getType())) {
-        if (!this->emitGetPtrBasePop(B->Offset, Init))
-          return false;
-
-        if (!this->visitInitializer(Init))
-          return false;
 
-        if (!this->emitFinishInitPop(E))
-          return false;
-        // Base initializers don't increase InitIndex, since they don't count
-        // into the Record's fields.
-      } else {
+      if (std::optional<PrimType> T = classify(Init)) {
         const Record::Field *FieldToInit = R->getField(InitIndex);
-        // Non-primitive case. Get a pointer to the field-to-initialize
-        // on the stack and recurse into visitInitializer().
-        if (!this->emitGetPtrField(FieldToInit->Offset, Init))
+        if (!this->visit(Init))
           return false;
 
-        if (!this->visitInitializer(Init))
-          return false;
+        if (FieldToInit->isBitField()) {
+          if (!this->emitInitBitField(*T, FieldToInit, E))
+            return false;
+        } else {
+          if (!this->emitInitField(*T, FieldToInit->Offset, E))
+            return false;
+        }
 
         if (!this->emitPopPtr(E))
           return false;
         ++InitIndex;
-      }
-    }
-  }
-  return true;
-}
+      } else {
+        // Initializer for a direct base class.
+        if (const Record::Base *B = R->getBase(Init->getType())) {
+          if (!this->emitGetPtrBasePop(B->Offset, Init))
+            return false;
 
-/// Pointer to the array(not the element!) must be on the stack when calling
-/// this.
-template <class Emitter>
-bool ByteCodeExprGen<Emitter>::visitArrayElemInit(unsigned ElemIndex,
-                                                  const Expr *Init) {
-  if (std::optional<PrimType> T = classify(Init->getType())) {
-    // Visit the primitive element like normal.
-    if (!this->visit(Init))
-      return false;
-    return this->emitInitElem(*T, ElemIndex, Init);
-  }
+          if (!this->visitInitializer(Init))
+            return false;
 
-  // Advance the pointer currently on the stack to the given
-  // dimension.
-  if (!this->emitConstUint32(ElemIndex, Init))
-    return false;
-  if (!this->emitArrayElemPtrUint32(Init))
-    return false;
-  if (!this->visitInitializer(Init))
-    return false;
-  return this->emitFinishInitPop(Init);
-}
+          if (!this->emitFinishInitPop(E))
+            return false;
+          // Base initializers don't increase InitIndex, since they don't count
+          // into the Record's fields.
+        } else {
+          const Record::Field *FieldToInit = R->getField(InitIndex);
+          // Non-primitive case. Get a pointer to the field-to-initialize
+          // on the stack and recurse into visitInitializer().
+          if (!this->emitGetPtrField(FieldToInit->Offset, Init))
+            return false;
 
-template <class Emitter>
-bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
-  // Handle discarding first.
-  if (DiscardResult) {
-    for (const Expr *Init : E->inits()) {
-      if (!this->discard(Init))
-        return false;
+          if (!this->visitInitializer(Init))
+            return false;
+
+          if (!this->emitPopPtr(E))
+            return false;
+          ++InitIndex;
+        }
+      }
     }
     return true;
   }
 
-  // Primitive values.
-  if (std::optional<PrimType> T = classify(E->getType())) {
-    assert(!DiscardResult);
-    if (E->getNumInits() == 0)
-      return this->visitZeroInitializer(*T, E->getType(), E);
-    assert(E->getNumInits() == 1);
-    return this->delegate(E->inits()[0]);
-  }
-
-  QualType T = E->getType();
-  if (T->isRecordType())
-    return this->visitInitList(E->inits(), E);
-
   if (T->isArrayType()) {
     unsigned ElementIndex = 0;
-    for (const Expr *Init : E->inits()) {
+    for (const Expr *Init : Inits) {
       if (!this->visitArrayElemInit(ElementIndex, Init))
         return false;
       ++ElementIndex;
@@ -1104,13 +1129,13 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
 
     // Expand the filler expression.
     // FIXME: This should go away.
-    if (const Expr *Filler = E->getArrayFiller()) {
+    if (ArrayFiller) {
       const ConstantArrayType *CAT =
           Ctx.getASTContext().getAsConstantArrayType(E->getType());
       uint64_t NumElems = CAT->getZExtSize();
 
       for (; ElementIndex != NumElems; ++ElementIndex) {
-        if (!this->visitArrayElemInit(ElementIndex, Filler))
+        if (!this->visitArrayElemInit(ElementIndex, ArrayFiller))
           return false;
       }
     }
@@ -1119,10 +1144,10 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
   }
 
   if (const auto *ComplexTy = E->getType()->getAs<ComplexType>()) {
-    unsigned NumInits = E->getNumInits();
+    unsigned NumInits = Inits.size();
 
     if (NumInits == 1)
-      return this->delegate(E->inits()[0]);
+      return this->delegate(Inits[0]);
 
     QualType ElemQT = ComplexTy->getElementType();
     PrimType ElemT = classifyPrim(ElemQT);
@@ -1136,7 +1161,7 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
       }
     } else if (NumInits == 2) {
       unsigned InitIndex = 0;
-      for (const Expr *Init : E->inits()) {
+      for (const Expr *Init : Inits) {
         if (!this->visit(Init))
           return false;
 
@@ -1150,22 +1175,32 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
 
   if (const auto *VecT = E->getType()->getAs<VectorType>()) {
     unsigned NumVecElements = VecT->getNumElements();
-    assert(NumVecElements >= E->getNumInits());
+    assert(NumVecElements >= Inits.size());
 
     QualType ElemQT = VecT->getElementType();
     PrimType ElemT = classifyPrim(ElemQT);
 
     // All initializer elements.
     unsigned InitIndex = 0;
-    for (const Expr *Init : E->inits()) {
+    for (const Expr *Init : Inits) {
       if (!this->visit(Init))
         return false;
 
-      if (!this->emitInitElem(ElemT, InitIndex, E))
-        return false;
-      ++InitIndex;
+      // If the initializer is of vector type itself, we have to deconstruct
+      // that and initialize all the target fields from the initializer fields.
+      if (const auto *InitVecT = Init->getType()->getAs<VectorType>()) {
+        if (!this->emitCopyArray(ElemT, 0, InitIndex, InitVecT->getNumElements(), E))
+          return false;
+        InitIndex += InitVecT->getNumElements();
+      } else {
+        if (!this->emitInitElem(ElemT, InitIndex, E))
+          return false;
+        ++InitIndex;
+      }
     }
 
+    assert(InitIndex <= NumVecElements);
+
     // Fill the rest with zeroes.
     for (; InitIndex != NumVecElements; ++InitIndex) {
       if (!this->visitZeroInitializer(ElemT, ElemQT, E))
@@ -1179,19 +1214,38 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
   return false;
 }
 
+/// Pointer to the array(not the element!) must be on the stack when calling
+/// this.
 template <class Emitter>
-bool ByteCodeExprGen<Emitter>::VisitCXXParenListInitExpr(
-    const CXXParenListInitExpr *E) {
-  if (DiscardResult) {
-    for (const Expr *Init : E->getInitExprs()) {
-      if (!this->discard(Init))
-        return false;
-    }
-    return true;
+bool ByteCodeExprGen<Emitter>::visitArrayElemInit(unsigned ElemIndex,
+                                                  const Expr *Init) {
+  if (std::optional<PrimType> T = classify(Init->getType())) {
+    // Visit the primitive element like normal.
+    if (!this->visit(Init))
+      return false;
+    return this->emitInitElem(*T, ElemIndex, Init);
   }
 
-  assert(E->getType()->isRecordType());
-  return this->visitInitList(E->getInitExprs(), E);
+  // Advance the pointer currently on the stack to the given
+  // dimension.
+  if (!this->emitConstUint32(ElemIndex, Init))
+    return false;
+  if (!this->emitArrayElemPtrUint32(Init))
+    return false;
+  if (!this->visitInitializer(Init))
+    return false;
+  return this->emitFinishInitPop(Init);
+}
+
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
+  return this->visitInitList(E->inits(), E->getArrayFiller(), E);
+}
+
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitCXXParenListInitExpr(
+    const CXXParenListInitExpr *E) {
+  return this->visitInitList(E->getInitExprs(), E->getArrayFiller(), E);
 }
 
 template <class Emitter>
@@ -1314,6 +1368,20 @@ bool ByteCodeExprGen<Emitter>::VisitUnaryExprOrTypeTraitExpr(
     assert(E->getTypeOfArgument()->isSizelessVectorType());
   }
 
+  if (Kind == UETT_VecStep) {
+    if (const auto *VT = E->getTypeOfArgument()->getAs<VectorType>()) {
+      unsigned N = VT->getNumElements();
+
+      // The vec_step built-in functions that take a 3-component
+      // vector return 4. (OpenCL 1.1 spec 6.11.12)
+      if (N == 3)
+        N = 4;
+
+      return this->emitConst(N, E);
+    }
+    return this->emitConst(1, E);
+  }
+
   return false;
 }
 
@@ -2224,26 +2292,54 @@ bool ByteCodeExprGen<Emitter>::VisitCXXScalarValueInitExpr(
   if (std::optional<PrimType> T = classify(Ty))
     return this->visitZeroInitializer(*T, Ty, E);
 
-  assert(Ty->isAnyComplexType());
-  if (!Initializing) {
-    std::optional<unsigned> LocalIndex = allocateLocal(E, /*IsExtended=*/false);
-    if (!LocalIndex)
-      return false;
-    if (!this->emitGetPtrLocal(*LocalIndex, E))
-      return false;
+  if (const auto *CT = Ty->getAs<ComplexType>()) {
+    if (!Initializing) {
+      std::optional<unsigned> LocalIndex =
+          allocateLocal(E, /*IsExtended=*/false);
+      if (!LocalIndex)
+        return false;
+      if (!this->emitGetPtrLocal(*LocalIndex, E))
+        return false;
+    }
+
+    // Initialize both fields to 0.
+    QualType ElemQT = CT->getElementType();
+    PrimType ElemT = classifyPrim(ElemQT);
+
+    for (unsigned I = 0; I != 2; ++I) {
+      if (!this->visitZeroInitializer(ElemT, ElemQT, E))
+        return false;
+      if (!this->emitInitElem(ElemT, I, E))
+        return false;
+    }
+    return true;
   }
 
-  // Initialize both fields to 0.
-  QualType ElemQT = Ty->getAs<ComplexType>()->getElementType();
-  PrimType ElemT = classifyPrim(ElemQT);
+  if (const auto *VT = Ty->getAs<VectorType>()) {
+    // FIXME: Code duplication with the _Complex case above.
+    if (!Initializing) {
+      std::optional<unsigned> LocalIndex =
+          allocateLocal(E, /*IsExtended=*/false);
+      if (!LocalIndex)
+        return false;
+      if (!this->emitGetPtrLocal(*LocalIndex, E))
+        return false;
+    }
 
-  for (unsigned I = 0; I != 2; ++I) {
-    if (!this->visitZeroInitializer(ElemT, ElemQT, E))
-      return false;
-    if (!this->emitInitElem(ElemT, I, E))
-      return false;
+    // Initialize all fields to 0.
+    QualType ElemQT = VT->getElementType();
+    PrimType ElemT = classifyPrim(ElemQT);
+
+    for (unsigned I = 0, N = VT->getNumElements(); I != N; ++I) {
+      if (!this->visitZeroInitializer(ElemT, ElemQT, E))
+        return false;
+      if (!this->emitInitElem(ElemT, I, E))
+        return false;
+    }
+    return true;
   }
-  return true;
+
+  return false;
 }
 
 template <class Emitter>
@@ -2321,8 +2417,7 @@ bool ByteCodeExprGen<Emitter>::VisitCXXUuidofExpr(const CXXUuidofExpr *E) {
   if (!this->emitGetPtrGlobal(*GlobalIndex, E))
     return false;
 
-  const Record *R = this->getRecord(E->getType());
-  assert(R);
+  assert(this->getRecord(E->getType()));
 
   const APValue &V = E->getGuidDecl()->getAsAPValue();
   if (V.getKind() == APValue::None)
@@ -2330,41 +2425,8 @@ bool ByteCodeExprGen<Emitter>::VisitCXXUuidofExpr(const CXXUuidofExpr *E) {
 
   assert(V.isStruct());
   assert(V.getStructNumBases() == 0);
-  // FIXME: This could be useful in visitAPValue, too.
-  for (unsigned I = 0, N = V.getStructNumFields(); I != N; ++I) {
-    const APValue &F = V.getStructField(I);
-    const Record::Field *RF = R->getField(I);
-
-    if (F.isInt()) {
-      PrimType T = classifyPrim(RF->Decl->getType());
-      if (!this->visitAPValue(F, T, E))
-        return false;
-      if (!this->emitInitField(T, RF->Offset, E))
-        return false;
-    } else if (F.isArray()) {
-      assert(RF->Desc->isPrimitiveArray());
-      const auto *ArrType = RF->Decl->getType()->getAsArrayTypeUnsafe();
-      PrimType ElemT = classifyPrim(ArrType->getElementType());
-      assert(ArrType);
-
-      if (!this->emitDupPtr(E))
-        return false;
-      if (!this->emitGetPtrField(RF->Offset, E))
-        return false;
-
-      for (unsigned A = 0, AN = F.getArraySize(); A != AN; ++A) {
-        if (!this->visitAPValue(F.getArrayInitializedElt(A), ElemT, E))
-          return false;
-        if (!this->emitInitElem(ElemT, A, E))
-          return false;
-      }
-
-      if (!this->emitPopPtr(E))
-        return false;
-    } else {
-      assert(false && "I don't think this should be possible");
-    }
-  }
+  if (!this->visitAPValueInitializer(V, E))
+    return false;
 
   return this->emitFinishInit(E);
 }
@@ -2930,6 +2992,54 @@ bool ByteCodeExprGen<Emitter>::visitAPValue(const APValue &Val,
 }
 
 template <class Emitter>
+bool ByteCodeExprGen<Emitter>::visitAPValueInitializer(const APValue &Val,
+                                                       const Expr *E) {
+  if (Val.isStruct()) {
+    const Record *R = this->getRecord(E->getType());
+    assert(R);
+
+    for (unsigned I = 0, N = Val.getStructNumFields(); I != N; ++I) {
+      const APValue &F = Val.getStructField(I);
+      const Record::Field *RF = R->getField(I);
+
+      if (F.isInt()) {
+        PrimType T = classifyPrim(RF->Decl->getType());
+        if (!this->visitAPValue(F, T, E))
+          return false;
+        if (!this->emitInitField(T, RF->Offset, E))
+          return false;
+      } else if (F.isArray()) {
+        assert(RF->Desc->isPrimitiveArray());
+        const auto *ArrType = RF->Decl->getType()->getAsArrayTypeUnsafe();
+        PrimType ElemT = classifyPrim(ArrType->getElementType());
+        assert(ArrType);
+
+        if (!this->emitDupPtr(E))
+          return false;
+        if (!this->emitGetPtrField(RF->Offset, E))
+          return false;
+
+        for (unsigned A = 0, AN = F.getArraySize(); A != AN; ++A) {
+          if (!this->visitAPValue(F.getArrayInitializedElt(A), ElemT, E))
+            return false;
+          if (!this->emitInitElem(ElemT, A, E))
+            return false;
+        }
+
+        if (!this->emitPopPtr(E))
+          return false;
+      } else {
+        assert(false && "I don't think this should be possible");
+      }
+    }
+    return true;
+  }
+  // TODO: Other types.
+
+  return false;
+}
+
+template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitBuiltinCallExpr(const CallExpr *E) {
   const Function *Func = getFunction(E->getDirectCallee());
   if (!Func)
@@ -3450,9 +3560,17 @@ bool ByteCodeExprGen<Emitter>::VisitDeclRefExpr(const DeclRefExpr *E) {
   } else if (const auto *FuncDecl = dyn_cast<FunctionDecl>(D)) {
     const Function *F = getFunction(FuncDecl);
     return F && this->emitGetFnPtr(F, E);
-  } else if (isa<TemplateParamObjectDecl>(D)) {
-    if (std::optional<unsigned> Index = P.getOrCreateGlobal(D))
-      return this->emitGetPtrGlobal(*Index, E);
+  } else if (const auto *TPOD = dyn_cast<TemplateParamObjectDecl>(D)) {
+    if (std::optional<unsigned> Index = P.getOrCreateGlobal(D)) {
+      if (!this->emitGetPtrGlobal(*Index, E))
+        return false;
+      if (std::optional<PrimType> T = classify(E->getType())) {
+        if (!this->visitAPValue(TPOD->getValue(), *T, E))
+          return false;
+        return this->emitInitGlobal(*T, *Index, E);
+      }
+      return this->visitAPValueInitializer(TPOD->getValue(), E);
+    }
     return false;
   }
 
@@ -3529,35 +3647,17 @@ void ByteCodeExprGen<Emitter>::emitCleanup() {
 
 template <class Emitter>
 unsigned
-ByteCodeExprGen<Emitter>::collectBaseOffset(const RecordType *BaseType,
-                                            const RecordType *DerivedType) {
-  assert(BaseType);
-  assert(DerivedType);
-  const auto *FinalDecl = cast<CXXRecordDecl>(BaseType->getDecl());
-  const RecordDecl *CurDecl = DerivedType->getDecl();
-  const Record *CurRecord = getRecord(CurDecl);
-  assert(CurDecl && FinalDecl);
-
-  unsigned OffsetSum = 0;
-  for (;;) {
-    assert(CurRecord->getNumBases() > 0);
-    // One level up
-    for (const Record::Base &B : CurRecord->bases()) {
-      const auto *BaseDecl = cast<CXXRecordDecl>(B.Decl);
-
-      if (BaseDecl == FinalDecl || BaseDecl->isDerivedFrom(FinalDecl)) {
-        OffsetSum += B.Offset;
-        CurRecord = B.R;
-        CurDecl = BaseDecl;
-        break;
-      }
-    }
-    if (CurDecl == FinalDecl)
-      break;
-  }
+ByteCodeExprGen<Emitter>::collectBaseOffset(const QualType BaseType,
+                                            const QualType DerivedType) {
+  const auto extractRecordDecl = [](QualType Ty) -> const CXXRecordDecl * {
+    if (const auto *PT = dyn_cast<PointerType>(Ty))
+      return PT->getPointeeType()->getAsCXXRecordDecl();
+    return Ty->getAsCXXRecordDecl();
+  };
+  const CXXRecordDecl *BaseDecl = extractRecordDecl(BaseType);
+  const CXXRecordDecl *DerivedDecl = extractRecordDecl(DerivedType);
 
-  assert(OffsetSum > 0);
-  return OffsetSum;
+  return Ctx.collectBaseOffset(BaseDecl, DerivedDecl);
 }
 
 /// Emit casts from a PrimType to another PrimType.
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index 7e9dc8631fc0..a89e37c67aa6 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -181,6 +181,7 @@ protected:
   bool visitVarDecl(const VarDecl *VD);
   /// Visit an APValue.
   bool visitAPValue(const APValue &Val, PrimType ValType, const Expr *E);
+  bool visitAPValueInitializer(const APValue &Val, const Expr *E);
 
   /// Visits an expression and converts it to a boolean.
   bool visitBool(const Expr *E);
@@ -224,7 +225,8 @@ protected:
     return this->emitFinishInitPop(I);
   }
 
-  bool visitInitList(ArrayRef<const Expr *> Inits, const Expr *E);
+  bool visitInitList(ArrayRef<const Expr *> Inits, const Expr *ArrayFiller,
+                     const Expr *E);
   bool visitArrayElemInit(unsigned ElemIndex, const Expr *Init);
 
   /// Creates a local primitive value.
@@ -283,8 +285,8 @@ private:
 
   bool emitRecordDestruction(const Record *R);
   bool emitDestruction(const Descriptor *Desc);
-  unsigned collectBaseOffset(const RecordType *BaseType,
-                             const RecordType *DerivedType);
+  unsigned collectBaseOffset(const QualType BaseType,
+                             const QualType DerivedType);
 
 protected:
   /// Variable to storage mapping.
diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
index 36dab6252ece..ff92bc117f9e 100644
--- a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
@@ -189,14 +189,23 @@ bool ByteCodeStmtGen<Emitter>::visitFunc(const FunctionDecl *F) {
         if (!emitFieldInitializer(F, F->Offset, InitExpr))
           return false;
       } else if (const Type *Base = Init->getBaseClass()) {
-        // Base class initializer.
-        // Get This Base and call initializer on it.
         const auto *BaseDecl = Base->getAsCXXRecordDecl();
         assert(BaseDecl);
-        const Record::Base *B = R->getBase(BaseDecl);
-        assert(B);
-        if (!this->emitGetPtrThisBase(B->Offset, InitExpr))
-          return false;
+
+        if (Init->isBaseVirtual()) {
+          assert(R->getVirtualBase(BaseDecl));
+          if (!this->emitGetPtrThisVirtBase(BaseDecl, InitExpr))
+            return false;
+
+        } else {
+          // Base class initializer.
+          // Get This Base and call initializer on it.
+          const Record::Base *B = R->getBase(BaseDecl);
+          assert(B);
+          if (!this->emitGetPtrThisBase(B->Offset, InitExpr))
+            return false;
+        }
+
         if (!this->visitInitializer(InitExpr))
           return false;
         if (!this->emitFinishInitPop(InitExpr))
@@ -283,8 +292,9 @@ bool ByteCodeStmtGen<Emitter>::visitStmt(const Stmt *S) {
   case Stmt::GCCAsmStmtClass:
   case Stmt::MSAsmStmtClass:
   case Stmt::GotoStmtClass:
-  case Stmt::LabelStmtClass:
     return this->emitInvalid(S);
+  case Stmt::LabelStmtClass:
+    return this->visitStmt(cast<LabelStmt>(S)->getSubStmt());
   default: {
     if (auto *Exp = dyn_cast<Expr>(S))
       return this->discard(Exp);
@@ -323,7 +333,8 @@ bool ByteCodeStmtGen<Emitter>::visitCompoundStmt(
 template <class Emitter>
 bool ByteCodeStmtGen<Emitter>::visitDeclStmt(const DeclStmt *DS) {
   for (auto *D : DS->decls()) {
-    if (isa<StaticAssertDecl, TagDecl, TypedefNameDecl, UsingEnumDecl>(D))
+    if (isa<StaticAssertDecl, TagDecl, TypedefNameDecl, UsingEnumDecl,
+            FunctionDecl>(D))
       continue;
 
     const auto *VD = dyn_cast<VarDecl>(D);
diff --git a/clang/lib/AST/Interp/Context.cpp b/clang/lib/AST/Interp/Context.cpp
index 274178837bf0..d51a57e5e92e 100644
--- a/clang/lib/AST/Interp/Context.cpp
+++ b/clang/lib/AST/Interp/Context.cpp
@@ -262,3 +262,36 @@ const Function *Context::getOrCreateFunction(const FunctionDecl *FD) {
 
   return Func;
 }
+
+unsigned Context::collectBaseOffset(const RecordDecl *BaseDecl,
+                                    const RecordDecl *DerivedDecl) const {
+  assert(BaseDecl);
+  assert(DerivedDecl);
+  const auto *FinalDecl = cast<CXXRecordDecl>(BaseDecl);
+  const RecordDecl *CurDecl = DerivedDecl;
+  const Record *CurRecord = P->getOrCreateRecord(CurDecl);
+  assert(CurDecl && FinalDecl);
+
+  unsigned OffsetSum = 0;
+  for (;;) {
+    assert(CurRecord->getNumBases() > 0);
+    // One level up
+    for (const Record::Base &B : CurRecord->bases()) {
+      const auto *BaseDecl = cast<CXXRecordDecl>(B.Decl);
+
+      if (BaseDecl == FinalDecl || BaseDecl->isDerivedFrom(FinalDecl)) {
+        OffsetSum += B.Offset;
+        CurRecord = B.R;
+        CurDecl = BaseDecl;
+        break;
+      }
+    }
+    if (CurDecl == FinalDecl)
+      break;
+
+    // break;
+  }
+
+  assert(OffsetSum > 0);
+  return OffsetSum;
+}
diff --git a/clang/lib/AST/Interp/Context.h b/clang/lib/AST/Interp/Context.h
index 23c439ad8912..360e9499d084 100644
--- a/clang/lib/AST/Interp/Context.h
+++ b/clang/lib/AST/Interp/Context.h
@@ -104,6 +104,9 @@ public:
   /// Returns the program. This is only needed for unittests.
   Program &getProgram() const { return *P.get(); }
 
+  unsigned collectBaseOffset(const RecordDecl *BaseDecl,
+                             const RecordDecl *DerivedDecl) const;
+
 private:
   /// Runs a function.
   bool Run(State &Parent, const Function *Func, APValue &Result);
diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp
index a4ccc0236d29..954c58c8cb37 100644
--- a/clang/lib/AST/Interp/Descriptor.cpp
+++ b/clang/lib/AST/Interp/Descriptor.cpp
@@ -136,28 +136,66 @@ static void moveArrayDesc(Block *B, const std::byte *Src, std::byte *Dst,
   }
 }
 
+static void initField(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
+                      bool IsActive, const Descriptor *D,
+                      unsigned FieldOffset) {
+  bool IsUnion = false; // FIXME
+  auto *Desc = reinterpret_cast<InlineDescriptor *>(Ptr + FieldOffset) - 1;
+  Desc->Offset = FieldOffset;
+  Desc->Desc = D;
+  Desc->IsInitialized = D->IsArray;
+  Desc->IsBase = false;
+  Desc->IsActive = IsActive && !IsUnion;
+  Desc->IsConst = IsConst || D->IsConst;
+  Desc->IsFieldMutable = IsMutable || D->IsMutable;
+
+  if (auto Fn = D->CtorFn)
+    Fn(B, Ptr + FieldOffset, Desc->IsConst, Desc->IsFieldMutable,
+       Desc->IsActive, D);
+}
+
+static void initBase(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
+                     bool IsActive, const Descriptor *D, unsigned FieldOffset,
+                     bool IsVirtualBase) {
+  assert(D);
+  assert(D->ElemRecord);
+
+  bool IsUnion = D->ElemRecord->isUnion();
+  auto *Desc = reinterpret_cast<InlineDescriptor *>(Ptr + FieldOffset) - 1;
+  Desc->Offset = FieldOffset;
+  Desc->Desc = D;
+  Desc->IsInitialized = D->IsArray;
+  Desc->IsBase = true;
+  Desc->IsActive = IsActive && !IsUnion;
+  Desc->IsConst = IsConst || D->IsConst;
+  Desc->IsFieldMutable = IsMutable || D->IsMutable;
+
+  for (const auto &V : D->ElemRecord->bases())
+    initBase(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, V.Desc,
+             V.Offset, false);
+  for (const auto &F : D->ElemRecord->fields())
+    initField(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, F.Desc,
+              F.Offset);
+
+  // If this is initializing a virtual base, we do NOT want to consider its
+  // virtual bases, those are already flattened into the parent record when
+  // creating it.
+  if (IsVirtualBase)
+    return;
+
+  for (const auto &V : D->ElemRecord->virtual_bases())
+    initBase(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, V.Desc,
+             V.Offset, true);
+}
+
 static void ctorRecord(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
                        bool IsActive, const Descriptor *D) {
-  const bool IsUnion = D->ElemRecord->isUnion();
-  auto CtorSub = [=](unsigned SubOff, const Descriptor *F, bool IsBase) {
-    auto *Desc = reinterpret_cast<InlineDescriptor *>(Ptr + SubOff) - 1;
-    Desc->Offset = SubOff;
-    Desc->Desc = F;
-    Desc->IsInitialized = F->IsArray && !IsBase;
-    Desc->IsBase = IsBase;
-    Desc->IsActive = IsActive && !IsUnion;
-    Desc->IsConst = IsConst || F->IsConst;
-    Desc->IsFieldMutable = IsMutable || F->IsMutable;
-    if (auto Fn = F->CtorFn)
-      Fn(B, Ptr + SubOff, Desc->IsConst, Desc->IsFieldMutable, Desc->IsActive,
-         F);
-  };
-  for (const auto &B : D->ElemRecord->bases())
-    CtorSub(B.Offset, B.Desc, /*isBase=*/true);
+  for (const auto &V : D->ElemRecord->bases())
+    initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset, false);
   for (const auto &F : D->ElemRecord->fields())
-    CtorSub(F.Offset, F.Desc, /*isBase=*/false);
+    initField(B, Ptr, IsConst, IsMutable, IsActive, F.Desc, F.Offset);
   for (const auto &V : D->ElemRecord->virtual_bases())
-    CtorSub(V.Offset, V.Desc, /*isBase=*/true);
+    initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset, true);
 }
 
 static void dtorRecord(Block *B, std::byte *Ptr, const Descriptor *D) {
diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h
index c386fc8ac7b0..cd20495c259c 100644
--- a/clang/lib/AST/Interp/Descriptor.h
+++ b/clang/lib/AST/Interp/Descriptor.h
@@ -82,6 +82,9 @@ struct InlineDescriptor {
   InlineDescriptor(const Descriptor *D)
       : Offset(sizeof(InlineDescriptor)), IsConst(false), IsInitialized(false),
         IsBase(false), IsActive(false), IsFieldMutable(false), Desc(D) {}
+
+  void dump() const { dump(llvm::errs()); }
+  void dump(llvm::raw_ostream &OS) const;
 };
 
 /// Describes a memory block created by an allocation site.
diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp
index d127f33223e8..ccdc96a79436 100644
--- a/clang/lib/AST/Interp/Disasm.cpp
+++ b/clang/lib/AST/Interp/Disasm.cpp
@@ -200,7 +200,7 @@ LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const {
     OS << " primitive";
 
   if (isZeroSizeArray())
-    OS << " zero-size-arrary";
+    OS << " zero-size-array";
   else if (isUnknownSizeArray())
     OS << " unknown-size-array";
 
@@ -208,6 +208,25 @@ LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const {
     OS << " dummy";
 }
 
+LLVM_DUMP_METHOD void InlineDescriptor::dump(llvm::raw_ostream &OS) const {
+  {
+    ColorScope SC(OS, true, {llvm::raw_ostream::BLUE, true});
+    OS << "InlineDescriptor " << (const void *)this << "\n";
+  }
+  OS << "Offset: " << Offset << "\n";
+  OS << "IsConst: " << IsConst << "\n";
+  OS << "IsInitialized: " << IsInitialized << "\n";
+  OS << "IsBase: " << IsBase << "\n";
+  OS << "IsActive: " << IsActive << "\n";
+  OS << "IsFieldMutable: " << IsFieldMutable << "\n";
+  OS << "Desc: ";
+  if (Desc)
+    Desc->dump(OS);
+  else
+    OS << "nullptr";
+  OS << "\n";
+}
+
 LLVM_DUMP_METHOD void InterpFrame::dump(llvm::raw_ostream &OS,
                                         unsigned Indent) const {
   unsigned Spaces = Indent * 2;
@@ -251,8 +270,6 @@ LLVM_DUMP_METHOD void Record::dump(llvm::raw_ostream &OS, unsigned Indentation,
     ++I;
   }
 
-  // FIXME: Virtual bases.
-
   I = 0;
   for (const Record::Field &F : fields()) {
     OS.indent(Indent) << "- Field " << I << ": ";
@@ -263,6 +280,14 @@ LLVM_DUMP_METHOD void Record::dump(llvm::raw_ostream &OS, unsigned Indentation,
     OS << ". Offset " << (Offset + F.Offset) << "\n";
     ++I;
   }
+
+  I = 0;
+  for (const Record::Base &B : virtual_bases()) {
+    OS.indent(Indent) << "- Virtual Base " << I << ". Offset "
+                      << (Offset + B.Offset) << "\n";
+    B.R->dump(OS, Indentation + 1, Offset + B.Offset);
+    ++I;
+  }
 }
 
 LLVM_DUMP_METHOD void Block::dump(llvm::raw_ostream &OS) const {
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 9283f697c007..66d30cc3fbaa 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1355,20 +1355,26 @@ inline bool VirtBaseHelper(InterpState &S, CodePtr OpPC, const RecordDecl *Decl,
   while (Base.isBaseClass())
     Base = Base.getBase();
 
-  auto *Field = Base.getRecord()->getVirtualBase(Decl);
-  S.Stk.push<Pointer>(Base.atField(Field->Offset));
+  const Record::Base *VirtBase = Base.getRecord()->getVirtualBase(Decl);
+  S.Stk.push<Pointer>(Base.atField(VirtBase->Offset));
   return true;
 }
 
-inline bool GetPtrVirtBase(InterpState &S, CodePtr OpPC, const RecordDecl *D) {
+inline bool GetPtrVirtBasePop(InterpState &S, CodePtr OpPC,
+                              const RecordDecl *D) {
+  assert(D);
   const Pointer &Ptr = S.Stk.pop<Pointer>();
   if (!CheckNull(S, OpPC, Ptr, CSK_Base))
     return false;
+  if (Ptr.isDummy()) // FIXME: Once we have type info for dummy pointers, this
+                     // needs to go.
+    return false;
   return VirtBaseHelper(S, OpPC, D, Ptr);
 }
 
 inline bool GetPtrThisVirtBase(InterpState &S, CodePtr OpPC,
                                const RecordDecl *D) {
+  assert(D);
   if (S.checkingPotentialConstantExpression())
     return false;
   const Pointer &This = S.Current->getThis();
@@ -1929,10 +1935,15 @@ template <PrimType NameL, PrimType NameR>
 inline bool Shr(InterpState &S, CodePtr OpPC) {
   using LT = typename PrimConv<NameL>::T;
   using RT = typename PrimConv<NameR>::T;
-  const auto &RHS = S.Stk.pop<RT>();
+  auto RHS = S.Stk.pop<RT>();
   const auto &LHS = S.Stk.pop<LT>();
   const unsigned Bits = LHS.bitWidth();
 
+  // OpenCL 6.3j: shift values are effectively % word size of LHS.
+  if (S.getLangOpts().OpenCL)
+    RT::bitAnd(RHS, RT::from(LHS.bitWidth() - 1, RHS.bitWidth()),
+               RHS.bitWidth(), &RHS);
+
   if (!CheckShift(S, OpPC, LHS, RHS, Bits))
     return false;
 
@@ -1954,10 +1965,15 @@ template <PrimType NameL, PrimType NameR>
 inline bool Shl(InterpState &S, CodePtr OpPC) {
   using LT = typename PrimConv<NameL>::T;
   using RT = typename PrimConv<NameR>::T;
-  const auto &RHS = S.Stk.pop<RT>();
+  auto RHS = S.Stk.pop<RT>();
   const auto &LHS = S.Stk.pop<LT>();
   const unsigned Bits = LHS.bitWidth();
 
+  // OpenCL 6.3j: shift values are effectively % word size of LHS.
+  if (S.getLangOpts().OpenCL)
+    RT::bitAnd(RHS, RT::from(LHS.bitWidth() - 1, RHS.bitWidth()),
+               RHS.bitWidth(), &RHS);
+
   if (!CheckShift(S, OpPC, LHS, RHS, Bits))
     return false;
 
@@ -2074,6 +2090,24 @@ inline bool ArrayElemPop(InterpState &S, CodePtr OpPC, uint32_t Index) {
   return true;
 }
 
+template <PrimType Name, class T = typename PrimConv<Name>::T>
+inline bool CopyArray(InterpState &S, CodePtr OpPC, uint32_t SrcIndex, uint32_t DestIndex, uint32_t Size) {
+  const auto &SrcPtr = S.Stk.pop<Pointer>();
+  const auto &DestPtr = S.Stk.peek<Pointer>();
+
+  for (uint32_t I = 0; I != Size; ++I) {
+    const Pointer &SP = SrcPtr.atIndex(SrcIndex + I);
+
+    if (!CheckLoad(S, OpPC, SP))
+      return false;
+
+    const Pointer &DP = DestPtr.atIndex(DestIndex + I);
+    DP.deref<T>() = SP.deref<T>();
+    DP.initialize();
+  }
+  return true;
+}
+
 /// Just takes a pointer and checks if it's an incomplete
 /// array type.
 inline bool ArrayDecay(InterpState &S, CodePtr OpPC) {
diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td
index 742785b28eb4..cfbd7f93c32d 100644
--- a/clang/lib/AST/Interp/Opcodes.td
+++ b/clang/lib/AST/Interp/Opcodes.td
@@ -336,7 +336,7 @@ def GetPtrDerivedPop : Opcode {
 }
 
 // [Pointer] -> [Pointer]
-def GetPtrVirtBase : Opcode {
+def GetPtrVirtBasePop : Opcode {
   // RecordDecl of base class.
   let Args = [ArgRecordDecl];
 }
@@ -376,7 +376,11 @@ def ArrayElem : Opcode {
   let HasGroup = 1;
 }
 
-
+def CopyArray : Opcode {
+  let Args = [ArgUint32, ArgUint32, ArgUint32];
+  let Types = [AllTypeClass];
+  let HasGroup = 1;
+}
 
 //===----------------------------------------------------------------------===//
 // Direct field accessors
diff --git a/clang/lib/AST/Interp/Program.cpp b/clang/lib/AST/Interp/Program.cpp
index 3773e0662f78..02075c20cf55 100644
--- a/clang/lib/AST/Interp/Program.cpp
+++ b/clang/lib/AST/Interp/Program.cpp
@@ -173,7 +173,8 @@ std::optional<unsigned> Program::createGlobal(const ValueDecl *VD,
   if (const auto *Var = dyn_cast<VarDecl>(VD)) {
     IsStatic = Context::shouldBeGloballyIndexed(VD);
     IsExtern = Var->hasExternalStorage();
-  } else if (isa<UnnamedGlobalConstantDecl, MSGuidDecl>(VD)) {
+  } else if (isa<UnnamedGlobalConstantDecl, MSGuidDecl,
+                 TemplateParamObjectDecl>(VD)) {
     IsStatic = true;
     IsExtern = false;
   } else {
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 106c69dd5bee..ed9e6eeb36c7 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -4715,7 +4715,7 @@ recurse:
   case Expr::MSPropertySubscriptExprClass:
   case Expr::TypoExprClass: // This should no longer exist in the AST by now.
   case Expr::RecoveryExprClass:
-  case Expr::OMPArraySectionExprClass:
+  case Expr::ArraySectionExprClass:
   case Expr::OMPArrayShapingExprClass:
   case Expr::OMPIteratorExprClass:
   case Expr::CXXInheritedCtorInitExprClass:
diff --git a/clang/lib/AST/NSAPI.cpp b/clang/lib/AST/NSAPI.cpp
index ecc56c13fb75..6f586173edb0 100644
--- a/clang/lib/AST/NSAPI.cpp
+++ b/clang/lib/AST/NSAPI.cpp
@@ -462,7 +462,7 @@ NSAPI::getNSNumberFactoryMethodKind(QualType T) const {
   case BuiltinType::PseudoObject:
   case BuiltinType::BuiltinFn:
   case BuiltinType::IncompleteMatrixIdx:
-  case BuiltinType::OMPArraySection:
+  case BuiltinType::ArraySection:
   case BuiltinType::OMPArrayShaping:
   case BuiltinType::OMPIterator:
   case BuiltinType::BFloat16:
diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp
index 6cd5b2880218..885f3b7618ec 100644
--- a/clang/lib/AST/OpenACCClause.cpp
+++ b/clang/lib/AST/OpenACCClause.cpp
@@ -134,35 +134,67 @@ OpenACCNumGangsClause *OpenACCNumGangsClause::Create(const ASTContext &C,
   return new (Mem) OpenACCNumGangsClause(BeginLoc, LParenLoc, IntExprs, EndLoc);
 }
 
+OpenACCPrivateClause *OpenACCPrivateClause::Create(const ASTContext &C,
+                                                   SourceLocation BeginLoc,
+                                                   SourceLocation LParenLoc,
+                                                   ArrayRef<Expr *> VarList,
+                                                   SourceLocation EndLoc) {
+  void *Mem = C.Allocate(
+      OpenACCPrivateClause::totalSizeToAlloc<Expr *>(VarList.size()));
+  return new (Mem) OpenACCPrivateClause(BeginLoc, LParenLoc, VarList, EndLoc);
+}
+
 //===----------------------------------------------------------------------===//
 //  OpenACC clauses printing methods
 //===----------------------------------------------------------------------===//
+
+void OpenACCClausePrinter::printExpr(const Expr *E) {
+  E->printPretty(OS, nullptr, Policy, 0);
+}
+
 void OpenACCClausePrinter::VisitDefaultClause(const OpenACCDefaultClause &C) {
   OS << "default(" << C.getDefaultClauseKind() << ")";
 }
 
 void OpenACCClausePrinter::VisitIfClause(const OpenACCIfClause &C) {
-  OS << "if(" << C.getConditionExpr() << ")";
+  OS << "if(";
+  printExpr(C.getConditionExpr());
+  OS << ")";
 }
 
 void OpenACCClausePrinter::VisitSelfClause(const OpenACCSelfClause &C) {
   OS << "self";
-  if (const Expr *CondExpr = C.getConditionExpr())
-    OS << "(" << CondExpr << ")";
+  if (const Expr *CondExpr = C.getConditionExpr()) {
+    OS << "(";
+    printExpr(CondExpr);
+    OS << ")";
+  }
 }
 
 void OpenACCClausePrinter::VisitNumGangsClause(const OpenACCNumGangsClause &C) {
   OS << "num_gangs(";
-  llvm::interleaveComma(C.getIntExprs(), OS);
+  llvm::interleaveComma(C.getIntExprs(), OS,
+                        [&](const Expr *E) { printExpr(E); });
   OS << ")";
 }
 
 void OpenACCClausePrinter::VisitNumWorkersClause(
     const OpenACCNumWorkersClause &C) {
-  OS << "num_workers(" << C.getIntExpr() << ")";
+  OS << "num_workers(";
+  printExpr(C.getIntExpr());
+  OS << ")";
 }
 
 void OpenACCClausePrinter::VisitVectorLengthClause(
     const OpenACCVectorLengthClause &C) {
-  OS << "vector_length(" << C.getIntExpr() << ")";
+  OS << "vector_length(";
+  printExpr(C.getIntExpr());
+  OS << ")";
+}
+
+void OpenACCClausePrinter::VisitPrivateClause(const OpenACCPrivateClause &C) {
+  OS << "private(";
+  llvm::interleaveComma(C.getVarList(), OS,
+                        [&](const Expr *E) { printExpr(E); });
+  OS << ")";
 }
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index 5855ab3141ed..be2d5a2eb6b4 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -1148,9 +1148,10 @@ void StmtPrinter::VisitOpenACCComputeConstruct(OpenACCComputeConstruct *S) {
 
   if (!S->clauses().empty()) {
     OS << ' ';
-    OpenACCClausePrinter Printer(OS);
+    OpenACCClausePrinter Printer(OS, Policy);
     Printer.VisitClauseList(S->clauses());
   }
+  OS << '\n';
 
   PrintStmt(S->getStructuredBlock());
 }
@@ -1521,7 +1522,7 @@ void StmtPrinter::VisitMatrixSubscriptExpr(MatrixSubscriptExpr *Node) {
   OS << "]";
 }
 
-void StmtPrinter::VisitOMPArraySectionExpr(OMPArraySectionExpr *Node) {
+void StmtPrinter::VisitArraySectionExpr(ArraySectionExpr *Node) {
   PrintExpr(Node->getBase());
   OS << "[";
   if (Node->getLowerBound())
@@ -1531,7 +1532,7 @@ void StmtPrinter::VisitOMPArraySectionExpr(OMPArraySectionExpr *Node) {
     if (Node->getLength())
       PrintExpr(Node->getLength());
   }
-  if (Node->getColonLocSecond().isValid()) {
+  if (Node->isOMPArraySection() && Node->getColonLocSecond().isValid()) {
     OS << ":";
     if (Node->getStride())
       PrintExpr(Node->getStride());
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index c81724f84dd9..973f6f97bae0 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -1435,7 +1435,7 @@ void StmtProfiler::VisitMatrixSubscriptExpr(const MatrixSubscriptExpr *S) {
   VisitExpr(S);
 }
 
-void StmtProfiler::VisitOMPArraySectionExpr(const OMPArraySectionExpr *S) {
+void StmtProfiler::VisitArraySectionExpr(const ArraySectionExpr *S) {
   VisitExpr(S);
 }
 
@@ -2509,6 +2509,12 @@ void OpenACCClauseProfiler::VisitNumWorkersClause(
   Profiler.VisitStmt(Clause.getIntExpr());
 }
 
+void OpenACCClauseProfiler::VisitPrivateClause(
+    const OpenACCPrivateClause &Clause) {
+  for (auto *E : Clause.getVarList())
+    Profiler.VisitStmt(E);
+}
+
 void OpenACCClauseProfiler::VisitVectorLengthClause(
     const OpenACCVectorLengthClause &Clause) {
   assert(Clause.hasIntExpr() &&
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 8f0a9a9b0ed0..89f50d6dacfd 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -401,6 +401,7 @@ void TextNodeDumper::Visit(const OpenACCClause *C) {
     case OpenACCClauseKind::Self:
     case OpenACCClauseKind::NumGangs:
     case OpenACCClauseKind::NumWorkers:
+    case OpenACCClauseKind::Private:
     case OpenACCClauseKind::VectorLength:
       // The condition expression will be printed as a part of the 'children',
       // but print 'clause' here so it is clear what is happening from the dump.
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index cb22c91a12aa..68e81f45b4c2 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2510,6 +2510,18 @@ bool Type::isSveVLSBuiltinType() const {
   return false;
 }
 
+QualType Type::getSizelessVectorEltType(const ASTContext &Ctx) const {
+  assert(isSizelessVectorType() && "Must be sizeless vector type");
+  // Currently supports SVE and RVV
+  if (isSVESizelessBuiltinType())
+    return getSveEltType(Ctx);
+
+  if (isRVVSizelessBuiltinType())
+    return getRVVEltType(Ctx);
+
+  llvm_unreachable("Unhandled type");
+}
+
 QualType Type::getSveEltType(const ASTContext &Ctx) const {
   assert(isSveVLSBuiltinType() && "unsupported type!");
 
@@ -3413,8 +3425,8 @@ StringRef BuiltinType::getName(const PrintingPolicy &Policy) const {
     return "reserve_id_t";
   case IncompleteMatrixIdx:
     return "<incomplete matrix index type>";
-  case OMPArraySection:
-    return "<OpenMP array section type>";
+  case ArraySection:
+    return "<array section type>";
   case OMPArrayShaping:
     return "<OpenMP array shaping type>";
   case OMPIterator:
@@ -4710,7 +4722,7 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const {
     case BuiltinType::BuiltinFn:
     case BuiltinType::NullPtr:
     case BuiltinType::IncompleteMatrixIdx:
-    case BuiltinType::OMPArraySection:
+    case BuiltinType::ArraySection:
     case BuiltinType::OMPArrayShaping:
     case BuiltinType::OMPIterator:
       return false;
diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp
index 21e152f6aea8..ce45b47d5cfe 100644
--- a/clang/lib/AST/TypeLoc.cpp
+++ b/clang/lib/AST/TypeLoc.cpp
@@ -429,7 +429,7 @@ TypeSpecifierType BuiltinTypeLoc::getWrittenTypeSpec() const {
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
   case BuiltinType::BuiltinFn:
   case BuiltinType::IncompleteMatrixIdx:
-  case BuiltinType::OMPArraySection:
+  case BuiltinType::ArraySection:
   case BuiltinType::OMPArrayShaping:
   case BuiltinType::OMPIterator:
     return TST_unspecified;
diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp
index 941322be8f87..3b3782fa1db9 100644
--- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp
+++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp
@@ -235,15 +235,17 @@ const Stmt *ExprMutationAnalyzer::Analyzer::findMutationMemoized(
   if (Memoized != MemoizedResults.end())
     return Memoized->second;
 
+  // Assume Exp is not mutated before analyzing Exp.
+  MemoizedResults[Exp] = nullptr;
   if (isUnevaluated(Exp))
-    return MemoizedResults[Exp] = nullptr;
+    return nullptr;
 
   for (const auto &Finder : Finders) {
     if (const Stmt *S = (this->*Finder)(Exp))
       return MemoizedResults[Exp] = S;
   }
 
-  return MemoizedResults[Exp] = nullptr;
+  return nullptr;
 }
 
 const Stmt *
diff --git a/clang/lib/Analysis/FlowSensitive/ASTOps.cpp b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp
index 619bf772bba5..bd1676583ecc 100644
--- a/clang/lib/Analysis/FlowSensitive/ASTOps.cpp
+++ b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp
@@ -33,12 +33,20 @@ namespace clang::dataflow {
 
 const Expr &ignoreCFGOmittedNodes(const Expr &E) {
   const Expr *Current = &E;
-  if (auto *EWC = dyn_cast<ExprWithCleanups>(Current)) {
-    Current = EWC->getSubExpr();
+  const Expr *Last = nullptr;
+  while (Current != Last) {
+    Last = Current;
+    if (auto *EWC = dyn_cast<ExprWithCleanups>(Current)) {
+      Current = EWC->getSubExpr();
+      assert(Current != nullptr);
+    }
+    if (auto *CE = dyn_cast<ConstantExpr>(Current)) {
+      Current = CE->getSubExpr();
+      assert(Current != nullptr);
+    }
+    Current = Current->IgnoreParens();
     assert(Current != nullptr);
   }
-  Current = Current->IgnoreParens();
-  assert(Current != nullptr);
   return *Current;
 }
 
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 3cb656adcbdc..d79e73440289 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -157,7 +157,13 @@ static WidenResult widenDistinctValues(QualType Type, Value &Prev,
                                        Value &Current, Environment &CurrentEnv,
                                        Environment::ValueModel &Model) {
   // Boolean-model widening.
-  if (auto *PrevBool = dyn_cast<BoolValue>(&Prev)) {
+  if (isa<BoolValue>(Prev) && isa<BoolValue>(Current)) {
+    // FIXME: Checking both values should be unnecessary, but we can currently
+    // end up with `BoolValue`s in integer-typed variables. See comment in
+    // `joinDistinctValues()` for details.
+    auto &PrevBool = cast<BoolValue>(Prev);
+    auto &CurBool = cast<BoolValue>(Current);
+
     if (isa<TopBoolValue>(Prev))
       // Safe to return `Prev` here, because Top is never dependent on the
       // environment.
@@ -166,13 +172,12 @@ static WidenResult widenDistinctValues(QualType Type, Value &Prev,
     // We may need to widen to Top, but before we do so, check whether both
     // values are implied to be either true or false in the current environment.
     // In that case, we can simply return a literal instead.
-    auto &CurBool = cast<BoolValue>(Current);
-    bool TruePrev = PrevEnv.proves(PrevBool->formula());
+    bool TruePrev = PrevEnv.proves(PrevBool.formula());
     bool TrueCur = CurrentEnv.proves(CurBool.formula());
     if (TruePrev && TrueCur)
       return {&CurrentEnv.getBoolLiteralValue(true), LatticeEffect::Unchanged};
     if (!TruePrev && !TrueCur &&
-        PrevEnv.proves(PrevEnv.arena().makeNot(PrevBool->formula())) &&
+        PrevEnv.proves(PrevEnv.arena().makeNot(PrevBool.formula())) &&
         CurrentEnv.proves(CurrentEnv.arena().makeNot(CurBool.formula())))
       return {&CurrentEnv.getBoolLiteralValue(false), LatticeEffect::Unchanged};
 
@@ -333,6 +338,18 @@ public:
     }
   }
 
+  bool TraverseDecl(Decl *D) {
+    // Don't traverse nested record or function declarations.
+    // - We won't be analyzing code contained in these anyway
+    // - We don't model fields that are used only in these nested declaration,
+    //   so trying to propagate a result object to initializers of such fields
+    //   would cause an error.
+    if (isa_and_nonnull<RecordDecl>(D) || isa_and_nonnull<FunctionDecl>(D))
+      return true;
+
+    return RecursiveASTVisitor<ResultObjectVisitor>::TraverseDecl(D);
+  }
+
   bool TraverseBindingDecl(BindingDecl *BD) {
     // `RecursiveASTVisitor` doesn't traverse holding variables for
     // `BindingDecl`s by itself, so we need to tell it to.
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index 43fdfa5abcbb..fd224aeb79b1 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -41,7 +41,11 @@ namespace dataflow {
 
 const Environment *StmtToEnvMap::getEnvironment(const Stmt &S) const {
   auto BlockIt = ACFG.getStmtToBlock().find(&ignoreCFGOmittedNodes(S));
-  assert(BlockIt != ACFG.getStmtToBlock().end());
+  if (BlockIt == ACFG.getStmtToBlock().end()) {
+    assert(false);
+    // Return null to avoid dereferencing the end iterator in non-assert builds.
+    return nullptr;
+  }
   if (!ACFG.isBlockReachable(*BlockIt->getSecond()))
     return nullptr;
   if (BlockIt->getSecond()->getBlockID() == CurBlockID)
diff --git a/clang/lib/Basic/CMakeLists.txt b/clang/lib/Basic/CMakeLists.txt
index 2e218ba7c84c..824d4a0e2eee 100644
--- a/clang/lib/Basic/CMakeLists.txt
+++ b/clang/lib/Basic/CMakeLists.txt
@@ -130,6 +130,9 @@ add_clang_library(clangBasic
   DEPENDS
   omp_gen
   ClangDriverOptions
+  # These generated headers are included transitively.
+  ARMTargetParserTableGen
+  AArch64TargetParserTableGen
   )
 
 target_link_libraries(clangBasic
diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp
index cd520a6375e0..143c04309d07 100644
--- a/clang/lib/Basic/FileManager.cpp
+++ b/clang/lib/Basic/FileManager.cpp
@@ -39,12 +39,6 @@ using namespace clang;
 
 #define DEBUG_TYPE "file-search"
 
-ALWAYS_ENABLED_STATISTIC(NumDirLookups, "Number of directory lookups.");
-ALWAYS_ENABLED_STATISTIC(NumFileLookups, "Number of file lookups.");
-ALWAYS_ENABLED_STATISTIC(NumDirCacheMisses,
-                         "Number of directory cache misses.");
-ALWAYS_ENABLED_STATISTIC(NumFileCacheMisses, "Number of file cache misses.");
-
 //===----------------------------------------------------------------------===//
 // Common logic.
 //===----------------------------------------------------------------------===//
@@ -656,6 +650,14 @@ StringRef FileManager::getCanonicalName(const void *Entry, StringRef Name) {
   return CanonicalName;
 }
 
+void FileManager::AddStats(const FileManager &Other) {
+  assert(&Other != this && "Collecting stats into the same FileManager");
+  NumDirLookups += Other.NumDirLookups;
+  NumFileLookups += Other.NumFileLookups;
+  NumDirCacheMisses += Other.NumDirCacheMisses;
+  NumFileCacheMisses += Other.NumFileCacheMisses;
+}
+
 void FileManager::PrintStats() const {
   llvm::errs() << "\n*** File Manager Stats:\n";
   llvm::errs() << UniqueRealFiles.size() << " real files found, "
diff --git a/clang/lib/Basic/Targets/ARM.h b/clang/lib/Basic/Targets/ARM.h
index e69adbe75473..df9855a52e61 100644
--- a/clang/lib/Basic/Targets/ARM.h
+++ b/clang/lib/Basic/Targets/ARM.h
@@ -225,6 +225,10 @@ public:
   bool hasBitIntType() const override { return true; }
 
   const char *getBFloat16Mangling() const override { return "u6__bf16"; };
+
+  std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+    return std::make_pair(getTriple().isArch64Bit() ? 256 : 64, 64);
+  }
 };
 
 class LLVM_LIBRARY_VISIBILITY ARMleTargetInfo : public ARMTargetInfo {
diff --git a/clang/lib/Basic/Targets/AVR.h b/clang/lib/Basic/Targets/AVR.h
index 9376c46cd98c..feeb04f37eeb 100644
--- a/clang/lib/Basic/Targets/AVR.h
+++ b/clang/lib/Basic/Targets/AVR.h
@@ -175,6 +175,10 @@ public:
   std::optional<std::string> handleAsmEscapedChar(char EscChar) const override;
   StringRef getABI() const override { return ABI; }
 
+  std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+    return std::make_pair(32, 32);
+  }
+
 protected:
   std::string CPU;
   StringRef ABI;
diff --git a/clang/lib/Basic/Targets/BPF.h b/clang/lib/Basic/Targets/BPF.h
index 489f29fc4fea..d19b37dd4df7 100644
--- a/clang/lib/Basic/Targets/BPF.h
+++ b/clang/lib/Basic/Targets/BPF.h
@@ -113,6 +113,10 @@ public:
     StringRef CPUName(Name);
     return isValidCPUName(CPUName);
   }
+
+  std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+    return std::make_pair(32, 32);
+  }
 };
 } // namespace targets
 } // namespace clang
diff --git a/clang/lib/Basic/Targets/M68k.h b/clang/lib/Basic/Targets/M68k.h
index 7ffa901127e5..b732add77e03 100644
--- a/clang/lib/Basic/Targets/M68k.h
+++ b/clang/lib/Basic/Targets/M68k.h
@@ -56,6 +56,10 @@ public:
   BuiltinVaListKind getBuiltinVaListKind() const override;
   bool setCPU(const std::string &Name) override;
   CallingConvCheckResult checkCallingConvention(CallingConv CC) const override;
+
+  std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+    return std::make_pair(32, 32);
+  }
 };
 
 } // namespace targets
diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h
index 0d6e4b4d0808..730deb674aa5 100644
--- a/clang/lib/Basic/Targets/Mips.h
+++ b/clang/lib/Basic/Targets/Mips.h
@@ -431,6 +431,10 @@ public:
 
   bool validateTarget(DiagnosticsEngine &Diags) const override;
   bool hasBitIntType() const override { return true; }
+
+  std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+    return std::make_pair(32, 32);
+  }
 };
 } // namespace targets
 } // namespace clang
diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
index 60bc1dec8f95..cd0f08dfb3bc 100644
--- a/clang/lib/Basic/Targets/PPC.h
+++ b/clang/lib/Basic/Targets/PPC.h
@@ -423,6 +423,10 @@ public:
     // This is the ELF definition
     return TargetInfo::PowerABIBuiltinVaList;
   }
+
+  std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+    return std::make_pair(32, 32);
+  }
 };
 
 // Note: ABI differences may eventually require us to have a separate
@@ -503,6 +507,10 @@ public:
       return CCCR_Warning;
     }
   }
+
+  std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+    return std::make_pair(128, 128);
+  }
 };
 
 class LLVM_LIBRARY_VISIBILITY AIXPPC32TargetInfo :
diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h
index 9fa42e75bbfd..d0e9cdc6da07 100644
--- a/clang/lib/Basic/Targets/RISCV.h
+++ b/clang/lib/Basic/Targets/RISCV.h
@@ -122,6 +122,10 @@ public:
   void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values) const override;
   bool supportsTargetAttributeTune() const override { return true; }
   ParsedTargetAttr parseTargetAttr(StringRef Str) const override;
+
+  std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+    return std::make_pair(32, 32);
+  }
 };
 class LLVM_LIBRARY_VISIBILITY RISCV32TargetInfo : public RISCVTargetInfo {
 public:
diff --git a/clang/lib/Basic/Targets/Sparc.h b/clang/lib/Basic/Targets/Sparc.h
index 214fef88e1dc..3357bee33e1a 100644
--- a/clang/lib/Basic/Targets/Sparc.h
+++ b/clang/lib/Basic/Targets/Sparc.h
@@ -140,6 +140,10 @@ public:
     CPU = getCPUKind(Name);
     return CPU != CK_GENERIC;
   }
+
+  std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+    return std::make_pair(32, 32);
+  }
 };
 
 // SPARC v8 is the 32-bit mode selected by Triple::sparc.
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
index 8e302acd51b8..73d3aa01a043 100644
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -220,6 +220,10 @@ public:
   int getEHDataRegisterNumber(unsigned RegNo) const override {
     return RegNo < 4 ? 6 + RegNo : -1;
   }
+
+  std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+    return std::make_pair(256, 256);
+  }
 };
 } // namespace targets
 } // namespace clang
diff --git a/clang/lib/Basic/Targets/WebAssembly.cpp b/clang/lib/Basic/Targets/WebAssembly.cpp
index d473fd190864..1f0418b21c1f 100644
--- a/clang/lib/Basic/Targets/WebAssembly.cpp
+++ b/clang/lib/Basic/Targets/WebAssembly.cpp
@@ -47,6 +47,7 @@ bool WebAssemblyTargetInfo::hasFeature(StringRef Feature) const {
   return llvm::StringSwitch<bool>(Feature)
       .Case("simd128", SIMDLevel >= SIMD128)
       .Case("relaxed-simd", SIMDLevel >= RelaxedSIMD)
+      .Case("half-precision", HasHalfPrecision)
       .Case("nontrapping-fptoint", HasNontrappingFPToInt)
       .Case("sign-ext", HasSignExt)
       .Case("exception-handling", HasExceptionHandling)
@@ -99,6 +100,8 @@ void WebAssemblyTargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__wasm_extended_const__");
   if (HasMultiMemory)
     Builder.defineMacro("__wasm_multimemory__");
+  if (HasHalfPrecision)
+    Builder.defineMacro("__wasm_half_precision__");
 
   Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
   Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
@@ -147,19 +150,26 @@ void WebAssemblyTargetInfo::setFeatureEnabled(llvm::StringMap<bool> &Features,
 bool WebAssemblyTargetInfo::initFeatureMap(
     llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
     const std::vector<std::string> &FeaturesVec) const {
-  if (CPU == "bleeding-edge") {
+  auto addGenericFeatures = [&]() {
+    Features["multivalue"] = true;
+    Features["mutable-globals"] = true;
+    Features["reference-types"] = true;
+    Features["sign-ext"] = true;
+  };
+  auto addBleedingEdgeFeatures = [&]() {
+    addGenericFeatures();
     Features["atomics"] = true;
     Features["bulk-memory"] = true;
     Features["multimemory"] = true;
-    Features["mutable-globals"] = true;
     Features["nontrapping-fptoint"] = true;
-    Features["reference-types"] = true;
-    Features["sign-ext"] = true;
     Features["tail-call"] = true;
+    Features["half-precision"] = true;
     setSIMDLevel(Features, SIMD128, true);
-  } else if (CPU == "generic") {
-    Features["mutable-globals"] = true;
-    Features["sign-ext"] = true;
+  };
+  if (CPU == "generic") {
+    addGenericFeatures();
+  } else if (CPU == "bleeding-edge") {
+    addBleedingEdgeFeatures();
   }
 
   return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
@@ -216,6 +226,15 @@ bool WebAssemblyTargetInfo::handleTargetFeatures(
       HasBulkMemory = false;
       continue;
     }
+    if (Feature == "+half-precision") {
+      SIMDLevel = std::max(SIMDLevel, SIMD128);
+      HasHalfPrecision = true;
+      continue;
+    }
+    if (Feature == "-half-precision") {
+      HasHalfPrecision = false;
+      continue;
+    }
     if (Feature == "+atomics") {
       HasAtomics = true;
       continue;
diff --git a/clang/lib/Basic/Targets/WebAssembly.h b/clang/lib/Basic/Targets/WebAssembly.h
index 5568aa28eaef..e4c18879182e 100644
--- a/clang/lib/Basic/Targets/WebAssembly.h
+++ b/clang/lib/Basic/Targets/WebAssembly.h
@@ -64,6 +64,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyTargetInfo : public TargetInfo {
   bool HasReferenceTypes = false;
   bool HasExtendedConst = false;
   bool HasMultiMemory = false;
+  bool HasHalfPrecision = false;
 
   std::string ABI;
 
diff --git a/clang/lib/CIR/CMakeLists.txt b/clang/lib/CIR/CMakeLists.txt
index e69de29bb2d1..d2ff200e0da5 100644
--- a/clang/lib/CIR/CMakeLists.txt
+++ b/clang/lib/CIR/CMakeLists.txt
@@ -0,0 +1,4 @@
+include_directories(${LLVM_MAIN_SRC_DIR}/../mlir/include)
+include_directories(${CMAKE_BINARY_DIR}/tools/mlir/include)
+
+add_subdirectory(Dialect)
diff --git a/clang/lib/CIR/Dialect/CMakeLists.txt b/clang/lib/CIR/Dialect/CMakeLists.txt
new file mode 100644
index 000000000000..f33061b2d87c
--- /dev/null
+++ b/clang/lib/CIR/Dialect/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(IR)
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
new file mode 100644
index 000000000000..c2829c3ff2af
--- /dev/null
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -0,0 +1,13 @@
+//===- CIRDialect.cpp - MLIR CIR ops implementation -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CIR dialect and its operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include <clang/CIR/Dialect/IR/CIRDialect.h>
diff --git a/clang/lib/CIR/Dialect/IR/CMakeLists.txt b/clang/lib/CIR/Dialect/IR/CMakeLists.txt
new file mode 100644
index 000000000000..0d7476b55570
--- /dev/null
+++ b/clang/lib/CIR/Dialect/IR/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_clang_library(MLIRCIR
+  CIRDialect.cpp
+  )
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 7e5f2edfc732..a370734e00d3 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -822,8 +822,9 @@ CodeGenFunction::evaluateOrEmitBuiltinObjectSize(const Expr *E, unsigned Type,
   return ConstantInt::get(ResType, ObjectSize, /*isSigned=*/true);
 }
 
-const FieldDecl *CodeGenFunction::FindFlexibleArrayMemberField(
-    ASTContext &Ctx, const RecordDecl *RD, StringRef Name, uint64_t &Offset) {
+const FieldDecl *CodeGenFunction::FindFlexibleArrayMemberFieldAndOffset(
+    ASTContext &Ctx, const RecordDecl *RD, const FieldDecl *FAMDecl,
+    uint64_t &Offset) {
   const LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
       getLangOpts().getStrictFlexArraysLevel();
   uint32_t FieldNo = 0;
@@ -832,7 +833,7 @@ const FieldDecl *CodeGenFunction::FindFlexibleArrayMemberField(
     return nullptr;
 
   for (const FieldDecl *FD : RD->fields()) {
-    if ((Name.empty() || FD->getNameAsString() == Name) &&
+    if ((!FAMDecl || FD == FAMDecl) &&
         Decl::isFlexibleArrayMemberLike(
             Ctx, FD, FD->getType(), StrictFlexArraysLevel,
             /*IgnoreTemplateOrMacroSubstitution=*/true)) {
@@ -843,8 +844,8 @@ const FieldDecl *CodeGenFunction::FindFlexibleArrayMemberField(
 
     QualType Ty = FD->getType();
     if (Ty->isRecordType()) {
-      if (const FieldDecl *Field = FindFlexibleArrayMemberField(
-              Ctx, Ty->getAsRecordDecl(), Name, Offset)) {
+      if (const FieldDecl *Field = FindFlexibleArrayMemberFieldAndOffset(
+              Ctx, Ty->getAsRecordDecl(), FAMDecl, Offset)) {
         const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
         Offset += Layout.getFieldOffset(FieldNo);
         return Field;
@@ -930,12 +931,14 @@ CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type,
 
   // Get the flexible array member Decl.
   const RecordDecl *OuterRD = nullptr;
-  std::string FAMName;
+  const FieldDecl *FAMDecl = nullptr;
   if (const auto *ME = dyn_cast<MemberExpr>(Base)) {
     // Check if \p Base is referencing the FAM itself.
     const ValueDecl *VD = ME->getMemberDecl();
     OuterRD = VD->getDeclContext()->getOuterLexicalRecordContext();
-    FAMName = VD->getNameAsString();
+    FAMDecl = dyn_cast<FieldDecl>(VD);
+    if (!FAMDecl)
+      return nullptr;
   } else if (const auto *DRE = dyn_cast<DeclRefExpr>(Base)) {
     // Check if we're pointing to the whole struct.
     QualType Ty = DRE->getDecl()->getType();
@@ -974,9 +977,11 @@ CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type,
   if (!OuterRD)
     return nullptr;
 
+  // We call FindFlexibleArrayMemberAndOffset even if FAMDecl is non-null to
+  // get its offset.
   uint64_t Offset = 0;
-  const FieldDecl *FAMDecl =
-      FindFlexibleArrayMemberField(Ctx, OuterRD, FAMName, Offset);
+  FAMDecl =
+      FindFlexibleArrayMemberFieldAndOffset(Ctx, OuterRD, FAMDecl, Offset);
   Offset = Ctx.toCharUnitsFromBits(Offset).getQuantity();
 
   if (!FAMDecl || !FAMDecl->getType()->isCountAttributedType())
@@ -3623,7 +3628,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     // frexpl instead of legalizing this type in the BE.
     if (&getTarget().getLongDoubleFormat() == &llvm::APFloat::PPCDoubleDouble())
       break;
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   }
   case Builtin::BI__builtin_frexp:
   case Builtin::BI__builtin_frexpf:
@@ -3880,9 +3885,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   }
 
   case Builtin::BI__builtin_reduce_max: {
-    auto GetIntrinsicID = [](QualType QT) {
+    auto GetIntrinsicID = [this](QualType QT) {
       if (auto *VecTy = QT->getAs<VectorType>())
         QT = VecTy->getElementType();
+      else if (QT->isSizelessVectorType())
+        QT = QT->getSizelessVectorEltType(CGM.getContext());
+
       if (QT->isSignedIntegerType())
         return llvm::Intrinsic::vector_reduce_smax;
       if (QT->isUnsignedIntegerType())
@@ -3895,9 +3903,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   }
 
   case Builtin::BI__builtin_reduce_min: {
-    auto GetIntrinsicID = [](QualType QT) {
+    auto GetIntrinsicID = [this](QualType QT) {
       if (auto *VecTy = QT->getAs<VectorType>())
         QT = VecTy->getElementType();
+      else if (QT->isSizelessVectorType())
+        QT = QT->getSizelessVectorEltType(CGM.getContext());
+
       if (QT->isSignedIntegerType())
         return llvm::Intrinsic::vector_reduce_smin;
       if (QT->isUnsignedIntegerType())
@@ -5361,7 +5372,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     case Builtin::BI__builtin_ptrauth_auth_and_resign:
       if (Args[4]->getType()->isPointerTy())
         Args[4] = Builder.CreatePtrToInt(Args[4], IntPtrTy);
-      LLVM_FALLTHROUGH;
+      [[fallthrough]];
 
     case Builtin::BI__builtin_ptrauth_auth:
     case Builtin::BI__builtin_ptrauth_sign_unauthenticated:
@@ -18845,7 +18856,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
       AppendFalseForOpselArg = true;
-      LLVM_FALLTHROUGH;
+      [[fallthrough]];
     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
       ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
@@ -18854,7 +18865,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
       AppendFalseForOpselArg = true;
-      LLVM_FALLTHROUGH;
+      [[fallthrough]];
     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
       ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 370642cb3d53..670bc4bf72ce 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -424,6 +424,33 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
       CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
   CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
                LaunchKernelArgs);
+
+  // To prevent CUDA device stub functions from being merged by ICF in MSVC
+  // environment, create an unique global variable for each kernel and write to
+  // the variable in the device stub.
+  if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+      !CGF.getLangOpts().HIP) {
+    llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
+    std::string GlobalVarName = (KernelFunction->getName() + ".id").str();
+
+    llvm::GlobalVariable *HandleVar =
+        CGM.getModule().getNamedGlobal(GlobalVarName);
+    if (!HandleVar) {
+      HandleVar = new llvm::GlobalVariable(
+          CGM.getModule(), CGM.Int8Ty,
+          /*Constant=*/false, KernelFunction->getLinkage(),
+          llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
+      HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
+      HandleVar->setVisibility(KernelFunction->getVisibility());
+      if (KernelFunction->hasComdat())
+        HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
+    }
+
+    CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
+                                   HandleVar, CharUnits::One(),
+                                   /*IsVolatile=*/true);
+  }
+
   CGF.EmitBranch(EndBlock);
 
   CGF.EmitBlock(EndBlock);
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 45033ced1d83..525cd10c9a21 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1586,6 +1586,11 @@ bool CodeGenModule::ReturnTypeUsesSRet(const CGFunctionInfo &FI) {
   return RI.isIndirect() || (RI.isInAlloca() && RI.getInAllocaSRet());
 }
 
+bool CodeGenModule::ReturnTypeHasInReg(const CGFunctionInfo &FI) {
+  const auto &RI = FI.getReturnInfo();
+  return RI.getInReg();
+}
+
 bool CodeGenModule::ReturnSlotInterferesWithArgs(const CGFunctionInfo &FI) {
   return ReturnTypeUsesSRet(FI) &&
          getTargetCodeGenInfo().doesReturnSlotInterfereWithArgs();
@@ -4694,11 +4699,11 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E,
     AggValueSlot Slot = args.isUsingInAlloca()
         ? createPlaceholderSlot(*this, type) : CreateAggTemp(type, "agg.tmp");
 
-    bool DestroyedInCallee = true, NeedsEHCleanup = true;
+    bool DestroyedInCallee = true, NeedsCleanup = true;
     if (const auto *RD = type->getAsCXXRecordDecl())
       DestroyedInCallee = RD->hasNonTrivialDestructor();
     else
-      NeedsEHCleanup = needsEHCleanup(type.isDestructedType());
+      NeedsCleanup = type.isDestructedType();
 
     if (DestroyedInCallee)
       Slot.setExternallyDestructed();
@@ -4707,14 +4712,15 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E,
     RValue RV = Slot.asRValue();
     args.add(RV, type);
 
-    if (DestroyedInCallee && NeedsEHCleanup) {
+    if (DestroyedInCallee && NeedsCleanup) {
       // Create a no-op GEP between the placeholder and the cleanup so we can
       // RAUW it successfully.  It also serves as a marker of the first
       // instruction where the cleanup is active.
-      pushFullExprCleanup<DestroyUnpassedArg>(EHCleanup, Slot.getAddress(),
-                                              type);
+      pushFullExprCleanup<DestroyUnpassedArg>(NormalAndEHCleanup,
+                                              Slot.getAddress(), type);
       // This unreachable is a temporary marker which will be removed later.
-      llvm::Instruction *IsActive = Builder.CreateUnreachable();
+      llvm::Instruction *IsActive =
+          Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy));
       args.addArgCleanupDeactivation(EHStack.stable_begin(), IsActive);
     }
     return;
@@ -5016,6 +5022,11 @@ static unsigned getMaxVectorWidth(const llvm::Type *Ty) {
   return MaxVectorWidth;
 }
 
+static bool isCXXDeclType(const FunctionDecl *FD) {
+  return isa<CXXConstructorDecl>(FD) || isa<CXXMethodDecl>(FD) ||
+         isa<CXXDestructorDecl>(FD);
+}
+
 RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
                                  const CGCallee &Callee,
                                  ReturnValueSlot ReturnValue,
@@ -5689,32 +5700,29 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   Attrs = AllocAlignAttrEmitter.TryEmitAsCallSiteAttribute(Attrs);
 
   if (CGM.getCodeGenOpts().CallGraphSection) {
-    // FIXME: create operand bundle only for indirect calls, not for all
-
-    assert((TargetDecl && TargetDecl->getFunctionType() ||
-            Callee.getAbstractInfo().getCalleeFunctionProtoType()) &&
-           "cannot find callsite type");
-
-    QualType CST;
-    if (TargetDecl && TargetDecl->getFunctionType())
-      CST = QualType(TargetDecl->getFunctionType(), 0);
-    else if (const auto *FPT =
-                 Callee.getAbstractInfo().getCalleeFunctionProtoType())
-      CST = QualType(FPT, 0);
-
-    if (!CST.isNull()) {
-      auto *TypeIdMD = CGM.CreateMetadataIdentifierGeneralized(CST);
-      auto *TypeIdMDVal =
-          llvm::MetadataAsValue::get(getLLVMContext(), TypeIdMD);
-      BundleList.emplace_back("type", TypeIdMDVal);
-    }
-
-    // Set type identifier metadata of indirect calls for call graph section.
+    // Create operand bundle only for indirect calls, not for all
     if (callOrInvoke && *callOrInvoke && (*callOrInvoke)->isIndirectCall()) {
+      assert((TargetDecl && TargetDecl->getFunctionType() ||
+              Callee.getAbstractInfo().getCalleeFunctionProtoType()) &&
+             "cannot find callsite type");
+      QualType CST;
+      if (TargetDecl && TargetDecl->getFunctionType())
+        CST = QualType(TargetDecl->getFunctionType(), 0);
+      else if (const auto *FPT =
+                   Callee.getAbstractInfo().getCalleeFunctionProtoType())
+        CST = QualType(FPT, 0);
+
+      if (!CST.isNull()) {
+        auto *TypeIdMD = CGM.CreateMetadataIdentifierGeneralized(CST);
+        auto *TypeIdMDVal =
+            llvm::MetadataAsValue::get(getLLVMContext(), TypeIdMD);
+        BundleList.emplace_back("type", TypeIdMDVal);
+      }
+
+      // Set type identifier metadata of indirect calls for call graph section.
       if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(TargetDecl)) {
         // Type id metadata is set only for C/C++ contexts.
-        if (isa<CXXConstructorDecl>(FD) || isa<CXXMethodDecl>(FD) ||
-            isa<CXXDestructorDecl>(FD)) {
+        if (isCXXDeclType(FD)) {
           CGM.CreateFunctionTypeMetadataForIcall(FD->getType(), *callOrInvoke);
         }
       }
diff --git a/clang/lib/CodeGen/CGCleanup.cpp b/clang/lib/CodeGen/CGCleanup.cpp
index e6f8e6873004..469e0363b744 100644
--- a/clang/lib/CodeGen/CGCleanup.cpp
+++ b/clang/lib/CodeGen/CGCleanup.cpp
@@ -634,12 +634,19 @@ static void destroyOptimisticNormalEntry(CodeGenFunction &CGF,
 /// Pops a cleanup block.  If the block includes a normal cleanup, the
 /// current insertion point is threaded through the cleanup, as are
 /// any branch fixups on the cleanup.
-void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
+void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough,
+                                      bool ForDeactivation) {
   assert(!EHStack.empty() && "cleanup stack is empty!");
   assert(isa<EHCleanupScope>(*EHStack.begin()) && "top not a cleanup!");
   EHCleanupScope &Scope = cast<EHCleanupScope>(*EHStack.begin());
   assert(Scope.getFixupDepth() <= EHStack.getNumBranchFixups());
 
+  // If we are deactivating a normal cleanup, we need to pretend that the
+  // fallthrough is unreachable. We restore this IP before returning.
+  CGBuilderTy::InsertPoint NormalDeactivateOrigIP;
+  if (ForDeactivation && (Scope.isNormalCleanup() || !getLangOpts().EHAsynch)) {
+    NormalDeactivateOrigIP = Builder.saveAndClearIP();
+  }
   // Remember activation information.
   bool IsActive = Scope.isActive();
   Address NormalActiveFlag =
@@ -667,7 +674,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
 
   // - whether there's a fallthrough
   llvm::BasicBlock *FallthroughSource = Builder.GetInsertBlock();
-  bool HasFallthrough = (FallthroughSource != nullptr && IsActive);
+  bool HasFallthrough =
+      FallthroughSource != nullptr && (IsActive || HasExistingBranches);
 
   // Branch-through fall-throughs leave the insertion point set to the
   // end of the last cleanup, which points to the current scope.  The
@@ -692,7 +700,11 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
 
   // If we have a prebranched fallthrough into an inactive normal
   // cleanup, rewrite it so that it leads to the appropriate place.
-  if (Scope.isNormalCleanup() && HasPrebranchedFallthrough && !IsActive) {
+  if (Scope.isNormalCleanup() && HasPrebranchedFallthrough &&
+      !RequiresNormalCleanup) {
+    // FIXME: Come up with a program which would need forwarding prebranched
+    // fallthrough and add tests. Otherwise delete this and assert against it.
+    assert(!IsActive);
     llvm::BasicBlock *prebranchDest;
 
     // If the prebranch is semantically branching through the next
@@ -724,6 +736,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
     EHStack.popCleanup(); // safe because there are no fixups
     assert(EHStack.getNumBranchFixups() == 0 ||
            EHStack.hasNormalCleanups());
+    if (NormalDeactivateOrigIP.isSet())
+      Builder.restoreIP(NormalDeactivateOrigIP);
     return;
   }
 
@@ -760,11 +774,19 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
   if (!RequiresNormalCleanup) {
     // Mark CPP scope end for passed-by-value Arg temp
     //   per Windows ABI which is "normally" Cleanup in callee
-    if (IsEHa && getInvokeDest() && Builder.GetInsertBlock()) {
-      if (Personality.isMSVCXXPersonality())
+    if (IsEHa && getInvokeDest()) {
+      // If we are deactivating a normal cleanup then we don't have a
+      // fallthrough. Restore original IP to emit CPP scope ends in the correct
+      // block.
+      if (NormalDeactivateOrigIP.isSet())
+        Builder.restoreIP(NormalDeactivateOrigIP);
+      if (Personality.isMSVCXXPersonality() && Builder.GetInsertBlock())
         EmitSehCppScopeEnd();
+      if (NormalDeactivateOrigIP.isSet())
+        NormalDeactivateOrigIP = Builder.saveAndClearIP();
     }
     destroyOptimisticNormalEntry(*this, Scope);
+    Scope.MarkEmitted();
     EHStack.popCleanup();
   } else {
     // If we have a fallthrough and no other need for the cleanup,
@@ -781,6 +803,7 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
       }
 
       destroyOptimisticNormalEntry(*this, Scope);
+      Scope.MarkEmitted();
       EHStack.popCleanup();
 
       EmitCleanup(*this, Fn, cleanupFlags, NormalActiveFlag);
@@ -916,6 +939,7 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
       }
 
       // IV.  Pop the cleanup and emit it.
+      Scope.MarkEmitted();
       EHStack.popCleanup();
       assert(EHStack.hasNormalCleanups() == HasEnclosingCleanups);
 
@@ -984,6 +1008,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
     }
   }
 
+  if (NormalDeactivateOrigIP.isSet())
+    Builder.restoreIP(NormalDeactivateOrigIP);
   assert(EHStack.hasNormalCleanups() || EHStack.getNumBranchFixups() == 0);
 
   // Emit the EH cleanup if required.
@@ -1143,25 +1169,6 @@ void CodeGenFunction::EmitBranchThroughCleanup(JumpDest Dest) {
   Builder.ClearInsertionPoint();
 }
 
-static bool IsUsedAsNormalCleanup(EHScopeStack &EHStack,
-                                  EHScopeStack::stable_iterator C) {
-  // If we needed a normal block for any reason, that counts.
-  if (cast<EHCleanupScope>(*EHStack.find(C)).getNormalBlock())
-    return true;
-
-  // Check whether any enclosed cleanups were needed.
-  for (EHScopeStack::stable_iterator
-         I = EHStack.getInnermostNormalCleanup();
-         I != C; ) {
-    assert(C.strictlyEncloses(I));
-    EHCleanupScope &S = cast<EHCleanupScope>(*EHStack.find(I));
-    if (S.getNormalBlock()) return true;
-    I = S.getEnclosingNormalCleanup();
-  }
-
-  return false;
-}
-
 static bool IsUsedAsEHCleanup(EHScopeStack &EHStack,
                               EHScopeStack::stable_iterator cleanup) {
   // If we needed an EH block for any reason, that counts.
@@ -1210,8 +1217,7 @@ static void SetupCleanupBlockActivation(CodeGenFunction &CGF,
   // Calculate whether the cleanup was used:
 
   //   - as a normal cleanup
-  if (Scope.isNormalCleanup() &&
-      (isActivatedInConditional || IsUsedAsNormalCleanup(CGF.EHStack, C))) {
+  if (Scope.isNormalCleanup()) {
     Scope.setTestFlagInNormalCleanup();
     needFlag = true;
   }
@@ -1224,13 +1230,16 @@ static void SetupCleanupBlockActivation(CodeGenFunction &CGF,
   }
 
   // If it hasn't yet been used as either, we're done.
-  if (!needFlag) return;
+  if (!needFlag)
+    return;
 
   Address var = Scope.getActiveFlag();
   if (!var.isValid()) {
+    CodeGenFunction::AllocaTrackerRAII AllocaTracker(CGF);
     var = CGF.CreateTempAlloca(CGF.Builder.getInt1Ty(), CharUnits::One(),
                                "cleanup.isactive");
     Scope.setActiveFlag(var);
+    Scope.AddAuxAllocas(AllocaTracker.Take());
 
     assert(dominatingIP && "no existing variable and no dominating IP!");
 
@@ -1273,17 +1282,8 @@ void CodeGenFunction::DeactivateCleanupBlock(EHScopeStack::stable_iterator C,
   // to the current RunCleanupsScope.
   if (C == EHStack.stable_begin() &&
       CurrentCleanupScopeDepth.strictlyEncloses(C)) {
-    // Per comment below, checking EHAsynch is not really necessary
-    // it's there to assure zero-impact w/o EHAsynch option
-    if (!Scope.isNormalCleanup() && getLangOpts().EHAsynch) {
-      PopCleanupBlock();
-    } else {
-      // If it's a normal cleanup, we need to pretend that the
-      // fallthrough is unreachable.
-      CGBuilderTy::InsertPoint SavedIP = Builder.saveAndClearIP();
-      PopCleanupBlock();
-      Builder.restoreIP(SavedIP);
-    }
+    PopCleanupBlock(/*FallthroughIsBranchThrough=*/false,
+                    /*ForDeactivation=*/true);
     return;
   }
 
diff --git a/clang/lib/CodeGen/CGCleanup.h b/clang/lib/CodeGen/CGCleanup.h
index 03e4a29d7b3d..c73c97146abc 100644
--- a/clang/lib/CodeGen/CGCleanup.h
+++ b/clang/lib/CodeGen/CGCleanup.h
@@ -16,8 +16,11 @@
 #include "EHScopeStack.h"
 
 #include "Address.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Instruction.h"
 
 namespace llvm {
 class BasicBlock;
@@ -266,6 +269,51 @@ class alignas(8) EHCleanupScope : public EHScope {
   };
   mutable struct ExtInfo *ExtInfo;
 
+  /// Erases auxillary allocas and their usages for an unused cleanup.
+  /// Cleanups should mark these allocas as 'used' if the cleanup is
+  /// emitted, otherwise these instructions would be erased.
+  struct AuxillaryAllocas {
+    SmallVector<llvm::Instruction *, 1> AuxAllocas;
+    bool used = false;
+
+    // Records a potentially unused instruction to be erased later.
+    void Add(llvm::AllocaInst *Alloca) { AuxAllocas.push_back(Alloca); }
+
+    // Mark all recorded instructions as used. These will not be erased later.
+    void MarkUsed() {
+      used = true;
+      AuxAllocas.clear();
+    }
+
+    ~AuxillaryAllocas() {
+      if (used)
+        return;
+      llvm::SetVector<llvm::Instruction *> Uses;
+      for (auto *Inst : llvm::reverse(AuxAllocas))
+        CollectUses(Inst, Uses);
+      // Delete uses in the reverse order of insertion.
+      for (auto *I : llvm::reverse(Uses))
+        I->eraseFromParent();
+    }
+
+  private:
+    void CollectUses(llvm::Instruction *I,
+                     llvm::SetVector<llvm::Instruction *> &Uses) {
+      if (!I || !Uses.insert(I))
+        return;
+      for (auto *User : I->users())
+        CollectUses(cast<llvm::Instruction>(User), Uses);
+    }
+  };
+  mutable struct AuxillaryAllocas *AuxAllocas;
+
+  AuxillaryAllocas &getAuxillaryAllocas() {
+    if (!AuxAllocas) {
+      AuxAllocas = new struct AuxillaryAllocas();
+    }
+    return *AuxAllocas;
+  }
+
   /// The number of fixups required by enclosing scopes (not including
   /// this one).  If this is the top cleanup scope, all the fixups
   /// from this index onwards belong to this scope.
@@ -298,7 +346,7 @@ public:
                  EHScopeStack::stable_iterator enclosingEH)
       : EHScope(EHScope::Cleanup, enclosingEH),
         EnclosingNormal(enclosingNormal), NormalBlock(nullptr),
-        ActiveFlag(Address::invalid()), ExtInfo(nullptr),
+        ActiveFlag(Address::invalid()), ExtInfo(nullptr), AuxAllocas(nullptr),
         FixupDepth(fixupDepth) {
     CleanupBits.IsNormalCleanup = isNormal;
     CleanupBits.IsEHCleanup = isEH;
@@ -312,8 +360,15 @@ public:
   }
 
   void Destroy() {
+    if (AuxAllocas)
+      delete AuxAllocas;
     delete ExtInfo;
   }
+  void AddAuxAllocas(llvm::SmallVector<llvm::AllocaInst *> Allocas) {
+    for (auto *Alloca : Allocas)
+      getAuxillaryAllocas().Add(Alloca);
+  }
+  void MarkEmitted() { getAuxillaryAllocas().MarkUsed(); }
   // Objects of EHCleanupScope are not destructed. Use Destroy().
   ~EHCleanupScope() = delete;
 
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 539ded5cca5e..fac278f0e20a 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -1372,7 +1372,28 @@ llvm::DIType *CGDebugInfo::CreateType(const TemplateSpecializationType *Ty,
 
   SourceLocation Loc = AliasDecl->getLocation();
 
-  if (CGM.getCodeGenOpts().DebugTemplateAlias) {
+  if (CGM.getCodeGenOpts().DebugTemplateAlias &&
+      // FIXME: This is a workaround for the issue
+      //        https://github.com/llvm/llvm-project/issues/89774
+      // The TemplateSpecializationType doesn't contain any instantiation
+      // information; dependent template arguments can't be resolved. For now,
+      // fall back to DW_TAG_typedefs for template aliases that are
+      // instantiation dependent, e.g.:
+      // ```
+      // template <int>
+      // using A = int;
+      //
+      // template<int I>
+      // struct S {
+      //   using AA = A<I>; // Instantiation dependent.
+      //   AA aa;
+      // };
+      //
+      // S<0> s;
+      // ```
+      // S::AA's underlying type A<I> is dependent on I so will be emitted as a
+      // DW_TAG_typedef.
+      !Ty->isInstantiationDependentType()) {
     auto ArgVector = ::GetTemplateArgs(TD, Ty);
     TemplateArgs Args = {TD->getTemplateParameters(), ArgVector};
 
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index ce6d6d895607..9cc67cdbe424 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -19,6 +19,7 @@
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "ConstantEmitter.h"
+#include "EHScopeStack.h"
 #include "PatternInit.h"
 #include "TargetInfo.h"
 #include "clang/AST/ASTContext.h"
@@ -35,6 +36,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Type.h"
 #include <optional>
@@ -2201,6 +2203,27 @@ void CodeGenFunction::pushDestroy(CleanupKind cleanupKind, Address addr,
                                      destroyer, useEHCleanupForArray);
 }
 
+// Pushes a destroy and defers its deactivation until its
+// CleanupDeactivationScope is exited.
+void CodeGenFunction::pushDestroyAndDeferDeactivation(
+    QualType::DestructionKind dtorKind, Address addr, QualType type) {
+  assert(dtorKind && "cannot push destructor for trivial type");
+
+  CleanupKind cleanupKind = getCleanupKind(dtorKind);
+  pushDestroyAndDeferDeactivation(
+      cleanupKind, addr, type, getDestroyer(dtorKind), cleanupKind & EHCleanup);
+}
+
+void CodeGenFunction::pushDestroyAndDeferDeactivation(
+    CleanupKind cleanupKind, Address addr, QualType type, Destroyer *destroyer,
+    bool useEHCleanupForArray) {
+  llvm::Instruction *DominatingIP =
+      Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy));
+  pushDestroy(cleanupKind, addr, type, destroyer, useEHCleanupForArray);
+  DeferredDeactivationCleanupStack.push_back(
+      {EHStack.stable_begin(), DominatingIP});
+}
+
 void CodeGenFunction::pushStackRestore(CleanupKind Kind, Address SPMem) {
   EHStack.pushCleanup<CallStackRestore>(Kind, SPMem);
 }
@@ -2217,39 +2240,48 @@ void CodeGenFunction::pushLifetimeExtendedDestroy(CleanupKind cleanupKind,
   // If we're not in a conditional branch, we don't need to bother generating a
   // conditional cleanup.
   if (!isInConditionalBranch()) {
-    // Push an EH-only cleanup for the object now.
     // FIXME: When popping normal cleanups, we need to keep this EH cleanup
     // around in case a temporary's destructor throws an exception.
-    if (cleanupKind & EHCleanup)
-      EHStack.pushCleanup<DestroyObject>(
-          static_cast<CleanupKind>(cleanupKind & ~NormalCleanup), addr, type,
-          destroyer, useEHCleanupForArray);
 
+    // Add the cleanup to the EHStack. After the full-expr, this would be
+    // deactivated before being popped from the stack.
+    pushDestroyAndDeferDeactivation(cleanupKind, addr, type, destroyer,
+                                    useEHCleanupForArray);
+
+    // Since this is lifetime-extended, push it once again to the EHStack after
+    // the full expression.
     return pushCleanupAfterFullExprWithActiveFlag<DestroyObject>(
-        cleanupKind, Address::invalid(), addr, type, destroyer, useEHCleanupForArray);
+        cleanupKind, Address::invalid(), addr, type, destroyer,
+        useEHCleanupForArray);
   }
 
   // Otherwise, we should only destroy the object if it's been initialized.
-  // Re-use the active flag and saved address across both the EH and end of
-  // scope cleanups.
 
-  using SavedType = typename DominatingValue<Address>::saved_type;
   using ConditionalCleanupType =
       EHScopeStack::ConditionalCleanup<DestroyObject, Address, QualType,
                                        Destroyer *, bool>;
-
-  Address ActiveFlag = createCleanupActiveFlag();
-  SavedType SavedAddr = saveValueInCond(addr);
-
-  if (cleanupKind & EHCleanup) {
-    EHStack.pushCleanup<ConditionalCleanupType>(
-        static_cast<CleanupKind>(cleanupKind & ~NormalCleanup), SavedAddr, type,
-        destroyer, useEHCleanupForArray);
-    initFullExprCleanupWithFlag(ActiveFlag);
-  }
-
+  DominatingValue<Address>::saved_type SavedAddr = saveValueInCond(addr);
+
+  // Remember to emit cleanup if we branch-out before end of full-expression
+  // (eg: through stmt-expr or coro suspensions).
+  AllocaTrackerRAII DeactivationAllocas(*this);
+  Address ActiveFlagForDeactivation = createCleanupActiveFlag();
+
+  pushCleanupAndDeferDeactivation<ConditionalCleanupType>(
+      cleanupKind, SavedAddr, type, destroyer, useEHCleanupForArray);
+  initFullExprCleanupWithFlag(ActiveFlagForDeactivation);
+  EHCleanupScope &cleanup = cast<EHCleanupScope>(*EHStack.begin());
+  // Erase the active flag if the cleanup was not emitted.
+  cleanup.AddAuxAllocas(std::move(DeactivationAllocas).Take());
+
+  // Since this is lifetime-extended, push it once again to the EHStack after
+  // the full expression.
+  // The previous active flag would always be 'false' due to forced deferred
+  // deactivation. Use a separate flag for lifetime-extension to correctly
+  // remember if this branch was taken and the object was initialized.
+  Address ActiveFlagForLifetimeExt = createCleanupActiveFlag();
   pushCleanupAfterFullExprWithActiveFlag<ConditionalCleanupType>(
-      cleanupKind, ActiveFlag, SavedAddr, type, destroyer,
+      cleanupKind, ActiveFlagForLifetimeExt, SavedAddr, type, destroyer,
       useEHCleanupForArray);
 }
 
@@ -2442,9 +2474,9 @@ namespace {
   };
 } // end anonymous namespace
 
-/// pushIrregularPartialArrayCleanup - Push an EH cleanup to destroy
-/// already-constructed elements of the given array.  The cleanup
-/// may be popped with DeactivateCleanupBlock or PopCleanupBlock.
+/// pushIrregularPartialArrayCleanup - Push a NormalAndEHCleanup to
+/// destroy already-constructed elements of the given array.  The cleanup may be
+/// popped with DeactivateCleanupBlock or PopCleanupBlock.
 ///
 /// \param elementType - the immediate element type of the array;
 ///   possibly still an array type
@@ -2453,10 +2485,9 @@ void CodeGenFunction::pushIrregularPartialArrayCleanup(llvm::Value *arrayBegin,
                                                        QualType elementType,
                                                        CharUnits elementAlign,
                                                        Destroyer *destroyer) {
-  pushFullExprCleanup<IrregularPartialArrayDestroy>(EHCleanup,
-                                                    arrayBegin, arrayEndPointer,
-                                                    elementType, elementAlign,
-                                                    destroyer);
+  pushFullExprCleanup<IrregularPartialArrayDestroy>(
+      NormalAndEHCleanup, arrayBegin, arrayEndPointer, elementType,
+      elementAlign, destroyer);
 }
 
 /// pushRegularPartialArrayCleanup - Push an EH cleanup to destroy
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index ac8df0052361..c31df538d61c 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -115,10 +115,16 @@ RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, CharUnits Align,
 llvm::AllocaInst *CodeGenFunction::CreateTempAlloca(llvm::Type *Ty,
                                                     const Twine &Name,
                                                     llvm::Value *ArraySize) {
+  llvm::AllocaInst *Alloca;
   if (ArraySize)
-    return Builder.CreateAlloca(Ty, ArraySize, Name);
-  return new llvm::AllocaInst(Ty, CGM.getDataLayout().getAllocaAddrSpace(),
-                              ArraySize, Name, AllocaInsertPt);
+    Alloca = Builder.CreateAlloca(Ty, ArraySize, Name);
+  else
+    Alloca = new llvm::AllocaInst(Ty, CGM.getDataLayout().getAllocaAddrSpace(),
+                                  ArraySize, Name, AllocaInsertPt);
+  if (Allocas) {
+    Allocas->Add(Alloca);
+  }
+  return Alloca;
 }
 
 /// CreateDefaultAlignTempAlloca - This creates an alloca with the
@@ -1621,8 +1627,8 @@ LValue CodeGenFunction::EmitLValueHelper(const Expr *E,
     return EmitArraySubscriptExpr(cast<ArraySubscriptExpr>(E));
   case Expr::MatrixSubscriptExprClass:
     return EmitMatrixSubscriptExpr(cast<MatrixSubscriptExpr>(E));
-  case Expr::OMPArraySectionExprClass:
-    return EmitOMPArraySectionExpr(cast<OMPArraySectionExpr>(E));
+  case Expr::ArraySectionExprClass:
+    return EmitArraySectionExpr(cast<ArraySectionExpr>(E));
   case Expr::ExtVectorElementExprClass:
     return EmitExtVectorElementExpr(cast<ExtVectorElementExpr>(E));
   case Expr::CXXThisExprClass:
@@ -4363,8 +4369,8 @@ static Address emitOMPArraySectionBase(CodeGenFunction &CGF, const Expr *Base,
                                        QualType BaseTy, QualType ElTy,
                                        bool IsLowerBound) {
   LValue BaseLVal;
-  if (auto *ASE = dyn_cast<OMPArraySectionExpr>(Base->IgnoreParenImpCasts())) {
-    BaseLVal = CGF.EmitOMPArraySectionExpr(ASE, IsLowerBound);
+  if (auto *ASE = dyn_cast<ArraySectionExpr>(Base->IgnoreParenImpCasts())) {
+    BaseLVal = CGF.EmitArraySectionExpr(ASE, IsLowerBound);
     if (BaseTy->isArrayType()) {
       Address Addr = BaseLVal.getAddress(CGF);
       BaseInfo = BaseLVal.getBaseInfo();
@@ -4396,9 +4402,13 @@ static Address emitOMPArraySectionBase(CodeGenFunction &CGF, const Expr *Base,
   return CGF.EmitPointerWithAlignment(Base, &BaseInfo, &TBAAInfo);
 }
 
-LValue CodeGenFunction::EmitOMPArraySectionExpr(const OMPArraySectionExpr *E,
-                                                bool IsLowerBound) {
-  QualType BaseTy = OMPArraySectionExpr::getBaseOriginalType(E->getBase());
+LValue CodeGenFunction::EmitArraySectionExpr(const ArraySectionExpr *E,
+                                             bool IsLowerBound) {
+
+  assert(!E->isOpenACCArraySection() &&
+         "OpenACC Array section codegen not implemented");
+
+  QualType BaseTy = ArraySectionExpr::getBaseOriginalType(E->getBase());
   QualType ResultExprTy;
   if (auto *AT = getContext().getAsArrayType(BaseTy))
     ResultExprTy = AT->getElementType();
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 355fec42be44..44d476976a55 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -15,6 +15,7 @@
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "ConstantEmitter.h"
+#include "EHScopeStack.h"
 #include "TargetInfo.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Attr.h"
@@ -24,6 +25,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 using namespace clang;
@@ -558,24 +560,27 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
   // For that, we'll need an EH cleanup.
   QualType::DestructionKind dtorKind = elementType.isDestructedType();
   Address endOfInit = Address::invalid();
-  EHScopeStack::stable_iterator cleanup;
-  llvm::Instruction *cleanupDominator = nullptr;
-  if (CGF.needsEHCleanup(dtorKind)) {
+  CodeGenFunction::CleanupDeactivationScope deactivation(CGF);
+
+  if (dtorKind) {
+    CodeGenFunction::AllocaTrackerRAII allocaTracker(CGF);
     // In principle we could tell the cleanup where we are more
     // directly, but the control flow can get so varied here that it
     // would actually be quite complex.  Therefore we go through an
     // alloca.
+    llvm::Instruction *dominatingIP =
+        Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(CGF.Int8PtrTy));
     endOfInit = CGF.CreateTempAlloca(begin->getType(), CGF.getPointerAlign(),
                                      "arrayinit.endOfInit");
-    cleanupDominator = Builder.CreateStore(begin, endOfInit);
+    Builder.CreateStore(begin, endOfInit);
     CGF.pushIrregularPartialArrayCleanup(begin, endOfInit, elementType,
                                          elementAlign,
                                          CGF.getDestroyer(dtorKind));
-    cleanup = CGF.EHStack.stable_begin();
+    cast<EHCleanupScope>(*CGF.EHStack.find(CGF.EHStack.stable_begin()))
+        .AddAuxAllocas(allocaTracker.Take());
 
-  // Otherwise, remember that we didn't need a cleanup.
-  } else {
-    dtorKind = QualType::DK_none;
+    CGF.DeferredDeactivationCleanupStack.push_back(
+        {CGF.EHStack.stable_begin(), dominatingIP});
   }
 
   llvm::Value *one = llvm::ConstantInt::get(CGF.SizeTy, 1);
@@ -671,9 +676,6 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
 
     CGF.EmitBlock(endBB);
   }
-
-  // Leave the partial-array cleanup if we entered one.
-  if (dtorKind) CGF.DeactivateCleanupBlock(cleanup, cleanupDominator);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1374,9 +1376,8 @@ AggExprEmitter::VisitLambdaExpr(LambdaExpr *E) {
   LValue SlotLV = CGF.MakeAddrLValue(Slot.getAddress(), E->getType());
 
   // We'll need to enter cleanup scopes in case any of the element
-  // initializers throws an exception.
-  SmallVector<EHScopeStack::stable_iterator, 16> Cleanups;
-  llvm::Instruction *CleanupDominator = nullptr;
+  // initializers throws an exception or contains branch out of the expressions.
+  CodeGenFunction::CleanupDeactivationScope scope(CGF);
 
   CXXRecordDecl::field_iterator CurField = E->getLambdaClass()->field_begin();
   for (LambdaExpr::const_capture_init_iterator i = E->capture_init_begin(),
@@ -1395,28 +1396,12 @@ AggExprEmitter::VisitLambdaExpr(LambdaExpr *E) {
     if (QualType::DestructionKind DtorKind =
             CurField->getType().isDestructedType()) {
       assert(LV.isSimple());
-      if (CGF.needsEHCleanup(DtorKind)) {
-        if (!CleanupDominator)
-          CleanupDominator = CGF.Builder.CreateAlignedLoad(
-              CGF.Int8Ty,
-              llvm::Constant::getNullValue(CGF.Int8PtrTy),
-              CharUnits::One()); // placeholder
-
-        CGF.pushDestroy(EHCleanup, LV.getAddress(CGF), CurField->getType(),
-                        CGF.getDestroyer(DtorKind), false);
-        Cleanups.push_back(CGF.EHStack.stable_begin());
-      }
+      if (DtorKind)
+        CGF.pushDestroyAndDeferDeactivation(
+            NormalAndEHCleanup, LV.getAddress(CGF), CurField->getType(),
+            CGF.getDestroyer(DtorKind), false);
     }
   }
-
-  // Deactivate all the partial cleanups in reverse order, which
-  // generally means popping them.
-  for (unsigned i = Cleanups.size(); i != 0; --i)
-    CGF.DeactivateCleanupBlock(Cleanups[i-1], CleanupDominator);
-
-  // Destroy the placeholder if we made one.
-  if (CleanupDominator)
-    CleanupDominator->eraseFromParent();
 }
 
 void AggExprEmitter::VisitExprWithCleanups(ExprWithCleanups *E) {
@@ -1705,14 +1690,7 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
   // We'll need to enter cleanup scopes in case any of the element
   // initializers throws an exception.
   SmallVector<EHScopeStack::stable_iterator, 16> cleanups;
-  llvm::Instruction *cleanupDominator = nullptr;
-  auto addCleanup = [&](const EHScopeStack::stable_iterator &cleanup) {
-    cleanups.push_back(cleanup);
-    if (!cleanupDominator) // create placeholder once needed
-      cleanupDominator = CGF.Builder.CreateAlignedLoad(
-          CGF.Int8Ty, llvm::Constant::getNullValue(CGF.Int8PtrTy),
-          CharUnits::One());
-  };
+  CodeGenFunction::CleanupDeactivationScope DeactivateCleanups(CGF);
 
   unsigned curInitIndex = 0;
 
@@ -1735,10 +1713,8 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
       CGF.EmitAggExpr(InitExprs[curInitIndex++], AggSlot);
 
       if (QualType::DestructionKind dtorKind =
-              Base.getType().isDestructedType()) {
-        CGF.pushDestroy(dtorKind, V, Base.getType());
-        addCleanup(CGF.EHStack.stable_begin());
-      }
+              Base.getType().isDestructedType())
+        CGF.pushDestroyAndDeferDeactivation(dtorKind, V, Base.getType());
     }
   }
 
@@ -1815,10 +1791,10 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
     if (QualType::DestructionKind dtorKind
           = field->getType().isDestructedType()) {
       assert(LV.isSimple());
-      if (CGF.needsEHCleanup(dtorKind)) {
-        CGF.pushDestroy(EHCleanup, LV.getAddress(CGF), field->getType(),
-                        CGF.getDestroyer(dtorKind), false);
-        addCleanup(CGF.EHStack.stable_begin());
+      if (dtorKind) {
+        CGF.pushDestroyAndDeferDeactivation(
+            NormalAndEHCleanup, LV.getAddress(CGF), field->getType(),
+            CGF.getDestroyer(dtorKind), false);
         pushedCleanup = true;
       }
     }
@@ -1831,17 +1807,6 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
         if (GEP->use_empty())
           GEP->eraseFromParent();
   }
-
-  // Deactivate all the partial cleanups in reverse order, which
-  // generally means popping them.
-  assert((cleanupDominator || cleanups.empty()) &&
-         "Missing cleanupDominator before deactivating cleanup blocks");
-  for (unsigned i = cleanups.size(); i != 0; --i)
-    CGF.DeactivateCleanupBlock(cleanups[i-1], cleanupDominator);
-
-  // Destroy the placeholder if we made one.
-  if (cleanupDominator)
-    cleanupDominator->eraseFromParent();
 }
 
 void AggExprEmitter::VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E,
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index 673ccef84d67..c18c36d3f3f3 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -1008,8 +1008,8 @@ void CodeGenFunction::EmitNewArrayInitializer(
   const Expr *Init = E->getInitializer();
   Address EndOfInit = Address::invalid();
   QualType::DestructionKind DtorKind = ElementType.isDestructedType();
-  EHScopeStack::stable_iterator Cleanup;
-  llvm::Instruction *CleanupDominator = nullptr;
+  CleanupDeactivationScope deactivation(*this);
+  bool pushedCleanup = false;
 
   CharUnits ElementSize = getContext().getTypeSizeInChars(ElementType);
   CharUnits ElementAlign =
@@ -1105,19 +1105,24 @@ void CodeGenFunction::EmitNewArrayInitializer(
     }
 
     // Enter a partial-destruction Cleanup if necessary.
-    if (needsEHCleanup(DtorKind)) {
+    if (DtorKind) {
+      AllocaTrackerRAII AllocaTracker(*this);
       // In principle we could tell the Cleanup where we are more
       // directly, but the control flow can get so varied here that it
       // would actually be quite complex.  Therefore we go through an
       // alloca.
+      llvm::Instruction *DominatingIP =
+          Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(Int8PtrTy));
       EndOfInit = CreateTempAlloca(BeginPtr.getType(), getPointerAlign(),
                                    "array.init.end");
-      CleanupDominator =
-          Builder.CreateStore(BeginPtr.emitRawPointer(*this), EndOfInit);
       pushIrregularPartialArrayCleanup(BeginPtr.emitRawPointer(*this),
                                        EndOfInit, ElementType, ElementAlign,
                                        getDestroyer(DtorKind));
-      Cleanup = EHStack.stable_begin();
+      cast<EHCleanupScope>(*EHStack.find(EHStack.stable_begin()))
+          .AddAuxAllocas(AllocaTracker.Take());
+      DeferredDeactivationCleanupStack.push_back(
+          {EHStack.stable_begin(), DominatingIP});
+      pushedCleanup = true;
     }
 
     CharUnits StartAlign = CurPtr.getAlignment();
@@ -1164,9 +1169,6 @@ void CodeGenFunction::EmitNewArrayInitializer(
   // initialization.
   llvm::ConstantInt *ConstNum = dyn_cast<llvm::ConstantInt>(NumElements);
   if (ConstNum && ConstNum->getZExtValue() <= InitListElements) {
-    // If there was a Cleanup, deactivate it.
-    if (CleanupDominator)
-      DeactivateCleanupBlock(Cleanup, CleanupDominator);
     return;
   }
 
@@ -1281,13 +1283,14 @@ void CodeGenFunction::EmitNewArrayInitializer(
     Builder.CreateStore(CurPtr.emitRawPointer(*this), EndOfInit);
 
   // Enter a partial-destruction Cleanup if necessary.
-  if (!CleanupDominator && needsEHCleanup(DtorKind)) {
-    llvm::Value *BeginPtrRaw = BeginPtr.emitRawPointer(*this);
-    llvm::Value *CurPtrRaw = CurPtr.emitRawPointer(*this);
-    pushRegularPartialArrayCleanup(BeginPtrRaw, CurPtrRaw, ElementType,
+  if (!pushedCleanup && needsEHCleanup(DtorKind)) {
+    llvm::Instruction *DominatingIP =
+        Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(Int8PtrTy));
+    pushRegularPartialArrayCleanup(BeginPtr.emitRawPointer(*this),
+                                   CurPtr.emitRawPointer(*this), ElementType,
                                    ElementAlign, getDestroyer(DtorKind));
-    Cleanup = EHStack.stable_begin();
-    CleanupDominator = Builder.CreateUnreachable();
+    DeferredDeactivationCleanupStack.push_back(
+        {EHStack.stable_begin(), DominatingIP});
   }
 
   // Emit the initializer into this element.
@@ -1295,10 +1298,7 @@ void CodeGenFunction::EmitNewArrayInitializer(
                           AggValueSlot::DoesNotOverlap);
 
   // Leave the Cleanup if we entered one.
-  if (CleanupDominator) {
-    DeactivateCleanupBlock(Cleanup, CleanupDominator);
-    CleanupDominator->eraseFromParent();
-  }
+  deactivation.ForceDeactivate();
 
   // Advance to the next element by adjusting the pointer type as necessary.
   llvm::Value *NextPtr = Builder.CreateConstInBoundsGEP1_32(
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 40a5cd20c3d7..af48e8d2b839 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2330,7 +2330,7 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     }
 
     // Perform VLAT <-> VLST bitcast through memory.
-    // TODO: since the llvm.experimental.vector.{insert,extract} intrinsics
+    // TODO: since the llvm.vector.{insert,extract} intrinsics
     //       require the element types of the vectors to be the same, we
     //       need to keep this around for bitcasts between VLAT <-> VLST where
     //       the element types of the vectors are not the same, until we figure
diff --git a/clang/lib/CodeGen/CGLoopInfo.cpp b/clang/lib/CodeGen/CGLoopInfo.cpp
index 72d1471021ac..0d4800b90a2f 100644
--- a/clang/lib/CodeGen/CGLoopInfo.cpp
+++ b/clang/lib/CodeGen/CGLoopInfo.cpp
@@ -673,8 +673,6 @@ void LoopInfoStack::push(BasicBlock *Header, clang::ASTContext &Ctx,
         setPipelineDisabled(true);
         break;
       case LoopHintAttr::UnrollCount:
-        setUnrollState(LoopAttributes::Disable);
-        break;
       case LoopHintAttr::UnrollAndJamCount:
       case LoopHintAttr::VectorizeWidth:
       case LoopHintAttr::InterleaveCount:
diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp
index 4e7f777ba1d9..43dd38659518 100644
--- a/clang/lib/CodeGen/CGObjCGNU.cpp
+++ b/clang/lib/CodeGen/CGObjCGNU.cpp
@@ -2905,23 +2905,29 @@ CGObjCGNU::GenerateMessageSend(CodeGenFunction &CGF,
       break;
     case CodeGenOptions::Mixed:
     case CodeGenOptions::NonLegacy:
+      StringRef name = "objc_msgSend";
       if (CGM.ReturnTypeUsesFPRet(ResultType)) {
-        imp =
-            CGM.CreateRuntimeFunction(llvm::FunctionType::get(IdTy, IdTy, true),
-                                      "objc_msgSend_fpret")
-                .getCallee();
+        name = "objc_msgSend_fpret";
       } else if (CGM.ReturnTypeUsesSRet(MSI.CallInfo)) {
-        // The actual types here don't matter - we're going to bitcast the
-        // function anyway
-        imp =
-            CGM.CreateRuntimeFunction(llvm::FunctionType::get(IdTy, IdTy, true),
-                                      "objc_msgSend_stret")
-                .getCallee();
-      } else {
-        imp = CGM.CreateRuntimeFunction(
-                     llvm::FunctionType::get(IdTy, IdTy, true), "objc_msgSend")
-                  .getCallee();
+        name = "objc_msgSend_stret";
+
+        // The address of the memory block is be passed in x8 for POD type,
+        // or in x0 for non-POD type (marked as inreg).
+        bool shouldCheckForInReg =
+            CGM.getContext()
+                .getTargetInfo()
+                .getTriple()
+                .isWindowsMSVCEnvironment() &&
+            CGM.getContext().getTargetInfo().getTriple().isAArch64();
+        if (shouldCheckForInReg && CGM.ReturnTypeHasInReg(MSI.CallInfo)) {
+          name = "objc_msgSend_stret2";
+        }
       }
+      // The actual types here don't matter - we're going to bitcast the
+      // function anyway
+      imp = CGM.CreateRuntimeFunction(llvm::FunctionType::get(IdTy, IdTy, true),
+                                      name)
+                .getCallee();
     }
 
   // Reset the receiver in case the lookup modified it
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 2ae11e129c75..e39c7c58d278 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -742,8 +742,8 @@ LValue ReductionCodeGen::emitSharedLValue(CodeGenFunction &CGF, const Expr *E) {
 
 LValue ReductionCodeGen::emitSharedLValueUB(CodeGenFunction &CGF,
                                             const Expr *E) {
-  if (const auto *OASE = dyn_cast<OMPArraySectionExpr>(E))
-    return CGF.EmitOMPArraySectionExpr(OASE, /*IsLowerBound=*/false);
+  if (const auto *OASE = dyn_cast<ArraySectionExpr>(E))
+    return CGF.EmitArraySectionExpr(OASE, /*IsLowerBound=*/false);
   return LValue();
 }
 
@@ -800,7 +800,7 @@ void ReductionCodeGen::emitSharedOrigLValue(CodeGenFunction &CGF, unsigned N) {
 
 void ReductionCodeGen::emitAggregateType(CodeGenFunction &CGF, unsigned N) {
   QualType PrivateType = getPrivateType(N);
-  bool AsArraySection = isa<OMPArraySectionExpr>(ClausesData[N].Ref);
+  bool AsArraySection = isa<ArraySectionExpr>(ClausesData[N].Ref);
   if (!PrivateType->isVariablyModifiedType()) {
     Sizes.emplace_back(
         CGF.getTypeSize(OrigAddresses[N].first.getType().getNonReferenceType()),
@@ -941,9 +941,9 @@ static Address castToBase(CodeGenFunction &CGF, QualType BaseTy, QualType ElTy,
 
 static const VarDecl *getBaseDecl(const Expr *Ref, const DeclRefExpr *&DE) {
   const VarDecl *OrigVD = nullptr;
-  if (const auto *OASE = dyn_cast<OMPArraySectionExpr>(Ref)) {
+  if (const auto *OASE = dyn_cast<ArraySectionExpr>(Ref)) {
     const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
-    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
+    while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
       Base = TempOASE->getBase()->IgnoreParenImpCasts();
     while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
       Base = TempASE->getBase()->IgnoreParenImpCasts();
@@ -3570,9 +3570,8 @@ getPointerAndSize(CodeGenFunction &CGF, const Expr *E) {
       SizeVal = CGF.Builder.CreateNUWMul(SizeVal, Sz);
     }
   } else if (const auto *ASE =
-                 dyn_cast<OMPArraySectionExpr>(E->IgnoreParenImpCasts())) {
-    LValue UpAddrLVal =
-        CGF.EmitOMPArraySectionExpr(ASE, /*IsLowerBound=*/false);
+                 dyn_cast<ArraySectionExpr>(E->IgnoreParenImpCasts())) {
+    LValue UpAddrLVal = CGF.EmitArraySectionExpr(ASE, /*IsLowerBound=*/false);
     Address UpAddrAddress = UpAddrLVal.getAddress(CGF);
     llvm::Value *UpAddr = CGF.Builder.CreateConstGEP1_32(
         UpAddrAddress.getElementType(), UpAddrAddress.emitRawPointer(CGF),
@@ -6672,8 +6671,8 @@ private:
     // Given that an array section is considered a built-in type, we need to
     // do the calculation based on the length of the section instead of relying
     // on CGF.getTypeSize(E->getType()).
-    if (const auto *OAE = dyn_cast<OMPArraySectionExpr>(E)) {
-      QualType BaseTy = OMPArraySectionExpr::getBaseOriginalType(
+    if (const auto *OAE = dyn_cast<ArraySectionExpr>(E)) {
+      QualType BaseTy = ArraySectionExpr::getBaseOriginalType(
                             OAE->getBase()->IgnoreParenImpCasts())
                             .getCanonicalType();
 
@@ -6779,7 +6778,7 @@ private:
   /// Return true if the provided expression is a final array section. A
   /// final array section, is one whose length can't be proved to be one.
   bool isFinalArraySectionExpression(const Expr *E) const {
-    const auto *OASE = dyn_cast<OMPArraySectionExpr>(E);
+    const auto *OASE = dyn_cast<ArraySectionExpr>(E);
 
     // It is not an array section and therefore not a unity-size one.
     if (!OASE)
@@ -6795,7 +6794,7 @@ private:
     // for this dimension. Also, we should always expect a length if the
     // base type is pointer.
     if (!Length) {
-      QualType BaseQTy = OMPArraySectionExpr::getBaseOriginalType(
+      QualType BaseQTy = ArraySectionExpr::getBaseOriginalType(
                              OASE->getBase()->IgnoreParenImpCasts())
                              .getCanonicalType();
       if (const auto *ATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr()))
@@ -7027,7 +7026,7 @@ private:
     Address BP = Address::invalid();
     const Expr *AssocExpr = I->getAssociatedExpression();
     const auto *AE = dyn_cast<ArraySubscriptExpr>(AssocExpr);
-    const auto *OASE = dyn_cast<OMPArraySectionExpr>(AssocExpr);
+    const auto *OASE = dyn_cast<ArraySectionExpr>(AssocExpr);
     const auto *OAShE = dyn_cast<OMPArrayShapingExpr>(AssocExpr);
 
     if (isa<MemberExpr>(AssocExpr)) {
@@ -7179,14 +7178,14 @@ private:
       // special treatment for array sections given that they are built-in
       // types.
       const auto *OASE =
-          dyn_cast<OMPArraySectionExpr>(I->getAssociatedExpression());
+          dyn_cast<ArraySectionExpr>(I->getAssociatedExpression());
       const auto *OAShE =
           dyn_cast<OMPArrayShapingExpr>(I->getAssociatedExpression());
       const auto *UO = dyn_cast<UnaryOperator>(I->getAssociatedExpression());
       const auto *BO = dyn_cast<BinaryOperator>(I->getAssociatedExpression());
       bool IsPointer =
           OAShE ||
-          (OASE && OMPArraySectionExpr::getBaseOriginalType(OASE)
+          (OASE && ArraySectionExpr::getBaseOriginalType(OASE)
                        .getCanonicalType()
                        ->isAnyPointerType()) ||
           I->getAssociatedExpression()->getType()->isAnyPointerType();
@@ -7207,7 +7206,7 @@ private:
         assert((Next == CE ||
                 isa<MemberExpr>(Next->getAssociatedExpression()) ||
                 isa<ArraySubscriptExpr>(Next->getAssociatedExpression()) ||
-                isa<OMPArraySectionExpr>(Next->getAssociatedExpression()) ||
+                isa<ArraySectionExpr>(Next->getAssociatedExpression()) ||
                 isa<OMPArrayShapingExpr>(Next->getAssociatedExpression()) ||
                 isa<UnaryOperator>(Next->getAssociatedExpression()) ||
                 isa<BinaryOperator>(Next->getAssociatedExpression())) &&
@@ -7439,7 +7438,7 @@ private:
             PartialStruct.LowestElem = {FieldIndex, LowestElem};
             if (IsFinalArraySection) {
               Address HB =
-                  CGF.EmitOMPArraySectionExpr(OASE, /*IsLowerBound=*/false)
+                  CGF.EmitArraySectionExpr(OASE, /*IsLowerBound=*/false)
                       .getAddress(CGF);
               PartialStruct.HighestElem = {FieldIndex, HB};
             } else {
@@ -7452,7 +7451,7 @@ private:
           } else if (FieldIndex > PartialStruct.HighestElem.first) {
             if (IsFinalArraySection) {
               Address HB =
-                  CGF.EmitOMPArraySectionExpr(OASE, /*IsLowerBound=*/false)
+                  CGF.EmitArraySectionExpr(OASE, /*IsLowerBound=*/false)
                       .getAddress(CGF);
               PartialStruct.HighestElem = {FieldIndex, HB};
             } else {
@@ -7510,12 +7509,12 @@ private:
     for (const OMPClauseMappableExprCommon::MappableComponent &Component :
          Components) {
       const Expr *AssocExpr = Component.getAssociatedExpression();
-      const auto *OASE = dyn_cast<OMPArraySectionExpr>(AssocExpr);
+      const auto *OASE = dyn_cast<ArraySectionExpr>(AssocExpr);
 
       if (!OASE)
         continue;
 
-      QualType Ty = OMPArraySectionExpr::getBaseOriginalType(OASE->getBase());
+      QualType Ty = ArraySectionExpr::getBaseOriginalType(OASE->getBase());
       auto *CAT = Context.getAsConstantArrayType(Ty);
       auto *VAT = Context.getAsVariableArrayType(Ty);
 
@@ -7589,7 +7588,7 @@ private:
         continue;
       }
 
-      const auto *OASE = dyn_cast<OMPArraySectionExpr>(AssocExpr);
+      const auto *OASE = dyn_cast<ArraySectionExpr>(AssocExpr);
 
       if (!OASE)
         continue;
@@ -8780,7 +8779,7 @@ static ValueDecl *getDeclFromThisExpr(const Expr *E) {
   if (!E)
     return nullptr;
 
-  if (const auto *OASE = dyn_cast<OMPArraySectionExpr>(E->IgnoreParenCasts()))
+  if (const auto *OASE = dyn_cast<ArraySectionExpr>(E->IgnoreParenCasts()))
     if (const MemberExpr *ME =
             dyn_cast<MemberExpr>(OASE->getBase()->IgnoreParenImpCasts()))
       return ME->getMemberDecl();
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index eb716520e5ff..87496c8e488c 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -92,9 +92,9 @@ static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
     while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
       Base = TempASE->getBase()->IgnoreParenImpCasts();
     RefExpr = Base;
-  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
+  } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
     const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
-    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
+    while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
       Base = TempOASE->getBase()->IgnoreParenImpCasts();
     while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
       Base = TempASE->getBase()->IgnoreParenImpCasts();
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index a0a8a07c76ba..ef3aa3a8e0dc 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1256,7 +1256,7 @@ void CodeGenFunction::EmitOMPReductionClauseInit(
     const auto *LHSVD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
     const auto *RHSVD = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
     QualType Type = PrivateVD->getType();
-    bool isaOMPArraySectionExpr = isa<OMPArraySectionExpr>(IRef);
+    bool isaOMPArraySectionExpr = isa<ArraySectionExpr>(IRef);
     if (isaOMPArraySectionExpr && Type->isVariablyModifiedType()) {
       // Store the address of the original variable associated with the LHS
       // implicit variable.
@@ -7289,7 +7289,7 @@ void CodeGenFunction::EmitOMPUseDevicePtrClause(
 
 static const VarDecl *getBaseDecl(const Expr *Ref) {
   const Expr *Base = Ref->IgnoreParenImpCasts();
-  while (const auto *OASE = dyn_cast<OMPArraySectionExpr>(Base))
+  while (const auto *OASE = dyn_cast<ArraySectionExpr>(Base))
     Base = OASE->getBase()->IgnoreParenImpCasts();
   while (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Base))
     Base = ASE->getBase()->IgnoreParenImpCasts();
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
index 52216d93a302..7a933d0ed0d0 100644
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -143,6 +143,9 @@ add_clang_library(clangCodeGen
   DEPENDS
   intrinsics_gen
   ClangDriverOptions
+  # These generated headers are included transitively.
+  ARMTargetParserTableGen
+  AArch64TargetParserTableGen
 
   LINK_LIBS
   clangAST
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 86a6ddd80cc1..87766a758311 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -91,6 +91,8 @@ CodeGenFunction::CodeGenFunction(CodeGenModule &cgm, bool suppressNewContext)
 
 CodeGenFunction::~CodeGenFunction() {
   assert(LifetimeExtendedCleanupStack.empty() && "failed to emit a cleanup");
+  assert(DeferredDeactivationCleanupStack.empty() &&
+         "missed to deactivate a cleanup");
 
   if (getLangOpts().OpenMP && CurFn)
     CGM.getOpenMPRuntime().functionFinished(*this);
@@ -346,6 +348,10 @@ static void EmitIfUsed(CodeGenFunction &CGF, llvm::BasicBlock *BB) {
 void CodeGenFunction::FinishFunction(SourceLocation EndLoc) {
   assert(BreakContinueStack.empty() &&
          "mismatched push/pop in break/continue stack!");
+  assert(LifetimeExtendedCleanupStack.empty() &&
+         "mismatched push/pop of cleanups in EHStack!");
+  assert(DeferredDeactivationCleanupStack.empty() &&
+         "mismatched activate/deactivate of cleanups!");
 
   bool OnlySimpleReturnStmts = NumSimpleReturnExprs > 0
     && NumSimpleReturnExprs == NumReturnExprs
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index ff1873325d40..6e7417fc7f52 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -39,6 +39,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Utils/SanitizerStats.h"
@@ -670,6 +671,51 @@ public:
 
   EHScopeStack EHStack;
   llvm::SmallVector<char, 256> LifetimeExtendedCleanupStack;
+
+  // A stack of cleanups which were added to EHStack but have to be deactivated
+  // later before being popped or emitted. These are usually deactivated on
+  // exiting a `CleanupDeactivationScope` scope. For instance, after a
+  // full-expr.
+  //
+  // These are specially useful for correctly emitting cleanups while
+  // encountering branches out of expression (through stmt-expr or coroutine
+  // suspensions).
+  struct DeferredDeactivateCleanup {
+    EHScopeStack::stable_iterator Cleanup;
+    llvm::Instruction *DominatingIP;
+  };
+  llvm::SmallVector<DeferredDeactivateCleanup> DeferredDeactivationCleanupStack;
+
+  // Enters a new scope for capturing cleanups which are deferred to be
+  // deactivated, all of which will be deactivated once the scope is exited.
+  struct CleanupDeactivationScope {
+    CodeGenFunction &CGF;
+    size_t OldDeactivateCleanupStackSize;
+    bool Deactivated;
+    CleanupDeactivationScope(CodeGenFunction &CGF)
+        : CGF(CGF), OldDeactivateCleanupStackSize(
+                        CGF.DeferredDeactivationCleanupStack.size()),
+          Deactivated(false) {}
+
+    void ForceDeactivate() {
+      assert(!Deactivated && "Deactivating already deactivated scope");
+      auto &Stack = CGF.DeferredDeactivationCleanupStack;
+      for (size_t I = Stack.size(); I > OldDeactivateCleanupStackSize; I--) {
+        CGF.DeactivateCleanupBlock(Stack[I - 1].Cleanup,
+                                   Stack[I - 1].DominatingIP);
+        Stack[I - 1].DominatingIP->eraseFromParent();
+      }
+      Stack.resize(OldDeactivateCleanupStackSize);
+      Deactivated = true;
+    }
+
+    ~CleanupDeactivationScope() {
+      if (Deactivated)
+        return;
+      ForceDeactivate();
+    }
+  };
+
   llvm::SmallVector<const JumpDest *, 2> SEHTryEpilogueStack;
 
   llvm::Instruction *CurrentFuncletPad = nullptr;
@@ -875,6 +921,19 @@ public:
       new (Buffer + sizeof(Header) + sizeof(T)) RawAddress(ActiveFlag);
   }
 
+  // Push a cleanup onto EHStack and deactivate it later. It is usually
+  // deactivated when exiting a `CleanupDeactivationScope` (for example: after a
+  // full expression).
+  template <class T, class... As>
+  void pushCleanupAndDeferDeactivation(CleanupKind Kind, As... A) {
+    // Placeholder dominating IP for this cleanup.
+    llvm::Instruction *DominatingIP =
+        Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy));
+    EHStack.pushCleanup<T>(Kind, A...);
+    DeferredDeactivationCleanupStack.push_back(
+        {EHStack.stable_begin(), DominatingIP});
+  }
+
   /// Set up the last cleanup that was pushed as a conditional
   /// full-expression cleanup.
   void initFullExprCleanup() {
@@ -898,7 +957,8 @@ public:
 
   /// PopCleanupBlock - Will pop the cleanup entry on the stack and
   /// process all branch fixups.
-  void PopCleanupBlock(bool FallThroughIsBranchThrough = false);
+  void PopCleanupBlock(bool FallThroughIsBranchThrough = false,
+                       bool ForDeactivation = false);
 
   /// DeactivateCleanupBlock - Deactivates the given cleanup block.
   /// The block cannot be reactivated.  Pops it if it's the top of the
@@ -926,6 +986,7 @@ public:
   class RunCleanupsScope {
     EHScopeStack::stable_iterator CleanupStackDepth, OldCleanupScopeDepth;
     size_t LifetimeExtendedCleanupStackSize;
+    CleanupDeactivationScope DeactivateCleanups;
     bool OldDidCallStackSave;
   protected:
     bool PerformCleanup;
@@ -940,8 +1001,7 @@ public:
   public:
     /// Enter a new cleanup scope.
     explicit RunCleanupsScope(CodeGenFunction &CGF)
-      : PerformCleanup(true), CGF(CGF)
-    {
+        : DeactivateCleanups(CGF), PerformCleanup(true), CGF(CGF) {
       CleanupStackDepth = CGF.EHStack.stable_begin();
       LifetimeExtendedCleanupStackSize =
           CGF.LifetimeExtendedCleanupStack.size();
@@ -971,6 +1031,7 @@ public:
     void ForceCleanup(std::initializer_list<llvm::Value**> ValuesToReload = {}) {
       assert(PerformCleanup && "Already forced cleanup");
       CGF.DidCallStackSave = OldDidCallStackSave;
+      DeactivateCleanups.ForceDeactivate();
       CGF.PopCleanupBlocks(CleanupStackDepth, LifetimeExtendedCleanupStackSize,
                            ValuesToReload);
       PerformCleanup = false;
@@ -2160,6 +2221,11 @@ public:
                      Address addr, QualType type);
   void pushDestroy(CleanupKind kind, Address addr, QualType type,
                    Destroyer *destroyer, bool useEHCleanupForArray);
+  void pushDestroyAndDeferDeactivation(QualType::DestructionKind dtorKind,
+                                       Address addr, QualType type);
+  void pushDestroyAndDeferDeactivation(CleanupKind cleanupKind, Address addr,
+                                       QualType type, Destroyer *destroyer,
+                                       bool useEHCleanupForArray);
   void pushLifetimeExtendedDestroy(CleanupKind kind, Address addr,
                                    QualType type, Destroyer *destroyer,
                                    bool useEHCleanupForArray);
@@ -2698,6 +2764,33 @@ public:
                             TBAAAccessInfo *TBAAInfo = nullptr);
   LValue EmitLoadOfPointerLValue(Address Ptr, const PointerType *PtrTy);
 
+private:
+  struct AllocaTracker {
+    void Add(llvm::AllocaInst *I) { Allocas.push_back(I); }
+    llvm::SmallVector<llvm::AllocaInst *> Take() { return std::move(Allocas); }
+
+  private:
+    llvm::SmallVector<llvm::AllocaInst *> Allocas;
+  };
+  AllocaTracker *Allocas = nullptr;
+
+public:
+  // Captures all the allocas created during the scope of its RAII object.
+  struct AllocaTrackerRAII {
+    AllocaTrackerRAII(CodeGenFunction &CGF)
+        : CGF(CGF), OldTracker(CGF.Allocas) {
+      CGF.Allocas = &Tracker;
+    }
+    ~AllocaTrackerRAII() { CGF.Allocas = OldTracker; }
+
+    llvm::SmallVector<llvm::AllocaInst *> Take() { return Tracker.Take(); }
+
+  private:
+    CodeGenFunction &CGF;
+    AllocaTracker *OldTracker;
+    AllocaTracker Tracker;
+  };
+
   /// CreateTempAlloca - This creates an alloca and inserts it into the entry
   /// block if \p ArraySize is nullptr, otherwise inserts it at the current
   /// insertion point of the builder. The caller is responsible for setting an
@@ -3204,12 +3297,12 @@ public:
                            llvm::Value *Index, QualType IndexType,
                            QualType IndexedType, bool Accessed);
 
-  // Find a struct's flexible array member. It may be embedded inside multiple
-  // sub-structs, but must still be the last field.
-  const FieldDecl *FindFlexibleArrayMemberField(ASTContext &Ctx,
-                                                const RecordDecl *RD,
-                                                StringRef Name,
-                                                uint64_t &Offset);
+  // Find a struct's flexible array member and get its offset. It may be
+  // embedded inside multiple sub-structs, but must still be the last field.
+  const FieldDecl *
+  FindFlexibleArrayMemberFieldAndOffset(ASTContext &Ctx, const RecordDecl *RD,
+                                        const FieldDecl *FAMDecl,
+                                        uint64_t &Offset);
 
   /// Find the FieldDecl specified in a FAM's "counted_by" attribute. Returns
   /// \p nullptr if either the attribute or the field doesn't exist.
@@ -4169,8 +4262,8 @@ public:
   LValue EmitArraySubscriptExpr(const ArraySubscriptExpr *E,
                                 bool Accessed = false);
   LValue EmitMatrixSubscriptExpr(const MatrixSubscriptExpr *E);
-  LValue EmitOMPArraySectionExpr(const OMPArraySectionExpr *E,
-                                 bool IsLowerBound = true);
+  LValue EmitArraySectionExpr(const ArraySectionExpr *E,
+                              bool IsLowerBound = true);
   LValue EmitExtVectorElementExpr(const ExtVectorElementExpr *E);
   LValue EmitMemberExpr(const MemberExpr *E);
   LValue EmitObjCIsaExpr(const ObjCIsaExpr *E);
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 7196635aa5d2..1951747ac6e2 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -73,6 +73,7 @@
 #include "llvm/TargetParser/RISCVISAInfo.h"
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/TargetParser/X86TargetParser.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include <optional>
 
 using namespace clang;
@@ -442,6 +443,11 @@ CodeGenModule::CodeGenModule(ASTContext &C,
       }
     ModuleNameHash = llvm::getUniqueInternalLinkagePostfix(Path);
   }
+
+  // Record mregparm value now so it is visible through all of codegen.
+  if (Context.getTargetInfo().getTriple().getArch() == llvm::Triple::x86)
+    getModule().addModuleFlag(llvm::Module::Error, "NumRegisterParameters",
+                              CodeGenOpts.NumRegisterParameters);
 }
 
 CodeGenModule::~CodeGenModule() {}
@@ -980,11 +986,6 @@ void CodeGenModule::Release() {
       NMD->addOperand(MD);
   }
 
-  // Record mregparm value now so it is visible through rest of codegen.
-  if (Context.getTargetInfo().getTriple().getArch() == llvm::Triple::x86)
-    getModule().addModuleFlag(llvm::Module::Error, "NumRegisterParameters",
-                              CodeGenOpts.NumRegisterParameters);
-
   if (CodeGenOpts.DwarfVersion) {
     getModule().addModuleFlag(llvm::Module::Max, "Dwarf Version",
                               CodeGenOpts.DwarfVersion);
@@ -4806,6 +4807,10 @@ CodeGenModule::CreateRuntimeFunction(llvm::FunctionType *FTy, StringRef Name,
         }
       }
       setDSOLocal(F);
+      // FIXME: We should use CodeGenModule::SetLLVMFunctionAttributes() instead
+      // of trying to approximate the attributes using the LLVM function
+      // signature. This requires revising the API of CreateRuntimeFunction().
+      markRegisterParameterAttributes(F);
     }
   }
 
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index 12ac8d0f57ee..03e77ddc1292 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -1241,6 +1241,9 @@ public:
   /// Return true iff the given type uses 'sret' when used as a return type.
   bool ReturnTypeUsesSRet(const CGFunctionInfo &FI);
 
+  /// Return true iff the given type has `inreg` set.
+  bool ReturnTypeHasInReg(const CGFunctionInfo &FI);
+
   /// Return true iff the given type uses an argument slot when 'sret' is used
   /// as a return type.
   bool ReturnSlotInterferesWithArgs(const CGFunctionInfo &FI);
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 1568b6e6275b..e8d75eda029e 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -409,7 +409,7 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
       break;
     case BuiltinType::LongDouble:
       LongDoubleReferenced = true;
-      LLVM_FALLTHROUGH;
+      [[fallthrough]];
     case BuiltinType::BFloat16:
     case BuiltinType::Float:
     case BuiltinType::Double:
diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
index d38a26940a3c..d47927745759 100644
--- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -1131,9 +1131,15 @@ static bool isTrivialForMSVC(const CXXRecordDecl *RD, QualType Ty,
     return false;
   if (RD->hasNonTrivialCopyAssignment())
     return false;
-  for (const CXXConstructorDecl *Ctor : RD->ctors())
-    if (Ctor->isUserProvided())
-      return false;
+  for (const Decl *D : RD->decls()) {
+    if (auto *Ctor = dyn_cast<CXXConstructorDecl>(D)) {
+      if (Ctor->isUserProvided())
+        return false;
+    } else if (auto *Template = dyn_cast<FunctionTemplateDecl>(D)) {
+      if (isa<CXXConstructorDecl>(Template->getTemplatedDecl()))
+        return false;
+    }
+  }
   if (RD->hasNonTrivialDestructor())
     return false;
   return true;
diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt
index 58427e3f83c4..32a4378ab499 100644
--- a/clang/lib/Driver/CMakeLists.txt
+++ b/clang/lib/Driver/CMakeLists.txt
@@ -90,6 +90,9 @@ add_clang_library(clangDriver
 
   DEPENDS
   ClangDriverOptions
+  # These generated headers are included transitively.
+  ARMTargetParserTableGen
+  AArch64TargetParserTableGen
 
   LINK_LIBS
   clangBasic
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 76b7b9fdfb4f..114320f5d314 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -361,6 +361,7 @@ phases::ID Driver::getFinalPhase(const DerivedArgList &DAL,
              (PhaseArg = DAL.getLastArg(options::OPT_rewrite_legacy_objc)) ||
              (PhaseArg = DAL.getLastArg(options::OPT__migrate)) ||
              (PhaseArg = DAL.getLastArg(options::OPT__analyze)) ||
+             (PhaseArg = DAL.getLastArg(options::OPT_emit_cir)) ||
              (PhaseArg = DAL.getLastArg(options::OPT_emit_ast))) {
     FinalPhase = phases::Compile;
 
@@ -4799,6 +4800,8 @@ Action *Driver::ConstructPhaseAction(
       return C.MakeAction<MigrateJobAction>(Input, types::TY_Remap);
     if (Args.hasArg(options::OPT_emit_ast))
       return C.MakeAction<CompileJobAction>(Input, types::TY_AST);
+    if (Args.hasArg(options::OPT_emit_cir))
+      return C.MakeAction<CompileJobAction>(Input, types::TY_CIR);
     if (Args.hasArg(options::OPT_module_file_info))
       return C.MakeAction<CompileJobAction>(Input, types::TY_ModuleFile);
     if (Args.hasArg(options::OPT_verify_pch))
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 237092ed07e5..0e86bc07e0ea 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1307,19 +1307,35 @@ void ToolChain::AddCCKextLibArgs(const ArgList &Args,
 
 bool ToolChain::isFastMathRuntimeAvailable(const ArgList &Args,
                                            std::string &Path) const {
+  // Don't implicitly link in mode-changing libraries in a shared library, since
+  // this can have very deleterious effects. See the various links from
+  // https://github.com/llvm/llvm-project/issues/57589 for more information.
+  bool Default = !Args.hasArgNoClaim(options::OPT_shared);
+
   // Do not check for -fno-fast-math or -fno-unsafe-math when -Ofast passed
   // (to keep the linker options consistent with gcc and clang itself).
-  if (!isOptimizationLevelFast(Args)) {
+  if (Default && !isOptimizationLevelFast(Args)) {
     // Check if -ffast-math or -funsafe-math.
-    Arg *A =
-      Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math,
-                      options::OPT_funsafe_math_optimizations,
-                      options::OPT_fno_unsafe_math_optimizations);
+    Arg *A = Args.getLastArg(
+        options::OPT_ffast_math, options::OPT_fno_fast_math,
+        options::OPT_funsafe_math_optimizations,
+        options::OPT_fno_unsafe_math_optimizations, options::OPT_ffp_model_EQ);
 
     if (!A || A->getOption().getID() == options::OPT_fno_fast_math ||
         A->getOption().getID() == options::OPT_fno_unsafe_math_optimizations)
-      return false;
+      Default = false;
+    if (A && A->getOption().getID() == options::OPT_ffp_model_EQ) {
+      StringRef Model = A->getValue();
+      if (Model != "fast")
+        Default = false;
+    }
   }
+
+  // Whatever decision came as a result of the above implicit settings, either
+  // -mdaz-ftz or -mno-daz-ftz is capable of overriding it.
+  if (!Args.hasFlag(options::OPT_mdaz_ftz, options::OPT_mno_daz_ftz, Default))
+    return false;
+
   // If crtfastmath.o exists add it to the arguments.
   Path = GetFilePath("crtfastmath.o");
   return (Path != "crtfastmath.o"); // Not found.
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index c1b350893b37..aab98506adb9 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -376,9 +376,7 @@ void AIX::AddOpenMPIncludeArgs(const ArgList &DriverArgs,
       addSystemInclude(DriverArgs, CC1Args, PathOpenMP.str());
       break;
     case Driver::OMPRT_IOMP5:
-      LLVM_FALLTHROUGH;
     case Driver::OMPRT_GOMP:
-      LLVM_FALLTHROUGH;
     case Driver::OMPRT_Unknown:
       // Unknown / unsupported include paths.
       break;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index d56cf63d8b8c..2ec06aaf5aed 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -835,46 +835,6 @@ static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C,
   }
 }
 
-/// Check whether the given input tree contains any compilation actions.
-static bool ContainsCompileAction(const Action *A) {
-  if (isa<CompileJobAction>(A) || isa<BackendJobAction>(A))
-    return true;
-
-  return llvm::any_of(A->inputs(), ContainsCompileAction);
-}
-
-/// Check if -relax-all should be passed to the internal assembler.
-/// This is done by default when compiling non-assembler source with -O0.
-static bool UseRelaxAll(Compilation &C, const ArgList &Args) {
-  bool RelaxDefault = true;
-
-  if (Arg *A = Args.getLastArg(options::OPT_O_Group))
-    RelaxDefault = A->getOption().matches(options::OPT_O0);
-
-  // RISC-V requires an indirect jump for offsets larger than 1MiB. This cannot
-  // be done by assembler branch relaxation as it needs a free temporary
-  // register. Because of this, branch relaxation is handled by a MachineIR
-  // pass before the assembler. Forcing assembler branch relaxation for -O0
-  // makes the MachineIR branch relaxation inaccurate and it will miss cases
-  // where an indirect branch is necessary. To avoid this issue we are
-  // sacrificing the compile time improvement of using -mrelax-all for -O0.
-  if (C.getDefaultToolChain().getTriple().isRISCV())
-    RelaxDefault = false;
-
-  if (RelaxDefault) {
-    RelaxDefault = false;
-    for (const auto &Act : C.getActions()) {
-      if (ContainsCompileAction(Act)) {
-        RelaxDefault = true;
-        break;
-      }
-    }
-  }
-
-  return Args.hasFlag(options::OPT_mrelax_all, options::OPT_mno_relax_all,
-                      RelaxDefault);
-}
-
 static void
 RenderDebugEnablingArgs(const ArgList &Args, ArgStringList &CmdArgs,
                         llvm::codegenoptions::DebugInfoKind DebugInfoKind,
@@ -2472,8 +2432,16 @@ static void CollectArgsForIntegratedAssembler(Compilation &C,
                                               const ArgList &Args,
                                               ArgStringList &CmdArgs,
                                               const Driver &D) {
-  if (UseRelaxAll(C, Args))
-    CmdArgs.push_back("-mrelax-all");
+  // Default to -mno-relax-all.
+  //
+  // Note: RISC-V requires an indirect jump for offsets larger than 1MiB. This
+  // cannot be done by assembler branch relaxation as it needs a free temporary
+  // register. Because of this, branch relaxation is handled by a MachineIR pass
+  // before the assembler. Forcing assembler branch relaxation for -O0 makes the
+  // MachineIR branch relaxation inaccurate and it will miss cases where an
+  // indirect branch is necessary.
+  Args.addOptInFlag(CmdArgs, options::OPT_mrelax_all,
+                    options::OPT_mno_relax_all);
 
   // Only default to -mincremental-linker-compatible if we think we are
   // targeting the MSVC linker.
@@ -2775,13 +2743,11 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
   StringRef FPExceptionBehavior = "";
   // -ffp-eval-method options: double, extended, source
   StringRef FPEvalMethod = "";
-  const llvm::DenormalMode DefaultDenormalFPMath =
+  llvm::DenormalMode DenormalFPMath =
       TC.getDefaultDenormalModeForType(Args, JA);
-  const llvm::DenormalMode DefaultDenormalFP32Math =
+  llvm::DenormalMode DenormalFP32Math =
       TC.getDefaultDenormalModeForType(Args, JA, &llvm::APFloat::IEEEsingle());
 
-  llvm::DenormalMode DenormalFPMath = DefaultDenormalFPMath;
-  llvm::DenormalMode DenormalFP32Math = DefaultDenormalFP32Math;
   // CUDA and HIP don't rely on the frontend to pass an ffp-contract option.
   // If one wasn't given by the user, don't pass it here.
   StringRef FPContract;
@@ -2931,11 +2897,6 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       SignedZeros = true;
       // -fno_fast_math restores default denormal and fpcontract handling
       FPContract = "on";
-      DenormalFPMath = llvm::DenormalMode::getIEEE();
-
-      // FIXME: The target may have picked a non-IEEE default mode here based on
-      // -cl-denorms-are-zero. Should the target consider -fp-model interaction?
-      DenormalFP32Math = llvm::DenormalMode::getIEEE();
 
       StringRef Val = A->getValue();
       if (OFastEnabled && !Val.equals("fast")) {
@@ -3151,12 +3112,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       ReciprocalMath = false;
       SignedZeros = true;
       ApproxFunc = false;
-      TrappingMath = true;
-      FPExceptionBehavior = "strict";
 
-      // The target may have opted to flush by default, so force IEEE.
-      DenormalFPMath = llvm::DenormalMode::getIEEE();
-      DenormalFP32Math = llvm::DenormalMode::getIEEE();
       if (!JA.isDeviceOffloading(Action::OFK_Cuda) &&
           !JA.isOffloading(Action::OFK_HIP)) {
         if (LastSeenFfpContractOption != "") {
@@ -3186,9 +3142,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       ReciprocalMath = false;
       ApproxFunc = false;
       SignedZeros = true;
-      // -fno_fast_math restores default denormal and fpcontract handling
-      DenormalFPMath = DefaultDenormalFPMath;
-      DenormalFP32Math = llvm::DenormalMode::getIEEE();
+      // -fno_fast_math restores default fpcontract handling
       if (!JA.isDeviceOffloading(Action::OFK_Cuda) &&
           !JA.isOffloading(Action::OFK_HIP)) {
         if (LastSeenFfpContractOption != "") {
@@ -3203,8 +3157,6 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       // subsequent options conflict then emit warning diagnostic.
       if (HonorINFs && HonorNaNs && !AssociativeMath && !ReciprocalMath &&
           SignedZeros && TrappingMath && RoundingFPMath && !ApproxFunc &&
-          DenormalFPMath == llvm::DenormalMode::getIEEE() &&
-          DenormalFP32Math == llvm::DenormalMode::getIEEE() &&
           FPContract.equals("off"))
         // OK: Current Arg doesn't conflict with -ffp-model=strict
         ;
@@ -7540,6 +7492,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   Args.addOptInFlag(CmdArgs, options::OPT_fsafe_buffer_usage_suggestions,
                     options::OPT_fno_safe_buffer_usage_suggestions);
 
+  Args.addOptInFlag(CmdArgs, options::OPT_fexperimental_late_parse_attributes,
+                    options::OPT_fno_experimental_late_parse_attributes);
+
   // Setup statistics file output.
   SmallString<128> StatsFile = getStatsFileName(Args, Output, Input, D);
   if (!StatsFile.empty()) {
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index b65b96db16bd..6796b43a1550 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -737,7 +737,7 @@ bool tools::isTLSDESCEnabled(const ToolChain &TC,
   StringRef V = A->getValue();
   bool SupportedArgument = false, EnableTLSDESC = false;
   bool Unsupported = !Triple.isOSBinFormatELF();
-  if (Triple.isRISCV()) {
+  if (Triple.isLoongArch() || Triple.isRISCV()) {
     SupportedArgument = V == "desc" || V == "trad";
     EnableTLSDESC = V == "desc";
   } else if (Triple.isX86()) {
@@ -1191,118 +1191,10 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
   return true;
 }
 
-/// Determines if --whole-archive is active in the list of arguments.
-static bool isWholeArchivePresent(const ArgList &Args) {
-  bool WholeArchiveActive = false;
-  for (auto *Arg : Args.filtered(options::OPT_Wl_COMMA)) {
-    if (Arg) {
-      for (StringRef ArgValue : Arg->getValues()) {
-        if (ArgValue == "--whole-archive")
-          WholeArchiveActive = true;
-        if (ArgValue == "--no-whole-archive")
-          WholeArchiveActive = false;
-      }
-    }
-  }
-
-  return WholeArchiveActive;
-}
-
-/// Determine if driver is invoked to create a shared object library (-static)
-static bool isSharedLinkage(const ArgList &Args) {
-  return Args.hasArg(options::OPT_shared);
-}
-
-/// Determine if driver is invoked to create a static object library (-shared)
-static bool isStaticLinkage(const ArgList &Args) {
-  return Args.hasArg(options::OPT_static);
-}
-
-/// Add Fortran runtime libs for MSVC
-static void addFortranRuntimeLibsMSVC(const ArgList &Args,
-                                      llvm::opt::ArgStringList &CmdArgs) {
-  unsigned RTOptionID = options::OPT__SLASH_MT;
-  if (auto *rtl = Args.getLastArg(options::OPT_fms_runtime_lib_EQ)) {
-    RTOptionID = llvm::StringSwitch<unsigned>(rtl->getValue())
-                     .Case("static", options::OPT__SLASH_MT)
-                     .Case("static_dbg", options::OPT__SLASH_MTd)
-                     .Case("dll", options::OPT__SLASH_MD)
-                     .Case("dll_dbg", options::OPT__SLASH_MDd)
-                     .Default(options::OPT__SLASH_MT);
-  }
-  switch (RTOptionID) {
-  case options::OPT__SLASH_MT:
-    CmdArgs.push_back("/WHOLEARCHIVE:Fortran_main.static.lib");
-    break;
-  case options::OPT__SLASH_MTd:
-    CmdArgs.push_back("/WHOLEARCHIVE:Fortran_main.static_dbg.lib");
-    break;
-  case options::OPT__SLASH_MD:
-    CmdArgs.push_back("/WHOLEARCHIVE:Fortran_main.dynamic.lib");
-    break;
-  case options::OPT__SLASH_MDd:
-    CmdArgs.push_back("/WHOLEARCHIVE:Fortran_main.dynamic_dbg.lib");
-    break;
-  }
-}
-
-// Add FortranMain runtime lib
-static void addFortranMain(const ToolChain &TC, const ArgList &Args,
-                           llvm::opt::ArgStringList &CmdArgs) {
-  // 0. Shared-library linkage
-  // If we are attempting to link a library, we should not add
-  // -lFortran_main.a to the link line, as the `main` symbol is not
-  // required for a library and should also be provided by one of
-  // the translation units of the code that this shared library
-  // will be linked against eventually.
-  if (isSharedLinkage(Args) || isStaticLinkage(Args)) {
-    return;
-  }
-
-  // 1. MSVC
-  if (TC.getTriple().isKnownWindowsMSVCEnvironment()) {
-    addFortranRuntimeLibsMSVC(Args, CmdArgs);
-    return;
-  }
-
-  // 2. GNU and similar
-  const Driver &D = TC.getDriver();
-  const char *FortranMainLinkFlag = "-lFortran_main";
-
-  // Warn if the user added `-lFortran_main` - this library is an implementation
-  // detail of Flang and should be handled automaticaly by the driver.
-  for (const char *arg : CmdArgs) {
-    if (strncmp(arg, FortranMainLinkFlag, strlen(FortranMainLinkFlag)) == 0)
-      D.Diag(diag::warn_drv_deprecated_custom)
-          << FortranMainLinkFlag
-          << "see the Flang driver documentation for correct usage";
-  }
-
-  // The --whole-archive option needs to be part of the link line to make
-  // sure that the main() function from Fortran_main.a is pulled in by the
-  // linker. However, it shouldn't be used if it's already active.
-  // TODO: Find an equivalent of `--whole-archive` for Darwin and AIX.
-  if (!isWholeArchivePresent(Args) && !TC.getTriple().isMacOSX() &&
-      !TC.getTriple().isOSAIX()) {
-    CmdArgs.push_back("--whole-archive");
-    CmdArgs.push_back(FortranMainLinkFlag);
-    CmdArgs.push_back("--no-whole-archive");
-    return;
-  }
-
-  CmdArgs.push_back(FortranMainLinkFlag);
-}
-
 /// Add Fortran runtime libs
 void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args,
                                   llvm::opt::ArgStringList &CmdArgs) {
-  // 1. Link FortranMain
-  // FortranMain depends on FortranRuntime, so needs to be listed first. If
-  // -fno-fortran-main has been passed, skip linking Fortran_main.a
-  if (!Args.hasArg(options::OPT_no_fortran_main))
-    addFortranMain(TC, Args, CmdArgs);
-
-  // 2. Link FortranRuntime and FortranDecimal
+  // Link FortranRuntime and FortranDecimal
   // These are handled earlier on Windows by telling the frontend driver to
   // add the correct libraries to link against as dependents in the object
   // file.
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 6d93c1f3d703..8955b9fb653c 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -282,7 +282,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
   assert(TC.getTriple().isKnownWindowsMSVCEnvironment() &&
          "can only add VS runtime library on Windows!");
   // if -fno-fortran-main has been passed, skip linking Fortran_main.a
-  bool LinkFortranMain = !Args.hasArg(options::OPT_no_fortran_main);
   if (TC.getTriple().isKnownWindowsMSVCEnvironment()) {
     CmdArgs.push_back(Args.MakeArgString(
         "--dependent-lib=" + TC.getCompilerRTBasename(Args, "builtins")));
@@ -300,8 +299,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
   case options::OPT__SLASH_MT:
     CmdArgs.push_back("-D_MT");
     CmdArgs.push_back("--dependent-lib=libcmt");
-    if (LinkFortranMain)
-      CmdArgs.push_back("--dependent-lib=Fortran_main.static.lib");
     CmdArgs.push_back("--dependent-lib=FortranRuntime.static.lib");
     CmdArgs.push_back("--dependent-lib=FortranDecimal.static.lib");
     break;
@@ -309,8 +306,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
     CmdArgs.push_back("-D_MT");
     CmdArgs.push_back("-D_DEBUG");
     CmdArgs.push_back("--dependent-lib=libcmtd");
-    if (LinkFortranMain)
-      CmdArgs.push_back("--dependent-lib=Fortran_main.static_dbg.lib");
     CmdArgs.push_back("--dependent-lib=FortranRuntime.static_dbg.lib");
     CmdArgs.push_back("--dependent-lib=FortranDecimal.static_dbg.lib");
     break;
@@ -318,8 +313,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
     CmdArgs.push_back("-D_MT");
     CmdArgs.push_back("-D_DLL");
     CmdArgs.push_back("--dependent-lib=msvcrt");
-    if (LinkFortranMain)
-      CmdArgs.push_back("--dependent-lib=Fortran_main.dynamic.lib");
     CmdArgs.push_back("--dependent-lib=FortranRuntime.dynamic.lib");
     CmdArgs.push_back("--dependent-lib=FortranDecimal.dynamic.lib");
     break;
@@ -328,8 +321,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
     CmdArgs.push_back("-D_DEBUG");
     CmdArgs.push_back("-D_DLL");
     CmdArgs.push_back("--dependent-lib=msvcrtd");
-    if (LinkFortranMain)
-      CmdArgs.push_back("--dependent-lib=Fortran_main.dynamic_dbg.lib");
     CmdArgs.push_back("--dependent-lib=FortranRuntime.dynamic_dbg.lib");
     CmdArgs.push_back("--dependent-lib=FortranDecimal.dynamic_dbg.lib");
     break;
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index f55b8bf48c13..9849c59685cc 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -1796,9 +1796,7 @@ selectRISCVMultilib(const MultilibSet &RISCVMultilibSet, StringRef Arch,
       }
       auto &MLConfigISAInfo = *MLConfigParseResult;
 
-      const llvm::RISCVISAInfo::OrderedExtensionMap &MLConfigArchExts =
-          MLConfigISAInfo->getExtensions();
-      for (auto MLConfigArchExt : MLConfigArchExts) {
+      for (auto &MLConfigArchExt : MLConfigISAInfo->getExtensions()) {
         auto ExtName = MLConfigArchExt.first;
         NewMultilib.flag(Twine("-", ExtName).str());
 
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index fb65881061ef..db2c20d7b461 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -842,25 +842,6 @@ void Linux::addProfileRTLibs(const llvm::opt::ArgList &Args,
   ToolChain::addProfileRTLibs(Args, CmdArgs);
 }
 
-llvm::DenormalMode
-Linux::getDefaultDenormalModeForType(const llvm::opt::ArgList &DriverArgs,
-                                     const JobAction &JA,
-                                     const llvm::fltSemantics *FPType) const {
-  switch (getTriple().getArch()) {
-  case llvm::Triple::x86:
-  case llvm::Triple::x86_64: {
-    std::string Unused;
-    // DAZ and FTZ are turned on in crtfastmath.o
-    if (!DriverArgs.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles) &&
-        isFastMathRuntimeAvailable(DriverArgs, Unused))
-      return llvm::DenormalMode::getPreserveSign();
-    return llvm::DenormalMode::getIEEE();
-  }
-  default:
-    return llvm::DenormalMode::getIEEE();
-  }
-}
-
 void Linux::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const {
   for (const auto &Opt : ExtraOpts)
     CmdArgs.push_back(Opt.c_str());
diff --git a/clang/lib/Driver/ToolChains/Linux.h b/clang/lib/Driver/ToolChains/Linux.h
index 524391743090..2d9e674e50a6 100644
--- a/clang/lib/Driver/ToolChains/Linux.h
+++ b/clang/lib/Driver/ToolChains/Linux.h
@@ -59,10 +59,6 @@ public:
 
   std::vector<std::string> ExtraOpts;
 
-  llvm::DenormalMode getDefaultDenormalModeForType(
-      const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
-      const llvm::fltSemantics *FPType = nullptr) const override;
-
   const char *getDefaultLinker() const override;
 
 protected:
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index ccb2c9190e2e..c8d8ec3afbd9 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -807,7 +807,6 @@ template <> struct MappingTraits<FormatStyle> {
         FormatStyle PredefinedStyle;
         if (getPredefinedStyle(StyleName, Style.Language, &PredefinedStyle) &&
             Style == PredefinedStyle) {
-          IO.mapOptional("# BasedOnStyle", StyleName);
           BasedOnStyle = StyleName;
           break;
         }
@@ -3117,6 +3116,7 @@ static void sortCppIncludes(const FormatStyle &Style,
     return;
   }
 
+  const auto OldCursor = Cursor ? *Cursor : 0;
   std::string result;
   for (unsigned Index : Indices) {
     if (!result.empty()) {
@@ -3140,6 +3140,8 @@ static void sortCppIncludes(const FormatStyle &Style,
   // the entire range of blocks. Otherwise, no replacement is generated.
   if (replaceCRLF(result) == replaceCRLF(std::string(Code.substr(
                                  IncludesBeginOffset, IncludesBlockSize)))) {
+    if (Cursor)
+      *Cursor = OldCursor;
     return;
   }
 
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index f651e6228c20..28b6488e54a4 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -1623,10 +1623,10 @@ struct AdditionalKeywords {
   IdentifierInfo *kw_then;
 
   /// Returns \c true if \p Tok is a keyword or an identifier.
-  bool isWordLike(const FormatToken &Tok) const {
+  bool isWordLike(const FormatToken &Tok, bool IsVerilog = true) const {
     // getIdentifierinfo returns non-null for keywords as well as identifiers.
     return Tok.Tok.getIdentifierInfo() &&
-           !Tok.isOneOf(kw_verilogHash, kw_verilogHashHash, kw_apostrophe);
+           (!IsVerilog || !isVerilogKeywordSymbol(Tok));
   }
 
   /// Returns \c true if \p Tok is a true JavaScript identifier, returns
@@ -1755,6 +1755,10 @@ struct AdditionalKeywords {
     }
   }
 
+  bool isVerilogKeywordSymbol(const FormatToken &Tok) const {
+    return Tok.isOneOf(kw_verilogHash, kw_verilogHashHash, kw_apostrophe);
+  }
+
   bool isVerilogWordOperator(const FormatToken &Tok) const {
     return Tok.isOneOf(kw_before, kw_intersect, kw_dist, kw_iff, kw_inside,
                        kw_with);
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index cdfb4256e41d..d366ae2080bc 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -4780,9 +4780,14 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
   if (Left.Finalized)
     return Right.hasWhitespaceBefore();
 
+  const bool IsVerilog = Style.isVerilog();
+  assert(!IsVerilog || !IsCpp);
+
   // Never ever merge two words.
-  if (Keywords.isWordLike(Right) && Keywords.isWordLike(Left))
+  if (Keywords.isWordLike(Right, IsVerilog) &&
+      Keywords.isWordLike(Left, IsVerilog)) {
     return true;
+  }
 
   // Leave a space between * and /* to avoid C4138 `comment end` found outside
   // of comment.
@@ -4834,10 +4839,8 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
         Right.is(TT_TemplateOpener)) {
       return true;
     }
-    if (Left.is(tok::identifier) && Right.is(tok::numeric_constant) &&
-        Right.TokenText[0] == '.') {
-      return false;
-    }
+    if (Left.Tok.getIdentifierInfo() && Right.is(tok::numeric_constant))
+      return Right.TokenText[0] != '.';
   } else if (Style.isProto()) {
     if (Right.is(tok::period) &&
         Left.isOneOf(Keywords.kw_optional, Keywords.kw_required,
@@ -5065,12 +5068,10 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
         Right.is(TT_TemplateOpener)) {
       return true;
     }
-  } else if (Style.isVerilog()) {
+  } else if (IsVerilog) {
     // An escaped identifier ends with whitespace.
-    if (Style.isVerilog() && Left.is(tok::identifier) &&
-        Left.TokenText[0] == '\\') {
+    if (Left.is(tok::identifier) && Left.TokenText[0] == '\\')
       return true;
-    }
     // Add space between things in a primitive's state table unless in a
     // transition like `(0?)`.
     if ((Left.is(TT_VerilogTableItem) &&
@@ -5266,21 +5267,11 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
     return true;
   }
   if (Left.is(TT_UnaryOperator)) {
-    if (Right.isNot(tok::l_paren)) {
-      // The alternative operators for ~ and ! are "compl" and "not".
-      // If they are used instead, we do not want to combine them with
-      // the token to the right, unless that is a left paren.
-      if (Left.is(tok::exclaim) && Left.TokenText == "not")
-        return true;
-      if (Left.is(tok::tilde) && Left.TokenText == "compl")
-        return true;
-      // Lambda captures allow for a lone &, so "&]" needs to be properly
-      // handled.
-      if (Left.is(tok::amp) && Right.is(tok::r_square))
-        return Style.SpacesInSquareBrackets;
-    }
-    return (Style.SpaceAfterLogicalNot && Left.is(tok::exclaim)) ||
-           Right.is(TT_BinaryOperator);
+    // Lambda captures allow for a lone &, so "&]" needs to be properly
+    // handled.
+    if (Left.is(tok::amp) && Right.is(tok::r_square))
+      return Style.SpacesInSquareBrackets;
+    return Style.SpaceAfterLogicalNot && Left.is(tok::exclaim);
   }
 
   // If the next token is a binary operator or a selector name, we have
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 6e4e6901e473..854428389740 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -534,11 +534,11 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) {
     case tok::r_brace:
       if (LBraceStack.empty())
         break;
-      if (LBraceStack.back().Tok->is(BK_Unknown)) {
+      if (auto *LBrace = LBraceStack.back().Tok; LBrace->is(BK_Unknown)) {
         bool ProbablyBracedList = false;
         if (Style.Language == FormatStyle::LK_Proto) {
           ProbablyBracedList = NextTok->isOneOf(tok::comma, tok::r_square);
-        } else {
+        } else if (LBrace->isNot(TT_EnumLBrace)) {
           // Using OriginalColumn to distinguish between ObjC methods and
           // binary operators is a bit hacky.
           bool NextIsObjCMethod = NextTok->isOneOf(tok::plus, tok::minus) &&
@@ -552,7 +552,7 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) {
 
           // If we already marked the opening brace as braced list, the closing
           // must also be part of it.
-          ProbablyBracedList = LBraceStack.back().Tok->is(TT_BracedListLBrace);
+          ProbablyBracedList = LBrace->is(TT_BracedListLBrace);
 
           ProbablyBracedList = ProbablyBracedList ||
                                (Style.isJavaScript() &&
@@ -608,13 +608,9 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) {
             ProbablyBracedList = true;
           }
         }
-        if (ProbablyBracedList) {
-          Tok->setBlockKind(BK_BracedInit);
-          LBraceStack.back().Tok->setBlockKind(BK_BracedInit);
-        } else {
-          Tok->setBlockKind(BK_Block);
-          LBraceStack.back().Tok->setBlockKind(BK_Block);
-        }
+        const auto BlockKind = ProbablyBracedList ? BK_BracedInit : BK_Block;
+        Tok->setBlockKind(BlockKind);
+        LBrace->setBlockKind(BlockKind);
       }
       LBraceStack.pop_back();
       break;
@@ -2418,6 +2414,7 @@ bool UnwrappedLineParser::tryToParseChildBlock() {
 }
 
 bool UnwrappedLineParser::parseBracedList(bool IsAngleBracket, bool IsEnum) {
+  assert(!IsAngleBracket || !IsEnum);
   bool HasError = false;
 
   // FIXME: Once we have an expression parser in the UnwrappedLineParser,
@@ -2440,8 +2437,11 @@ bool UnwrappedLineParser::parseBracedList(bool IsAngleBracket, bool IsEnum) {
       }
     }
     if (FormatTok->is(IsAngleBracket ? tok::greater : tok::r_brace)) {
-      if (IsEnum && !Style.AllowShortEnumsOnASingleLine)
-        addUnwrappedLine();
+      if (IsEnum) {
+        FormatTok->setBlockKind(BK_Block);
+        if (!Style.AllowShortEnumsOnASingleLine)
+          addUnwrappedLine();
+      }
       nextToken();
       return !HasError;
     }
@@ -3944,8 +3944,11 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) {
     switch (FormatTok->Tok.getKind()) {
     case tok::l_paren:
       // We can have macros in between 'class' and the class name.
-      if (!IsNonMacroIdentifier(Previous))
+      if (!IsNonMacroIdentifier(Previous) ||
+          // e.g. `struct macro(a) S { int i; };`
+          Previous->Previous == &InitialToken) {
         parseParens();
+      }
       break;
     case tok::coloncolon:
       break;
diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
index 4f822807dd98..44fd807ec27e 100644
--- a/clang/lib/Format/WhitespaceManager.cpp
+++ b/clang/lib/Format/WhitespaceManager.cpp
@@ -128,11 +128,14 @@ const tooling::Replacements &WhitespaceManager::generateReplacements() {
 void WhitespaceManager::calculateLineBreakInformation() {
   Changes[0].PreviousEndOfTokenColumn = 0;
   Change *LastOutsideTokenChange = &Changes[0];
-  for (unsigned i = 1, e = Changes.size(); i != e; ++i) {
+  for (unsigned I = 1, e = Changes.size(); I != e; ++I) {
+    auto &C = Changes[I];
+    auto &P = Changes[I - 1];
+    auto &PrevTokLength = P.TokenLength;
     SourceLocation OriginalWhitespaceStart =
-        Changes[i].OriginalWhitespaceRange.getBegin();
+        C.OriginalWhitespaceRange.getBegin();
     SourceLocation PreviousOriginalWhitespaceEnd =
-        Changes[i - 1].OriginalWhitespaceRange.getEnd();
+        P.OriginalWhitespaceRange.getEnd();
     unsigned OriginalWhitespaceStartOffset =
         SourceMgr.getFileOffset(OriginalWhitespaceStart);
     unsigned PreviousOriginalWhitespaceEndOffset =
@@ -167,31 +170,28 @@ void WhitespaceManager::calculateLineBreakInformation() {
     // line of the token.
     auto NewlinePos = Text.find_first_of('\n');
     if (NewlinePos == StringRef::npos) {
-      Changes[i - 1].TokenLength = OriginalWhitespaceStartOffset -
-                                   PreviousOriginalWhitespaceEndOffset +
-                                   Changes[i].PreviousLinePostfix.size() +
-                                   Changes[i - 1].CurrentLinePrefix.size();
+      PrevTokLength = OriginalWhitespaceStartOffset -
+                      PreviousOriginalWhitespaceEndOffset +
+                      C.PreviousLinePostfix.size() + P.CurrentLinePrefix.size();
+      if (!P.IsInsideToken)
+        PrevTokLength = std::min(PrevTokLength, P.Tok->ColumnWidth);
     } else {
-      Changes[i - 1].TokenLength =
-          NewlinePos + Changes[i - 1].CurrentLinePrefix.size();
+      PrevTokLength = NewlinePos + P.CurrentLinePrefix.size();
     }
 
     // If there are multiple changes in this token, sum up all the changes until
     // the end of the line.
-    if (Changes[i - 1].IsInsideToken && Changes[i - 1].NewlinesBefore == 0) {
-      LastOutsideTokenChange->TokenLength +=
-          Changes[i - 1].TokenLength + Changes[i - 1].Spaces;
-    } else {
-      LastOutsideTokenChange = &Changes[i - 1];
-    }
+    if (P.IsInsideToken && P.NewlinesBefore == 0)
+      LastOutsideTokenChange->TokenLength += PrevTokLength + P.Spaces;
+    else
+      LastOutsideTokenChange = &P;
 
-    Changes[i].PreviousEndOfTokenColumn =
-        Changes[i - 1].StartOfTokenColumn + Changes[i - 1].TokenLength;
+    C.PreviousEndOfTokenColumn = P.StartOfTokenColumn + PrevTokLength;
 
-    Changes[i - 1].IsTrailingComment =
-        (Changes[i].NewlinesBefore > 0 || Changes[i].Tok->is(tok::eof) ||
-         (Changes[i].IsInsideToken && Changes[i].Tok->is(tok::comment))) &&
-        Changes[i - 1].Tok->is(tok::comment) &&
+    P.IsTrailingComment =
+        (C.NewlinesBefore > 0 || C.Tok->is(tok::eof) ||
+         (C.IsInsideToken && C.Tok->is(tok::comment))) &&
+        P.Tok->is(tok::comment) &&
         // FIXME: This is a dirty hack. The problem is that
         // BreakableLineCommentSection does comment reflow changes and here is
         // the aligning of trailing comments. Consider the case where we reflow
diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp
index 3610a08831e7..1b93588553a2 100644
--- a/clang/lib/Frontend/ASTUnit.cpp
+++ b/clang/lib/Frontend/ASTUnit.cpp
@@ -1067,7 +1067,7 @@ public:
 
   std::vector<Decl *> takeTopLevelDecls() { return std::move(TopLevelDecls); }
 
-  std::vector<serialization::DeclID> takeTopLevelDeclIDs() {
+  std::vector<LocalDeclID> takeTopLevelDeclIDs() {
     return std::move(TopLevelDeclIDs);
   }
 
@@ -1101,7 +1101,7 @@ public:
 private:
   unsigned Hash = 0;
   std::vector<Decl *> TopLevelDecls;
-  std::vector<serialization::DeclID> TopLevelDeclIDs;
+  std::vector<LocalDeclID> TopLevelDeclIDs;
   llvm::SmallVector<ASTUnit::StandaloneDiagnostic, 4> PreambleDiags;
 };
 
@@ -1467,11 +1467,12 @@ void ASTUnit::RealizeTopLevelDeclsFromPreamble() {
 
   std::vector<Decl *> Resolved;
   Resolved.reserve(TopLevelDeclsInPreamble.size());
-  ExternalASTSource &Source = *getASTContext().getExternalSource();
+  // The module file of the preamble.
+  serialization::ModuleFile &MF = Reader->getModuleManager().getPrimaryModule();
   for (const auto TopLevelDecl : TopLevelDeclsInPreamble) {
     // Resolve the declaration ID to an actual declaration, possibly
     // deserializing the declaration in the process.
-    if (Decl *D = Source.GetExternalDecl(TopLevelDecl))
+    if (Decl *D = Reader->GetDecl(Reader->getGlobalDeclID(MF, TopLevelDecl)))
       Resolved.push_back(D);
   }
   TopLevelDeclsInPreamble.clear();
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 6e3baf838644..66a45b888f15 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -1293,6 +1293,10 @@ compileModuleImpl(CompilerInstance &ImportingInstance, SourceLocation ImportLoc,
                                             diag::remark_module_build_done)
     << ModuleName;
 
+  // Propagate the statistics to the parent FileManager.
+  if (!FrontendOpts.ModulesShareFileManager)
+    ImportingInstance.getFileManager().AddStats(Instance.getFileManager());
+
   if (Crashed) {
     // Clear the ASTConsumer if it hasn't been already, in case it owns streams
     // that must be closed before clearing output files.
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 8236051e30c4..8312abc36039 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -2549,6 +2549,7 @@ static const auto &getFrontendActionTable() {
       {frontend::DumpTokens, OPT_dump_tokens},
       {frontend::EmitAssembly, OPT_S},
       {frontend::EmitBC, OPT_emit_llvm_bc},
+      {frontend::EmitCIR, OPT_emit_cir},
       {frontend::EmitHTML, OPT_emit_html},
       {frontend::EmitLLVM, OPT_emit_llvm},
       {frontend::EmitLLVMOnly, OPT_emit_llvm_only},
@@ -2891,6 +2892,8 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
   if (Opts.ProgramAction != frontend::GenerateModule && Opts.IsSystemModule)
     Diags.Report(diag::err_drv_argument_only_allowed_with) << "-fsystem-module"
                                                            << "-emit-module";
+  if (Args.hasArg(OPT_fclangir) || Args.hasArg(OPT_emit_cir))
+    Opts.UseClangIRPipeline = true;
 
   if (Args.hasArg(OPT_aux_target_cpu))
     Opts.AuxTargetCPU = std::string(Args.getLastArgValue(OPT_aux_target_cpu));
@@ -4337,6 +4340,7 @@ static bool isStrictlyPreprocessorAction(frontend::ActionKind Action) {
   case frontend::ASTView:
   case frontend::EmitAssembly:
   case frontend::EmitBC:
+  case frontend::EmitCIR:
   case frontend::EmitHTML:
   case frontend::EmitLLVM:
   case frontend::EmitLLVMOnly:
diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp
index b7c9967316f0..9ae7664b4b49 100644
--- a/clang/lib/Frontend/FrontendAction.cpp
+++ b/clang/lib/Frontend/FrontendAction.cpp
@@ -80,7 +80,7 @@ public:
     if (Previous)
       Previous->TypeRead(Idx, T);
   }
-  void DeclRead(serialization::DeclID ID, const Decl *D) override {
+  void DeclRead(GlobalDeclID ID, const Decl *D) override {
     if (Previous)
       Previous->DeclRead(ID, D);
   }
@@ -102,7 +102,7 @@ public:
                                    bool DeletePrevious)
       : DelegatingDeserializationListener(Previous, DeletePrevious) {}
 
-  void DeclRead(serialization::DeclID ID, const Decl *D) override {
+  void DeclRead(GlobalDeclID ID, const Decl *D) override {
     llvm::outs() << "PCH DECL: " << D->getDeclKindName();
     if (const NamedDecl *ND = dyn_cast<NamedDecl>(D)) {
       llvm::outs() << " - ";
@@ -128,7 +128,7 @@ public:
       : DelegatingDeserializationListener(Previous, DeletePrevious), Ctx(Ctx),
         NamesToCheck(NamesToCheck) {}
 
-  void DeclRead(serialization::DeclID ID, const Decl *D) override {
+  void DeclRead(GlobalDeclID ID, const Decl *D) override {
     if (const NamedDecl *ND = dyn_cast<NamedDecl>(D))
       if (NamesToCheck.find(ND->getNameAsString()) != NamesToCheck.end()) {
         unsigned DiagID
@@ -757,8 +757,11 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI,
 
   // IR files bypass the rest of initialization.
   if (Input.getKind().getLanguage() == Language::LLVM_IR) {
-    assert(hasIRSupport() &&
-           "This action does not have IR file support!");
+    if (!hasIRSupport()) {
+      CI.getDiagnostics().Report(diag::err_ast_action_on_llvm_ir)
+          << Input.getFile();
+      return false;
+    }
 
     // Inform the diagnostic client we are processing a source file.
     CI.getDiagnosticClient().BeginSourceFile(CI.getLangOpts(), nullptr);
diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp
index 04eb10413267..454653a31534 100644
--- a/clang/lib/Frontend/FrontendActions.cpp
+++ b/clang/lib/Frontend/FrontendActions.cpp
@@ -272,14 +272,10 @@ bool GenerateModuleInterfaceAction::BeginSourceFileAction(
 std::unique_ptr<ASTConsumer>
 GenerateModuleInterfaceAction::CreateASTConsumer(CompilerInstance &CI,
                                                  StringRef InFile) {
-  CI.getHeaderSearchOpts().ModulesSkipDiagnosticOptions = true;
-  CI.getHeaderSearchOpts().ModulesSkipHeaderSearchPaths = true;
-  CI.getHeaderSearchOpts().ModulesSkipPragmaDiagnosticMappings = true;
-
-  std::vector<std::unique_ptr<ASTConsumer>> Consumers =
-      CreateMultiplexConsumer(CI, InFile);
-  if (Consumers.empty())
-    return nullptr;
+  std::vector<std::unique_ptr<ASTConsumer>> Consumers;
+  Consumers.push_back(std::make_unique<CXX20ModulesGenerator>(
+      CI.getPreprocessor(), CI.getModuleCache(),
+      CI.getFrontendOpts().OutputFile));
 
   if (CI.getFrontendOpts().GenReducedBMI &&
       !CI.getFrontendOpts().ModuleOutputPath.empty()) {
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 4f44c3b7b89d..c1d209466ffe 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -389,8 +389,7 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
                         Twine((unsigned)LangOpts.getHLSLVersion()));
 
     if (LangOpts.NativeHalfType)
-      Builder.defineMacro("__HLSL_ENABLE_16_BIT",
-                          Twine((unsigned)LangOpts.getHLSLVersion()));
+      Builder.defineMacro("__HLSL_ENABLE_16_BIT", "1");
 
     // Shader target information
     // "enums" for shader stages
@@ -704,7 +703,7 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts,
     Builder.defineMacro("__cpp_nested_namespace_definitions", "201411L");
     Builder.defineMacro("__cpp_variadic_using", "201611L");
     Builder.defineMacro("__cpp_aggregate_bases", "201603L");
-    Builder.defineMacro("__cpp_structured_bindings", "201606L");
+    Builder.defineMacro("__cpp_structured_bindings", "202403L");
     Builder.defineMacro("__cpp_nontype_template_args",
                         "201411L"); // (not latest)
     Builder.defineMacro("__cpp_fold_expressions", "201603L");
@@ -1309,6 +1308,16 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
     Builder.defineMacro("__GCC_ATOMIC_TEST_AND_SET_TRUEVAL", "1");
   }
 
+  // GCC defines these macros in both C and C++ modes despite them being needed
+  // mostly for STL implementations in C++.
+  auto [Destructive, Constructive] = TI.hardwareInterferenceSizes();
+  Builder.defineMacro("__GCC_DESTRUCTIVE_SIZE", Twine(Destructive));
+  Builder.defineMacro("__GCC_CONSTRUCTIVE_SIZE", Twine(Constructive));
+  // We need to use push_macro to allow users to redefine these macros from the
+  // command line with -D and not issue a -Wmacro-redefined warning.
+  Builder.append("#pragma push_macro(\"__GCC_DESTRUCTIVE_SIZE\")");
+  Builder.append("#pragma push_macro(\"__GCC_CONSTRUCTIVE_SIZE\")");
+
   auto addLockFreeMacros = [&](const llvm::Twine &Prefix) {
     // Used by libc++ and libstdc++ to implement ATOMIC_<foo>_LOCK_FREE.
 #define DEFINE_LOCK_FREE_MACRO(TYPE, Type)                                     \
diff --git a/clang/lib/Frontend/MultiplexConsumer.cpp b/clang/lib/Frontend/MultiplexConsumer.cpp
index 744ea70cc24d..c74bfd86195f 100644
--- a/clang/lib/Frontend/MultiplexConsumer.cpp
+++ b/clang/lib/Frontend/MultiplexConsumer.cpp
@@ -52,8 +52,8 @@ void MultiplexASTDeserializationListener::TypeRead(
     Listeners[i]->TypeRead(Idx, T);
 }
 
-void MultiplexASTDeserializationListener::DeclRead(
-    serialization::DeclID ID, const Decl *D) {
+void MultiplexASTDeserializationListener::DeclRead(GlobalDeclID ID,
+                                                   const Decl *D) {
   for (size_t i = 0, e = Listeners.size(); i != e; ++i)
     Listeners[i]->DeclRead(ID, D);
 }
diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
index f85f0365616f..7476b1076d10 100644
--- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
+++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
@@ -53,6 +53,8 @@ CreateFrontendBaseAction(CompilerInstance &CI) {
   case DumpTokens:             return std::make_unique<DumpTokensAction>();
   case EmitAssembly:           return std::make_unique<EmitAssemblyAction>();
   case EmitBC:                 return std::make_unique<EmitBCAction>();
+  case EmitCIR:
+    llvm_unreachable("CIR suppport not built into clang");
   case EmitHTML:               return std::make_unique<HTMLPrintAction>();
   case EmitLLVM:               return std::make_unique<EmitLLVMAction>();
   case EmitLLVMOnly:           return std::make_unique<EmitLLVMOnlyAction>();
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index e6ae4e19e81d..3416811e39de 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -335,6 +335,10 @@ set(llvm_libc_wrapper_files
   llvm_libc_wrappers/time.h
 )
 
+set(zos_wrapper_files
+  zos_wrappers/builtins.h
+)
+
 include(GetClangResourceDir)
 get_clang_resource_dir(output_dir PREFIX ${LLVM_LIBRARY_OUTPUT_INTDIR}/.. SUBDIR include)
 set(out_files)
@@ -370,7 +374,7 @@ endfunction(clang_generate_header)
 
 # Copy header files from the source directory to the build directory
 foreach( f ${files} ${cuda_wrapper_files} ${cuda_wrapper_bits_files}
-           ${ppc_wrapper_files} ${openmp_wrapper_files} ${hlsl_files}
+           ${ppc_wrapper_files} ${openmp_wrapper_files} ${zos_wrapper_files} ${hlsl_files}
            ${llvm_libc_wrapper_files})
   copy_header_to_output_dir(${CMAKE_CURRENT_SOURCE_DIR} ${f})
 endforeach( f )
@@ -487,7 +491,7 @@ add_header_target("mips-resource-headers" "${mips_msa_files}")
 add_header_target("ppc-resource-headers" "${ppc_files};${ppc_wrapper_files}")
 add_header_target("ppc-htm-resource-headers" "${ppc_htm_files}")
 add_header_target("riscv-resource-headers" "${riscv_files};${riscv_generated_files}")
-add_header_target("systemz-resource-headers" "${systemz_files}")
+add_header_target("systemz-resource-headers" "${systemz_files};${zos_wrapper_files}")
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
 add_header_target("x86-resource-headers" "${x86_files}")
@@ -538,6 +542,11 @@ install(
   DESTINATION ${header_install_dir}/openmp_wrappers
   COMPONENT clang-resource-headers)
 
+install(
+  FILES ${zos_wrapper_files}
+  DESTINATION ${header_install_dir}/zos_wrappers
+  COMPONENT clang-resource-headers)
+
 #############################################################
 # Install rules for separate header lists
 install(
@@ -643,6 +652,12 @@ install(
   COMPONENT systemz-resource-headers)
 
 install(
+  FILES ${zos_wrapper_files}
+  DESTINATION  ${header_install_dir}/zos_wrappers
+  EXCLUDE_FROM_ALL
+  COMPONENT systemz-resource-headers)
+
+install(
   FILES ${ve_files}
   DESTINATION ${header_install_dir}
   EXCLUDE_FROM_ALL
diff --git a/clang/lib/Headers/builtins.h b/clang/lib/Headers/builtins.h
index 65095861ca9b..1e534e632c8e 100644
--- a/clang/lib/Headers/builtins.h
+++ b/clang/lib/Headers/builtins.h
@@ -13,4 +13,7 @@
 #ifndef __BUILTINS_H
 #define __BUILTINS_H
 
+#if defined(__MVS__) && __has_include_next(<builtins.h>)
+#include_next <builtins.h>
+#endif /* __MVS__ */
 #endif /* __BUILTINS_H */
diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h
index 0bb9912b465f..bb7692efb78f 100644
--- a/clang/lib/Headers/cpuid.h
+++ b/clang/lib/Headers/cpuid.h
@@ -10,7 +10,7 @@
 #ifndef __CPUID_H
 #define __CPUID_H
 
-#if !(__x86_64__ || __i386__)
+#if !defined(__x86_64__) && !defined(__i386__)
 #error this header is for x86 only
 #endif
 
@@ -256,7 +256,7 @@
 #define bit_AVX10_256   0x00020000
 #define bit_AVX10_512   0x00040000
 
-#if __i386__
+#ifdef __i386__
 #define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
     __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
                   : "0"(__leaf))
@@ -285,7 +285,7 @@ static __inline unsigned int __get_cpuid_max (unsigned int __leaf,
                                               unsigned int *__sig)
 {
     unsigned int __eax, __ebx, __ecx, __edx;
-#if __i386__
+#ifdef __i386__
     int __cpuid_supported;
 
     __asm("  pushfl\n"
diff --git a/clang/lib/Headers/float.h b/clang/lib/Headers/float.h
index 0e73bca0a2d6..642c8f06cc93 100644
--- a/clang/lib/Headers/float.h
+++ b/clang/lib/Headers/float.h
@@ -10,6 +10,10 @@
 #ifndef __CLANG_FLOAT_H
 #define __CLANG_FLOAT_H
 
+#if defined(__MVS__) && __has_include_next(<float.h>)
+#include_next <float.h>
+#else
+
 /* If we're on MinGW, fall back to the system's float.h, which might have
  * additional definitions provided for Windows.
  * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
@@ -165,4 +169,5 @@
 #  define FLT16_TRUE_MIN    __FLT16_TRUE_MIN__
 #endif /* __STDC_WANT_IEC_60559_TYPES_EXT__ */
 
+#endif /* __MVS__ */
 #endif /* __CLANG_FLOAT_H */
diff --git a/clang/lib/Headers/inttypes.h b/clang/lib/Headers/inttypes.h
index 1c894c4aca49..5150d22f8b2e 100644
--- a/clang/lib/Headers/inttypes.h
+++ b/clang/lib/Headers/inttypes.h
@@ -13,6 +13,9 @@
 #if !defined(_AIX) || !defined(_STD_TYPES_T)
 #define __CLANG_INTTYPES_H
 #endif
+#if defined(__MVS__) && __has_include_next(<inttypes.h>)
+#include_next <inttypes.h>
+#else
 
 #if defined(_MSC_VER) && _MSC_VER < 1800
 #error MSVC does not have inttypes.h prior to Visual Studio 2013
@@ -94,4 +97,5 @@
 #define SCNxFAST32 "x"
 #endif
 
+#endif /* __MVS__ */
 #endif /* __CLANG_INTTYPES_H */
diff --git a/clang/lib/Headers/iso646.h b/clang/lib/Headers/iso646.h
index e0a20c6f1891..b53fcd9b4e53 100644
--- a/clang/lib/Headers/iso646.h
+++ b/clang/lib/Headers/iso646.h
@@ -9,6 +9,9 @@
 
 #ifndef __ISO646_H
 #define __ISO646_H
+#if defined(__MVS__) && __has_include_next(<iso646.h>)
+#include_next <iso646.h>
+#else
 
 #ifndef __cplusplus
 #define and    &&
@@ -24,4 +27,5 @@
 #define xor_eq ^=
 #endif
 
+#endif /* __MVS__ */
 #endif /* __ISO646_H */
diff --git a/clang/lib/Headers/limits.h b/clang/lib/Headers/limits.h
index 15e6bbe0abcf..56dffe568486 100644
--- a/clang/lib/Headers/limits.h
+++ b/clang/lib/Headers/limits.h
@@ -9,6 +9,10 @@
 #ifndef __CLANG_LIMITS_H
 #define __CLANG_LIMITS_H
 
+#if defined(__MVS__) && __has_include_next(<limits.h>)
+#include_next <limits.h>
+#else
+
 /* The system's limits.h may, in turn, try to #include_next GCC's limits.h.
    Avert this #include_next madness. */
 #if defined __GNUC__ && !defined _GCC_LIMITS_H_
@@ -122,4 +126,5 @@
 #define ULONG_LONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
 #endif
 
+#endif /* __MVS__ */
 #endif /* __CLANG_LIMITS_H */
diff --git a/clang/lib/Headers/stdalign.h b/clang/lib/Headers/stdalign.h
index 158508e65d2b..56cdfa52d4ba 100644
--- a/clang/lib/Headers/stdalign.h
+++ b/clang/lib/Headers/stdalign.h
@@ -10,6 +10,10 @@
 #ifndef __STDALIGN_H
 #define __STDALIGN_H
 
+#if defined(__MVS__) && __has_include_next(<stdalign.h>)
+#include_next <stdalign.h>
+#else
+
 #if defined(__cplusplus) ||                                                    \
     (defined(__STDC_VERSION__) && __STDC_VERSION__ < 202311L)
 #ifndef __cplusplus
@@ -21,4 +25,5 @@
 #define __alignof_is_defined 1
 #endif /* __STDC_VERSION__ */
 
+#endif /* __MVS__ */
 #endif /* __STDALIGN_H */
diff --git a/clang/lib/Headers/stdarg.h b/clang/lib/Headers/stdarg.h
index 94b066566f08..6e7bd604b2df 100644
--- a/clang/lib/Headers/stdarg.h
+++ b/clang/lib/Headers/stdarg.h
@@ -33,6 +33,16 @@
     defined(__need_va_arg) || defined(__need___va_copy) ||                     \
     defined(__need_va_copy)
 
+#if defined(__MVS__) && __has_include_next(<stdarg.h>)
+#define __STDARG_H
+#undef __need___va_list
+#undef __need_va_list
+#undef __need_va_arg
+#undef __need___va_copy
+#undef __need_va_copy
+#include_next <stdarg.h>
+
+#else
 #if !defined(__need___va_list) && !defined(__need_va_list) &&                  \
     !defined(__need_va_arg) && !defined(__need___va_copy) &&                   \
     !defined(__need_va_copy)
@@ -76,4 +86,6 @@
 #undef __need_va_copy
 #endif /* defined(__need_va_copy) */
 
+#endif /* __MVS__ */
+
 #endif
diff --git a/clang/lib/Headers/stdbool.h b/clang/lib/Headers/stdbool.h
index 9406aab0ca72..dfaad2b65a9b 100644
--- a/clang/lib/Headers/stdbool.h
+++ b/clang/lib/Headers/stdbool.h
@@ -12,6 +12,10 @@
 
 #define __bool_true_false_are_defined 1
 
+#if defined(__MVS__) && __has_include_next(<stdbool.h>)
+#include_next <stdbool.h>
+#else
+
 #if defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L
 /* FIXME: We should be issuing a deprecation warning here, but cannot yet due
  * to system headers which include this header file unconditionally.
@@ -31,4 +35,5 @@
 #endif
 #endif
 
+#endif /* __MVS__ */
 #endif /* __STDBOOL_H */
diff --git a/clang/lib/Headers/stddef.h b/clang/lib/Headers/stddef.h
index e0ad7b8d17af..9ccc0a68fbff 100644
--- a/clang/lib/Headers/stddef.h
+++ b/clang/lib/Headers/stddef.h
@@ -36,6 +36,22 @@
     defined(__need_unreachable) || defined(__need_max_align_t) ||              \
     defined(__need_offsetof) || defined(__need_wint_t)
 
+#if defined(__MVS__) && __has_include_next(<stddef.h>)
+#define __STDDEF_H
+#undef __need_ptrdiff_t
+#undef __need_size_t
+#undef __need_rsize_t
+#undef __need_wchar_t
+#undef __need_NULL
+#undef __need_nullptr_t
+#undef __need_unreachable
+#undef __need_max_align_t
+#undef __need_offsetof
+#undef __need_wint_t
+#include_next <stddef.h>
+
+#else
+
 #if !defined(__need_ptrdiff_t) && !defined(__need_size_t) &&                   \
     !defined(__need_rsize_t) && !defined(__need_wchar_t) &&                    \
     !defined(__need_NULL) && !defined(__need_nullptr_t) &&                     \
@@ -120,4 +136,5 @@ __WINT_TYPE__ directly; accommodate both by requiring __need_wint_t */
 #undef __need_wint_t
 #endif /* __need_wint_t */
 
+#endif /* __MVS__ */
 #endif
diff --git a/clang/lib/Headers/stdint.h b/clang/lib/Headers/stdint.h
index b6699b6ca3d4..01feab7b1ee2 100644
--- a/clang/lib/Headers/stdint.h
+++ b/clang/lib/Headers/stdint.h
@@ -14,6 +14,10 @@
 #define __CLANG_STDINT_H
 #endif
 
+#if defined(__MVS__) && __has_include_next(<stdint.h>)
+#include_next <stdint.h>
+#else
+
 /* If we're hosted, fall back to the system's stdint.h, which might have
  * additional definitions.
  */
@@ -947,4 +951,5 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 #endif
 
 #endif /* __STDC_HOSTED__ */
+#endif /* __MVS__ */
 #endif /* __CLANG_STDINT_H */
diff --git a/clang/lib/Headers/stdnoreturn.h b/clang/lib/Headers/stdnoreturn.h
index c90bf77e840e..6a9b209c7218 100644
--- a/clang/lib/Headers/stdnoreturn.h
+++ b/clang/lib/Headers/stdnoreturn.h
@@ -10,9 +10,15 @@
 #ifndef __STDNORETURN_H
 #define __STDNORETURN_H
 
+#if defined(__MVS__) && __has_include_next(<stdnoreturn.h>)
+#include_next <stdnoreturn.h>
+#else
+
 #define noreturn _Noreturn
 #define __noreturn_is_defined 1
 
+#endif /* __MVS__ */
+
 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) &&               \
     !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
 /* The noreturn macro is deprecated in C23. We do not mark it as such because
diff --git a/clang/lib/Headers/varargs.h b/clang/lib/Headers/varargs.h
index d241b7de3cb2..d33ddc5ae7f8 100644
--- a/clang/lib/Headers/varargs.h
+++ b/clang/lib/Headers/varargs.h
@@ -8,5 +8,9 @@
 */
 #ifndef __VARARGS_H
 #define __VARARGS_H
-  #error "Please use <stdarg.h> instead of <varargs.h>"
+#if defined(__MVS__) && __has_include_next(<varargs.h>)
+#include_next <varargs.h>
+#else
+#error "Please use <stdarg.h> instead of <varargs.h>"
+#endif /* __MVS__ */
 #endif
diff --git a/clang/lib/Headers/zos_wrappers/builtins.h b/clang/lib/Headers/zos_wrappers/builtins.h
new file mode 100644
index 000000000000..1f0d0e27ecb3
--- /dev/null
+++ b/clang/lib/Headers/zos_wrappers/builtins.h
@@ -0,0 +1,18 @@
+/*===---- builtins.h - z/Architecture Builtin Functions --------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ZOS_WRAPPERS_BUILTINS_H
+#define __ZOS_WRAPPERS_BUILTINS_H
+#if defined(__MVS__)
+#include_next <builtins.h>
+#if defined(__VEC__)
+#include <vecintrin.h>
+#endif
+#endif /* defined(__MVS__) */
+#endif /* __ZOS_WRAPPERS_BUILTINS_H */
diff --git a/clang/lib/InstallAPI/FileList.cpp b/clang/lib/InstallAPI/FileList.cpp
index 8a01248659b7..65610903840a 100644
--- a/clang/lib/InstallAPI/FileList.cpp
+++ b/clang/lib/InstallAPI/FileList.cpp
@@ -51,6 +51,7 @@ private:
 
 public:
   std::unique_ptr<MemoryBuffer> InputBuffer;
+  clang::FileManager *FM;
   unsigned Version;
   HeaderSeq HeaderList;
 
@@ -124,6 +125,12 @@ Error Implementation::parseHeaders(Array &Headers) {
           HeaderFile{PathStr, *Type, /*IncludeName=*/"", Language});
       continue;
     }
+
+    if (FM)
+      if (!FM->getOptionalFileRef(PathStr))
+        return createFileError(
+            PathStr, make_error_code(std::errc::no_such_file_or_directory));
+
     auto IncludeName = createIncludeHeaderName(PathStr);
     HeaderList.emplace_back(PathStr, *Type,
                             IncludeName.has_value() ? IncludeName.value() : "",
@@ -170,9 +177,10 @@ Error Implementation::parse(StringRef Input) {
 
 llvm::Error
 FileListReader::loadHeaders(std::unique_ptr<MemoryBuffer> InputBuffer,
-                            HeaderSeq &Destination) {
+                            HeaderSeq &Destination, clang::FileManager *FM) {
   Implementation Impl;
   Impl.InputBuffer = std::move(InputBuffer);
+  Impl.FM = FM;
 
   if (llvm::Error Err = Impl.parse(Impl.InputBuffer->getBuffer()))
     return Err;
diff --git a/clang/lib/Lex/Pragma.cpp b/clang/lib/Lex/Pragma.cpp
index 499813f8ab7d..10f0ab7180e6 100644
--- a/clang/lib/Lex/Pragma.cpp
+++ b/clang/lib/Lex/Pragma.cpp
@@ -1444,7 +1444,8 @@ struct PragmaWarningHandler : public PragmaHandler {
                                  .Case("once", PPCallbacks::PWS_Once)
                                  .Case("suppress", PPCallbacks::PWS_Suppress)
                                  .Default(-1);
-          if ((SpecifierValid = SpecifierInt != -1))
+          SpecifierValid = SpecifierInt != -1;
+          if (SpecifierValid)
             Specifier =
                 static_cast<PPCallbacks::PragmaWarningSpecifier>(SpecifierInt);
 
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 05ad5ecbfaa0..4e4b05b21383 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -91,13 +91,23 @@ static StringRef normalizeAttrName(StringRef Name) {
   return Name;
 }
 
-/// isAttributeLateParsed - Return true if the attribute has arguments that
-/// require late parsing.
-static bool isAttributeLateParsed(const IdentifierInfo &II) {
+/// returns true iff attribute is annotated with `LateAttrParseExperimentalExt`
+/// in `Attr.td`.
+static bool IsAttributeLateParsedExperimentalExt(const IdentifierInfo &II) {
+#define CLANG_ATTR_LATE_PARSED_EXPERIMENTAL_EXT_LIST
+  return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+#include "clang/Parse/AttrParserStringSwitches.inc"
+      .Default(false);
+#undef CLANG_ATTR_LATE_PARSED_EXPERIMENTAL_EXT_LIST
+}
+
+/// returns true iff attribute is annotated with `LateAttrParseStandard` in
+/// `Attr.td`.
+static bool IsAttributeLateParsedStandard(const IdentifierInfo &II) {
 #define CLANG_ATTR_LATE_PARSED_LIST
-    return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+  return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
 #include "clang/Parse/AttrParserStringSwitches.inc"
-        .Default(false);
+      .Default(false);
 #undef CLANG_ATTR_LATE_PARSED_LIST
 }
 
@@ -222,8 +232,26 @@ void Parser::ParseGNUAttributes(ParsedAttributes &Attrs,
         continue;
       }
 
+      bool LateParse = false;
+      if (!LateAttrs)
+        LateParse = false;
+      else if (LateAttrs->lateAttrParseExperimentalExtOnly()) {
+        // The caller requested that this attribute **only** be late
+        // parsed for `LateAttrParseExperimentalExt` attributes. This will
+        // only be late parsed if the experimental language option is enabled.
+        LateParse = getLangOpts().ExperimentalLateParseAttributes &&
+                    IsAttributeLateParsedExperimentalExt(*AttrName);
+      } else {
+        // The caller did not restrict late parsing to only
+        // `LateAttrParseExperimentalExt` attributes so late parse
+        // both `LateAttrParseStandard` and `LateAttrParseExperimentalExt`
+        // attributes.
+        LateParse = IsAttributeLateParsedExperimentalExt(*AttrName) ||
+                    IsAttributeLateParsedStandard(*AttrName);
+      }
+
       // Handle "parameterized" attributes
-      if (!LateAttrs || !isAttributeLateParsed(*AttrName)) {
+      if (!LateParse) {
         ParseGNUAttributeArgs(AttrName, AttrNameLoc, Attrs, &EndLoc, nullptr,
                               SourceLocation(), ParsedAttr::Form::GNU(), D);
         continue;
@@ -2998,7 +3026,7 @@ bool Parser::ParseImplicitInt(DeclSpec &DS, CXXScopeSpec *SS,
         << TokenName << TagName << getLangOpts().CPlusPlus
         << FixItHint::CreateInsertion(Tok.getLocation(), FixitTagName);
 
-      if (Actions.LookupParsedName(R, getCurScope(), SS)) {
+      if (Actions.LookupName(R, getCurScope())) {
         for (LookupResult::iterator I = R.begin(), IEnd = R.end();
              I != IEnd; ++I)
           Diag((*I)->getLocation(), diag::note_decl_hiding_tag_type)
@@ -7038,18 +7066,23 @@ void Parser::ParseDirectDeclarator(Declarator &D) {
 void Parser::ParseDecompositionDeclarator(Declarator &D) {
   assert(Tok.is(tok::l_square));
 
+  TentativeParsingAction PA(*this);
+  BalancedDelimiterTracker T(*this, tok::l_square);
+  T.consumeOpen();
+
+  if (isCXX11AttributeSpecifier())
+    DiagnoseAndSkipCXX11Attributes();
+
   // If this doesn't look like a structured binding, maybe it's a misplaced
   // array declarator.
-  // FIXME: Consume the l_square first so we don't need extra lookahead for
-  // this.
-  if (!(NextToken().is(tok::identifier) &&
-        GetLookAheadToken(2).isOneOf(tok::comma, tok::r_square)) &&
-      !(NextToken().is(tok::r_square) &&
-        GetLookAheadToken(2).isOneOf(tok::equal, tok::l_brace)))
+  if (!(Tok.is(tok::identifier) &&
+        NextToken().isOneOf(tok::comma, tok::r_square, tok::kw_alignas,
+                            tok::l_square)) &&
+      !(Tok.is(tok::r_square) &&
+        NextToken().isOneOf(tok::equal, tok::l_brace))) {
+    PA.Revert();
     return ParseMisplacedBracketDeclarator(D);
-
-  BalancedDelimiterTracker T(*this, tok::l_square);
-  T.consumeOpen();
+  }
 
   SmallVector<DecompositionDeclarator::Binding, 32> Bindings;
   while (Tok.isNot(tok::r_square)) {
@@ -7074,13 +7107,27 @@ void Parser::ParseDecompositionDeclarator(Declarator &D) {
       }
     }
 
+    if (isCXX11AttributeSpecifier())
+      DiagnoseAndSkipCXX11Attributes();
+
     if (Tok.isNot(tok::identifier)) {
       Diag(Tok, diag::err_expected) << tok::identifier;
       break;
     }
 
-    Bindings.push_back({Tok.getIdentifierInfo(), Tok.getLocation()});
+    IdentifierInfo *II = Tok.getIdentifierInfo();
+    SourceLocation Loc = Tok.getLocation();
     ConsumeToken();
+
+    ParsedAttributes Attrs(AttrFactory);
+    if (isCXX11AttributeSpecifier()) {
+      Diag(Tok, getLangOpts().CPlusPlus26
+                    ? diag::warn_cxx23_compat_decl_attrs_on_binding
+                    : diag::ext_decl_attrs_on_binding);
+      MaybeParseCXX11Attributes(Attrs);
+    }
+
+    Bindings.push_back({II, Loc, std::move(Attrs)});
   }
 
   if (Tok.isNot(tok::r_square))
@@ -7095,6 +7142,8 @@ void Parser::ParseDecompositionDeclarator(Declarator &D) {
     T.consumeClose();
   }
 
+  PA.Commit();
+
   return D.setDecompositionBindings(T.getOpenLocation(), Bindings,
                                     T.getCloseLocation());
 }
@@ -7367,12 +7416,20 @@ void Parser::ParseFunctionDeclarator(Declarator &D,
       std::optional<Sema::CXXThisScopeRAII> ThisScope;
       InitCXXThisScopeForDeclaratorIfRelevant(D, DS, ThisScope);
 
-      // Parse exception-specification[opt].
-      // FIXME: Per [class.mem]p6, all exception-specifications at class scope
-      // should be delayed, including those for non-members (eg, friend
-      // declarations). But only applying this to member declarations is
-      // consistent with what other implementations do.
-      bool Delayed = D.isFirstDeclarationOfMember() &&
+      // C++ [class.mem.general]p8:
+      //   A complete-class context of a class (template) is a
+      //     - function body,
+      //     - default argument,
+      //     - default template argument,
+      //     - noexcept-specifier, or
+      //     - default member initializer
+      //   within the member-specification of the class or class template.
+      //
+      // Parse exception-specification[opt]. If we are in the
+      // member-specification of a class or class template, this is a
+      // complete-class context and parsing of the noexcept-specifier should be
+      // delayed (even if this is a friend declaration).
+      bool Delayed = D.getContext() == DeclaratorContext::Member &&
                      D.isFunctionDeclaratorAFunctionDeclaration();
       if (Delayed && Actions.isLibstdcxxEagerExceptionSpecHack(D) &&
           GetLookAheadToken(0).is(tok::kw_noexcept) &&
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index 32d96f81c4c8..7d6febb04a82 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -31,6 +31,7 @@
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/SemaCUDA.h"
+#include "clang/Sema/SemaOpenACC.h"
 #include "clang/Sema/SemaOpenMP.h"
 #include "clang/Sema/SemaSYCL.h"
 #include "clang/Sema/TypoCorrection.h"
@@ -2070,15 +2071,22 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
       if (!LHS.isInvalid() && !HasError && !Length.isInvalid() &&
           !Stride.isInvalid() && Tok.is(tok::r_square)) {
         if (ColonLocFirst.isValid() || ColonLocSecond.isValid()) {
-          // FIXME: OpenACC hasn't implemented Sema/Array section handling at a
-          // semantic level yet. For now, just reuse the OpenMP implementation
-          // as it gets the parsing/type management mostly right, and we can
-          // replace this call to ActOnOpenACCArraySectionExpr in the future.
-          // Eventually we'll genericize the OPenMPArraySectionExpr type as
-          // well.
-          LHS = Actions.OpenMP().ActOnOMPArraySectionExpr(
-              LHS.get(), Loc, ArgExprs.empty() ? nullptr : ArgExprs[0],
-              ColonLocFirst, ColonLocSecond, Length.get(), Stride.get(), RLoc);
+          // Like above, AllowOpenACCArraySections is 'more specific' and only
+          // enabled when actively parsing a 'var' in a 'var-list' during
+          // clause/'cache' construct parsing, so it is more specific. So we
+          // should do it first, so that the correct node gets created.
+          if (AllowOpenACCArraySections) {
+            assert(!Stride.isUsable() && !ColonLocSecond.isValid() &&
+                   "Stride/second colon not allowed for OpenACC");
+            LHS = Actions.OpenACC().ActOnArraySectionExpr(
+                LHS.get(), Loc, ArgExprs.empty() ? nullptr : ArgExprs[0],
+                ColonLocFirst, Length.get(), RLoc);
+          } else {
+            LHS = Actions.OpenMP().ActOnOMPArraySectionExpr(
+                LHS.get(), Loc, ArgExprs.empty() ? nullptr : ArgExprs[0],
+                ColonLocFirst, ColonLocSecond, Length.get(), Stride.get(),
+                RLoc);
+          }
         } else {
           LHS = Actions.ActOnArraySubscriptExpr(getCurScope(), LHS.get(), Loc,
                                                 ArgExprs, RLoc);
diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp
index 8a18fca8064e..2d1ec6539b2f 100644
--- a/clang/lib/Parse/ParseOpenACC.cpp
+++ b/clang/lib/Parse/ParseOpenACC.cpp
@@ -86,6 +86,10 @@ OpenACCClauseKind getOpenACCClauseKind(Token Tok) {
   if (Tok.is(tok::kw_if))
     return OpenACCClauseKind::If;
 
+  // 'private' is also a keyword, make sure we pare it correctly.
+  if (Tok.is(tok::kw_private))
+    return OpenACCClauseKind::Private;
+
   if (!Tok.is(tok::identifier))
     return OpenACCClauseKind::Invalid;
 
@@ -327,7 +331,7 @@ OpenACCReductionOperator ParseReductionOperator(Parser &P) {
       return OpenACCReductionOperator::Max;
     if (ReductionKindTok.getIdentifierInfo()->isStr("min"))
       return OpenACCReductionOperator::Min;
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   default:
     P.Diag(ReductionKindTok, diag::err_acc_invalid_reduction_operator);
     return OpenACCReductionOperator::Invalid;
@@ -682,28 +686,6 @@ bool Parser::ParseOpenACCIntExprList(OpenACCDirectiveKind DK,
   return false;
 }
 
-bool Parser::ParseOpenACCClauseVarList(OpenACCClauseKind Kind) {
-  // FIXME: Future clauses will require 'special word' parsing, check for one,
-  // then parse it based on whether it is a clause that requires a 'special
-  // word'.
-  (void)Kind;
-
-  // If the var parsing fails, skip until the end of the directive as this is
-  // an expression and gets messy if we try to continue otherwise.
-  if (ParseOpenACCVar())
-    return true;
-
-  while (!getCurToken().isOneOf(tok::r_paren, tok::annot_pragma_openacc_end)) {
-    ExpectAndConsume(tok::comma);
-
-    // If the var parsing fails, skip until the end of the directive as this is
-    // an expression and gets messy if we try to continue otherwise.
-    if (ParseOpenACCVar())
-      return true;
-  }
-  return false;
-}
-
 /// OpenACC 3.3 Section 2.4:
 /// The argument to the device_type clause is a comma-separated list of one or
 /// more device architecture name identifiers, or an asterisk.
@@ -917,35 +899,26 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
     case OpenACCClauseKind::CopyIn:
       tryParseAndConsumeSpecialTokenKind(
           *this, OpenACCSpecialTokenKind::ReadOnly, ClauseKind);
-      if (ParseOpenACCClauseVarList(ClauseKind)) {
-        Parens.skipToEnd();
-        return OpenACCCanContinue();
-      }
+      ParseOpenACCVarList();
       break;
     case OpenACCClauseKind::Create:
     case OpenACCClauseKind::CopyOut:
       tryParseAndConsumeSpecialTokenKind(*this, OpenACCSpecialTokenKind::Zero,
                                          ClauseKind);
-      if (ParseOpenACCClauseVarList(ClauseKind)) {
-        Parens.skipToEnd();
-        return OpenACCCanContinue();
-      }
+      ParseOpenACCVarList();
       break;
     case OpenACCClauseKind::Reduction:
       // If we're missing a clause-kind (or it is invalid), see if we can parse
       // the var-list anyway.
       ParseReductionOperator(*this);
-      if (ParseOpenACCClauseVarList(ClauseKind)) {
-        Parens.skipToEnd();
-        return OpenACCCanContinue();
-      }
+      ParseOpenACCVarList();
       break;
     case OpenACCClauseKind::Self:
       // The 'self' clause is a var-list instead of a 'condition' in the case of
       // the 'update' clause, so we have to handle it here.  U se an assert to
       // make sure we get the right differentiator.
       assert(DirKind == OpenACCDirectiveKind::Update);
-      LLVM_FALLTHROUGH;
+      [[fallthrough]];
     case OpenACCClauseKind::Attach:
     case OpenACCClauseKind::Copy:
     case OpenACCClauseKind::Delete:
@@ -958,12 +931,11 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
     case OpenACCClauseKind::Link:
     case OpenACCClauseKind::NoCreate:
     case OpenACCClauseKind::Present:
-    case OpenACCClauseKind::Private:
     case OpenACCClauseKind::UseDevice:
-      if (ParseOpenACCClauseVarList(ClauseKind)) {
-        Parens.skipToEnd();
-        return OpenACCCanContinue();
-      }
+      ParseOpenACCVarList();
+      break;
+    case OpenACCClauseKind::Private:
+      ParsedClause.setVarListDetails(ParseOpenACCVarList());
       break;
     case OpenACCClauseKind::Collapse: {
       tryParseAndConsumeSpecialTokenKind(*this, OpenACCSpecialTokenKind::Force,
@@ -1227,16 +1199,51 @@ ExprResult Parser::ParseOpenACCBindClauseArgument() {
 
 /// OpenACC 3.3, section 1.6:
 /// In this spec, a 'var' (in italics) is one of the following:
-/// - a variable name (a scalar, array, or compisite variable name)
+/// - a variable name (a scalar, array, or composite variable name)
 /// - a subarray specification with subscript ranges
 /// - an array element
 /// - a member of a composite variable
 /// - a common block name between slashes (fortran only)
-bool Parser::ParseOpenACCVar() {
+Parser::OpenACCVarParseResult Parser::ParseOpenACCVar() {
   OpenACCArraySectionRAII ArraySections(*this);
-  ExprResult Res =
-      getActions().CorrectDelayedTyposInExpr(ParseAssignmentExpression());
-  return Res.isInvalid();
+
+  ExprResult Res = ParseAssignmentExpression();
+  if (!Res.isUsable())
+    return {Res, OpenACCParseCanContinue::Cannot};
+
+  Res = getActions().CorrectDelayedTyposInExpr(Res.get());
+  if (!Res.isUsable())
+    return {Res, OpenACCParseCanContinue::Can};
+
+  Res = getActions().OpenACC().ActOnVar(Res.get());
+
+  return {Res, OpenACCParseCanContinue::Can};
+}
+
+llvm::SmallVector<Expr *> Parser::ParseOpenACCVarList() {
+  llvm::SmallVector<Expr *> Vars;
+
+  auto [Res, CanContinue] = ParseOpenACCVar();
+  if (Res.isUsable()) {
+    Vars.push_back(Res.get());
+  } else if (CanContinue == OpenACCParseCanContinue::Cannot) {
+    SkipUntil(tok::r_paren, tok::annot_pragma_openacc_end, StopBeforeMatch);
+    return Vars;
+  }
+
+  while (!getCurToken().isOneOf(tok::r_paren, tok::annot_pragma_openacc_end)) {
+    ExpectAndConsume(tok::comma);
+
+    auto [Res, CanContinue] = ParseOpenACCVar();
+
+    if (Res.isUsable()) {
+      Vars.push_back(Res.get());
+    } else if (CanContinue == OpenACCParseCanContinue::Cannot) {
+      SkipUntil(tok::r_paren, tok::annot_pragma_openacc_end, StopBeforeMatch);
+      return Vars;
+    }
+  }
+  return Vars;
 }
 
 /// OpenACC 3.3, section 2.10:
@@ -1259,24 +1266,9 @@ void Parser::ParseOpenACCCacheVarList() {
     // Sema/AST generation.
   }
 
-  bool FirstArray = true;
-  while (!getCurToken().isOneOf(tok::r_paren, tok::annot_pragma_openacc_end)) {
-    if (!FirstArray)
-      ExpectAndConsume(tok::comma);
-    FirstArray = false;
-
-    // OpenACC 3.3, section 2.10:
-    // A 'var' in a cache directive must be a single array element or a simple
-    // subarray.  In C and C++, a simple subarray is an array name followed by
-    // an extended array range specification in brackets, with a start and
-    // length such as:
-    //
-    // arr[lower:length]
-    //
-    if (ParseOpenACCVar())
-      SkipUntil(tok::r_paren, tok::annot_pragma_openacc_end, tok::comma,
-                StopBeforeMatch);
-  }
+  // ParseOpenACCVarList should leave us before a r-paren, so no need to skip
+  // anything here.
+  ParseOpenACCVarList();
 }
 
 Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() {
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 480201bc06f6..53d89ce2fa3e 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -4228,13 +4228,20 @@ bool Parser::parseMapperModifier(SemaOpenMP::OpenMPVarListDataTy &Data) {
   return T.consumeClose();
 }
 
+static OpenMPMapClauseKind isMapType(Parser &P);
+
 /// Parse map-type-modifiers in map clause.
-/// map([ [map-type-modifier[,] [map-type-modifier[,] ...] map-type : ] list)
+/// map([ [map-type-modifier[,] [map-type-modifier[,] ...] [map-type] : ] list)
 /// where, map-type-modifier ::= always | close | mapper(mapper-identifier) |
 /// present
+/// where, map-type ::= alloc | delete | from | release | to | tofrom
 bool Parser::parseMapTypeModifiers(SemaOpenMP::OpenMPVarListDataTy &Data) {
+  bool HasMapType = false;
+  SourceLocation PreMapLoc = Tok.getLocation();
+  StringRef PreMapName = "";
   while (getCurToken().isNot(tok::colon)) {
     OpenMPMapModifierKind TypeModifier = isMapModifier(*this);
+    OpenMPMapClauseKind MapKind = isMapType(*this);
     if (TypeModifier == OMPC_MAP_MODIFIER_always ||
         TypeModifier == OMPC_MAP_MODIFIER_close ||
         TypeModifier == OMPC_MAP_MODIFIER_present ||
@@ -4257,6 +4264,19 @@ bool Parser::parseMapTypeModifiers(SemaOpenMP::OpenMPVarListDataTy &Data) {
         Diag(Data.MapTypeModifiersLoc.back(), diag::err_omp_missing_comma)
             << "map type modifier";
 
+    } else if (getLangOpts().OpenMP >= 60 && MapKind != OMPC_MAP_unknown) {
+      if (!HasMapType) {
+        HasMapType = true;
+        Data.ExtraModifier = MapKind;
+        MapKind = OMPC_MAP_unknown;
+        PreMapLoc = Tok.getLocation();
+        PreMapName = Tok.getIdentifierInfo()->getName();
+      } else {
+        Diag(Tok, diag::err_omp_more_one_map_type);
+        Diag(PreMapLoc, diag::note_previous_map_type_specified_here)
+            << PreMapName;
+      }
+      ConsumeToken();
     } else {
       // For the case of unknown map-type-modifier or a map-type.
       // Map-type is followed by a colon; the function returns when it
@@ -4267,8 +4287,14 @@ bool Parser::parseMapTypeModifiers(SemaOpenMP::OpenMPVarListDataTy &Data) {
         continue;
       }
       // Potential map-type token as it is followed by a colon.
-      if (PP.LookAhead(0).is(tok::colon))
-        return false;
+      if (PP.LookAhead(0).is(tok::colon)) {
+        if (getLangOpts().OpenMP >= 60) {
+          break;
+        } else {
+          return false;
+        }
+      }
+
       Diag(Tok, diag::err_omp_unknown_map_type_modifier)
           << (getLangOpts().OpenMP >= 51 ? (getLangOpts().OpenMP >= 52 ? 2 : 1)
                                          : 0)
@@ -4278,6 +4304,14 @@ bool Parser::parseMapTypeModifiers(SemaOpenMP::OpenMPVarListDataTy &Data) {
     if (getCurToken().is(tok::comma))
       ConsumeToken();
   }
+  if (getLangOpts().OpenMP >= 60 && !HasMapType) {
+    if (!Tok.is(tok::colon)) {
+      Diag(Tok, diag::err_omp_unknown_map_type);
+      ConsumeToken();
+    } else {
+      Data.ExtraModifier = OMPC_MAP_unknown;
+    }
+  }
   return false;
 }
 
@@ -4675,8 +4709,10 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
     // Only parse map-type-modifier[s] and map-type if a colon is present in
     // the map clause.
     if (ColonPresent) {
+      if (getLangOpts().OpenMP >= 60 && getCurToken().is(tok::colon))
+        Diag(Tok, diag::err_omp_map_modifier_specification_list);
       IsInvalidMapperModifier = parseMapTypeModifiers(Data);
-      if (!IsInvalidMapperModifier)
+      if (getLangOpts().OpenMP < 60 && !IsInvalidMapperModifier)
         parseMapType(*this, Data);
       else
         SkipUntil(tok::colon, tok::annot_pragma_openmp_end, StopBeforeMatch);
diff --git a/clang/lib/Sema/DeclSpec.cpp b/clang/lib/Sema/DeclSpec.cpp
index b79683bb32a6..60e818902570 100644
--- a/clang/lib/Sema/DeclSpec.cpp
+++ b/clang/lib/Sema/DeclSpec.cpp
@@ -293,7 +293,7 @@ DeclaratorChunk DeclaratorChunk::getFunction(bool hasProto,
 
 void Declarator::setDecompositionBindings(
     SourceLocation LSquareLoc,
-    ArrayRef<DecompositionDeclarator::Binding> Bindings,
+    MutableArrayRef<DecompositionDeclarator::Binding> Bindings,
     SourceLocation RSquareLoc) {
   assert(!hasName() && "declarator given multiple names!");
 
@@ -317,7 +317,7 @@ void Declarator::setDecompositionBindings(
           new DecompositionDeclarator::Binding[Bindings.size()];
       BindingGroup.DeleteBindings = true;
     }
-    std::uninitialized_copy(Bindings.begin(), Bindings.end(),
+    std::uninitialized_move(Bindings.begin(), Bindings.end(),
                             BindingGroup.Bindings);
   }
 }
@@ -1202,7 +1202,10 @@ void DeclSpec::Finish(Sema &S, const PrintingPolicy &Policy) {
         !S.Context.getTargetInfo().hasFeature("power8-vector"))
       S.Diag(TSTLoc, diag::err_invalid_vector_int128_decl_spec);
 
-    if (TypeAltiVecBool) {
+    // Complex vector types are not supported.
+    if (TypeSpecComplex != TSC_unspecified)
+      S.Diag(TSCLoc, diag::err_invalid_vector_complex_decl_spec);
+    else if (TypeAltiVecBool) {
       // Sign specifiers are not allowed with vector bool. (PIM 2.1)
       if (getTypeSpecSign() != TypeSpecifierSign::Unspecified) {
         S.Diag(TSSLoc, diag::err_invalid_vector_bool_decl_spec)
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index 1a1febf7a352..bb283c54b3d2 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -126,12 +126,15 @@ struct BuiltinTypeDeclBuilder {
 
   static DeclRefExpr *lookupBuiltinFunction(ASTContext &AST, Sema &S,
                                             StringRef Name) {
-    CXXScopeSpec SS;
     IdentifierInfo &II = AST.Idents.get(Name, tok::TokenKind::identifier);
     DeclarationNameInfo NameInfo =
         DeclarationNameInfo(DeclarationName(&II), SourceLocation());
     LookupResult R(S, NameInfo, Sema::LookupOrdinaryName);
-    S.LookupParsedName(R, S.getCurScope(), &SS, false);
+    // AllowBuiltinCreation is false but LookupDirect will create
+    // the builtin when searching the global scope anyways...
+    S.LookupName(R, S.getCurScope());
+    // FIXME: If the builtin function was user-declared in global scope,
+    // this assert *will* fail. Should this call LookupBuiltin instead?
     assert(R.isSingleResult() &&
            "Since this is a builtin it should always resolve!");
     auto *VD = cast<ValueDecl>(R.getFoundDecl());
diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp
index ce6211c23218..8af36d5c24e3 100644
--- a/clang/lib/Sema/JumpDiagnostics.cpp
+++ b/clang/lib/Sema/JumpDiagnostics.cpp
@@ -180,7 +180,8 @@ static ScopePair GetDiagForGotoScopeDecl(Sema &S, const Decl *D) {
     }
 
     const Expr *Init = VD->getInit();
-    if (S.Context.getLangOpts().CPlusPlus && VD->hasLocalStorage() && Init) {
+    if (S.Context.getLangOpts().CPlusPlus && VD->hasLocalStorage() && Init &&
+        !Init->containsErrors()) {
       // C++11 [stmt.dcl]p3:
       //   A program that jumps from a point where a variable with automatic
       //   storage duration is not in scope to a point where it is in scope
diff --git a/clang/lib/Sema/MultiplexExternalSemaSource.cpp b/clang/lib/Sema/MultiplexExternalSemaSource.cpp
index 6a5f9f6680e6..79e656eb4b7e 100644
--- a/clang/lib/Sema/MultiplexExternalSemaSource.cpp
+++ b/clang/lib/Sema/MultiplexExternalSemaSource.cpp
@@ -46,7 +46,7 @@ void MultiplexExternalSemaSource::AddSource(ExternalSemaSource *Source) {
 // ExternalASTSource.
 //===----------------------------------------------------------------------===//
 
-Decl *MultiplexExternalSemaSource::GetExternalDecl(Decl::DeclID ID) {
+Decl *MultiplexExternalSemaSource::GetExternalDecl(GlobalDeclID ID) {
   for(size_t i = 0; i < Sources.size(); ++i)
     if (Decl *Result = Sources[i]->GetExternalDecl(ID))
       return Result;
diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index 4c445f28bba8..c5998aca0d72 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -594,6 +594,11 @@ static void ProcessAPINotes(Sema &S, TagDecl *D, const api_notes::TagInfo &Info,
     D->addAttr(
         SwiftAttrAttr::Create(S.Context, "release:" + ReleaseOp.value()));
 
+  if (auto Copyable = Info.isSwiftCopyable()) {
+    if (!*Copyable)
+      D->addAttr(SwiftAttrAttr::Create(S.Context, "~Copyable"));
+  }
+
   if (auto Extensibility = Info.EnumExtensibility) {
     using api_notes::EnumExtensibilityKind;
     bool ShouldAddAttribute = (*Extensibility != EnumExtensibilityKind::None);
diff --git a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp
index a5dd158808f2..a83b1e8afadb 100644
--- a/clang/lib/Sema/SemaAttr.cpp
+++ b/clang/lib/Sema/SemaAttr.cpp
@@ -837,7 +837,7 @@ void Sema::ActOnPragmaUnused(const Token &IdTok, Scope *curScope,
 
   IdentifierInfo *Name = IdTok.getIdentifierInfo();
   LookupResult Lookup(*this, Name, IdTok.getLocation(), LookupOrdinaryName);
-  LookupParsedName(Lookup, curScope, nullptr, true);
+  LookupName(Lookup, curScope, /*AllowBuiltinCreation=*/true);
 
   if (Lookup.empty()) {
     Diag(PragmaLoc, diag::warn_pragma_unused_undeclared_var)
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 67132701b41c..cf8840c63024 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3164,13 +3164,20 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
 
     const Expr *Arg = TheCall->getArg(0);
     const auto *TyA = Arg->getType()->getAs<VectorType>();
-    if (!TyA) {
+
+    QualType ElTy;
+    if (TyA)
+      ElTy = TyA->getElementType();
+    else if (Arg->getType()->isSizelessVectorType())
+      ElTy = Arg->getType()->getSizelessVectorEltType(Context);
+
+    if (ElTy.isNull()) {
       Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
           << 1 << /* vector ty*/ 4 << Arg->getType();
       return ExprError();
     }
 
-    TheCall->setType(TyA->getElementType());
+    TheCall->setType(ElTy);
     break;
   }
 
@@ -3186,12 +3193,20 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
 
     const Expr *Arg = TheCall->getArg(0);
     const auto *TyA = Arg->getType()->getAs<VectorType>();
-    if (!TyA || !TyA->getElementType()->isIntegerType()) {
+
+    QualType ElTy;
+    if (TyA)
+      ElTy = TyA->getElementType();
+    else if (Arg->getType()->isSizelessVectorType())
+      ElTy = Arg->getType()->getSizelessVectorEltType(Context);
+
+    if (ElTy.isNull() || !ElTy->isIntegerType()) {
       Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
           << 1  << /* vector of integers */ 6 << Arg->getType();
       return ExprError();
     }
-    TheCall->setType(TyA->getElementType());
+
+    TheCall->setType(ElTy);
     break;
   }
 
@@ -12544,6 +12559,17 @@ CheckPrintfHandler::checkFormatExpr(const analyze_printf::PrintfSpecifier &FS,
     return true;
   }
 
+  // Diagnose attempts to use '%P' with ObjC object types, which will result in
+  // dumping raw class data (like is-a pointer), not actual data.
+  if (FS.getConversionSpecifier().getKind() == ConversionSpecifier::PArg &&
+      ExprTy->isObjCObjectPointerType()) {
+    const CharSourceRange &CSR =
+        getSpecifierRange(StartSpecifier, SpecifierLen);
+    EmitFormatDiagnostic(S.PDiag(diag::warn_format_P_with_objc_pointer),
+                         E->getExprLoc(), false, CSR);
+    return true;
+  }
+
   ArgType::MatchKind ImplicitMatch = ArgType::NoMatch;
   ArgType::MatchKind Match = AT.matchesType(S.Context, ExprTy);
   ArgType::MatchKind OrigMatch = Match;
@@ -18724,8 +18750,10 @@ void Sema::CheckArrayAccess(const Expr *expr) {
         expr = cast<MemberExpr>(expr)->getBase();
         break;
       }
-      case Stmt::OMPArraySectionExprClass: {
-        const OMPArraySectionExpr *ASE = cast<OMPArraySectionExpr>(expr);
+      case Stmt::ArraySectionExprClass: {
+        const ArraySectionExpr *ASE = cast<ArraySectionExpr>(expr);
+        // FIXME: We should probably be checking all of the elements to the
+        // 'length' here as well.
         if (ASE->getLowerBound())
           CheckArrayAccess(ASE->getBase(), ASE->getLowerBound(),
                            /*ASE=*/nullptr, AllowOnePastEnd > 0);
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 378615497b13..79fb6c0417e3 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -832,7 +832,7 @@ static bool isTagTypeWithMissingTag(Sema &SemaRef, LookupResult &Result,
                                     IdentifierInfo *&Name,
                                     SourceLocation NameLoc) {
   LookupResult R(SemaRef, Name, NameLoc, Sema::LookupTagName);
-  SemaRef.LookupParsedName(R, S, &SS);
+  SemaRef.LookupParsedName(R, S, &SS, /*ObjectType=*/QualType());
   if (TagDecl *Tag = R.getAsSingle<TagDecl>()) {
     StringRef FixItTagName;
     switch (Tag->getTagKind()) {
@@ -869,7 +869,7 @@ static bool isTagTypeWithMissingTag(Sema &SemaRef, LookupResult &Result,
 
     // Replace lookup results with just the tag decl.
     Result.clear(Sema::LookupTagName);
-    SemaRef.LookupParsedName(Result, S, &SS);
+    SemaRef.LookupParsedName(Result, S, &SS, /*ObjectType=*/QualType());
     return true;
   }
 
@@ -896,7 +896,8 @@ Sema::NameClassification Sema::ClassifyName(Scope *S, CXXScopeSpec &SS,
   }
 
   LookupResult Result(*this, Name, NameLoc, LookupOrdinaryName);
-  LookupParsedName(Result, S, &SS, !CurMethod);
+  LookupParsedName(Result, S, &SS, /*ObjectType=*/QualType(),
+                   /*AllowBuiltinCreation=*/!CurMethod);
 
   if (SS.isInvalid())
     return NameClassification::Error();
@@ -1974,7 +1975,7 @@ static bool ShouldDiagnoseUnusedDecl(const LangOptions &LangOpts,
     // it is, by the bindings' expressions).
     bool IsAllPlaceholders = true;
     for (const auto *BD : DD->bindings()) {
-      if (BD->isReferenced())
+      if (BD->isReferenced() || BD->hasAttr<UnusedAttr>())
         return false;
       IsAllPlaceholders = IsAllPlaceholders && BD->isPlaceholderVar(LangOpts);
     }
@@ -13502,16 +13503,18 @@ void Sema::checkNonTrivialCUnion(QualType QT, SourceLocation Loc,
 void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
   // If there is no declaration, there was an error parsing it.  Just ignore
   // the initializer.
-  if (!RealDecl || RealDecl->isInvalidDecl()) {
+  if (!RealDecl) {
     CorrectDelayedTyposInExpr(Init, dyn_cast_or_null<VarDecl>(RealDecl));
     return;
   }
 
-  if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(RealDecl)) {
-    // Pure-specifiers are handled in ActOnPureSpecifier.
-    Diag(Method->getLocation(), diag::err_member_function_initialization)
-      << Method->getDeclName() << Init->getSourceRange();
-    Method->setInvalidDecl();
+  if (auto *Method = dyn_cast<CXXMethodDecl>(RealDecl)) {
+    if (!Method->isInvalidDecl()) {
+      // Pure-specifiers are handled in ActOnPureSpecifier.
+      Diag(Method->getLocation(), diag::err_member_function_initialization)
+          << Method->getDeclName() << Init->getSourceRange();
+      Method->setInvalidDecl();
+    }
     return;
   }
 
@@ -13523,6 +13526,15 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
     return;
   }
 
+  if (VDecl->isInvalidDecl()) {
+    CorrectDelayedTyposInExpr(Init, VDecl);
+    ExprResult Recovery =
+        CreateRecoveryExpr(Init->getBeginLoc(), Init->getEndLoc(), {Init});
+    if (Expr *E = Recovery.get())
+      VDecl->setInit(E);
+    return;
+  }
+
   // WebAssembly tables can't be used to initialise a variable.
   if (Init && !Init->getType().isNull() &&
       Init->getType()->isWebAssemblyTableType()) {
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index abdbc9d8830c..157d42c09cfc 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -910,6 +910,8 @@ Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D,
 
     auto *BD = BindingDecl::Create(Context, DC, B.NameLoc, VarName);
 
+    ProcessDeclAttributeList(S, BD, *B.Attrs);
+
     // Find the shadowed declaration before filtering for scope.
     NamedDecl *ShadowedDecl = D.getCXXScopeSpec().isEmpty()
                                   ? getShadowedDeclaration(BD, Previous)
@@ -4517,7 +4519,7 @@ Sema::BuildMemInitializer(Decl *ConstructorD,
                               DS.getBeginLoc(), DS.getEllipsisLoc());
   } else {
     LookupResult R(*this, MemberOrBase, IdLoc, LookupOrdinaryName);
-    LookupParsedName(R, S, &SS);
+    LookupParsedName(R, S, &SS, /*ObjectType=*/QualType());
 
     TypeDecl *TyD = R.getAsSingle<TypeDecl>();
     if (!TyD) {
@@ -12052,11 +12054,17 @@ bool Sema::isStdInitializerList(QualType Ty, QualType *Element) {
 
     Template = Specialization->getSpecializedTemplate();
     Arguments = Specialization->getTemplateArgs().data();
-  } else if (const TemplateSpecializationType *TST =
-                 Ty->getAs<TemplateSpecializationType>()) {
-    Template = dyn_cast_or_null<ClassTemplateDecl>(
-        TST->getTemplateName().getAsTemplateDecl());
-    Arguments = TST->template_arguments().begin();
+  } else {
+    const TemplateSpecializationType *TST = nullptr;
+    if (auto *ICN = Ty->getAs<InjectedClassNameType>())
+      TST = ICN->getInjectedTST();
+    else
+      TST = Ty->getAs<TemplateSpecializationType>();
+    if (TST) {
+      Template = dyn_cast_or_null<ClassTemplateDecl>(
+          TST->getTemplateName().getAsTemplateDecl());
+      Arguments = TST->template_arguments().begin();
+    }
   }
   if (!Template)
     return false;
@@ -12262,7 +12270,7 @@ Decl *Sema::ActOnUsingDirective(Scope *S, SourceLocation UsingLoc,
 
   // Lookup namespace name.
   LookupResult R(*this, NamespcName, IdentLoc, LookupNamespaceName);
-  LookupParsedName(R, S, &SS);
+  LookupParsedName(R, S, &SS, /*ObjectType=*/QualType());
   if (R.isAmbiguous())
     return nullptr;
 
@@ -13721,7 +13729,7 @@ Decl *Sema::ActOnNamespaceAliasDef(Scope *S, SourceLocation NamespaceLoc,
 
   // Lookup the namespace name.
   LookupResult R(*this, Ident, IdentLoc, LookupNamespaceName);
-  LookupParsedName(R, S, &SS);
+  LookupParsedName(R, S, &SS, /*ObjectType=*/QualType());
 
   if (R.isAmbiguous())
     return nullptr;
@@ -19164,40 +19172,40 @@ void Sema::checkExceptionSpecification(
   }
 }
 
-void Sema::actOnDelayedExceptionSpecification(Decl *MethodD,
-             ExceptionSpecificationType EST,
-             SourceRange SpecificationRange,
-             ArrayRef<ParsedType> DynamicExceptions,
-             ArrayRef<SourceRange> DynamicExceptionRanges,
-             Expr *NoexceptExpr) {
-  if (!MethodD)
+void Sema::actOnDelayedExceptionSpecification(
+    Decl *D, ExceptionSpecificationType EST, SourceRange SpecificationRange,
+    ArrayRef<ParsedType> DynamicExceptions,
+    ArrayRef<SourceRange> DynamicExceptionRanges, Expr *NoexceptExpr) {
+  if (!D)
     return;
 
-  // Dig out the method we're referring to.
-  if (FunctionTemplateDecl *FunTmpl = dyn_cast<FunctionTemplateDecl>(MethodD))
-    MethodD = FunTmpl->getTemplatedDecl();
+  // Dig out the function we're referring to.
+  if (FunctionTemplateDecl *FTD = dyn_cast<FunctionTemplateDecl>(D))
+    D = FTD->getTemplatedDecl();
 
-  CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(MethodD);
-  if (!Method)
+  FunctionDecl *FD = dyn_cast<FunctionDecl>(D);
+  if (!FD)
     return;
 
   // Check the exception specification.
   llvm::SmallVector<QualType, 4> Exceptions;
   FunctionProtoType::ExceptionSpecInfo ESI;
-  checkExceptionSpecification(/*IsTopLevel*/true, EST, DynamicExceptions,
+  checkExceptionSpecification(/*IsTopLevel=*/true, EST, DynamicExceptions,
                               DynamicExceptionRanges, NoexceptExpr, Exceptions,
                               ESI);
 
   // Update the exception specification on the function type.
-  Context.adjustExceptionSpec(Method, ESI, /*AsWritten*/true);
+  Context.adjustExceptionSpec(FD, ESI, /*AsWritten=*/true);
 
-  if (Method->isStatic())
-    checkThisInStaticMemberFunctionExceptionSpec(Method);
+  if (CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(D)) {
+    if (MD->isStatic())
+      checkThisInStaticMemberFunctionExceptionSpec(MD);
 
-  if (Method->isVirtual()) {
-    // Check overrides, which we previously had to delay.
-    for (const CXXMethodDecl *O : Method->overridden_methods())
-      CheckOverridingFunctionExceptionSpec(Method, O);
+    if (MD->isVirtual()) {
+      // Check overrides, which we previously had to delay.
+      for (const CXXMethodDecl *O : MD->overridden_methods())
+        CheckOverridingFunctionExceptionSpec(MD, O);
+    }
   }
 }
 
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
index 00384f9dc16a..41bf273d12f2 100644
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -258,13 +258,14 @@ Sema::UpdateExceptionSpec(FunctionDecl *FD,
 }
 
 static bool exceptionSpecNotKnownYet(const FunctionDecl *FD) {
-  auto *MD = dyn_cast<CXXMethodDecl>(FD);
-  if (!MD)
+  ExceptionSpecificationType EST =
+      FD->getType()->castAs<FunctionProtoType>()->getExceptionSpecType();
+  if (EST == EST_Unparsed)
+    return true;
+  else if (EST != EST_Unevaluated)
     return false;
-
-  auto EST = MD->getType()->castAs<FunctionProtoType>()->getExceptionSpecType();
-  return EST == EST_Unparsed ||
-         (EST == EST_Unevaluated && MD->getParent()->isBeingDefined());
+  const DeclContext *DC = FD->getLexicalDeclContext();
+  return DC->isRecord() && cast<RecordDecl>(DC)->isBeingDefined();
 }
 
 static bool CheckEquivalentExceptionSpecImpl(
@@ -1314,7 +1315,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) {
     // Some might be dependent for other reasons.
   case Expr::ArraySubscriptExprClass:
   case Expr::MatrixSubscriptExprClass:
-  case Expr::OMPArraySectionExprClass:
+  case Expr::ArraySectionExprClass:
   case Expr::OMPArrayShapingExprClass:
   case Expr::OMPIteratorExprClass:
   case Expr::BinaryOperatorClass:
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 5c861467bc10..0c37f43f7540 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -673,8 +673,9 @@ ExprResult Sema::DefaultLvalueConversion(Expr *E) {
   // expressions of certain types in C++.
   if (getLangOpts().CPlusPlus &&
       (E->getType() == Context.OverloadTy ||
-       T->isDependentType() ||
-       T->isRecordType()))
+       // FIXME: This is a hack! We want the lvalue-to-rvalue conversion applied
+       // to pointer types even if the pointee type is dependent.
+       (T->isDependentType() && !T->isPointerType()) || T->isRecordType()))
     return E;
 
   // The C standard is actually really unclear on this point, and
@@ -2751,8 +2752,8 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
   if (isBoundsAttrContext() && !getLangOpts().CPlusPlus && S->isClassScope()) {
     // See if this is reference to a field of struct.
     LookupResult R(*this, NameInfo, LookupMemberName);
-    // LookupParsedName handles a name lookup from within anonymous struct.
-    if (LookupParsedName(R, S, &SS)) {
+    // LookupName handles a name lookup from within anonymous struct.
+    if (LookupName(R, S)) {
       if (auto *VD = dyn_cast<ValueDecl>(R.getFoundDecl())) {
         QualType type = VD->getType().getNonReferenceType();
         // This will eventually be translated into MemberExpr upon
@@ -2773,20 +2774,19 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
     // lookup to determine that it was a template name in the first place. If
     // this becomes a performance hit, we can work harder to preserve those
     // results until we get here but it's likely not worth it.
-    bool MemberOfUnknownSpecialization;
     AssumedTemplateKind AssumedTemplate;
-    if (LookupTemplateName(R, S, SS, QualType(), /*EnteringContext=*/false,
-                           MemberOfUnknownSpecialization, TemplateKWLoc,
+    if (LookupTemplateName(R, S, SS, /*ObjectType=*/QualType(),
+                           /*EnteringContext=*/false, TemplateKWLoc,
                            &AssumedTemplate))
       return ExprError();
 
-    if (MemberOfUnknownSpecialization ||
-        (R.getResultKind() == LookupResult::NotFoundInCurrentInstantiation))
+    if (R.wasNotFoundInCurrentInstantiation())
       return ActOnDependentIdExpression(SS, TemplateKWLoc, NameInfo,
                                         IsAddressOfOperand, TemplateArgs);
   } else {
     bool IvarLookupFollowUp = II && !SS.isSet() && getCurMethodDecl();
-    LookupParsedName(R, S, &SS, !IvarLookupFollowUp);
+    LookupParsedName(R, S, &SS, /*ObjectType=*/QualType(),
+                     /*AllowBuiltinCreation=*/!IvarLookupFollowUp);
 
     // If the result might be in a dependent base class, this is a dependent
     // id-expression.
@@ -5069,11 +5069,18 @@ ExprResult Sema::ActOnArraySubscriptExpr(Scope *S, Expr *base,
                                          SourceLocation rbLoc) {
 
   if (base && !base->getType().isNull() &&
-      base->hasPlaceholderType(BuiltinType::OMPArraySection))
-    return OpenMP().ActOnOMPArraySectionExpr(base, lbLoc, ArgExprs.front(),
-                                             SourceLocation(), SourceLocation(),
-                                             /*Length*/ nullptr,
-                                             /*Stride=*/nullptr, rbLoc);
+      base->hasPlaceholderType(BuiltinType::ArraySection)) {
+    auto *AS = cast<ArraySectionExpr>(base);
+    if (AS->isOMPArraySection())
+      return OpenMP().ActOnOMPArraySectionExpr(
+          base, lbLoc, ArgExprs.front(), SourceLocation(), SourceLocation(),
+          /*Length*/ nullptr,
+          /*Stride=*/nullptr, rbLoc);
+
+    return OpenACC().ActOnArraySectionExpr(base, lbLoc, ArgExprs.front(),
+                                           SourceLocation(), /*Length*/ nullptr,
+                                           rbLoc);
+  }
 
   // Since this might be a postfix expression, get rid of ParenListExprs.
   if (isa<ParenListExpr>(base)) {
@@ -6361,7 +6368,7 @@ static bool isPlaceholderToRemoveAsArg(QualType type) {
   case BuiltinType::BoundMember:
   case BuiltinType::BuiltinFn:
   case BuiltinType::IncompleteMatrixIdx:
-  case BuiltinType::OMPArraySection:
+  case BuiltinType::ArraySection:
   case BuiltinType::OMPArrayShaping:
   case BuiltinType::OMPIterator:
     return true;
@@ -21343,8 +21350,9 @@ ExprResult Sema::CheckPlaceholderExpr(Expr *E) {
     return ExprError();
 
   // Expressions of unknown type.
-  case BuiltinType::OMPArraySection:
-    Diag(E->getBeginLoc(), diag::err_omp_array_section_use);
+  case BuiltinType::ArraySection:
+    Diag(E->getBeginLoc(), diag::err_array_section_use)
+        << cast<ArraySectionExpr>(E)->isOMPArraySection();
     return ExprError();
 
   // Expressions of unknown type.
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 779a41620033..c1cb03e4ec7a 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -9157,7 +9157,7 @@ Sema::CheckMicrosoftIfExistsSymbol(Scope *S,
   // Do the redeclaration lookup in the current scope.
   LookupResult R(*this, TargetNameInfo, Sema::LookupAnyName,
                  RedeclarationKind::NotForRedeclaration);
-  LookupParsedName(R, S, &SS);
+  LookupParsedName(R, S, &SS, /*ObjectType=*/QualType());
   R.suppressDiagnostics();
 
   switch (R.getResultKind()) {
diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp
index 6e30716b9ae4..5facb14a18b7 100644
--- a/clang/lib/Sema/SemaExprMember.cpp
+++ b/clang/lib/Sema/SemaExprMember.cpp
@@ -667,8 +667,8 @@ namespace {
 // classes, one of its base classes.
 class RecordMemberExprValidatorCCC final : public CorrectionCandidateCallback {
 public:
-  explicit RecordMemberExprValidatorCCC(const RecordType *RTy)
-      : Record(RTy->getDecl()) {
+  explicit RecordMemberExprValidatorCCC(QualType RTy)
+      : Record(RTy->getAsRecordDecl()) {
     // Don't add bare keywords to the consumer since they will always fail
     // validation by virtue of not being associated with any decls.
     WantTypeSpecifiers = false;
@@ -713,58 +713,36 @@ private:
 }
 
 static bool LookupMemberExprInRecord(Sema &SemaRef, LookupResult &R,
-                                     Expr *BaseExpr,
-                                     const RecordType *RTy,
+                                     Expr *BaseExpr, QualType RTy,
                                      SourceLocation OpLoc, bool IsArrow,
                                      CXXScopeSpec &SS, bool HasTemplateArgs,
                                      SourceLocation TemplateKWLoc,
                                      TypoExpr *&TE) {
   SourceRange BaseRange = BaseExpr ? BaseExpr->getSourceRange() : SourceRange();
-  RecordDecl *RDecl = RTy->getDecl();
-  if (!SemaRef.isThisOutsideMemberFunctionBody(QualType(RTy, 0)) &&
-      SemaRef.RequireCompleteType(OpLoc, QualType(RTy, 0),
-                                  diag::err_typecheck_incomplete_tag,
-                                  BaseRange))
+  if (!RTy->isDependentType() &&
+      !SemaRef.isThisOutsideMemberFunctionBody(RTy) &&
+      SemaRef.RequireCompleteType(
+          OpLoc, RTy, diag::err_typecheck_incomplete_tag, BaseRange))
     return true;
 
-  if (HasTemplateArgs || TemplateKWLoc.isValid()) {
-    // LookupTemplateName doesn't expect these both to exist simultaneously.
-    QualType ObjectType = SS.isSet() ? QualType() : QualType(RTy, 0);
+  // LookupTemplateName/LookupParsedName don't expect these both to exist
+  // simultaneously.
+  QualType ObjectType = SS.isSet() ? QualType() : RTy;
+  if (HasTemplateArgs || TemplateKWLoc.isValid())
+    return SemaRef.LookupTemplateName(R,
+                                      /*S=*/nullptr, SS, ObjectType,
+                                      /*EnteringContext=*/false, TemplateKWLoc);
 
-    bool MOUS;
-    return SemaRef.LookupTemplateName(R, nullptr, SS, ObjectType, false, MOUS,
-                                      TemplateKWLoc);
-  }
-
-  DeclContext *DC = RDecl;
-  if (SS.isSet()) {
-    // If the member name was a qualified-id, look into the
-    // nested-name-specifier.
-    DC = SemaRef.computeDeclContext(SS, false);
-
-    if (SemaRef.RequireCompleteDeclContext(SS, DC)) {
-      SemaRef.Diag(SS.getRange().getEnd(), diag::err_typecheck_incomplete_tag)
-          << SS.getRange() << DC;
-      return true;
-    }
-
-    assert(DC && "Cannot handle non-computable dependent contexts in lookup");
-
-    if (!isa<TypeDecl>(DC)) {
-      SemaRef.Diag(R.getNameLoc(), diag::err_qualified_member_nonclass)
-          << DC << SS.getRange();
-      return true;
-    }
-  }
-
-  // The record definition is complete, now look up the member.
-  SemaRef.LookupQualifiedName(R, DC, SS);
+  SemaRef.LookupParsedName(R, /*S=*/nullptr, &SS, ObjectType);
 
-  if (!R.empty())
+  if (!R.empty() || R.wasNotFoundInCurrentInstantiation())
     return false;
 
   DeclarationName Typo = R.getLookupName();
   SourceLocation TypoLoc = R.getNameLoc();
+  // Recompute the lookup context.
+  DeclContext *DC = SS.isSet() ? SemaRef.computeDeclContext(SS)
+                               : SemaRef.computeDeclContext(RTy);
 
   struct QueryState {
     Sema &SemaRef;
@@ -788,7 +766,8 @@ static bool LookupMemberExprInRecord(Sema &SemaRef, LookupResult &R,
                                        << Typo << DC << DroppedSpecifier
                                        << SS.getRange());
         } else {
-          SemaRef.Diag(TypoLoc, diag::err_no_member) << Typo << DC << BaseRange;
+          SemaRef.Diag(TypoLoc, diag::err_no_member)
+              << Typo << DC << (SS.isSet() ? SS.getRange() : BaseRange);
         }
       },
       [=](Sema &SemaRef, TypoExpr *TE, TypoCorrection TC) mutable {
@@ -814,34 +793,25 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R,
                                    Decl *ObjCImpDecl, bool HasTemplateArgs,
                                    SourceLocation TemplateKWLoc);
 
-ExprResult
-Sema::BuildMemberReferenceExpr(Expr *Base, QualType BaseType,
-                               SourceLocation OpLoc, bool IsArrow,
-                               CXXScopeSpec &SS,
-                               SourceLocation TemplateKWLoc,
-                               NamedDecl *FirstQualifierInScope,
-                               const DeclarationNameInfo &NameInfo,
-                               const TemplateArgumentListInfo *TemplateArgs,
-                               const Scope *S,
-                               ActOnMemberAccessExtraArgs *ExtraArgs) {
-  if (BaseType->isDependentType() ||
-      (SS.isSet() && isDependentScopeSpecifier(SS)) ||
-      NameInfo.getName().isDependentName())
-    return ActOnDependentMemberExpr(Base, BaseType,
-                                    IsArrow, OpLoc,
-                                    SS, TemplateKWLoc, FirstQualifierInScope,
-                                    NameInfo, TemplateArgs);
-
+ExprResult Sema::BuildMemberReferenceExpr(
+    Expr *Base, QualType BaseType, SourceLocation OpLoc, bool IsArrow,
+    CXXScopeSpec &SS, SourceLocation TemplateKWLoc,
+    NamedDecl *FirstQualifierInScope, const DeclarationNameInfo &NameInfo,
+    const TemplateArgumentListInfo *TemplateArgs, const Scope *S,
+    ActOnMemberAccessExtraArgs *ExtraArgs) {
   LookupResult R(*this, NameInfo, LookupMemberName);
 
+  if (SS.isInvalid())
+    return ExprError();
+
   // Implicit member accesses.
   if (!Base) {
     TypoExpr *TE = nullptr;
     QualType RecordTy = BaseType;
     if (IsArrow) RecordTy = RecordTy->castAs<PointerType>()->getPointeeType();
-    if (LookupMemberExprInRecord(
-            *this, R, nullptr, RecordTy->castAs<RecordType>(), OpLoc, IsArrow,
-            SS, TemplateArgs != nullptr, TemplateKWLoc, TE))
+    if (LookupMemberExprInRecord(*this, R, nullptr, RecordTy, OpLoc, IsArrow,
+                                 SS, TemplateArgs != nullptr, TemplateKWLoc,
+                                 TE))
       return ExprError();
     if (TE)
       return TE;
@@ -969,19 +939,6 @@ BuildMSPropertyRefExpr(Sema &S, Expr *BaseExpr, bool IsArrow,
 }
 
 MemberExpr *Sema::BuildMemberExpr(
-    Expr *Base, bool IsArrow, SourceLocation OpLoc, const CXXScopeSpec *SS,
-    SourceLocation TemplateKWLoc, ValueDecl *Member, DeclAccessPair FoundDecl,
-    bool HadMultipleCandidates, const DeclarationNameInfo &MemberNameInfo,
-    QualType Ty, ExprValueKind VK, ExprObjectKind OK,
-    const TemplateArgumentListInfo *TemplateArgs) {
-  NestedNameSpecifierLoc NNS =
-      SS ? SS->getWithLocInContext(Context) : NestedNameSpecifierLoc();
-  return BuildMemberExpr(Base, IsArrow, OpLoc, NNS, TemplateKWLoc, Member,
-                         FoundDecl, HadMultipleCandidates, MemberNameInfo, Ty,
-                         VK, OK, TemplateArgs);
-}
-
-MemberExpr *Sema::BuildMemberExpr(
     Expr *Base, bool IsArrow, SourceLocation OpLoc, NestedNameSpecifierLoc NNS,
     SourceLocation TemplateKWLoc, ValueDecl *Member, DeclAccessPair FoundDecl,
     bool HadMultipleCandidates, const DeclarationNameInfo &MemberNameInfo,
@@ -1033,6 +990,17 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
                                const Scope *S,
                                bool SuppressQualifierCheck,
                                ActOnMemberAccessExtraArgs *ExtraArgs) {
+  assert(!SS.isInvalid() && "nested-name-specifier cannot be invalid");
+  // If the member wasn't found in the current instantiation, or if the
+  // arrow operator was used with a dependent non-pointer object expression,
+  // build a CXXDependentScopeMemberExpr.
+  if (R.wasNotFoundInCurrentInstantiation() ||
+      (IsArrow && !BaseExprType->isPointerType() &&
+       BaseExprType->isDependentType()))
+    return ActOnDependentMemberExpr(BaseExpr, BaseExprType, IsArrow, OpLoc, SS,
+                                    TemplateKWLoc, FirstQualifierInScope,
+                                    R.getLookupNameInfo(), TemplateArgs);
+
   QualType BaseType = BaseExprType;
   if (IsArrow) {
     assert(BaseType->isPointerType());
@@ -1040,6 +1008,11 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
   }
   R.setBaseObjectType(BaseType);
 
+  assert((SS.isEmpty()
+              ? !BaseType->isDependentType() || computeDeclContext(BaseType)
+              : !isDependentScopeSpecifier(SS) || computeDeclContext(SS)) &&
+         "dependent lookup context that isn't the current instantiation?");
+
   // C++1z [expr.ref]p2:
   //   For the first option (dot) the first expression shall be a glvalue [...]
   if (!IsArrow && BaseExpr && BaseExpr->isPRValue()) {
@@ -1068,40 +1041,39 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
         << isa<CXXDestructorDecl>(FD);
 
   if (R.empty()) {
-    // Rederive where we looked up.
-    DeclContext *DC = (SS.isSet()
-                       ? computeDeclContext(SS, false)
-                       : BaseType->castAs<RecordType>()->getDecl());
-
-    if (ExtraArgs) {
-      ExprResult RetryExpr;
-      if (!IsArrow && BaseExpr) {
-        SFINAETrap Trap(*this, true);
-        ParsedType ObjectType;
-        bool MayBePseudoDestructor = false;
-        RetryExpr = ActOnStartCXXMemberReference(getCurScope(), BaseExpr,
-                                                 OpLoc, tok::arrow, ObjectType,
-                                                 MayBePseudoDestructor);
-        if (RetryExpr.isUsable() && !Trap.hasErrorOccurred()) {
-          CXXScopeSpec TempSS(SS);
-          RetryExpr = ActOnMemberAccessExpr(
-              ExtraArgs->S, RetryExpr.get(), OpLoc, tok::arrow, TempSS,
-              TemplateKWLoc, ExtraArgs->Id, ExtraArgs->ObjCImpDecl);
-        }
-        if (Trap.hasErrorOccurred())
-          RetryExpr = ExprError();
-      }
-      if (RetryExpr.isUsable()) {
-        Diag(OpLoc, diag::err_no_member_overloaded_arrow)
-          << MemberName << DC << FixItHint::CreateReplacement(OpLoc, "->");
-        return RetryExpr;
+    ExprResult RetryExpr = ExprError();
+    if (ExtraArgs && !IsArrow && BaseExpr && !BaseExpr->isTypeDependent()) {
+      SFINAETrap Trap(*this, true);
+      ParsedType ObjectType;
+      bool MayBePseudoDestructor = false;
+      RetryExpr = ActOnStartCXXMemberReference(getCurScope(), BaseExpr, OpLoc,
+                                               tok::arrow, ObjectType,
+                                               MayBePseudoDestructor);
+      if (RetryExpr.isUsable() && !Trap.hasErrorOccurred()) {
+        CXXScopeSpec TempSS(SS);
+        RetryExpr = ActOnMemberAccessExpr(
+            ExtraArgs->S, RetryExpr.get(), OpLoc, tok::arrow, TempSS,
+            TemplateKWLoc, ExtraArgs->Id, ExtraArgs->ObjCImpDecl);
       }
+      if (Trap.hasErrorOccurred())
+        RetryExpr = ExprError();
     }
 
-    Diag(R.getNameLoc(), diag::err_no_member)
-      << MemberName << DC
-      << (BaseExpr ? BaseExpr->getSourceRange() : SourceRange());
-    return ExprError();
+    // Rederive where we looked up.
+    DeclContext *DC =
+        (SS.isSet() ? computeDeclContext(SS) : computeDeclContext(BaseType));
+    assert(DC);
+
+    if (RetryExpr.isUsable())
+      Diag(OpLoc, diag::err_no_member_overloaded_arrow)
+          << MemberName << DC << FixItHint::CreateReplacement(OpLoc, "->");
+    else
+      Diag(R.getNameLoc(), diag::err_no_member)
+          << MemberName << DC
+          << (SS.isSet()
+                  ? SS.getRange()
+                  : (BaseExpr ? BaseExpr->getSourceRange() : SourceRange()));
+    return RetryExpr;
   }
 
   // Diagnose lookups that find only declarations from a non-base
@@ -1186,7 +1158,8 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
                                                     OpLoc);
 
   if (VarDecl *Var = dyn_cast<VarDecl>(MemberDecl)) {
-    return BuildMemberExpr(BaseExpr, IsArrow, OpLoc, &SS, TemplateKWLoc, Var,
+    return BuildMemberExpr(BaseExpr, IsArrow, OpLoc,
+                           SS.getWithLocInContext(Context), TemplateKWLoc, Var,
                            FoundDecl, /*HadMultipleCandidates=*/false,
                            MemberNameInfo, Var->getType().getNonReferenceType(),
                            VK_LValue, OK_Ordinary);
@@ -1203,17 +1176,18 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
       type = MemberFn->getType();
     }
 
-    return BuildMemberExpr(BaseExpr, IsArrow, OpLoc, &SS, TemplateKWLoc,
+    return BuildMemberExpr(BaseExpr, IsArrow, OpLoc,
+                           SS.getWithLocInContext(Context), TemplateKWLoc,
                            MemberFn, FoundDecl, /*HadMultipleCandidates=*/false,
                            MemberNameInfo, type, valueKind, OK_Ordinary);
   }
   assert(!isa<FunctionDecl>(MemberDecl) && "member function not C++ method?");
 
   if (EnumConstantDecl *Enum = dyn_cast<EnumConstantDecl>(MemberDecl)) {
-    return BuildMemberExpr(BaseExpr, IsArrow, OpLoc, &SS, TemplateKWLoc, Enum,
-                           FoundDecl, /*HadMultipleCandidates=*/false,
-                           MemberNameInfo, Enum->getType(), VK_PRValue,
-                           OK_Ordinary);
+    return BuildMemberExpr(
+        BaseExpr, IsArrow, OpLoc, SS.getWithLocInContext(Context),
+        TemplateKWLoc, Enum, FoundDecl, /*HadMultipleCandidates=*/false,
+        MemberNameInfo, Enum->getType(), VK_PRValue, OK_Ordinary);
   }
 
   if (VarTemplateDecl *VarTempl = dyn_cast<VarTemplateDecl>(MemberDecl)) {
@@ -1237,7 +1211,8 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
     if (!Var->getTemplateSpecializationKind())
       Var->setTemplateSpecializationKind(TSK_ImplicitInstantiation, MemberLoc);
 
-    return BuildMemberExpr(BaseExpr, IsArrow, OpLoc, &SS, TemplateKWLoc, Var,
+    return BuildMemberExpr(BaseExpr, IsArrow, OpLoc,
+                           SS.getWithLocInContext(Context), TemplateKWLoc, Var,
                            FoundDecl, /*HadMultipleCandidates=*/false,
                            MemberNameInfo, Var->getType().getNonReferenceType(),
                            VK_LValue, OK_Ordinary, TemplateArgs);
@@ -1330,7 +1305,6 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R,
     return ExprError();
 
   QualType BaseType = BaseExpr.get()->getType();
-  assert(!BaseType->isDependentType());
 
   DeclarationName MemberName = R.getLookupName();
   SourceLocation MemberLoc = R.getNameLoc();
@@ -1342,29 +1316,31 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R,
   if (IsArrow) {
     if (const PointerType *Ptr = BaseType->getAs<PointerType>())
       BaseType = Ptr->getPointeeType();
-    else if (const ObjCObjectPointerType *Ptr
-               = BaseType->getAs<ObjCObjectPointerType>())
+    else if (const ObjCObjectPointerType *Ptr =
+                 BaseType->getAs<ObjCObjectPointerType>())
       BaseType = Ptr->getPointeeType();
-    else if (BaseType->isRecordType()) {
-      // Recover from arrow accesses to records, e.g.:
-      //   struct MyRecord foo;
-      //   foo->bar
-      // This is actually well-formed in C++ if MyRecord has an
-      // overloaded operator->, but that should have been dealt with
-      // by now--or a diagnostic message already issued if a problem
-      // was encountered while looking for the overloaded operator->.
-      if (!S.getLangOpts().CPlusPlus) {
-        S.Diag(OpLoc, diag::err_typecheck_member_reference_suggestion)
-          << BaseType << int(IsArrow) << BaseExpr.get()->getSourceRange()
-          << FixItHint::CreateReplacement(OpLoc, ".");
+    else if (!BaseType->isDependentType()) {
+      if (BaseType->isRecordType()) {
+        // Recover from arrow accesses to records, e.g.:
+        //   struct MyRecord foo;
+        //   foo->bar
+        // This is actually well-formed in C++ if MyRecord has an
+        // overloaded operator->, but that should have been dealt with
+        // by now--or a diagnostic message already issued if a problem
+        // was encountered while looking for the overloaded operator->.
+        if (!S.getLangOpts().CPlusPlus) {
+          S.Diag(OpLoc, diag::err_typecheck_member_reference_suggestion)
+              << BaseType << int(IsArrow) << BaseExpr.get()->getSourceRange()
+              << FixItHint::CreateReplacement(OpLoc, ".");
+        }
+        IsArrow = false;
+      } else if (BaseType->isFunctionType()) {
+        goto fail;
+      } else {
+        S.Diag(MemberLoc, diag::err_typecheck_member_reference_arrow)
+            << BaseType << BaseExpr.get()->getSourceRange();
+        return ExprError();
       }
-      IsArrow = false;
-    } else if (BaseType->isFunctionType()) {
-      goto fail;
-    } else {
-      S.Diag(MemberLoc, diag::err_typecheck_member_reference_arrow)
-        << BaseType << BaseExpr.get()->getSourceRange();
-      return ExprError();
     }
   }
 
@@ -1384,10 +1360,10 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R,
   }
 
   // Handle field access to simple records.
-  if (const RecordType *RTy = BaseType->getAs<RecordType>()) {
+  if (BaseType->getAsRecordDecl() || BaseType->isDependentType()) {
     TypoExpr *TE = nullptr;
-    if (LookupMemberExprInRecord(S, R, BaseExpr.get(), RTy, OpLoc, IsArrow, SS,
-                                 HasTemplateArgs, TemplateKWLoc, TE))
+    if (LookupMemberExprInRecord(S, R, BaseExpr.get(), BaseType, OpLoc, IsArrow,
+                                 SS, HasTemplateArgs, TemplateKWLoc, TE))
       return ExprError();
 
     // Returning valid-but-null is how we indicate to the caller that
@@ -1810,7 +1786,6 @@ ExprResult Sema::ActOnMemberAccessExpr(Scope *S, Expr *Base,
   DecomposeUnqualifiedId(Id, TemplateArgsBuffer,
                          NameInfo, TemplateArgs);
 
-  DeclarationName Name = NameInfo.getName();
   bool IsArrow = (OpKind == tok::arrow);
 
   if (getLangOpts().HLSL && IsArrow)
@@ -1824,13 +1799,6 @@ ExprResult Sema::ActOnMemberAccessExpr(Scope *S, Expr *Base,
   if (Result.isInvalid()) return ExprError();
   Base = Result.get();
 
-  if (Base->getType()->isDependentType() || Name.isDependentName() ||
-      isDependentScopeSpecifier(SS)) {
-    return ActOnDependentMemberExpr(Base, Base->getType(), IsArrow, OpLoc, SS,
-                                    TemplateKWLoc, FirstQualifierInScope,
-                                    NameInfo, TemplateArgs);
-  }
-
   ActOnMemberAccessExtraArgs ExtraArgs = {S, Id, ObjCImpDecl};
   ExprResult Res = BuildMemberReferenceExpr(
       Base, Base->getType(), OpLoc, IsArrow, SS, TemplateKWLoc,
@@ -1949,10 +1917,10 @@ Sema::BuildFieldReferenceExpr(Expr *BaseExpr, bool IsArrow,
     }
   }
 
-  return BuildMemberExpr(Base.get(), IsArrow, OpLoc, &SS,
-                         /*TemplateKWLoc=*/SourceLocation(), Field, FoundDecl,
-                         /*HadMultipleCandidates=*/false, MemberNameInfo,
-                         MemberType, VK, OK);
+  return BuildMemberExpr(
+      Base.get(), IsArrow, OpLoc, SS.getWithLocInContext(Context),
+      /*TemplateKWLoc=*/SourceLocation(), Field, FoundDecl,
+      /*HadMultipleCandidates=*/false, MemberNameInfo, MemberType, VK, OK);
 }
 
 /// Builds an implicit member access expression.  The current context
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 793e16df1789..7d9eaf672046 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -7753,9 +7753,9 @@ static void visitLocalsRetainedByReferenceBinding(IndirectLocalPath &Path,
     break;
   }
 
-  case Stmt::OMPArraySectionExprClass: {
+  case Stmt::ArraySectionExprClass: {
     visitLocalsRetainedByInitializer(Path,
-                                     cast<OMPArraySectionExpr>(Init)->getBase(),
+                                     cast<ArraySectionExpr>(Init)->getBase(),
                                      Visit, true, EnableLifetimeWarnings);
     break;
   }
@@ -8340,8 +8340,17 @@ void Sema::checkInitializerLifetime(const InitializedEntity &Entity,
             << Entity.getType()->isReferenceType() << CLE->getInitializer() << 2
             << DiagRange;
       } else {
-        Diag(DiagLoc, diag::warn_ret_local_temp_addr_ref)
-         << Entity.getType()->isReferenceType() << DiagRange;
+        // P2748R5: Disallow Binding a Returned Glvalue to a Temporary.
+        // [stmt.return]/p6: In a function whose return type is a reference,
+        // other than an invented function for std::is_convertible ([meta.rel]),
+        // a return statement that binds the returned reference to a temporary
+        // expression ([class.temporary]) is ill-formed.
+        if (getLangOpts().CPlusPlus26 && Entity.getType()->isReferenceType())
+          Diag(DiagLoc, diag::err_ret_local_temp_ref)
+              << Entity.getType()->isReferenceType() << DiagRange;
+        else
+          Diag(DiagLoc, diag::warn_ret_local_temp_addr_ref)
+              << Entity.getType()->isReferenceType() << DiagRange;
       }
       break;
     }
@@ -10790,8 +10799,6 @@ QualType Sema::DeduceTemplateSpecializationFromInitializer(
   // FIXME: Perform "exact type" matching first, per CWG discussion?
   //        Or implement this via an implied 'T(T) -> T' deduction guide?
 
-  // FIXME: Do we need/want a std::initializer_list<T> special case?
-
   // Look up deduction guides, including those synthesized from constructors.
   //
   // C++1z [over.match.class.deduct]p1:
diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp
index 55af414df39f..2f6ad49fc08b 100644
--- a/clang/lib/Sema/SemaLookup.cpp
+++ b/clang/lib/Sema/SemaLookup.cpp
@@ -1282,6 +1282,31 @@ bool Sema::CppLookupName(LookupResult &R, Scope *S) {
       if (DeclContext *DC = PreS->getEntity())
         DeclareImplicitMemberFunctionsWithName(*this, Name, R.getNameLoc(), DC);
   }
+  // C++23 [temp.dep.general]p2:
+  //   The component name of an unqualified-id is dependent if
+  //   - it is a conversion-function-id whose conversion-type-id
+  //     is dependent, or
+  //   - it is operator= and the current class is a templated entity, or
+  //   - the unqualified-id is the postfix-expression in a dependent call.
+  if (Name.getNameKind() == DeclarationName::CXXConversionFunctionName &&
+      Name.getCXXNameType()->isDependentType()) {
+    R.setNotFoundInCurrentInstantiation();
+    return false;
+  }
+
+  // If this is the name of an implicitly-declared special member function,
+  // go through the scope stack to implicitly declare
+  if (isImplicitlyDeclaredMemberFunctionName(Name)) {
+    for (Scope *PreS = S; PreS; PreS = PreS->getParent())
+      if (DeclContext *DC = PreS->getEntity()) {
+        if (DC->isDependentContext() && isa<CXXRecordDecl>(DC) &&
+            Name.getCXXOverloadedOperator() == OO_Equal) {
+          R.setNotFoundInCurrentInstantiation();
+          return false;
+        }
+        DeclareImplicitMemberFunctionsWithName(*this, Name, R.getNameLoc(), DC);
+      }
+  }
 
   // Implicitly declare member functions with the name we're looking for, if in
   // fact we are in a scope where it matters.
@@ -2446,10 +2471,33 @@ bool Sema::LookupQualifiedName(LookupResult &R, DeclContext *LookupCtx,
     }
   } QL(LookupCtx);
 
+  CXXRecordDecl *LookupRec = dyn_cast<CXXRecordDecl>(LookupCtx);
+  // FIXME: Per [temp.dep.general]p2, an unqualified name is also dependent
+  // if it's a dependent conversion-function-id or operator= where the current
+  // class is a templated entity. This should be handled in LookupName.
+  if (!InUnqualifiedLookup && !R.isForRedeclaration()) {
+    // C++23 [temp.dep.type]p5:
+    //   A qualified name is dependent if
+    //   - it is a conversion-function-id whose conversion-type-id
+    //     is dependent, or
+    //   - [...]
+    //   - its lookup context is the current instantiation and it
+    //     is operator=, or
+    //   - [...]
+    if (DeclarationName Name = R.getLookupName();
+        (Name.getNameKind() == DeclarationName::CXXConversionFunctionName &&
+         Name.getCXXNameType()->isDependentType()) ||
+        (Name.getCXXOverloadedOperator() == OO_Equal && LookupRec &&
+         LookupRec->isDependentContext())) {
+      R.setNotFoundInCurrentInstantiation();
+      return false;
+    }
+  }
+
   if (LookupDirect(*this, R, LookupCtx)) {
     R.resolveKind();
-    if (isa<CXXRecordDecl>(LookupCtx))
-      R.setNamingClass(cast<CXXRecordDecl>(LookupCtx));
+    if (LookupRec)
+      R.setNamingClass(LookupRec);
     return true;
   }
 
@@ -2471,7 +2519,6 @@ bool Sema::LookupQualifiedName(LookupResult &R, DeclContext *LookupCtx,
 
   // If this isn't a C++ class, we aren't allowed to look into base
   // classes, we're done.
-  CXXRecordDecl *LookupRec = dyn_cast<CXXRecordDecl>(LookupCtx);
   if (!LookupRec || !LookupRec->getDefinition())
     return false;
 
@@ -2718,38 +2765,54 @@ bool Sema::LookupQualifiedName(LookupResult &R, DeclContext *LookupCtx,
 ///
 /// @returns True if any decls were found (but possibly ambiguous)
 bool Sema::LookupParsedName(LookupResult &R, Scope *S, CXXScopeSpec *SS,
-                            bool AllowBuiltinCreation, bool EnteringContext) {
-  if (SS && SS->isInvalid()) {
-    // When the scope specifier is invalid, don't even look for
-    // anything.
+                            QualType ObjectType, bool AllowBuiltinCreation,
+                            bool EnteringContext) {
+  // When the scope specifier is invalid, don't even look for anything.
+  if (SS && SS->isInvalid())
     return false;
-  }
 
-  if (SS && SS->isSet()) {
-    NestedNameSpecifier *NNS = SS->getScopeRep();
-    if (NNS->getKind() == NestedNameSpecifier::Super)
+  // Determine where to perform name lookup
+  DeclContext *DC = nullptr;
+  bool IsDependent = false;
+  if (!ObjectType.isNull()) {
+    // This nested-name-specifier occurs in a member access expression, e.g.,
+    // x->B::f, and we are looking into the type of the object.
+    assert((!SS || SS->isEmpty()) &&
+           "ObjectType and scope specifier cannot coexist");
+    DC = computeDeclContext(ObjectType);
+    IsDependent = !DC && ObjectType->isDependentType();
+    assert(((!DC && ObjectType->isDependentType()) ||
+            !ObjectType->isIncompleteType() || !ObjectType->getAs<TagType>() ||
+            ObjectType->castAs<TagType>()->isBeingDefined()) &&
+           "Caller should have completed object type");
+  } else if (SS && SS->isNotEmpty()) {
+    if (NestedNameSpecifier *NNS = SS->getScopeRep();
+        NNS->getKind() == NestedNameSpecifier::Super)
       return LookupInSuper(R, NNS->getAsRecordDecl());
-
-    if (DeclContext *DC = computeDeclContext(*SS, EnteringContext)) {
-      // We have resolved the scope specifier to a particular declaration
-      // contex, and will perform name lookup in that context.
+    // This nested-name-specifier occurs after another nested-name-specifier,
+    // so long into the context associated with the prior nested-name-specifier.
+    if ((DC = computeDeclContext(*SS, EnteringContext))) {
+      // The declaration context must be complete.
       if (!DC->isDependentContext() && RequireCompleteDeclContext(*SS, DC))
         return false;
-
       R.setContextRange(SS->getRange());
-      return LookupQualifiedName(R, DC);
     }
+    IsDependent = !DC && isDependentScopeSpecifier(*SS);
+  } else {
+    // Perform unqualified name lookup starting in the given scope.
+    return LookupName(R, S, AllowBuiltinCreation);
+  }
 
+  // If we were able to compute a declaration context, perform qualified name
+  // lookup in that context.
+  if (DC)
+    return LookupQualifiedName(R, DC);
+  else if (IsDependent)
     // We could not resolve the scope specified to a specific declaration
     // context, which means that SS refers to an unknown specialization.
     // Name lookup can't find anything in this case.
     R.setNotFoundInCurrentInstantiation();
-    R.setContextRange(SS->getRange());
-    return false;
-  }
-
-  // Perform unqualified name lookup starting in the given scope.
-  return LookupName(R, S, AllowBuiltinCreation);
+  return false;
 }
 
 /// Perform qualified name lookup into all base classes of the given
@@ -5018,8 +5081,9 @@ static void LookupPotentialTypoResult(Sema &SemaRef,
     return;
   }
 
-  SemaRef.LookupParsedName(Res, S, SS, /*AllowBuiltinCreation=*/false,
-                           EnteringContext);
+  SemaRef.LookupParsedName(Res, S, SS,
+                           /*ObjectType=*/QualType(),
+                           /*AllowBuiltinCreation=*/false, EnteringContext);
 
   // Fake ivar lookup; this should really be part of
   // LookupParsedName.
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index ba69e71e30a1..3ea81e0497c2 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -103,6 +103,18 @@ bool doesClauseApplyToDirective(OpenACCDirectiveKind DirectiveKind,
     default:
       return false;
     }
+  case OpenACCClauseKind::Private:
+    switch (DirectiveKind) {
+    case OpenACCDirectiveKind::Parallel:
+    case OpenACCDirectiveKind::Serial:
+    case OpenACCDirectiveKind::Loop:
+    case OpenACCDirectiveKind::ParallelLoop:
+    case OpenACCDirectiveKind::SerialLoop:
+    case OpenACCDirectiveKind::KernelsLoop:
+      return true;
+    default:
+      return false;
+    }
   default:
     // Do nothing so we can go to the 'unimplemented' diagnostic instead.
     return true;
@@ -303,6 +315,21 @@ SemaOpenACC::ActOnClause(ArrayRef<const OpenACCClause *> ExistingClauses,
         getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(),
         Clause.getIntExprs()[0], Clause.getEndLoc());
   }
+  case OpenACCClauseKind::Private: {
+    // Restrictions only properly implemented on 'compute' constructs, and
+    // 'compute' constructs are the only construct that can do anything with
+    // this yet, so skip/treat as unimplemented in this case.
+    if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()))
+      break;
+
+    // ActOnVar ensured that everything is a valid variable reference, so there
+    // really isn't anything to do here. GCC does some duplicate-finding, though
+    // it isn't apparent in the standard where this is justified.
+
+    return OpenACCPrivateClause::Create(
+        getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(),
+        Clause.getVarList(), Clause.getEndLoc());
+  }
   default:
     break;
   }
@@ -423,6 +450,67 @@ ExprResult SemaOpenACC::ActOnIntExpr(OpenACCDirectiveKind DK,
   return IntExpr;
 }
 
+ExprResult SemaOpenACC::ActOnVar(Expr *VarExpr) {
+  // We still need to retain the array subscript/subarray exprs, so work on a
+  // copy.
+  Expr *CurVarExpr = VarExpr->IgnoreParenImpCasts();
+
+  // Sub-arrays/subscript-exprs are fine as long as the base is a
+  // VarExpr/MemberExpr. So strip all of those off.
+  while (isa<ArraySectionExpr, ArraySubscriptExpr>(CurVarExpr)) {
+    if (auto *SubScrpt = dyn_cast<ArraySubscriptExpr>(CurVarExpr))
+      CurVarExpr = SubScrpt->getBase()->IgnoreParenImpCasts();
+    else
+      CurVarExpr =
+          cast<ArraySectionExpr>(CurVarExpr)->getBase()->IgnoreParenImpCasts();
+  }
+
+  // References to a VarDecl are fine.
+  if (const auto *DRE = dyn_cast<DeclRefExpr>(CurVarExpr)) {
+    if (isa<VarDecl, NonTypeTemplateParmDecl>(
+            DRE->getDecl()->getCanonicalDecl()))
+      return VarExpr;
+  }
+
+  // A MemberExpr that references a Field is valid.
+  if (const auto *ME = dyn_cast<MemberExpr>(CurVarExpr)) {
+    if (isa<FieldDecl>(ME->getMemberDecl()->getCanonicalDecl()))
+      return VarExpr;
+  }
+
+  // Referring to 'this' is always OK.
+  if (isa<CXXThisExpr>(CurVarExpr))
+    return VarExpr;
+
+  // Nothing really we can do here, as these are dependent.  So just return they
+  // are valid.
+  if (isa<DependentScopeDeclRefExpr, CXXDependentScopeMemberExpr>(CurVarExpr))
+    return VarExpr;
+
+  // There isn't really anything we can do in the case of a recovery expr, so
+  // skip the diagnostic rather than produce a confusing diagnostic.
+  if (isa<RecoveryExpr>(CurVarExpr))
+    return ExprError();
+
+  Diag(VarExpr->getExprLoc(), diag::err_acc_not_a_var_ref);
+  return ExprError();
+}
+
+ExprResult SemaOpenACC::ActOnArraySectionExpr(Expr *Base, SourceLocation LBLoc,
+                                              Expr *LowerBound,
+                                              SourceLocation ColonLoc,
+                                              Expr *Length,
+                                              SourceLocation RBLoc) {
+  ASTContext &Context = getASTContext();
+
+  // TODO OpenACC: We likely have to reproduce a lot of the same logic from the
+  // OMP version of this, but at the moment we don't have a good way to test it,
+  // so for now we'll just create the node.
+  return new (Context)
+      ArraySectionExpr(Base, LowerBound, Length, Context.ArraySectionTy,
+                       VK_LValue, OK_Ordinary, ColonLoc, RBLoc);
+}
+
 bool SemaOpenACC::ActOnStartStmtDirective(OpenACCDirectiveKind K,
                                           SourceLocation StartLoc) {
   return diagnoseConstructAppertainment(*this, K, StartLoc, /*IsStmt=*/true);
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 5ba09926acf2..cf5447f223d4 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -2230,7 +2230,7 @@ bool SemaOpenMP::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level,
               dyn_cast<UnaryOperator>(Last->getAssociatedExpression());
           if ((UO && UO->getOpcode() == UO_Deref) ||
               isa<ArraySubscriptExpr>(Last->getAssociatedExpression()) ||
-              isa<OMPArraySectionExpr>(Last->getAssociatedExpression()) ||
+              isa<ArraySectionExpr>(Last->getAssociatedExpression()) ||
               isa<MemberExpr>(EI->getAssociatedExpression()) ||
               isa<OMPArrayShapingExpr>(Last->getAssociatedExpression())) {
             IsVariableAssociatedWithSection = true;
@@ -3061,7 +3061,9 @@ ExprResult SemaOpenMP::ActOnOpenMPIdExpression(Scope *CurScope,
                                                OpenMPDirectiveKind Kind) {
   ASTContext &Context = getASTContext();
   LookupResult Lookup(SemaRef, Id, Sema::LookupOrdinaryName);
-  SemaRef.LookupParsedName(Lookup, CurScope, &ScopeSpec, true);
+  SemaRef.LookupParsedName(Lookup, CurScope, &ScopeSpec,
+                           /*ObjectType=*/QualType(),
+                           /*AllowBuiltinCreation=*/true);
 
   if (Lookup.isAmbiguous())
     return ExprError();
@@ -3884,7 +3886,7 @@ public:
                                     MappableComponent &MC) {
                                return MC.getAssociatedDeclaration() ==
                                           nullptr &&
-                                      (isa<OMPArraySectionExpr>(
+                                      (isa<ArraySectionExpr>(
                                            MC.getAssociatedExpression()) ||
                                        isa<OMPArrayShapingExpr>(
                                            MC.getAssociatedExpression()) ||
@@ -4062,7 +4064,7 @@ public:
                   // Do both expressions have the same kind?
                   if (CCI->getAssociatedExpression()->getStmtClass() !=
                       SC.getAssociatedExpression()->getStmtClass())
-                    if (!((isa<OMPArraySectionExpr>(
+                    if (!((isa<ArraySectionExpr>(
                                SC.getAssociatedExpression()) ||
                            isa<OMPArrayShapingExpr>(
                                SC.getAssociatedExpression())) &&
@@ -5428,9 +5430,9 @@ static std::pair<ValueDecl *, bool> getPrivateItem(Sema &S, Expr *&RefExpr,
         Base = TempASE->getBase()->IgnoreParenImpCasts();
       RefExpr = Base;
       IsArrayExpr = ArraySubscript;
-    } else if (auto *OASE = dyn_cast_or_null<OMPArraySectionExpr>(RefExpr)) {
+    } else if (auto *OASE = dyn_cast_or_null<ArraySectionExpr>(RefExpr)) {
       Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
-      while (auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
+      while (auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
         Base = TempOASE->getBase()->IgnoreParenImpCasts();
       while (auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
         Base = TempASE->getBase()->IgnoreParenImpCasts();
@@ -6060,10 +6062,10 @@ processImplicitMapsWithDefaultMappers(Sema &S, DSAStackTy *Stack,
       // Array section - need to check for the mapping of the array section
       // element.
       QualType CanonType = E->getType().getCanonicalType();
-      if (CanonType->isSpecificBuiltinType(BuiltinType::OMPArraySection)) {
-        const auto *OASE = cast<OMPArraySectionExpr>(E->IgnoreParenImpCasts());
+      if (CanonType->isSpecificBuiltinType(BuiltinType::ArraySection)) {
+        const auto *OASE = cast<ArraySectionExpr>(E->IgnoreParenImpCasts());
         QualType BaseType =
-            OMPArraySectionExpr::getBaseOriginalType(OASE->getBase());
+            ArraySectionExpr::getBaseOriginalType(OASE->getBase());
         QualType ElemType;
         if (const auto *ATy = BaseType->getAsArrayTypeUnsafe())
           ElemType = ATy->getElementType();
@@ -7407,7 +7409,8 @@ void SemaOpenMP::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(
   const IdentifierInfo *BaseII = D.getIdentifier();
   LookupResult Lookup(SemaRef, DeclarationName(BaseII), D.getIdentifierLoc(),
                       Sema::LookupOrdinaryName);
-  SemaRef.LookupParsedName(Lookup, S, &D.getCXXScopeSpec());
+  SemaRef.LookupParsedName(Lookup, S, &D.getCXXScopeSpec(),
+                           /*ObjectType=*/QualType());
 
   TypeSourceInfo *TInfo = SemaRef.GetTypeForDeclarator(D);
   QualType FType = TInfo->getType();
@@ -19311,7 +19314,8 @@ buildDeclareReductionRef(Sema &SemaRef, SourceLocation Loc, SourceRange Range,
   if (S) {
     LookupResult Lookup(SemaRef, ReductionId, Sema::LookupOMPReductionName);
     Lookup.suppressDiagnostics();
-    while (S && SemaRef.LookupParsedName(Lookup, S, &ReductionIdScopeSpec)) {
+    while (S && SemaRef.LookupParsedName(Lookup, S, &ReductionIdScopeSpec,
+                                         /*ObjectType=*/QualType())) {
       NamedDecl *D = Lookup.getRepresentativeDecl();
       do {
         S = S->getParent();
@@ -19513,7 +19517,7 @@ struct ReductionData {
 } // namespace
 
 static bool checkOMPArraySectionConstantForReduction(
-    ASTContext &Context, const OMPArraySectionExpr *OASE, bool &SingleElement,
+    ASTContext &Context, const ArraySectionExpr *OASE, bool &SingleElement,
     SmallVectorImpl<llvm::APSInt> &ArraySizes) {
   const Expr *Length = OASE->getLength();
   if (Length == nullptr) {
@@ -19540,7 +19544,7 @@ static bool checkOMPArraySectionConstantForReduction(
 
   // We require length = 1 for all array sections except the right-most to
   // guarantee that the memory region is contiguous and has no holes in it.
-  while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base)) {
+  while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base)) {
     Length = TempOASE->getLength();
     if (Length == nullptr) {
       // For array sections of the form [1:] or [:], we would need to analyze
@@ -19745,12 +19749,12 @@ static bool actOnOMPReductionKindClause(
     Expr *TaskgroupDescriptor = nullptr;
     QualType Type;
     auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr->IgnoreParens());
-    auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr->IgnoreParens());
+    auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr->IgnoreParens());
     if (ASE) {
       Type = ASE->getType().getNonReferenceType();
     } else if (OASE) {
       QualType BaseType =
-          OMPArraySectionExpr::getBaseOriginalType(OASE->getBase());
+          ArraySectionExpr::getBaseOriginalType(OASE->getBase());
       if (const auto *ATy = BaseType->getAsArrayTypeUnsafe())
         Type = ATy->getElementType();
       else
@@ -21284,10 +21288,10 @@ OMPClause *SemaOpenMP::ActOnOpenMPDependClause(
           // List items used in depend clauses cannot be zero-length array
           // sections.
           QualType ExprTy = RefExpr->getType().getNonReferenceType();
-          const auto *OASE = dyn_cast<OMPArraySectionExpr>(SimpleExpr);
+          const auto *OASE = dyn_cast<ArraySectionExpr>(SimpleExpr);
           if (OASE) {
             QualType BaseType =
-                OMPArraySectionExpr::getBaseOriginalType(OASE->getBase());
+                ArraySectionExpr::getBaseOriginalType(OASE->getBase());
             if (BaseType.isNull())
               return nullptr;
             if (const auto *ATy = BaseType->getAsArrayTypeUnsafe())
@@ -21346,7 +21350,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPDependClause(
             Res = SemaRef.CreateBuiltinUnaryOp(ELoc, UO_AddrOf,
                                                RefExpr->IgnoreParenImpCasts());
           }
-          if (!Res.isUsable() && !isa<OMPArraySectionExpr>(SimpleExpr) &&
+          if (!Res.isUsable() && !isa<ArraySectionExpr>(SimpleExpr) &&
               !isa<OMPArrayShapingExpr>(SimpleExpr)) {
             Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item)
                 << (getLangOpts().OpenMP >= 50 ? 1 : 0)
@@ -21447,7 +21451,7 @@ static bool checkTypeMappable(SourceLocation SL, SourceRange SR, Sema &SemaRef,
 static bool checkArrayExpressionDoesNotReferToWholeSize(Sema &SemaRef,
                                                         const Expr *E,
                                                         QualType BaseQTy) {
-  const auto *OASE = dyn_cast<OMPArraySectionExpr>(E);
+  const auto *OASE = dyn_cast<ArraySectionExpr>(E);
 
   // If this is an array subscript, it refers to the whole size if the size of
   // the dimension is constant and equals 1. Also, an array section assumes the
@@ -21505,7 +21509,7 @@ static bool checkArrayExpressionDoesNotReferToWholeSize(Sema &SemaRef,
 static bool checkArrayExpressionDoesNotReferToUnitySize(Sema &SemaRef,
                                                         const Expr *E,
                                                         QualType BaseQTy) {
-  const auto *OASE = dyn_cast<OMPArraySectionExpr>(E);
+  const auto *OASE = dyn_cast<ArraySectionExpr>(E);
 
   // An array subscript always refer to a single element. Also, an array section
   // assumes the format of an array subscript if no colon is used.
@@ -21720,14 +21724,14 @@ public:
     return RelevantExpr || Visit(E);
   }
 
-  bool VisitOMPArraySectionExpr(OMPArraySectionExpr *OASE) {
+  bool VisitArraySectionExpr(ArraySectionExpr *OASE) {
     // After OMP 5.0  Array section in reduction clause will be implicitly
     // mapped
     assert(!(SemaRef.getLangOpts().OpenMP < 50 && NoDiagnose) &&
            "Array sections cannot be implicitly mapped.");
     Expr *E = OASE->getBase()->IgnoreParenImpCasts();
     QualType CurType =
-        OMPArraySectionExpr::getBaseOriginalType(E).getCanonicalType();
+        ArraySectionExpr::getBaseOriginalType(E).getCanonicalType();
 
     // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C++, p.1]
     //  If the type of a list item is a reference to a type T then the type
@@ -21900,7 +21904,7 @@ static const Expr *checkMapClauseExpressionBase(
       auto CE = CurComponents.rend();
       for (; CI != CE; ++CI) {
         const auto *OASE =
-            dyn_cast<OMPArraySectionExpr>(CI->getAssociatedExpression());
+            dyn_cast<ArraySectionExpr>(CI->getAssociatedExpression());
         if (!OASE)
           continue;
         if (OASE && OASE->getLength())
@@ -21970,10 +21974,10 @@ static bool checkMapConflicts(
           //  variable in map clauses of the same construct.
           if (CurrentRegionOnly &&
               (isa<ArraySubscriptExpr>(CI->getAssociatedExpression()) ||
-               isa<OMPArraySectionExpr>(CI->getAssociatedExpression()) ||
+               isa<ArraySectionExpr>(CI->getAssociatedExpression()) ||
                isa<OMPArrayShapingExpr>(CI->getAssociatedExpression())) &&
               (isa<ArraySubscriptExpr>(SI->getAssociatedExpression()) ||
-               isa<OMPArraySectionExpr>(SI->getAssociatedExpression()) ||
+               isa<ArraySectionExpr>(SI->getAssociatedExpression()) ||
                isa<OMPArrayShapingExpr>(SI->getAssociatedExpression()))) {
             SemaRef.Diag(CI->getAssociatedExpression()->getExprLoc(),
                          diag::err_omp_multiple_array_items_in_map_clause)
@@ -22001,11 +22005,10 @@ static bool checkMapConflicts(
           if (const auto *ASE =
                   dyn_cast<ArraySubscriptExpr>(SI->getAssociatedExpression())) {
             Type = ASE->getBase()->IgnoreParenImpCasts()->getType();
-          } else if (const auto *OASE = dyn_cast<OMPArraySectionExpr>(
+          } else if (const auto *OASE = dyn_cast<ArraySectionExpr>(
                          SI->getAssociatedExpression())) {
             const Expr *E = OASE->getBase()->IgnoreParenImpCasts();
-            Type =
-                OMPArraySectionExpr::getBaseOriginalType(E).getCanonicalType();
+            Type = ArraySectionExpr::getBaseOriginalType(E).getCanonicalType();
           } else if (const auto *OASE = dyn_cast<OMPArrayShapingExpr>(
                          SI->getAssociatedExpression())) {
             Type = OASE->getBase()->getType()->getPointeeType();
@@ -22181,7 +22184,8 @@ static ExprResult buildUserDefinedMapperRef(Sema &SemaRef, Scope *S,
   LookupResult Lookup(SemaRef, MapperId, Sema::LookupOMPMapperName);
   Lookup.suppressDiagnostics();
   if (S) {
-    while (S && SemaRef.LookupParsedName(Lookup, S, &MapperIdScopeSpec)) {
+    while (S && SemaRef.LookupParsedName(Lookup, S, &MapperIdScopeSpec,
+                                         /*ObjectType=*/QualType())) {
       NamedDecl *D = Lookup.getRepresentativeDecl();
       while (S && !S->isDeclScope(D))
         S = S->getParent();
@@ -22480,13 +22484,13 @@ static void checkMappableExpressionList(
     (void)I;
     QualType Type;
     auto *ASE = dyn_cast<ArraySubscriptExpr>(VE->IgnoreParens());
-    auto *OASE = dyn_cast<OMPArraySectionExpr>(VE->IgnoreParens());
+    auto *OASE = dyn_cast<ArraySectionExpr>(VE->IgnoreParens());
     auto *OAShE = dyn_cast<OMPArrayShapingExpr>(VE->IgnoreParens());
     if (ASE) {
       Type = ASE->getType().getNonReferenceType();
     } else if (OASE) {
       QualType BaseType =
-          OMPArraySectionExpr::getBaseOriginalType(OASE->getBase());
+          ArraySectionExpr::getBaseOriginalType(OASE->getBase());
       if (const auto *ATy = BaseType->getAsArrayTypeUnsafe())
         Type = ATy->getElementType();
       else
@@ -23498,7 +23502,9 @@ void SemaOpenMP::DiagnoseUnterminatedOpenMPDeclareTarget() {
 NamedDecl *SemaOpenMP::lookupOpenMPDeclareTargetName(
     Scope *CurScope, CXXScopeSpec &ScopeSpec, const DeclarationNameInfo &Id) {
   LookupResult Lookup(SemaRef, Id, Sema::LookupOrdinaryName);
-  SemaRef.LookupParsedName(Lookup, CurScope, &ScopeSpec, true);
+  SemaRef.LookupParsedName(Lookup, CurScope, &ScopeSpec,
+                           /*ObjectType=*/QualType(),
+                           /*AllowBuiltinCreation=*/true);
 
   if (Lookup.isAmbiguous())
     return nullptr;
@@ -23955,7 +23961,7 @@ SemaOpenMP::ActOnOpenMPUseDeviceAddrClause(ArrayRef<Expr *> VarList,
     MVLI.VarBaseDeclarations.push_back(D);
     MVLI.VarComponents.emplace_back();
     Expr *Component = SimpleRefExpr;
-    if (VD && (isa<OMPArraySectionExpr>(RefExpr->IgnoreParenImpCasts()) ||
+    if (VD && (isa<ArraySectionExpr>(RefExpr->IgnoreParenImpCasts()) ||
                isa<ArraySubscriptExpr>(RefExpr->IgnoreParenImpCasts())))
       Component =
           SemaRef.DefaultFunctionArrayLvalueConversion(SimpleRefExpr).get();
@@ -24105,7 +24111,7 @@ SemaOpenMP::ActOnOpenMPHasDeviceAddrClause(ArrayRef<Expr *> VarList,
     // against other clauses later on.
     Expr *Component = SimpleRefExpr;
     auto *VD = dyn_cast<VarDecl>(D);
-    if (VD && (isa<OMPArraySectionExpr>(RefExpr->IgnoreParenImpCasts()) ||
+    if (VD && (isa<ArraySectionExpr>(RefExpr->IgnoreParenImpCasts()) ||
                isa<ArraySubscriptExpr>(RefExpr->IgnoreParenImpCasts())))
       Component =
           SemaRef.DefaultFunctionArrayLvalueConversion(SimpleRefExpr).get();
@@ -24519,7 +24525,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPAffinityClause(
       Sema::TentativeAnalysisScope Trap(SemaRef);
       Res = SemaRef.CreateBuiltinUnaryOp(ELoc, UO_AddrOf, SimpleExpr);
     }
-    if (!Res.isUsable() && !isa<OMPArraySectionExpr>(SimpleExpr) &&
+    if (!Res.isUsable() && !isa<ArraySectionExpr>(SimpleExpr) &&
         !isa<OMPArrayShapingExpr>(SimpleExpr)) {
       Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item)
           << 1 << 0 << RefExpr->getSourceRange();
@@ -24632,7 +24638,7 @@ ExprResult SemaOpenMP::ActOnOMPArraySectionExpr(
     Expr *Stride, SourceLocation RBLoc) {
   ASTContext &Context = getASTContext();
   if (Base->hasPlaceholderType() &&
-      !Base->hasPlaceholderType(BuiltinType::OMPArraySection)) {
+      !Base->hasPlaceholderType(BuiltinType::ArraySection)) {
     ExprResult Result = SemaRef.CheckPlaceholderExpr(Base);
     if (Result.isInvalid())
       return ExprError();
@@ -24672,13 +24678,13 @@ ExprResult SemaOpenMP::ActOnOMPArraySectionExpr(
        (LowerBound->isTypeDependent() || LowerBound->isValueDependent())) ||
       (Length && (Length->isTypeDependent() || Length->isValueDependent())) ||
       (Stride && (Stride->isTypeDependent() || Stride->isValueDependent()))) {
-    return new (Context) OMPArraySectionExpr(
+    return new (Context) ArraySectionExpr(
         Base, LowerBound, Length, Stride, Context.DependentTy, VK_LValue,
         OK_Ordinary, ColonLocFirst, ColonLocSecond, RBLoc);
   }
 
   // Perform default conversions.
-  QualType OriginalTy = OMPArraySectionExpr::getBaseOriginalType(Base);
+  QualType OriginalTy = ArraySectionExpr::getBaseOriginalType(Base);
   QualType ResultTy;
   if (OriginalTy->isAnyPointerType()) {
     ResultTy = OriginalTy->getPointeeType();
@@ -24801,14 +24807,14 @@ ExprResult SemaOpenMP::ActOnOMPArraySectionExpr(
     }
   }
 
-  if (!Base->hasPlaceholderType(BuiltinType::OMPArraySection)) {
+  if (!Base->hasPlaceholderType(BuiltinType::ArraySection)) {
     ExprResult Result = SemaRef.DefaultFunctionArrayLvalueConversion(Base);
     if (Result.isInvalid())
       return ExprError();
     Base = Result.get();
   }
-  return new (Context) OMPArraySectionExpr(
-      Base, LowerBound, Length, Stride, Context.OMPArraySectionTy, VK_LValue,
+  return new (Context) ArraySectionExpr(
+      Base, LowerBound, Length, Stride, Context.ArraySectionTy, VK_LValue,
       OK_Ordinary, ColonLocFirst, ColonLocSecond, RBLoc);
 }
 
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index 9d44c22c8ddc..1c84830b6ddd 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -109,16 +109,14 @@ static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A,
     SetHints(LoopHintAttr::Unroll, LoopHintAttr::Disable);
   } else if (PragmaName == "unroll") {
     // #pragma unroll N
-    if (ValueExpr && !ValueExpr->isValueDependent()) {
-      llvm::APSInt ValueAPS;
-      ExprResult R = S.VerifyIntegerConstantExpression(ValueExpr, &ValueAPS);
-      assert(!R.isInvalid() && "unroll count value must be a valid value, it's "
-                               "should be checked in Sema::CheckLoopHintExpr");
-      (void)R;
-      // The values of 0 and 1 block any unrolling of the loop.
-      if (ValueAPS.isZero() || ValueAPS.isOne())
-        SetHints(LoopHintAttr::UnrollCount, LoopHintAttr::Disable);
-      else
+    if (ValueExpr) {
+      if (!ValueExpr->isValueDependent()) {
+        auto Value = ValueExpr->EvaluateKnownConstInt(S.getASTContext());
+        if (Value.isZero() || Value.isOne())
+          SetHints(LoopHintAttr::Unroll, LoopHintAttr::Disable);
+        else
+          SetHints(LoopHintAttr::UnrollCount, LoopHintAttr::Numeric);
+      } else
         SetHints(LoopHintAttr::UnrollCount, LoopHintAttr::Numeric);
     } else
       SetHints(LoopHintAttr::Unroll, LoopHintAttr::Enable);
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index bbcb7c33a985..7f18631c6096 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -210,10 +210,11 @@ TemplateNameKind Sema::isTemplateName(Scope *S,
   AssumedTemplateKind AssumedTemplate;
   LookupResult R(*this, TName, Name.getBeginLoc(), LookupOrdinaryName);
   if (LookupTemplateName(R, S, SS, ObjectType, EnteringContext,
-                         MemberOfUnknownSpecialization, SourceLocation(),
+                         /*RequiredTemplate=*/SourceLocation(),
                          &AssumedTemplate,
                          /*AllowTypoCorrection=*/!Disambiguation))
     return TNK_Non_template;
+  MemberOfUnknownSpecialization = R.wasNotFoundInCurrentInstantiation();
 
   if (AssumedTemplate != AssumedTemplateKind::None) {
     TemplateResult = TemplateTy::make(Context.getAssumedTemplateName(TName));
@@ -320,15 +321,12 @@ TemplateNameKind Sema::isTemplateName(Scope *S,
 bool Sema::isDeductionGuideName(Scope *S, const IdentifierInfo &Name,
                                 SourceLocation NameLoc, CXXScopeSpec &SS,
                                 ParsedTemplateTy *Template /*=nullptr*/) {
-  bool MemberOfUnknownSpecialization = false;
-
   // We could use redeclaration lookup here, but we don't need to: the
   // syntactic form of a deduction guide is enough to identify it even
   // if we can't look up the template name at all.
   LookupResult R(*this, DeclarationName(&Name), NameLoc, LookupOrdinaryName);
   if (LookupTemplateName(R, S, SS, /*ObjectType*/ QualType(),
-                         /*EnteringContext*/ false,
-                         MemberOfUnknownSpecialization))
+                         /*EnteringContext*/ false))
     return false;
 
   if (R.empty()) return false;
@@ -374,11 +372,8 @@ bool Sema::DiagnoseUnknownTemplateName(const IdentifierInfo &II,
   return true;
 }
 
-bool Sema::LookupTemplateName(LookupResult &Found,
-                              Scope *S, CXXScopeSpec &SS,
-                              QualType ObjectType,
-                              bool EnteringContext,
-                              bool &MemberOfUnknownSpecialization,
+bool Sema::LookupTemplateName(LookupResult &Found, Scope *S, CXXScopeSpec &SS,
+                              QualType ObjectType, bool EnteringContext,
                               RequiredTemplateKind RequiredTemplate,
                               AssumedTemplateKind *ATK,
                               bool AllowTypoCorrection) {
@@ -391,7 +386,6 @@ bool Sema::LookupTemplateName(LookupResult &Found,
   Found.setTemplateNameLookup(true);
 
   // Determine where to perform name lookup
-  MemberOfUnknownSpecialization = false;
   DeclContext *LookupCtx = nullptr;
   bool IsDependent = false;
   if (!ObjectType.isNull()) {
@@ -548,7 +542,7 @@ bool Sema::LookupTemplateName(LookupResult &Found,
   FilterAcceptableTemplateNames(Found, AllowFunctionTemplatesInLookup);
   if (Found.empty()) {
     if (IsDependent) {
-      MemberOfUnknownSpecialization = true;
+      Found.setNotFoundInCurrentInstantiation();
       return false;
     }
 
@@ -5595,11 +5589,9 @@ Sema::BuildQualifiedTemplateIdExpr(CXXScopeSpec &SS,
       RequireCompleteDeclContext(SS, DC))
     return BuildDependentDeclRefExpr(SS, TemplateKWLoc, NameInfo, TemplateArgs);
 
-  bool MemberOfUnknownSpecialization;
   LookupResult R(*this, NameInfo, LookupOrdinaryName);
   if (LookupTemplateName(R, (Scope *)nullptr, SS, QualType(),
-                         /*Entering*/false, MemberOfUnknownSpecialization,
-                         TemplateKWLoc))
+                         /*Entering*/ false, TemplateKWLoc))
     return ExprError();
 
   if (R.isAmbiguous())
@@ -5720,14 +5712,13 @@ TemplateNameKind Sema::ActOnTemplateName(Scope *S,
     DeclarationNameInfo DNI = GetNameFromUnqualifiedId(Name);
     LookupResult R(*this, DNI.getName(), Name.getBeginLoc(),
                    LookupOrdinaryName);
-    bool MOUS;
     // Tell LookupTemplateName that we require a template so that it diagnoses
     // cases where it finds a non-template.
     RequiredTemplateKind RTK = TemplateKWLoc.isValid()
                                    ? RequiredTemplateKind(TemplateKWLoc)
                                    : TemplateNameIsRequired;
-    if (!LookupTemplateName(R, S, SS, ObjectType.get(), EnteringContext, MOUS,
-                            RTK, nullptr, /*AllowTypoCorrection=*/false) &&
+    if (!LookupTemplateName(R, S, SS, ObjectType.get(), EnteringContext, RTK,
+                            /*ATK=*/nullptr, /*AllowTypoCorrection=*/false) &&
         !R.isAmbiguous()) {
       if (LookupCtx)
         Diag(Name.getBeginLoc(), diag::err_no_member)
@@ -5816,7 +5807,7 @@ bool Sema::CheckTemplateTypeArgument(
 
     if (auto *II = NameInfo.getName().getAsIdentifierInfo()) {
       LookupResult Result(*this, NameInfo, LookupOrdinaryName);
-      LookupParsedName(Result, CurScope, &SS);
+      LookupParsedName(Result, CurScope, &SS, /*ObjectType=*/QualType());
 
       if (Result.getAsSingle<TypeDecl>() ||
           Result.getResultKind() ==
@@ -7706,7 +7697,7 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param,
     // FIXME: The language rules don't say what happens in this case.
     // FIXME: We get an opaque dependent type out of decltype(auto) if the
     // expression is merely instantiation-dependent; is this enough?
-    if (CTAK == CTAK_Deduced && Arg->isTypeDependent()) {
+    if (Arg->isTypeDependent()) {
       auto *AT = dyn_cast<AutoType>(DeducedT);
       if (AT && AT->isDecltypeAuto()) {
         SugaredConverted = TemplateArgument(Arg);
@@ -8438,10 +8429,9 @@ void Sema::NoteTemplateParameterLocation(const NamedDecl &Decl) {
 /// declaration and the type of its corresponding non-type template
 /// parameter, produce an expression that properly refers to that
 /// declaration.
-ExprResult
-Sema::BuildExpressionFromDeclTemplateArgument(const TemplateArgument &Arg,
-                                              QualType ParamType,
-                                              SourceLocation Loc) {
+ExprResult Sema::BuildExpressionFromDeclTemplateArgument(
+    const TemplateArgument &Arg, QualType ParamType, SourceLocation Loc,
+    NamedDecl *TemplateParam) {
   // C++ [temp.param]p8:
   //
   //   A non-type template-parameter of type "array of T" or
@@ -8508,6 +8498,18 @@ Sema::BuildExpressionFromDeclTemplateArgument(const TemplateArgument &Arg,
   } else {
     assert(ParamType->isReferenceType() &&
            "unexpected type for decl template argument");
+    if (NonTypeTemplateParmDecl *NTTP =
+            dyn_cast_if_present<NonTypeTemplateParmDecl>(TemplateParam)) {
+      QualType TemplateParamType = NTTP->getType();
+      const AutoType *AT = TemplateParamType->getAs<AutoType>();
+      if (AT && AT->isDecltypeAuto()) {
+        RefExpr = new (getASTContext()) SubstNonTypeTemplateParmExpr(
+            ParamType->getPointeeType(), RefExpr.get()->getValueKind(),
+            RefExpr.get()->getExprLoc(), RefExpr.get(), VD, NTTP->getIndex(),
+            /*PackIndex=*/std::nullopt,
+            /*RefParam=*/true);
+      }
+    }
   }
 
   // At this point we should have the right value category.
@@ -11179,7 +11181,8 @@ DeclResult Sema::ActOnExplicitInstantiation(Scope *S,
                            : TSK_ExplicitInstantiationDeclaration;
 
   LookupResult Previous(*this, NameInfo, LookupOrdinaryName);
-  LookupParsedName(Previous, S, &D.getCXXScopeSpec());
+  LookupParsedName(Previous, S, &D.getCXXScopeSpec(),
+                   /*ObjectType=*/QualType());
 
   if (!R->isFunctionType()) {
     // C++ [temp.explicit]p1:
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index c3815bca0385..e93f7bd842e4 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -2639,7 +2639,8 @@ static bool isSameTemplateArg(ASTContext &Context,
 /// argument.
 TemplateArgumentLoc
 Sema::getTrivialTemplateArgumentLoc(const TemplateArgument &Arg,
-                                    QualType NTTPType, SourceLocation Loc) {
+                                    QualType NTTPType, SourceLocation Loc,
+                                    NamedDecl *TemplateParam) {
   switch (Arg.getKind()) {
   case TemplateArgument::Null:
     llvm_unreachable("Can't get a NULL template argument here");
@@ -2651,7 +2652,8 @@ Sema::getTrivialTemplateArgumentLoc(const TemplateArgument &Arg,
   case TemplateArgument::Declaration: {
     if (NTTPType.isNull())
       NTTPType = Arg.getParamTypeForDecl();
-    Expr *E = BuildExpressionFromDeclTemplateArgument(Arg, NTTPType, Loc)
+    Expr *E = BuildExpressionFromDeclTemplateArgument(Arg, NTTPType, Loc,
+                                                      TemplateParam)
                   .getAs<Expr>();
     return TemplateArgumentLoc(TemplateArgument(E), E);
   }
@@ -2718,8 +2720,8 @@ static bool ConvertDeducedTemplateArgument(
     // Convert the deduced template argument into a template
     // argument that we can check, almost as if the user had written
     // the template argument explicitly.
-    TemplateArgumentLoc ArgLoc =
-        S.getTrivialTemplateArgumentLoc(Arg, QualType(), Info.getLocation());
+    TemplateArgumentLoc ArgLoc = S.getTrivialTemplateArgumentLoc(
+        Arg, QualType(), Info.getLocation(), Param);
 
     // Check the template argument, converting it as necessary.
     return S.CheckTemplateArgument(
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 98d5c7cb3a8a..3a9fd906b7af 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -2151,13 +2151,25 @@ TemplateInstantiator::TransformLoopHintAttr(const LoopHintAttr *LH) {
 
   // Generate error if there is a problem with the value.
   if (getSema().CheckLoopHintExpr(TransformedExpr, LH->getLocation(),
-                                  LH->getOption() == LoopHintAttr::UnrollCount))
+                                  LH->getSemanticSpelling() ==
+                                      LoopHintAttr::Pragma_unroll))
     return LH;
 
+  LoopHintAttr::OptionType Option = LH->getOption();
+  LoopHintAttr::LoopHintState State = LH->getState();
+
+  llvm::APSInt ValueAPS =
+      TransformedExpr->EvaluateKnownConstInt(getSema().getASTContext());
+  // The values of 0 and 1 block any unrolling of the loop.
+  if (ValueAPS.isZero() || ValueAPS.isOne()) {
+    Option = LoopHintAttr::Unroll;
+    State = LoopHintAttr::Disable;
+  }
+
   // Create new LoopHintValueAttr with integral expression in place of the
   // non-type template parameter.
-  return LoopHintAttr::CreateImplicit(getSema().Context, LH->getOption(),
-                                      LH->getState(), TransformedExpr, *LH);
+  return LoopHintAttr::CreateImplicit(getSema().Context, Option, State,
+                                      TransformedExpr, *LH);
 }
 const NoInlineAttr *TemplateInstantiator::TransformStmtNoInlineAttr(
     const Stmt *OrigS, const Stmt *InstS, const NoInlineAttr *A) {
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 9404be5a46f3..dff7e9df636b 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -2784,15 +2784,23 @@ public:
   ///
   /// By default, performs semantic analysis to build the new expression.
   /// Subclasses may override this routine to provide different behavior.
-  ExprResult RebuildOMPArraySectionExpr(Expr *Base, SourceLocation LBracketLoc,
-                                        Expr *LowerBound,
-                                        SourceLocation ColonLocFirst,
-                                        SourceLocation ColonLocSecond,
-                                        Expr *Length, Expr *Stride,
-                                        SourceLocation RBracketLoc) {
-    return getSema().OpenMP().ActOnOMPArraySectionExpr(
-        Base, LBracketLoc, LowerBound, ColonLocFirst, ColonLocSecond, Length,
-        Stride, RBracketLoc);
+  ExprResult RebuildArraySectionExpr(bool IsOMPArraySection, Expr *Base,
+                                     SourceLocation LBracketLoc,
+                                     Expr *LowerBound,
+                                     SourceLocation ColonLocFirst,
+                                     SourceLocation ColonLocSecond,
+                                     Expr *Length, Expr *Stride,
+                                     SourceLocation RBracketLoc) {
+    if (IsOMPArraySection)
+      return getSema().OpenMP().ActOnOMPArraySectionExpr(
+          Base, LBracketLoc, LowerBound, ColonLocFirst, ColonLocSecond, Length,
+          Stride, RBracketLoc);
+
+    assert(Stride == nullptr && !ColonLocSecond.isValid() &&
+           "Stride/second colon not allowed for OpenACC");
+
+    return getSema().OpenACC().ActOnArraySectionExpr(
+        Base, LBracketLoc, LowerBound, ColonLocFirst, Length, RBracketLoc);
   }
 
   /// Build a new array shaping expression.
@@ -6649,6 +6657,10 @@ TreeTransform<Derived>::TransformPackIndexingType(TypeLocBuilder &TLB,
     }
   }
 
+  // A pack indexing type can appear in a larger pack expansion,
+  // e.g. `Pack...[pack_of_indexes]...`
+  // so we need to temporarily disable substitution of pack elements
+  Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1);
   QualType Result = getDerived().TransformType(TLB, TL.getPatternLoc());
 
   QualType Out = getDerived().RebuildPackIndexingType(
@@ -11096,13 +11108,15 @@ template <typename Derived>
 class OpenACCClauseTransform final
     : public OpenACCClauseVisitor<OpenACCClauseTransform<Derived>> {
   TreeTransform<Derived> &Self;
+  ArrayRef<const OpenACCClause *> ExistingClauses;
   SemaOpenACC::OpenACCParsedClause &ParsedClause;
   OpenACCClause *NewClause = nullptr;
 
 public:
   OpenACCClauseTransform(TreeTransform<Derived> &Self,
+                         ArrayRef<const OpenACCClause *> ExistingClauses,
                          SemaOpenACC::OpenACCParsedClause &PC)
-      : Self(Self), ParsedClause(PC) {}
+      : Self(Self), ExistingClauses(ExistingClauses), ParsedClause(PC) {}
 
   OpenACCClause *CreatedClause() const { return NewClause; }
 
@@ -11188,6 +11202,31 @@ void OpenACCClauseTransform<Derived>::VisitNumGangsClause(
       ParsedClause.getLParenLoc(), ParsedClause.getIntExprs(),
       ParsedClause.getEndLoc());
 }
+
+template <typename Derived>
+void OpenACCClauseTransform<Derived>::VisitPrivateClause(
+    const OpenACCPrivateClause &C) {
+  llvm::SmallVector<Expr *> InstantiatedVarList;
+
+  for (Expr *CurVar : C.getVarList()) {
+    ExprResult Res = Self.TransformExpr(CurVar);
+
+    if (!Res.isUsable())
+      return;
+
+    Res = Self.getSema().OpenACC().ActOnVar(Res.get());
+
+    if (Res.isUsable())
+      InstantiatedVarList.push_back(Res.get());
+  }
+  ParsedClause.setVarListDetails(std::move(InstantiatedVarList));
+
+  NewClause = OpenACCPrivateClause::Create(
+      Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
+      ParsedClause.getLParenLoc(), ParsedClause.getVarList(),
+      ParsedClause.getEndLoc());
+}
+
 template <typename Derived>
 void OpenACCClauseTransform<Derived>::VisitNumWorkersClause(
     const OpenACCNumWorkersClause &C) {
@@ -11246,7 +11285,8 @@ OpenACCClause *TreeTransform<Derived>::TransformOpenACCClause(
   if (const auto *WithParms = dyn_cast<OpenACCClauseWithParams>(OldClause))
     ParsedClause.setLParenLoc(WithParms->getLParenLoc());
 
-  OpenACCClauseTransform<Derived> Transform{*this, ParsedClause};
+  OpenACCClauseTransform<Derived> Transform{*this, ExistingClauses,
+                                            ParsedClause};
   Transform.Visit(OldClause);
 
   return Transform.CreatedClause();
@@ -11742,7 +11782,7 @@ TreeTransform<Derived>::TransformMatrixSubscriptExpr(MatrixSubscriptExpr *E) {
 
 template <typename Derived>
 ExprResult
-TreeTransform<Derived>::TransformOMPArraySectionExpr(OMPArraySectionExpr *E) {
+TreeTransform<Derived>::TransformArraySectionExpr(ArraySectionExpr *E) {
   ExprResult Base = getDerived().TransformExpr(E->getBase());
   if (Base.isInvalid())
     return ExprError();
@@ -11762,20 +11802,25 @@ TreeTransform<Derived>::TransformOMPArraySectionExpr(OMPArraySectionExpr *E) {
   }
 
   ExprResult Stride;
-  if (Expr *Str = E->getStride()) {
-    Stride = getDerived().TransformExpr(Str);
-    if (Stride.isInvalid())
-      return ExprError();
+  if (E->isOMPArraySection()) {
+    if (Expr *Str = E->getStride()) {
+      Stride = getDerived().TransformExpr(Str);
+      if (Stride.isInvalid())
+        return ExprError();
+    }
   }
 
   if (!getDerived().AlwaysRebuild() && Base.get() == E->getBase() &&
-      LowerBound.get() == E->getLowerBound() && Length.get() == E->getLength())
+      LowerBound.get() == E->getLowerBound() &&
+      Length.get() == E->getLength() &&
+      (E->isOpenACCArraySection() || Stride.get() == E->getStride()))
     return E;
 
-  return getDerived().RebuildOMPArraySectionExpr(
-      Base.get(), E->getBase()->getEndLoc(), LowerBound.get(),
-      E->getColonLocFirst(), E->getColonLocSecond(), Length.get(), Stride.get(),
-      E->getRBracketLoc());
+  return getDerived().RebuildArraySectionExpr(
+      E->isOMPArraySection(), Base.get(), E->getBase()->getEndLoc(),
+      LowerBound.get(), E->getColonLocFirst(),
+      E->isOMPArraySection() ? E->getColonLocSecond() : SourceLocation{},
+      Length.get(), Stride.get(), E->getRBracketLoc());
 }
 
 template <typename Derived>
@@ -13204,6 +13249,26 @@ bool TreeTransform<Derived>::TransformOverloadExprDecls(OverloadExpr *Old,
   // Resolve a kind, but don't do any further analysis.  If it's
   // ambiguous, the callee needs to deal with it.
   R.resolveKind();
+
+  if (Old->hasTemplateKeyword() && !R.empty()) {
+    NamedDecl *FoundDecl = R.getRepresentativeDecl()->getUnderlyingDecl();
+    getSema().FilterAcceptableTemplateNames(R,
+                                            /*AllowFunctionTemplates=*/true,
+                                            /*AllowDependent=*/true);
+    if (R.empty()) {
+      // If a 'template' keyword was used, a lookup that finds only non-template
+      // names is an error.
+      getSema().Diag(R.getNameLoc(),
+                     diag::err_template_kw_refers_to_non_template)
+          << R.getLookupName() << Old->getQualifierLoc().getSourceRange()
+          << Old->hasTemplateKeyword() << Old->getTemplateKeywordLoc();
+      getSema().Diag(FoundDecl->getLocation(),
+                     diag::note_template_kw_refers_to_non_template)
+          << R.getLookupName();
+      return true;
+    }
+  }
+
   return false;
 }
 
@@ -14186,6 +14251,8 @@ TreeTransform<Derived>::TransformLambdaExpr(LambdaExpr *E) {
   // FIXME: Sema's lambda-building mechanism expects us to push an expression
   // evaluation context even if we're not transforming the function body.
   getSema().PushExpressionEvaluationContext(
+      E->getCallOperator()->isConsteval() ?
+      Sema::ExpressionEvaluationContext::ImmediateFunctionContext :
       Sema::ExpressionEvaluationContext::PotentiallyEvaluated);
 
   Sema::CodeSynthesisContext C;
diff --git a/clang/lib/Serialization/ASTCommon.cpp b/clang/lib/Serialization/ASTCommon.cpp
index f8d54c0c3989..e017f5bdb488 100644
--- a/clang/lib/Serialization/ASTCommon.cpp
+++ b/clang/lib/Serialization/ASTCommon.cpp
@@ -261,8 +261,8 @@ serialization::TypeIdxFromBuiltin(const BuiltinType *BT) {
   case BuiltinType::IncompleteMatrixIdx:
     ID = PREDEF_TYPE_INCOMPLETE_MATRIX_IDX;
     break;
-  case BuiltinType::OMPArraySection:
-    ID = PREDEF_TYPE_OMP_ARRAY_SECTION;
+  case BuiltinType::ArraySection:
+    ID = PREDEF_TYPE_ARRAY_SECTION;
     break;
   case BuiltinType::OMPArrayShaping:
     ID = PREDEF_TYPE_OMP_ARRAY_SHAPING;
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 43b69045bb05..29b81c1a753c 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -2630,6 +2630,14 @@ InputFile ASTReader::getInputFile(ModuleFile &F, unsigned ID, bool Complain) {
       F.StandardCXXModule && FileChange.Kind == Change::None)
     FileChange = HasInputContentChanged(FileChange);
 
+  // When we have StoredTime equal to zero and ValidateASTInputFilesContent,
+  // it is better to check the content of the input files because we cannot rely
+  // on the file modification time, which will be the same (zero) for these
+  // files.
+  if (!StoredTime && ValidateASTInputFilesContent &&
+      FileChange.Kind == Change::None)
+    FileChange = HasInputContentChanged(FileChange);
+
   // For an overridden file, there is nothing to validate.
   if (!Overridden && FileChange.Kind != Change::None) {
     if (Complain && !Diags.isDiagnosticInFlight()) {
@@ -5101,8 +5109,9 @@ void ASTReader::InitializeContext() {
 
   // If there's a listener, notify them that we "read" the translation unit.
   if (DeserializationListener)
-    DeserializationListener->DeclRead(PREDEF_DECL_TRANSLATION_UNIT_ID,
-                                      Context.getTranslationUnitDecl());
+    DeserializationListener->DeclRead(
+        GlobalDeclID(PREDEF_DECL_TRANSLATION_UNIT_ID),
+        Context.getTranslationUnitDecl());
 
   // FIXME: Find a better way to deal with collisions between these
   // built-in types. Right now, we just ignore the problem.
@@ -6010,9 +6019,9 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F,
     case SUBMODULE_INITIALIZERS: {
       if (!ContextObj)
         break;
-      SmallVector<DeclID, 16> Inits;
+      SmallVector<GlobalDeclID, 16> Inits;
       for (auto &ID : Record)
-        Inits.push_back(getGlobalDeclID(F, LocalDeclID(ID)).get());
+        Inits.push_back(getGlobalDeclID(F, LocalDeclID(ID)));
       ContextObj->addLazyModuleInitializers(CurrentModule, Inits);
       break;
     }
@@ -7384,11 +7393,11 @@ QualType ASTReader::GetType(TypeID ID) {
     case PREDEF_TYPE_INCOMPLETE_MATRIX_IDX:
       T = Context.IncompleteMatrixIdxTy;
       break;
-    case PREDEF_TYPE_OMP_ARRAY_SECTION:
-      T = Context.OMPArraySectionTy;
+    case PREDEF_TYPE_ARRAY_SECTION:
+      T = Context.ArraySectionTy;
       break;
     case PREDEF_TYPE_OMP_ARRAY_SHAPING:
-      T = Context.OMPArraySectionTy;
+      T = Context.OMPArrayShapingTy;
       break;
     case PREDEF_TYPE_OMP_ITERATOR:
       T = Context.OMPIteratorTy;
@@ -7517,9 +7526,7 @@ ASTRecordReader::readASTTemplateArgumentListInfo() {
   return ASTTemplateArgumentListInfo::Create(getContext(), Result);
 }
 
-Decl *ASTReader::GetExternalDecl(DeclID ID) {
-  return GetDecl(GlobalDeclID(ID));
-}
+Decl *ASTReader::GetExternalDecl(GlobalDeclID ID) { return GetDecl(ID); }
 
 void ASTReader::CompleteRedeclChain(const Decl *D) {
   if (NumCurrentElementsDeserializing) {
@@ -7668,8 +7675,7 @@ GlobalDeclID ASTReader::getGlobalDeclID(ModuleFile &F,
   return GlobalDeclID(ID + I->second);
 }
 
-bool ASTReader::isDeclIDFromModule(serialization::GlobalDeclID ID,
-                                   ModuleFile &M) const {
+bool ASTReader::isDeclIDFromModule(GlobalDeclID ID, ModuleFile &M) const {
   // Predefined decls aren't from any module.
   if (ID.get() < NUM_PREDEF_DECL_IDS)
     return false;
@@ -7768,7 +7774,7 @@ static Decl *getPredefinedDecl(ASTContext &Context, PredefinedDeclIDs ID) {
 Decl *ASTReader::GetExistingDecl(GlobalDeclID ID) {
   assert(ContextObj && "reading decl with no AST context");
   if (ID.get() < NUM_PREDEF_DECL_IDS) {
-    Decl *D = getPredefinedDecl(*ContextObj, (PredefinedDeclIDs)ID.get());
+    Decl *D = getPredefinedDecl(*ContextObj, (PredefinedDeclIDs)ID);
     if (D) {
       // Track that we have merged the declaration with ID \p ID into the
       // pre-existing predefined declaration \p D.
@@ -7805,28 +7811,28 @@ Decl *ASTReader::GetDecl(GlobalDeclID ID) {
   if (!DeclsLoaded[Index]) {
     ReadDeclRecord(ID);
     if (DeserializationListener)
-      DeserializationListener->DeclRead(ID.get(), DeclsLoaded[Index]);
+      DeserializationListener->DeclRead(ID, DeclsLoaded[Index]);
   }
 
   return DeclsLoaded[Index];
 }
 
-DeclID ASTReader::mapGlobalIDToModuleFileGlobalID(ModuleFile &M,
-                                                  GlobalDeclID GlobalID) {
+LocalDeclID ASTReader::mapGlobalIDToModuleFileGlobalID(ModuleFile &M,
+                                                       GlobalDeclID GlobalID) {
   DeclID ID = GlobalID.get();
   if (ID < NUM_PREDEF_DECL_IDS)
-    return ID;
+    return LocalDeclID(ID);
 
   GlobalDeclMapType::const_iterator I = GlobalDeclMap.find(GlobalID);
   assert(I != GlobalDeclMap.end() && "Corrupted global declaration map");
   ModuleFile *Owner = I->second;
 
-  llvm::DenseMap<ModuleFile *, serialization::DeclID>::iterator Pos
-    = M.GlobalToLocalDeclIDs.find(Owner);
+  llvm::DenseMap<ModuleFile *, DeclID>::iterator Pos =
+      M.GlobalToLocalDeclIDs.find(Owner);
   if (Pos == M.GlobalToLocalDeclIDs.end())
-    return 0;
+    return LocalDeclID();
 
-  return ID - Owner->BaseDeclID + Pos->second;
+  return LocalDeclID(ID - Owner->BaseDeclID + Pos->second);
 }
 
 GlobalDeclID ASTReader::ReadDeclID(ModuleFile &F, const RecordData &Record,
@@ -7872,7 +7878,7 @@ void ASTReader::FindExternalLexicalDecls(
       if (!IsKindWeWant(K))
         continue;
 
-      auto ID = (serialization::DeclID)+LexicalDecls[I + 1];
+      auto ID = (DeclID) + LexicalDecls[I + 1];
 
       // Don't add predefined declarations to the lexical context more
       // than once.
@@ -7954,7 +7960,7 @@ void ASTReader::FindFileRegionDecls(FileID File,
   SourceLocation EndLoc = BeginLoc.getLocWithOffset(Length);
 
   DeclIDComp DIDComp(*this, *DInfo.Mod);
-  ArrayRef<serialization::LocalDeclID>::iterator BeginIt =
+  ArrayRef<LocalDeclID>::iterator BeginIt =
       llvm::lower_bound(DInfo.Decls, BeginLoc, DIDComp);
   if (BeginIt != DInfo.Decls.begin())
     --BeginIt;
@@ -7967,13 +7973,12 @@ void ASTReader::FindFileRegionDecls(FileID File,
              ->isTopLevelDeclInObjCContainer())
     --BeginIt;
 
-  ArrayRef<serialization::LocalDeclID>::iterator EndIt =
+  ArrayRef<LocalDeclID>::iterator EndIt =
       llvm::upper_bound(DInfo.Decls, EndLoc, DIDComp);
   if (EndIt != DInfo.Decls.end())
     ++EndIt;
 
-  for (ArrayRef<serialization::LocalDeclID>::iterator
-         DIt = BeginIt; DIt != EndIt; ++DIt)
+  for (ArrayRef<LocalDeclID>::iterator DIt = BeginIt; DIt != EndIt; ++DIt)
     Decls.push_back(GetDecl(getGlobalDeclID(*DInfo.Mod, *DIt)));
 }
 
@@ -7994,6 +7999,7 @@ ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC,
   // Load the list of declarations.
   SmallVector<NamedDecl *, 64> Decls;
   llvm::SmallPtrSet<NamedDecl *, 8> Found;
+
   for (GlobalDeclID ID : It->second.Table.find(Name)) {
     NamedDecl *ND = cast<NamedDecl>(GetDecl(ID));
     if (ND->getDeclName() == Name && Found.insert(ND).second)
@@ -11770,6 +11776,14 @@ void ASTRecordReader::readOMPChildren(OMPChildren *Data) {
     Data->getChildren()[I] = readStmt();
 }
 
+SmallVector<Expr *> ASTRecordReader::readOpenACCVarList() {
+  unsigned NumVars = readInt();
+  llvm::SmallVector<Expr *> VarList;
+  for (unsigned I = 0; I < NumVars; ++I)
+    VarList.push_back(readSubExpr());
+  return VarList;
+}
+
 OpenACCClause *ASTRecordReader::readOpenACCClause() {
   OpenACCClauseKind ClauseKind = readEnum<OpenACCClauseKind>();
   SourceLocation BeginLoc = readSourceLocation();
@@ -11815,6 +11829,12 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() {
     return OpenACCVectorLengthClause::Create(getContext(), BeginLoc, LParenLoc,
                                              IntExpr, EndLoc);
   }
+  case OpenACCClauseKind::Private: {
+    SourceLocation LParenLoc = readSourceLocation();
+    llvm::SmallVector<Expr *> VarList = readOpenACCVarList();
+    return OpenACCPrivateClause::Create(getContext(), BeginLoc, LParenLoc,
+                                        VarList, EndLoc);
+  }
   case OpenACCClauseKind::Finalize:
   case OpenACCClauseKind::IfPresent:
   case OpenACCClauseKind::Seq:
@@ -11836,7 +11856,6 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() {
   case OpenACCClauseKind::Link:
   case OpenACCClauseKind::NoCreate:
   case OpenACCClauseKind::Present:
-  case OpenACCClauseKind::Private:
   case OpenACCClauseKind::CopyOut:
   case OpenACCClauseKind::CopyIn:
   case OpenACCClauseKind::Create:
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index bb82173dfe0b..744f11de88c2 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -273,17 +273,15 @@ namespace clang {
       auto *&LazySpecializations = D->getCommonPtr()->LazySpecializations;
 
       if (auto &Old = LazySpecializations) {
-        IDs.insert(IDs.end(), GlobalDeclIDIterator(Old + 1),
-                   GlobalDeclIDIterator(Old + 1 + Old[0]));
+        IDs.insert(IDs.end(), Old + 1, Old + 1 + Old[0].get());
         llvm::sort(IDs);
         IDs.erase(std::unique(IDs.begin(), IDs.end()), IDs.end());
       }
 
-      auto *Result = new (C) serialization::DeclID[1 + IDs.size()];
-      *Result = IDs.size();
+      auto *Result = new (C) GlobalDeclID[1 + IDs.size()];
+      *Result = GlobalDeclID(IDs.size());
 
-      std::copy(DeclIDIterator(IDs.begin()), DeclIDIterator(IDs.end()),
-                Result + 1);
+      std::copy(IDs.begin(), IDs.end(), Result + 1);
 
       LazySpecializations = Result;
     }
@@ -558,7 +556,7 @@ void ASTDeclReader::Visit(Decl *D) {
 
     // If this is a tag declaration with a typedef name for linkage, it's safe
     // to load that typedef now.
-    if (NamedDeclForTagDecl != GlobalDeclID())
+    if (NamedDeclForTagDecl.isValid())
       cast<TagDecl>(D)->TypedefNameDeclOrQualifier =
           cast<TypedefNameDecl>(Reader.GetDecl(NamedDeclForTagDecl));
   } else if (auto *ID = dyn_cast<ObjCInterfaceDecl>(D)) {
@@ -603,7 +601,7 @@ void ASTDeclReader::VisitDecl(Decl *D) {
     GlobalDeclID SemaDCIDForTemplateParmDecl = readDeclID();
     GlobalDeclID LexicalDCIDForTemplateParmDecl =
         HasStandaloneLexicalDC ? readDeclID() : GlobalDeclID();
-    if (LexicalDCIDForTemplateParmDecl == GlobalDeclID())
+    if (LexicalDCIDForTemplateParmDecl.isInvalid())
       LexicalDCIDForTemplateParmDecl = SemaDCIDForTemplateParmDecl;
     Reader.addPendingDeclContextInfo(D,
                                      SemaDCIDForTemplateParmDecl,
@@ -1860,7 +1858,7 @@ void ASTDeclReader::VisitNamespaceDecl(NamespaceDecl *D) {
 
   mergeRedeclarable(D, Redecl);
 
-  if (AnonNamespace != GlobalDeclID()) {
+  if (AnonNamespace.isValid()) {
     // Each module has its own anonymous namespace, which is disjoint from
     // any other module's anonymous namespaces, so don't attach the anonymous
     // namespace at all.
@@ -2792,9 +2790,9 @@ ASTDeclReader::VisitRedeclarable(Redeclarable<T> *D) {
 
   uint64_t RedeclOffset = 0;
 
-  // 0 indicates that this declaration was the only declaration of its entity,
-  // and is used for space optimization.
-  if (FirstDeclID == GlobalDeclID()) {
+  // invalid FirstDeclID  indicates that this declaration was the only
+  // declaration of its entity, and is used for space optimization.
+  if (FirstDeclID.isInvalid()) {
     FirstDeclID = ThisDeclID;
     IsKeyDecl = true;
     IsFirstLocalDecl = true;
@@ -3829,240 +3827,232 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) {
         Twine("ASTReader::readDeclRecord failed reading decl code: ") +
         toString(MaybeDeclCode.takeError()));
 
-  DeclID RawGlobalID = ID.get();
   switch ((DeclCode)MaybeDeclCode.get()) {
   case DECL_CONTEXT_LEXICAL:
   case DECL_CONTEXT_VISIBLE:
     llvm_unreachable("Record cannot be de-serialized with readDeclRecord");
   case DECL_TYPEDEF:
-    D = TypedefDecl::CreateDeserialized(Context, RawGlobalID);
+    D = TypedefDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_TYPEALIAS:
-    D = TypeAliasDecl::CreateDeserialized(Context, RawGlobalID);
+    D = TypeAliasDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_ENUM:
-    D = EnumDecl::CreateDeserialized(Context, RawGlobalID);
+    D = EnumDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_RECORD:
-    D = RecordDecl::CreateDeserialized(Context, RawGlobalID);
+    D = RecordDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_ENUM_CONSTANT:
-    D = EnumConstantDecl::CreateDeserialized(Context, RawGlobalID);
+    D = EnumConstantDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_FUNCTION:
-    D = FunctionDecl::CreateDeserialized(Context, RawGlobalID);
+    D = FunctionDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_LINKAGE_SPEC:
-    D = LinkageSpecDecl::CreateDeserialized(Context, RawGlobalID);
+    D = LinkageSpecDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_EXPORT:
-    D = ExportDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ExportDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_LABEL:
-    D = LabelDecl::CreateDeserialized(Context, RawGlobalID);
+    D = LabelDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_NAMESPACE:
-    D = NamespaceDecl::CreateDeserialized(Context, RawGlobalID);
+    D = NamespaceDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_NAMESPACE_ALIAS:
-    D = NamespaceAliasDecl::CreateDeserialized(Context, RawGlobalID);
+    D = NamespaceAliasDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_USING:
-    D = UsingDecl::CreateDeserialized(Context, RawGlobalID);
+    D = UsingDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_USING_PACK:
-    D = UsingPackDecl::CreateDeserialized(Context, RawGlobalID,
-                                          Record.readInt());
+    D = UsingPackDecl::CreateDeserialized(Context, ID, Record.readInt());
     break;
   case DECL_USING_SHADOW:
-    D = UsingShadowDecl::CreateDeserialized(Context, RawGlobalID);
+    D = UsingShadowDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_USING_ENUM:
-    D = UsingEnumDecl::CreateDeserialized(Context, RawGlobalID);
+    D = UsingEnumDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_CONSTRUCTOR_USING_SHADOW:
-    D = ConstructorUsingShadowDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ConstructorUsingShadowDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_USING_DIRECTIVE:
-    D = UsingDirectiveDecl::CreateDeserialized(Context, RawGlobalID);
+    D = UsingDirectiveDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_UNRESOLVED_USING_VALUE:
-    D = UnresolvedUsingValueDecl::CreateDeserialized(Context, RawGlobalID);
+    D = UnresolvedUsingValueDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_UNRESOLVED_USING_TYPENAME:
-    D = UnresolvedUsingTypenameDecl::CreateDeserialized(Context, RawGlobalID);
+    D = UnresolvedUsingTypenameDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_UNRESOLVED_USING_IF_EXISTS:
-    D = UnresolvedUsingIfExistsDecl::CreateDeserialized(Context, RawGlobalID);
+    D = UnresolvedUsingIfExistsDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_CXX_RECORD:
-    D = CXXRecordDecl::CreateDeserialized(Context, RawGlobalID);
+    D = CXXRecordDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_CXX_DEDUCTION_GUIDE:
-    D = CXXDeductionGuideDecl::CreateDeserialized(Context, RawGlobalID);
+    D = CXXDeductionGuideDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_CXX_METHOD:
-    D = CXXMethodDecl::CreateDeserialized(Context, RawGlobalID);
+    D = CXXMethodDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_CXX_CONSTRUCTOR:
-    D = CXXConstructorDecl::CreateDeserialized(Context, RawGlobalID,
-                                               Record.readInt());
+    D = CXXConstructorDecl::CreateDeserialized(Context, ID, Record.readInt());
     break;
   case DECL_CXX_DESTRUCTOR:
-    D = CXXDestructorDecl::CreateDeserialized(Context, RawGlobalID);
+    D = CXXDestructorDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_CXX_CONVERSION:
-    D = CXXConversionDecl::CreateDeserialized(Context, RawGlobalID);
+    D = CXXConversionDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_ACCESS_SPEC:
-    D = AccessSpecDecl::CreateDeserialized(Context, RawGlobalID);
+    D = AccessSpecDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_FRIEND:
-    D = FriendDecl::CreateDeserialized(Context, RawGlobalID, Record.readInt());
+    D = FriendDecl::CreateDeserialized(Context, ID, Record.readInt());
     break;
   case DECL_FRIEND_TEMPLATE:
-    D = FriendTemplateDecl::CreateDeserialized(Context, RawGlobalID);
+    D = FriendTemplateDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_CLASS_TEMPLATE:
-    D = ClassTemplateDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ClassTemplateDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_CLASS_TEMPLATE_SPECIALIZATION:
-    D = ClassTemplateSpecializationDecl::CreateDeserialized(Context,
-                                                            RawGlobalID);
+    D = ClassTemplateSpecializationDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_CLASS_TEMPLATE_PARTIAL_SPECIALIZATION:
-    D = ClassTemplatePartialSpecializationDecl::CreateDeserialized(Context,
-                                                                   RawGlobalID);
+    D = ClassTemplatePartialSpecializationDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_VAR_TEMPLATE:
-    D = VarTemplateDecl::CreateDeserialized(Context, RawGlobalID);
+    D = VarTemplateDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_VAR_TEMPLATE_SPECIALIZATION:
-    D = VarTemplateSpecializationDecl::CreateDeserialized(Context, RawGlobalID);
+    D = VarTemplateSpecializationDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_VAR_TEMPLATE_PARTIAL_SPECIALIZATION:
-    D = VarTemplatePartialSpecializationDecl::CreateDeserialized(Context,
-                                                                 RawGlobalID);
+    D = VarTemplatePartialSpecializationDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_FUNCTION_TEMPLATE:
-    D = FunctionTemplateDecl::CreateDeserialized(Context, RawGlobalID);
+    D = FunctionTemplateDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_TEMPLATE_TYPE_PARM: {
     bool HasTypeConstraint = Record.readInt();
-    D = TemplateTypeParmDecl::CreateDeserialized(Context, RawGlobalID,
+    D = TemplateTypeParmDecl::CreateDeserialized(Context, ID,
                                                  HasTypeConstraint);
     break;
   }
   case DECL_NON_TYPE_TEMPLATE_PARM: {
     bool HasTypeConstraint = Record.readInt();
-    D = NonTypeTemplateParmDecl::CreateDeserialized(Context, RawGlobalID,
+    D = NonTypeTemplateParmDecl::CreateDeserialized(Context, ID,
                                                     HasTypeConstraint);
     break;
   }
   case DECL_EXPANDED_NON_TYPE_TEMPLATE_PARM_PACK: {
     bool HasTypeConstraint = Record.readInt();
     D = NonTypeTemplateParmDecl::CreateDeserialized(
-        Context, RawGlobalID, Record.readInt(), HasTypeConstraint);
+        Context, ID, Record.readInt(), HasTypeConstraint);
     break;
   }
   case DECL_TEMPLATE_TEMPLATE_PARM:
-    D = TemplateTemplateParmDecl::CreateDeserialized(Context, RawGlobalID);
+    D = TemplateTemplateParmDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_EXPANDED_TEMPLATE_TEMPLATE_PARM_PACK:
-    D = TemplateTemplateParmDecl::CreateDeserialized(Context, RawGlobalID,
+    D = TemplateTemplateParmDecl::CreateDeserialized(Context, ID,
                                                      Record.readInt());
     break;
   case DECL_TYPE_ALIAS_TEMPLATE:
-    D = TypeAliasTemplateDecl::CreateDeserialized(Context, RawGlobalID);
+    D = TypeAliasTemplateDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_CONCEPT:
-    D = ConceptDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ConceptDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_REQUIRES_EXPR_BODY:
-    D = RequiresExprBodyDecl::CreateDeserialized(Context, RawGlobalID);
+    D = RequiresExprBodyDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_STATIC_ASSERT:
-    D = StaticAssertDecl::CreateDeserialized(Context, RawGlobalID);
+    D = StaticAssertDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_OBJC_METHOD:
-    D = ObjCMethodDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ObjCMethodDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_OBJC_INTERFACE:
-    D = ObjCInterfaceDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ObjCInterfaceDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_OBJC_IVAR:
-    D = ObjCIvarDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ObjCIvarDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_OBJC_PROTOCOL:
-    D = ObjCProtocolDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ObjCProtocolDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_OBJC_AT_DEFS_FIELD:
-    D = ObjCAtDefsFieldDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ObjCAtDefsFieldDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_OBJC_CATEGORY:
-    D = ObjCCategoryDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ObjCCategoryDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_OBJC_CATEGORY_IMPL:
-    D = ObjCCategoryImplDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ObjCCategoryImplDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_OBJC_IMPLEMENTATION:
-    D = ObjCImplementationDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ObjCImplementationDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_OBJC_COMPATIBLE_ALIAS:
-    D = ObjCCompatibleAliasDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ObjCCompatibleAliasDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_OBJC_PROPERTY:
-    D = ObjCPropertyDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ObjCPropertyDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_OBJC_PROPERTY_IMPL:
-    D = ObjCPropertyImplDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ObjCPropertyImplDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_FIELD:
-    D = FieldDecl::CreateDeserialized(Context, RawGlobalID);
+    D = FieldDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_INDIRECTFIELD:
-    D = IndirectFieldDecl::CreateDeserialized(Context, RawGlobalID);
+    D = IndirectFieldDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_VAR:
-    D = VarDecl::CreateDeserialized(Context, RawGlobalID);
+    D = VarDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_IMPLICIT_PARAM:
-    D = ImplicitParamDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ImplicitParamDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_PARM_VAR:
-    D = ParmVarDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ParmVarDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_DECOMPOSITION:
-    D = DecompositionDecl::CreateDeserialized(Context, RawGlobalID,
-                                              Record.readInt());
+    D = DecompositionDecl::CreateDeserialized(Context, ID, Record.readInt());
     break;
   case DECL_BINDING:
-    D = BindingDecl::CreateDeserialized(Context, RawGlobalID);
+    D = BindingDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_FILE_SCOPE_ASM:
-    D = FileScopeAsmDecl::CreateDeserialized(Context, RawGlobalID);
+    D = FileScopeAsmDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_TOP_LEVEL_STMT_DECL:
-    D = TopLevelStmtDecl::CreateDeserialized(Context, RawGlobalID);
+    D = TopLevelStmtDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_BLOCK:
-    D = BlockDecl::CreateDeserialized(Context, RawGlobalID);
+    D = BlockDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_MS_PROPERTY:
-    D = MSPropertyDecl::CreateDeserialized(Context, RawGlobalID);
+    D = MSPropertyDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_MS_GUID:
-    D = MSGuidDecl::CreateDeserialized(Context, RawGlobalID);
+    D = MSGuidDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_UNNAMED_GLOBAL_CONSTANT:
-    D = UnnamedGlobalConstantDecl::CreateDeserialized(Context, RawGlobalID);
+    D = UnnamedGlobalConstantDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_TEMPLATE_PARAM_OBJECT:
-    D = TemplateParamObjectDecl::CreateDeserialized(Context, RawGlobalID);
+    D = TemplateParamObjectDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_CAPTURED:
-    D = CapturedDecl::CreateDeserialized(Context, RawGlobalID,
-                                         Record.readInt());
+    D = CapturedDecl::CreateDeserialized(Context, ID, Record.readInt());
     break;
   case DECL_CXX_BASE_SPECIFIERS:
     Error("attempt to read a C++ base-specifier record as a declaration");
@@ -4073,66 +4063,62 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) {
   case DECL_IMPORT:
     // Note: last entry of the ImportDecl record is the number of stored source
     // locations.
-    D = ImportDecl::CreateDeserialized(Context, RawGlobalID, Record.back());
+    D = ImportDecl::CreateDeserialized(Context, ID, Record.back());
     break;
   case DECL_OMP_THREADPRIVATE: {
     Record.skipInts(1);
     unsigned NumChildren = Record.readInt();
     Record.skipInts(1);
-    D = OMPThreadPrivateDecl::CreateDeserialized(Context, RawGlobalID,
-                                                 NumChildren);
+    D = OMPThreadPrivateDecl::CreateDeserialized(Context, ID, NumChildren);
     break;
   }
   case DECL_OMP_ALLOCATE: {
     unsigned NumClauses = Record.readInt();
     unsigned NumVars = Record.readInt();
     Record.skipInts(1);
-    D = OMPAllocateDecl::CreateDeserialized(Context, RawGlobalID, NumVars,
-                                            NumClauses);
+    D = OMPAllocateDecl::CreateDeserialized(Context, ID, NumVars, NumClauses);
     break;
   }
   case DECL_OMP_REQUIRES: {
     unsigned NumClauses = Record.readInt();
     Record.skipInts(2);
-    D = OMPRequiresDecl::CreateDeserialized(Context, RawGlobalID, NumClauses);
+    D = OMPRequiresDecl::CreateDeserialized(Context, ID, NumClauses);
     break;
   }
   case DECL_OMP_DECLARE_REDUCTION:
-    D = OMPDeclareReductionDecl::CreateDeserialized(Context, RawGlobalID);
+    D = OMPDeclareReductionDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_OMP_DECLARE_MAPPER: {
     unsigned NumClauses = Record.readInt();
     Record.skipInts(2);
-    D = OMPDeclareMapperDecl::CreateDeserialized(Context, RawGlobalID,
-                                                 NumClauses);
+    D = OMPDeclareMapperDecl::CreateDeserialized(Context, ID, NumClauses);
     break;
   }
   case DECL_OMP_CAPTUREDEXPR:
-    D = OMPCapturedExprDecl::CreateDeserialized(Context, RawGlobalID);
+    D = OMPCapturedExprDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_PRAGMA_COMMENT:
-    D = PragmaCommentDecl::CreateDeserialized(Context, RawGlobalID,
-                                              Record.readInt());
+    D = PragmaCommentDecl::CreateDeserialized(Context, ID, Record.readInt());
     break;
   case DECL_PRAGMA_DETECT_MISMATCH:
-    D = PragmaDetectMismatchDecl::CreateDeserialized(Context, RawGlobalID,
+    D = PragmaDetectMismatchDecl::CreateDeserialized(Context, ID,
                                                      Record.readInt());
     break;
   case DECL_EMPTY:
-    D = EmptyDecl::CreateDeserialized(Context, RawGlobalID);
+    D = EmptyDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_LIFETIME_EXTENDED_TEMPORARY:
-    D = LifetimeExtendedTemporaryDecl::CreateDeserialized(Context, RawGlobalID);
+    D = LifetimeExtendedTemporaryDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_OBJC_TYPE_PARAM:
-    D = ObjCTypeParamDecl::CreateDeserialized(Context, RawGlobalID);
+    D = ObjCTypeParamDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_HLSL_BUFFER:
-    D = HLSLBufferDecl::CreateDeserialized(Context, RawGlobalID);
+    D = HLSLBufferDecl::CreateDeserialized(Context, ID);
     break;
   case DECL_IMPLICIT_CONCEPT_SPECIALIZATION:
-    D = ImplicitConceptSpecializationDecl::CreateDeserialized(
-        Context, RawGlobalID, Record.readInt());
+    D = ImplicitConceptSpecializationDecl::CreateDeserialized(Context, ID,
+                                                              Record.readInt());
     break;
   }
 
@@ -4215,7 +4201,7 @@ void ASTReader::loadDeclUpdateRecords(PendingUpdateRecord &Record) {
   // The declaration may have been modified by files later in the chain.
   // If this is the case, read the record containing the updates from each file
   // and pass it to ASTDeclReader to make the modifications.
-  serialization::GlobalDeclID ID = Record.ID;
+  GlobalDeclID ID = Record.ID;
   Decl *D = Record.D;
   ProcessingUpdatesRAIIObj ProcessingUpdates(*this);
   DeclUpdateOffsetsMap::iterator UpdI = DeclUpdateOffsets.find(ID);
@@ -4357,7 +4343,7 @@ namespace {
     llvm::SmallPtrSetImpl<ObjCCategoryDecl *> &Deserialized;
     ObjCCategoryDecl *Tail = nullptr;
     llvm::DenseMap<DeclarationName, ObjCCategoryDecl *> NameCategoryMap;
-    serialization::GlobalDeclID InterfaceID;
+    GlobalDeclID InterfaceID;
     unsigned PreviousGeneration;
 
     void add(ObjCCategoryDecl *Cat) {
@@ -4399,11 +4385,10 @@ namespace {
     }
 
   public:
-    ObjCCategoriesVisitor(ASTReader &Reader,
-                          ObjCInterfaceDecl *Interface,
-                          llvm::SmallPtrSetImpl<ObjCCategoryDecl *> &Deserialized,
-                          serialization::GlobalDeclID InterfaceID,
-                          unsigned PreviousGeneration)
+    ObjCCategoriesVisitor(
+        ASTReader &Reader, ObjCInterfaceDecl *Interface,
+        llvm::SmallPtrSetImpl<ObjCCategoryDecl *> &Deserialized,
+        GlobalDeclID InterfaceID, unsigned PreviousGeneration)
         : Reader(Reader), Interface(Interface), Deserialized(Deserialized),
           InterfaceID(InterfaceID), PreviousGeneration(PreviousGeneration) {
       // Populate the name -> category map with the set of known categories.
@@ -4425,8 +4410,9 @@ namespace {
       // Map global ID of the definition down to the local ID used in this
       // module file. If there is no such mapping, we'll find nothing here
       // (or in any module it imports).
-      DeclID LocalID = Reader.mapGlobalIDToModuleFileGlobalID(M, InterfaceID);
-      if (!LocalID)
+      LocalDeclID LocalID =
+          Reader.mapGlobalIDToModuleFileGlobalID(M, InterfaceID);
+      if (LocalID.isInvalid())
         return true;
 
       // Perform a binary search to find the local redeclarations for this
@@ -4457,8 +4443,7 @@ namespace {
 
 } // namespace
 
-void ASTReader::loadObjCCategories(serialization::GlobalDeclID ID,
-                                   ObjCInterfaceDecl *D,
+void ASTReader::loadObjCCategories(GlobalDeclID ID, ObjCInterfaceDecl *D,
                                    unsigned PreviousGeneration) {
   ObjCCategoriesVisitor Visitor(*this, D, CategoriesDeserialized, ID,
                                 PreviousGeneration);
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index baded0fe1983..7d3930022a69 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -956,14 +956,22 @@ void ASTStmtReader::VisitMatrixSubscriptExpr(MatrixSubscriptExpr *E) {
   E->setRBracketLoc(readSourceLocation());
 }
 
-void ASTStmtReader::VisitOMPArraySectionExpr(OMPArraySectionExpr *E) {
+void ASTStmtReader::VisitArraySectionExpr(ArraySectionExpr *E) {
   VisitExpr(E);
+  E->ASType = Record.readEnum<ArraySectionExpr::ArraySectionType>();
+
   E->setBase(Record.readSubExpr());
   E->setLowerBound(Record.readSubExpr());
   E->setLength(Record.readSubExpr());
-  E->setStride(Record.readSubExpr());
+
+  if (E->isOMPArraySection())
+    E->setStride(Record.readSubExpr());
+
   E->setColonLocFirst(readSourceLocation());
-  E->setColonLocSecond(readSourceLocation());
+
+  if (E->isOMPArraySection())
+    E->setColonLocSecond(readSourceLocation());
+
   E->setRBracketLoc(readSourceLocation());
 }
 
@@ -3090,8 +3098,8 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
       S = new (Context) MatrixSubscriptExpr(Empty);
       break;
 
-    case EXPR_OMP_ARRAY_SECTION:
-      S = new (Context) OMPArraySectionExpr(Empty);
+    case EXPR_ARRAY_SECTION:
+      S = new (Context) ArraySectionExpr(Empty);
       break;
 
     case EXPR_OMP_ARRAY_SHAPING:
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 30195868ca99..80c7ce643088 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -173,54 +173,50 @@ GetAffectingModuleMaps(const Preprocessor &PP, Module *RootModule) {
 
   const HeaderSearch &HS = PP.getHeaderSearchInfo();
   const ModuleMap &MM = HS.getModuleMap();
-  const SourceManager &SourceMgr = PP.getSourceManager();
 
   std::set<const FileEntry *> ModuleMaps;
-  auto CollectIncludingModuleMaps = [&](FileID FID, FileEntryRef F) {
-    if (!ModuleMaps.insert(F).second)
+  std::set<const Module *> ProcessedModules;
+  auto CollectModuleMapsForHierarchy = [&](const Module *M) {
+    M = M->getTopLevelModule();
+
+    if (!ProcessedModules.insert(M).second)
       return;
-    SourceLocation Loc = SourceMgr.getIncludeLoc(FID);
-    // The include location of inferred module maps can point into the header
-    // file that triggered the inferring. Cut off the walk if that's the case.
-    while (Loc.isValid() && isModuleMap(SourceMgr.getFileCharacteristic(Loc))) {
-      FID = SourceMgr.getFileID(Loc);
-      F = *SourceMgr.getFileEntryRefForID(FID);
-      if (!ModuleMaps.insert(F).second)
-        break;
-      Loc = SourceMgr.getIncludeLoc(FID);
-    }
-  };
 
-  std::set<const Module *> ProcessedModules;
-  auto CollectIncludingMapsFromAncestors = [&](const Module *M) {
-    for (const Module *Mod = M; Mod; Mod = Mod->Parent) {
-      if (!ProcessedModules.insert(Mod).second)
-        break;
+    std::queue<const Module *> Q;
+    Q.push(M);
+    while (!Q.empty()) {
+      const Module *Mod = Q.front();
+      Q.pop();
+
       // The containing module map is affecting, because it's being pointed
       // into by Module::DefinitionLoc.
-      if (FileID FID = MM.getContainingModuleMapFileID(Mod); FID.isValid())
-        CollectIncludingModuleMaps(FID, *SourceMgr.getFileEntryRefForID(FID));
-      // For inferred modules, the module map that allowed inferring is not in
-      // the include chain of the virtual containing module map file. It did
-      // affect the compilation, though.
-      if (FileID FID = MM.getModuleMapFileIDForUniquing(Mod); FID.isValid())
-        CollectIncludingModuleMaps(FID, *SourceMgr.getFileEntryRefForID(FID));
+      if (auto FE = MM.getContainingModuleMapFile(Mod))
+        ModuleMaps.insert(*FE);
+      // For inferred modules, the module map that allowed inferring is not
+      // related to the virtual containing module map file. It did affect the
+      // compilation, though.
+      if (auto FE = MM.getModuleMapFileForUniquing(Mod))
+        ModuleMaps.insert(*FE);
+
+      for (auto *SubM : Mod->submodules())
+        Q.push(SubM);
     }
   };
 
   // Handle all the affecting modules referenced from the root module.
 
+  CollectModuleMapsForHierarchy(RootModule);
+
   std::queue<const Module *> Q;
   Q.push(RootModule);
   while (!Q.empty()) {
     const Module *CurrentModule = Q.front();
     Q.pop();
 
-    CollectIncludingMapsFromAncestors(CurrentModule);
     for (const Module *ImportedModule : CurrentModule->Imports)
-      CollectIncludingMapsFromAncestors(ImportedModule);
+      CollectModuleMapsForHierarchy(ImportedModule);
     for (const Module *UndeclaredModule : CurrentModule->UndeclaredUses)
-      CollectIncludingMapsFromAncestors(UndeclaredModule);
+      CollectModuleMapsForHierarchy(UndeclaredModule);
 
     for (auto *M : CurrentModule->submodules())
       Q.push(M);
@@ -249,9 +245,27 @@ GetAffectingModuleMaps(const Preprocessor &PP, Module *RootModule) {
 
     for (const auto &KH : HS.findResolvedModulesForHeader(*File))
       if (const Module *M = KH.getModule())
-        CollectIncludingMapsFromAncestors(M);
+        CollectModuleMapsForHierarchy(M);
   }
 
+  // FIXME: This algorithm is not correct for module map hierarchies where
+  // module map file defining a (sub)module of a top-level module X includes
+  // a module map file that defines a (sub)module of another top-level module Y.
+  // Whenever X is affecting and Y is not, "replaying" this PCM file will fail
+  // when parsing module map files for X due to not knowing about the `extern`
+  // module map for Y.
+  //
+  // We don't have a good way to fix it here. We could mark all children of
+  // affecting module map files as being affecting as well, but that's
+  // expensive. SourceManager does not model the edge from parent to child
+  // SLocEntries, so instead, we would need to iterate over leaf module map
+  // files, walk up their include hierarchy and check whether we arrive at an
+  // affecting module map.
+  //
+  // Instead of complicating and slowing down this function, we should probably
+  // just ban module map hierarchies where module map defining a (sub)module X
+  // includes a module map defining a module that's not a submodule of X.
+
   return ModuleMaps;
 }
 
@@ -1174,26 +1188,47 @@ ASTWriter::createSignature() const {
   return std::make_pair(ASTBlockHash, Signature);
 }
 
+ASTFileSignature ASTWriter::createSignatureForNamedModule() const {
+  llvm::SHA1 Hasher;
+  Hasher.update(StringRef(Buffer.data(), Buffer.size()));
+
+  assert(WritingModule);
+  assert(WritingModule->isNamedModule());
+
+  // We need to combine all the export imported modules no matter
+  // we used it or not.
+  for (auto [ExportImported, _] : WritingModule->Exports)
+    Hasher.update(ExportImported->Signature);
+
+  return ASTFileSignature::create(Hasher.result());
+}
+
+static void BackpatchSignatureAt(llvm::BitstreamWriter &Stream,
+                                 const ASTFileSignature &S, uint64_t BitNo) {
+  for (uint8_t Byte : S) {
+    Stream.BackpatchByte(BitNo, Byte);
+    BitNo += 8;
+  }
+}
+
 ASTFileSignature ASTWriter::backpatchSignature() {
+  if (isWritingStdCXXNamedModules()) {
+    ASTFileSignature Signature = createSignatureForNamedModule();
+    BackpatchSignatureAt(Stream, Signature, SignatureOffset);
+    return Signature;
+  }
+
   if (!WritingModule ||
       !PP->getHeaderSearchInfo().getHeaderSearchOpts().ModulesHashContent)
     return {};
 
   // For implicit modules, write the hash of the PCM as its signature.
-
-  auto BackpatchSignatureAt = [&](const ASTFileSignature &S, uint64_t BitNo) {
-    for (uint8_t Byte : S) {
-      Stream.BackpatchByte(BitNo, Byte);
-      BitNo += 8;
-    }
-  };
-
   ASTFileSignature ASTBlockHash;
   ASTFileSignature Signature;
   std::tie(ASTBlockHash, Signature) = createSignature();
 
-  BackpatchSignatureAt(ASTBlockHash, ASTBlockHashOffset);
-  BackpatchSignatureAt(Signature, SignatureOffset);
+  BackpatchSignatureAt(Stream, ASTBlockHash, ASTBlockHashOffset);
+  BackpatchSignatureAt(Stream, Signature, SignatureOffset);
 
   return Signature;
 }
@@ -1210,9 +1245,11 @@ void ASTWriter::writeUnhashedControlBlock(Preprocessor &PP,
   RecordData Record;
   Stream.EnterSubblock(UNHASHED_CONTROL_BLOCK_ID, 5);
 
-  // For implicit modules, write the hash of the PCM as its signature.
-  if (WritingModule &&
-      PP.getHeaderSearchInfo().getHeaderSearchOpts().ModulesHashContent) {
+  // For implicit modules and C++20 named modules, write the hash of the PCM as
+  // its signature.
+  if (isWritingStdCXXNamedModules() ||
+      (WritingModule &&
+       PP.getHeaderSearchInfo().getHeaderSearchOpts().ModulesHashContent)) {
     // At this point, we don't know the actual signature of the file or the AST
     // block - we're only able to compute those at the end of the serialization
     // process. Let's store dummy signatures for now, and replace them with the
@@ -1223,21 +1260,24 @@ void ASTWriter::writeUnhashedControlBlock(Preprocessor &PP,
     auto Dummy = ASTFileSignature::createDummy();
     SmallString<128> Blob{Dummy.begin(), Dummy.end()};
 
-    auto Abbrev = std::make_shared<BitCodeAbbrev>();
-    Abbrev->Add(BitCodeAbbrevOp(AST_BLOCK_HASH));
-    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
-    unsigned ASTBlockHashAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
+    // We don't need AST Block hash in named modules.
+    if (!isWritingStdCXXNamedModules()) {
+      auto Abbrev = std::make_shared<BitCodeAbbrev>();
+      Abbrev->Add(BitCodeAbbrevOp(AST_BLOCK_HASH));
+      Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
+      unsigned ASTBlockHashAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
 
-    Abbrev = std::make_shared<BitCodeAbbrev>();
+      Record.push_back(AST_BLOCK_HASH);
+      Stream.EmitRecordWithBlob(ASTBlockHashAbbrev, Record, Blob);
+      ASTBlockHashOffset = Stream.GetCurrentBitNo() - Blob.size() * 8;
+      Record.clear();
+    }
+
+    auto Abbrev = std::make_shared<BitCodeAbbrev>();
     Abbrev->Add(BitCodeAbbrevOp(SIGNATURE));
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
     unsigned SignatureAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
 
-    Record.push_back(AST_BLOCK_HASH);
-    Stream.EmitRecordWithBlob(ASTBlockHashAbbrev, Record, Blob);
-    ASTBlockHashOffset = Stream.GetCurrentBitNo() - Blob.size() * 8;
-    Record.clear();
-
     Record.push_back(SIGNATURE);
     Stream.EmitRecordWithBlob(SignatureAbbrev, Record, Blob);
     SignatureOffset = Stream.GetCurrentBitNo() - Blob.size() * 8;
@@ -1639,6 +1679,18 @@ struct InputFileEntry {
 
 } // namespace
 
+SourceLocation ASTWriter::getAffectingIncludeLoc(const SourceManager &SourceMgr,
+                                                 const SrcMgr::FileInfo &File) {
+  SourceLocation IncludeLoc = File.getIncludeLoc();
+  if (IncludeLoc.isValid()) {
+    FileID IncludeFID = SourceMgr.getFileID(IncludeLoc);
+    assert(IncludeFID.isValid() && "IncludeLoc in invalid file");
+    if (!IsSLocAffecting[IncludeFID.ID])
+      IncludeLoc = SourceLocation();
+  }
+  return IncludeLoc;
+}
+
 void ASTWriter::WriteInputFiles(SourceManager &SourceMgr,
                                 HeaderSearchOptions &HSOpts) {
   using namespace llvm;
@@ -1692,7 +1744,7 @@ void ASTWriter::WriteInputFiles(SourceManager &SourceMgr,
     Entry.IsSystemFile = isSystem(File.getFileCharacteristic());
     Entry.IsTransient = Cache->IsTransient;
     Entry.BufferOverridden = Cache->BufferOverridden;
-    Entry.IsTopLevel = File.getIncludeLoc().isInvalid();
+    Entry.IsTopLevel = getAffectingIncludeLoc(SourceMgr, File).isInvalid();
     Entry.IsModuleMap = isModuleMap(File.getFileCharacteristic());
 
     auto ContentHash = hash_code(-1);
@@ -2219,7 +2271,7 @@ void ASTWriter::WriteSourceManagerBlock(SourceManager &SourceMgr,
       SLocEntryOffsets.push_back(Offset);
       // Starting offset of this entry within this module, so skip the dummy.
       Record.push_back(getAdjustedOffset(SLoc->getOffset()) - 2);
-      AddSourceLocation(File.getIncludeLoc(), Record);
+      AddSourceLocation(getAffectingIncludeLoc(SourceMgr, File), Record);
       Record.push_back(File.getFileCharacteristic()); // FIXME: stable encoding
       Record.push_back(File.hasLineDirectives());
 
@@ -3043,7 +3095,7 @@ void ASTWriter::WriteSubmodules(Module *WritingModule) {
     RecordData Inits;
     for (Decl *D : Context->getModuleInitializers(Mod))
       if (wasDeclEmitted(D))
-        Inits.push_back(GetDeclRef(D));
+        AddDeclRef(D, Inits);
     if (!Inits.empty())
       Stream.EmitRecord(SUBMODULE_INITIALIZERS, Inits);
 
@@ -3205,6 +3257,17 @@ void ASTWriter::WriteType(QualType T) {
 // Declaration Serialization
 //===----------------------------------------------------------------------===//
 
+static bool IsInternalDeclFromFileContext(const Decl *D) {
+  auto *ND = dyn_cast<NamedDecl>(D);
+  if (!ND)
+    return false;
+
+  if (!D->getDeclContext()->getRedeclContext()->isFileContext())
+    return false;
+
+  return ND->getFormalLinkage() == Linkage::Internal;
+}
+
 /// Write the block containing all of the declaration IDs
 /// lexically declared within the given DeclContext.
 ///
@@ -3225,8 +3288,17 @@ uint64_t ASTWriter::WriteDeclContextLexicalBlock(ASTContext &Context,
     if (DoneWritingDeclsAndTypes && !wasDeclEmitted(D))
       continue;
 
+    // We don't need to write decls with internal linkage into reduced BMI.
+    // If such decls gets emitted due to it get used from inline functions,
+    // the program illegal. However, there are too many use of static inline
+    // functions in the global module fragment and it will be breaking change
+    // to forbid that. So we have to allow to emit such declarations from GMF.
+    if (GeneratingReducedBMI && !D->isFromExplicitGlobalModule() &&
+        IsInternalDeclFromFileContext(D))
+      continue;
+
     KindDeclPairs.push_back(D->getKind());
-    KindDeclPairs.push_back(GetDeclRef(D));
+    KindDeclPairs.push_back(GetDeclRef(D).get());
   }
 
   ++NumLexicalDeclContexts;
@@ -3261,7 +3333,7 @@ void ASTWriter::WriteTypeDeclOffsets() {
   unsigned DeclOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
   {
     RecordData::value_type Record[] = {DECL_OFFSET, DeclOffsets.size(),
-                                       FirstDeclID - NUM_PREDEF_DECL_IDS};
+                                       FirstDeclID.get() - NUM_PREDEF_DECL_IDS};
     Stream.EmitRecordWithBlob(DeclOffsetAbbrev, Record, bytes(DeclOffsets));
   }
 }
@@ -3282,7 +3354,7 @@ void ASTWriter::WriteFileDeclIDsMap() {
     Info.FirstDeclIndex = FileGroupedDeclIDs.size();
     llvm::stable_sort(Info.DeclIDs);
     for (auto &LocDeclEntry : Info.DeclIDs)
-      FileGroupedDeclIDs.push_back(LocDeclEntry.second);
+      FileGroupedDeclIDs.push_back(LocDeclEntry.second.get());
   }
 
   auto Abbrev = std::make_shared<BitCodeAbbrev>();
@@ -3420,11 +3492,11 @@ public:
     for (const ObjCMethodList *Method = &Methods.Instance; Method;
          Method = Method->getNext())
       if (ShouldWriteMethodListNode(Method))
-        LE.write<DeclID>(Writer.getDeclID(Method->getMethod()));
+        LE.write<DeclID>((DeclID)Writer.getDeclID(Method->getMethod()));
     for (const ObjCMethodList *Method = &Methods.Factory; Method;
          Method = Method->getNext())
       if (ShouldWriteMethodListNode(Method))
-        LE.write<DeclID>(Writer.getDeclID(Method->getMethod()));
+        LE.write<DeclID>((DeclID)Writer.getDeclID(Method->getMethod()));
 
     assert(Out.tell() - Start == DataLen && "Data length is wrong");
   }
@@ -3743,8 +3815,8 @@ public:
       // Only emit declarations that aren't from a chained PCH, though.
       SmallVector<NamedDecl *, 16> Decls(IdResolver.decls(II));
       for (NamedDecl *D : llvm::reverse(Decls))
-        LE.write<DeclID>(
-            Writer.getDeclID(getDeclForLocalLookup(PP.getLangOpts(), D)));
+        LE.write<DeclID>((DeclID)Writer.getDeclID(
+            getDeclForLocalLookup(PP.getLangOpts(), D)));
     }
   }
 };
@@ -3860,7 +3932,7 @@ namespace {
 // Trait used for the on-disk hash table used in the method pool.
 class ASTDeclContextNameLookupTrait {
   ASTWriter &Writer;
-  llvm::SmallVector<DeclID, 64> DeclIDs;
+  llvm::SmallVector<LocalDeclID, 64> DeclIDs;
 
 public:
   using key_type = DeclarationNameKey;
@@ -3886,6 +3958,13 @@ public:
           !Writer.wasDeclEmitted(DeclForLocalLookup))
         continue;
 
+      // Try to avoid writing internal decls to reduced BMI.
+      // See comments in ASTWriter::WriteDeclContextLexicalBlock for details.
+      if (Writer.isGeneratingReducedBMI() &&
+          !DeclForLocalLookup->isFromExplicitGlobalModule() &&
+          IsInternalDeclFromFileContext(DeclForLocalLookup))
+        continue;
+
       DeclIDs.push_back(Writer.GetDeclRef(DeclForLocalLookup));
     }
     return std::make_pair(Start, DeclIDs.size());
@@ -3893,8 +3972,10 @@ public:
 
   data_type ImportData(const reader::ASTDeclContextNameLookupTrait::data_type &FromReader) {
     unsigned Start = DeclIDs.size();
-    DeclIDs.insert(DeclIDs.end(), DeclIDIterator(FromReader.begin()),
-                   DeclIDIterator(FromReader.end()));
+    DeclIDs.insert(
+        DeclIDs.end(),
+        DeclIDIterator<GlobalDeclID, LocalDeclID>(FromReader.begin()),
+        DeclIDIterator<GlobalDeclID, LocalDeclID>(FromReader.end()));
     return std::make_pair(Start, DeclIDs.size());
   }
 
@@ -3983,7 +4064,7 @@ public:
     endian::Writer LE(Out, llvm::endianness::little);
     uint64_t Start = Out.tell(); (void)Start;
     for (unsigned I = Lookup.first, N = Lookup.second; I != N; ++I)
-      LE.write<DeclID>(DeclIDs[I]);
+      LE.write<DeclID>((DeclID)DeclIDs[I]);
     assert(Out.tell() - Start == DataLen && "Data length is wrong");
   }
 };
@@ -4255,6 +4336,12 @@ uint64_t ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context,
         if (DoneWritingDeclsAndTypes && !wasDeclEmitted(ND))
           continue;
 
+        // We don't need to force emitting internal decls into reduced BMI.
+        // See comments in ASTWriter::WriteDeclContextLexicalBlock for details.
+        if (GeneratingReducedBMI && !ND->isFromExplicitGlobalModule() &&
+            IsInternalDeclFromFileContext(ND))
+          continue;
+
         GetDeclRef(ND);
       }
     }
@@ -4317,7 +4404,8 @@ void ASTWriter::WriteDeclContextVisibleUpdate(const DeclContext *DC) {
     DC = cast<DeclContext>(Chain->getKeyDeclaration(cast<Decl>(DC)));
 
   // Write the lookup table
-  RecordData::value_type Record[] = {UPDATE_VISIBLE, getDeclID(cast<Decl>(DC))};
+  RecordData::value_type Record[] = {UPDATE_VISIBLE,
+                                     getDeclID(cast<Decl>(DC)).get()};
   Stream.EmitRecordWithBlob(UpdateVisibleAbbrev, Record, LookupTable);
 }
 
@@ -4371,7 +4459,7 @@ void ASTWriter::WriteObjCCategories() {
            Cat = Class->known_categories_begin(),
            CatEnd = Class->known_categories_end();
          Cat != CatEnd; ++Cat, ++Size) {
-      assert(getDeclID(*Cat) != 0 && "Bogus category");
+      assert(getDeclID(*Cat).isValid() && "Bogus category");
       AddDeclRef(*Cat, Categories);
     }
 
@@ -4914,8 +5002,7 @@ void ASTWriter::PrepareWritingSpecialDecls(Sema &SemaRef) {
       // is ill-formed. However, in practice, there are a lot of projects
       // uses `static inline` in the headers. So we can't get rid of all
       // static entities in reduced BMI now.
-      if (auto *ND = dyn_cast<NamedDecl>(D);
-          ND && ND->getFormalLinkage() == Linkage::Internal)
+      if (IsInternalDeclFromFileContext(D))
         continue;
     }
 
@@ -5089,7 +5176,7 @@ void ASTWriter::WriteSpecialDeclRecords(Sema &SemaRef) {
       if (!D || !wasDeclEmitted(D))
         SemaDeclRefs.push_back(0);
       else
-        SemaDeclRefs.push_back(getDeclID(D));
+        AddDeclRef(D, SemaDeclRefs);
     };
 
     AddEmittedDeclRefOrZero(SemaRef.getStdNamespace());
@@ -5100,10 +5187,10 @@ void ASTWriter::WriteSpecialDeclRecords(Sema &SemaRef) {
     Stream.EmitRecord(SEMA_DECL_REFS, SemaDeclRefs);
 
   // Write the record containing decls to be checked for deferred diags.
-  SmallVector<serialization::DeclID, 64> DeclsToCheckForDeferredDiags;
+  RecordData DeclsToCheckForDeferredDiags;
   for (auto *D : SemaRef.DeclsToCheckForDeferredDiags)
     if (wasDeclEmitted(D))
-      DeclsToCheckForDeferredDiags.push_back(getDeclID(D));
+      AddDeclRef(D, DeclsToCheckForDeferredDiags);
   if (!DeclsToCheckForDeferredDiags.empty())
     Stream.EmitRecord(DECLS_TO_CHECK_FOR_DEFERRED_DIAGS,
         DeclsToCheckForDeferredDiags);
@@ -5473,7 +5560,7 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) {
     if (VisibleOffset)
       VisibleOffset -= DeclTypesBlockStartOffset;
 
-    DelayedNamespaceRecord.push_back(getDeclID(NS));
+    AddDeclRef(NS, DelayedNamespaceRecord);
     DelayedNamespaceRecord.push_back(LexicalOffset);
     DelayedNamespaceRecord.push_back(VisibleOffset);
   }
@@ -5507,7 +5594,7 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) {
       continue;
 
     NewGlobalKindDeclPairs.push_back(D->getKind());
-    NewGlobalKindDeclPairs.push_back(GetDeclRef(D));
+    NewGlobalKindDeclPairs.push_back(GetDeclRef(D).get());
   }
 
   auto Abv = std::make_shared<llvm::BitCodeAbbrev>();
@@ -5568,7 +5655,7 @@ void ASTWriter::WriteDeclUpdatesBlocks(RecordDataImpl &OffsetsRecord) {
       case UPD_CXX_ADDED_TEMPLATE_SPECIALIZATION:
       case UPD_CXX_ADDED_ANONYMOUS_NAMESPACE:
         assert(Update.getDecl() && "no decl to add?");
-        Record.push_back(GetDeclRef(Update.getDecl()));
+        Record.AddDeclRef(Update.getDecl());
         break;
 
       case UPD_CXX_ADDED_FUNCTION_DEFINITION:
@@ -5709,7 +5796,7 @@ void ASTWriter::WriteDeclUpdatesBlocks(RecordDataImpl &OffsetsRecord) {
       }
     }
 
-    OffsetsRecord.push_back(GetDeclRef(D));
+    AddDeclRef(D, OffsetsRecord);
     OffsetsRecord.push_back(Record.Emit(DECL_UPDATES));
   }
 }
@@ -5974,18 +6061,18 @@ void ASTWriter::AddEmittedDeclRef(const Decl *D, RecordDataImpl &Record) {
   if (!wasDeclEmitted(D))
     return;
 
-  Record.push_back(GetDeclRef(D));
+  Record.push_back(GetDeclRef(D).get());
 }
 
 void ASTWriter::AddDeclRef(const Decl *D, RecordDataImpl &Record) {
-  Record.push_back(GetDeclRef(D));
+  Record.push_back(GetDeclRef(D).get());
 }
 
-DeclID ASTWriter::GetDeclRef(const Decl *D) {
+LocalDeclID ASTWriter::GetDeclRef(const Decl *D) {
   assert(WritingAST && "Cannot request a declaration ID before AST writing");
 
   if (!D) {
-    return 0;
+    return LocalDeclID();
   }
 
   // If the DeclUpdate from the GMF gets touched, emit it.
@@ -5999,14 +6086,14 @@ DeclID ASTWriter::GetDeclRef(const Decl *D) {
   // If D comes from an AST file, its declaration ID is already known and
   // fixed.
   if (D->isFromASTFile())
-    return D->getGlobalID();
+    return LocalDeclID(D->getGlobalID());
 
   assert(!(reinterpret_cast<uintptr_t>(D) & 0x01) && "Invalid decl pointer");
-  DeclID &ID = DeclIDs[D];
-  if (ID == 0) {
+  LocalDeclID &ID = DeclIDs[D];
+  if (ID.isInvalid()) {
     if (DoneWritingDeclsAndTypes) {
       assert(0 && "New decl seen after serializing all the decls to emit!");
-      return 0;
+      return LocalDeclID();
     }
 
     // We haven't seen this declaration before. Give it a new ID and
@@ -6018,14 +6105,14 @@ DeclID ASTWriter::GetDeclRef(const Decl *D) {
   return ID;
 }
 
-DeclID ASTWriter::getDeclID(const Decl *D) {
+LocalDeclID ASTWriter::getDeclID(const Decl *D) {
   if (!D)
-    return 0;
+    return LocalDeclID();
 
   // If D comes from an AST file, its declaration ID is already known and
   // fixed.
   if (D->isFromASTFile())
-    return D->getGlobalID();
+    return LocalDeclID(D->getGlobalID());
 
   assert(DeclIDs.contains(D) && "Declaration not emitted!");
   return DeclIDs[D];
@@ -6046,8 +6133,8 @@ bool ASTWriter::wasDeclEmitted(const Decl *D) const {
   return Emitted;
 }
 
-void ASTWriter::associateDeclWithFile(const Decl *D, DeclID ID) {
-  assert(ID);
+void ASTWriter::associateDeclWithFile(const Decl *D, LocalDeclID ID) {
+  assert(ID.isValid());
   assert(D);
 
   SourceLocation Loc = D->getLocation();
@@ -6079,7 +6166,7 @@ void ASTWriter::associateDeclWithFile(const Decl *D, DeclID ID) {
   if (!Info)
     Info = std::make_unique<DeclIDInFileInfo>();
 
-  std::pair<unsigned, serialization::DeclID> LocDecl(Offset, ID);
+  std::pair<unsigned, LocalDeclID> LocDecl(Offset, ID);
   LocDeclIDsTy &Decls = Info->DeclIDs;
   Decls.push_back(LocDecl);
 }
@@ -6349,7 +6436,7 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) {
       Writer->Context->getLangOpts().ModulesDebugInfo && !D->isDependentType();
   Record->push_back(ModulesDebugInfo);
   if (ModulesDebugInfo)
-    Writer->ModularCodegenDecls.push_back(Writer->GetDeclRef(D));
+    Writer->AddDeclRef(D, Writer->ModularCodegenDecls);
 
   // IsLambda bit is already saved.
 
@@ -6453,7 +6540,7 @@ void ASTWriter::ReaderInitialized(ASTReader *Reader) {
 
   // Note, this will get called multiple times, once one the reader starts up
   // and again each time it's done reading a PCH or module.
-  FirstDeclID = NUM_PREDEF_DECL_IDS + Chain->getTotalNumDecls();
+  FirstDeclID = LocalDeclID(NUM_PREDEF_DECL_IDS + Chain->getTotalNumDecls());
   FirstTypeID = NUM_PREDEF_TYPE_IDS + Chain->getTotalNumTypes();
   FirstIdentID = NUM_PREDEF_IDENT_IDS + Chain->getTotalNumIdentifiers();
   FirstMacroID = NUM_PREDEF_MACRO_IDS + Chain->getTotalNumMacros();
@@ -7642,6 +7729,12 @@ void ASTRecordWriter::writeOMPChildren(OMPChildren *Data) {
     AddStmt(Data->getChildren()[I]);
 }
 
+void ASTRecordWriter::writeOpenACCVarList(const OpenACCClauseWithVarList *C) {
+  writeUInt32(C->getVarList().size());
+  for (Expr *E : C->getVarList())
+    AddStmt(E);
+}
+
 void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
   writeEnum(C->getClauseKind());
   writeSourceLocation(C->getBeginLoc());
@@ -7688,6 +7781,12 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
     AddStmt(const_cast<Expr *>(NWC->getIntExpr()));
     return;
   }
+  case OpenACCClauseKind::Private: {
+    const auto *PC = cast<OpenACCPrivateClause>(C);
+    writeSourceLocation(PC->getLParenLoc());
+    writeOpenACCVarList(PC);
+    return;
+  }
   case OpenACCClauseKind::Finalize:
   case OpenACCClauseKind::IfPresent:
   case OpenACCClauseKind::Seq:
@@ -7709,7 +7808,6 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
   case OpenACCClauseKind::Link:
   case OpenACCClauseKind::NoCreate:
   case OpenACCClauseKind::Present:
-  case OpenACCClauseKind::Private:
   case OpenACCClauseKind::CopyOut:
   case OpenACCClauseKind::CopyIn:
   case OpenACCClauseKind::Create:
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index c6db107e0ca4..0edc4feda3ef 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -223,9 +223,9 @@ namespace clang {
         assert(!Common->LazySpecializations);
       }
 
-      ArrayRef<DeclID> LazySpecializations;
+      ArrayRef<GlobalDeclID> LazySpecializations;
       if (auto *LS = Common->LazySpecializations)
-        LazySpecializations = llvm::ArrayRef(LS + 1, LS[0]);
+        LazySpecializations = llvm::ArrayRef(LS + 1, LS[0].get());
 
       // Add a slot to the record for the number of specializations.
       unsigned I = Record.size();
@@ -243,7 +243,9 @@ namespace clang {
         assert(D->isCanonicalDecl() && "non-canonical decl in set");
         AddFirstDeclFromEachModule(D, /*IncludeLocal*/true);
       }
-      Record.append(LazySpecializations.begin(), LazySpecializations.end());
+      Record.append(
+          DeclIDIterator<GlobalDeclID, DeclID>(LazySpecializations.begin()),
+          DeclIDIterator<GlobalDeclID, DeclID>(LazySpecializations.end()));
 
       // Update the size entry we added earlier.
       Record[I] = Record.size() - I - 1;
@@ -1166,7 +1168,7 @@ void ASTDeclWriter::VisitVarDecl(VarDecl *D) {
   Record.push_back(VarDeclBits);
 
   if (ModulesCodegen)
-    Writer.ModularCodegenDecls.push_back(Writer.GetDeclRef(D));
+    Writer.AddDeclRef(D, Writer.ModularCodegenDecls);
 
   if (D->hasAttr<BlocksAttr>()) {
     BlockVarCopyInit Init = Writer.Context->getBlockVarCopyInit(D);
@@ -2786,10 +2788,10 @@ void ASTWriter::WriteDecl(ASTContext &Context, Decl *D) {
                                       "serializing");
 
   // Determine the ID for this declaration.
-  serialization::DeclID ID;
+  LocalDeclID ID;
   assert(!D->isFromASTFile() && "should not be emitting imported decl");
-  serialization::DeclID &IDR = DeclIDs[D];
-  if (IDR == 0)
+  LocalDeclID &IDR = DeclIDs[D];
+  if (IDR.isInvalid())
     IDR = NextDeclID++;
 
   ID = IDR;
@@ -2807,7 +2809,7 @@ void ASTWriter::WriteDecl(ASTContext &Context, Decl *D) {
 
   // Record the offset for this declaration
   SourceLocation Loc = D->getLocation();
-  unsigned Index = ID - FirstDeclID;
+  unsigned Index = ID.get() - FirstDeclID.get();
   if (DeclOffsets.size() == Index)
     DeclOffsets.emplace_back(getAdjustedLocation(Loc), Offset,
                              DeclTypesBlockStartOffset);
@@ -2827,7 +2829,7 @@ void ASTWriter::WriteDecl(ASTContext &Context, Decl *D) {
   // Note declarations that should be deserialized eagerly so that we can add
   // them to a record in the AST file later.
   if (isRequiredDecl(D, Context, WritingModule))
-    EagerlyDeserializedDecls.push_back(ID);
+    AddDeclRef(D, EagerlyDeserializedDecls);
 }
 
 void ASTRecordWriter::AddFunctionDefinition(const FunctionDecl *FD) {
@@ -2863,7 +2865,7 @@ void ASTRecordWriter::AddFunctionDefinition(const FunctionDecl *FD) {
   }
   Record->push_back(ModulesCodegen);
   if (ModulesCodegen)
-    Writer->ModularCodegenDecls.push_back(Writer->GetDeclRef(FD));
+    Writer->AddDeclRef(FD, Writer->ModularCodegenDecls);
   if (auto *CD = dyn_cast<CXXConstructorDecl>(FD)) {
     Record->push_back(CD->getNumCtorInitializers());
     if (CD->getNumCtorInitializers())
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index cd5f733baf76..39aec31b6d87 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -880,16 +880,21 @@ void ASTStmtWriter::VisitMatrixSubscriptExpr(MatrixSubscriptExpr *E) {
   Code = serialization::EXPR_ARRAY_SUBSCRIPT;
 }
 
-void ASTStmtWriter::VisitOMPArraySectionExpr(OMPArraySectionExpr *E) {
+void ASTStmtWriter::VisitArraySectionExpr(ArraySectionExpr *E) {
   VisitExpr(E);
+  Record.writeEnum(E->ASType);
   Record.AddStmt(E->getBase());
   Record.AddStmt(E->getLowerBound());
   Record.AddStmt(E->getLength());
-  Record.AddStmt(E->getStride());
+  if (E->isOMPArraySection())
+    Record.AddStmt(E->getStride());
   Record.AddSourceLocation(E->getColonLocFirst());
-  Record.AddSourceLocation(E->getColonLocSecond());
+
+  if (E->isOMPArraySection())
+    Record.AddSourceLocation(E->getColonLocSecond());
+
   Record.AddSourceLocation(E->getRBracketLoc());
-  Code = serialization::EXPR_OMP_ARRAY_SECTION;
+  Code = serialization::EXPR_ARRAY_SECTION;
 }
 
 void ASTStmtWriter::VisitOMPArrayShapingExpr(OMPArrayShapingExpr *E) {
diff --git a/clang/lib/Serialization/GeneratePCH.cpp b/clang/lib/Serialization/GeneratePCH.cpp
index bed74399098d..cc06106a4770 100644
--- a/clang/lib/Serialization/GeneratePCH.cpp
+++ b/clang/lib/Serialization/GeneratePCH.cpp
@@ -88,36 +88,34 @@ ASTDeserializationListener *PCHGenerator::GetASTDeserializationListener() {
   return &Writer;
 }
 
-ReducedBMIGenerator::ReducedBMIGenerator(Preprocessor &PP,
-                                         InMemoryModuleCache &ModuleCache,
-                                         StringRef OutputFile)
+void PCHGenerator::anchor() {}
+
+CXX20ModulesGenerator::CXX20ModulesGenerator(Preprocessor &PP,
+                                             InMemoryModuleCache &ModuleCache,
+                                             StringRef OutputFile,
+                                             bool GeneratingReducedBMI)
     : PCHGenerator(
           PP, ModuleCache, OutputFile, llvm::StringRef(),
           std::make_shared<PCHBuffer>(),
           /*Extensions=*/ArrayRef<std::shared_ptr<ModuleFileExtension>>(),
           /*AllowASTWithErrors*/ false, /*IncludeTimestamps=*/false,
           /*BuildingImplicitModule=*/false, /*ShouldCacheASTInMemory=*/false,
-          /*GeneratingReducedBMI=*/true) {}
+          GeneratingReducedBMI) {}
 
-Module *ReducedBMIGenerator::getEmittingModule(ASTContext &Ctx) {
+Module *CXX20ModulesGenerator::getEmittingModule(ASTContext &Ctx) {
   Module *M = Ctx.getCurrentNamedModule();
   assert(M && M->isNamedModuleUnit() &&
-         "ReducedBMIGenerator should only be used with C++20 Named modules.");
+         "CXX20ModulesGenerator should only be used with C++20 Named modules.");
   return M;
 }
 
-void ReducedBMIGenerator::HandleTranslationUnit(ASTContext &Ctx) {
-  // We need to do this to make sure the size of reduced BMI not to be larger
-  // than full BMI.
-  //
+void CXX20ModulesGenerator::HandleTranslationUnit(ASTContext &Ctx) {
   // FIMXE: We'd better to wrap such options to a new class ASTWriterOptions
   // since this is not about searching header really.
-  // FIXME2: We'd better to move the class writing full BMI with reduced BMI.
   HeaderSearchOptions &HSOpts =
       getPreprocessor().getHeaderSearchInfo().getHeaderSearchOpts();
   HSOpts.ModulesSkipDiagnosticOptions = true;
   HSOpts.ModulesSkipHeaderSearchPaths = true;
-  HSOpts.ModulesSkipPragmaDiagnosticMappings = true;
 
   PCHGenerator::HandleTranslationUnit(Ctx);
 
@@ -135,3 +133,7 @@ void ReducedBMIGenerator::HandleTranslationUnit(ASTContext &Ctx) {
   *OS << getBufferPtr()->Data;
   OS->flush();
 }
+
+void CXX20ModulesGenerator::anchor() {}
+
+void ReducedBMIGenerator::anchor() {}
diff --git a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
index a678c3827e7f..1cebfbbee77d 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
@@ -188,9 +188,9 @@ void DereferenceChecker::reportBug(DerefKind K, ProgramStateRef State,
     os << DerefStr1;
     break;
   }
-  case Stmt::OMPArraySectionExprClass: {
+  case Stmt::ArraySectionExprClass: {
     os << "Array access";
-    const OMPArraySectionExpr *AE = cast<OMPArraySectionExpr>(S);
+    const ArraySectionExpr *AE = cast<ArraySectionExpr>(S);
     AddDerefSource(os, Ranges, AE->getBase()->IgnoreParenCasts(),
                    State.get(), N->getLocationContext());
     os << DerefStr1;
diff --git a/clang/lib/StaticAnalyzer/Checkers/IdenticalExprChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/IdenticalExprChecker.cpp
index 1cf81b54e77d..7ac34ef8164e 100644
--- a/clang/lib/StaticAnalyzer/Checkers/IdenticalExprChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/IdenticalExprChecker.cpp
@@ -350,7 +350,7 @@ static bool isIdenticalStmt(const ASTContext &Ctx, const Stmt *Stmt1,
     return false;
   case Stmt::CallExprClass:
   case Stmt::ArraySubscriptExprClass:
-  case Stmt::OMPArraySectionExprClass:
+  case Stmt::ArraySectionExprClass:
   case Stmt::OMPArrayShapingExprClass:
   case Stmt::OMPIteratorExprClass:
   case Stmt::ImplicitCastExprClass:
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index 287f6a528700..6901dbb415bf 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -311,7 +311,7 @@ public:
   bool VisitUnaryOperator(const UnaryOperator *UO) {
     // Operator '*' and '!' are allowed as long as the operand is trivial.
     auto op = UO->getOpcode();
-    if (op == UO_Deref || op == UO_AddrOf || op == UO_LNot)
+    if (op == UO_Deref || op == UO_AddrOf || op == UO_LNot || op == UO_Not)
       return Visit(UO->getSubExpr());
 
     if (UO->isIncrementOp() || UO->isDecrementOp()) {
@@ -331,6 +331,16 @@ public:
     return Visit(BO->getLHS()) && Visit(BO->getRHS());
   }
 
+  bool VisitCompoundAssignOperator(const CompoundAssignOperator *CAO) {
+    // Compound assignment operator such as |= is trivial if its
+    // subexpresssions are trivial.
+    return VisitChildren(CAO);
+  }
+
+  bool VisitArraySubscriptExpr(const ArraySubscriptExpr *ASE) {
+    return VisitChildren(ASE);
+  }
+
   bool VisitConditionalOperator(const ConditionalOperator *CO) {
     // Ternary operators are trivial if their conditions & values are trivial.
     return VisitChildren(CO);
@@ -360,6 +370,16 @@ public:
     return TrivialFunctionAnalysis::isTrivialImpl(Callee, Cache);
   }
 
+  bool
+  VisitSubstNonTypeTemplateParmExpr(const SubstNonTypeTemplateParmExpr *E) {
+    // Non-type template paramter is compile time constant and trivial.
+    return true;
+  }
+
+  bool VisitUnaryExprOrTypeTraitExpr(const UnaryExprOrTypeTraitExpr *E) {
+    return VisitChildren(E);
+  }
+
   bool VisitPredefinedExpr(const PredefinedExpr *E) {
     // A predefined identifier such as "func" is considered trivial.
     return true;
@@ -463,6 +483,7 @@ public:
   bool VisitFixedPointLiteral(const FixedPointLiteral *E) { return true; }
   bool VisitCharacterLiteral(const CharacterLiteral *E) { return true; }
   bool VisitStringLiteral(const StringLiteral *E) { return true; }
+  bool VisitCXXBoolLiteralExpr(const CXXBoolLiteralExpr *E) { return true; }
 
   bool VisitConstantExpr(const ConstantExpr *CE) {
     // Constant expressions are trivial.
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
index 9ed8e7cab6ab..ec1db1cc3358 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
@@ -50,6 +50,9 @@ std::optional<bool> isUncounted(const clang::CXXRecordDecl* Class);
 /// class, false if not, std::nullopt if inconclusive.
 std::optional<bool> isUncountedPtr(const clang::Type* T);
 
+/// \returns true if Name is a RefPtr, Ref, or its variant, false if not.
+bool isRefType(const std::string &Name);
+
 /// \returns true if \p F creates ref-countable object from uncounted parameter,
 /// false if not.
 bool isCtorOfRefCounted(const clang::FunctionDecl *F);
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
index 8b41a949fd67..ae494de58da3 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
@@ -53,6 +53,13 @@ public:
       bool shouldVisitTemplateInstantiations() const { return true; }
       bool shouldVisitImplicitCode() const { return false; }
 
+      bool TraverseClassTemplateDecl(ClassTemplateDecl *Decl) {
+        if (isRefType(safeGetName(Decl)))
+          return true;
+        return RecursiveASTVisitor<LocalVisitor>::TraverseClassTemplateDecl(
+            Decl);
+      }
+
       bool VisitCallExpr(const CallExpr *CE) {
         Checker->visitCallExpr(CE);
         return true;
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 09c69f9612d9..0b1edf3e5c96 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1948,7 +1948,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
     case Stmt::CXXPseudoDestructorExprClass:
     case Stmt::SubstNonTypeTemplateParmExprClass:
     case Stmt::CXXNullPtrLiteralExprClass:
-    case Stmt::OMPArraySectionExprClass:
+    case Stmt::ArraySectionExprClass:
     case Stmt::OMPArrayShapingExprClass:
     case Stmt::OMPIteratorExprClass:
     case Stmt::SYCLUniqueStableNameExprClass:
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
index 32850f5eea92..0c047b6c5da2 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
@@ -439,6 +439,9 @@ public:
     if (Result)
       setLastCC1Arguments(std::move(OriginalInvocation));
 
+    // Propagate the statistics to the parent FileManager.
+    DriverFileMgr->AddStats(ScanInstance.getFileManager());
+
     return Result;
   }
 
diff --git a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
index 5dbb83cab86b..b0eead42869a 100644
--- a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
+++ b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
@@ -7,3 +7,7 @@ Tags:
   SwiftImportAs: reference
   SwiftReleaseOp: RCRelease
   SwiftRetainOp: RCRetain
+- Name: NonCopyableType
+  SwiftCopyable: false
+- Name: CopyableType
+  SwiftCopyable: true
diff --git a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h
index 82b8a6749c4f..a8f6d0248eae 100644
--- a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h
+++ b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h
@@ -4,3 +4,6 @@ struct RefCountedType { int value; };
 
 inline void RCRetain(RefCountedType *x) { x->value++; }
 inline void RCRelease(RefCountedType *x) { x->value--; }
+
+struct NonCopyableType { int value; };
+struct CopyableType { int value; };
diff --git a/clang/test/APINotes/swift-import-as.cpp b/clang/test/APINotes/swift-import-as.cpp
index 904857e58593..103cf02f431a 100644
--- a/clang/test/APINotes/swift-import-as.cpp
+++ b/clang/test/APINotes/swift-import-as.cpp
@@ -2,6 +2,8 @@
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter ImmortalRefType | FileCheck -check-prefix=CHECK-IMMORTAL %s
 // RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter RefCountedType | FileCheck -check-prefix=CHECK-REF-COUNTED %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter NonCopyableType | FileCheck -check-prefix=CHECK-NON-COPYABLE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter CopyableType | FileCheck -check-prefix=CHECK-COPYABLE %s
 
 #include <SwiftImportAs.h>
 
@@ -14,3 +16,11 @@
 // CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "import_reference"
 // CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "retain:RCRetain"
 // CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "release:RCRelease"
+
+// CHECK-NON-COPYABLE: Dumping NonCopyableType:
+// CHECK-NON-COPYABLE-NEXT: CXXRecordDecl {{.+}} imported in SwiftImportAs {{.+}} struct NonCopyableType
+// CHECK-NON-COPYABLE: SwiftAttrAttr {{.+}} <<invalid sloc>> "~Copyable"
+
+// CHECK-COPYABLE: Dumping CopyableType:
+// CHECK-COPYABLE-NEXT: CXXRecordDecl {{.+}} imported in SwiftImportAs {{.+}} struct CopyableType
+// CHECK-COPYABLE-NOT: SwiftAttrAttr
diff --git a/clang/test/AST/HLSL/this-reference-template.hlsl b/clang/test/AST/HLSL/this-reference-template.hlsl
index 60e057986ebf..d427e73044b7 100644
--- a/clang/test/AST/HLSL/this-reference-template.hlsl
+++ b/clang/test/AST/HLSL/this-reference-template.hlsl
@@ -24,7 +24,7 @@ void main() {
 // CHECK:     -CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:8:3, line:10:3> line:8:5 getFirst 'K ()' implicit-inline
 // CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:16, line:10:3>
 // CHECK-NEXT:-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:9:4, col:16>
-// CHECK-NEXT:-CXXDependentScopeMemberExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> '<dependent type>' lvalue .First
+// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> 'K' lvalue .First 0x{{[0-9A-Fa-f]+}}
 // CHECK-NEXT:-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:11> 'Pair<K, V>' lvalue this
 // CHECK-NEXT:-CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:12:3, line:14:3> line:12:5 getSecond 'V ()' implicit-inline
 // CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:17, line:14:3>
diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index a5951158ed0e..207da5fe8126 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -263,3 +263,10 @@ const int *p = &b;
 const __int128 K = (__int128)(int*)0;
 const unsigned __int128 KU = (unsigned __int128)(int*)0;
 #endif
+
+
+int test3(void) {
+  int a[2];
+  a[0] = test3; // all-error {{incompatible pointer to integer conversion assigning to 'int' from 'int (void)'}}
+  return 0;
+}
diff --git a/clang/test/AST/Interp/cxx23.cpp b/clang/test/AST/Interp/cxx23.cpp
index f0325eef6d87..d1ec93e99803 100644
--- a/clang/test/AST/Interp/cxx23.cpp
+++ b/clang/test/AST/Interp/cxx23.cpp
@@ -141,3 +141,32 @@ struct check_ice {
     };
 };
 static_assert(check_ice<42>::x == 42);
+
+
+namespace VirtualBases {
+  namespace One {
+    struct U { int n; };
+    struct V : U { int n; };
+    struct A : virtual V { int n; };
+    struct Aa { int n; };
+    struct B : virtual A, Aa {};
+    struct C : virtual A, Aa {};
+    struct D : B, C {};
+
+    /// Calls the constructor of D.
+    D d;
+  }
+}
+
+namespace LabelGoto {
+  constexpr int foo() { // all20-error {{never produces a constant expression}}
+    a: // all20-warning {{use of this statement in a constexpr function is a C++23 extension}}
+    goto a; // all20-note 2{{subexpression not valid in a constant expression}} \
+            // ref23-note {{subexpression not valid in a constant expression}} \
+            // expected23-note {{subexpression not valid in a constant expression}}
+
+    return 1;
+  }
+  static_assert(foo() == 1, ""); // all-error {{not an integral constant expression}} \
+                                 // all-note {{in call to}}
+}
diff --git a/clang/test/AST/Interp/functions.cpp b/clang/test/AST/Interp/functions.cpp
index f9bb5d53634e..a5bb9f1a19aa 100644
--- a/clang/test/AST/Interp/functions.cpp
+++ b/clang/test/AST/Interp/functions.cpp
@@ -601,3 +601,19 @@ namespace FromIntegral {
                                            // both-warning {{variable length arrays}}
 #endif
 }
+
+namespace {
+  template <typename T> using id = T;
+  template <typename T>
+  constexpr void g() {
+    constexpr id<void (T)> f;
+  }
+
+  static_assert((g<int>(), true), "");
+}
+
+namespace {
+  /// The InitListExpr here is of void type.
+  void bir [[clang::annotate("B", {1, 2, 3, 4})]] (); // both-error {{'annotate' attribute requires parameter 1 to be a constant expression}} \
+                                                      // both-note {{subexpression not valid in a constant expression}}
+}
diff --git a/clang/test/AST/Interp/opencl.cl b/clang/test/AST/Interp/opencl.cl
new file mode 100644
index 000000000000..fd7756fff7c1
--- /dev/null
+++ b/clang/test/AST/Interp/opencl.cl
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -fsyntax-only -cl-std=CL2.0 -verify=ref,both %s
+// RUN: %clang_cc1 -fsyntax-only -cl-std=CL2.0 -verify=expected,both %s -fexperimental-new-constant-interpreter
+
+// both-no-diagnostics
+
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef int int8 __attribute__((ext_vector_type(8)));
+typedef int int16 __attribute__((ext_vector_type(16)));
+
+void foo(int3 arg1, int8 arg2) {
+  int4 auto1;
+  int16 *auto2;
+  int auto3;
+  int2 auto4;
+  struct S *incomplete1;
+
+  int res1[vec_step(arg1) == 4 ? 1 : -1];
+  int res2[vec_step(arg2) == 8 ? 1 : -1];
+  int res3[vec_step(auto1) == 4 ? 1 : -1];
+  int res4[vec_step(*auto2) == 16 ? 1 : -1];
+  int res5[vec_step(auto3) == 1 ? 1 : -1];
+  int res6[vec_step(auto4) == 2 ? 1 : -1];
+  int res7[vec_step(int2) == 2 ? 1 : -1];
+  int res8[vec_step(int3) == 4 ? 1 : -1];
+  int res9[vec_step(int4) == 4 ? 1 : -1];
+  int res10[vec_step(int8) == 8 ? 1 : -1];
+  int res11[vec_step(int16) == 16 ? 1 : -1];
+  int res12[vec_step(void) == 1 ? 1 : -1];
+}
+
+void negativeShift32(int a,int b) {
+  char array0[((int)1)<<40];
+}
+
+int2 A = {1,2};
+int4 B = {(int2)(1,2), (int2)(3,4)};
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index 3e52354a4a10..771e5adfca34 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -90,8 +90,7 @@ struct Ints2 {
   int a = 10;
   int b;
 };
-constexpr Ints2 ints22; // both-error {{without a user-provided default constructor}} \
-                        // expected-error {{must be initialized by a constant expression}}
+constexpr Ints2 ints22; // both-error {{without a user-provided default constructor}}
 
 constexpr Ints2 I2 = Ints2{12, 25};
 static_assert(I2.a == 12, "");
@@ -1031,6 +1030,12 @@ namespace ParenInit {
                      // both-note {{required by 'constinit' specifier}} \
                      // both-note {{reference to temporary is not a constant expression}} \
                      // both-note {{temporary created here}}
+
+
+  /// Initializing an array.
+  constexpr void bar(int i, int j) {
+    int arr[4](i, j);
+  }
 }
 #endif
 
@@ -1330,3 +1335,108 @@ namespace UnnamedBitFields {
   static_assert(a.f == 1.0, "");
   static_assert(a.c == 'a', "");
 }
+
+/// FIXME: This still doesn't work in the new interpreter because
+/// we lack type information for dummy pointers.
+namespace VirtualBases {
+  /// This used to crash.
+  namespace One {
+    class A {
+    protected:
+      int x;
+    };
+    class B : public virtual A {
+    public:
+      int getX() { return x; } // ref-note {{declared here}}
+    };
+
+    class DV : virtual public B{};
+
+    void foo() {
+      DV b;
+      int a[b.getX()]; // both-warning {{variable length arrays}} \
+                       // ref-note {{non-constexpr function 'getX' cannot be used}}
+    }
+  }
+
+  namespace Two {
+    struct U { int n; };
+    struct A : virtual U { int n; };
+    struct B : A {};
+    B a;
+    static_assert((U*)(A*)(&a) == (U*)(&a), "");
+
+    struct C : virtual A {};
+    struct D : B, C {};
+    D d;
+    constexpr B *p = &d;
+    constexpr C *q = &d;
+    static_assert((A*)p == (A*)q, ""); // both-error {{failed}}
+  }
+
+  namespace Three {
+    struct U { int n; };
+    struct V : U { int n; };
+    struct A : virtual V { int n; };
+    struct Aa { int n; };
+    struct B : virtual A, Aa {};
+
+    struct C : virtual A, Aa {};
+
+    struct D : B, C {};
+
+    D d;
+
+    constexpr B *p = &d;
+    constexpr C *q = &d;
+
+    static_assert((void*)p != (void*)q, "");
+    static_assert((A*)p == (A*)q, "");
+    static_assert((Aa*)p != (Aa*)q, "");
+
+    constexpr V *v = p;
+    constexpr V *w = q;
+    constexpr V *x = (A*)p;
+    static_assert(v == w, "");
+    static_assert(v == x, "");
+
+    static_assert((U*)&d == p, "");
+    static_assert((U*)&d == q, "");
+    static_assert((U*)&d == v, "");
+    static_assert((U*)&d == w, "");
+    static_assert((U*)&d == x, "");
+
+    struct X {};
+    struct Y1 : virtual X {};
+    struct Y2 : X {};
+    struct Z : Y1, Y2 {};
+    Z z;
+    static_assert((X*)(Y1*)&z != (X*)(Y2*)&z, "");
+  }
+}
+
+namespace ZeroInit {
+  struct S3 {
+    S3() = default;
+    S3(const S3&) = default;
+    S3(S3&&) = default;
+    constexpr S3(int n) : n(n) {}
+    int n;
+  };
+  constexpr S3 s3d; // both-error {{default initialization of an object of const type 'const S3' without a user-provided default constructor}}
+  static_assert(s3d.n == 0, "");
+}
+
+namespace {
+#if __cplusplus >= 202002L
+  struct C {
+    template <unsigned N> constexpr C(const char (&)[N]) : n(N) {}
+    unsigned n;
+  };
+  template <C c>
+  constexpr auto operator""_c() { return c.n; }
+
+  constexpr auto waldo = "abc"_c;
+  static_assert(waldo == 4, "");
+#endif
+}
diff --git a/clang/test/AST/ast-dump-macro-json.c b/clang/test/AST/ast-dump-macro-json.c
index 96f4be6fec3d..fb9b4118b4f1 100644
--- a/clang/test/AST/ast-dump-macro-json.c
+++ b/clang/test/AST/ast-dump-macro-json.c
@@ -132,7 +132,7 @@ void BLAP(foo, __COUNTER__)(void);
 // CHECK-NEXT:   "spellingLoc": {
 // CHECK-NEXT:    "offset": {{[0-9]+}},
 // CHECK-NEXT:    "file": "<scratch space>",
-// CHECK-NEXT:    "line": 3,
+// CHECK-NEXT:    "line": 5,
 // CHECK-NEXT:    "col": 1,
 // CHECK-NEXT:    "tokLen": 4
 // CHECK-NEXT:   },
@@ -169,7 +169,7 @@ void BLAP(foo, __COUNTER__)(void);
 // CHECK-NEXT:   "spellingLoc": {
 // CHECK-NEXT:    "offset": {{[0-9]+}},
 // CHECK-NEXT:    "file": "<scratch space>",
-// CHECK-NEXT:    "line": 5,
+// CHECK-NEXT:    "line": 7,
 // CHECK-NEXT:    "col": 1,
 // CHECK-NEXT:    "tokLen": 4
 // CHECK-NEXT:   },
diff --git a/clang/test/AST/ast-dump-pragma-unroll.cpp b/clang/test/AST/ast-dump-pragma-unroll.cpp
new file mode 100644
index 000000000000..f9c254b803ff
--- /dev/null
+++ b/clang/test/AST/ast-dump-pragma-unroll.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ast-dump %s | FileCheck %s
+
+using size_t = unsigned long long;
+
+// CHECK: LoopHintAttr {{.*}} Implicit unroll UnrollCount Numeric
+// CHECK: LoopHintAttr {{.*}} Implicit unroll UnrollCount Numeric
+// CHECK: LoopHintAttr {{.*}} Implicit unroll Unroll Disable
+// CHECK: LoopHintAttr {{.*}} Implicit unroll Unroll Disable
+template <bool Flag>
+int value_dependent(int n) {
+  constexpr int N = 100;
+  auto init = [=]() { return Flag ? n : 0UL; };
+  auto cond = [=](size_t ix) { return Flag ? ix != 0 : ix < 10; };
+  auto iter = [=](size_t ix) {
+    return Flag ? ix & ~(1ULL << __builtin_clzll(ix)) : ix + 1;
+  };
+
+#pragma unroll Flag ? 1 : N
+  for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+    n *= n;
+  }
+#pragma unroll Flag ? 0 : N
+  for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+    n *= n;
+  }
+  return n;
+}
+
+void test_value_dependent(int n) {
+  value_dependent<true>(n);
+}
diff --git a/clang/test/AST/ast-dump-recovery.cpp b/clang/test/AST/ast-dump-recovery.cpp
index cfb013585ad7..77527743fe85 100644
--- a/clang/test/AST/ast-dump-recovery.cpp
+++ b/clang/test/AST/ast-dump-recovery.cpp
@@ -413,6 +413,14 @@ void RecoveryExprForInvalidDecls(Unknown InvalidDecl) {
   // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>'
 }
 
+void InitializerOfInvalidDecl() {
+  int ValidDecl;
+  Unkown InvalidDecl = ValidDecl;
+  // CHECK:      VarDecl {{.*}} invalid InvalidDecl
+  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
+  // CHECK-NEXT:   `-DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'ValidDecl'
+}
+
 void RecoverToAnInvalidDecl() {
   Unknown* foo; // invalid decl
   goo; // the typo was correct to the invalid foo.
diff --git a/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp b/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
index cf740516db6f..5ac55d269dce 100644
--- a/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
+++ b/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
@@ -1846,6 +1846,42 @@ int main()
 // CHECK-NEXT:              "kind": "VarTemplateDecl",
 // CHECK-NEXT:              "name": "is_const_v"
 // CHECK-NEXT:             }
+// CHECK-NEXT:            ],
+// CHECK-NEXT:            "inner": [
+// CHECK-NEXT:             {
+// CHECK-NEXT:              "kind": "TemplateArgument",
+// CHECK-NEXT:              "type": {
+// CHECK-NEXT:               "qualType": "const _Ty"
+// CHECK-NEXT:              },
+// CHECK-NEXT:              "inner": [
+// CHECK-NEXT:               {
+// CHECK-NEXT:                "id": "0x{{.*}}",
+// CHECK-NEXT:                "kind": "QualType",
+// CHECK-NEXT:                "type": {
+// CHECK-NEXT:                 "qualType": "const _Ty"
+// CHECK-NEXT:                },
+// CHECK-NEXT:                "qualifiers": "const",
+// CHECK-NEXT:                "inner": [
+// CHECK-NEXT:                 {
+// CHECK-NEXT:                  "id": "0x{{.*}}",
+// CHECK-NEXT:                  "kind": "TemplateTypeParmType",
+// CHECK-NEXT:                  "type": {
+// CHECK-NEXT:                   "qualType": "_Ty"
+// CHECK-NEXT:                  },
+// CHECK-NEXT:                  "isDependent": true,
+// CHECK-NEXT:                  "isInstantiationDependent": true,
+// CHECK-NEXT:                  "depth": 0,
+// CHECK-NEXT:                  "index": 0,
+// CHECK-NEXT:                  "decl": {
+// CHECK-NEXT:                   "id": "0x{{.*}}",
+// CHECK-NEXT:                   "kind": "TemplateTypeParmDecl",
+// CHECK-NEXT:                   "name": "_Ty"
+// CHECK-NEXT:                  }
+// CHECK-NEXT:                 }
+// CHECK-NEXT:                ]
+// CHECK-NEXT:               }
+// CHECK-NEXT:              ]
+// CHECK-NEXT:             }
 // CHECK-NEXT:            ]
 // CHECK-NEXT:           }
 // CHECK-NEXT:          ]
@@ -1900,6 +1936,32 @@ int main()
 // CHECK-NEXT:              "kind": "VarTemplateDecl",
 // CHECK-NEXT:              "name": "is_reference_v"
 // CHECK-NEXT:             }
+// CHECK-NEXT:            ],
+// CHECK-NEXT:            "inner": [
+// CHECK-NEXT:             {
+// CHECK-NEXT:              "kind": "TemplateArgument",
+// CHECK-NEXT:              "type": {
+// CHECK-NEXT:               "qualType": "_Ty"
+// CHECK-NEXT:              },
+// CHECK-NEXT:              "inner": [
+// CHECK-NEXT:               {
+// CHECK-NEXT:                "id": "0x{{.*}}",
+// CHECK-NEXT:                "kind": "TemplateTypeParmType",
+// CHECK-NEXT:                "type": {
+// CHECK-NEXT:                 "qualType": "_Ty"
+// CHECK-NEXT:                },
+// CHECK-NEXT:                "isDependent": true,
+// CHECK-NEXT:                "isInstantiationDependent": true,
+// CHECK-NEXT:                "depth": 0,
+// CHECK-NEXT:                "index": 0,
+// CHECK-NEXT:                "decl": {
+// CHECK-NEXT:                 "id": "0x{{.*}}",
+// CHECK-NEXT:                 "kind": "TemplateTypeParmDecl",
+// CHECK-NEXT:                 "name": "_Ty"
+// CHECK-NEXT:                }
+// CHECK-NEXT:               }
+// CHECK-NEXT:              ]
+// CHECK-NEXT:             }
 // CHECK-NEXT:            ]
 // CHECK-NEXT:           }
 // CHECK-NEXT:          ]
@@ -2565,6 +2627,32 @@ int main()
 // CHECK-NEXT:            "kind": "VarTemplateDecl",
 // CHECK-NEXT:            "name": "is_function_v"
 // CHECK-NEXT:           }
+// CHECK-NEXT:          ],
+// CHECK-NEXT:          "inner": [
+// CHECK-NEXT:           {
+// CHECK-NEXT:            "kind": "TemplateArgument",
+// CHECK-NEXT:            "type": {
+// CHECK-NEXT:             "qualType": "_Ty1"
+// CHECK-NEXT:            },
+// CHECK-NEXT:            "inner": [
+// CHECK-NEXT:             {
+// CHECK-NEXT:              "id": "0x{{.*}}",
+// CHECK-NEXT:              "kind": "TemplateTypeParmType",
+// CHECK-NEXT:              "type": {
+// CHECK-NEXT:               "qualType": "_Ty1"
+// CHECK-NEXT:              },
+// CHECK-NEXT:              "isDependent": true,
+// CHECK-NEXT:              "isInstantiationDependent": true,
+// CHECK-NEXT:              "depth": 0,
+// CHECK-NEXT:              "index": 0,
+// CHECK-NEXT:              "decl": {
+// CHECK-NEXT:               "id": "0x{{.*}}",
+// CHECK-NEXT:               "kind": "TemplateTypeParmDecl",
+// CHECK-NEXT:               "name": "_Ty1"
+// CHECK-NEXT:              }
+// CHECK-NEXT:             }
+// CHECK-NEXT:            ]
+// CHECK-NEXT:           }
 // CHECK-NEXT:          ]
 // CHECK-NEXT:         }
 // CHECK-NEXT:        ]
diff --git a/clang/test/AST/ast-dump-templates.cpp b/clang/test/AST/ast-dump-templates.cpp
index d25ef36dd4d3..9fcafbcbcc46 100644
--- a/clang/test/AST/ast-dump-templates.cpp
+++ b/clang/test/AST/ast-dump-templates.cpp
@@ -104,3 +104,17 @@ void (*q)() = f<>;
 // CHECK1: template<> void f<0L>()
 // CHECK1: template<> void f<0U>()
 }
+
+namespace test6 {
+template <class D>
+constexpr bool C = true;
+
+template <class Key>
+void func() {
+  C<Key>;
+// DUMP:      UnresolvedLookupExpr {{.*}} '<dependent type>' lvalue (no ADL) = 'C'
+// DUMP-NEXT: `-TemplateArgument type 'Key'
+// DUMP-NEXT:   `-TemplateTypeParmType {{.*}} 'Key' dependent depth 0 index 0
+// DUMP-NEXT:     `-TemplateTypeParm {{.*}} 'Key'
+}
+}
diff --git a/clang/test/AST/ast-print-openacc-compute-construct.cpp b/clang/test/AST/ast-print-openacc-compute-construct.cpp
new file mode 100644
index 000000000000..cd39ea087b3c
--- /dev/null
+++ b/clang/test/AST/ast-print-openacc-compute-construct.cpp
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -fopenacc -ast-print %s -o - | FileCheck %s
+
+void foo() {
+  int i;
+  float array[5];
+// CHECK: #pragma acc parallel default(none)
+// CHECK-NEXT: while (true)
+#pragma acc parallel default(none)
+  while(true);
+// CHECK: #pragma acc serial default(present)
+// CHECK-NEXT: while (true)
+#pragma acc serial default(present)
+  while(true);
+// CHECK: #pragma acc kernels if(i == array[1])
+// CHECK-NEXT: while (true)
+#pragma acc kernels if(i == array[1])
+  while(true);
+// CHECK: #pragma acc parallel self(i == 3)
+// CHECK-NEXT: while (true)
+#pragma acc parallel self(i == 3)
+  while(true);
+
+// CHECK: #pragma acc parallel num_gangs(i, (int)array[2])
+// CHECK-NEXT: while (true)
+#pragma acc parallel num_gangs(i, (int)array[2])
+  while(true);
+
+// CHECK: #pragma acc parallel num_workers(i)
+// CHECK-NEXT: while (true)
+#pragma acc parallel num_workers(i)
+  while(true);
+
+// CHECK: #pragma acc parallel vector_length((int)array[1])
+// CHECK-NEXT: while (true)
+#pragma acc parallel vector_length((int)array[1])
+  while(true);
+
+// CHECK: #pragma acc parallel private(i, array[1], array, array[1:2])
+#pragma acc parallel private(i, array[1], array, array[1:2])
+  while(true);
+}
+
diff --git a/clang/test/Analysis/Checkers/WebKit/call-args-regression-traverse-decl-crash.cpp b/clang/test/Analysis/Checkers/WebKit/call-args-regression-traverse-decl-crash.cpp
new file mode 100644
index 000000000000..3d8e822025f6
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/call-args-regression-traverse-decl-crash.cpp
@@ -0,0 +1,7 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
+// expected-no-diagnostics
+
+template <class Class> struct T;
+template <template <class> class Class, class Type>
+struct T<Class<Type>>
+{ };
diff --git a/clang/test/Analysis/Checkers/WebKit/call-args.cpp b/clang/test/Analysis/Checkers/WebKit/call-args.cpp
index f2e1f9bc5a24..2a4b6bb1f106 100644
--- a/clang/test/Analysis/Checkers/WebKit/call-args.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/call-args.cpp
@@ -32,7 +32,7 @@ namespace ref_counted {
   void consume_ref_counted(Ref<RefCountable>) {}
 
   void foo() {
-    consume_refcntbl(provide_ref_counted().get());
+    consume_refcntbl(provide_ref_counted().ptr());
     // no warning
   }
 }
diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h
index aab99197dfa4..c27ea9baaf3b 100644
--- a/clang/test/Analysis/Checkers/WebKit/mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h
@@ -1,24 +1,61 @@
 #ifndef mock_types_1103988513531
 #define mock_types_1103988513531
 
-template <typename T> struct Ref {
-  T *t;
+template<typename T>
+struct RawPtrTraits {
+  using StorageType = T*;
 
-  Ref() : t{} {};
-  Ref(T &t)
-    : t(t) {
-    if (t)
-      t->ref();
+  template<typename U>
+  static T* exchange(StorageType& ptr, U&& newValue)
+  {
+    StorageType oldValue = static_cast<StorageType&&>(ptr);
+    ptr = static_cast<U&&>(newValue);
+    return oldValue;
   }
-  ~Ref() {
-    if (t)
-      t->deref();
+
+  static void swap(StorageType& a, StorageType& b)
+  {
+    StorageType temp = static_cast<StorageType&&>(a);
+    a = static_cast<StorageType&&>(b);
+    b = static_cast<StorageType&&>(temp);
   }
-  T *get() { return t; }
-  T *ptr() { return t; }
-  T *operator->() { return t; }
-  operator const T &() const { return *t; }
-  operator T &() { return *t; }
+  static T* unwrap(const StorageType& ptr) { return ptr; }
+};
+
+template<typename T> struct DefaultRefDerefTraits {
+  static T* refIfNotNull(T* ptr)
+  {
+    if (ptr)
+      ptr->ref();
+    return ptr;
+  }
+
+  static T& ref(T& ref)
+  {
+    ref.ref();
+    return ref;
+  }
+
+  static void derefIfNotNull(T* ptr)
+  {
+    if (ptr)
+      ptr->deref();
+  }
+};
+
+template <typename T, typename PtrTraits = RawPtrTraits<T>, typename RefDerefTraits = DefaultRefDerefTraits<T>> struct Ref {
+  typename PtrTraits::StorageType t;
+
+  Ref() : t{} {};
+  Ref(T &t) : t(RefDerefTraits::refIfNotNull(t)) { }
+  Ref(const Ref& o) : t(RefDerefTraits::refIfNotNull(PtrTraits::unwrap(o.t))) { }
+  ~Ref() { RefDerefTraits::derefIfNotNull(PtrTraits::exchange(t, nullptr)); }
+  T &get() { return *PtrTraits::unwrap(t); }
+  T *ptr() { return PtrTraits::unwrap(t); }
+  T *operator->() { return PtrTraits::unwrap(t); }
+  operator const T &() const { return *PtrTraits::unwrap(t); }
+  operator T &() { return *PtrTraits::unwrap(t); }
+  T* leakRef() { PtrTraits::exchange(t, nullptr); }
 };
 
 template <typename T> struct RefPtr {
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
index 80a9a263dab1..63a68a994a5c 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
@@ -201,6 +201,13 @@ public:
   unsigned trivial25() const { return __c11_atomic_load((volatile _Atomic(unsigned) *)&v, __ATOMIC_RELAXED); }
   bool trivial26() { bool hasValue = v; return !hasValue; }
   bool trivial27(int v) { bool value; value = v ? 1 : 0; return value; }
+  bool trivial28() { return true; }
+  bool trivial29() { return false; }
+  unsigned trivial30(unsigned v) { unsigned r = 0xff; r |= v; return r; }
+  int trivial31(int* v) { return v[0]; }
+  unsigned trivial32() { return sizeof(int); }
+  unsigned trivial33() { return ~0xff; }
+  template <unsigned v> unsigned trivial34() { return v; }
 
   static RefCounted& singleton() {
     static RefCounted s_RefCounted;
@@ -273,6 +280,9 @@ public:
     return val;
   }
 
+  int nonTrivial13() { return ~otherFunction(); }
+  int nonTrivial14() { int r = 0xff; r |= otherFunction(); return r; }
+
   unsigned v { 0 };
   Number* number { nullptr };
   Enum enumValue { Enum::Value1 };
@@ -322,6 +332,15 @@ public:
     getFieldTrivial().trivial25(); // no-warning
     getFieldTrivial().trivial26(); // no-warning
     getFieldTrivial().trivial27(5); // no-warning
+    getFieldTrivial().trivial28(); // no-warning
+    getFieldTrivial().trivial29(); // no-warning
+    getFieldTrivial().trivial30(7); // no-warning
+    int a[] = {1, 2};
+    getFieldTrivial().trivial31(a); // no-warning
+    getFieldTrivial().trivial32(); // no-warning
+    getFieldTrivial().trivial33(); // no-warning
+    getFieldTrivial().trivial34<7>(); // no-warning
+
     RefCounted::singleton().trivial18(); // no-warning
     RefCounted::singleton().someFunction(); // no-warning
 
@@ -351,6 +370,10 @@ public:
     // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
     getFieldTrivial().nonTrivial12();
     // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+    getFieldTrivial().nonTrivial13();
+    // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+    getFieldTrivial().nonTrivial14();
+    // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
   }
 };
 
diff --git a/clang/test/Analysis/analyzer-config.c b/clang/test/Analysis/analyzer-config.c
index 23e37a856d09..fda920fa3951 100644
--- a/clang/test/Analysis/analyzer-config.c
+++ b/clang/test/Analysis/analyzer-config.c
@@ -12,7 +12,6 @@
 // CHECK-NEXT: alpha.security.MmapWriteExec:MmapProtExec = 0x04
 // CHECK-NEXT: alpha.security.MmapWriteExec:MmapProtRead = 0x01
 // CHECK-NEXT: alpha.security.taint.TaintPropagation:Config = ""
-// CHECK-NEXT: alpha.unix.Stream:Pedantic = false
 // CHECK-NEXT: apply-fixits = false
 // CHECK-NEXT: assume-controlled-environment = false
 // CHECK-NEXT: avoid-suppressing-null-argument-paths = false
@@ -131,6 +130,7 @@
 // CHECK-NEXT: unix.Errno:AllowErrnoReadOutsideConditionExpressions = true
 // CHECK-NEXT: unix.StdCLibraryFunctions:DisplayLoadedSummaries = false
 // CHECK-NEXT: unix.StdCLibraryFunctions:ModelPOSIX = true
+// CHECK-NEXT: unix.Stream:Pedantic = false
 // CHECK-NEXT: unroll-loops = false
 // CHECK-NEXT: verbose-report-filename = false
 // CHECK-NEXT: widen-loops = false
diff --git a/clang/test/Analysis/analyzer-enabled-checkers.c b/clang/test/Analysis/analyzer-enabled-checkers.c
index 2c4e34f4990b..9543ba8ec02f 100644
--- a/clang/test/Analysis/analyzer-enabled-checkers.c
+++ b/clang/test/Analysis/analyzer-enabled-checkers.c
@@ -48,6 +48,7 @@
 // CHECK-NEXT: unix.Malloc
 // CHECK-NEXT: unix.MallocSizeof
 // CHECK-NEXT: unix.MismatchedDeallocator
+// CHECK-NEXT: unix.Stream
 // CHECK-NEXT: unix.StdCLibraryFunctions
 // CHECK-NEXT: unix.Vfork
 // CHECK-NEXT: unix.cstring.BadSizeArg
diff --git a/clang/test/Analysis/html_diagnostics/relevant_lines/multifile.c b/clang/test/Analysis/html_diagnostics/relevant_lines/multifile.c
index 3abffd609b5b..1998c9383d9d 100644
--- a/clang/test/Analysis/html_diagnostics/relevant_lines/multifile.c
+++ b/clang/test/Analysis/html_diagnostics/relevant_lines/multifile.c
@@ -11,4 +11,4 @@ int f(int coin) {
 // RUN: rm -rf %t.output
 // RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
 // RUN: cat %t.output/* | FileCheck %s --match-full-lines
-// CHECK: var relevant_lines = {"1": {"3": 1, "4": 1, "5": 1, "6": 1}, "3": {"3": 1, "4": 1, "5": 1, "6": 1, "7": 1}};
+// CHECK: var relevant_lines = {"1": {"3": 1, "4": 1, "5": 1, "6": 1}, "4": {"3": 1, "4": 1, "5": 1, "6": 1, "7": 1}};
diff --git a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
index d4721c0a59a3..14aca5a948bf 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
@@ -6,7 +6,7 @@
 // RUN:   -Xclang -analyzer-checker=unix.StdCLibraryFunctions \
 // RUN:   -Xclang -analyzer-config \
 // RUN:      -Xclang unix.StdCLibraryFunctions:ModelPOSIX=true \
-// RUN:   -Xclang -analyzer-checker=alpha.unix.Stream \
+// RUN:   -Xclang -analyzer-checker=unix.Stream \
 // RUN:   -Xclang -analyzer-list-enabled-checkers \
 // RUN:   -Xclang -analyzer-display-progress \
 // RUN:   2>&1 | FileCheck %s --implicit-check-not=ANALYZE \
@@ -14,8 +14,6 @@
 
 // CHECK:      OVERVIEW: Clang Static Analyzer Enabled Checkers List
 // CHECK-EMPTY:
-// CHECK-NEXT: core.NonNullParamChecker
-// CHECK-NEXT: alpha.unix.Stream
 // CHECK-NEXT: apiModeling.Errno
 // CHECK-NEXT: apiModeling.TrustNonnull
 // CHECK-NEXT: apiModeling.TrustReturnsNonnull
@@ -26,6 +24,7 @@
 // CHECK-NEXT: core.CallAndMessage
 // CHECK-NEXT: core.DivideZero
 // CHECK-NEXT: core.DynamicTypePropagation
+// CHECK-NEXT: core.NonNullParamChecker
 // CHECK-NEXT: core.NonnilStringConstants
 // CHECK-NEXT: core.NullDereference
 // CHECK-NEXT: core.StackAddrEscapeBase
@@ -57,6 +56,7 @@
 // CHECK-NEXT: unix.Malloc
 // CHECK-NEXT: unix.MallocSizeof
 // CHECK-NEXT: unix.MismatchedDeallocator
+// CHECK-NEXT: unix.Stream
 // CHECK-NEXT: unix.StdCLibraryFunctions
 // CHECK-NEXT: unix.Vfork
 // CHECK-NEXT: unix.cstring.BadSizeArg
diff --git a/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c b/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c
index 5df5a770015b..1f0d3627fae3 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c
@@ -3,7 +3,7 @@
 
 // RUN: %clang_analyze_cc1 %s \
 // RUN:   -analyzer-checker=core \
-// RUN:   -analyzer-checker=alpha.unix.Stream \
+// RUN:   -analyzer-checker=unix.Stream \
 // RUN:   -analyzer-checker=unix.StdCLibraryFunctions \
 // RUN:   -analyzer-config unix.StdCLibraryFunctions:ModelPOSIX=true \
 // RUN:   -triple x86_64-unknown-linux-gnu \
@@ -57,5 +57,5 @@ void test_notnull_arg(FILE *F) {
 
 void test_notnull_stream_arg(void) {
   fileno(0); // \
-  // expected-warning{{Stream pointer might be NULL [alpha.unix.Stream]}}
+  // expected-warning{{Stream pointer might be NULL [unix.Stream]}}
 }
diff --git a/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c
index cac3fe5c5151..b99cc30149c9 100644
--- a/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c
+++ b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c
@@ -1,7 +1,7 @@
 // Check the case when only the StreamChecker is enabled.
 // RUN: %clang_analyze_cc1 %s \
-// RUN:   -analyzer-checker=core,alpha.unix.Stream \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true \
+// RUN:   -analyzer-checker=core,unix.Stream \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true \
 // RUN:   -analyzer-checker=debug.ExprInspection \
 // RUN:   -analyzer-config eagerly-assume=false \
 // RUN:   -triple x86_64-unknown-linux \
@@ -19,8 +19,8 @@
 // Check the case when both the StreamChecker and the
 // StdLibraryFunctionsChecker are enabled.
 // RUN: %clang_analyze_cc1 %s \
-// RUN:   -analyzer-checker=core,alpha.unix.Stream \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true \
+// RUN:   -analyzer-checker=core,unix.Stream \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true \
 // RUN:   -analyzer-checker=unix.StdCLibraryFunctions \
 // RUN:   -analyzer-config unix.StdCLibraryFunctions:DisplayLoadedSummaries=true \
 // RUN:   -analyzer-checker=debug.ExprInspection \
diff --git a/clang/test/Analysis/stream-errno-note.c b/clang/test/Analysis/stream-errno-note.c
index fb12f0bace93..71ea026ed4de 100644
--- a/clang/test/Analysis/stream-errno-note.c
+++ b/clang/test/Analysis/stream-errno-note.c
@@ -1,6 +1,6 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core \
-// RUN:   -analyzer-checker=alpha.unix.Stream \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true \
+// RUN:   -analyzer-checker=unix.Stream \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true \
 // RUN:   -analyzer-checker=unix.Errno \
 // RUN:   -analyzer-checker=unix.StdCLibraryFunctions \
 // RUN:   -analyzer-config unix.StdCLibraryFunctions:ModelPOSIX=true \
diff --git a/clang/test/Analysis/stream-errno.c b/clang/test/Analysis/stream-errno.c
index 08382eaf6abf..b28cc301a4ec 100644
--- a/clang/test/Analysis/stream-errno.c
+++ b/clang/test/Analysis/stream-errno.c
@@ -1,5 +1,5 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream,unix.Errno,unix.StdCLibraryFunctions,debug.ExprInspection \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true \
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Stream,unix.Errno,unix.StdCLibraryFunctions,debug.ExprInspection \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true \
 // RUN:   -analyzer-config unix.StdCLibraryFunctions:ModelPOSIX=true -verify %s
 
 #include "Inputs/system-header-simulator.h"
diff --git a/clang/test/Analysis/stream-error.c b/clang/test/Analysis/stream-error.c
index 2abf4b900a04..3f791d133464 100644
--- a/clang/test/Analysis/stream-error.c
+++ b/clang/test/Analysis/stream-error.c
@@ -1,7 +1,7 @@
 // RUN: %clang_analyze_cc1 -verify %s \
 // RUN: -analyzer-checker=core \
-// RUN: -analyzer-checker=alpha.unix.Stream \
-// RUN: -analyzer-config alpha.unix.Stream:Pedantic=true \
+// RUN: -analyzer-checker=unix.Stream \
+// RUN: -analyzer-config unix.Stream:Pedantic=true \
 // RUN: -analyzer-checker=debug.StreamTester \
 // RUN: -analyzer-checker=debug.ExprInspection
 
diff --git a/clang/test/Analysis/stream-invalidate.c b/clang/test/Analysis/stream-invalidate.c
index 5046a356d058..749c53d164fb 100644
--- a/clang/test/Analysis/stream-invalidate.c
+++ b/clang/test/Analysis/stream-invalidate.c
@@ -1,6 +1,6 @@
 // RUN: %clang_analyze_cc1 -verify %s \
 // RUN: -analyzer-checker=core \
-// RUN: -analyzer-checker=alpha.unix.Stream \
+// RUN: -analyzer-checker=unix.Stream \
 // RUN: -analyzer-checker=debug.ExprInspection
 
 #include "Inputs/system-header-simulator.h"
diff --git a/clang/test/Analysis/stream-non-posix-function.c b/clang/test/Analysis/stream-non-posix-function.c
index 091d95a573dd..b9ece31cde15 100644
--- a/clang/test/Analysis/stream-non-posix-function.c
+++ b/clang/test/Analysis/stream-non-posix-function.c
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -fno-builtin -analyzer-checker=core,alpha.unix.Stream -verify %s
+// RUN: %clang_analyze_cc1 -fno-builtin -analyzer-checker=core,unix.Stream -verify %s
 // RUN: %clang_analyze_cc1 -fno-builtin -analyzer-checker=core,alpha.unix.SimpleStream -verify %s
 
 // expected-no-diagnostics
diff --git a/clang/test/Analysis/stream-noopen.c b/clang/test/Analysis/stream-noopen.c
index 644c699d05e2..87761b3afb76 100644
--- a/clang/test/Analysis/stream-noopen.c
+++ b/clang/test/Analysis/stream-noopen.c
@@ -1,7 +1,7 @@
 // RUN: %clang_analyze_cc1 -verify %s \
 // RUN:   -analyzer-checker=core \
 // RUN:   -analyzer-checker=unix.Errno \
-// RUN:   -analyzer-checker=alpha.unix.Stream \
+// RUN:   -analyzer-checker=unix.Stream \
 // RUN:   -analyzer-checker=unix.StdCLibraryFunctions \
 // RUN:   -analyzer-config unix.StdCLibraryFunctions:ModelPOSIX=true \
 // RUN:   -analyzer-checker=debug.ExprInspection
diff --git a/clang/test/Analysis/stream-note.c b/clang/test/Analysis/stream-note.c
index 03a8ff4e468f..3aef707d5005 100644
--- a/clang/test/Analysis/stream-note.c
+++ b/clang/test/Analysis/stream-note.c
@@ -1,8 +1,8 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream -analyzer-output text \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true \
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Stream -analyzer-output text \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true \
 // RUN:   -verify %s
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream,unix.StdCLibraryFunctions -analyzer-output text \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true \
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Stream,unix.StdCLibraryFunctions -analyzer-output text \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true \
 // RUN:   -analyzer-config unix.StdCLibraryFunctions:ModelPOSIX=true -verify=expected,stdargs %s
 
 #include "Inputs/system-header-simulator.h"
diff --git a/clang/test/Analysis/stream-pedantic.c b/clang/test/Analysis/stream-pedantic.c
index 2bbea81d47ef..2a3dff867890 100644
--- a/clang/test/Analysis/stream-pedantic.c
+++ b/clang/test/Analysis/stream-pedantic.c
@@ -1,8 +1,8 @@
-// RUN: %clang_analyze_cc1 -triple=x86_64-pc-linux-gnu -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=false -verify=nopedantic %s
+// RUN: %clang_analyze_cc1 -triple=x86_64-pc-linux-gnu -analyzer-checker=core,unix.Stream,debug.ExprInspection \
+// RUN:   -analyzer-config unix.Stream:Pedantic=false -verify=nopedantic %s
 
-// RUN: %clang_analyze_cc1 -triple=x86_64-pc-linux-gnu -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true -verify=pedantic %s
+// RUN: %clang_analyze_cc1 -triple=x86_64-pc-linux-gnu -analyzer-checker=core,unix.Stream,debug.ExprInspection \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true -verify=pedantic %s
 
 #include "Inputs/system-header-simulator.h"
 
diff --git a/clang/test/Analysis/stream-stdlibraryfunctionargs.c b/clang/test/Analysis/stream-stdlibraryfunctionargs.c
index 2ea6a8c472c6..c3a6302f9c7a 100644
--- a/clang/test/Analysis/stream-stdlibraryfunctionargs.c
+++ b/clang/test/Analysis/stream-stdlibraryfunctionargs.c
@@ -1,13 +1,13 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream,unix.StdCLibraryFunctions,debug.ExprInspection \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true \
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Stream,unix.StdCLibraryFunctions,debug.ExprInspection \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true \
 // RUN:   -analyzer-config unix.StdCLibraryFunctions:ModelPOSIX=true -verify=stream,any %s
 
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true \
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Stream,debug.ExprInspection \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true \
 // RUN:   -analyzer-config unix.StdCLibraryFunctions:ModelPOSIX=true -verify=stream,any %s
 
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.StdCLibraryFunctions,debug.ExprInspection \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true \
 // RUN:   -analyzer-config unix.StdCLibraryFunctions:ModelPOSIX=true -verify=stdfunc,any %s
 
 #include "Inputs/system-header-simulator.h"
diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c
index 93ed555c89eb..db03d90cd8af 100644
--- a/clang/test/Analysis/stream.c
+++ b/clang/test/Analysis/stream.c
@@ -1,11 +1,11 @@
-// RUN: %clang_analyze_cc1 -triple=x86_64-pc-linux-gnu -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true -verify %s
-// RUN: %clang_analyze_cc1 -triple=armv8-none-linux-eabi -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true -verify %s
-// RUN: %clang_analyze_cc1 -triple=aarch64-linux-gnu -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true -verify %s
-// RUN: %clang_analyze_cc1 -triple=hexagon -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection \
-// RUN:   -analyzer-config alpha.unix.Stream:Pedantic=true -verify %s
+// RUN: %clang_analyze_cc1 -triple=x86_64-pc-linux-gnu -analyzer-checker=core,unix.Stream,debug.ExprInspection \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true -verify %s
+// RUN: %clang_analyze_cc1 -triple=armv8-none-linux-eabi -analyzer-checker=core,unix.Stream,debug.ExprInspection \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true -verify %s
+// RUN: %clang_analyze_cc1 -triple=aarch64-linux-gnu -analyzer-checker=core,unix.Stream,debug.ExprInspection \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true -verify %s
+// RUN: %clang_analyze_cc1 -triple=hexagon -analyzer-checker=core,unix.Stream,debug.ExprInspection \
+// RUN:   -analyzer-config unix.Stream:Pedantic=true -verify %s
 
 #include "Inputs/system-header-simulator.h"
 #include "Inputs/system-header-simulator-for-malloc.h"
diff --git a/clang/test/Analysis/stream.cpp b/clang/test/Analysis/stream.cpp
index 7eca505bcaf5..8c65d9c287fd 100644
--- a/clang/test/Analysis/stream.cpp
+++ b/clang/test/Analysis/stream.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Stream -verify %s
 
 typedef struct _IO_FILE FILE;
 extern FILE *fopen(const char *path, const char *mode);
diff --git a/clang/test/CXX/class/class.mem/class.mem.general/p8.cpp b/clang/test/CXX/class/class.mem/class.mem.general/p8.cpp
new file mode 100644
index 000000000000..8cc9b41eaca9
--- /dev/null
+++ b/clang/test/CXX/class/class.mem/class.mem.general/p8.cpp
@@ -0,0 +1,78 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+namespace N0 {
+  struct A {
+    void f0() noexcept(x);
+    void g0() noexcept(y); // expected-error {{use of undeclared identifier 'y'}}
+
+    void f1() noexcept(A::x);
+    void g1() noexcept(A::y); // expected-error {{no member named 'y' in 'N0::A'}}
+
+    template<typename T>
+    void f2() noexcept(x);
+    template<typename T>
+    void g2() noexcept(y); // expected-error {{use of undeclared identifier 'y'}}
+
+    template<typename T>
+    void f3() noexcept(A::x);
+    template<typename T>
+    void g3() noexcept(A::y); // expected-error {{no member named 'y' in 'N0::A'}}
+
+    friend void f4() noexcept(x);
+    friend void g4() noexcept(y); // expected-error {{use of undeclared identifier 'y'}}
+
+    friend void f5() noexcept(A::x);
+    friend void g5() noexcept(A::y); // expected-error {{no member named 'y' in 'N0::A'}}
+
+    template<typename T>
+    friend void f6() noexcept(x);
+    template<typename T>
+    friend void g6() noexcept(y); // expected-error {{use of undeclared identifier 'y'}}
+
+    template<typename T>
+    friend void f7() noexcept(A::x);
+    template<typename T>
+    friend void g7() noexcept(A::y); // expected-error {{no member named 'y' in 'N0::A'}}
+
+    static constexpr bool x = true;
+  };
+} // namespace N0
+
+namespace N1 {
+  template<typename T>
+  struct A {
+    void f0() noexcept(x);
+    void g0() noexcept(y); // expected-error {{use of undeclared identifier 'y'}}
+
+    void f1() noexcept(A::x);
+    void g1() noexcept(A::y); // expected-error {{no member named 'y' in 'A<T>'}}
+
+    template<typename U>
+    void f2() noexcept(x);
+    template<typename U>
+    void g2() noexcept(y); // expected-error {{use of undeclared identifier 'y'}}
+
+    template<typename U>
+    void f3() noexcept(A::x);
+    template<typename U>
+    void g3() noexcept(A::y); // expected-error {{no member named 'y' in 'A<T>'}}
+
+    friend void f4() noexcept(x);
+    friend void g4() noexcept(y); // expected-error {{use of undeclared identifier 'y'}}
+
+    friend void f5() noexcept(A::x);
+    friend void g5() noexcept(A::y); // expected-error {{no member named 'y' in 'A<T>'}}
+
+    template<typename U>
+    friend void f6() noexcept(x);
+    template<typename U>
+    friend void g6() noexcept(y); // expected-error {{use of undeclared identifier 'y'}}
+
+    template<typename U>
+    friend void f7() noexcept(A::x);
+    template<typename U>
+    friend void g7() noexcept(A::y); // expected-error {{no member named 'y' in 'A<T>'}}
+
+    static constexpr bool x = true;
+  };
+} // namespace N1
diff --git a/clang/test/CXX/drs/cwg2149.cpp b/clang/test/CXX/drs/cwg2149.cpp
new file mode 100644
index 000000000000..8416e42cbd69
--- /dev/null
+++ b/clang/test/CXX/drs/cwg2149.cpp
@@ -0,0 +1,77 @@
+// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes CXX98
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+
+#if __cplusplus == 199711L
+#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__)
+// cxx98-error@-1 {{variadic macros are a C99 feature}}
+#endif
+
+namespace cwg2149 { // cwg2149: 3.1
+#if __cplusplus <= 201103L
+struct X { int i, j, k; };
+#else
+struct X { int i, j, k = 42; };
+#endif
+
+template<int N> 
+void f1(const X(&)[N]); // #cwg2149-f1
+
+template<int N>
+void f2(const X(&)[N][2]); // #cwg2149-f2
+
+void f() {
+  X a[] = { 1, 2, 3, 4, 5, 6 };
+  static_assert(sizeof(a) / sizeof(X) == 2, "");
+  X b[2] = { { 1, 2, 3 }, { 4, 5, 6 } };
+  X c[][2] = { 1, 2, 3, 4, 5, 6 };
+  static_assert(sizeof(c) / sizeof(X[2]) == 1, "");
+  
+  #if __cplusplus >= 201103L
+  constexpr X ca[] = { 1, 2, 3, 4, 5, 6 };
+  constexpr X cb[2] = { { 1, 2, 3 }, { 4, 5, 6 } };
+  static_assert(ca[0].i == cb[0].i, "");
+  static_assert(ca[0].j == cb[0].j, "");
+  static_assert(ca[0].k == cb[0].k, "");
+  static_assert(ca[1].i == cb[1].i, "");
+  static_assert(ca[1].j == cb[1].j, "");
+  static_assert(ca[1].k == cb[1].k, "");
+
+  f1({ 1, 2, 3, 4, 5, 6 });
+  // since-cxx11-error@-1 {{no matching function for call to 'f1'}}
+  //   since-cxx11-note@#cwg2149-f1 {{candidate function [with N = 6] not viable: no known conversion from 'int' to 'const X' for 1st argument}}
+  f2({ 1, 2, 3, 4, 5, 6 });
+  // since-cxx11-error@-1 {{no matching function for call to 'f2'}}
+  //   since-cxx11-note@#cwg2149-f2 {{candidate function [with N = 6] not viable: no known conversion from 'int' to 'const X[2]' for 1st argument}}
+  #endif
+}
+} // namespace cwg2149
+
+// Constant evaluation is not powerful enough in 98 mode to check for equality
+// via static_assert, even with constant folding enabled.
+
+// CXX98:       VarDecl {{.+}} a 'X[2]'
+// CXX98-NEXT:  `-InitListExpr {{.+}} 'X[2]'
+// CXX98-NEXT:    |-InitListExpr {{.+}} 'X':'cwg2149::X'
+// CXX98-NEXT:    | |-IntegerLiteral {{.+}} 'int' 1
+// CXX98-NEXT:    | |-IntegerLiteral {{.+}} 'int' 2
+// CXX98-NEXT:    | `-IntegerLiteral {{.+}} 'int' 3
+// CXX98-NEXT:    `-InitListExpr {{.+}} 'X':'cwg2149::X'
+// CXX98-NEXT:      |-IntegerLiteral {{.+}} 'int' 4
+// CXX98-NEXT:      |-IntegerLiteral {{.+}} 'int' 5
+// CXX98-NEXT:      `-IntegerLiteral {{.+}} 'int' 6
+
+// CXX98:       VarDecl {{.+}} b 'X[2]'
+// CXX98-NEXT:  `-InitListExpr {{.+}} 'X[2]'
+// CXX98-NEXT:    |-InitListExpr {{.+}} 'X':'cwg2149::X'
+// CXX98-NEXT:    | |-IntegerLiteral {{.+}} 'int' 1
+// CXX98-NEXT:    | |-IntegerLiteral {{.+}} 'int' 2
+// CXX98-NEXT:    | `-IntegerLiteral {{.+}} 'int' 3
+// CXX98-NEXT:    `-InitListExpr {{.+}} 'X':'cwg2149::X'
+// CXX98-NEXT:      |-IntegerLiteral {{.+}} 'int' 4
+// CXX98-NEXT:      |-IntegerLiteral {{.+}} 'int' 5
+// CXX98-NEXT:      `-IntegerLiteral {{.+}} 'int' 6
diff --git a/clang/test/CXX/drs/cwg650.cpp b/clang/test/CXX/drs/cwg650.cpp
index dcb844095b05..33ea179986e3 100644
--- a/clang/test/CXX/drs/cwg650.cpp
+++ b/clang/test/CXX/drs/cwg650.cpp
@@ -4,7 +4,7 @@
 // RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
 // RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
 // RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// We aren't testing this since C++26 because of P2748R5 "Disallow Binding a Returned Glvalue to a Temporary".
 
 #if __cplusplus == 199711L
 #define NOTHROW throw()
diff --git a/clang/test/CXX/drs/dr20xx.cpp b/clang/test/CXX/drs/dr20xx.cpp
index 291a77e0cc71..9797097acce7 100644
--- a/clang/test/CXX/drs/dr20xx.cpp
+++ b/clang/test/CXX/drs/dr20xx.cpp
@@ -90,7 +90,7 @@ namespace cwg2026 { // cwg2026: 11
   }
 }
 
-namespace cwg2049 { // cwg2049: 18 drafting P2308R1
+namespace cwg2049 { // cwg2049: 18
 #if __cplusplus >= 202302L
 template <int* x = {}> struct X {};
 X<> a;
diff --git a/clang/test/CXX/drs/dr21xx.cpp b/clang/test/CXX/drs/dr21xx.cpp
index 4fab10c279aa..082deb42e4fa 100644
--- a/clang/test/CXX/drs/dr21xx.cpp
+++ b/clang/test/CXX/drs/dr21xx.cpp
@@ -175,6 +175,8 @@ void foo() {
 }
 }
 
+// cwg2149 is in cwg2149.cpp
+
 namespace cwg2157 { // cwg2157: 11
 #if __cplusplus >= 201103L
   enum E : int;
diff --git a/clang/test/CXX/drs/dr24xx.cpp b/clang/test/CXX/drs/dr24xx.cpp
index 5ffaebda68c1..9f876cd87083 100644
--- a/clang/test/CXX/drs/dr24xx.cpp
+++ b/clang/test/CXX/drs/dr24xx.cpp
@@ -45,7 +45,7 @@ void fallthrough(int n) {
 #endif
 }
 
-namespace cwg2450 { // cwg2450: 18 review P2308R1
+namespace cwg2450 { // cwg2450: 18
 #if __cplusplus >= 202302L
 struct S {int a;};
 template <S s>
@@ -59,7 +59,7 @@ f<{.a= 0}>();
 #endif
 }
 
-namespace cwg2459 { // cwg2459: 18 drafting P2308R1
+namespace cwg2459 { // cwg2459: 18
 #if __cplusplus >= 202302L
 struct A {
   constexpr A(float) {}
diff --git a/clang/test/CXX/drs/dr25xx.cpp b/clang/test/CXX/drs/dr25xx.cpp
index 62b2a0a088cc..8bca58f44944 100644
--- a/clang/test/CXX/drs/dr25xx.cpp
+++ b/clang/test/CXX/drs/dr25xx.cpp
@@ -130,12 +130,14 @@ struct D3 : B {
 #endif
 
 #if __cplusplus >= 202302L
-namespace cwg2561 { // cwg2561: 18 review 2023-11-09
+namespace cwg2561 { // cwg2561: no tentatively ready 2024-03-18
 struct C {
     constexpr C(auto) { }
 };
 void foo() {
     constexpr auto b = [](this C) { return 1; };
+    // FIXME: closure type shouldn't have a conversion function to function
+    //        pointer, because explicit object parameter is present. 
     constexpr int (*fp)(C) = b;
     static_assert(fp(1) == 1);
     static_assert((&decltype(b)::operator())(1) == 1);
diff --git a/clang/test/CXX/drs/dr28xx.cpp b/clang/test/CXX/drs/dr28xx.cpp
index 4d9b0c76758d..be35d366bdd6 100644
--- a/clang/test/CXX/drs/dr28xx.cpp
+++ b/clang/test/CXX/drs/dr28xx.cpp
@@ -10,7 +10,15 @@
 // expected-no-diagnostics
 #endif
 
-namespace cwg2847 { // cwg2847: 19
+namespace cwg2819 { // cwg2819: 19 tentatively ready 2023-12-01
+#if __cpp_constexpr >= 202306L
+  constexpr void* p = nullptr;
+  constexpr int* q = static_cast<int*>(p);
+  static_assert(q == nullptr);
+#endif
+}
+
+namespace cwg2847 { // cwg2847: 19 review 2024-03-01
 
 #if __cplusplus >= 202002L
 
@@ -59,7 +67,7 @@ void B<int>::g() requires true;
 
 } // namespace cwg2847
 
-namespace cwg2858 { // cwg2858: 19
+namespace cwg2858 { // cwg2858: 19 tentatively ready 2024-04-05
 
 #if __cplusplus > 202302L
 
diff --git a/clang/test/CXX/drs/dr2xx.cpp b/clang/test/CXX/drs/dr2xx.cpp
index 5d3e8ce4bea3..2b3131be3305 100644
--- a/clang/test/CXX/drs/dr2xx.cpp
+++ b/clang/test/CXX/drs/dr2xx.cpp
@@ -561,9 +561,9 @@ namespace cwg244 { // cwg244: 11
     B_ptr->B_alias::~B();
     B_ptr->B_alias::~B_alias();
     B_ptr->cwg244::~B();
-    // expected-error@-1 {{qualified member access refers to a member in namespace 'cwg244'}}
+    // expected-error@-1 {{no member named '~B' in namespace 'cwg244'}}
     B_ptr->cwg244::~B_alias();
-    // expected-error@-1 {{qualified member access refers to a member in namespace 'cwg244'}}
+    // expected-error@-1 {{no member named '~B' in namespace 'cwg244'}}
   }
 
   template<typename T, typename U>
@@ -836,7 +836,7 @@ namespace cwg258 { // cwg258: 2.8
 
 namespace cwg259 { // cwg259: 4
   template<typename T> struct A {};
-  template struct A<int>; // #cwg259-A-int 
+  template struct A<int>; // #cwg259-A-int
   template struct A<int>;
   // expected-error@-1 {{duplicate explicit instantiation of 'A<int>'}}
   //   expected-note@#cwg259-A-int {{previous explicit instantiation is here}}
@@ -997,7 +997,7 @@ namespace cwg275 { // cwg275: no
     // expected-error@-1 {{no function template matches function template specialization 'f'}}
   }
 
-  template <class T> void g(T) {} // #cwg275-g 
+  template <class T> void g(T) {} // #cwg275-g
 
   template <> void N::f(char) {}
   template <> void f(int) {}
@@ -1164,7 +1164,7 @@ namespace cwg285 { // cwg285: yes
 namespace cwg286 { // cwg286: 2.8
   template<class T> struct A {
     class C {
-      template<class T2> struct B {}; // #cwg286-B 
+      template<class T2> struct B {}; // #cwg286-B
     };
   };
 
diff --git a/clang/test/CXX/drs/dr3xx.cpp b/clang/test/CXX/drs/dr3xx.cpp
index 3e9228fe21fb..94227dc031c6 100644
--- a/clang/test/CXX/drs/dr3xx.cpp
+++ b/clang/test/CXX/drs/dr3xx.cpp
@@ -34,7 +34,7 @@ namespace cwg301 { // cwg301: 3.5
     bool b = (void(*)(S, S))operator- < (void(*)(S, S))operator-;
     // cxx98-17-warning@-1 {{ordered comparison of function pointers ('void (*)(S, S)' and 'void (*)(S, S)')}}
     // cxx20-23-error@-2 {{expected '>'}}
-    //   cxx20-23-note@-3 {{to match this '<'}} 
+    //   cxx20-23-note@-3 {{to match this '<'}}
     bool c = (void(*)(S, S))operator+ < (void(*)(S, S))operator-;
     // expected-error@-1 {{expected '>'}}
     //   expected-note@-2 {{to match this '<'}}
@@ -642,7 +642,7 @@ namespace cwg339 { // cwg339: 2.8
   char xxx(int);
   char (&xxx(float))[2];
 
-  template<class T> A<sizeof(xxx((T)0))> f(T) {} // #cwg339-f 
+  template<class T> A<sizeof(xxx((T)0))> f(T) {} // #cwg339-f
 
   void test() {
     A<1> a = f(0);
@@ -828,7 +828,7 @@ namespace cwg352 { // cwg352: 2.8
     void g(A::E e) {
       foo(e, &arg);
       // expected-error@-1 {{no matching function for call to 'foo'}}
-      //   expected-note@#cwg352-foo {{candidate template ignored: couldn't infer template argument 'R'}} 
+      //   expected-note@#cwg352-foo {{candidate template ignored: couldn't infer template argument 'R'}}
 
       using A::foo;
       foo<int, int>(e, &arg); // ok, uses non-template
@@ -929,7 +929,7 @@ namespace cwg352 { // cwg352: 2.8
 
   namespace example5 {
     template<int I> class A {};
-    template<int I> void g(A<I+1>); // #cwg352-g 
+    template<int I> void g(A<I+1>); // #cwg352-g
     template<int I> void f(A<I>, A<I+1>);
     void h(A<1> a1, A<2> a2) {
       g(a1);
@@ -1256,7 +1256,7 @@ namespace cwg373 { // cwg373: 5
     }
   };
 
-  struct A { struct B {}; }; // #cwg373-A 
+  struct A { struct B {}; }; // #cwg373-A
   namespace X = A::B;
   // expected-error@-1 {{expected namespace name}}
   //   expected-note@#cwg373-A {{'A' declared here}}
@@ -1608,7 +1608,7 @@ namespace cwg395 { // cwg395: 3.0
     // expected-error@-2 {{conversion function cannot have any parameters}}
     // expected-error@-3 {{cannot specify any part of a return type in the declaration of a conversion function}}
     // expected-error@-4 {{conversion function cannot convert to a function type}}
-  
+
   };
 
   struct null1_t {
@@ -1721,9 +1721,9 @@ namespace cwg399 { // cwg399: 11
     B_ptr->B_alias::~B();
     B_ptr->B_alias::~B_alias();
     B_ptr->cwg399::~B();
-    // expected-error@-1 {{qualified member access refers to a member in namespace 'cwg399'}}
+    // expected-error@-1 {{no member named '~B' in namespace 'cwg399'}}
     B_ptr->cwg399::~B_alias();
-    // expected-error@-1 {{qualified member access refers to a member in namespace 'cwg399'}}
+    // expected-error@-1 {{no member named '~B' in namespace 'cwg399'}}
   }
 
   template<typename T, typename U>
diff --git a/clang/test/CXX/except/except.spec/p13-friend.cpp b/clang/test/CXX/except/except.spec/p13-friend.cpp
new file mode 100644
index 000000000000..7f73a4ff431a
--- /dev/null
+++ b/clang/test/CXX/except/except.spec/p13-friend.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -fexceptions -fcxx-exceptions -fsyntax-only -verify %s
+
+namespace N0 {
+  void f() noexcept;
+  void g() noexcept;
+
+  struct A {
+    friend void f() noexcept;
+    friend void g() noexcept(x);
+
+    static constexpr bool x = true;
+  };
+} // namespace N0
+
+namespace N1 {
+  void f() noexcept;
+  void g();
+
+  template<typename T>
+  struct A {
+    friend void f() noexcept;
+    // FIXME: This error is emitted if no other errors occured (i.e. Sema::hasUncompilableErrorOccurred() is false).
+    friend void g() noexcept(x); // expected-error {{no member 'x' in 'N1::A<int>'; it has not yet been instantiated}}
+                                 // expected-note@-1 {{in instantiation of exception specification}}
+    static constexpr bool x = false; // expected-note {{not-yet-instantiated member is declared here}}
+  };
+
+  template struct A<int>; // expected-note {{in instantiation of template class}}
+} // namespace N1
diff --git a/clang/test/CXX/expr/expr.const/p5-26.cpp b/clang/test/CXX/expr/expr.const/p5-26.cpp
index 3624b1e5a3e3..7513b11c09aa 100644
--- a/clang/test/CXX/expr/expr.const/p5-26.cpp
+++ b/clang/test/CXX/expr/expr.const/p5-26.cpp
@@ -37,3 +37,10 @@ void err() {
                                                 // cxx23-note {{cast from 'void *' is not allowed in a constant expression in C++ standards before C++2c}} \
                                                 // cxx26-note {{cast from 'void *' is not allowed in a constant expression because the pointed object type 'T' is not similar to the target type 'S'}}
 }
+
+int* p;
+constexpr int** pp = &p;
+constexpr void* vp = pp;
+constexpr auto cvp = static_cast<const int* volatile*>(vp);
+// cxx23-error@-1 {{constant expression}}
+// cxx23-note@-2 {{cast from 'void *' is not allowed in a constant expression}}
diff --git a/clang/test/CXX/stmt.stmt/stmt.return/p6.cpp b/clang/test/CXX/stmt.stmt/stmt.return/p6.cpp
new file mode 100644
index 000000000000..c192b0c8112a
--- /dev/null
+++ b/clang/test/CXX/stmt.stmt/stmt.return/p6.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -std=c++26 -fsyntax-only -verify %s
+
+auto&& f1() {
+  return 42; // expected-error{{returning reference to local temporary object}}
+}
+const double& f2() {
+  static int x = 42;
+  return x; // expected-error{{returning reference to local temporary object}}
+}
+auto&& id(auto&& r) {
+  return static_cast<decltype(r)&&>(r);
+}
+auto&& f3() {
+  return id(42);        // OK, but probably a bug
+}
+
+void unevaluated() {
+  using a = decltype ([] () -> const int & {
+    const int &i = 0; // expected-note {{binding reference variable 'i' here}}
+    return i; // expected-error{{returning reference to local temporary object}}
+} ());
+}
+
+static_assert(__is_convertible(int, const int &));
+static_assert(__is_nothrow_convertible(int, const int &));
diff --git a/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp b/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp
new file mode 100644
index 000000000000..0f24d716a7b7
--- /dev/null
+++ b/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp
@@ -0,0 +1,523 @@
+// RUN: %clang_cc1 -Wno-unused-value -verify %s
+
+namespace N0 {
+  struct A {
+    int x0;
+    static int y0;
+    int x1;
+    static int y1;
+
+    void f0();
+    static void g0();
+    void f1();
+    static void g1();
+
+    using M0 = int;
+    using M1 = int;
+
+    struct C0 { };
+    struct C1 { };
+  };
+
+  template<typename T>
+  struct B : A {
+    int x2;
+    static int y2;
+
+    void f2();
+    static void g2();
+
+    using M2 = int;
+
+    struct C2 { };
+
+    using A::x1;
+    using A::y1;
+    using A::f1;
+    using A::g1;
+    using A::M1;
+    using A::C1;
+
+    using T::x3;
+    using T::y3;
+    using T::f3;
+    using T::g3;
+    using typename T::M3;
+    using typename T::C3;
+
+    void not_instantiated(B *a, B &b) {
+      // All of the following should be found in the current instantiation.
+
+      new M0;
+      new B::M0;
+      new A::M0;
+      new B::A::M0;
+      new C0;
+      new B::C0;
+      new A::C0;
+      new B::A::C0;
+      new M1;
+      new B::M1;
+      new A::M1;
+      new B::A::M1;
+      new C1;
+      new B::C1;
+      new A::C1;
+      new B::A::C1;
+      new M2;
+      new B::M2;
+      new C2;
+      new B::C2;
+      new M3;
+      new B::M3;
+      new C3;
+      new B::C3;
+
+      x0;
+      B::x0;
+      A::x0;
+      B::A::x0;
+      y0;
+      B::y0;
+      A::y0;
+      B::A::y0;
+      x1;
+      B::x1;
+      A::x1;
+      B::A::x1;
+      y1;
+      B::y1;
+      A::y1;
+      B::A::y1;
+      x2;
+      B::x2;
+      y2;
+      B::y2;
+      x3;
+      B::x3;
+      y3;
+      B::y3;
+
+      f0();
+      B::f0();
+      A::f0();
+      B::A::f0();
+      g0();
+      B::g0();
+      A::g0();
+      B::A::g0();
+      f1();
+      B::f1();
+      A::f1();
+      B::A::f1();
+      g1();
+      B::g1();
+      A::g1();
+      B::A::g1();
+      f2();
+      B::f2();
+      g2();
+      B::g2();
+      f3();
+      B::f3();
+      g3();
+      B::g3();
+
+      this->x0;
+      this->B::x0;
+      this->A::x0;
+      this->B::A::x0;
+      this->y0;
+      this->B::y0;
+      this->A::y0;
+      this->B::A::y0;
+      this->x1;
+      this->B::x1;
+      this->A::x1;
+      this->B::A::x1;
+      this->y1;
+      this->B::y1;
+      this->A::y1;
+      this->B::A::y1;
+      this->x2;
+      this->B::x2;
+      this->y2;
+      this->B::y2;
+      this->x3;
+      this->B::x3;
+      this->y3;
+      this->B::y3;
+
+      this->f0();
+      this->B::f0();
+      this->A::f0();
+      this->B::A::f0();
+      this->g0();
+      this->B::g0();
+      this->A::g0();
+      this->B::A::g0();
+      this->f1();
+      this->B::f1();
+      this->A::f1();
+      this->B::A::f1();
+      this->g1();
+      this->B::g1();
+      this->A::g1();
+      this->B::A::g1();
+      this->f2();
+      this->B::f2();
+      this->g2();
+      this->B::g2();
+      this->f3();
+      this->B::f3();
+      this->g3();
+      this->B::g3();
+
+      a->x0;
+      a->B::x0;
+      a->A::x0;
+      a->B::A::x0;
+      a->y0;
+      a->B::y0;
+      a->A::y0;
+      a->B::A::y0;
+      a->x1;
+      a->B::x1;
+      a->A::x1;
+      a->B::A::x1;
+      a->y1;
+      a->B::y1;
+      a->A::y1;
+      a->B::A::y1;
+      a->x2;
+      a->B::x2;
+      a->y2;
+      a->B::y2;
+      a->x3;
+      a->B::x3;
+      a->y3;
+      a->B::y3;
+
+      a->f0();
+      a->B::f0();
+      a->A::f0();
+      a->B::A::f0();
+      a->g0();
+      a->B::g0();
+      a->A::g0();
+      a->B::A::g0();
+      a->f1();
+      a->B::f1();
+      a->A::f1();
+      a->B::A::f1();
+      a->g1();
+      a->B::g1();
+      a->A::g1();
+      a->B::A::g1();
+      a->f2();
+      a->B::f2();
+      a->g2();
+      a->B::g2();
+      a->f3();
+      a->B::f3();
+      a->g3();
+      a->B::g3();
+
+      (*this).x0;
+      (*this).B::x0;
+      (*this).A::x0;
+      (*this).B::A::x0;
+      (*this).y0;
+      (*this).B::y0;
+      (*this).A::y0;
+      (*this).B::A::y0;
+      (*this).x1;
+      (*this).B::x1;
+      (*this).A::x1;
+      (*this).B::A::x1;
+      (*this).y1;
+      (*this).B::y1;
+      (*this).A::y1;
+      (*this).B::A::y1;
+      (*this).x2;
+      (*this).B::x2;
+      (*this).y2;
+      (*this).B::y2;
+      (*this).x3;
+      (*this).B::x3;
+      (*this).y3;
+      (*this).B::y3;
+
+      (*this).f0();
+      (*this).B::f0();
+      (*this).A::f0();
+      (*this).B::A::f0();
+      (*this).g0();
+      (*this).B::g0();
+      (*this).A::g0();
+      (*this).B::A::g0();
+      (*this).f1();
+      (*this).B::f1();
+      (*this).A::f1();
+      (*this).B::A::f1();
+      (*this).g1();
+      (*this).B::g1();
+      (*this).A::g1();
+      (*this).B::A::g1();
+      (*this).f2();
+      (*this).B::f2();
+      (*this).g2();
+      (*this).B::g2();
+      (*this).f3();
+      (*this).B::f3();
+      (*this).g3();
+      (*this).B::g3();
+
+      b.x0;
+      b.B::x0;
+      b.A::x0;
+      b.B::A::x0;
+      b.y0;
+      b.B::y0;
+      b.A::y0;
+      b.B::A::y0;
+      b.x1;
+      b.B::x1;
+      b.A::x1;
+      b.B::A::x1;
+      b.y1;
+      b.B::y1;
+      b.A::y1;
+      b.B::A::y1;
+      b.x2;
+      b.B::x2;
+      b.y2;
+      b.B::y2;
+      b.x3;
+      b.B::x3;
+      b.y3;
+      b.B::y3;
+
+      b.f0();
+      b.B::f0();
+      b.A::f0();
+      b.B::A::f0();
+      b.g0();
+      b.B::g0();
+      b.A::g0();
+      b.B::A::g0();
+      b.f1();
+      b.B::f1();
+      b.A::f1();
+      b.B::A::f1();
+      b.g1();
+      b.B::g1();
+      b.A::g1();
+      b.B::A::g1();
+      b.f2();
+      b.B::f2();
+      b.g2();
+      b.B::g2();
+      b.f3();
+      b.B::f3();
+      b.g3();
+      b.B::g3();
+
+      // None of the following should be found in the current instantiation.
+
+      new M4; // expected-error{{unknown type name 'M4'}}
+      new B::M4; // expected-error{{no type named 'M4' in 'B<T>'}}
+      new A::M4; // expected-error{{no type named 'M4' in 'N0::A'}}
+      new B::A::M4; // expected-error{{no type named 'M4' in 'N0::A'}}
+
+      x4; // expected-error{{use of undeclared identifier 'x4'}}
+      B::x4; // expected-error{{no member named 'x4' in 'B<T>'}}
+      A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
+      B::A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
+      f4(); // expected-error{{use of undeclared identifier 'f4'}}
+      B::f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
+      A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
+      B::A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
+
+      this->x4; // expected-error{{no member named 'x4' in 'B<T>'}}
+      this->B::x4; // expected-error{{no member named 'x4' in 'B<T>'}}
+      this->A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
+      this->B::A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
+      this->f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
+      this->B::f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
+      this->A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
+      this->B::A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
+
+      a->x4; // expected-error{{no member named 'x4' in 'B<T>'}}
+      a->B::x4; // expected-error{{no member named 'x4' in 'B<T>'}}
+      a->A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
+      a->B::A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
+      a->f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
+      a->B::f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
+      a->A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
+      a->B::A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
+
+      // FIXME: An overloaded unary 'operator*' is built for these
+      // even though the operand is a pointer (to a dependent type).
+      // Type::isOverloadableType should return false for such cases.
+      (*this).x4;
+      (*this).B::x4;
+      (*this).A::x4;
+      (*this).B::A::x4;
+      (*this).f4();
+      (*this).B::f4();
+      (*this).A::f4();
+      (*this).B::A::f4();
+
+      b.x4; // expected-error{{no member named 'x4' in 'B<T>'}}
+      b.B::x4; // expected-error{{no member named 'x4' in 'B<T>'}}
+      b.A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
+      b.B::A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
+      b.f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
+      b.B::f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
+      b.A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
+      b.B::A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
+    }
+  };
+} // namespace N0
+
+namespace N1 {
+  struct A {
+    template<int I>
+    void f();
+  };
+
+  template<typename T>
+  struct B {
+    template<int I>
+    void f();
+
+    A x;
+    A g();
+
+    void not_instantiated(B *a, B &b) {
+      f<0>();
+      this->f<0>();
+      a->f<0>();
+      // FIXME: This should not require 'template'!
+      (*this).f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
+      b.f<0>();
+
+      x.f<0>();
+      this->x.f<0>();
+      a->x.f<0>();
+      // FIXME: This should not require 'template'!
+      (*this).x.f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
+      b.x.f<0>();
+
+      // FIXME: None of these should require 'template'!
+      g().f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
+      this->g().f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
+      a->g().f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
+      (*this).g().f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
+      b.g().f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
+    }
+  };
+} // namespace N1
+
+namespace N2 {
+  template<typename T>
+  struct A {
+    struct B {
+      using C = A;
+
+      void not_instantiated(A *a, B *b) {
+        b->x; // expected-error{{no member named 'x' in 'N2::A::B'}}
+        b->B::x; // expected-error{{no member named 'x' in 'N2::A::B'}}
+        a->B::C::x; // expected-error{{no member named 'x' in 'A<T>'}}
+      }
+    };
+
+    void not_instantiated(A *a, B *b) {
+      b->x;
+      b->B::x;
+      a->B::C::x;
+    }
+  };
+} // namespace N2
+
+namespace N3 {
+  struct A { };
+
+  template<typename T>
+  struct B : A {
+    void not_instantiated() {
+      // Dependent, lookup context is the current instantiation.
+      this->operator=(*this);
+      // Not dependent, the lookup context is A (not the current instantiation).
+      this->A::operator=(*this);
+    }
+  };
+} // namespace N3
+
+namespace N4 {
+  template<typename T>
+  struct A {
+    void not_instantiated(A a, A<T> b, T c) {
+      a->x;
+      b->x;
+      c->x;
+    }
+
+    void instantiated(A a, A<T> b, T c) {
+      a->x; // expected-error {{member reference type 'A<int>' is not a pointer; did you mean to use '.'?}}
+            // expected-error@-1 {{no member named 'x' in 'N4::A<int>'}}
+      b->x; // expected-error {{member reference type 'A<int>' is not a pointer; did you mean to use '.'?}}
+            // expected-error@-1 {{no member named 'x' in 'N4::A<int>'}}
+      c->x; // expected-error {{member reference type 'int' is not a pointer}}
+    }
+  };
+
+  template void A<int>::instantiated(A<int>, A<int>, int); // expected-note {{in instantiation of}}
+
+  struct B {
+    int x;
+
+    void f();
+  };
+
+  template<typename T>
+  struct C {
+    B *operator->();
+
+    void not_instantiated(C a, C<T> b, T c) {
+      a->x;
+      b->x;
+      c->x;
+    }
+
+    void instantiated(C a, C<T> b, T c) {
+      a->x;
+      b->x;
+      c->x; // expected-error {{member reference type 'int' is not a pointer}}
+    }
+  };
+
+  template void C<int>::instantiated(C, C, int); // expected-note {{in instantiation of}}
+
+  template<typename T>
+  struct D {
+    T *operator->();
+
+    void not_instantiated(D a) {
+      a->x;
+      a->y;
+      a->f();
+      a->g();
+    }
+
+    void instantiated(D a) {
+      a->x;
+      a->y; // expected-error {{no member named 'y' in 'N4::B'}}
+      a->f();
+      a->g(); // expected-error {{no member named 'g' in 'N4::B'}}
+    }
+  };
+
+  template void D<B>::instantiated(D); // expected-note {{in instantiation of}}
+} // namespace N4
diff --git a/clang/test/CXX/temp/temp.res/temp.local/p3.cpp b/clang/test/CXX/temp/temp.res/temp.local/p3.cpp
index 87589e1e5bcd..b9b29d22736e 100644
--- a/clang/test/CXX/temp/temp.res/temp.local/p3.cpp
+++ b/clang/test/CXX/temp/temp.res/temp.local/p3.cpp
@@ -16,8 +16,7 @@ template <class T> struct Derived: Base<int>, Base<char> {
   void g(X0 *t) {
     t->Derived::Base<T>::f();
     t->Base<T>::f();
-    t->Base::f(); // expected-error{{member 'Base' found in multiple base classes of different types}} \
-    // expected-error{{no member named 'f' in 'X0'}}
+    t->Base::f(); // expected-error{{member 'Base' found in multiple base classes of different types}}
   }
 };
 
diff --git a/clang/test/ClangScanDeps/modules-extern-unrelated.m b/clang/test/ClangScanDeps/modules-extern-unrelated.m
index 76611c596d3e..957132fd5b18 100644
--- a/clang/test/ClangScanDeps/modules-extern-unrelated.m
+++ b/clang/test/ClangScanDeps/modules-extern-unrelated.m
@@ -1,3 +1,6 @@
+// This test checks that only module map files defining affecting modules are
+// affecting.
+
 // RUN: rm -rf %t
 // RUN: split-file %s %t
 
@@ -22,15 +25,8 @@ module second { header "second.h" }
 //--- second/second.h
 #include "first_other.h"
 
-//--- cdb.json.template
-[{
-  "directory": "DIR",
-  "file": "DIR/tu.m",
-  "command": "clang -fmodules -fmodules-cache-path=DIR/cache -I DIR/zeroth -I DIR/first -I DIR/second -c DIR/tu.m -o DIR/tu.o"
-}]
-
-// RUN: sed -e "s|DIR|%/t|g" -e "s|INPUTS|%/S/Inputs|g" %t/cdb.json.template > %t/cdb.json
-// RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full > %t/result.json
+// RUN: clang-scan-deps -format experimental-full -o %t/result.json \
+// RUN:   -- %clang -fmodules -fmodules-cache-path=%t/cache -I %t/zeroth -I %t/first -I %t/second -c %t/tu.m -o %t/tu.o
 // RUN: cat %t/result.json | sed 's:\\\\\?:/:g' | FileCheck %s -DPREFIX=%/t
 
 // CHECK:      {
@@ -67,11 +63,11 @@ module second { header "second.h" }
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "clang-modulemap-file": "[[PREFIX]]/second/second.modulemap",
 // CHECK-NEXT:       "command-line": [
+// CHECK-NOT:          "-fmodule-map-file=[[PREFIX]]/second/module.modulemap"
 // CHECK:            ],
 // CHECK-NEXT:       "context-hash": "{{.*}}",
 // CHECK-NEXT:       "file-deps": [
 // CHECK-NEXT:         "[[PREFIX]]/first/module.modulemap",
-// CHECK-NEXT:         "[[PREFIX]]/second/module.modulemap",
 // CHECK-NEXT:         "[[PREFIX]]/second/second.h",
 // CHECK-NEXT:         "[[PREFIX]]/second/second.modulemap"
 // CHECK-NEXT:       ],
@@ -90,11 +86,11 @@ module second { header "second.h" }
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "clang-modulemap-file": "[[PREFIX]]/zeroth/module.modulemap",
 // CHECK-NEXT:       "command-line": [
+// CHECK-NOT:          "-fmodule-map-file=[[PREFIX]]/second/module.modulemap"
 // CHECK:            ],
 // CHECK-NEXT:       "context-hash": "{{.*}}",
 // CHECK-NEXT:       "file-deps": [
 // CHECK-NEXT:         "[[PREFIX]]/first/module.modulemap",
-// CHECK-NEXT:         "[[PREFIX]]/second/module.modulemap",
 // CHECK-NEXT:         "[[PREFIX]]/second/second.modulemap",
 // CHECK-NEXT:         "[[PREFIX]]/zeroth/module.modulemap",
 // CHECK-NEXT:         "[[PREFIX]]/zeroth/zeroth.h"
@@ -115,7 +111,7 @@ module second { header "second.h" }
 // CHECK-NEXT:           ],
 // CHECK-NEXT:           "command-line": [
 // CHECK:                ],
-// CHECK-NEXT:           "executable": "clang",
+// CHECK-NEXT:           "executable": "{{.*}}",
 // CHECK-NEXT:           "file-deps": [
 // CHECK-NEXT:             "[[PREFIX]]/tu.m"
 // CHECK-NEXT:           ],
diff --git a/clang/test/CodeGen/LoongArch/tls-dialect.c b/clang/test/CodeGen/LoongArch/tls-dialect.c
new file mode 100644
index 000000000000..03401ef8af03
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/tls-dialect.c
@@ -0,0 +1,14 @@
+// REQUIRES: loongarch-registered-target
+/// cc1 -enable-tlsdesc (due to -mtls-dialect=desc) enables TLSDESC.
+// RUN: %clang_cc1 -triple loongarch64 -S -mrelocation-model pic -pic-level 1 -enable-tlsdesc %s -o - | FileCheck %s --check-prefix=DESC
+// RUN: %clang_cc1 -triple loongarch64 -S -mrelocation-model pic -pic-level 1 %s -o - | FileCheck %s --check-prefix=NODESC
+
+__thread int x;
+
+// DESC:       %desc_pc_hi20
+// DESC-NOT:   %gd_pc_hi20
+// NODESC:     %gd_pc_hi20
+// NODESC-NOT: %desc_pc_hi20
+int use() {
+  return x;
+}
diff --git a/clang/test/CodeGen/X86/ms-x86-intrinsics.c b/clang/test/CodeGen/X86/ms-x86-intrinsics.c
index a1c90d71c8eb..aa557c8e19a8 100644
--- a/clang/test/CodeGen/X86/ms-x86-intrinsics.c
+++ b/clang/test/CodeGen/X86/ms-x86-intrinsics.c
@@ -48,7 +48,7 @@ long long test__readfsqword(unsigned long Offset) {
 __int64 test__emul(int a, int b) {
   return __emul(a, b);
 }
-// CHECK-LABEL: define dso_local i64 @test__emul(i32 noundef %a, i32 noundef %b)
+// CHECK-LABEL: define dso_local range(i64 -4611686016279904256, 4611686018427387905) i64 @test__emul(i32 noundef %a, i32 noundef %b)
 // CHECK: [[X:%[0-9]+]] = sext i32 %a to i64
 // CHECK: [[Y:%[0-9]+]] = sext i32 %b to i64
 // CHECK: [[RES:%[0-9]+]] = mul nsw i64 [[Y]], [[X]]
@@ -57,7 +57,7 @@ __int64 test__emul(int a, int b) {
 unsigned __int64 test__emulu(unsigned int a, unsigned int b) {
   return __emulu(a, b);
 }
-// CHECK-LABEL: define dso_local i64 @test__emulu(i32 noundef %a, i32 noundef %b)
+// CHECK-LABEL: define dso_local range(i64 0, -8589934590) i64 @test__emulu(i32 noundef %a, i32 noundef %b)
 // CHECK: [[X:%[0-9]+]] = zext i32 %a to i64
 // CHECK: [[Y:%[0-9]+]] = zext i32 %b to i64
 // CHECK: [[RES:%[0-9]+]] = mul nuw i64 [[Y]], [[X]]
@@ -108,13 +108,13 @@ long long test__readgsqword(unsigned long Offset) {
 __int64 test__mulh(__int64 a, __int64 b) {
   return __mulh(a, b);
 }
-// CHECK-X64-LABEL: define dso_local i64 @test__mulh(i64 noundef %a, i64 noundef %b)
+// CHECK-X64-LABEL: define dso_local range(i64 -4611686018427387904, 4611686018427387905) i64 @test__mulh(i64 noundef %a, i64 noundef %b)
 // CHECK-X64: = mul nsw i128 %
 
 unsigned __int64 test__umulh(unsigned __int64 a, unsigned __int64 b) {
   return __umulh(a, b);
 }
-// CHECK-X64-LABEL: define dso_local i64 @test__umulh(i64 noundef %a, i64 noundef %b)
+// CHECK-X64-LABEL: define dso_local range(i64 0, -1) i64 @test__umulh(i64 noundef %a, i64 noundef %b)
 // CHECK-X64: = mul nuw i128 %
 
 __int64 test_mul128(__int64 Multiplier,
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c
index 12533fa71698..cc9a3186da08 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c
@@ -35,16 +35,22 @@ svbfloat16_t test_svld1_bf16(svbool_t pg, const bfloat16_t *base)
 // CHECK-LABEL: @test_svld1_vnum_bf16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> zeroinitializer)
-// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> zeroinitializer)
+// CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld1_vnum_bf16u10__SVBool_tPKu6__bf16l(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> zeroinitializer)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x bfloat> zeroinitializer)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP4]]
 //
 svbfloat16_t test_svld1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
 {
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1.c
index c3a5186b1e8b..12a2e95cf957 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1.c
@@ -199,15 +199,21 @@ svfloat64_t test_svld1_f64(svbool_t pg, const float64_t *base)
 
 // CHECK-LABEL: @test_svld1_vnum_s8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP0]], i32 1, <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> zeroinitializer)
-// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> zeroinitializer)
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP3]]
 //
 // CPP-CHECK-LABEL: @_Z18test_svld1_vnum_s8u10__SVBool_tPKal(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP0]], i32 1, <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP3]]
 //
 svint8_t test_svld1_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum)
 {
@@ -217,16 +223,22 @@ svint8_t test_svld1_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1_vnum_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> zeroinitializer)
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> zeroinitializer)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_s16u10__SVBool_tPKsl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> zeroinitializer)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> zeroinitializer)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP4]]
 //
 svint16_t test_svld1_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum)
 {
@@ -236,16 +248,22 @@ svint16_t test_svld1_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1_vnum_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> zeroinitializer)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> zeroinitializer)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_s32u10__SVBool_tPKil(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> zeroinitializer)
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> zeroinitializer)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
 //
 svint32_t test_svld1_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum)
 {
@@ -255,16 +273,22 @@ svint32_t test_svld1_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> zeroinitializer)
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> zeroinitializer)
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_s64u10__SVBool_tPKll(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> zeroinitializer)
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> zeroinitializer)
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
 //
 svint64_t test_svld1_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum)
 {
@@ -273,15 +297,21 @@ svint64_t test_svld1_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum)
 
 // CHECK-LABEL: @test_svld1_vnum_u8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP0]], i32 1, <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> zeroinitializer)
-// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> zeroinitializer)
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP3]]
 //
 // CPP-CHECK-LABEL: @_Z18test_svld1_vnum_u8u10__SVBool_tPKhl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP0]], i32 1, <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP3]]
 //
 svuint8_t test_svld1_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum)
 {
@@ -291,16 +321,22 @@ svuint8_t test_svld1_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1_vnum_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> zeroinitializer)
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> zeroinitializer)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_u16u10__SVBool_tPKtl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> zeroinitializer)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i16> zeroinitializer)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP4]]
 //
 svuint16_t test_svld1_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum)
 {
@@ -310,16 +346,22 @@ svuint16_t test_svld1_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1_vnum_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> zeroinitializer)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> zeroinitializer)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_u32u10__SVBool_tPKjl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> zeroinitializer)
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> zeroinitializer)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP4]]
 //
 svuint32_t test_svld1_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum)
 {
@@ -329,16 +371,22 @@ svuint32_t test_svld1_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> zeroinitializer)
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> zeroinitializer)
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_u64u10__SVBool_tPKml(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> zeroinitializer)
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> zeroinitializer)
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
 //
 svuint64_t test_svld1_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum)
 {
@@ -348,16 +396,22 @@ svuint64_t test_svld1_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1_vnum_f16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> zeroinitializer)
-// CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> zeroinitializer)
+// CHECK-NEXT:    ret <vscale x 8 x half> [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_f16u10__SVBool_tPKDhl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> zeroinitializer)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> zeroinitializer)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP4]]
 //
 svfloat16_t test_svld1_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum)
 {
@@ -367,16 +421,22 @@ svfloat16_t test_svld1_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum
 // CHECK-LABEL: @test_svld1_vnum_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> zeroinitializer)
-// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> zeroinitializer)
+// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_f32u10__SVBool_tPKfl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> zeroinitializer)
-// CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> zeroinitializer)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP4]]
 //
 svfloat32_t test_svld1_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum)
 {
@@ -386,16 +446,22 @@ svfloat32_t test_svld1_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum
 // CHECK-LABEL: @test_svld1_vnum_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> zeroinitializer)
-// CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> zeroinitializer)
+// CHECK-NEXT:    ret <vscale x 2 x double> [[TMP4]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_f64u10__SVBool_tPKdl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> zeroinitializer)
-// CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> zeroinitializer)
+// CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP4]]
 //
 svfloat64_t test_svld1_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum)
 {
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c
index 321628b125f3..1275ab07fd9f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sb.c
@@ -131,18 +131,24 @@ svuint64_t test_svld1sb_u64(svbool_t pg, const int8_t *base)
 // CHECK-LABEL: @test_svld1sb_vnum_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 8 x i8> [[TMP2]] to <vscale x 8 x i16>
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 8 x i8> [[TMP4]] to <vscale x 8 x i16>
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1sb_vnum_s16u10__SVBool_tPKal(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 8 x i8> [[TMP2]] to <vscale x 8 x i16>
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 8 x i8> [[TMP4]] to <vscale x 8 x i16>
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP5]]
 //
 svint16_t test_svld1sb_vnum_s16(svbool_t pg, const int8_t *base, int64_t vnum)
 {
@@ -152,18 +158,24 @@ svint16_t test_svld1sb_vnum_s16(svbool_t pg, const int8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1sb_vnum_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 4 x i8> [[TMP2]] to <vscale x 4 x i32>
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 4 x i8> [[TMP4]] to <vscale x 4 x i32>
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1sb_vnum_s32u10__SVBool_tPKal(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 4 x i8> [[TMP2]] to <vscale x 4 x i32>
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 4 x i8> [[TMP4]] to <vscale x 4 x i32>
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 svint32_t test_svld1sb_vnum_s32(svbool_t pg, const int8_t *base, int64_t vnum)
 {
@@ -173,18 +185,24 @@ svint32_t test_svld1sb_vnum_s32(svbool_t pg, const int8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1sb_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 2 x i8> [[TMP2]] to <vscale x 2 x i64>
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1sb_vnum_s64u10__SVBool_tPKal(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 2 x i8> [[TMP2]] to <vscale x 2 x i64>
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i64>
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 svint64_t test_svld1sb_vnum_s64(svbool_t pg, const int8_t *base, int64_t vnum)
 {
@@ -194,18 +212,24 @@ svint64_t test_svld1sb_vnum_s64(svbool_t pg, const int8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1sb_vnum_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 8 x i8> [[TMP2]] to <vscale x 8 x i16>
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 8 x i8> [[TMP4]] to <vscale x 8 x i16>
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1sb_vnum_u16u10__SVBool_tPKal(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 8 x i8> [[TMP2]] to <vscale x 8 x i16>
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 8 x i8> [[TMP4]] to <vscale x 8 x i16>
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP5]]
 //
 svuint16_t test_svld1sb_vnum_u16(svbool_t pg, const int8_t *base, int64_t vnum)
 {
@@ -215,18 +239,24 @@ svuint16_t test_svld1sb_vnum_u16(svbool_t pg, const int8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1sb_vnum_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 4 x i8> [[TMP2]] to <vscale x 4 x i32>
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 4 x i8> [[TMP4]] to <vscale x 4 x i32>
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1sb_vnum_u32u10__SVBool_tPKal(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 4 x i8> [[TMP2]] to <vscale x 4 x i32>
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 4 x i8> [[TMP4]] to <vscale x 4 x i32>
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 svuint32_t test_svld1sb_vnum_u32(svbool_t pg, const int8_t *base, int64_t vnum)
 {
@@ -236,18 +266,24 @@ svuint32_t test_svld1sb_vnum_u32(svbool_t pg, const int8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1sb_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 2 x i8> [[TMP2]] to <vscale x 2 x i64>
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1sb_vnum_u64u10__SVBool_tPKal(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 2 x i8> [[TMP2]] to <vscale x 2 x i64>
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i64>
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 svuint64_t test_svld1sb_vnum_u64(svbool_t pg, const int8_t *base, int64_t vnum)
 {
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c
index a72892fe9622..5384c432e69d 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c
@@ -93,18 +93,24 @@ svuint64_t test_svld1sh_u64(svbool_t pg, const int16_t *base)
 // CHECK-LABEL: @test_svld1sh_vnum_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 4 x i16> [[TMP2]] to <vscale x 4 x i32>
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 4 x i16> [[TMP4]] to <vscale x 4 x i32>
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1sh_vnum_s32u10__SVBool_tPKsl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 4 x i16> [[TMP2]] to <vscale x 4 x i32>
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 4 x i16> [[TMP4]] to <vscale x 4 x i32>
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 svint32_t test_svld1sh_vnum_s32(svbool_t pg, const int16_t *base, int64_t vnum)
 {
@@ -114,18 +120,24 @@ svint32_t test_svld1sh_vnum_s32(svbool_t pg, const int16_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1sh_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 2 x i16> [[TMP2]] to <vscale x 2 x i64>
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 2 x i16> [[TMP4]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1sh_vnum_s64u10__SVBool_tPKsl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 2 x i16> [[TMP2]] to <vscale x 2 x i64>
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 2 x i16> [[TMP4]] to <vscale x 2 x i64>
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 svint64_t test_svld1sh_vnum_s64(svbool_t pg, const int16_t *base, int64_t vnum)
 {
@@ -135,18 +147,24 @@ svint64_t test_svld1sh_vnum_s64(svbool_t pg, const int16_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1sh_vnum_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 4 x i16> [[TMP2]] to <vscale x 4 x i32>
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 4 x i16> [[TMP4]] to <vscale x 4 x i32>
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1sh_vnum_u32u10__SVBool_tPKsl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 4 x i16> [[TMP2]] to <vscale x 4 x i32>
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 4 x i16> [[TMP4]] to <vscale x 4 x i32>
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 svuint32_t test_svld1sh_vnum_u32(svbool_t pg, const int16_t *base, int64_t vnum)
 {
@@ -156,18 +174,24 @@ svuint32_t test_svld1sh_vnum_u32(svbool_t pg, const int16_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1sh_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 2 x i16> [[TMP2]] to <vscale x 2 x i64>
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 2 x i16> [[TMP4]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1sh_vnum_u64u10__SVBool_tPKsl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 2 x i16> [[TMP2]] to <vscale x 2 x i64>
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 2 x i16> [[TMP4]] to <vscale x 2 x i64>
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 svuint64_t test_svld1sh_vnum_u64(svbool_t pg, const int16_t *base, int64_t vnum)
 {
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c
index 8921c50fca53..eeac9cdddf02 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sw.c
@@ -55,18 +55,24 @@ svuint64_t test_svld1sw_u64(svbool_t pg, const int32_t *base)
 // CHECK-LABEL: @test_svld1sw_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 2 x i32> [[TMP2]] to <vscale x 2 x i64>
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 2 x i32> [[TMP4]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1sw_vnum_s64u10__SVBool_tPKil(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 2 x i32> [[TMP2]] to <vscale x 2 x i64>
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 2 x i32> [[TMP4]] to <vscale x 2 x i64>
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 svint64_t test_svld1sw_vnum_s64(svbool_t pg, const int32_t *base, int64_t vnum)
 {
@@ -76,18 +82,24 @@ svint64_t test_svld1sw_vnum_s64(svbool_t pg, const int32_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1sw_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 2 x i32> [[TMP2]] to <vscale x 2 x i64>
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 2 x i32> [[TMP4]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1sw_vnum_u64u10__SVBool_tPKil(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = sext <vscale x 2 x i32> [[TMP2]] to <vscale x 2 x i64>
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = sext <vscale x 2 x i32> [[TMP4]] to <vscale x 2 x i64>
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 svuint64_t test_svld1sw_vnum_u64(svbool_t pg, const int32_t *base, int64_t vnum)
 {
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c
index 9adb2f36f44f..82852ebb16d5 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1ub.c
@@ -131,18 +131,24 @@ svuint64_t test_svld1ub_u64(svbool_t pg, const uint8_t *base)
 // CHECK-LABEL: @test_svld1ub_vnum_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 8 x i8> [[TMP2]] to <vscale x 8 x i16>
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 8 x i8> [[TMP4]] to <vscale x 8 x i16>
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1ub_vnum_s16u10__SVBool_tPKhl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 8 x i8> [[TMP2]] to <vscale x 8 x i16>
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 8 x i8> [[TMP4]] to <vscale x 8 x i16>
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP5]]
 //
 svint16_t test_svld1ub_vnum_s16(svbool_t pg, const uint8_t *base, int64_t vnum)
 {
@@ -152,18 +158,24 @@ svint16_t test_svld1ub_vnum_s16(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1ub_vnum_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 4 x i8> [[TMP2]] to <vscale x 4 x i32>
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 4 x i8> [[TMP4]] to <vscale x 4 x i32>
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1ub_vnum_s32u10__SVBool_tPKhl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 4 x i8> [[TMP2]] to <vscale x 4 x i32>
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 4 x i8> [[TMP4]] to <vscale x 4 x i32>
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 svint32_t test_svld1ub_vnum_s32(svbool_t pg, const uint8_t *base, int64_t vnum)
 {
@@ -173,18 +185,24 @@ svint32_t test_svld1ub_vnum_s32(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1ub_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 2 x i8> [[TMP2]] to <vscale x 2 x i64>
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1ub_vnum_s64u10__SVBool_tPKhl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 2 x i8> [[TMP2]] to <vscale x 2 x i64>
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i64>
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 svint64_t test_svld1ub_vnum_s64(svbool_t pg, const uint8_t *base, int64_t vnum)
 {
@@ -194,18 +212,24 @@ svint64_t test_svld1ub_vnum_s64(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1ub_vnum_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 8 x i8> [[TMP2]] to <vscale x 8 x i16>
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 8 x i8> [[TMP4]] to <vscale x 8 x i16>
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1ub_vnum_u16u10__SVBool_tPKhl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 8 x i8> [[TMP2]] to <vscale x 8 x i16>
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 8 x i8> [[TMP4]] to <vscale x 8 x i16>
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP5]]
 //
 svuint16_t test_svld1ub_vnum_u16(svbool_t pg, const uint8_t *base, int64_t vnum)
 {
@@ -215,18 +239,24 @@ svuint16_t test_svld1ub_vnum_u16(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1ub_vnum_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 4 x i8> [[TMP2]] to <vscale x 4 x i32>
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 4 x i8> [[TMP4]] to <vscale x 4 x i32>
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1ub_vnum_u32u10__SVBool_tPKhl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 4 x i8> [[TMP2]] to <vscale x 4 x i32>
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 4 x i8> [[TMP4]] to <vscale x 4 x i32>
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 svuint32_t test_svld1ub_vnum_u32(svbool_t pg, const uint8_t *base, int64_t vnum)
 {
@@ -236,18 +266,24 @@ svuint32_t test_svld1ub_vnum_u32(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1ub_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 2 x i8> [[TMP2]] to <vscale x 2 x i64>
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1ub_vnum_u64u10__SVBool_tPKhl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 2 x i8> [[TMP2]] to <vscale x 2 x i64>
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i64>
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 svuint64_t test_svld1ub_vnum_u64(svbool_t pg, const uint8_t *base, int64_t vnum)
 {
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c
index de2b975f3d61..8028f6e544f4 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uh.c
@@ -93,18 +93,24 @@ svuint64_t test_svld1uh_u64(svbool_t pg, const uint16_t *base)
 // CHECK-LABEL: @test_svld1uh_vnum_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 4 x i16> [[TMP2]] to <vscale x 4 x i32>
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 4 x i16> [[TMP4]] to <vscale x 4 x i32>
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1uh_vnum_s32u10__SVBool_tPKtl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 4 x i16> [[TMP2]] to <vscale x 4 x i32>
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 4 x i16> [[TMP4]] to <vscale x 4 x i32>
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 svint32_t test_svld1uh_vnum_s32(svbool_t pg, const uint16_t *base, int64_t vnum)
 {
@@ -114,18 +120,24 @@ svint32_t test_svld1uh_vnum_s32(svbool_t pg, const uint16_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1uh_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 2 x i16> [[TMP2]] to <vscale x 2 x i64>
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 2 x i16> [[TMP4]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1uh_vnum_s64u10__SVBool_tPKtl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 2 x i16> [[TMP2]] to <vscale x 2 x i64>
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 2 x i16> [[TMP4]] to <vscale x 2 x i64>
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 svint64_t test_svld1uh_vnum_s64(svbool_t pg, const uint16_t *base, int64_t vnum)
 {
@@ -135,18 +147,24 @@ svint64_t test_svld1uh_vnum_s64(svbool_t pg, const uint16_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1uh_vnum_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 4 x i16> [[TMP2]] to <vscale x 4 x i32>
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 4 x i16> [[TMP4]] to <vscale x 4 x i32>
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1uh_vnum_u32u10__SVBool_tPKtl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 4 x i16> [[TMP2]] to <vscale x 4 x i32>
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 4 x i16> [[TMP4]] to <vscale x 4 x i32>
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP5]]
 //
 svuint32_t test_svld1uh_vnum_u32(svbool_t pg, const uint16_t *base, int64_t vnum)
 {
@@ -156,18 +174,24 @@ svuint32_t test_svld1uh_vnum_u32(svbool_t pg, const uint16_t *base, int64_t vnum
 // CHECK-LABEL: @test_svld1uh_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 2 x i16> [[TMP2]] to <vscale x 2 x i64>
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 2 x i16> [[TMP4]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1uh_vnum_u64u10__SVBool_tPKtl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 2 x i16> [[TMP2]] to <vscale x 2 x i64>
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 2 x i16> [[TMP4]] to <vscale x 2 x i64>
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 svuint64_t test_svld1uh_vnum_u64(svbool_t pg, const uint16_t *base, int64_t vnum)
 {
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c
index bb1d3b56750f..41afe4fe6b4b 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1uw.c
@@ -55,18 +55,24 @@ svuint64_t test_svld1uw_u64(svbool_t pg, const uint32_t *base)
 // CHECK-LABEL: @test_svld1uw_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 2 x i32> [[TMP2]] to <vscale x 2 x i64>
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 2 x i32> [[TMP4]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1uw_vnum_s64u10__SVBool_tPKjl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 2 x i32> [[TMP2]] to <vscale x 2 x i64>
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 2 x i32> [[TMP4]] to <vscale x 2 x i64>
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 svint64_t test_svld1uw_vnum_s64(svbool_t pg, const uint32_t *base, int64_t vnum)
 {
@@ -76,18 +82,24 @@ svint64_t test_svld1uw_vnum_s64(svbool_t pg, const uint32_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld1uw_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
-// CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 2 x i32> [[TMP2]] to <vscale x 2 x i64>
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 2 x i32> [[TMP4]] to <vscale x 2 x i64>
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1uw_vnum_u64u10__SVBool_tPKjl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = zext <vscale x 2 x i32> [[TMP2]] to <vscale x 2 x i64>
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = zext <vscale x 2 x i32> [[TMP4]] to <vscale x 2 x i64>
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP5]]
 //
 svuint64_t test_svld1uw_vnum_u64(svbool_t pg, const uint32_t *base, int64_t vnum)
 {
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c
index af416239a010..49acb38a2c96 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c
@@ -35,15 +35,21 @@ void test_svst1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data)
 // CHECK-LABEL: @test_svst1_vnum_bf16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst1_vnum_bf16u10__SVBool_tPu6__bf16lu14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16_t data)
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1.c
index 04d8bfa2ea06..7adaece58c5f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1.c
@@ -199,14 +199,20 @@ void test_svst1_f64(svbool_t pg, float64_t *base, svfloat64_t data)
 
 // CHECK-LABEL: @test_svst1_vnum_s8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP0]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z18test_svst1_vnum_s8u10__SVBool_tPalu10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP0]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8_t data)
@@ -217,15 +223,21 @@ void test_svst1_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8_t data)
 // CHECK-LABEL: @test_svst1_vnum_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_s16u10__SVBool_tPslu11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16_t data)
@@ -236,15 +248,21 @@ void test_svst1_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16_t dat
 // CHECK-LABEL: @test_svst1_vnum_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_s32u10__SVBool_tPilu11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32_t data)
@@ -255,15 +273,21 @@ void test_svst1_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32_t dat
 // CHECK-LABEL: @test_svst1_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_s64u10__SVBool_tPllu11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64_t data)
@@ -273,14 +297,20 @@ void test_svst1_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64_t dat
 
 // CHECK-LABEL: @test_svst1_vnum_u8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP0]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z18test_svst1_vnum_u8u10__SVBool_tPhlu11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP0]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8_t data)
@@ -291,15 +321,21 @@ void test_svst1_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8_t data
 // CHECK-LABEL: @test_svst1_vnum_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_u16u10__SVBool_tPtlu12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16_t data)
@@ -310,15 +346,21 @@ void test_svst1_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16_t d
 // CHECK-LABEL: @test_svst1_vnum_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_u32u10__SVBool_tPjlu12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32_t data)
@@ -329,15 +371,21 @@ void test_svst1_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32_t d
 // CHECK-LABEL: @test_svst1_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_u64u10__SVBool_tPmlu12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64_t data)
@@ -348,15 +396,21 @@ void test_svst1_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64_t d
 // CHECK-LABEL: @test_svst1_vnum_f16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8f16.p0(<vscale x 8 x half> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8f16.p0(<vscale x 8 x half> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_f16u10__SVBool_tPDhlu13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv8f16.p0(<vscale x 8 x half> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv8f16.p0(<vscale x 8 x half> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16_t data)
@@ -367,15 +421,21 @@ void test_svst1_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16_t
 // CHECK-LABEL: @test_svst1_vnum_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_f32u10__SVBool_tPflu13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32_t data)
@@ -386,15 +446,21 @@ void test_svst1_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32_t
 // CHECK-LABEL: @test_svst1_vnum_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_f64u10__SVBool_tPdlu13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[DATA:%.*]], ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64_t data)
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1b.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1b.c
index 5694f09bb388..7500c7718e25 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1b.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1b.c
@@ -86,9 +86,12 @@ void test_svst1b_u64(svbool_t pg, uint8_t *base, svuint64_t data)
 // CHECK-LABEL: @test_svst1b_vnum_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = trunc <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x i8>
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP2]], ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x i8>
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 void test_svst1b_vnum_s16(svbool_t pg, int8_t *base, int64_t vnum, svint16_t data)
@@ -99,9 +102,12 @@ void test_svst1b_vnum_s16(svbool_t pg, int8_t *base, int64_t vnum, svint16_t dat
 // CHECK-LABEL: @test_svst1b_vnum_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i8>
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP2]], ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i8>
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 void test_svst1b_vnum_s32(svbool_t pg, int8_t *base, int64_t vnum, svint32_t data)
@@ -112,9 +118,12 @@ void test_svst1b_vnum_s32(svbool_t pg, int8_t *base, int64_t vnum, svint32_t dat
 // CHECK-LABEL: @test_svst1b_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i8>
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP2]], ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i8>
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 void test_svst1b_vnum_s64(svbool_t pg, int8_t *base, int64_t vnum, svint64_t data)
@@ -125,9 +134,12 @@ void test_svst1b_vnum_s64(svbool_t pg, int8_t *base, int64_t vnum, svint64_t dat
 // CHECK-LABEL: @test_svst1b_vnum_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = trunc <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x i8>
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP2]], ptr [[TMP1]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x i8>
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 void test_svst1b_vnum_u16(svbool_t pg, uint8_t *base, int64_t vnum, svuint16_t data)
@@ -138,9 +150,12 @@ void test_svst1b_vnum_u16(svbool_t pg, uint8_t *base, int64_t vnum, svuint16_t d
 // CHECK-LABEL: @test_svst1b_vnum_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i8>
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP2]], ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i8>
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 void test_svst1b_vnum_u32(svbool_t pg, uint8_t *base, int64_t vnum, svuint32_t data)
@@ -151,9 +166,12 @@ void test_svst1b_vnum_u32(svbool_t pg, uint8_t *base, int64_t vnum, svuint32_t d
 // CHECK-LABEL: @test_svst1b_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i8>
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP2]], ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i8>
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 void test_svst1b_vnum_u64(svbool_t pg, uint8_t *base, int64_t vnum, svuint64_t data)
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1h.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1h.c
index e36788f22e71..7394b9c0fe54 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1h.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1h.c
@@ -62,9 +62,12 @@ void test_svst1h_u64(svbool_t pg, uint16_t *base, svuint64_t data)
 // CHECK-LABEL: @test_svst1h_vnum_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i16>
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i16.p0(<vscale x 4 x i16> [[TMP2]], ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i16>
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i16.p0(<vscale x 4 x i16> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 void test_svst1h_vnum_s32(svbool_t pg, int16_t *base, int64_t vnum, svint32_t data)
@@ -75,9 +78,12 @@ void test_svst1h_vnum_s32(svbool_t pg, int16_t *base, int64_t vnum, svint32_t da
 // CHECK-LABEL: @test_svst1h_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i16>
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> [[TMP2]], ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i16>
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 void test_svst1h_vnum_s64(svbool_t pg, int16_t *base, int64_t vnum, svint64_t data)
@@ -88,9 +94,12 @@ void test_svst1h_vnum_s64(svbool_t pg, int16_t *base, int64_t vnum, svint64_t da
 // CHECK-LABEL: @test_svst1h_vnum_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i16>
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i16.p0(<vscale x 4 x i16> [[TMP2]], ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i16>
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i16.p0(<vscale x 4 x i16> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 void test_svst1h_vnum_u32(svbool_t pg, uint16_t *base, int64_t vnum, svuint32_t data)
@@ -101,9 +110,12 @@ void test_svst1h_vnum_u32(svbool_t pg, uint16_t *base, int64_t vnum, svuint32_t
 // CHECK-LABEL: @test_svst1h_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i16>
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> [[TMP2]], ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i16>
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 void test_svst1h_vnum_u64(svbool_t pg, uint16_t *base, int64_t vnum, svuint64_t data)
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1w.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1w.c
index 841f6926f1e1..9dfa096552eb 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1w.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1w.c
@@ -39,9 +39,12 @@ void test_svst1w_u64(svbool_t pg, uint32_t *base, svuint64_t data)
 // CHECK-LABEL: @test_svst1w_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i32>
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP2]], ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i32>
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 void test_svst1w_vnum_s64(svbool_t pg, int32_t *base, int64_t vnum, svint64_t data)
@@ -52,9 +55,12 @@ void test_svst1w_vnum_s64(svbool_t pg, int32_t *base, int64_t vnum, svint64_t da
 // CHECK-LABEL: @test_svst1w_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i32>
-// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP2]], ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i32>
+// CHECK-NEXT:    tail call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT:    ret void
 //
 void test_svst1w_vnum_u64(svbool_t pg, uint32_t *base, int64_t vnum, svuint64_t data)
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fp_reduce.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fp_reduce.c
index e58cf4e49a37..9d5ffdafe866 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fp_reduce.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fp_reduce.c
@@ -20,13 +20,13 @@
 // CHECK-LABEL: @test_svaddqv_f16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.addqv.v8f16.nxv8f16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[OP:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.faddqv.v8f16.nxv8f16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[OP:%.*]])
 // CHECK-NEXT:    ret <8 x half> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svaddqv_f16u10__SVBool_tu13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.addqv.v8f16.nxv8f16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[OP:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.faddqv.v8f16.nxv8f16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <8 x half> [[TMP1]]
 //
 float16x8_t test_svaddqv_f16(svbool_t pg, svfloat16_t op)
@@ -37,13 +37,13 @@ float16x8_t test_svaddqv_f16(svbool_t pg, svfloat16_t op)
 // CHECK-LABEL: @test_svaddqv_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.addqv.v4f32.nxv4f32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[OP:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.faddqv.v4f32.nxv4f32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[OP:%.*]])
 // CHECK-NEXT:    ret <4 x float> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svaddqv_f32u10__SVBool_tu13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.addqv.v4f32.nxv4f32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[OP:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.faddqv.v4f32.nxv4f32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <4 x float> [[TMP1]]
 //
 float32x4_t test_svaddqv_f32(svbool_t pg, svfloat32_t op)
@@ -54,13 +54,13 @@ float32x4_t test_svaddqv_f32(svbool_t pg, svfloat32_t op)
 // CHECK-LABEL: @test_svaddqv_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.addqv.v2f64.nxv2f64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[OP:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.faddqv.v2f64.nxv2f64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[OP:%.*]])
 // CHECK-NEXT:    ret <2 x double> [[TMP1]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svaddqv_f64u10__SVBool_tu13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.addqv.v2f64.nxv2f64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[OP:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.faddqv.v2f64.nxv2f64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[OP:%.*]])
 // CPP-CHECK-NEXT:    ret <2 x double> [[TMP1]]
 //
 float64x2_t test_svaddqv_f64(svbool_t pg, svfloat64_t op)
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c
index 7657165d8b3f..28373bd80177 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c
@@ -624,23 +624,29 @@ svfloat64x4_t test_svld1_f64_x4(svcount_t pn, const float64_t *base) ATTR
 
 // CHECK-LABEL: @test_svld1_vnum_u8_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1_vnum_u8_x2u11__SVCount_tPKhl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP7]]
 //
 svuint8x2_t test_svld1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum) ATTR
 {
@@ -649,23 +655,29 @@ svuint8x2_t test_svld1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnu
 
 // CHECK-LABEL: @test_svld1_vnum_u16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u16_x2u11__SVCount_tPKtl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP7]]
 //
 svuint16x2_t test_svld1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum) ATTR
 {
@@ -674,23 +686,29 @@ svuint16x2_t test_svld1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t
 
 // CHECK-LABEL: @test_svld1_vnum_u32_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u32_x2u11__SVCount_tPKjl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP7]]
 //
 svuint32x2_t test_svld1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t vnum) ATTR
 {
@@ -699,23 +717,29 @@ svuint32x2_t test_svld1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t
 
 // CHECK-LABEL: @test_svld1_vnum_u64_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u64_x2u11__SVCount_tPKml(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP7]]
 //
 svuint64x2_t test_svld1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum) ATTR
 {
@@ -724,31 +748,37 @@ svuint64x2_t test_svld1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t
 
 // CHECK-LABEL: @test_svld1_vnum_u8_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
-// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1_vnum_u8_x4u11__SVCount_tPKhl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
-// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP11]]
 //
 svuint8x4_t test_svld1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum) ATTR
 {
@@ -757,31 +787,37 @@ svuint8x4_t test_svld1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnu
 
 // CHECK-LABEL: @test_svld1_vnum_u16_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
-// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 16)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP9]], <vscale x 8 x i16> [[TMP10]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u16_x4u11__SVCount_tPKtl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP9]], <vscale x 8 x i16> [[TMP10]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP11]]
 //
 svuint16x4_t test_svld1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum) ATTR
 {
@@ -790,31 +826,37 @@ svuint16x4_t test_svld1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t
 
 // CHECK-LABEL: @test_svld1_vnum_u32_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
-// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 8)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP10]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u32_x4u11__SVCount_tPKjl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP10]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP11]]
 //
 svuint32x4_t test_svld1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum) ATTR
 {
@@ -823,31 +865,37 @@ svuint32x4_t test_svld1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t
 
 // CHECK-LABEL: @test_svld1_vnum_u64_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
-// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 4)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP10]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_u64_x4u11__SVCount_tPKml(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP10]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP11]]
 //
 svuint64x4_t test_svld1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum) ATTR
 {
@@ -856,23 +904,29 @@ svuint64x4_t test_svld1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t
 
 // CHECK-LABEL: @test_svld1_vnum_s8_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1_vnum_s8_x2u11__SVCount_tPKal(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP7]]
 //
 svint8x2_t test_svld1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum) ATTR
 {
@@ -881,23 +935,29 @@ svint8x2_t test_svld1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum)
 
 // CHECK-LABEL: @test_svld1_vnum_s16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s16_x2u11__SVCount_tPKsl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP7]]
 //
 svint16x2_t test_svld1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum) ATTR
 {
@@ -906,23 +966,29 @@ svint16x2_t test_svld1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vn
 
 // CHECK-LABEL: @test_svld1_vnum_s32_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s32_x2u11__SVCount_tPKil(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP7]]
 //
 svint32x2_t test_svld1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum) ATTR
 {
@@ -931,23 +997,29 @@ svint32x2_t test_svld1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vn
 
 // CHECK-LABEL: @test_svld1_vnum_s64_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s64_x2u11__SVCount_tPKll(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP7]]
 //
 svint64x2_t test_svld1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum) ATTR
 {
@@ -956,31 +1028,37 @@ svint64x2_t test_svld1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vn
 
 // CHECK-LABEL: @test_svld1_vnum_s8_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
-// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld1_vnum_s8_x4u11__SVCount_tPKal(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
-// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP11]]
 //
 svint8x4_t test_svld1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum) ATTR
 {
@@ -989,31 +1067,37 @@ svint8x4_t test_svld1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum)
 
 // CHECK-LABEL: @test_svld1_vnum_s16_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
-// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 16)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP9]], <vscale x 8 x i16> [[TMP10]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s16_x4u11__SVCount_tPKsl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP9]], <vscale x 8 x i16> [[TMP10]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP11]]
 //
 svint16x4_t test_svld1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum) ATTR
 {
@@ -1022,31 +1106,37 @@ svint16x4_t test_svld1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vn
 
 // CHECK-LABEL: @test_svld1_vnum_s32_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
-// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 8)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP10]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s32_x4u11__SVCount_tPKil(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP10]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP11]]
 //
 svint32x4_t test_svld1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum) ATTR
 {
@@ -1055,31 +1145,37 @@ svint32x4_t test_svld1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vn
 
 // CHECK-LABEL: @test_svld1_vnum_s64_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
-// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 4)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP10]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_s64_x4u11__SVCount_tPKll(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP10]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP11]]
 //
 svint64x4_t test_svld1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum) ATTR
 {
@@ -1088,23 +1184,29 @@ svint64x4_t test_svld1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vn
 
 // CHECK-LABEL: @test_svld1_vnum_f16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
-// CHECK-NEXT:    ret <vscale x 16 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x half> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f16_x2u11__SVCount_tPKDhl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP7]]
 //
 svfloat16x2_t test_svld1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum) ATTR
 {
@@ -1113,23 +1215,29 @@ svfloat16x2_t test_svld1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_
 
 // CHECK-LABEL: @test_svld1_vnum_f32_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
-// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f32_x2u11__SVCount_tPKfl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP7]]
 //
 svfloat32x2_t test_svld1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum) ATTR
 {
@@ -1138,23 +1246,29 @@ svfloat32x2_t test_svld1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_
 
 // CHECK-LABEL: @test_svld1_vnum_f64_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
-// CHECK-NEXT:    ret <vscale x 4 x double> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x double> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f64_x2u11__SVCount_tPKdl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
-// CPP-CHECK-NEXT:    ret <vscale x 4 x double> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x double> [[TMP7]]
 //
 svfloat64x2_t test_svld1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum) ATTR
 {
@@ -1163,31 +1277,37 @@ svfloat64x2_t test_svld1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_
 
 // CHECK-LABEL: @test_svld1_vnum_f16_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 16)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 24)
-// CHECK-NEXT:    ret <vscale x 32 x half> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 8)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 16)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP9]], <vscale x 8 x half> [[TMP10]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x half> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f16_x4u11__SVCount_tPKDhl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 24)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x half> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP9]], <vscale x 8 x half> [[TMP10]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x half> [[TMP11]]
 //
 svfloat16x4_t test_svld1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum) ATTR
 {
@@ -1196,31 +1316,37 @@ svfloat16x4_t test_svld1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_
 
 // CHECK-LABEL: @test_svld1_vnum_f32_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 8)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 12)
-// CHECK-NEXT:    ret <vscale x 16 x float> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 8)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP9]], <vscale x 4 x float> [[TMP10]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x float> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f32_x4u11__SVCount_tPKfl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 12)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x float> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP9]], <vscale x 4 x float> [[TMP10]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x float> [[TMP11]]
 //
 svfloat32x4_t test_svld1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum) ATTR
 {
@@ -1229,31 +1355,37 @@ svfloat32x4_t test_svld1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_
 
 // CHECK-LABEL: @test_svld1_vnum_f64_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 4)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 6)
-// CHECK-NEXT:    ret <vscale x 8 x double> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 2)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 4)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP9]], <vscale x 2 x double> [[TMP10]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x double> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z22test_svld1_vnum_f64_x4u11__SVCount_tPKdl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 6)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x double> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP9]], <vscale x 2 x double> [[TMP10]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x double> [[TMP11]]
 //
 svfloat64x4_t test_svld1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum) ATTR
 {
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c
index 16361ecc987d..1cc8c7e469de 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1_single.c
@@ -45,17 +45,21 @@ svuint32_t test_svld1uwq_u32(svbool_t pred, uint32_t const * base) {
 // CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 -8
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i64 [[TMP1]], -32
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z22test_svld1uwq_vnum_u32u10__SVBool_tPKj
 // CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 -8
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i64 [[TMP1]], -32
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
 //
 svuint32_t test_svld1uwq_vnum_u32(svbool_t pred, uint32_t const * base) {
   return SVE_ACLE_FUNC(svld1uwq_vnum, _u32, , )(pred, base, -8);
@@ -83,17 +87,21 @@ svint32_t test_svld1uwq_s32(svbool_t pred, int32_t const * base) {
 // CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 7
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul nuw nsw i64 [[TMP1]], 28
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z22test_svld1uwq_vnum_s32u10__SVBool_tPKi
 // CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 7
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul nuw nsw i64 [[TMP1]], 28
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
 //
 svint32_t test_svld1uwq_vnum_s32(svbool_t pred, int32_t const * base) {
   return SVE_ACLE_FUNC(svld1uwq_vnum, _s32, , )(pred, base, 7);
@@ -121,17 +129,21 @@ svfloat32_t test_svld1uwq_f32(svbool_t pred, float32_t const * base) {
 // CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x float>, ptr [[BASE]], i64 -8
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i64 [[TMP1]], -32
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
+// CHECK-NEXT:    ret <vscale x 4 x float> [[TMP3]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z22test_svld1uwq_vnum_f32u10__SVBool_tPKf
 // CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x float>, ptr [[BASE]], i64 -8
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i64 [[TMP1]], -32
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.ld1uwq.nxv4f32(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP3]]
 //
 svfloat32_t test_svld1uwq_vnum_f32(svbool_t pred, float32_t const * base) {
   return SVE_ACLE_FUNC(svld1uwq_vnum, _f32, , )(pred, base, -8);
@@ -162,17 +174,21 @@ svuint64_t test_svld1udq_u64(svbool_t pred, uint64_t const * base) {
 // CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 7
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul nuw nsw i64 [[TMP1]], 56
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z22test_svld1udq_vnum_u64u10__SVBool_tPKm
 // CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 7
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul nuw nsw i64 [[TMP1]], 56
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
 //
 svuint64_t test_svld1udq_vnum_u64(svbool_t pred, uint64_t const * base) {
   return SVE_ACLE_FUNC(svld1udq_vnum, _u64, , )(pred, base, 7);
@@ -200,17 +216,21 @@ svint64_t test_svld1udq_s64(svbool_t pred, int64_t const * base) {
 // CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 -8
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i64 [[TMP1]], -64
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z22test_svld1udq_vnum_s64u10__SVBool_tPKl
 // CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 -8
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i64 [[TMP1]], -64
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
 //
 svint64_t test_svld1udq_vnum_s64(svbool_t pred, int64_t const * base) {
   return SVE_ACLE_FUNC(svld1udq_vnum, _s64, , )(pred, base, -8);
@@ -238,17 +258,21 @@ svfloat64_t test_svld1udq_f64(svbool_t pred, float64_t const * base) {
 // CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x double>, ptr [[BASE]], i64 7
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1udq.nxv2f64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul nuw nsw i64 [[TMP1]], 56
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1udq.nxv2f64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
+// CHECK-NEXT:    ret <vscale x 2 x double> [[TMP3]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 2 x double> @_Z22test_svld1udq_vnum_f64u10__SVBool_tPKd
 // CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x double>, ptr [[BASE]], i64 7
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1udq.nxv2f64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul nuw nsw i64 [[TMP1]], 56
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.ld1udq.nxv2f64(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP3]]
 //
 svfloat64_t test_svld1udq_vnum_f64(svbool_t pred, float64_t const * base) {
   return SVE_ACLE_FUNC(svld1udq_vnum, _f64, , )(pred, base, 7);
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c
index 3f61cc3de139..668f2b8b8113 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c
@@ -622,23 +622,29 @@ svfloat64x4_t test_svldnt1_f64_x4(svcount_t pn, const float64_t *base) ATTR
 
 // CHECK-LABEL: @test_svldnt1_vnum_u8_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_u8_x2u11__SVCount_tPKhl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP7]]
 //
 svuint8x2_t test_svldnt1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum) ATTR
 {
@@ -647,23 +653,29 @@ svuint8x2_t test_svldnt1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t v
 
 // CHECK-LABEL: @test_svldnt1_vnum_u16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u16_x2u11__SVCount_tPKtl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP7]]
 //
 svuint16x2_t test_svldnt1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum) ATTR
 {
@@ -672,23 +684,29 @@ svuint16x2_t test_svldnt1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_
 
 // CHECK-LABEL: @test_svldnt1_vnum_u32_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u32_x2u11__SVCount_tPKjl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP7]]
 //
 svuint32x2_t test_svldnt1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t vnum) ATTR
 {
@@ -697,23 +715,29 @@ svuint32x2_t test_svldnt1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_
 
 // CHECK-LABEL: @test_svldnt1_vnum_u64_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u64_x2u11__SVCount_tPKml(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP7]]
 //
 svuint64x2_t test_svldnt1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum) ATTR
 {
@@ -722,31 +746,37 @@ svuint64x2_t test_svldnt1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_
 
 // CHECK-LABEL: @test_svldnt1_vnum_u8_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
-// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_u8_x4u11__SVCount_tPKhl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
-// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP11]]
 //
 svuint8x4_t test_svldnt1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum) ATTR
 {
@@ -755,31 +785,37 @@ svuint8x4_t test_svldnt1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t v
 
 // CHECK-LABEL: @test_svldnt1_vnum_u16_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
-// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 16)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP9]], <vscale x 8 x i16> [[TMP10]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u16_x4u11__SVCount_tPKtl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP9]], <vscale x 8 x i16> [[TMP10]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP11]]
 //
 svuint16x4_t test_svldnt1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum) ATTR
 {
@@ -788,31 +824,37 @@ svuint16x4_t test_svldnt1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_
 
 // CHECK-LABEL: @test_svldnt1_vnum_u32_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
-// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 8)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP10]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u32_x4u11__SVCount_tPKjl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP10]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP11]]
 //
 svuint32x4_t test_svldnt1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum) ATTR
 {
@@ -821,31 +863,37 @@ svuint32x4_t test_svldnt1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_
 
 // CHECK-LABEL: @test_svldnt1_vnum_u64_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
-// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 4)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP10]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_u64_x4u11__SVCount_tPKml(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP10]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP11]]
 //
 svuint64x4_t test_svldnt1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum) ATTR
 {
@@ -854,23 +902,29 @@ svuint64x4_t test_svldnt1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_
 
 // CHECK-LABEL: @test_svldnt1_vnum_s8_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_s8_x2u11__SVCount_tPKal(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP7]]
 //
 svint8x2_t test_svldnt1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum) ATTR
 {
@@ -879,23 +933,29 @@ svint8x2_t test_svldnt1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnu
 
 // CHECK-LABEL: @test_svldnt1_vnum_s16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s16_x2u11__SVCount_tPKsl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP7]]
 //
 svint16x2_t test_svldnt1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum) ATTR
 {
@@ -904,23 +964,29 @@ svint16x2_t test_svldnt1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t
 
 // CHECK-LABEL: @test_svldnt1_vnum_s32_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s32_x2u11__SVCount_tPKil(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP7]]
 //
 svint32x2_t test_svldnt1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum) ATTR
 {
@@ -929,23 +995,29 @@ svint32x2_t test_svldnt1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t
 
 // CHECK-LABEL: @test_svldnt1_vnum_s64_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s64_x2u11__SVCount_tPKll(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP7]]
 //
 svint64x2_t test_svldnt1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum) ATTR
 {
@@ -954,31 +1026,37 @@ svint64x2_t test_svldnt1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t
 
 // CHECK-LABEL: @test_svldnt1_vnum_s8_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
-// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z23test_svldnt1_vnum_s8_x4u11__SVCount_tPKal(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
-// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP11]]
 //
 svint8x4_t test_svldnt1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum) ATTR
 {
@@ -987,31 +1065,37 @@ svint8x4_t test_svldnt1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnu
 
 // CHECK-LABEL: @test_svldnt1_vnum_s16_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
-// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 16)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP9]], <vscale x 8 x i16> [[TMP10]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s16_x4u11__SVCount_tPKsl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP9]], <vscale x 8 x i16> [[TMP10]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP11]]
 //
 svint16x4_t test_svldnt1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum) ATTR
 {
@@ -1020,31 +1104,37 @@ svint16x4_t test_svldnt1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t
 
 // CHECK-LABEL: @test_svldnt1_vnum_s32_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
-// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 8)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP10]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s32_x4u11__SVCount_tPKil(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP10]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP11]]
 //
 svint32x4_t test_svldnt1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum) ATTR
 {
@@ -1053,31 +1143,37 @@ svint32x4_t test_svldnt1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t
 
 // CHECK-LABEL: @test_svldnt1_vnum_s64_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
-// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 4)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP10]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_s64_x4u11__SVCount_tPKll(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP6]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP10]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP11]]
 //
 svint64x4_t test_svldnt1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum) ATTR
 {
@@ -1086,23 +1182,29 @@ svint64x4_t test_svldnt1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t
 
 // CHECK-LABEL: @test_svldnt1_vnum_f16_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
-// CHECK-NEXT:    ret <vscale x 16 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x half> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f16_x2u11__SVCount_tPKDhl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP7]]
 //
 svfloat16x2_t test_svldnt1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum) ATTR
 {
@@ -1111,23 +1213,29 @@ svfloat16x2_t test_svldnt1_vnum_f16_x2(svcount_t pn, const float16_t *base, int6
 
 // CHECK-LABEL: @test_svldnt1_vnum_f32_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
-// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f32_x2u11__SVCount_tPKfl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP7]]
 //
 svfloat32x2_t test_svldnt1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum) ATTR
 {
@@ -1136,23 +1244,29 @@ svfloat32x2_t test_svldnt1_vnum_f32_x2(svcount_t pn, const float32_t *base, int6
 
 // CHECK-LABEL: @test_svldnt1_vnum_f64_x2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
-// CHECK-NEXT:    ret <vscale x 4 x double> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x double> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f64_x2u11__SVCount_tPKdl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
-// CPP-CHECK-NEXT:    ret <vscale x 4 x double> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x double> [[TMP7]]
 //
 svfloat64x2_t test_svldnt1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum) ATTR
 {
@@ -1161,31 +1275,37 @@ svfloat64x2_t test_svldnt1_vnum_f64_x2(svcount_t pn, const float64_t *base, int6
 
 // CHECK-LABEL: @test_svldnt1_vnum_f16_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 16)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 24)
-// CHECK-NEXT:    ret <vscale x 32 x half> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 8)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 16)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP9]], <vscale x 8 x half> [[TMP10]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x half> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f16_x4u11__SVCount_tPKDhl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 24)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x half> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP9]], <vscale x 8 x half> [[TMP10]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x half> [[TMP11]]
 //
 svfloat16x4_t test_svldnt1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum) ATTR
 {
@@ -1194,31 +1314,37 @@ svfloat16x4_t test_svldnt1_vnum_f16_x4(svcount_t pn, const float16_t *base, int6
 
 // CHECK-LABEL: @test_svldnt1_vnum_f32_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 8)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 12)
-// CHECK-NEXT:    ret <vscale x 16 x float> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 8)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP9]], <vscale x 4 x float> [[TMP10]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x float> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f32_x4u11__SVCount_tPKfl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 12)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x float> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP9]], <vscale x 4 x float> [[TMP10]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x float> [[TMP11]]
 //
 svfloat32x4_t test_svldnt1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum) ATTR
 {
@@ -1227,31 +1353,37 @@ svfloat32x4_t test_svldnt1_vnum_f32_x4(svcount_t pn, const float32_t *base, int6
 
 // CHECK-LABEL: @test_svldnt1_vnum_f64_x4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 4)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 6)
-// CHECK-NEXT:    ret <vscale x 8 x double> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 2)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 4)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP9]], <vscale x 2 x double> [[TMP10]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x double> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z24test_svldnt1_vnum_f64_x4u11__SVCount_tPKdl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 6)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x double> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP9]], <vscale x 2 x double> [[TMP10]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x double> [[TMP11]]
 //
 svfloat64x4_t test_svldnt1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum) ATTR
 {
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
index ae3ddd416f7e..8922b19eed42 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c
@@ -312,23 +312,29 @@ svfloat64x2_t test_svld2q_f64(svbool_t pg, const float64_t *base)
 
 // CHECK-LABEL: @test_svld2q_vnum_u8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld2q_vnum_u8u10__SVBool_tPKhl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP7]]
 //
 svuint8x2_t test_svld2q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum)
 {
@@ -337,23 +343,29 @@ svuint8x2_t test_svld2q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum)
 
 // CHECK-LABEL: @test_svld2q_vnum_s8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP7]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld2q_vnum_s8u10__SVBool_tPKal(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP5]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i8> [[TMP7]]
 //
 svint8x2_t test_svld2q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum)
 {
@@ -362,24 +374,30 @@ svint8x2_t test_svld2q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld2q_vnum_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
-// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP8]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_u16u10__SVBool_tPKtl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP6]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP8]]
 //
 svuint16x2_t test_svld2q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum)
 {
@@ -389,24 +407,30 @@ svuint16x2_t test_svld2q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnu
 // CHECK-LABEL: @test_svld2q_vnum_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
-// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP8]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_s16u10__SVBool_tPKsl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP6]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i16> [[TMP8]]
 //
 svint16x2_t test_svld2q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum)
 {
@@ -416,24 +440,30 @@ svint16x2_t test_svld2q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld2q_vnum_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
-// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP8]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_u32u10__SVBool_tPKjl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP6]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP8]]
 //
 svuint32x2_t test_svld2q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum)
 {
@@ -443,24 +473,30 @@ svuint32x2_t test_svld2q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnu
 // CHECK-LABEL: @test_svld2q_vnum_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
-// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP8]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_s32u10__SVBool_tPKil(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP6]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP8]]
 //
 svint32x2_t test_svld2q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum)
 {
@@ -470,24 +506,30 @@ svint32x2_t test_svld2q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld2q_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
-// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP8]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_u64u10__SVBool_tPKml(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP6]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP8]]
 //
 svuint64x2_t test_svld2q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum)
 {
@@ -497,24 +539,30 @@ svuint64x2_t test_svld2q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnu
 // CHECK-LABEL: @test_svld2q_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
-// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP8]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_s64u10__SVBool_tPKll(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
-// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP6]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP8]]
 //
 svint64x2_t test_svld2q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum)
 {
@@ -524,24 +572,30 @@ svint64x2_t test_svld2q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld2q_vnum_f16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 8)
-// CHECK-NEXT:    ret <vscale x 16 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x half> [[TMP8]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_f16u10__SVBool_tPKDhl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP6]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x half> [[TMP8]]
 //
 svfloat16x2_t test_svld2q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum)
 {
@@ -551,24 +605,30 @@ svfloat16x2_t test_svld2q_vnum_f16(svbool_t pg, const float16_t *base, int64_t v
 // CHECK-LABEL: @test_svld2q_vnum_bf16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
-// CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP8]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld2q_vnum_bf16u10__SVBool_tPKu6__bf16l(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP6]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP8]]
 //
 svbfloat16x2_t test_svld2q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
 {
@@ -578,24 +638,30 @@ svbfloat16x2_t test_svld2q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_
 // CHECK-LABEL: @test_svld2q_vnum_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
-// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP8]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_f32u10__SVBool_tPKfl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP6]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP8]]
 //
 svfloat32x2_t test_svld2q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum)
 {
@@ -605,24 +671,30 @@ svfloat32x2_t test_svld2q_vnum_f32(svbool_t pg, const float32_t *base, int64_t v
 // CHECK-LABEL: @test_svld2q_vnum_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 2)
-// CHECK-NEXT:    ret <vscale x 4 x double> [[TMP6]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 2)
+// CHECK-NEXT:    ret <vscale x 4 x double> [[TMP8]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld2q_vnum_f64u10__SVBool_tPKdl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 2)
-// CPP-CHECK-NEXT:    ret <vscale x 4 x double> [[TMP6]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT:    ret <vscale x 4 x double> [[TMP8]]
 //
 svfloat64x2_t test_svld2q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum)
 {
@@ -975,27 +1047,33 @@ svfloat64x3_t test_svld3q_f64(svbool_t pg, const float64_t *base)
 
 // CHECK-LABEL: @test_svld3q_vnum_u8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CHECK-NEXT:    ret <vscale x 48 x i8> [[TMP7]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CHECK-NEXT:    ret <vscale x 48 x i8> [[TMP9]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld3q_vnum_u8u10__SVBool_tPKhl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CPP-CHECK-NEXT:    ret <vscale x 48 x i8> [[TMP7]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CPP-CHECK-NEXT:    ret <vscale x 48 x i8> [[TMP9]]
 //
 svuint8x3_t test_svld3q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum)
 {
@@ -1004,27 +1082,33 @@ svuint8x3_t test_svld3q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum)
 
 // CHECK-LABEL: @test_svld3q_vnum_s8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CHECK-NEXT:    ret <vscale x 48 x i8> [[TMP7]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CHECK-NEXT:    ret <vscale x 48 x i8> [[TMP9]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld3q_vnum_s8u10__SVBool_tPKal(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CPP-CHECK-NEXT:    ret <vscale x 48 x i8> [[TMP7]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 48 x i8> @llvm.vector.insert.nxv48i8.nxv16i8(<vscale x 48 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CPP-CHECK-NEXT:    ret <vscale x 48 x i8> [[TMP9]]
 //
 svint8x3_t test_svld3q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum)
 {
@@ -1034,28 +1118,34 @@ svint8x3_t test_svld3q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld3q_vnum_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
-// CHECK-NEXT:    ret <vscale x 24 x i16> [[TMP8]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 16)
+// CHECK-NEXT:    ret <vscale x 24 x i16> [[TMP10]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_u16u10__SVBool_tPKtl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
-// CPP-CHECK-NEXT:    ret <vscale x 24 x i16> [[TMP8]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 24 x i16> [[TMP10]]
 //
 svuint16x3_t test_svld3q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum)
 {
@@ -1065,28 +1155,34 @@ svuint16x3_t test_svld3q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnu
 // CHECK-LABEL: @test_svld3q_vnum_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
-// CHECK-NEXT:    ret <vscale x 24 x i16> [[TMP8]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 16)
+// CHECK-NEXT:    ret <vscale x 24 x i16> [[TMP10]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_s16u10__SVBool_tPKsl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
-// CPP-CHECK-NEXT:    ret <vscale x 24 x i16> [[TMP8]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 24 x i16> @llvm.vector.insert.nxv24i16.nxv8i16(<vscale x 24 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 24 x i16> [[TMP10]]
 //
 svint16x3_t test_svld3q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum)
 {
@@ -1096,28 +1192,34 @@ svint16x3_t test_svld3q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld3q_vnum_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
-// CHECK-NEXT:    ret <vscale x 12 x i32> [[TMP8]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CHECK-NEXT:    ret <vscale x 12 x i32> [[TMP10]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_u32u10__SVBool_tPKjl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
-// CPP-CHECK-NEXT:    ret <vscale x 12 x i32> [[TMP8]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 12 x i32> [[TMP10]]
 //
 svuint32x3_t test_svld3q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum)
 {
@@ -1127,28 +1229,34 @@ svuint32x3_t test_svld3q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnu
 // CHECK-LABEL: @test_svld3q_vnum_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
-// CHECK-NEXT:    ret <vscale x 12 x i32> [[TMP8]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CHECK-NEXT:    ret <vscale x 12 x i32> [[TMP10]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_s32u10__SVBool_tPKil(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
-// CPP-CHECK-NEXT:    ret <vscale x 12 x i32> [[TMP8]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 12 x i32> @llvm.vector.insert.nxv12i32.nxv4i32(<vscale x 12 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 12 x i32> [[TMP10]]
 //
 svint32x3_t test_svld3q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum)
 {
@@ -1158,28 +1266,34 @@ svint32x3_t test_svld3q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld3q_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
-// CHECK-NEXT:    ret <vscale x 6 x i64> [[TMP8]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CHECK-NEXT:    ret <vscale x 6 x i64> [[TMP10]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_u64u10__SVBool_tPKml(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
-// CPP-CHECK-NEXT:    ret <vscale x 6 x i64> [[TMP8]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 6 x i64> [[TMP10]]
 //
 svuint64x3_t test_svld3q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum)
 {
@@ -1189,28 +1303,34 @@ svuint64x3_t test_svld3q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnu
 // CHECK-LABEL: @test_svld3q_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
-// CHECK-NEXT:    ret <vscale x 6 x i64> [[TMP8]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CHECK-NEXT:    ret <vscale x 6 x i64> [[TMP10]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_s64u10__SVBool_tPKll(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
-// CPP-CHECK-NEXT:    ret <vscale x 6 x i64> [[TMP8]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 6 x i64> @llvm.vector.insert.nxv6i64.nxv2i64(<vscale x 6 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 6 x i64> [[TMP10]]
 //
 svint64x3_t test_svld3q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum)
 {
@@ -1220,28 +1340,34 @@ svint64x3_t test_svld3q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld3q_vnum_f16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> poison, <vscale x 8 x half> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 8)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 16)
-// CHECK-NEXT:    ret <vscale x 24 x half> [[TMP8]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> poison, <vscale x 8 x half> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 8)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP8]], <vscale x 8 x half> [[TMP9]], i64 16)
+// CHECK-NEXT:    ret <vscale x 24 x half> [[TMP10]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_f16u10__SVBool_tPKDhl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> poison, <vscale x 8 x half> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 16)
-// CPP-CHECK-NEXT:    ret <vscale x 24 x half> [[TMP8]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> poison, <vscale x 8 x half> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 24 x half> @llvm.vector.insert.nxv24f16.nxv8f16(<vscale x 24 x half> [[TMP8]], <vscale x 8 x half> [[TMP9]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 24 x half> [[TMP10]]
 //
 svfloat16x3_t test_svld3q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum)
 {
@@ -1251,28 +1377,34 @@ svfloat16x3_t test_svld3q_vnum_f16(svbool_t pg, const float16_t *base, int64_t v
 // CHECK-LABEL: @test_svld3q_vnum_bf16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 16)
-// CHECK-NEXT:    ret <vscale x 24 x bfloat> [[TMP8]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 16)
+// CHECK-NEXT:    ret <vscale x 24 x bfloat> [[TMP10]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld3q_vnum_bf16u10__SVBool_tPKu6__bf16l(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 16)
-// CPP-CHECK-NEXT:    ret <vscale x 24 x bfloat> [[TMP8]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 24 x bfloat> @llvm.vector.insert.nxv24bf16.nxv8bf16(<vscale x 24 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 16)
+// CPP-CHECK-NEXT:    ret <vscale x 24 x bfloat> [[TMP10]]
 //
 svbfloat16x3_t test_svld3q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
 {
@@ -1282,28 +1414,34 @@ svbfloat16x3_t test_svld3q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_
 // CHECK-LABEL: @test_svld3q_vnum_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 8)
-// CHECK-NEXT:    ret <vscale x 12 x float> [[TMP8]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8)
+// CHECK-NEXT:    ret <vscale x 12 x float> [[TMP10]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_f32u10__SVBool_tPKfl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 8)
-// CPP-CHECK-NEXT:    ret <vscale x 12 x float> [[TMP8]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 12 x float> @llvm.vector.insert.nxv12f32.nxv4f32(<vscale x 12 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 12 x float> [[TMP10]]
 //
 svfloat32x3_t test_svld3q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum)
 {
@@ -1313,28 +1451,34 @@ svfloat32x3_t test_svld3q_vnum_f32(svbool_t pg, const float32_t *base, int64_t v
 // CHECK-LABEL: @test_svld3q_vnum_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> poison, <vscale x 2 x double> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 2)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 4)
-// CHECK-NEXT:    ret <vscale x 6 x double> [[TMP8]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> poison, <vscale x 2 x double> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 2)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP8]], <vscale x 2 x double> [[TMP9]], i64 4)
+// CHECK-NEXT:    ret <vscale x 6 x double> [[TMP10]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld3q_vnum_f64u10__SVBool_tPKdl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> poison, <vscale x 2 x double> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 4)
-// CPP-CHECK-NEXT:    ret <vscale x 6 x double> [[TMP8]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> poison, <vscale x 2 x double> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 6 x double> @llvm.vector.insert.nxv6f64.nxv2f64(<vscale x 6 x double> [[TMP8]], <vscale x 2 x double> [[TMP9]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 6 x double> [[TMP10]]
 //
 svfloat64x3_t test_svld3q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum)
 {
@@ -1704,31 +1848,37 @@ svfloat64x4_t test_svld4q_f64(svbool_t pg, const float64_t *base)
 
 // CHECK-LABEL: @test_svld4q_vnum_u8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
-// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld4q_vnum_u8u10__SVBool_tPKhl(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
-// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP11]]
 //
 svuint8x4_t test_svld4q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum)
 {
@@ -1737,31 +1887,37 @@ svuint8x4_t test_svld4q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum)
 
 // CHECK-LABEL: @test_svld4q_vnum_s8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
-// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
-// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP9]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 3
+// CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP11]]
 //
 // CPP-CHECK-LABEL: @_Z19test_svld4q_vnum_s8u10__SVBool_tPKal(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP0]])
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 0
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP2]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 2
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 32)
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP1]], 3
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
-// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP9]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 0
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP4]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 1
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 2
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 32)
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP3]], 3
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]], i64 48)
+// CPP-CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP11]]
 //
 svint8x4_t test_svld4q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum)
 {
@@ -1770,32 +1926,38 @@ svint8x4_t test_svld4q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld4q_vnum_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
-// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 3
-// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 24)
-// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP10]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 16)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP10]], <vscale x 8 x i16> [[TMP11]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP12]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_u16u10__SVBool_tPKtl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 3
-// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 24)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP10]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP10]], <vscale x 8 x i16> [[TMP11]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP12]]
 //
 svuint16x4_t test_svld4q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum)
 {
@@ -1805,32 +1967,38 @@ svuint16x4_t test_svld4q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnu
 // CHECK-LABEL: @test_svld4q_vnum_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
-// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 3
-// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 24)
-// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP10]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 16)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP10]], <vscale x 8 x i16> [[TMP11]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP12]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_s16u10__SVBool_tPKsl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP2]], 3
-// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 24)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP10]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP8]], <vscale x 8 x i16> [[TMP9]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP10]], <vscale x 8 x i16> [[TMP11]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x i16> [[TMP12]]
 //
 svint16x4_t test_svld4q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum)
 {
@@ -1840,32 +2008,38 @@ svint16x4_t test_svld4q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld4q_vnum_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
-// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 3
-// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 12)
-// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP10]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP12]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_u32u10__SVBool_tPKjl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 3
-// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 12)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP10]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP12]]
 //
 svuint32x4_t test_svld4q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum)
 {
@@ -1875,32 +2049,38 @@ svuint32x4_t test_svld4q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnu
 // CHECK-LABEL: @test_svld4q_vnum_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
-// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 3
-// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 12)
-// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP10]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP12]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_s32u10__SVBool_tPKil(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 3
-// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 12)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP10]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP11]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP12]]
 //
 svint32x4_t test_svld4q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum)
 {
@@ -1910,32 +2090,38 @@ svint32x4_t test_svld4q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld4q_vnum_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
-// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 3
-// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 6)
-// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP10]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP12]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_u64u10__SVBool_tPKml(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 3
-// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 6)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP10]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP12]]
 //
 svuint64x4_t test_svld4q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum)
 {
@@ -1945,32 +2131,38 @@ svuint64x4_t test_svld4q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnu
 // CHECK-LABEL: @test_svld4q_vnum_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
-// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 3
-// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 6)
-// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP10]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP12]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_s64u10__SVBool_tPKll(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP2]], 3
-// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 6)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP10]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP8]], <vscale x 2 x i64> [[TMP9]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x i64> [[TMP12]]
 //
 svint64x4_t test_svld4q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum)
 {
@@ -1980,32 +2172,38 @@ svint64x4_t test_svld4q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum)
 // CHECK-LABEL: @test_svld4q_vnum_f16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 8)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 16)
-// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 3
-// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP8]], <vscale x 8 x half> [[TMP9]], i64 24)
-// CHECK-NEXT:    ret <vscale x 32 x half> [[TMP10]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 8)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP8]], <vscale x 8 x half> [[TMP9]], i64 16)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP10]], <vscale x 8 x half> [[TMP11]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x half> [[TMP12]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_f16u10__SVBool_tPKDhl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP2]], 3
-// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP8]], <vscale x 8 x half> [[TMP9]], i64 24)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x half> [[TMP10]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP8]], <vscale x 8 x half> [[TMP9]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP10]], <vscale x 8 x half> [[TMP11]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x half> [[TMP12]]
 //
 svfloat16x4_t test_svld4q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum)
 {
@@ -2015,32 +2213,38 @@ svfloat16x4_t test_svld4q_vnum_f16(svbool_t pg, const float16_t *base, int64_t v
 // CHECK-LABEL: @test_svld4q_vnum_bf16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 16)
-// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 3
-// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 24)
-// CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP10]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 16)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP10]], <vscale x 8 x bfloat> [[TMP11]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP12]]
 //
 // CPP-CHECK-LABEL: @_Z21test_svld4q_vnum_bf16u10__SVBool_tPKu6__bf16l(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 3
-// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 24)
-// CPP-CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP10]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP10]], <vscale x 8 x bfloat> [[TMP11]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP12]]
 //
 svbfloat16x4_t test_svld4q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
 {
@@ -2050,32 +2254,38 @@ svbfloat16x4_t test_svld4q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_
 // CHECK-LABEL: @test_svld4q_vnum_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 8)
-// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 3
-// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 12)
-// CHECK-NEXT:    ret <vscale x 16 x float> [[TMP10]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12)
+// CHECK-NEXT:    ret <vscale x 16 x float> [[TMP12]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_f32u10__SVBool_tPKfl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 3
-// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 12)
-// CPP-CHECK-NEXT:    ret <vscale x 16 x float> [[TMP10]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x float> [[TMP12]]
 //
 svfloat32x4_t test_svld4q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum)
 {
@@ -2085,32 +2295,38 @@ svfloat32x4_t test_svld4q_vnum_f32(svbool_t pg, const float32_t *base, int64_t v
 // CHECK-LABEL: @test_svld4q_vnum_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 0
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP3]], i64 0)
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 1
-// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 2)
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 2
-// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 4)
-// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 3
-// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP8]], <vscale x 2 x double> [[TMP9]], i64 6)
-// CHECK-NEXT:    ret <vscale x 8 x double> [[TMP10]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 2)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP8]], <vscale x 2 x double> [[TMP9]], i64 4)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP10]], <vscale x 2 x double> [[TMP11]], i64 6)
+// CHECK-NEXT:    ret <vscale x 8 x double> [[TMP12]]
 //
 // CPP-CHECK-LABEL: @_Z20test_svld4q_vnum_f64u10__SVBool_tPKdl(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP1]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 0
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP3]], i64 0)
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 1
-// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 2
-// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP2]], 3
-// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP8]], <vscale x 2 x double> [[TMP9]], i64 6)
-// CPP-CHECK-NEXT:    ret <vscale x 8 x double> [[TMP10]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP2]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 2)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP8]], <vscale x 2 x double> [[TMP9]], i64 4)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP10]], <vscale x 2 x double> [[TMP11]], i64 6)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x double> [[TMP12]]
 //
 svfloat64x4_t test_svld4q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum)
 {
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c
index 7aa994345a8c..9fe716950ddc 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c
@@ -491,16 +491,22 @@ void test_svst1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) ATTR
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V]], i64 16)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z21test_svst1_vnum_u8_x2u11__SVCount_tPhl11svuint8x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v) ATTR
@@ -512,16 +518,22 @@ void test_svst1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V]], i64 8)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u16_x2u11__SVCount_tPtl12svuint16x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v) ATTR
@@ -533,16 +545,22 @@ void test_svst1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V]], i64 4)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u32_x2u11__SVCount_tPjl12svuint32x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v) ATTR
@@ -554,16 +572,22 @@ void test_svst1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V]], i64 2)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u64_x2u11__SVCount_tPml12svuint64x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v) ATTR
@@ -577,8 +601,11 @@ void test_svst1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 16)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 32)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 48)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z21test_svst1_vnum_u8_x4u11__SVCount_tPhl11svuint8x4_t(
@@ -587,8 +614,11 @@ void test_svst1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 32)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 48)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v) ATTR
@@ -602,8 +632,11 @@ void test_svst1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 24)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u16_x4u11__SVCount_tPtl12svuint16x4_t(
@@ -612,8 +645,11 @@ void test_svst1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 24)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v) ATTR
@@ -627,8 +663,11 @@ void test_svst1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 4)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 8)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 12)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u32_x4u11__SVCount_tPjl12svuint32x4_t(
@@ -637,8 +676,11 @@ void test_svst1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 12)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v) ATTR
@@ -652,8 +694,11 @@ void test_svst1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 2)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 4)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 6)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_u64_x4u11__SVCount_tPml12svuint64x4_t(
@@ -662,8 +707,11 @@ void test_svst1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 6)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v) ATTR
@@ -675,16 +723,22 @@ void test_svst1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V]], i64 16)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z21test_svst1_vnum_s8_x2u11__SVCount_tPal10svint8x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v) ATTR
@@ -696,16 +750,22 @@ void test_svst1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V]], i64 8)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s16_x2u11__SVCount_tPsl11svint16x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v) ATTR
@@ -717,16 +777,22 @@ void test_svst1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V]], i64 4)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s32_x2u11__SVCount_tPil11svint32x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v) ATTR
@@ -738,16 +804,22 @@ void test_svst1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V]], i64 2)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s64_x2u11__SVCount_tPll11svint64x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v) ATTR
@@ -761,8 +833,11 @@ void test_svst1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 16)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 32)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 48)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z21test_svst1_vnum_s8_x4u11__SVCount_tPal10svint8x4_t(
@@ -771,8 +846,11 @@ void test_svst1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 32)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 48)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v) ATTR
@@ -786,8 +864,11 @@ void test_svst1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 24)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s16_x4u11__SVCount_tPsl11svint16x4_t(
@@ -796,8 +877,11 @@ void test_svst1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 24)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v) ATTR
@@ -811,8 +895,11 @@ void test_svst1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 4)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 8)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 12)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s32_x4u11__SVCount_tPil11svint32x4_t(
@@ -821,8 +908,11 @@ void test_svst1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 12)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v) ATTR
@@ -836,8 +926,11 @@ void test_svst1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 2)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 4)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 6)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_s64_x4u11__SVCount_tPll11svint64x4_t(
@@ -846,8 +939,11 @@ void test_svst1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 6)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v) ATTR
@@ -860,8 +956,11 @@ void test_svst1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4
 // CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[V]], i64 8)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[CONV]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f16_x2u11__SVCount_tPDhd13svfloat16x2_t(
@@ -869,8 +968,11 @@ void test_svst1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4
 // CPP-CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[V]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[CONV]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v) ATTR
@@ -883,8 +985,11 @@ void test_svst1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svflo
 // CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[V]], i64 4)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[CONV]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f32_x2u11__SVCount_tPfd13svfloat32x2_t(
@@ -892,8 +997,11 @@ void test_svst1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svflo
 // CPP-CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[V]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[CONV]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v) ATTR
@@ -906,8 +1014,11 @@ void test_svst1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svflo
 // CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[V]], i64 2)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[CONV]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f64_x2u11__SVCount_tPdd13svfloat64x2_t(
@@ -915,8 +1026,11 @@ void test_svst1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svflo
 // CPP-CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[V]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[CONV]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v) ATTR
@@ -931,8 +1045,11 @@ void test_svst1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svflo
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[V]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[V]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[V]], i64 24)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[CONV]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f16_x4u11__SVCount_tPDhd13svfloat16x4_t(
@@ -942,8 +1059,11 @@ void test_svst1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svflo
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[V]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[V]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[V]], i64 24)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[CONV]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v) ATTR
@@ -958,8 +1078,11 @@ void test_svst1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svflo
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[V]], i64 4)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[V]], i64 8)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[V]], i64 12)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[CONV]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f32_x4u11__SVCount_tPfd13svfloat32x4_t(
@@ -969,8 +1092,11 @@ void test_svst1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svflo
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[V]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[V]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[V]], i64 12)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[CONV]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v) ATTR
@@ -985,8 +1111,11 @@ void test_svst1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svflo
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 2)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 4)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 6)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[CONV]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z22test_svst1_vnum_f64_x4u11__SVCount_tPdd13svfloat64x4_t(
@@ -996,8 +1125,11 @@ void test_svst1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svflo
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 6)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[CONV]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v) ATTR
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c
index 27f7b8be7f18..677ec8c81d24 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1_single.c
@@ -45,16 +45,20 @@ void test_svst1wq_u32(svbool_t pred, uint32_t const * base, svuint32_t zt) {
 // CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x i32> [[ZT:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 1
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP2]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP3]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: define dso_local void @_Z21test_svst1wq_vnum_u32u10__SVBool_tPKju12__SVUint32_t
 // CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x i32> [[ZT:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 1
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP2]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1wq_vnum_u32(svbool_t pred, uint32_t const * base, svuint32_t zt) {
@@ -83,16 +87,20 @@ void test_svst1wq_s32(svbool_t pred, int32_t const * base, svint32_t zt) {
 // CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x i32> [[ZT:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 1
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP2]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP3]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: define dso_local void @_Z21test_svst1wq_vnum_s32u10__SVBool_tPKiu11__SVInt32_t
 // CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x i32> [[ZT:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i32>, ptr [[BASE]], i64 1
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP2]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1wq.nxv4i32(<vscale x 4 x i32> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1wq_vnum_s32(svbool_t pred, int32_t const * base, svint32_t zt) {
@@ -121,16 +129,20 @@ void test_svst1wq_f32(svbool_t pred, float32_t const * base, svfloat32_t zt) {
 // CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x float> [[ZT:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x float>, ptr [[BASE]], i64 1
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1wq.nxv4f32(<vscale x 4 x float> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP2]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1wq.nxv4f32(<vscale x 4 x float> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP3]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: define dso_local void @_Z21test_svst1wq_vnum_f32u10__SVBool_tPKfu13__SVFloat32_t
 // CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 4 x float> [[ZT:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x float>, ptr [[BASE]], i64 1
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1wq.nxv4f32(<vscale x 4 x float> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP2]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1wq.nxv4f32(<vscale x 4 x float> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP3]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1wq_vnum_f32(svbool_t pred, float32_t const * base, svfloat32_t zt) {
@@ -162,16 +174,20 @@ void test_svst1dq_u64(svbool_t pred, uint64_t const * base, svuint64_t zt) {
 // CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x i64> [[ZT:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 -8
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i64 [[TMP1]], -64
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: define dso_local void @_Z21test_svst1dq_vnum_u64u10__SVBool_tPKmu12__SVUint64_t
 // CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x i64> [[ZT:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 -8
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i64 [[TMP1]], -64
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1dq_vnum_u64(svbool_t pred, uint64_t const * base, svuint64_t zt) {
@@ -200,16 +216,20 @@ void test_svst1dq_s64(svbool_t pred, int64_t const * base, svint64_t zt) {
 // CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x i64> [[ZT:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 -8
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i64 [[TMP1]], -64
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: define dso_local void @_Z21test_svst1dq_vnum_s64u10__SVBool_tPKlu11__SVInt64_t
 // CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x i64> [[ZT:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x i64>, ptr [[BASE]], i64 -8
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i64 [[TMP1]], -64
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1dq.nxv2i64(<vscale x 2 x i64> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1dq_vnum_s64(svbool_t pred, int64_t const * base, svint64_t zt) {
@@ -238,16 +258,20 @@ void test_svst1dq_f64(svbool_t pred, float64_t const * base, svfloat64_t zt) {
 // CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x double> [[ZT:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x double>, ptr [[BASE]], i64 -8
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1dq.nxv2f64(<vscale x 2 x double> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i64 [[TMP1]], -64
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1dq.nxv2f64(<vscale x 2 x double> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: define dso_local void @_Z21test_svst1dq_vnum_f64u10__SVBool_tPKdu13__SVFloat64_t
 // CPP-CHECK-SAME: (<vscale x 16 x i1> [[PRED:%.*]], ptr noundef [[BASE:%.*]], <vscale x 2 x double> [[ZT:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PRED]])
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 1 x double>, ptr [[BASE]], i64 -8
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1dq.nxv2f64(<vscale x 2 x double> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP1]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i64 [[TMP1]], -64
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st1dq.nxv2f64(<vscale x 2 x double> [[ZT]], <vscale x 1 x i1> [[TMP0]], ptr [[TMP2]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst1dq_vnum_f64(svbool_t pred, float64_t const * base, svfloat64_t zt) {
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c
index b1ca27b7b68a..4c7e824ac3fb 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c
@@ -512,16 +512,22 @@ void test_svstnt1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) ATTR
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V]], i64 16)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_u8_x2u11__SVCount_tPhl11svuint8x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v) ATTR
@@ -534,16 +540,22 @@ void test_svstnt1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V]], i64 8)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u16_x2u11__SVCount_tPtl12svuint16x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v) ATTR
@@ -556,16 +568,22 @@ void test_svstnt1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V]], i64 4)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u32_x2u11__SVCount_tPjl12svuint32x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v) ATTR
@@ -578,16 +596,22 @@ void test_svstnt1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V]], i64 2)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u64_x2u11__SVCount_tPml12svuint64x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v) ATTR
@@ -602,8 +626,11 @@ void test_svstnt1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 16)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 32)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 48)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_u8_x4u11__SVCount_tPhl11svuint8x4_t(
@@ -612,8 +639,11 @@ void test_svstnt1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 32)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 48)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v) ATTR
@@ -628,8 +658,11 @@ void test_svstnt1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 24)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u16_x4u11__SVCount_tPtl12svuint16x4_t(
@@ -638,8 +671,11 @@ void test_svstnt1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 24)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v) ATTR
@@ -654,8 +690,11 @@ void test_svstnt1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 4)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 8)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 12)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u32_x4u11__SVCount_tPjl12svuint32x4_t(
@@ -664,8 +703,11 @@ void test_svstnt1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 12)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v) ATTR
@@ -680,8 +722,11 @@ void test_svstnt1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 2)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 4)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 6)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_u64_x4u11__SVCount_tPml12svuint64x4_t(
@@ -690,8 +735,11 @@ void test_svstnt1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 6)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v) ATTR
@@ -704,16 +752,22 @@ void test_svstnt1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V]], i64 16)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_s8_x2u11__SVCount_tPal10svint8x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v) ATTR
@@ -726,16 +780,22 @@ void test_svstnt1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V]], i64 8)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s16_x2u11__SVCount_tPsl11svint16x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[V]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v) ATTR
@@ -748,16 +808,22 @@ void test_svstnt1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V]], i64 4)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s32_x2u11__SVCount_tPil11svint32x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[V]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v) ATTR
@@ -770,16 +836,22 @@ void test_svstnt1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V]], i64 2)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s64_x2u11__SVCount_tPll11svint64x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[V]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v) ATTR
@@ -794,8 +866,11 @@ void test_svstnt1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 16)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 32)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 48)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z23test_svstnt1_vnum_s8_x4u11__SVCount_tPal10svint8x4_t(
@@ -804,8 +879,11 @@ void test_svstnt1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 32)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[V]], i64 48)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v) ATTR
@@ -820,8 +898,11 @@ void test_svstnt1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 24)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s16_x4u11__SVCount_tPsl11svint16x4_t(
@@ -830,8 +911,11 @@ void test_svstnt1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[V]], i64 24)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v) ATTR
@@ -846,8 +930,11 @@ void test_svstnt1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 4)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 8)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 12)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s32_x4u11__SVCount_tPil11svint32x4_t(
@@ -856,8 +943,11 @@ void test_svstnt1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[V]], i64 12)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v) ATTR
@@ -872,8 +962,11 @@ void test_svstnt1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 2)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 4)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 6)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_s64_x4u11__SVCount_tPll11svint64x4_t(
@@ -882,8 +975,11 @@ void test_svstnt1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[V]], i64 6)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[VNUM:%.*]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v) ATTR
@@ -897,8 +993,11 @@ void test_svstnt1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64
 // CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[V]], i64 8)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[CONV]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f16_x2u11__SVCount_tPDhd13svfloat16x2_t(
@@ -906,8 +1005,11 @@ void test_svstnt1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64
 // CPP-CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[V]], i64 8)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[CONV]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v) ATTR
@@ -921,8 +1023,11 @@ void test_svstnt1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svf
 // CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[V]], i64 4)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[CONV]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f32_x2u11__SVCount_tPfd13svfloat32x2_t(
@@ -930,8 +1035,11 @@ void test_svstnt1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svf
 // CPP-CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[V]], i64 4)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[CONV]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v) ATTR
@@ -945,8 +1053,11 @@ void test_svstnt1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svf
 // CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[V:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[V]], i64 2)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[CONV]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f64_x2u11__SVCount_tPdd13svfloat64x2_t(
@@ -954,8 +1065,11 @@ void test_svstnt1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svf
 // CPP-CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[VNUM:%.*]] to i64
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[V:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[V]], i64 2)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[CONV]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v) ATTR
@@ -971,8 +1085,11 @@ void test_svstnt1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svf
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[V]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[V]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[V]], i64 24)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[CONV]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f16_x4u11__SVCount_tPDhd13svfloat16x4_t(
@@ -982,8 +1099,11 @@ void test_svstnt1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svf
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[V]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[V]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[V]], i64 24)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[CONV]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v) ATTR
@@ -999,8 +1119,11 @@ void test_svstnt1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svf
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[V]], i64 4)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[V]], i64 8)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[V]], i64 12)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[CONV]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f32_x4u11__SVCount_tPfd13svfloat32x4_t(
@@ -1010,8 +1133,11 @@ void test_svstnt1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svf
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[V]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[V]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[V]], i64 12)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[CONV]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v) ATTR
@@ -1027,8 +1153,11 @@ void test_svstnt1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svf
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 2)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 4)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 6)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[CONV]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z24test_svstnt1_vnum_f64_x4u11__SVCount_tPdd13svfloat64x4_t(
@@ -1038,8 +1167,11 @@ void test_svstnt1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svf
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[V]], i64 6)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[CONV]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[CONV]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svstnt1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v) ATTR
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
index 2cbea29d3390..bd63065d4e55 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c
@@ -265,16 +265,22 @@ void test_svst2q_f64(svbool_t pg, const float64_t *base, svfloat64x2_t zt)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT]], i64 16)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst2q_vnum_u8u10__SVBool_tPKhl11svuint8x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst2q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8x2_t zt)
@@ -286,16 +292,22 @@ void test_svst2q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT]], i64 16)
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP4]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst2q_vnum_s8u10__SVBool_tPKal10svint8x2_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZT]], i64 16)
-// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP2]])
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP3]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP4]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst2q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x2_t zt)
@@ -308,8 +320,11 @@ void test_svst2q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x2
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP5]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_u16u10__SVBool_tPKtl12svuint16x2_t(
@@ -317,8 +332,11 @@ void test_svst2q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x2
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP5]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst2q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuint16x2_t zt)
@@ -331,8 +349,11 @@ void test_svst2q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuin
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP5]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_s16u10__SVBool_tPKsl11svint16x2_t(
@@ -340,8 +361,11 @@ void test_svst2q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuin
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP5]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst2q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint16x2_t zt)
@@ -354,8 +378,11 @@ void test_svst2q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint1
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT]], i64 4)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i1> [[TMP2]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i1> [[TMP2]], ptr [[TMP5]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_u32u10__SVBool_tPKjl12svuint32x2_t(
@@ -363,8 +390,11 @@ void test_svst2q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint1
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i1> [[TMP2]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i1> [[TMP2]], ptr [[TMP5]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst2q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuint32x2_t zt)
@@ -377,8 +407,11 @@ void test_svst2q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuin
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT]], i64 4)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i1> [[TMP2]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i1> [[TMP2]], ptr [[TMP5]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_s32u10__SVBool_tPKil11svint32x2_t(
@@ -386,8 +419,11 @@ void test_svst2q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuin
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZT]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i1> [[TMP2]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i1> [[TMP2]], ptr [[TMP5]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst2q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint32x2_t zt)
@@ -400,8 +436,11 @@ void test_svst2q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint3
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT]], i64 2)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> [[TMP2]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> [[TMP2]], ptr [[TMP5]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_u64u10__SVBool_tPKml12svuint64x2_t(
@@ -409,8 +448,11 @@ void test_svst2q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint3
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT]], i64 2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> [[TMP2]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> [[TMP2]], ptr [[TMP5]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst2q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuint64x2_t zt)
@@ -423,8 +465,11 @@ void test_svst2q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuin
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT]], i64 2)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> [[TMP2]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> [[TMP2]], ptr [[TMP5]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_s64u10__SVBool_tPKll11svint64x2_t(
@@ -432,8 +477,11 @@ void test_svst2q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuin
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[ZT]], i64 2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> [[TMP2]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> [[TMP2]], ptr [[TMP5]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst2q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint64x2_t zt)
@@ -446,8 +494,11 @@ void test_svst2q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint6
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP5]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_f16u10__SVBool_tPKDhl13svfloat16x2_t(
@@ -455,8 +506,11 @@ void test_svst2q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint6
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP5]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst2q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfloat16x2_t zt)
@@ -469,8 +523,11 @@ void test_svst2q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfl
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP5]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z21test_svst2q_vnum_bf16u10__SVBool_tPKu6__bf16l14svbfloat16x2_t(
@@ -478,8 +535,11 @@ void test_svst2q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfl
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x i1> [[TMP2]], ptr [[TMP5]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst2q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, svbfloat16x2_t zt)
@@ -492,8 +552,11 @@ void test_svst2q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, sv
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZT]], i64 4)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x i1> [[TMP2]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x i1> [[TMP2]], ptr [[TMP5]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_f32u10__SVBool_tPKfl13svfloat32x2_t(
@@ -501,8 +564,11 @@ void test_svst2q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, sv
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZT]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x i1> [[TMP2]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x i1> [[TMP2]], ptr [[TMP5]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst2q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfloat32x2_t zt)
@@ -515,8 +581,11 @@ void test_svst2q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfl
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZT]], i64 2)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x i1> [[TMP2]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x i1> [[TMP2]], ptr [[TMP5]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst2q_vnum_f64u10__SVBool_tPKdl13svfloat64x2_t(
@@ -524,8 +593,11 @@ void test_svst2q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfl
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZT]], i64 2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x i1> [[TMP2]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x i1> [[TMP2]], ptr [[TMP5]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst2q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum, svfloat64x2_t zt)
@@ -811,8 +883,11 @@ void test_svst3q_f64(svbool_t pg, const float64_t *base, svfloat64x3_t zt)
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 16)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 32)
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP5]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst3q_vnum_u8u10__SVBool_tPKhl11svuint8x3_t(
@@ -820,8 +895,11 @@ void test_svst3q_f64(svbool_t pg, const float64_t *base, svfloat64x3_t zt)
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 32)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP5]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst3q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8x3_t zt)
@@ -834,8 +912,11 @@ void test_svst3q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT:%.*]], i64 0)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 16)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 32)
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP3]])
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP5]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst3q_vnum_s8u10__SVBool_tPKal10svint8x3_t(
@@ -843,8 +924,11 @@ void test_svst3q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT:%.*]], i64 0)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv48i8(<vscale x 48 x i8> [[ZT]], i64 32)
-// CPP-CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP3]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP4]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP5]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst3q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x3_t zt)
@@ -858,8 +942,11 @@ void test_svst3q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x3
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_u16u10__SVBool_tPKtl12svuint16x3_t(
@@ -868,8 +955,11 @@ void test_svst3q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x3
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst3q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuint16x3_t zt)
@@ -883,8 +973,11 @@ void test_svst3q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuin
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_s16u10__SVBool_tPKsl11svint16x3_t(
@@ -893,8 +986,11 @@ void test_svst3q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuin
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv24i16(<vscale x 24 x i16> [[ZT]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst3q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint16x3_t zt)
@@ -908,8 +1004,11 @@ void test_svst3q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint1
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 4)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP3]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP3]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_u32u10__SVBool_tPKjl12svuint32x3_t(
@@ -918,8 +1017,11 @@ void test_svst3q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint1
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP3]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP3]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst3q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuint32x3_t zt)
@@ -933,8 +1035,11 @@ void test_svst3q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuin
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 4)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP3]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP3]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_s32u10__SVBool_tPKil11svint32x3_t(
@@ -943,8 +1048,11 @@ void test_svst3q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuin
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv12i32(<vscale x 12 x i32> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP3]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i1> [[TMP3]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst3q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint32x3_t zt)
@@ -958,8 +1066,11 @@ void test_svst3q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint3
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 2)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 4)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP3]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP3]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_u64u10__SVBool_tPKml12svuint64x3_t(
@@ -968,8 +1079,11 @@ void test_svst3q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint3
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP3]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP3]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst3q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuint64x3_t zt)
@@ -983,8 +1097,11 @@ void test_svst3q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuin
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 2)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 4)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP3]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP3]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_s64u10__SVBool_tPKll11svint64x3_t(
@@ -993,8 +1110,11 @@ void test_svst3q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuin
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv6i64(<vscale x 6 x i64> [[ZT]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP3]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP3]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst3q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint64x3_t zt)
@@ -1008,8 +1128,11 @@ void test_svst3q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint6
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_f16u10__SVBool_tPKDhl13svfloat16x3_t(
@@ -1018,8 +1141,11 @@ void test_svst3q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint6
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv24f16(<vscale x 24 x half> [[ZT]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst3q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfloat16x3_t zt)
@@ -1033,8 +1159,11 @@ void test_svst3q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfl
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z21test_svst3q_vnum_bf16u10__SVBool_tPKu6__bf16l14svbfloat16x3_t(
@@ -1043,8 +1172,11 @@ void test_svst3q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfl
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> [[ZT]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x i1> [[TMP3]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst3q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, svbfloat16x3_t zt)
@@ -1058,8 +1190,11 @@ void test_svst3q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, sv
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT]], i64 4)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x i1> [[TMP3]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x i1> [[TMP3]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_f32u10__SVBool_tPKfl13svfloat32x3_t(
@@ -1068,8 +1203,11 @@ void test_svst3q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, sv
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv12f32(<vscale x 12 x float> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x i1> [[TMP3]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x i1> [[TMP3]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst3q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfloat32x3_t zt)
@@ -1083,8 +1221,11 @@ void test_svst3q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfl
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT]], i64 2)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT]], i64 4)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x i1> [[TMP3]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x i1> [[TMP3]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst3q_vnum_f64u10__SVBool_tPKdl13svfloat64x3_t(
@@ -1093,8 +1234,11 @@ void test_svst3q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfl
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT]], i64 2)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv6f64(<vscale x 6 x double> [[ZT]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x i1> [[TMP3]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x i1> [[TMP3]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst3q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum, svfloat64x3_t zt)
@@ -1405,8 +1549,11 @@ void test_svst4q_f64(svbool_t pg, const float64_t *base, svfloat64x4_t zt)
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 16)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 32)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 48)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst4q_vnum_u8u10__SVBool_tPKhl11svuint8x4_t(
@@ -1415,8 +1562,11 @@ void test_svst4q_f64(svbool_t pg, const float64_t *base, svfloat64x4_t zt)
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 32)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 48)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst4q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8x4_t zt)
@@ -1430,8 +1580,11 @@ void test_svst4q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 16)
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 32)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 48)
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP4]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP6]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst4q_vnum_s8u10__SVBool_tPKal10svint8x4_t(
@@ -1440,8 +1593,11 @@ void test_svst4q_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum, svuint8
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 32)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZT]], i64 48)
-// CPP-CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP4]])
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP5]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i1> [[PG:%.*]], ptr [[TMP6]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst4q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x4_t zt)
@@ -1456,8 +1612,11 @@ void test_svst4q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x4
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 24)
 // CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP5]])
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP7]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_u16u10__SVBool_tPKtl12svuint16x4_t(
@@ -1467,8 +1626,11 @@ void test_svst4q_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum, svint8x4
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 24)
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP5]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst4q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuint16x4_t zt)
@@ -1483,8 +1645,11 @@ void test_svst4q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuin
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 24)
 // CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP5]])
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP7]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_s16u10__SVBool_tPKsl11svint16x4_t(
@@ -1494,8 +1659,11 @@ void test_svst4q_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum, svuin
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZT]], i64 24)
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 8 x i16>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP5]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst4q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint16x4_t zt)
@@ -1510,8 +1678,11 @@ void test_svst4q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint1
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 12)
 // CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP4]], ptr [[TMP5]])
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP4]], ptr [[TMP7]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_u32u10__SVBool_tPKjl12svuint32x4_t(
@@ -1521,8 +1692,11 @@ void test_svst4q_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum, svint1
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 12)
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP4]], ptr [[TMP5]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP4]], ptr [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst4q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuint32x4_t zt)
@@ -1537,8 +1711,11 @@ void test_svst4q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuin
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 12)
 // CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP4]], ptr [[TMP5]])
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP4]], ptr [[TMP7]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_s32u10__SVBool_tPKil11svint32x4_t(
@@ -1548,8 +1725,11 @@ void test_svst4q_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum, svuin
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZT]], i64 12)
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP4]], ptr [[TMP5]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i1> [[TMP4]], ptr [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst4q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint32x4_t zt)
@@ -1564,8 +1744,11 @@ void test_svst4q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint3
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 4)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 6)
 // CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP4]], ptr [[TMP5]])
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP4]], ptr [[TMP7]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_u64u10__SVBool_tPKml12svuint64x4_t(
@@ -1575,8 +1758,11 @@ void test_svst4q_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum, svint3
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 6)
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP4]], ptr [[TMP5]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP4]], ptr [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst4q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuint64x4_t zt)
@@ -1591,8 +1777,11 @@ void test_svst4q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuin
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 4)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 6)
 // CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP4]], ptr [[TMP5]])
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP4]], ptr [[TMP7]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_s64u10__SVBool_tPKll11svint64x4_t(
@@ -1602,8 +1791,11 @@ void test_svst4q_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum, svuin
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[ZT]], i64 6)
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP4]], ptr [[TMP5]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i1> [[TMP4]], ptr [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst4q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint64x4_t zt)
@@ -1618,8 +1810,11 @@ void test_svst4q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint6
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 24)
 // CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP5]])
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP7]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_f16u10__SVBool_tPKDhl13svfloat16x4_t(
@@ -1629,8 +1824,11 @@ void test_svst4q_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum, svint6
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZT]], i64 24)
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 8 x half>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP5]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst4q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfloat16x4_t zt)
@@ -1645,8 +1843,11 @@ void test_svst4q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfl
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 16)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 24)
 // CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP5]])
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP7]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z21test_svst4q_vnum_bf16u10__SVBool_tPKu6__bf16l14svbfloat16x4_t(
@@ -1656,8 +1857,11 @@ void test_svst4q_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum, svfl
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 16)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZT]], i64 24)
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 8 x bfloat>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP5]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x i1> [[TMP4]], ptr [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst4q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, svbfloat16x4_t zt)
@@ -1672,8 +1876,11 @@ void test_svst4q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, sv
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 8)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 12)
 // CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x i1> [[TMP4]], ptr [[TMP5]])
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x i1> [[TMP4]], ptr [[TMP7]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_f32u10__SVBool_tPKfl13svfloat32x4_t(
@@ -1683,8 +1890,11 @@ void test_svst4q_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum, sv
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 8)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZT]], i64 12)
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 4 x float>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x i1> [[TMP4]], ptr [[TMP5]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x i1> [[TMP4]], ptr [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst4q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfloat32x4_t zt)
@@ -1699,8 +1909,11 @@ void test_svst4q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfl
 // CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 4)
 // CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 6)
 // CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x i1> [[TMP4]], ptr [[TMP5]])
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x i1> [[TMP4]], ptr [[TMP7]])
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst4q_vnum_f64u10__SVBool_tPKdl13svfloat64x4_t(
@@ -1710,8 +1923,11 @@ void test_svst4q_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum, svfl
 // CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 4)
 // CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZT]], i64 6)
 // CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    [[TMP5:%.*]] = getelementptr <vscale x 2 x double>, ptr [[BASE:%.*]], i64 [[VNUM:%.*]]
-// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x i1> [[TMP4]], ptr [[TMP5]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4
+// CPP-CHECK-NEXT:    [[DOTIDX:%.*]] = mul i64 [[TMP6]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x i1> [[TMP4]], ptr [[TMP7]])
 // CPP-CHECK-NEXT:    ret void
 //
 void test_svst4q_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum, svfloat64x4_t zt)
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index f50eaf371028..0f2c5b2546fa 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -426,11 +426,12 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) {
 // CHECK-NEXT:    [[__REINT_I:%.*]] = alloca bfloat, align 2
 // CHECK-NEXT:    [[__REINT1_I:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store bfloat [[A:%.*]], ptr [[__REINT_I]], align 2
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__REINT_I]], align 2
-// CHECK-NEXT:    [[SHL_I:%.*]] = shl i32 [[TMP1]], 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[__REINT_I]], align 2
+// CHECK-NEXT:    [[CONV_I:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK-NEXT:    [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16
 // CHECK-NEXT:    store i32 [[SHL_I]], ptr [[__REINT1_I]], align 4
-// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[__REINT1_I]], align 4
-// CHECK-NEXT:    ret float [[TMP3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[__REINT1_I]], align 4
+// CHECK-NEXT:    ret float [[TMP1]]
 //
 float32_t test_vcvtah_f32_bf16(bfloat16_t a) {
   return vcvtah_f32_bf16(a);
diff --git a/clang/test/CodeGen/arm64-microsoft-arguments.cpp b/clang/test/CodeGen/arm64-microsoft-arguments.cpp
index e8309888dcfe..85472645acb3 100644
--- a/clang/test/CodeGen/arm64-microsoft-arguments.cpp
+++ b/clang/test/CodeGen/arm64-microsoft-arguments.cpp
@@ -201,3 +201,18 @@ S11 f11() {
   S11 x;
   return func11(x);
 }
+
+// GH86384
+// Pass and return object with template constructor (pass directly,
+// return indirectly).
+// CHECK: define dso_local void @"?f12@@YA?AUS12@@XZ"(ptr dead_on_unwind inreg noalias writable sret(%struct.S12) align 4 {{.*}})
+// CHECK: call void @"?func12@@YA?AUS12@@U1@@Z"(ptr dead_on_unwind inreg writable sret(%struct.S12) align 4 {{.*}}, i64 {{.*}})
+struct S12 {
+  template<typename T> S12(T*) {}
+  int x;
+};
+S12 func12(S12 x);
+S12 f12() {
+  S12 x((int*)0);
+  return func12(x);
+}
diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c
index 1fb39f9a3466..de30a00138ac 100644
--- a/clang/test/CodeGen/attr-counted-by.c
+++ b/clang/test/CodeGen/attr-counted-by.c
@@ -66,7 +66,7 @@ struct anon_struct {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10:[0-9]+]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10:[0-9]+]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont3:
 // SANITIZE-WITH-ATTR-NEXT:    [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12
@@ -114,7 +114,7 @@ void test1(struct annotated *p, int index, int val) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], [[INDEX]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont3:
 // SANITIZE-WITH-ATTR-NEXT:    [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12
@@ -158,7 +158,7 @@ void test2(struct annotated *p, size_t index) {
   p->array[index] = __builtin_dynamic_object_size(p->array, 1);
 }
 
-// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test2_bdos(
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos(
 // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
 // SANITIZE-WITH-ATTR-NEXT:  entry:
 // SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -169,7 +169,7 @@ void test2(struct annotated *p, size_t index) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0
 // SANITIZE-WITH-ATTR-NEXT:    ret i64 [[TMP3]]
 //
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test2_bdos(
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos(
 // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
 // NO-SANITIZE-WITH-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -203,7 +203,7 @@ size_t test2_bdos(struct annotated *p) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], [[INDEX]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont3:
 // SANITIZE-WITH-ATTR-NEXT:    [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12
@@ -257,7 +257,7 @@ void test3(struct annotated *p, size_t index) {
   p->array[index] = __builtin_dynamic_object_size(p, 1);
 }
 
-// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos(
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 8589934601) i64 @test3_bdos(
 // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // SANITIZE-WITH-ATTR-NEXT:  entry:
 // SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -270,7 +270,7 @@ void test3(struct annotated *p, size_t index) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 0
 // SANITIZE-WITH-ATTR-NEXT:    ret i64 [[TMP5]]
 //
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos(
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 8589934601) i64 @test3_bdos(
 // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // NO-SANITIZE-WITH-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -308,7 +308,7 @@ size_t test3_bdos(struct annotated *p) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP1]], label [[CONT4:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont4:
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], 2
@@ -325,7 +325,7 @@ size_t test3_bdos(struct annotated *p) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[IDXPROM13]], [[TMP6]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP7]], label [[CONT20:%.*]], label [[HANDLER_OUT_OF_BOUNDS16:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds16:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 [[IDXPROM13]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM13]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont20:
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD7]], 3
@@ -342,7 +342,7 @@ size_t test3_bdos(struct annotated *p) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP13:%.*]] = icmp ult i64 [[IDXPROM30]], [[TMP12]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP13]], label [[CONT37:%.*]], label [[HANDLER_OUT_OF_BOUNDS33:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds33:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM30]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 [[IDXPROM30]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont37:
 // SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM30]]
@@ -441,7 +441,7 @@ void test4(struct annotated *p, int index, int fam_idx) {
   p->array[index + 2] = (unsigned char)__builtin_dynamic_object_size(&(p->array[fam_idx]), 1);
 }
 
-// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test4_bdos(
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869180, 17179869181) i64 @test4_bdos(
 // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // SANITIZE-WITH-ATTR-NEXT:  entry:
 // SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -456,7 +456,7 @@ void test4(struct annotated *p, int index, int fam_idx) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], i64 [[TMP3]], i64 0
 // SANITIZE-WITH-ATTR-NEXT:    ret i64 [[TMP7]]
 //
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test4_bdos(
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869180, 17179869181) i64 @test4_bdos(
 // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // NO-SANITIZE-WITH-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -494,7 +494,7 @@ size_t test4_bdos(struct annotated *p, int index) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[DOT_COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont3:
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
@@ -545,7 +545,7 @@ void test5(struct anon_struct *p, int index) {
   p->array[index] = __builtin_dynamic_object_size(p, 1);
 }
 
-// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test5_bdos(
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 16, 1) i64 @test5_bdos(
 // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // SANITIZE-WITH-ATTR-NEXT:  entry:
 // SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -556,7 +556,7 @@ void test5(struct anon_struct *p, int index) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = select i1 [[DOTINV]], i64 0, i64 [[TMP1]]
 // SANITIZE-WITH-ATTR-NEXT:    ret i64 [[TMP2]]
 //
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test5_bdos(
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 16, 1) i64 @test5_bdos(
 // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // NO-SANITIZE-WITH-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -590,7 +590,7 @@ size_t test5_bdos(struct anon_struct *p) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[DOT_COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB10:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont3:
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
@@ -683,7 +683,7 @@ size_t test6_bdos(struct anon_struct *p) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP2]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont7:
 // SANITIZE-WITH-ATTR-NEXT:    [[INTS:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 9
@@ -756,7 +756,7 @@ size_t test7_bdos(struct union_of_fams *p) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP1]], label [[CONT9:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB13:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont9:
 // SANITIZE-WITH-ATTR-NEXT:    [[INTS:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 9
@@ -797,7 +797,7 @@ void test8(struct union_of_fams *p, int index) {
   p->ints[index] = __builtin_dynamic_object_size(p->ints, 1);
 }
 
-// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test8_bdos(
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 256) i64 @test8_bdos(
 // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // SANITIZE-WITH-ATTR-NEXT:  entry:
 // SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -805,7 +805,7 @@ void test8(struct union_of_fams *p, int index) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = zext i8 [[DOT_COUNTED_BY_LOAD]] to i64
 // SANITIZE-WITH-ATTR-NEXT:    ret i64 [[TMP0]]
 //
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test8_bdos(
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 256) i64 @test8_bdos(
 // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // NO-SANITIZE-WITH-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -955,7 +955,7 @@ void test10(struct union_of_fams *p, int index) {
   p->bytes[index] = (unsigned char)__builtin_dynamic_object_size(p->bytes, 1);
 }
 
-// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test10_bdos(
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -2147483648, 2147483648) i64 @test10_bdos(
 // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // SANITIZE-WITH-ATTR-NEXT:  entry:
 // SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -964,7 +964,7 @@ void test10(struct union_of_fams *p, int index) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[NARROW]] to i64
 // SANITIZE-WITH-ATTR-NEXT:    ret i64 [[TMP0]]
 //
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test10_bdos(
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -2147483648, 2147483648) i64 @test10_bdos(
 // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // NO-SANITIZE-WITH-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -1095,10 +1095,10 @@ int test12_a, test12_b;
 // SANITIZE-WITH-ATTR-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[DOTCOUNTED_BY_LOAD]], 0
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS4:%.*]], label [[HANDLER_TYPE_MISMATCH6:%.*]], !prof [[PROF10:![0-9]+]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds4:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB19:[0-9]+]], i64 0) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 0) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.type_mismatch6:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds ([[STRUCT_ANON_5:%.*]], ptr @test12_foo, i64 1, i32 0, i32 0, i32 0) to i64)) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB21:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds ([[STRUCT_ANON_5:%.*]], ptr @test12_foo, i64 1, i32 0, i32 0, i32 0) to i64)) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 //
 // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test12(
@@ -1188,7 +1188,7 @@ struct test13_bar {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], [[INDEX]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP2]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB23:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont5:
 // SANITIZE-WITH-ATTR-NEXT:    [[REVMAP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
@@ -1249,7 +1249,7 @@ struct test14_foo {
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP0]], label [[TRAP:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
 // SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB25:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       trap:
 // SANITIZE-WITH-ATTR-NEXT:    tail call void @llvm.trap() #[[ATTR10]]
@@ -1305,7 +1305,7 @@ int test14(int idx) {
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP0]], label [[TRAP:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
 // SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB25:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB27:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       trap:
 // SANITIZE-WITH-ATTR-NEXT:    tail call void @llvm.trap() #[[ATTR10]]
@@ -1326,7 +1326,7 @@ int test14(int idx) {
 // SANITIZE-WITHOUT-ATTR-NEXT:    br i1 [[TMP0]], label [[TRAP:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]]
 // SANITIZE-WITHOUT-ATTR:       handler.out_of_bounds:
 // SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// SANITIZE-WITHOUT-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB10:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META9]]
 // SANITIZE-WITHOUT-ATTR-NEXT:    unreachable, !nosanitize [[META9]]
 // SANITIZE-WITHOUT-ATTR:       trap:
 // SANITIZE-WITHOUT-ATTR-NEXT:    tail call void @llvm.trap() #[[ATTR8]]
@@ -1487,7 +1487,7 @@ struct tests_foo {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP0]], label [[CONT4:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB26:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont4:
 // SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[VAR]], i64 84
@@ -1528,7 +1528,7 @@ int test24(int c, struct tests_foo *var) {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP1]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB27:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB29:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont5:
 // SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 44
@@ -1580,7 +1580,7 @@ struct test26_foo {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP1]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB30:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont5:
 // SANITIZE-WITH-ATTR-NEXT:    [[ARR:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 8
@@ -1651,7 +1651,7 @@ struct test27_foo {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB30:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB32:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont3:
 // SANITIZE-WITH-ATTR-NEXT:    [[ENTRIES:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 24
@@ -1717,7 +1717,7 @@ struct test28_foo {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP4]], label [[CONT17:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB31:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont17:
 // SANITIZE-WITH-ATTR-NEXT:    [[ARR:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 12
@@ -1779,7 +1779,7 @@ struct annotated_struct_array {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = zext i32 [[IDX1]] to i64
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB33:[0-9]+]], i64 [[TMP1]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB36:[0-9]+]], i64 [[TMP1]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont3:
 // SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[TMP1]]
@@ -1791,7 +1791,7 @@ struct annotated_struct_array {
 // SANITIZE-WITH-ATTR-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[IDXPROM15]], [[TMP3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    br i1 [[TMP4]], label [[CONT20:%.*]], label [[HANDLER_OUT_OF_BOUNDS16:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.out_of_bounds16:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 [[IDXPROM15]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB37:[0-9]+]], i64 [[IDXPROM15]]) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       cont20:
 // SANITIZE-WITH-ATTR-NEXT:    [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 12
@@ -1826,7 +1826,7 @@ struct annotated_struct_array {
 // SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP1:%.*]] = zext i32 [[IDX1]] to i64
 // SANITIZE-WITHOUT-ATTR-NEXT:    br i1 [[TMP0]], label [[CONT21:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]]
 // SANITIZE-WITHOUT-ATTR:       handler.out_of_bounds:
-// SANITIZE-WITHOUT-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB13:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META9]]
 // SANITIZE-WITHOUT-ATTR-NEXT:    unreachable, !nosanitize [[META9]]
 // SANITIZE-WITHOUT-ATTR:       cont21:
 // SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[TMP1]]
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index 34f39cea5265..acafe9222d59 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -1,5 +1,8 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -O1 -triple aarch64 -target-feature +sve  %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=SVE   %s
+
 typedef float float4 __attribute__((ext_vector_type(4)));
 typedef short int si8 __attribute__((ext_vector_type(8)));
 typedef unsigned int u4 __attribute__((ext_vector_type(4)));
@@ -134,3 +137,53 @@ void test_builtin_reduce_and(si8 vi1, u4 vu1) {
   // CHECK-NEXT: call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[VU1]])
   unsigned r3 = __builtin_reduce_and(vu1);
 }
+
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+
+void test_builtin_reduce_SVE(int a, unsigned long long b, short c, float d) {
+  // SVE-LABEL: void @test_builtin_reduce_SVE(
+
+  svint32_t vec_a = svdup_s32(a);
+  svuint64_t vec_b = svdup_u64(b);
+  svint16_t vec_c1 = svdup_s16(c);
+  svuint16_t vec_c2 = svdup_u16(c);
+  svfloat32_t vec_d = svdup_f32(d);
+
+  // SVE:      [[VF1:%.+]] = load <vscale x 4 x i32>, ptr %vec_a
+  // SVE-NEXT: call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[VF1]])
+  int r1 = __builtin_reduce_add(vec_a);
+
+  // SVE:      [[VF2:%.+]] = load <vscale x 4 x i32>, ptr %vec_a
+  // SVE-NEXT: call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> [[VF2]])
+  int r2 = __builtin_reduce_mul(vec_a);
+
+  // SVE:      [[VF3:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
+  // SVE-NEXT: call i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64> [[VF3]])
+  long long r3 = __builtin_reduce_xor(vec_b);
+
+  // SVE:      [[VF4:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
+  // SVE-NEXT: call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[VF4]])
+  long long r4 = __builtin_reduce_or(vec_b);
+
+  // SVE:      [[VF5:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
+  // SVE-NEXT: call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[VF5]])
+  long long r5 = __builtin_reduce_and(vec_b);
+
+  // SVE:      [[VF6:%.+]] = load <vscale x 8 x i16>, ptr %vec_c1
+  // SVE-NEXT: call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> [[VF6]])
+  short r6 = __builtin_reduce_max(vec_c1);
+
+  // SVE:      [[VF7:%.+]] = load <vscale x 8 x i16>, ptr %vec_c2
+  // SVE-NEXT: call i16 @llvm.vector.reduce.umin.nxv8i16(<vscale x 8 x i16> [[VF7]])
+  unsigned short r7 = __builtin_reduce_min(vec_c2);
+
+  // SVE:      [[VF8:%.+]] = load <vscale x 4 x float>, ptr %vec_d
+  // SVE-NEXT: call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[VF8]])
+  float r8 = __builtin_reduce_max(vec_d);
+
+  // SVE:      [[VF9:%.+]] = load <vscale x 4 x float>, ptr %vec_d
+  // SVE-NEXT: call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[VF9]])
+  float r9 = __builtin_reduce_min(vec_d);
+}
+#endif
diff --git a/clang/test/CodeGen/ms-mixed-ptr-sizes.c b/clang/test/CodeGen/ms-mixed-ptr-sizes.c
index 89d05fd30b72..51bea60eb39d 100644
--- a/clang/test/CodeGen/ms-mixed-ptr-sizes.c
+++ b/clang/test/CodeGen/ms-mixed-ptr-sizes.c
@@ -49,7 +49,7 @@ void test_other(struct Foo *f, __attribute__((address_space(10))) int *i) {
 }
 
 int test_compare1(int *__ptr32 __uptr i, int *__ptr64 j) {
-  // ALL-LABEL: define dso_local noundef i32 @test_compare1
+  // ALL-LABEL: define dso_local range(i32 0, 2) i32 @test_compare1
   // X64: %{{.+}} = addrspacecast ptr %j to ptr addrspace(271)
   // X64: %cmp = icmp eq ptr addrspace(271) %{{.+}}, %i
   // X86: %{{.+}} = addrspacecast ptr addrspace(272) %j to ptr addrspace(271)
@@ -58,7 +58,7 @@ int test_compare1(int *__ptr32 __uptr i, int *__ptr64 j) {
 }
 
 int test_compare2(int *__ptr32 __sptr i, int *__ptr64 j) {
-  // ALL-LABEL: define dso_local noundef i32 @test_compare2
+  // ALL-LABEL: define dso_local range(i32 0, 2) i32 @test_compare2
   // X64: %{{.+}} = addrspacecast ptr %j to ptr addrspace(270)
   // X64: %cmp = icmp eq ptr addrspace(270) %{{.+}}, %i
   // X86: %{{.+}} = addrspacecast ptr addrspace(272) %j to ptr
@@ -67,7 +67,7 @@ int test_compare2(int *__ptr32 __sptr i, int *__ptr64 j) {
 }
 
 int test_compare3(int *__ptr32 __uptr i, int *__ptr64 j) {
-  // ALL-LABEL: define dso_local noundef i32 @test_compare3
+  // ALL-LABEL: define dso_local range(i32 0, 2) i32 @test_compare3
   // X64: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr
   // X64: %cmp = icmp eq ptr %{{.+}}, %j
   // X86: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr addrspace(272)
@@ -76,7 +76,7 @@ int test_compare3(int *__ptr32 __uptr i, int *__ptr64 j) {
 }
 
 int test_compare4(int *__ptr32 __sptr i, int *__ptr64 j) {
-  // ALL-LABEL: define dso_local noundef i32 @test_compare4
+  // ALL-LABEL: define dso_local range(i32 0, 2) i32 @test_compare4
   // X64: %{{.+}} = addrspacecast ptr addrspace(270) %i to ptr
   // X64: %cmp = icmp eq ptr %{{.+}}, %j
   // X86: %{{.+}} = addrspacecast ptr %i to ptr addrspace(272)
diff --git a/clang/test/CodeGen/regparm-flag.c b/clang/test/CodeGen/regparm-flag.c
index c35b53cd4e19..d888c1e344c0 100644
--- a/clang/test/CodeGen/regparm-flag.c
+++ b/clang/test/CodeGen/regparm-flag.c
@@ -1,4 +1,8 @@
 // RUN: %clang_cc1 -triple i386-unknown-unknown -mregparm 4 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple i386-unknown-unknown -fsanitize=array-bounds %s -emit-llvm -o - | FileCheck %s --check-prefix=RUNTIME0
+// RUN: %clang_cc1 -triple i386-unknown-unknown -mregparm 1 -fsanitize=array-bounds %s -emit-llvm -o - | FileCheck %s --check-prefix=RUNTIME1
+// RUN: %clang_cc1 -triple i386-unknown-unknown -mregparm 2 -fsanitize=array-bounds %s -emit-llvm -o - | FileCheck %s --check-prefix=RUNTIME2
+// RUN: %clang_cc1 -triple i386-unknown-unknown -mregparm 3 -fsanitize=array-bounds %s -emit-llvm -o - | FileCheck %s --check-prefix=RUNTIME2
 
 void f1(int a, int b, int c, int d,
         int e, int f, int g, int h);
@@ -13,7 +17,21 @@ void f0(void) {
   f2(1, 2);
 }
 
+struct has_array {
+  int a;
+  int b[4];
+  int c;
+};
+
+int access(struct has_array *p, int index)
+{
+  return p->b[index];
+}
+
 // CHECK: declare void @f1(i32 inreg noundef, i32 inreg noundef, i32 inreg noundef, i32 inreg noundef,
 // CHECK: i32 noundef, i32 noundef, i32 noundef, i32 noundef)
 // CHECK: declare void @f2(i32 noundef, i32 noundef)
 
+// RUNTIME0: declare void @__ubsan_handle_out_of_bounds_abort(ptr, i32)
+// RUNTIME1: declare void @__ubsan_handle_out_of_bounds_abort(ptr inreg, i32)
+// RUNTIME2: declare void @__ubsan_handle_out_of_bounds_abort(ptr inreg, i32 inreg)
diff --git a/clang/test/CodeGenCUDA/kernel-stub-name.cu b/clang/test/CodeGenCUDA/kernel-stub-name.cu
index 23df7f5d721b..0faea75cbbe5 100644
--- a/clang/test/CodeGenCUDA/kernel-stub-name.cu
+++ b/clang/test/CodeGenCUDA/kernel-stub-name.cu
@@ -2,7 +2,7 @@
 
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o - -x hip\
-// RUN:   | FileCheck -check-prefixes=CHECK,GNU %s
+// RUN:   | FileCheck -check-prefixes=CHECK,GNU,GNU-HIP,HIP %s
 
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o - -x hip\
@@ -11,7 +11,12 @@
 // RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -emit-llvm %s \
 // RUN:     -aux-triple amdgcn-amd-amdhsa -fcuda-include-gpubinary \
 // RUN:     %t -o - -x hip\
-// RUN:   | FileCheck -check-prefixes=CHECK,MSVC %s
+// RUN:   | FileCheck -check-prefixes=CHECK,MSVC,MSVC-HIP,HIP %s
+
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -emit-llvm %s \
+// RUN:     -aux-triple nvptx64 -fcuda-include-gpubinary \
+// RUN:     %t -target-sdk-version=9.2 -o - \
+// RUN:   | FileCheck -check-prefixes=CHECK,MSVC,CUDA %s
 
 // RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -emit-llvm %s \
 // RUN:     -aux-triple amdgcn-amd-amdhsa -fcuda-include-gpubinary \
@@ -22,19 +27,23 @@
 
 // Check kernel handles are emitted for non-MSVC target but not for MSVC target.
 
-// GNU: @[[HCKERN:ckernel]] = constant ptr @[[CSTUB:__device_stub__ckernel]], align 8
-// GNU: @[[HNSKERN:_ZN2ns8nskernelEv]] = constant ptr @[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]], align 8
-// GNU: @[[HTKERN:_Z10kernelfuncIiEvv]] = linkonce_odr constant ptr @[[TSTUB:_Z25__device_stub__kernelfuncIiEvv]], comdat, align 8
-// GNU: @[[HDKERN:_Z11kernel_declv]] = external constant ptr, align 8
-// GNU: @[[HTDKERN:_Z20template_kernel_declIiEvT_]] = external constant ptr, align 8
-
-// MSVC: @[[HCKERN:ckernel]] = dso_local constant ptr @[[CSTUB:__device_stub__ckernel]], align 8
-// MSVC: @[[HNSKERN:"\?nskernel@ns@@YAXXZ.*"]] = dso_local constant ptr @[[NSSTUB:"\?__device_stub__nskernel@ns@@YAXXZ"]], align 8
-// MSVC: @[[HTKERN:"\?\?\$kernelfunc@H@@YAXXZ.*"]] = linkonce_odr dso_local constant ptr @[[TSTUB:"\?\?\$__device_stub__kernelfunc@H@@YAXXZ.*"]], comdat, align 8
-// MSVC: @[[HDKERN:"\?kernel_decl@@YAXXZ.*"]] = external dso_local constant ptr, align 8
-// MSVC: @[[HTDKERN:"\?\?\$template_kernel_decl@H@@YAXH.*"]] = external dso_local constant ptr, align 8
+// GNU-HIP: @[[HCKERN:ckernel]] = constant ptr @[[CSTUB:__device_stub__ckernel]], align 8
+// GNU-HIP: @[[HNSKERN:_ZN2ns8nskernelEv]] = constant ptr @[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]], align 8
+// GNU-HIP: @[[HTKERN:_Z10kernelfuncIiEvv]] = linkonce_odr constant ptr @[[TSTUB:_Z25__device_stub__kernelfuncIiEvv]], comdat, align 8
+// GNU-HIP: @[[HDKERN:_Z11kernel_declv]] = external constant ptr, align 8
+// GNU-HIP: @[[HTDKERN:_Z20template_kernel_declIiEvT_]] = external constant ptr, align 8
+
+// MSVC-HIP: @[[HCKERN:ckernel]] = dso_local constant ptr @[[CSTUB:__device_stub__ckernel]], align 8
+// MSVC-HIP: @[[HNSKERN:"\?nskernel@ns@@YAXXZ.*"]] = dso_local constant ptr @[[NSSTUB:"\?__device_stub__nskernel@ns@@YAXXZ"]], align 8
+// MSVC-HIP: @[[HTKERN:"\?\?\$kernelfunc@H@@YAXXZ.*"]] = linkonce_odr dso_local constant ptr @[[TSTUB:"\?\?\$__device_stub__kernelfunc@H@@YAXXZ.*"]], comdat, align 8
+// MSVC-HIP: @[[HDKERN:"\?kernel_decl@@YAXXZ.*"]] = external dso_local constant ptr, align 8
+// MSVC-HIP: @[[HTDKERN:"\?\?\$template_kernel_decl@H@@YAXH.*"]] = external dso_local constant ptr, align 8
 extern "C" __global__ void ckernel() {}
 
+// CUDA: @[[HCKERN:__device_stub__ckernel\.id]] = dso_local global i8 0
+// CUDA: @[[HNSKERN:"\?__device_stub__nskernel@ns@@YAXXZ\.id"]] = dso_local global i8 0
+// CUDA: @[[HTKERN:"\?\?\$__device_stub__kernelfunc@H@@YAXXZ\.id"]] = linkonce_odr dso_local global i8 0, comdat
+
 namespace ns {
 __global__ void nskernel() {}
 } // namespace ns
@@ -60,18 +69,27 @@ extern "C" void launch(void *kern);
 
 // Non-template kernel stub functions
 
-// CHECK: define{{.*}}@[[CSTUB]]
-// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HCKERN]]
+// HIP: define{{.*}}@[[CSTUB]]
+// CUDA: define{{.*}}@[[CSTUB:__device_stub__ckernel]]
+// HIP: call{{.*}}@hipLaunchByPtr{{.*}}@[[HCKERN]]
+// CUDA: call{{.*}}@cudaLaunch{{.*}}@[[CSTUB]]
+// CUDA: store volatile i8 1, ptr @[[HCKERN]], align 1
+// CHECK: ret void
 
-// CHECK: define{{.*}}@[[NSSTUB]]
-// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HNSKERN]]
+// HIP: define{{.*}}@[[NSSTUB]]
+// CUDA: define{{.*}}@[[NSSTUB:"\?__device_stub__nskernel@ns@@YAXXZ"]]
+// HIP: call{{.*}}@hipLaunchByPtr{{.*}}@[[HNSKERN]]
+// CUDA: call{{.*}}@cudaLaunch{{.*}}@[[NSSTUB]]
+// CUDA: store volatile i8 1, ptr @[[HNSKERN]], align 1
+// CHECK: ret void
 
 // Check kernel stub is called for triple chevron.
 
 // CHECK-LABEL: define{{.*}}@fun1()
 // CHECK: call void @[[CSTUB]]()
 // CHECK: call void @[[NSSTUB]]()
-// CHECK: call void @[[TSTUB]]()
+// HIP: call void @[[TSTUB]]()
+// CUDA: call void @[[TSTUB:"\?\?\$__device_stub__kernelfunc@H@@YAXXZ.*"]]()
 // GNU: call void @[[DSTUB:_Z26__device_stub__kernel_declv]]()
 // GNU: call void @[[TDSTUB:_Z35__device_stub__template_kernel_declIiEvT_]](
 // MSVC: call void @[[DSTUB:"\?__device_stub__kernel_decl@@YAXXZ"]]()
@@ -88,7 +106,10 @@ extern "C" void fun1(void) {
 // Template kernel stub functions
 
 // CHECK: define{{.*}}@[[TSTUB]]
-// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HTKERN]]
+// HIP: call{{.*}}@hipLaunchByPtr{{.*}}@[[HTKERN]]
+// CUDA: call{{.*}}@cudaLaunch{{.*}}@[[TSTUB]]
+// CUDA: store volatile i8 1, ptr @[[HTKERN]], align 1
+// CHECK: ret void
 
 // Check declaration of stub function for external kernel.
 
@@ -98,11 +119,11 @@ extern "C" void fun1(void) {
 // Check kernel handle is used for passing the kernel as a function pointer.
 
 // CHECK-LABEL: define{{.*}}@fun2()
-// CHECK: call void @launch({{.*}}[[HCKERN]]
-// CHECK: call void @launch({{.*}}[[HNSKERN]]
-// CHECK: call void @launch({{.*}}[[HTKERN]]
-// CHECK: call void @launch({{.*}}[[HDKERN]]
-// CHECK: call void @launch({{.*}}[[HTDKERN]]
+// HIP: call void @launch({{.*}}[[HCKERN]]
+// HIP: call void @launch({{.*}}[[HNSKERN]]
+// HIP: call void @launch({{.*}}[[HTKERN]]
+// HIP: call void @launch({{.*}}[[HDKERN]]
+// HIP: call void @launch({{.*}}[[HTDKERN]]
 extern "C" void fun2() {
   launch((void *)ckernel);
   launch((void *)ns::nskernel);
@@ -114,10 +135,10 @@ extern "C" void fun2() {
 // Check kernel handle is used for assigning a kernel to a function pointer.
 
 // CHECK-LABEL: define{{.*}}@fun3()
-// CHECK:  store ptr @[[HCKERN]], ptr @kernel_ptr, align 8
-// CHECK:  store ptr @[[HCKERN]], ptr @kernel_ptr, align 8
-// CHECK:  store ptr @[[HCKERN]], ptr @void_ptr, align 8
-// CHECK:  store ptr @[[HCKERN]], ptr @void_ptr, align 8
+// HIP:  store ptr @[[HCKERN]], ptr @kernel_ptr, align 8
+// HIP:  store ptr @[[HCKERN]], ptr @kernel_ptr, align 8
+// HIP:  store ptr @[[HCKERN]], ptr @void_ptr, align 8
+// HIP:  store ptr @[[HCKERN]], ptr @void_ptr, align 8
 extern "C" void fun3() {
   kernel_ptr = ckernel;
   kernel_ptr = &ckernel;
@@ -129,11 +150,11 @@ extern "C" void fun3() {
 // used with triple chevron.
 
 // CHECK-LABEL: define{{.*}}@fun4()
-// CHECK:  store ptr @[[HCKERN]], ptr @kernel_ptr
-// CHECK:  call noundef i32 @{{.*hipConfigureCall}}
-// CHECK:  %[[HANDLE:.*]] = load ptr, ptr @kernel_ptr, align 8
-// CHECK:  %[[STUB:.*]] = load ptr, ptr %[[HANDLE]], align 8
-// CHECK:  call void %[[STUB]]()
+// HIP:  store ptr @[[HCKERN]], ptr @kernel_ptr
+// HIP:  call noundef i32 @{{.*hipConfigureCall}}
+// HIP:  %[[HANDLE:.*]] = load ptr, ptr @kernel_ptr, align 8
+// HIP:  %[[STUB:.*]] = load ptr, ptr %[[HANDLE]], align 8
+// HIP:  call void %[[STUB]]()
 extern "C" void fun4() {
   kernel_ptr = ckernel;
   kernel_ptr<<<1,1>>>();
@@ -142,9 +163,9 @@ extern "C" void fun4() {
 // Check kernel handle is passed to a function.
 
 // CHECK-LABEL: define{{.*}}@fun5()
-// CHECK:  store ptr @[[HCKERN]], ptr @kernel_ptr
-// CHECK:  %[[HANDLE:.*]] = load ptr, ptr @kernel_ptr, align 8
-// CHECK:  call void @launch(ptr noundef %[[HANDLE]])
+// HIP:  store ptr @[[HCKERN]], ptr @kernel_ptr
+// HIP:  %[[HANDLE:.*]] = load ptr, ptr @kernel_ptr, align 8
+// HIP:  call void @launch(ptr noundef %[[HANDLE]])
 extern "C" void fun5() {
   kernel_ptr = ckernel;
   launch((void *)kernel_ptr);
@@ -152,10 +173,10 @@ extern "C" void fun5() {
 
 // Check kernel handle is registered.
 
-// CHECK-LABEL: define{{.*}}@__hip_register_globals
-// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HCKERN]]{{.*}}@[[CKERN]]
-// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HNSKERN]]{{.*}}@[[NSKERN]]
-// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HTKERN]]{{.*}}@[[TKERN]]
+// HIP-LABEL: define{{.*}}@__hip_register_globals
+// HIP: call{{.*}}@__hipRegisterFunction{{.*}}@[[HCKERN]]{{.*}}@[[CKERN]]
+// HIP: call{{.*}}@__hipRegisterFunction{{.*}}@[[HNSKERN]]{{.*}}@[[NSKERN]]
+// HIP: call{{.*}}@__hipRegisterFunction{{.*}}@[[HTKERN]]{{.*}}@[[TKERN]]
 // NEG-NOT: call{{.*}}@__hipRegisterFunction{{.*}}__device_stub
 // NEG-NOT: call{{.*}}@__hipRegisterFunction{{.*}}kernel_decl
 // NEG-NOT: call{{.*}}@__hipRegisterFunction{{.*}}template_kernel_decl
diff --git a/clang/test/CodeGenCXX/blocks.cpp b/clang/test/CodeGenCXX/blocks.cpp
index eaab1890dfc4..afe078890553 100644
--- a/clang/test/CodeGenCXX/blocks.cpp
+++ b/clang/test/CodeGenCXX/blocks.cpp
@@ -149,8 +149,8 @@ namespace test5 {
   // CHECK-NEXT: [[X:%.*]] = alloca [[A:%.*]], align 4
   // CHECK-NEXT: [[B:%.*]] = alloca ptr, align 8
   // CHECK-NEXT: [[BLOCK:%.*]] = alloca [[BLOCK_T:.*]], align 8
-  // CHECK-NEXT: [[CLEANUP_ACTIVE:%.*]] = alloca i1
   // CHECK-NEXT: [[COND_CLEANUP_SAVE:%.*]] = alloca ptr, align 8
+  // CHECK-NEXT: [[CLEANUP_ACTIVE:%.*]] = alloca i1
   // CHECK-NEXT: [[T0:%.*]] = zext i1
   // CHECK-NEXT: store i8 [[T0]], ptr [[COND]], align 1
   // CHECK-NEXT: call void @_ZN5test51AC1Ev(ptr {{[^,]*}} [[X]])
@@ -162,8 +162,8 @@ namespace test5 {
   // CHECK-NOT:  br
   // CHECK:      [[CAPTURE:%.*]] = getelementptr inbounds [[BLOCK_T]], ptr [[BLOCK]], i32 0, i32 5
   // CHECK-NEXT: call void @_ZN5test51AC1ERKS0_(ptr {{[^,]*}} [[CAPTURE]], ptr noundef nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) [[X]])
-  // CHECK-NEXT: store i1 true, ptr [[CLEANUP_ACTIVE]]
   // CHECK-NEXT: store ptr [[CAPTURE]], ptr [[COND_CLEANUP_SAVE]], align 8
+  // CHECK-NEXT: store i1 true, ptr [[CLEANUP_ACTIVE]]
   // CHECK-NEXT: br label
   // CHECK:      br label
   // CHECK:      phi
diff --git a/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp b/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp
new file mode 100644
index 000000000000..ac466ee5bba4
--- /dev/null
+++ b/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp
@@ -0,0 +1,522 @@
+// RUN: %clang_cc1 --std=c++20 -fexceptions -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefixes=EH %s
+// RUN: %clang_cc1 --std=c++20 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefixes=NOEH,CHECK %s
+
+struct Printy {
+  Printy(const char *name) : name(name) {}
+  ~Printy() {}
+  const char *name;
+};
+
+int foo() { return 2; }
+
+struct Printies {
+  Printy a;
+  Printy b;
+  Printy c;
+};
+
+void ParenInit() {
+  // CHECK-LABEL: define dso_local void @_Z9ParenInitv()
+  // CHECK: [[CLEANUP_DEST:%.+]] = alloca i32, align 4
+  Printies ps(Printy("a"), 
+              // CHECK: call void @_ZN6PrintyC1EPKc
+              ({
+                if (foo()) return;
+                // CHECK:     if.then:
+                // CHECK-NEXT:   store i32 1, ptr [[CLEANUP_DEST]], align 4
+                // CHECK-NEXT:   br label %cleanup
+                Printy("b");
+                // CHECK:     if.end:
+                // CHECK-NEXT:  call void @_ZN6PrintyC1EPKc
+              }),
+              ({
+                if (foo()) return;
+                // CHECK:     if.then{{.*}}:
+                // CHECK-NEXT:  store i32 1, ptr [[CLEANUP_DEST]], align 4
+                // CHECK-NEXT:  call void @_ZN6PrintyD1Ev
+                // CHECK-NEXT:  br label %cleanup
+                Printy("c");
+                // CHECK:     if.end{{.*}}:
+                // CHECK-NEXT:  call void @_ZN6PrintyC1EPKc
+                // CHECK-NEXT:  call void @_ZN8PrintiesD1Ev
+                // CHECK-NEXT:  br label %return
+              }));
+  // CHECK:     cleanup:
+  // CHECK-NEXT:  call void @_ZN6PrintyD1Ev
+  // CHECK-NEXT:  br label %return
+}
+
+void break_in_stmt_expr() {
+  // Verify that the "break" in "if.then".calls dtor before jumping to "for.end".
+
+  // CHECK-LABEL: define dso_local void @_Z18break_in_stmt_exprv()
+  Printies p{Printy("a"), 
+            // CHECK: call void @_ZN6PrintyC1EPKc
+            ({
+                for (;;) {
+                    Printies ps{
+                      Printy("b"), 
+                      // CHECK: for.cond:
+                      // CHECK:   call void @_ZN6PrintyC1EPKc
+                      ({
+                        if (foo()) {
+                          break;
+                          // CHECK:       if.then:
+                          // CHECK-NEXT:    call void @_ZN6PrintyD1Ev
+                          // CHECK-NEXT:    br label %for.end
+                        }
+                        Printy("c");
+                        // CHECK:       if.end:
+                        // CHECK-NEXT:    call void @_ZN6PrintyC1EPKc
+                      }),
+                      Printy("d")};
+                      // CHECK:           call void @_ZN6PrintyC1EPKc
+                      // CHECK-NEXT:      call void @_ZN8PrintiesD1Ev
+                      // CHECK-NEXT:      br label %for.cond
+                }
+                Printy("e");
+  // CHECK:       for.end:
+  // CHECK-NEXT:    call void @_ZN6PrintyC1EPKc
+              }),
+              Printy("f")};
+  // CHECK:         call void @_ZN6PrintyC1EPKc
+  // CHECK-NEXT:    call void @_ZN8PrintiesD1Ev
+}
+
+void goto_in_stmt_expr() {
+  // Verify that:
+  //  - correct branch fixups for deactivated normal cleanups are generated correctly.
+
+  // CHECK-LABEL: define dso_local void @_Z17goto_in_stmt_exprv()
+  // CHECK: [[CLEANUP_DEST_SLOT:%cleanup.dest.slot.*]] = alloca i32, align 4
+  {
+    Printies p1{Printy("a"), // CHECK: call void @_ZN6PrintyC1EPKc
+                ({
+                  {
+                    Printies p2{Printy("b"),
+                                // CHECK: call void @_ZN6PrintyC1EPKc
+                                ({
+                                  if (foo() == 1) {
+                                    goto in;
+                                    // CHECK:       if.then:
+                                    // CHECK-NEXT:    store i32 2, ptr [[CLEANUP_DEST_SLOT]], align 4
+                                    // CHECK-NEXT:    br label %[[CLEANUP1:.+]]
+                                  }
+                                  if (foo() == 2) {
+                                    goto out;
+                                    // CHECK:       if.then{{.*}}:
+                                    // CHECK-NEXT:    store i32 3, ptr [[CLEANUP_DEST_SLOT]], align 4
+                                    // CHECK-NEXT:    br label %[[CLEANUP1]]
+                                  }
+                                  Printy("c");
+                                  // CHECK:       if.end{{.*}}:
+                                  // CHECK-NEXT:    call void @_ZN6PrintyC1EPKc
+                                }),
+                                Printy("d")};
+                                // CHECK:           call void @_ZN6PrintyC1EPKc
+                                // CHECK-NEXT:      call void @_ZN8PrintiesD1Ev
+                                // CHECK-NEXT:      br label %in
+
+                  }
+                in:
+                  Printy("e");
+                // CHECK:       in:                                               ; preds = %if.end{{.*}}, %[[CLEANUP1]]
+                // CHECK-NEXT:    call void @_ZN6PrintyC1EPKc
+                }),
+                Printy("f")};
+                // CHECK:         call void @_ZN6PrintyC1EPKc
+                // CHECK-NEXT:    call void @_ZN8PrintiesD1Ev
+                // CHECK-NEXT:    br label %out
+  }
+out:
+  return;
+  // CHECK:       out:
+  // CHECK-NEXT:    ret void
+
+  // CHECK:       [[CLEANUP1]]:                                          ; preds = %if.then{{.*}}, %if.then
+  // CHECK-NEXT:    call void @_ZN6PrintyD1Ev
+  // CHECK-NEXT:    %cleanup.dest = load i32, ptr [[CLEANUP_DEST_SLOT]], align 4
+  // CHECK-NEXT:    switch i32 %cleanup.dest, label %[[CLEANUP2:.+]] [
+  // CHECK-NEXT:      i32 2, label %in
+  // CHECK-NEXT:    ]
+
+  // CHECK:       [[CLEANUP2]]:                                         ; preds = %[[CLEANUP1]]
+  // CHECK-NEXT:    call void @_ZN6PrintyD1Ev
+  // CHECK-NEXT:    %cleanup.dest{{.*}} = load i32, ptr [[CLEANUP_DEST_SLOT]], align 4
+  // CHECK-NEXT:    switch i32 %cleanup.dest{{.*}}, label %unreachable [
+  // CHECK-NEXT:      i32 3, label %out
+  // CHECK-NEXT:    ]
+}
+
+void ArrayInit() {
+  // Printy arr[4] = {ctorA, ctorB, stmt-exprC, stmt-exprD};
+  // Verify that:
+  //  - We do the necessary stores for array cleanups (endOfInit and last constructed element).
+  //  - We update the array init element correctly for ctorA, ctorB and stmt-exprC.
+  //  - stmt-exprC and stmt-exprD share the array body dtor code (see %cleanup).
+
+  // CHECK-LABEL: define dso_local void @_Z9ArrayInitv()
+  // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+  // CHECK: %cleanup.dest.slot = alloca i32, align 4
+  // CHECK: %arrayinit.begin = getelementptr inbounds [4 x %struct.Printy], ptr %arr, i64 0, i64 0
+  // CHECK: store ptr %arrayinit.begin, ptr %arrayinit.endOfInit, align 8
+  Printy arr[4] = {
+    Printy("a"),
+    // CHECK: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arrayinit.begin, ptr noundef @.str)
+    // CHECK: [[ARRAYINIT_ELEMENT1:%.+]] = getelementptr inbounds %struct.Printy, ptr %arrayinit.begin, i64 1
+    // CHECK: store ptr [[ARRAYINIT_ELEMENT1]], ptr %arrayinit.endOfInit, align 8
+    Printy("b"),
+    // CHECK: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT1]], ptr noundef @.str.1)
+    // CHECK: [[ARRAYINIT_ELEMENT2:%.+]] = getelementptr inbounds %struct.Printy, ptr [[ARRAYINIT_ELEMENT1]], i64 1
+    // CHECK: store ptr [[ARRAYINIT_ELEMENT2]], ptr %arrayinit.endOfInit, align 8
+    ({
+    // CHECK: br i1 {{.*}}, label %if.then, label %if.end
+      if (foo()) {
+        return;
+      // CHECK:       if.then:
+      // CHECK-NEXT:    store i32 1, ptr %cleanup.dest.slot, align 4
+      // CHECK-NEXT:    br label %cleanup
+      }
+      // CHECK:       if.end:
+      Printy("c");
+      // CHECK-NEXT:    call void @_ZN6PrintyC1EPKc
+      // CHECK-NEXT:    %arrayinit.element2 = getelementptr inbounds %struct.Printy, ptr %arrayinit.element1, i64 1
+      // CHECK-NEXT:    store ptr %arrayinit.element2, ptr %arrayinit.endOfInit, align 8
+    }),
+    ({
+    // CHECK: br i1 {{%.+}} label %[[IF_THEN2:.+]], label %[[IF_END2:.+]]
+      if (foo()) {
+        return;
+      // CHECK:       [[IF_THEN2]]:
+      // CHECK-NEXT:    store i32 1, ptr %cleanup.dest.slot, align 4
+      // CHECK-NEXT:    br label %cleanup
+      }
+      // CHECK:       [[IF_END2]]:
+      Printy("d");
+      // CHECK-NEXT:    call void @_ZN6PrintyC1EPKc
+      // CHECK-NEXT:    %array.begin = getelementptr inbounds [4 x %struct.Printy], ptr %arr, i32 0, i32 0
+      // CHECK-NEXT:    %0 = getelementptr inbounds %struct.Printy, ptr %array.begin, i64 4
+      // CHECK-NEXT:    br label %[[ARRAY_DESTROY_BODY1:.+]]
+  }),
+  };
+
+  // CHECK:       [[ARRAY_DESTROY_BODY1]]:
+  // CHECK-NEXT:    %arraydestroy.elementPast{{.*}} = phi ptr [ %0, %[[IF_END2]] ], [ %arraydestroy.element{{.*}}, %[[ARRAY_DESTROY_BODY1]] ]
+  // CHECK-NEXT:    %arraydestroy.element{{.*}} = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast{{.*}}, i64 -1
+  // CHECK-NEXT:    call void @_ZN6PrintyD1Ev
+  // CHECK-NEXT:    %arraydestroy.done{{.*}} = icmp eq ptr %arraydestroy.element{{.*}}, %array.begin
+  // CHECK-NEXT:    br i1 %arraydestroy.done{{.*}}, label %[[ARRAY_DESTROY_DONE1:.+]], label %[[ARRAY_DESTROY_BODY1]]
+
+  // CHECK:       [[ARRAY_DESTROY_DONE1]]:
+  // CHECK-NEXT:    ret void
+
+  // CHECK:       cleanup:
+  // CHECK-NEXT:    %1 = load ptr, ptr %arrayinit.endOfInit, align 8
+  // CHECK-NEXT:    %arraydestroy.isempty = icmp eq ptr %arrayinit.begin, %1
+  // CHECK-NEXT:    br i1 %arraydestroy.isempty, label %[[ARRAY_DESTROY_DONE2:.+]], label %[[ARRAY_DESTROY_BODY2:.+]]
+
+  // CHECK:       [[ARRAY_DESTROY_BODY2]]:
+  // CHECK-NEXT:    %arraydestroy.elementPast = phi ptr [ %1, %cleanup ], [ %arraydestroy.element, %[[ARRAY_DESTROY_BODY2]] ]
+  // CHECK-NEXT:    %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1
+  // CHECK-NEXT:    call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element)
+  // CHECK-NEXT:    %arraydestroy.done = icmp eq ptr %arraydestroy.element, %arrayinit.begin
+  // CHECK-NEXT:    br i1 %arraydestroy.done, label %[[ARRAY_DESTROY_DONE2]], label %[[ARRAY_DESTROY_BODY2]]
+
+  // CHECK:       [[ARRAY_DESTROY_DONE2]]:
+  // CHECK-NEXT:    br label %[[ARRAY_DESTROY_DONE1]]
+}
+
+void ArraySubobjects() {
+  struct S {
+    Printy arr1[2];
+    Printy arr2[2];
+    Printy p;
+  };
+  // CHECK-LABEL: define dso_local void @_Z15ArraySubobjectsv()
+  // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+  S s{{Printy("a"), Printy("b")},
+      // CHECK: call void @_ZN6PrintyC1EPKc
+      // CHECK: call void @_ZN6PrintyC1EPKc
+      {Printy("a"),
+      // CHECK: [[ARRAYINIT_BEGIN:%.+]] = getelementptr inbounds [2 x %struct.Printy]
+      // CHECK: store ptr [[ARRAYINIT_BEGIN]], ptr %arrayinit.endOfInit, align 8
+      // CHECK: call void @_ZN6PrintyC1EPKc
+      // CHECK: [[ARRAYINIT_ELEMENT:%.+]] = getelementptr inbounds %struct.Printy
+      // CHECK: store ptr [[ARRAYINIT_ELEMENT]], ptr %arrayinit.endOfInit, align 8
+      ({
+         if (foo()) {
+           return;
+           // CHECK:      if.then:
+           // CHECK-NEXT:   [[V0:%.+]] = load ptr, ptr %arrayinit.endOfInit, align 8
+           // CHECK-NEXT:   %arraydestroy.isempty = icmp eq ptr [[ARRAYINIT_BEGIN]], [[V0]]
+           // CHECK-NEXT:   br i1 %arraydestroy.isempty, label %[[ARRAY_DESTROY_DONE:.+]], label %[[ARRAY_DESTROY_BODY:.+]]
+         }
+         Printy("b");
+       })
+      },
+      Printy("c")
+      // CHECK:       if.end:
+      // CHECK-NEXT:    call void @_ZN6PrintyC1EPKc
+      // CHECK:         call void @_ZN6PrintyC1EPKc
+      // CHECK-NEXT:    call void @_ZZ15ArraySubobjectsvEN1SD1Ev
+      // CHECK-NEXT:    br label %return
+    };
+    // CHECK:       return:
+    // CHECK-NEXT:    ret void
+
+    // CHECK:       [[ARRAY_DESTROY_BODY]]:
+    // CHECK-NEXT:    %arraydestroy.elementPast = phi ptr [ %0, %if.then ], [ %arraydestroy.element, %[[ARRAY_DESTROY_BODY]] ]
+    // CHECK-NEXT:    %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1
+    // CHECK-NEXT:    call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element)
+    // CHECK-NEXT:    %arraydestroy.done = icmp eq ptr %arraydestroy.element, [[ARRAYINIT_BEGIN]]
+    // CHECK-NEXT:    br i1 %arraydestroy.done, label %[[ARRAY_DESTROY_DONE]], label %[[ARRAY_DESTROY_BODY]]
+
+    // CHECK:       [[ARRAY_DESTROY_DONE]]
+    // CHECK-NEXT:    [[ARRAY_BEGIN:%.+]] = getelementptr inbounds [2 x %struct.Printy], ptr %arr1, i32 0, i32 0
+    // CHECK-NEXT:    [[V1:%.+]] = getelementptr inbounds %struct.Printy, ptr [[ARRAY_BEGIN]], i64 2
+    // CHECK-NEXT:    br label %[[ARRAY_DESTROY_BODY2:.+]]
+
+    // CHECK:       [[ARRAY_DESTROY_BODY2]]:
+    // CHECK-NEXT:    %arraydestroy.elementPast5 = phi ptr [ %1, %[[ARRAY_DESTROY_DONE]] ], [ %arraydestroy.element6, %[[ARRAY_DESTROY_BODY2]] ]
+    // CHECK-NEXT:    %arraydestroy.element6 = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast5, i64 -1
+    // CHECK-NEXT:    call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element6)
+    // CHECK-NEXT:    %arraydestroy.done7 = icmp eq ptr %arraydestroy.element6, [[ARRAY_BEGIN]]
+    // CHECK-NEXT:    br i1 %arraydestroy.done7, label %[[ARRAY_DESTROY_DONE2:.+]], label %[[ARRAY_DESTROY_BODY2]]
+
+
+    // CHECK:     [[ARRAY_DESTROY_DONE2]]:
+    // CHECK-NEXT:  br label %return
+}
+
+void LambdaInit() {
+  // CHECK-LABEL: define dso_local void @_Z10LambdaInitv()
+  auto S = [a = Printy("a"), b = ({
+                               if (foo()) {
+                                 return;
+                                 // CHECK:       if.then:
+                                 // CHECK-NEXT:    call void @_ZN6PrintyD1Ev
+                                 // CHECK-NEXT:    br label %return
+                               }
+                               Printy("b");
+                             })]() { return a; };
+}
+
+struct PrintyRefBind {
+  const Printy &a;
+  const Printy &b;
+};
+
+struct Temp {
+  Temp();
+  ~Temp();
+};
+Temp CreateTemp();
+Printy CreatePrinty();
+Printy CreatePrinty(const Temp&);
+
+void LifetimeExtended() {
+  // CHECK-LABEL: define dso_local void @_Z16LifetimeExtendedv
+  PrintyRefBind ps = {Printy("a"), ({
+                        if (foo()) {
+                          return;
+                          // CHECK: if.then:
+                          // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+                          // CHECK-NEXT: br label %return
+                        }
+                        Printy("b");
+                      })};
+}
+
+void ConditionalLifetimeExtended() {
+  // CHECK-LABEL: @_Z27ConditionalLifetimeExtendedv()
+
+  // Verify that we create two cleanup flags.
+  //  1. First for the cleanup which is deactivated after full expression.
+  //  2. Second for the life-ext cleanup which is activated if the branch is taken.
+
+  // Note: We use `CreateTemp()` to ensure that life-ext destroy cleanup is not at
+  // the top of EHStack on deactivation. This ensures using active flags.
+
+  Printy* p1 = nullptr;
+  // CHECK:       store i1 false, ptr [[BRANCH1_DEFERRED:%cleanup.cond]], align 1
+  // CHECK-NEXT:  store i1 false, ptr [[BRANCH1_LIFEEXT:%cleanup.cond.*]], align 1
+  PrintyRefBind ps = {
+      p1 != nullptr ? static_cast<const Printy&>(CreatePrinty())
+      // CHECK:       cond.true:
+      // CHECK-NEXT:    call void @_Z12CreatePrintyv
+      // CHECK-NEXT:    store i1 true, ptr [[BRANCH1_DEFERRED]], align 1
+      // CHECK-NEXT:    store i1 true, ptr [[BRANCH1_LIFEEXT]], align 1
+      // CHECK-NEXT:    br label %{{.*}}
+      : foo() ? static_cast<const Printy&>(CreatePrinty(CreateTemp()))
+              : *p1,
+      ({
+        if (foo()) return;
+        Printy("c");
+        // CHECK:       if.end:
+        // CHECK-NEXT:    call void @_ZN6PrintyC1EPKc
+        // CHECK-NEXT:    store ptr
+      })};
+      // CHECK-NEXT:      store i1 false, ptr [[BRANCH1_DEFERRED]], align 1
+      // CHECK-NEXT:      store i32 0, ptr %cleanup.dest.slot, align 4
+      // CHECK-NEXT:      br label %cleanup
+
+}
+
+void NewArrayInit() {
+  // CHECK-LABEL: define dso_local void @_Z12NewArrayInitv()
+  // CHECK: %array.init.end = alloca ptr, align 8
+  // CHECK: store ptr %0, ptr %array.init.end, align 8
+  Printy *array = new Printy[3]{
+    "a",
+    // CHECK: call void @_ZN6PrintyC1EPKc
+    // CHECK: store ptr %array.exp.next, ptr %array.init.end, align 8
+    "b", 
+    // CHECK: call void @_ZN6PrintyC1EPKc
+    // CHECK: store ptr %array.exp.next1, ptr %array.init.end, align 8
+    ({
+        if (foo()) {
+          return;
+          // CHECK: if.then:
+          // CHECK:   br i1 %arraydestroy.isempty, label %arraydestroy.done{{.*}}, label %arraydestroy.body
+        }
+        "b";
+        // CHECK: if.end:
+        // CHECK:   call void @_ZN6PrintyC1EPKc
+    })};
+  // CHECK:       arraydestroy.body:
+  // CHECK-NEXT:    %arraydestroy.elementPast = phi ptr [ %{{.*}}, %if.then ], [ %arraydestroy.element, %arraydestroy.body ]
+  // CHECK-NEXT:    %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1
+  // CHECK-NEXT:    call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element)
+  // CHECK-NEXT:    %arraydestroy.done = icmp eq ptr %arraydestroy.element, %0
+  // CHECK-NEXT:    br i1 %arraydestroy.done, label %arraydestroy.done{{.*}}, label %arraydestroy.body
+
+  // CHECK:       arraydestroy.done{{.*}}:                               ; preds = %arraydestroy.body, %if.then
+  // CHECK-NEXT:    br label %return
+}
+
+void DestroyInConditionalCleanup() {
+  // EH-LABEL: DestroyInConditionalCleanupv()
+  // NOEH-LABEL: DestroyInConditionalCleanupv()
+  struct A {
+    A() {}
+    ~A() {}
+  };
+
+  struct Value {
+    Value(A) {}
+    ~Value() {}
+  };
+
+  struct V2 {
+    Value K;
+    Value V;
+  };
+  // Verify we use conditional cleanups.
+  (void)(foo() ? V2{A(), A()} : V2{A(), A()});
+  // NOEH:   cond.true:
+  // NOEH:      call void @_ZZ27DestroyInConditionalCleanupvEN1AC1Ev
+  // NOEH:      store ptr %{{.*}}, ptr %cond-cleanup.save
+
+  // EH:   cond.true:
+  // EH:        invoke void @_ZZ27DestroyInConditionalCleanupvEN1AC1Ev
+  // EH:        store ptr %{{.*}}, ptr %cond-cleanup.save
+}
+
+void ArrayInitWithContinue() {
+  // CHECK-LABEL: @_Z21ArrayInitWithContinuev
+  // Verify that we start to emit the array destructor.
+  // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+  for (int i = 0; i < 1; ++i) {
+    Printy arr[2] = {"a", ({
+                       if (foo()) {
+                         continue;
+                       }
+                       "b";
+                     })};
+  }
+}
+
+struct [[clang::trivial_abi]] HasTrivialABI {
+  HasTrivialABI();
+  ~HasTrivialABI();
+};
+void AcceptTrivialABI(HasTrivialABI, int);
+void TrivialABI() {
+  // CHECK-LABEL: define dso_local void @_Z10TrivialABIv()
+  AcceptTrivialABI(HasTrivialABI(), ({
+                     if (foo()) return;
+                     // CHECK:      if.then:
+                     // CHECK-NEXT:   call void @_ZN13HasTrivialABID1Ev
+                     // CHECK-NEXT:   br label %return
+                     0;
+                   }));
+}
+
+namespace CleanupFlag {
+struct A {
+  A() {}
+  ~A() {}
+};
+
+struct B {
+  B(const A&) {}
+  B() {}
+  ~B() {}
+};
+
+struct S {
+  A a;
+  B b;
+};
+
+int AcceptS(S s);
+
+void Accept2(int x, int y);
+
+void InactiveNormalCleanup() {
+  // CHECK-LABEL: define {{.*}}InactiveNormalCleanupEv()
+  
+  // The first A{} below is an inactive normal cleanup which
+  // is not popped from EHStack on deactivation. This needs an
+  // "active" cleanup flag.
+
+  // CHECK: [[ACTIVE:%cleanup.isactive.*]] = alloca i1, align 1
+  // CHECK: call void [[A_CTOR:@.*AC1Ev]]
+  // CHECK: store i1 true, ptr [[ACTIVE]], align 1
+  // CHECK: call void [[A_CTOR]]
+  // CHECK: call void [[B_CTOR:@.*BC1ERKNS_1AE]]
+  // CHECK: store i1 false, ptr [[ACTIVE]], align 1
+  // CHECK: call noundef i32 [[ACCEPTS:@.*AcceptSENS_1SE]]
+  Accept2(AcceptS({.a = A{}, .b = A{}}), ({
+            if (foo()) return;
+            // CHECK: if.then:
+            // CHECK:   br label %cleanup
+            0;
+            // CHECK: if.end:
+            // CHECK:   call void [[ACCEPT2:@.*Accept2Eii]]
+            // CHECK:   br label %cleanup
+          }));
+  // CHECK: cleanup:
+  // CHECK:   call void [[S_DTOR:@.*SD1Ev]]
+  // CHECK:   call void [[A_DTOR:@.*AD1Ev]]
+  // CHECK:   %cleanup.is_active = load i1, ptr [[ACTIVE]]
+  // CHECK:   br i1 %cleanup.is_active, label %cleanup.action, label %cleanup.done
+
+  // CHECK: cleanup.action:
+  // CHECK:   call void [[A_DTOR]]
+
+  // The "active" cleanup flag is not required for unused cleanups.
+  Accept2(AcceptS({.a = A{}, .b = A{}}), 0);
+  // CHECK: cleanup.cont:
+  // CHECK:   call void [[A_CTOR]]
+  // CHECK-NOT: store i1 true
+  // CHECK:   call void [[A_CTOR]]
+  // CHECK:   call void [[B_CTOR]]
+  // CHECK-NOT: store i1 false
+  // CHECK:   call noundef i32 [[ACCEPTS]]
+  // CHECK:   call void [[ACCEPT2]]
+  // CHECK:   call void [[S_DTOR]]
+  // CHECK:   call void [[A_DTOR]]
+  // CHECK:   br label %return
+}
+}  // namespace CleanupFlag
diff --git a/clang/test/CodeGenCXX/dependent-template-alias.cpp b/clang/test/CodeGenCXX/dependent-template-alias.cpp
new file mode 100644
index 000000000000..deb243f9fc88
--- /dev/null
+++ b/clang/test/CodeGenCXX/dependent-template-alias.cpp
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -triple x86_64-unk-unk -o - -emit-llvm -debug-info-kind=standalone -gtemplate-alias %s -gsimple-template-names=simple \
+// RUN: | FileCheck %s
+
+//// Check that -gtemplate-alias falls back to DW_TAG_typedef emission
+//// for instantiation dependent type aliases.
+
+template <int>
+using A = int;
+
+template<int I>
+struct S {
+  using AA = A<I>;
+  AA aa;
+};
+
+S<0> s;
+
+// CHECK: !DIDerivedType(tag: DW_TAG_member, name: "aa", scope: ![[#]], file: ![[#]], line: [[#]], baseType: ![[AA:[0-9]+]], size: 32)
+// CHECK: [[AA]] = !DIDerivedType(tag: DW_TAG_typedef, name: "AA", file: ![[#]], line: [[#]], baseType: ![[A:[0-9]+]])
+// CHECK: [[A]] = !DIDerivedType(tag: DW_TAG_typedef, name: "A<I>", file: ![[#]], line: [[#]], baseType: ![[int:[0-9]+]])
+// CHECK: [[int]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/clang/test/CodeGenCXX/mangle-ms-vector-types.cpp b/clang/test/CodeGenCXX/mangle-ms-vector-types.cpp
index 6699c99deaec..9a0c88961662 100644
--- a/clang/test/CodeGenCXX/mangle-ms-vector-types.cpp
+++ b/clang/test/CodeGenCXX/mangle-ms-vector-types.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fms-extensions -fcxx-exceptions -ffreestanding -target-feature +avx -emit-llvm %s -o - -triple=i686-pc-win32 | FileCheck %s
+// RUN: %clang_cc1 -fms-extensions -fcxx-exceptions -ffreestanding -target-feature +avx -emit-llvm %s -o - -triple=i686-pc-win32 -fexperimental-new-constant-interpreter | FileCheck %s
 
 #include <xmmintrin.h>
 #include <emmintrin.h>
diff --git a/clang/test/CodeGenCXX/mangle.cpp b/clang/test/CodeGenCXX/mangle.cpp
index 31467d943840..d0800af55c87 100644
--- a/clang/test/CodeGenCXX/mangle.cpp
+++ b/clang/test/CodeGenCXX/mangle.cpp
@@ -1032,10 +1032,6 @@ namespace test51 {
   template <typename T>
   decltype(S1<T>().~S1<T>(), S1<T>().~S1<T>()) fun4() {};
   template <typename T>
-  decltype(S1<int>().~S1<T>()) fun5(){};
-  template <template <typename T> class U>
-  decltype(S1<int>().~U<int>()) fun6(){};
-  template <typename T>
   decltype(E().E::~T()) fun7() {}
   template <template <typename> class U>
   decltype(X<int>::Y().U<int>::Y::~Y()) fun8() {}
@@ -1047,10 +1043,6 @@ namespace test51 {
   // CHECK-LABEL: @_ZN6test514fun3I2S1IiEiEEDTcldtcvS1_IT0_E_EdnT_EEv
   template void fun4<int>();
   // CHECK-LABEL: @_ZN6test514fun4IiEEDTcmcldtcv2S1IT_E_Edn2S1IS2_EEcldtcvS3__Edn2S1IS2_EEEv
-  template void fun5<int>();
-  // CHECK-LABEL: @_ZN6test514fun5IiEEDTcldtcv2S1IiE_Edn2S1IT_EEEv
-  template void fun6<S1>();
-  // CHECK-LABEL: @_ZN6test514fun6I2S1EEDTcldtcvS1_IiE_EdnT_IiEEEv
   template void fun7<E>();
   // CHECK-LABEL: @_ZN6test514fun7INS_1EEEEDTcldtcvS1__Esr1EEdnT_EEv
   template void fun8<X>();
diff --git a/clang/test/CodeGenCXX/pragma-gcc-unroll.cpp b/clang/test/CodeGenCXX/pragma-gcc-unroll.cpp
index 8a94a5cc91e2..85f10fcdff14 100644
--- a/clang/test/CodeGenCXX/pragma-gcc-unroll.cpp
+++ b/clang/test/CodeGenCXX/pragma-gcc-unroll.cpp
@@ -116,6 +116,34 @@ void while_unroll_zero_test(int *List, int Length) {
   }
 }
 
+using size_t = unsigned long long;
+
+template <bool Flag>
+int value_dependent(int n) {
+  // CHECK: define {{.*}} @_Z15value_dependentILb1EEii
+  constexpr int N = 100;
+  auto init = [=]() { return Flag ? n : 0UL; };
+  auto cond = [=](size_t ix) { return Flag ? ix != 0 : ix < 10; };
+  auto iter = [=](size_t ix) {
+    return Flag ? ix & ~(1ULL << __builtin_clzll(ix)) : ix + 1;
+  };
+#pragma GCC unroll Flag ? 1 : N
+  for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+    // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_16:.*]]
+    n *= n;
+  }
+#pragma GCC unroll Flag ? 0 : N
+  for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+    // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_17:.*]]
+    n *= n;
+  }
+  return n;
+}
+
+void test_value_dependent(int n) {
+  value_dependent<true>(n);
+}
+
 // CHECK: ![[LOOP_1]] = distinct !{![[LOOP_1]], [[MP:![0-9]+]], ![[UNROLL_ENABLE:.*]]}
 // CHECK: ![[UNROLL_ENABLE]] = !{!"llvm.loop.unroll.enable"}
 // CHECK: ![[LOOP_2]] = distinct !{![[LOOP_2:.*]], ![[UNROLL_DISABLE:.*]]}
@@ -129,3 +157,5 @@ void while_unroll_zero_test(int *List, int Length) {
 // CHECK: ![[LOOP_7]] = distinct !{![[LOOP_7]], ![[UNROLL_8:.*]]}
 // CHECK: ![[LOOP_14]] = distinct !{![[LOOP_14]], [[MP]], ![[UNROLL_DISABLE:.*]]}
 // CHECK: ![[LOOP_15]] = distinct !{![[LOOP_15]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_16]] = distinct !{![[LOOP_16]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_17]] = distinct !{![[LOOP_17]], [[MP]], ![[UNROLL_DISABLE:.*]]}
diff --git a/clang/test/CodeGenCXX/pragma-unroll.cpp b/clang/test/CodeGenCXX/pragma-unroll.cpp
index 02d9bad7148d..6754788b7243 100644
--- a/clang/test/CodeGenCXX/pragma-unroll.cpp
+++ b/clang/test/CodeGenCXX/pragma-unroll.cpp
@@ -96,6 +96,54 @@ void template_test(double *List, int Length) {
   for_template_define_test<double>(List, Length, Value);
 }
 
+void for_unroll_zero_test(int *List, int Length) {
+  // CHECK: define {{.*}} @_Z20for_unroll_zero_testPii
+  #pragma unroll 0
+  for (int i = 0; i < Length; i++) {
+    // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_14:.*]]
+    List[i] = i * 2;
+  }
+}
+
+void while_unroll_zero_test(int *List, int Length) {
+  // CHECK: define {{.*}} @_Z22while_unroll_zero_testPii
+  int i = 0;
+#pragma unroll(0)
+  while (i < Length) {
+    // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_15:.*]]
+    List[i] = i * 2;
+    i++;
+  }
+}
+
+using size_t = unsigned long long;
+
+template <bool Flag>
+int value_dependent(int n) {
+  // CHECK: define {{.*}} @_Z15value_dependentILb1EEii
+  constexpr int N = 100;
+  auto init = [=]() { return Flag ? n : 0UL; };
+  auto cond = [=](size_t ix) { return Flag ? ix != 0 : ix < 10; };
+  auto iter = [=](size_t ix) {
+    return Flag ? ix & ~(1ULL << __builtin_clzll(ix)) : ix + 1;
+  };
+#pragma unroll Flag ? 1 : N
+  for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+    // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_16:.*]]
+    n *= n;
+  }
+#pragma unroll Flag ? 0 : N
+  for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+    // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_17:.*]]
+    n *= n;
+  }
+  return n;
+}
+
+void test_value_dependent(int n) {
+  value_dependent<true>(n);
+}
+
 // CHECK: ![[LOOP_1]] = distinct !{![[LOOP_1]], [[MP:![0-9]+]], ![[UNROLL_ENABLE:.*]]}
 // CHECK: ![[UNROLL_ENABLE]] = !{!"llvm.loop.unroll.enable"}
 // CHECK: ![[LOOP_2]] = distinct !{![[LOOP_2:.*]], ![[UNROLL_DISABLE:.*]]}
@@ -107,3 +155,7 @@ void template_test(double *List, int Length) {
 // CHECK: ![[LOOP_5]] = distinct !{![[LOOP_5]], ![[UNROLL_8:.*]]}
 // CHECK: ![[LOOP_6]] = distinct !{![[LOOP_6]], ![[UNROLL_8:.*]]}
 // CHECK: ![[LOOP_7]] = distinct !{![[LOOP_7]], ![[UNROLL_8:.*]]}
+// CHECK: ![[LOOP_14]] = distinct !{![[LOOP_14]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_15]] = distinct !{![[LOOP_15]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_16]] = distinct !{![[LOOP_16]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_17]] = distinct !{![[LOOP_17]], [[MP]], ![[UNROLL_DISABLE:.*]]}
diff --git a/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp b/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp
new file mode 100644
index 000000000000..5b2d01465784
--- /dev/null
+++ b/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp
@@ -0,0 +1,78 @@
+// REQUIRES: x86_64-linux
+// This tests that the coroutine elide optimization could happen succesfully with ThinLTO.
+// This test is adapted from coro-elide.cpp and splits functions into two files.
+//
+// RUN: split-file %s %t
+// RUN: %clang --target=x86_64-linux -std=c++20 -O2 -flto=thin -I %S -c %t/coro-elide-callee.cpp -o %t/coro-elide-callee.o
+// RUN: %clang --target=x86_64-linux -std=c++20 -O2 -flto=thin -I %S -c %t/coro-elide-caller.cpp -o %t/coro-elide-caller.o
+// RUN: llvm-lto -thinlto %t/coro-elide-callee.o %t/coro-elide-caller.o -o %t/summary
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -O2 -x ir %t/coro-elide-caller.o -fthinlto-index=%t/summary.thinlto.bc -emit-llvm -o - | FileCheck %s
+
+//--- coro-elide-task.h
+#pragma once
+#include "Inputs/coroutine.h"
+
+struct Task {
+  struct promise_type {
+    struct FinalAwaiter {
+      bool await_ready() const noexcept { return false; }
+      template <typename PromiseType>
+      std::coroutine_handle<> await_suspend(std::coroutine_handle<PromiseType> h) noexcept {
+        if (!h)
+          return std::noop_coroutine();
+        return h.promise().continuation;
+      }
+      void await_resume() noexcept {}
+    };
+    Task get_return_object() noexcept {
+      return std::coroutine_handle<promise_type>::from_promise(*this);
+    }
+    std::suspend_always initial_suspend() noexcept { return {}; }
+    FinalAwaiter final_suspend() noexcept { return {}; }
+    void unhandled_exception() noexcept {}
+    void return_value(int x) noexcept {
+      _value = x;
+    }
+    std::coroutine_handle<> continuation;
+    int _value;
+  };
+
+  Task(std::coroutine_handle<promise_type> handle) : handle(handle) {}
+  ~Task() {
+    if (handle)
+      handle.destroy();
+  }
+
+  struct Awaiter {
+    bool await_ready() const noexcept { return false; }
+    void await_suspend(std::coroutine_handle<void> continuation) noexcept {}
+    int await_resume() noexcept {
+      return 43;
+    }
+  };
+
+  auto operator co_await() {
+    return Awaiter{};
+  }
+
+private:
+  std::coroutine_handle<promise_type> handle;
+};
+
+//--- coro-elide-callee.cpp
+#include "coro-elide-task.h"
+Task task0() {
+  co_return 43;
+}
+
+//--- coro-elide-caller.cpp
+#include "coro-elide-task.h"
+
+Task task0();
+
+Task task1() {
+  co_return co_await task0();
+}
+
+// CHECK-LABEL: define{{.*}} void @_Z5task1v.resume
+// CHECK-NOT: {{.*}}_Znwm
diff --git a/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp b/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp
new file mode 100644
index 000000000000..06cc2069dbe9
--- /dev/null
+++ b/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp
@@ -0,0 +1,93 @@
+// RUN: %clang_cc1 --std=c++20 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+
+#include "Inputs/coroutine.h"
+
+struct Printy {
+  Printy(const char *name) : name(name) {}
+  ~Printy() {}
+  const char *name;
+};
+
+struct coroutine {
+  struct promise_type;
+  std::coroutine_handle<promise_type> handle;
+  ~coroutine() {
+    if (handle) handle.destroy();
+  }
+};
+
+struct coroutine::promise_type {
+  coroutine get_return_object() {
+    return {std::coroutine_handle<promise_type>::from_promise(*this)};
+  }
+  std::suspend_never initial_suspend() noexcept { return {}; }
+  std::suspend_always final_suspend() noexcept { return {}; }
+  void return_void() {}
+  void unhandled_exception() {}
+};
+
+struct Awaiter : std::suspend_always {
+  Printy await_resume() { return {"awaited"}; }
+};
+
+int foo() { return 2; }
+
+coroutine ArrayInitCoro() {
+  // Verify that:
+  //  - We do the necessary stores for array cleanups.
+  //  - Array cleanups are called by await.cleanup.
+  //  - We activate the cleanup after the first element and deactivate it in await.ready (see cleanup.isactive).
+
+  // CHECK-LABEL: define dso_local void @_Z13ArrayInitCorov
+  // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+  // CHECK: %cleanup.isactive = alloca i1, align 1
+  Printy arr[2] = {
+    Printy("a"),
+    // CHECK:       %arrayinit.begin = getelementptr inbounds [2 x %struct.Printy], ptr %arr.reload.addr, i64 0, i64 0
+    // CHECK-NEXT:  %arrayinit.begin.spill.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 10
+    // CHECK-NEXT:  store ptr %arrayinit.begin, ptr %arrayinit.begin.spill.addr, align 8
+    // CHECK-NEXT:  store i1 true, ptr %cleanup.isactive.reload.addr, align 1
+    // CHECK-NEXT:  store ptr %arrayinit.begin, ptr %arrayinit.endOfInit.reload.addr, align 8
+    // CHECK-NEXT:  call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arrayinit.begin, ptr noundef @.str)
+    // CHECK-NEXT:  %arrayinit.element = getelementptr inbounds %struct.Printy, ptr %arrayinit.begin, i64 1
+    // CHECK-NEXT:  %arrayinit.element.spill.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 11
+    // CHECK-NEXT:  store ptr %arrayinit.element, ptr %arrayinit.element.spill.addr, align 8
+    // CHECK-NEXT:  store ptr %arrayinit.element, ptr %arrayinit.endOfInit.reload.addr, align 8
+    co_await Awaiter{}
+    // CHECK-NEXT:  @_ZNSt14suspend_always11await_readyEv
+    // CHECK-NEXT:  br i1 %{{.+}}, label %await.ready, label %CoroSave30
+  };
+  // CHECK:       await.cleanup:                                    ; preds = %AfterCoroSuspend{{.*}}
+  // CHECK-NEXT:    br label %cleanup{{.*}}.from.await.cleanup
+
+  // CHECK:       cleanup{{.*}}.from.await.cleanup:                      ; preds = %await.cleanup
+  // CHECK:         br label %cleanup{{.*}}
+
+  // CHECK:       await.ready:
+  // CHECK-NEXT:    %arrayinit.element.reload.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 11
+  // CHECK-NEXT:    %arrayinit.element.reload = load ptr, ptr %arrayinit.element.reload.addr, align 8
+  // CHECK-NEXT:    call void @_ZN7Awaiter12await_resumeEv
+  // CHECK-NEXT:    store i1 false, ptr %cleanup.isactive.reload.addr, align 1
+  // CHECK-NEXT:    br label %cleanup{{.*}}.from.await.ready
+
+  // CHECK:       cleanup{{.*}}:                                         ; preds = %cleanup{{.*}}.from.await.ready, %cleanup{{.*}}.from.await.cleanup
+  // CHECK:         %cleanup.is_active = load i1, ptr %cleanup.isactive.reload.addr, align 1
+  // CHECK-NEXT:    br i1 %cleanup.is_active, label %cleanup.action, label %cleanup.done
+
+  // CHECK:       cleanup.action:
+  // CHECK:         %arraydestroy.isempty = icmp eq ptr %arrayinit.begin.reload{{.*}}, %{{.*}}
+  // CHECK-NEXT:    br i1 %arraydestroy.isempty, label %arraydestroy.done{{.*}}, label %arraydestroy.body.from.cleanup.action
+  // Ignore rest of the array cleanup.
+}
+
+coroutine ArrayInitWithCoReturn() {
+  // CHECK-LABEL: define dso_local void @_Z21ArrayInitWithCoReturnv
+  // Verify that we start to emit the array destructor.
+  // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+  Printy arr[2] = {"a", ({
+                      if (foo()) {
+                        co_return;
+                      }
+                      "b";
+                    })};
+}
diff --git a/clang/test/CodeGenObjC/arc-blocks-exceptions.m b/clang/test/CodeGenObjC/arc-blocks-exceptions.m
index 821b818d4027..54b043d8ea07 100644
--- a/clang/test/CodeGenObjC/arc-blocks-exceptions.m
+++ b/clang/test/CodeGenObjC/arc-blocks-exceptions.m
@@ -5,17 +5,22 @@ void test1(_Bool c) {
   __weak id weakId = 0;
   test1_fn(c ? ^{ (void)weakId; } : 0);
 
-  // CHECK: [[CLEANUP_COND:%.*]] = alloca i1
-  // CHECK-NEXT: [[CLEANUP_SAVE:%.*]] = alloca ptr
+  // CHECK: [[CLEANUP_SAVE:%cond-cleanup.save.*]] = alloca ptr
+  // CHECK-NEXT: [[CLEANUP_COND:%.*]] = alloca i1
+  // CHECK-NEXT: [[CLEANUP_COND1:%.*]] = alloca i1
 
-  // CHECK: store i1 true, ptr [[CLEANUP_COND]]
-  // CHECK-NEXT: store ptr {{.*}}, ptr [[CLEANUP_SAVE]]
+  // CHECK: store i1 false, ptr [[CLEANUP_COND]]
+  // CHECK-NEXT: store i1 false, ptr [[CLEANUP_COND1]]
+
+  // CHECK: store ptr {{.*}}, ptr [[CLEANUP_SAVE]]
+  // CHECK-NEXT: store i1 true, ptr [[CLEANUP_COND]]
+  // CHECK-NEXT: store i1 true, ptr [[CLEANUP_COND1]]
 
   // CHECK: invoke void @test1_fn(
   // CHECK-NEXT: to label %[[INVOKE_CONT:.*]] unwind label %[[LANDING_PAD_LAB:.*]]
 
   // CHECK: [[INVOKE_CONT]]:
-  // CHECK-NEXT: [[LOAD:%.*]] = load i1, ptr [[CLEANUP_COND]]
+  // CHECK-NEXT: [[LOAD:%.*]] = load i1, ptr [[CLEANUP_COND1]]
   // CHECK-NEXT: br i1 [[LOAD]], label %[[END_OF_SCOPE_LAB:.*]], label
 
   // CHECK: [[END_OF_SCOPE_LAB]]:
diff --git a/clang/test/CodeGenObjC/arc-blocks.m b/clang/test/CodeGenObjC/arc-blocks.m
index 105a72b4af1e..f718e8bbf9a6 100644
--- a/clang/test/CodeGenObjC/arc-blocks.m
+++ b/clang/test/CodeGenObjC/arc-blocks.m
@@ -445,8 +445,8 @@ void test13(id x) {
   // CHECK:      [[X:%.*]] = alloca ptr, align 8
   // CHECK-NEXT: [[B:%.*]] = alloca ptr, align 8
   // CHECK-NEXT: [[BLOCK:%.*]] = alloca [[BLOCK_T:.*]], align 8
-  // CHECK-NEXT: [[CLEANUP_ACTIVE:%.*]] = alloca i1
   // CHECK-NEXT: [[COND_CLEANUP_SAVE:%.*]] = alloca ptr,
+  // CHECK-NEXT: [[CLEANUP_ACTIVE:%.*]] = alloca i1
   // CHECK-NEXT: [[T0:%.*]] = call ptr @llvm.objc.retain(ptr {{%.*}})
   // CHECK-NEXT: store ptr [[T0]], ptr [[X]], align 8
   // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[B]])
@@ -460,8 +460,8 @@ void test13(id x) {
   // CHECK-NEXT: [[T0:%.*]] = load ptr, ptr [[X]], align 8
   // CHECK-NEXT: [[T1:%.*]] = call ptr @llvm.objc.retain(ptr [[T0]])
   // CHECK-NEXT: store ptr [[T1]], ptr [[CAPTURE]], align 8
-  // CHECK-NEXT: store i1 true, ptr [[CLEANUP_ACTIVE]]
   // CHECK-NEXT: store ptr [[CAPTURE]], ptr [[COND_CLEANUP_SAVE]], align 8
+  // CHECK-NEXT: store i1 true, ptr [[CLEANUP_ACTIVE]]
   // CHECK-NEXT: br label
   // CHECK:      br label
   // CHECK:      [[T0:%.*]] = phi ptr
diff --git a/clang/test/CodeGenObjCXX/msabi-stret-arm64.mm b/clang/test/CodeGenObjCXX/msabi-stret-arm64.mm
new file mode 100644
index 000000000000..3bbdbebc5cb5
--- /dev/null
+++ b/clang/test/CodeGenObjCXX/msabi-stret-arm64.mm
@@ -0,0 +1,77 @@
+// RUN: %clang_cc1 -triple aarch64-pc-windows-msvc -fobjc-runtime=gnustep-2.2 -fobjc-dispatch-method=non-legacy -emit-llvm -o - %s | FileCheck %s
+
+// Pass and return for type size <= 8 bytes.
+struct S1 {
+  int a[2];
+};
+
+// Pass and return hfa <= 8 bytes
+struct F1 {
+  float a[2];
+};
+
+// Pass and return for type size > 16 bytes.
+struct S2 {
+  int a[5];
+};
+
+// Pass and return aggregate (of size < 16 bytes) with non-trivial destructor.
+// Sret and inreg: Returned in x0
+struct S3 {
+  int a[3];
+  ~S3();
+};
+S3::~S3() {
+}
+
+
+@interface MsgTest { id isa; } @end
+@implementation MsgTest 
+- (S1) smallS1 {
+  S1 x;
+  x.a[0] = 0;
+  x.a[1] = 1;
+  return x;
+
+}
+- (F1) smallF1 {
+  F1 x;
+  x.a[0] = 0.2f;
+  x.a[1] = 0.5f;
+  return x;
+}
+- (S2) stretS2 {
+  S2 x;
+  for (int i = 0; i < 5; i++) {
+    x.a[i] = i;
+  }
+  return x;
+}
+- (S3) stretInRegS3 {
+  S3 x;
+  for (int i = 0; i < 3; i++) {
+    x.a[i] = i;
+  }
+  return x;
+}
++ (S3) msgTestStretInRegS3 {
+  S3 x;
+  for (int i = 0; i < 3; i++) {
+    x.a[i] = i;
+  }
+  return x;
+}
+@end
+
+void test0(MsgTest *t) {
+    // CHECK: call {{.*}} @objc_msgSend
+    S1 ret = [t smallS1];
+    // CHECK: call {{.*}} @objc_msgSend
+    F1 ret2 = [t smallF1];
+    // CHECK: call {{.*}} @objc_msgSend_stret
+    S2 ret3 = [t stretS2];
+    // CHECK: call {{.*}} @objc_msgSend_stret2
+    S3 ret4 = [t stretInRegS3];
+    // CHECK: call {{.*}} @objc_msgSend_stret2
+    S3 ret5 = [MsgTest msgTestStretInRegS3];
+}
diff --git a/clang/test/Driver/aarch64-mcpu.c b/clang/test/Driver/aarch64-mcpu.c
index 77ba43122b24..ad4a5f9ac6fb 100644
--- a/clang/test/Driver/aarch64-mcpu.c
+++ b/clang/test/Driver/aarch64-mcpu.c
@@ -64,10 +64,16 @@
 // NEOVERSE-V1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-v1"
 // RUN: %clang --target=aarch64 -mcpu=neoverse-v2  -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-V2 %s
 // NEOVERSE-V2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-v2"
+// RUN: %clang --target=aarch64 -mcpu=neoverse-v3  -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-V3 %s
+// NEOVERSE-V3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-v3"
+// RUN: %clang --target=aarch64 -mcpu=neoverse-v3ae  -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-V3AE %s
+// NEOVERSE-V3AE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-v3ae"
 // RUN: %clang --target=aarch64 -mcpu=neoverse-n1 -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-N1 %s
 // NEOVERSE-N1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-n1"
 // RUN: %clang --target=aarch64 -mcpu=neoverse-n2 -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-N2 %s
 // NEOVERSE-N2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-n2"
+// RUN: %clang --target=aarch64 -mcpu=neoverse-n3 -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-N3 %s
+// NEOVERSE-N3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-n3"
 // RUN: %clang --target=aarch64 -mcpu=neoverse-512tvb -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-512TVB %s
 // NEOVERSE-512TVB: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-512tvb"
 // RUN: %clang --target=aarch64 -mcpu=cortex-a520 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A520 %s
diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c
index faaff05004f6..8ab6a0713147 100644
--- a/clang/test/Driver/amdgpu-toolchain.c
+++ b/clang/test/Driver/amdgpu-toolchain.c
@@ -27,4 +27,4 @@
 
 // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
 // RUN:   -fuse-ld=ld %s 2>&1 | FileCheck -check-prefixes=LD %s
-// LD: ld.lld"
+// LD: ld.lld
diff --git a/clang/test/Driver/claim-unused.c b/clang/test/Driver/claim-unused.c
deleted file mode 100644
index c7b798934b3c..000000000000
--- a/clang/test/Driver/claim-unused.c
+++ /dev/null
@@ -1,3 +0,0 @@
-// RUN: touch %t.o
-// RUN: %clang --param ssp-buffer-size=1 %t.o -### 2>&1 | FileCheck %s
-// CHECK-NOT: warning: argument unused during compilation: '--param ssp-buffer-size=1'
diff --git a/clang/test/Driver/default-denormal-fp-math.c b/clang/test/Driver/default-denormal-fp-math.c
index 5f87e151df49..c04ad5c08b8d 100644
--- a/clang/test/Driver/default-denormal-fp-math.c
+++ b/clang/test/Driver/default-denormal-fp-math.c
@@ -3,15 +3,6 @@
 
 // RUN: %clang -### -target x86_64-unknown-linux-gnu --sysroot=%S/Inputs/basic_linux_tree -c %s -v 2>&1 | FileCheck -check-prefix=CHECK-IEEE %s
 
-// crtfastmath enables ftz and daz
-// RUN: %clang -### -target x86_64-unknown-linux-gnu -ffast-math --sysroot=%S/Inputs/basic_linux_tree -c %s -v 2>&1 | FileCheck -check-prefix=CHECK-PRESERVESIGN %s
-
-// crt not linked in with nostartfiles
-// RUN: %clang -### -target x86_64-unknown-linux-gnu -ffast-math -nostartfiles --sysroot=%S/Inputs/basic_linux_tree -c %s -v 2>&1 | FileCheck -check-prefix=CHECK-IEEE %s
-
-// If there's no crtfastmath, don't assume ftz/daz
-// RUN: %clang -### -target x86_64-unknown-linux-gnu -ffast-math --sysroot=/dev/null -c %s -v 2>&1 | FileCheck -check-prefix=CHECK-IEEE %s
-
 // RUN: %clang -### -target x86_64-scei-ps4 -c %s -v 2>&1 | FileCheck -check-prefix=CHECK-PRESERVESIGN %s
 
 // Flag omitted for default
diff --git a/clang/test/Driver/experimental-late-parse-attributes.c b/clang/test/Driver/experimental-late-parse-attributes.c
new file mode 100644
index 000000000000..6b54b898afa7
--- /dev/null
+++ b/clang/test/Driver/experimental-late-parse-attributes.c
@@ -0,0 +1,12 @@
+// RUN: %clang %s -c -fexperimental-late-parse-attributes 2>&1 -### | FileCheck %s -check-prefix=CHECK-ON
+// RUN: %clang %s -c -fno-experimental-late-parse-attributes -fexperimental-late-parse-attributes 2>&1 -### | FileCheck %s -check-prefix=CHECK-ON
+
+// CHECK-ON: -cc1
+// CHECK-ON: -fexperimental-late-parse-attributes
+
+// RUN: %clang %s -c 2>&1 -### | FileCheck %s -check-prefix=CHECK-OFF
+// RUN: %clang %s -c -fno-experimental-late-parse-attributes 2>&1 -### | FileCheck %s -check-prefix=CHECK-OFF
+// RUN: %clang %s -c -fexperimental-late-parse-attributes -fno-experimental-late-parse-attributes 2>&1 -### | FileCheck %s -check-prefix=CHECK-OFF
+
+// CHECK-OFF: -cc1
+// CHECK-OFF-NOT: -fexperimental-late-parse-attributes
diff --git a/clang/test/Driver/fast-math.c b/clang/test/Driver/fast-math.c
index b07d5732932c..274f1f22ea5e 100644
--- a/clang/test/Driver/fast-math.c
+++ b/clang/test/Driver/fast-math.c
@@ -157,7 +157,7 @@
 
 // FIXME: This case leaves nnan and ninf. That seems wrong!
 // RUN: %clang -### -ffast-math -fno-unsafe-math-optimizations -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefixes=CHECK,NO-FAST,NINF,NNAN,FINITE-ONLY,NO-REASSOC,NO-NSZ,NO-ARCP,NO-AFN,NOROUNDING %s
+// RUN:   | FileCheck --check-prefixes=CHECK,NO-FAST,NINF,NNAN,FINITE-ONLY,NO-REASSOC,NO-NSZ,NO-ARCP,NO-AFN,NOROUNDING,NO-TRAPPING %s
 // RUN: %clang -### -ffast-math -fmath-errno -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefixes=CHECK,NO-FAST,NINF,NNAN,FINITE-ONLY,REASSOC,NSZ,ARCP,AFN,CONTRACT-FAST,ERRNO,NOROUNDING %s
 // RUN: %clang -### -ffast-math -fno-associative-math -c %s 2>&1 \
@@ -209,7 +209,7 @@
 
 // RUN: %clang -### -funsafe-math-optimizations -fno-unsafe-math-optimizations \
 // RUN:     -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefixes=CHECK,NO-UNSAFE,NO-REASSOC,NO-ARCP,NO-NSZ,NO-AFN %s
+// RUN:   | FileCheck --check-prefixes=CHECK,NO-UNSAFE,NO-REASSOC,NO-ARCP,NO-NSZ,NO-AFN,NO-TRAPPING %s
 // RUN: %clang -### -ffast-math -fno-associative-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefixes=CHECK,NO-UNSAFE,NO-REASSOC,ARCP,NSZ,AFN %s
 
@@ -224,9 +224,8 @@
 // RUN: %clang -### -ffast-math -ftrapping-math -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefixes=CHECK,NO-FAST,NO-UNSAFE,ARCP,NSZ,AFN,TRAPPING %s
 
-// FIXME: -fno-unsafe-math-optimizations shouldn't imply trapping math
 // RUN: %clang -### -ffast-math -fno-unsafe-math-optimizations -c %s 2>&1 \
-// RUN:   | FileCheck --check-prefixes=CHECK,NO-FAST,NO-UNSAFE,NO-ARCP,NO-NSZ,NO-AFN,TRAPPING %s
+// RUN:   | FileCheck --check-prefixes=CHECK,NO-FAST,NO-UNSAFE,NO-ARCP,NO-NSZ,NO-AFN,NO-TRAPPING %s
 
 // Reassociate is allowed because it does not require reciprocal-math.
 
diff --git a/clang/test/Driver/fp-model.c b/clang/test/Driver/fp-model.c
index 74b7de7a275a..9d1245239911 100644
--- a/clang/test/Driver/fp-model.c
+++ b/clang/test/Driver/fp-model.c
@@ -64,7 +64,8 @@
 
 // RUN: %clang -### -ffp-model=strict -fdenormal-fp-math=preserve-sign,preserve-sign -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=WARN10 %s
-// WARN10: warning: overriding '-ffp-model=strict' option with '-fdenormal-fp-math=preserve-sign,preserve-sign' [-Woverriding-option]
+// WARN10: "-cc1"
+// WARN10-NOT: warning: overriding '-ffp-model=strict' option with '-fdenormal-fp-math=preserve-sign,preserve-sign' [-Woverriding-option]
 
 // RUN: %clang -### -ffp-model=fast -ffp-model=strict -c %s 2>&1 | FileCheck \
 // RUN:   --check-prefix=WARN11 %s
@@ -73,9 +74,8 @@
 
 // RUN: %clang -### -Ofast -ffp-model=strict -c %s 2>&1 | FileCheck \
 // RUN:   --check-prefix=WARN12 %s
-// RUN: %clang -### -ffast-math -ffp-model=strict -c %s 2>&1 | FileCheck \
-// RUN:   --check-prefix=WARN12 %s
-// WARN12-NOT: warning: overriding '-ffp-model=strict' option with '-ffp-model=strict' [-Woverriding-option]
+// RUN: %clang -### -Werror -ffast-math -ffp-model=strict -c %s
+// WARN12: warning: overriding '-ffp-model=strict' option with '-Ofast'
 
 // RUN: %clang -### -ffp-model=strict -fapprox-func -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=WARN13 %s
@@ -129,6 +129,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-NO-EXCEPT %s
 // RUN: %clang -### -nostdinc -ffp-model=strict -Ofast -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-NO-EXCEPT %s
+// CHECK-NO-EXCEPT: "-cc1"
 // CHECK-NO-EXCEPT-NOT: "-ffp-exception-behavior=strict"
 
 // RUN: %clang -### -nostdinc -ffp-exception-behavior=strict -c %s 2>&1 \
diff --git a/clang/test/Driver/gcc-param.c b/clang/test/Driver/gcc-param.c
new file mode 100644
index 000000000000..4672e1156ce7
--- /dev/null
+++ b/clang/test/Driver/gcc-param.c
@@ -0,0 +1,2 @@
+// RUN: touch %t.o
+// RUN: %clang -Werror --param ssp-buffer-size=1 %t.o -###
diff --git a/clang/test/Driver/hlsl-lang-targets-spirv.hlsl b/clang/test/Driver/hlsl-lang-targets-spirv.hlsl
index b86c2e01f8d8..61b10e1648c5 100644
--- a/clang/test/Driver/hlsl-lang-targets-spirv.hlsl
+++ b/clang/test/Driver/hlsl-lang-targets-spirv.hlsl
@@ -1,4 +1,5 @@
 // REQUIRES: spirv-registered-target
+// REQUIRES: directx-registered-target
 
 // Supported targets
 //
diff --git a/clang/test/Driver/integrated-as.c b/clang/test/Driver/integrated-as.c
index e78fde873cf4..b0a26f6011b0 100644
--- a/clang/test/Driver/integrated-as.c
+++ b/clang/test/Driver/integrated-as.c
@@ -3,7 +3,7 @@
 // RUN: %clang -### -c -save-temps -integrated-as --target=x86_64 %s 2>&1 | FileCheck %s
 
 // CHECK: cc1as
-// CHECK: -mrelax-all
+// CHECK-NOT: -mrelax-all
 
 // RISC-V does not enable -mrelax-all
 // RUN: %clang -### -c -save-temps -integrated-as --target=riscv64 %s 2>&1 | FileCheck %s -check-prefix=RISCV-RELAX
diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c
index d918f4f2d7db..28fb075a80db 100644
--- a/clang/test/Driver/linux-ld.c
+++ b/clang/test/Driver/linux-ld.c
@@ -2,11 +2,10 @@
 // General tests that ld invocations on Linux targets sane. Note that we use
 // sysroot to make these tests independent of the host system.
 //
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### -Werror %s -no-pie 2>&1 \
 // RUN:     --target=i386-unknown-linux -rtlib=platform --unwindlib=platform \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-32 %s
-// CHECK-LD-32-NOT: warning:
 // CHECK-LD-32: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-LD-32: "{{.*}}/usr/lib/gcc/i386-unknown-linux/10.2.0{{/|\\\\}}crtbegin.o"
 // CHECK-LD-32: "-L[[SYSROOT]]/usr/lib/gcc/i386-unknown-linux/10.2.0"
@@ -14,11 +13,10 @@
 // CHECK-LD-32: "-L[[SYSROOT]]/lib"
 // CHECK-LD-32: "-L[[SYSROOT]]/usr/lib"
 //
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -rtlib=platform --unwindlib=platform \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-64 %s
-// CHECK-LD-64-NOT: warning:
 // CHECK-LD-64: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-LD-64: "--eh-frame-hdr"
 // CHECK-LD-64: "-m" "elf_x86_64"
@@ -32,11 +30,10 @@
 // CHECK-LD-64: "-lc"
 // CHECK-LD-64: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
 //
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
 // RUN:     --target=x86_64-unknown-linux-gnux32 -rtlib=platform --unwindlib=platform \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-X32 %s
-// CHECK-LD-X32-NOT: warning:
 // CHECK-LD-X32: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-LD-X32: "--eh-frame-hdr"
 // CHECK-LD-X32: "-m" "elf32_x86_64"
@@ -45,13 +42,12 @@
 // CHECK-LD-X32: "-lc"
 // CHECK-LD-X32: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
 //
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
 // RUN:     --target=x86_64-unknown-linux \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:     --rtlib=compiler-rt \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-RT %s
-// CHECK-LD-RT-NOT: warning:
 // CHECK-LD-RT: "-resource-dir" "[[RESDIR:[^"]*]]"
 // CHECK-LD-RT: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-LD-RT: "--eh-frame-hdr"
@@ -67,13 +63,12 @@
 // CHECK-LD-RT: libclang_rt.builtins.a"
 // CHECK-LD-RT: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}x86_64-unknown-linux{{/|\\\\}}clang_rt.crtend.o"
 //
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
 // RUN:     --target=i686-unknown-linux \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:     --rtlib=compiler-rt \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-RT-I686 %s
-// CHECK-LD-RT-I686-NOT: warning:
 // CHECK-LD-RT-I686: "-resource-dir" "[[RESDIR:[^"]*]]"
 // CHECK-LD-RT-I686: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-LD-RT-I686: "--eh-frame-hdr"
@@ -89,13 +84,12 @@
 // CHECK-LD-RT-I686: libclang_rt.builtins.a"
 // CHECK-LD-RT-I686: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}i686-unknown-linux{{/|\\\\}}clang_rt.crtend.o"
 //
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
 // RUN:     --target=arm-linux-androideabi \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --rtlib=compiler-rt \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-RT-ANDROID %s
-// CHECK-LD-RT-ANDROID-NOT: warning:
 // CHECK-LD-RT-ANDROID: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-LD-RT-ANDROID: "--eh-frame-hdr"
 // CHECK-LD-RT-ANDROID: "-m" "armelf_linux_eabi"
@@ -104,11 +98,10 @@
 // CHECK-LD-RT-ANDROID: "-lc"
 // CHECK-LD-RT-ANDROID: libclang_rt.builtins.a"
 //
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -rtlib=platform --unwindlib=platform \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-GCC %s
-// CHECK-LD-GCC-NOT: warning:
 // CHECK-LD-GCC: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-LD-GCC: "--eh-frame-hdr"
 // CHECK-LD-GCC: "-m" "elf_x86_64"
@@ -122,12 +115,11 @@
 // CHECK-LD-GCC: "-lc"
 // CHECK-LD-GCC: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
 //
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
 // RUN:     --target=x86_64-unknown-linux -rtlib=platform --unwindlib=platform \
 // RUN:     -static-libgcc \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-64-STATIC-LIBGCC %s
-// CHECK-LD-64-STATIC-LIBGCC-NOT: warning:
 // CHECK-LD-64-STATIC-LIBGCC: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-LD-64-STATIC-LIBGCC: "--eh-frame-hdr"
 // CHECK-LD-64-STATIC-LIBGCC: "-m" "elf_x86_64"
@@ -268,12 +260,10 @@
 // CHECK-CLANG-ANDROID-STATIC: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-CLANG-ANDROID-STATIC: "--start-group" "{{[^"]*}}{{/|\\\\}}libclang_rt.builtins.a" "-l:libunwind.a" "-lc" "--end-group"
 //
-// RUN: %clang -### %s 2>&1      \
-// RUN:     --target=x86_64-unknown-linux -rtlib=platform --unwindlib=platform \
+// RUN: %clang -### %s -Werror --target=x86_64-unknown-linux -rtlib=platform --unwindlib=platform \
 // RUN:     -static \
-// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-64-STATIC %s
-// CHECK-LD-64-STATIC-NOT: warning:
 // CHECK-LD-64-STATIC: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-LD-64-STATIC: "--eh-frame-hdr"
 // CHECK-LD-64-STATIC: "-m" "elf_x86_64"
@@ -486,13 +476,12 @@
 //
 // Test that we can use -stdlib=libc++ in a build system even when it
 // occasionally links C code instead of C++ code.
-// RUN: %clang -x c -### %s -no-pie 2>&1 \
+// RUN: %clang -x c -### %s -Werror -no-pie 2>&1 \
 // RUN:     --target=x86_64-unknown-linux-gnu \
 // RUN:     -stdlib=libc++ \
 // RUN:     -ccc-install-dir %S/Inputs/basic_linux_libcxx_tree/usr/bin \
 // RUN:     --sysroot=%S/Inputs/basic_linux_libcxx_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BASIC-LIBCXX-C-LINK %s
-// CHECK-BASIC-LIBCXX-C-LINK-NOT: warning:
 // CHECK-BASIC-LIBCXX-C-LINK: "-cc1"
 // CHECK-BASIC-LIBCXX-C-LINK: "-isysroot" "[[SYSROOT:[^"]+]]"
 // CHECK-BASIC-LIBCXX-C-LINK-NOT: "-internal-isystem" "[[SYSROOT]]/usr/bin/../include/c++/v1"
@@ -1417,6 +1406,9 @@
 // RUN: %clang --target=x86_64-unknown-linux -no-pie -### %s -funsafe-math-optimizations\
 // RUN:        --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-CRTFASTMATH %s
+// RUN: %clang --target=x86_64-unknown-linux -no-pie -### %s -ffp-model=fast \
+// RUN:        --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CRTFASTMATH %s
 // RUN: %clang --target=x86_64-unknown-linux -no-pie -### %s -Ofast\
 // RUN:        --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-CRTFASTMATH %s
@@ -1446,6 +1438,32 @@
 // RUN: %clang --target=i386-unknown-linux -no-pie -### %s -ffast-math \
 // RUN:        --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-NOCRTFASTMATH %s
+// Don't link crtfastmath.o with -shared
+// RUN: %clang --target=x86_64-unknown-linux -no-pie -### %s -ffast-math -shared \
+// RUN:        --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NOCRTFASTMATH %s
+// RUN: %clang --target=x86_64-unknown-linux -no-pie -### %s -Ofast -shared \
+// RUN:        --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NOCRTFASTMATH %s
+// Check for effects of -mdaz-ftz
+// RUN: %clang --target=x86_64-unknown-linux -### %s -ffast-math -shared -mdaz-ftz \
+// RUN:        --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CRTFASTMATH %s
+// RUN: %clang --target=x86_64-unknown-linux -no-pie -### %s -ffast-math -mdaz-ftz \
+// RUN:        --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CRTFASTMATH %s
+// RUN: %clang --target=x86_64-unknown-linux -no-pie -### %s -mdaz-ftz \
+// RUN:        --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CRTFASTMATH %s
+// RUN: %clang --target=x86_64-unknown-linux -### %s -ffast-math -shared -mno-daz-ftz \
+// RUN:        --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NOCRTFASTMATH %s
+// RUN: %clang --target=x86_64-unknown-linux -no-pie -### %s -ffast-math -mno-daz-ftz \
+// RUN:        --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NOCRTFASTMATH %s
+// RUN: %clang --target=x86_64-unknown-linux -no-pie -### %s -mno-daz-ftz \
+// RUN:        --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NOCRTFASTMATH %s
 // CHECK-CRTFASTMATH: usr/lib/gcc/x86_64-unknown-linux/10.2.0{{/|\\\\}}crtfastmath.o
 // CHECK-NOCRTFASTMATH-NOT: crtfastmath.o
 
@@ -1635,11 +1653,10 @@
 // CHECK-MUSL-AARCH64_BE: "-dynamic-linker" "/lib/ld-musl-aarch64_be.so.1"
 
 // Check whether multilib gcc install works fine on Gentoo with gcc-config
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
 // RUN:     --target=x86_64-unknown-linux-gnu -rtlib=platform --unwindlib=platform \
 // RUN:     --sysroot=%S/Inputs/gentoo_linux_gcc_multi_version_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-GENTOO %s
-// CHECK-LD-GENTOO-NOT: warning:
 // CHECK-LD-GENTOO: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-LD-GENTOO: "--eh-frame-hdr"
 // CHECK-LD-GENTOO: "-m" "elf_x86_64"
@@ -1650,11 +1667,10 @@
 // CHECK-LD-GENTOO: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
 // CHECK-LD-GENTOO: "-lc"
 // CHECK-LD-GENTOO: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
 // RUN:     --target=i686-unknown-linux-gnu -rtlib=platform --unwindlib=platform \
 // RUN:     --sysroot=%S/Inputs/gentoo_linux_gcc_multi_version_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-GENTOO-32 %s
-// CHECK-LD-GENTOO-32-NOT: warning:
 // CHECK-LD-GENTOO-32: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-LD-GENTOO-32: "--eh-frame-hdr"
 // CHECK-LD-GENTOO-32: "-m" "elf_i386"
@@ -1665,11 +1681,10 @@
 // CHECK-LD-GENTOO-32: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
 // CHECK-LD-GENTOO-32: "-lc"
 // CHECK-LD-GENTOO-32: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
 // RUN:     --target=x86_64-unknown-linux-gnux32 -rtlib=platform --unwindlib=platform \
 // RUN:     --sysroot=%S/Inputs/gentoo_linux_gcc_multi_version_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-GENTOO-X32 %s
-// CHECK-LD-GENTOO-X32-NOT: warning:
 // CHECK-LD-GENTOO-X32: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-LD-GENTOO-X32: "--eh-frame-hdr"
 // CHECK-LD-GENTOO-X32: "-m" "elf32_x86_64"
@@ -1691,11 +1706,10 @@
 // CHECK-LD-RHEL7-DTS: [[SYSROOT]]/usr/lib/gcc/x86_64-redhat-linux/7/../../../../bin/ld
 
 // Check whether gcc7 install works fine on Amazon Linux AMI
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
 // RUN:     --target=x86_64-amazon-linux -rtlib=libgcc --unwindlib=platform \
 // RUN:     --sysroot=%S/Inputs/ami_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-AMI %s
-// CHECK-LD-AMI-NOT: warning:
 // CHECK-LD-AMI: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-LD-AMI: "--eh-frame-hdr"
 // CHECK-LD-AMI: "-m" "elf_x86_64"
diff --git a/clang/test/Driver/riscv-arch.c b/clang/test/Driver/riscv-arch.c
index 8399b4e97f86..8c701a736fc7 100644
--- a/clang/test/Driver/riscv-arch.c
+++ b/clang/test/Driver/riscv-arch.c
@@ -204,12 +204,12 @@
 // RUN: not %clang --target=riscv32-unknown-elf -march=unknown -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-STR %s
 // RV32-STR: error: invalid arch name 'unknown',
-// RV32-STR: string must begin with rv32{i,e,g} or rv64{i,e,g}
+// RV32-STR: string must begin with rv32{i,e,g}, rv64{i,e,g}, or a supported profile name
 
 // RUN: not %clang --target=riscv32-unknown-elf -march=rv32q -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-LETTER %s
 // RV32-LETTER: error: invalid arch name 'rv32q',
-// RV32-LETTER: first letter should be 'e', 'i' or 'g'
+// RV32-LETTER: first letter after 'rv32' should be 'e', 'i' or 'g'
 
 // RUN: not %clang --target=riscv32-unknown-elf -march=rv32imcq -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-ORDER %s
@@ -239,12 +239,12 @@
 // RUN: not %clang --target=riscv32-unknown-elf -march=rv32xabc -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32X %s
 // RV32X: error: invalid arch name 'rv32xabc',
-// RV32X: first letter should be 'e', 'i' or 'g'
+// RV32X: first letter after 'rv32' should be 'e', 'i' or 'g'
 
 // RUN: not %clang --target=riscv32-unknown-elf -march=rv32sabc -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32S %s
 // RV32S: error: invalid arch name 'rv32sabc',
-// RV32S: first letter should be 'e', 'i' or 'g'
+// RV32S: first letter after 'rv32' should be 'e', 'i' or 'g'
 
 // RUN: not %clang --target=riscv32-unknown-elf -march=rv32ix -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32X-NAME %s
diff --git a/clang/test/Driver/riscv-profiles.c b/clang/test/Driver/riscv-profiles.c
index 647567d4c971..298f301de3fe 100644
--- a/clang/test/Driver/riscv-profiles.c
+++ b/clang/test/Driver/riscv-profiles.c
@@ -318,7 +318,7 @@
 // PROFILE-WITH-ADDITIONAL: "-target-feature" "+zkt"
 
 // RUN: not %clang --target=riscv64 -### -c %s 2>&1 -march=rva19u64_zfa | FileCheck -check-prefix=INVALID-PROFILE %s
-// INVALID-PROFILE: error: invalid arch name 'rva19u64_zfa', unsupported profile
+// INVALID-PROFILE: error: invalid arch name 'rva19u64_zfa', string must begin with rv32{i,e,g}, rv64{i,e,g}, or a supported profile name
 
 // RUN: not %clang --target=riscv64 -### -c %s 2>&1 -march=rva22u64zfa | FileCheck -check-prefix=INVALID-ADDITIONAL %s
 // INVALID-ADDITIONAL: error: invalid arch name 'rva22u64zfa', additional extensions must be after separator '_'
diff --git a/clang/test/Driver/solaris-ld.c b/clang/test/Driver/solaris-ld.c
index 6d74389e8922..ce0728d392bf 100644
--- a/clang/test/Driver/solaris-ld.c
+++ b/clang/test/Driver/solaris-ld.c
@@ -193,6 +193,9 @@
 // RUN: %clang --target=sparc-sun-solaris2.11 -### %s -ffast-math \
 // RUN:        --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-CRTFASTMATH-SPARC32 %s
+// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -ffp-model=fast \
+// RUN:        --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CRTFASTMATH-SPARC32 %s
 // CHECK-CRTFASTMATH-SPARC32: "-isysroot" "[[SYSROOT:[^"]+]]"
 // CHECK-CRTFASTMATH-SPARC32: "[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2{{/|\\\\}}crtfastmath.o"
 // CHECK-NOCRTFASTMATH-SPARC32-NOT: crtfastmath.o
diff --git a/clang/test/Driver/tls-dialect.c b/clang/test/Driver/tls-dialect.c
index a808dd81531c..3471b55b0eba 100644
--- a/clang/test/Driver/tls-dialect.c
+++ b/clang/test/Driver/tls-dialect.c
@@ -1,3 +1,5 @@
+// RUN: %clang -### --target=loongarch64-linux -mtls-dialect=trad %s 2>&1 | FileCheck --check-prefix=NODESC %s
+// RUN: %clang -### --target=loongarch64-linux %s 2>&1 | FileCheck --check-prefix=NODESC %s
 // RUN: %clang -### --target=riscv64-freebsd -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=DESC %s
 // RUN: %clang -### --target=riscv64-linux -mtls-dialect=trad %s 2>&1 | FileCheck --check-prefix=NODESC %s
 // RUN: %clang -### --target=riscv64-linux %s 2>&1 | FileCheck --check-prefix=NODESC %s
@@ -9,6 +11,8 @@
 // RUN: %clang -### --target=riscv64-android %s 2>&1 | FileCheck --check-prefix=DESC %s
 
 /// LTO
+// RUN: %clang -### --target=loongarch64-linux -flto -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=LTO-DESC %s
+// RUN: %clang -### --target=loongarch64-linux -flto %s 2>&1 | FileCheck --check-prefix=LTO-NODESC %s
 // RUN: %clang -### --target=riscv64-linux -flto -mtls-dialect=desc %s 2>&1 | FileCheck --check-prefix=LTO-DESC %s
 // RUN: %clang -### --target=riscv64-linux -flto %s 2>&1 | FileCheck --check-prefix=LTO-NODESC %s
 
@@ -18,6 +22,7 @@
 // RUN: not %clang --target=x86_64-apple-macos -mtls-dialect=desc -flto %s 2>&1 | FileCheck -check-prefix=UNSUPPORTED-TARGET %s
 
 /// Unsupported argument
+// RUN: not %clang -### --target=loongarch64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-ARG %s
 // RUN: not %clang -### --target=riscv64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-ARG %s
 
 // DESC:       "-cc1" {{.*}}"-enable-tlsdesc"
diff --git a/clang/test/Driver/wasm-features.c b/clang/test/Driver/wasm-features.c
index 5dae5dbc89b9..1f7fb2134982 100644
--- a/clang/test/Driver/wasm-features.c
+++ b/clang/test/Driver/wasm-features.c
@@ -77,6 +77,12 @@
 // RELAXED-SIMD: "-target-feature" "+relaxed-simd"
 // NO-RELAXED-SIMD: "-target-feature" "-relaxed-simd"
 
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mhalf-precision 2>&1 | FileCheck %s -check-prefix=HALF-PRECISION
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-half-precision 2>&1 | FileCheck %s -check-prefix=NO-HALF-PRECISION
+
+// HALF-PRECISION: "-target-feature" "+half-precision"
+// NO-HALF-PRECISION: "-target-feature" "-half-precision"
+
 // RUN: %clang --target=wasm32-unknown-unknown -### %s -mexception-handling 2>&1 | FileCheck %s -check-prefix=EXCEPTION-HANDLING
 // RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-exception-handling 2>&1 | FileCheck %s -check-prefix=NO-EXCEPTION-HANDLING
 
diff --git a/clang/test/Frontend/ast-dump-on-llvm.ll b/clang/test/Frontend/ast-dump-on-llvm.ll
new file mode 100644
index 000000000000..cdacfde4ba84
--- /dev/null
+++ b/clang/test/Frontend/ast-dump-on-llvm.ll
@@ -0,0 +1,29 @@
+; RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-dump %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-AST-DUMP
+; RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-dump=json %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-AST-DUMP-EQ-JSON
+; RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-dump=default %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-AST-DUMP-EQ-DEFAULT
+; RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-dump-all %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-AST-DUMP-ALL
+; RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-dump-all=json %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-AST-DUMP-ALL-EQ-JSON
+; RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-dump-all=default %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-AST-DUMP-ALL-EQ-DEFAULT
+
+; RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-print %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-AST-PRINT
+; RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-view %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-AST-VIEW
+; RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-list %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-AST-LIST
+; RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-dump-lookups %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-AST-DUMP-LOOKUP
+; RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-dump-filter=FunctionDecl %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-AST-DUMP-FILTER-EQ
+; RUN: not %clang_cc1 -triple x86_64-unknown-unknown -ast-dump-decl-types %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-AST-DUMP-DECL-TYPES
+; RUN: not %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-SYNTAX-ONLY
+
+
+; CHECK-AST-DUMP: fatal error: cannot apply AST actions to LLVM IR file '{{.*}}'
+; CHECK-AST-DUMP-EQ-JSON: fatal error: cannot apply AST actions to LLVM IR file '{{.*}}'
+; CHECK-AST-DUMP-EQ-DEFAULT: fatal error: cannot apply AST actions to LLVM IR file '{{.*}}'
+; CHECK-AST-DUMP-ALL: fatal error: cannot apply AST actions to LLVM IR file '{{.*}}'
+; CHECK-AST-DUMP-ALL-EQ-JSON: fatal error: cannot apply AST actions to LLVM IR file '{{.*}}'
+; CHECK-AST-DUMP-ALL-EQ-DEFAULT: fatal error: cannot apply AST actions to LLVM IR file '{{.*}}'
+; CHECK-AST-PRINT: fatal error: cannot apply AST actions to LLVM IR file '{{.*}}'
+; CHECK-AST-VIEW: fatal error: cannot apply AST actions to LLVM IR file '{{.*}}'
+; CHECK-AST-LIST: fatal error: cannot apply AST actions to LLVM IR file '{{.*}}'
+; CHECK-AST-DUMP-LOOKUP: fatal error: cannot apply AST actions to LLVM IR file '{{.*}}'
+; CHECK-AST-DUMP-FILTER-EQ: fatal error: cannot apply AST actions to LLVM IR file '{{.*}}'
+; CHECK-AST-DUMP-DECL-TYPES: fatal error: cannot apply AST actions to LLVM IR file '{{.*}}'
+; CHECK-SYNTAX-ONLY: fatal error: cannot apply AST actions to LLVM IR file '{{.*}}'
diff --git a/clang/test/Index/annotate-nested-name-specifier.cpp b/clang/test/Index/annotate-nested-name-specifier.cpp
index a7338db6b05b..318149725840 100644
--- a/clang/test/Index/annotate-nested-name-specifier.cpp
+++ b/clang/test/Index/annotate-nested-name-specifier.cpp
@@ -132,7 +132,7 @@ struct X8 {
 
 struct X9 : X8 {
   typedef X8 inherited;
-  void f() { 
+  void f() {
     inherited::f();
   }
 };
@@ -299,7 +299,7 @@ struct X9 : X8 {
 // CHECK: Identifier: "type" [77:16 - 77:20] TypeRef=X4::type:70:13
 // CHECK: Punctuation: ">" [77:20 - 77:21] MemberRefExpr=
 // CHECK: Punctuation: "::" [77:21 - 77:23] MemberRefExpr=
-// CHECK: Identifier: "g" [77:23 - 77:24] MemberRefExpr=
+// CHECK: Identifier: "g" [77:23 - 77:24] OverloadedDeclRef=
 // CHECK: Punctuation: "(" [77:24 - 77:25] CallExpr=
 // CHECK: Identifier: "t" [77:25 - 77:26] DeclRefExpr=t:74:12
 // CHECK: Punctuation: ")" [77:26 - 77:27] CallExpr=
diff --git a/clang/test/Interpreter/fail.cpp b/clang/test/Interpreter/fail.cpp
index 4e301f37548f..633d92794325 100644
--- a/clang/test/Interpreter/fail.cpp
+++ b/clang/test/Interpreter/fail.cpp
@@ -1,12 +1,19 @@
-// FIXME: There're some inconsistencies between interactive and non-interactive
-// modes. For example, when clang-repl runs in the interactive mode, issues an
-// error, and then successfully recovers if we decide it's a success then for
-// the non-interactive mode the exit code should be a failure.
-// RUN: clang-repl "int x = 10;" "int y=7; err;" "int y = 10;"
 // REQUIRES: host-supports-jit
 // UNSUPPORTED: system-aix
-// RUN: cat %s | not clang-repl | FileCheck %s
-BOOM!
+// clang-repl can be called from the prompt in non-interactive mode as a
+// calculator in shell scripts, for example. In that case if there is an error
+// we should set the exit code as failure.
+// RUN: not clang-repl "int x = 10;" "int y=7; err;" "int y = 10;"
+
+// In interactive (REPL) mode, we can have errors but we should exit with
+// success because errors in the input code are part of the interactive use.
+// RUN: cat %s | clang-repl | FileCheck %s
+
+// However, interactive mode should fail when we specified -verify and there
+// was a diagnostic mismatches. This will make the testsuite fail as intended.
+// RUN: cat %s | not clang-repl -Xcc -Xclang -Xcc -verify | FileCheck %s
+
+BOOM! // expected-error {{intended to fail the -verify test}}
 extern "C" int printf(const char *, ...);
 int i = 42;
 auto r1 = printf("i = %d\n", i);
diff --git a/clang/test/Lexer/cxx-features.cpp b/clang/test/Lexer/cxx-features.cpp
index baaa9d4434e9..4a08eb61cd39 100644
--- a/clang/test/Lexer/cxx-features.cpp
+++ b/clang/test/Lexer/cxx-features.cpp
@@ -222,7 +222,7 @@
 #error "wrong value for __cpp_aggregate_bases"
 #endif
 
-#if check(structured_bindings, 0, 0, 0, 201606, 201606, 201606, 201606)
+#if check(structured_bindings, 0, 0, 0, 202403L, 202403L, 202403L, 202403L)
 #error "wrong value for __cpp_structured_bindings"
 #endif
 
diff --git a/clang/test/Lexer/update_consecutive_macro_address_space.c b/clang/test/Lexer/update_consecutive_macro_address_space.c
index 80ef4557591c..6f74709556c0 100644
--- a/clang/test/Lexer/update_consecutive_macro_address_space.c
+++ b/clang/test/Lexer/update_consecutive_macro_address_space.c
@@ -1,14 +1,15 @@
 // RUN: %clang -cc1 -print-stats %s 2>&1 | FileCheck %s
-// CHECK: 6 local SLocEntries allocated
+// CHECK: 7 local SLocEntries allocated
 //
-// Verify that the macro arg expansion is split to two file ids, we have 6 file
-// ids rather than 5:
+// Verify that the macro arg expansion is split to two file ids, we have 7 file
+// ids rather than 6:
 //   0: invalid file id
 //   1: main file
 //   2: builtin file
-//   3: macro expansion for X
-//   4: macro arg expansions for 1
-//   5: macro arg expansions for == 2
+//   3: scratch space for __GCC_[CON|DE]STRUCTIVE_SIZE macros
+//   4: macro expansion for X
+//   5: macro arg expansions for 1
+//   6: macro arg expansions for == 2
 #define X(x) (int)(x);
 void func() {
   X(1
diff --git a/clang/test/Misc/cc1as-relax-all.s b/clang/test/Misc/cc1as-relax-all.s
new file mode 100644
index 000000000000..e76fc6f61bab
--- /dev/null
+++ b/clang/test/Misc/cc1as-relax-all.s
@@ -0,0 +1,13 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang -cc1as -triple x86_64 -filetype obj -mrelax-all %s -o %t.o
+// RUN: llvm-objdump -d %t.o | FileCheck %s
+
+// CHECK:      <.text>:
+// CHECK-NEXT:   0: e9 06 00 00 00                jmp     0xb <foo>
+// CHECK-NEXT:   5: 0f 84 00 00 00 00             je      0xb <foo>
+// CHECK-EMPTY:
+
+jmp foo
+je foo
+
+foo: ret
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index 9c91c4157cd6..768b243b04e3 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -5,11 +5,11 @@
 
 // RUN: not %clang_cc1 -triple arm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AARCH64
 // AARCH64: error: unknown target CPU 'not-a-cpu'
-// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
+// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-r82ae, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-n3, neoverse-512tvb, neoverse-v1, neoverse-v2, neoverse-v3, neoverse-v3ae, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
 
 // RUN: not %clang_cc1 -triple arm64--- -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE_AARCH64
 // TUNE_AARCH64: error: unknown target CPU 'not-a-cpu'
-// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
+// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-r82ae, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-n3, neoverse-512tvb, neoverse-v1, neoverse-v2, neoverse-v3, neoverse-v3ae, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
 
 // RUN: not %clang_cc1 -triple i386--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix X86
 // X86: error: unknown target CPU 'not-a-cpu'
diff --git a/clang/test/Modules/force-transitive-changes.cppm b/clang/test/Modules/force-transitive-changes.cppm
new file mode 100644
index 000000000000..5732d264d1d5
--- /dev/null
+++ b/clang/test/Modules/force-transitive-changes.cppm
@@ -0,0 +1,38 @@
+// Test that the changes from export imported modules and touched
+// modules can be popullated as expected.
+//
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: cd %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-reduced-module-interface -o %t/A.pcm
+// RUN: %clang_cc1 -std=c++20 %t/A.v1.cppm -emit-reduced-module-interface -o %t/A.v1.pcm
+
+// The BMI of B should change it export imports A, so all the change to A should be popullated
+// to B.
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -emit-reduced-module-interface -fmodule-file=A=%t/A.pcm \
+// RUN:     -o %t/B.pcm
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -emit-reduced-module-interface -fmodule-file=A=%t/A.v1.pcm \
+// RUN:     -o %t/B.v1.pcm
+// RUN: not diff %t/B.v1.pcm %t/B.pcm  &> /dev/null
+
+//--- A.cppm
+export module A;
+export int funcA() {
+    return 43;
+}
+
+//--- A.v1.cppm
+export module A;
+
+export int funcA() {
+    return 43;
+}
+
+//--- B.cppm
+export module B;
+export import A;
+
+export int funcB() {
+    return funcA();
+}
diff --git a/clang/test/Modules/implicit-module-no-timestamp.cpp b/clang/test/Modules/implicit-module-no-timestamp.cpp
new file mode 100644
index 000000000000..1b681a610bab
--- /dev/null
+++ b/clang/test/Modules/implicit-module-no-timestamp.cpp
@@ -0,0 +1,34 @@
+// UNSUPPORTED: system-windows
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: cd %t
+//
+// RUN: cp a1.h a.h
+// RUN: %clang_cc1 -fmodules -fvalidate-ast-input-files-content -fno-pch-timestamp -fmodule-map-file=module.modulemap -fmodules-cache-path=%t test1.cpp
+// RUN: cp a2.h a.h
+// RUN: %clang_cc1 -fmodules -fvalidate-ast-input-files-content -fno-pch-timestamp -fmodule-map-file=module.modulemap -fmodules-cache-path=%t test2.cpp
+
+//--- a1.h
+#define FOO
+
+//--- a2.h
+#define BAR
+
+//--- module.modulemap
+module a {
+  header "a.h"
+}
+
+//--- test1.cpp
+#include "a.h"
+
+#ifndef FOO
+#error foo
+#endif
+
+//--- test2.cpp
+#include "a.h"
+
+#ifndef BAR
+#error bar
+#endif
diff --git a/clang/test/Modules/pr67893.cppm b/clang/test/Modules/pr67893.cppm
index 58990cec01d6..95479193f8ea 100644
--- a/clang/test/Modules/pr67893.cppm
+++ b/clang/test/Modules/pr67893.cppm
@@ -5,7 +5,7 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/a.cppm \
 // RUN:      -emit-module-interface -o %t/a.pcm
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.cppm \
-// RUN:      -emit-module-interface -fprebuilt-module-path=%t
+// RUN:      -emit-module-interface -fprebuilt-module-path=%t -o %t/m.pcm
 // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/m.pcm  \
 // RUN:      -fprebuilt-module-path=%t -S -emit-llvm -o - | FileCheck %t/m.cppm
 
diff --git a/clang/test/Modules/pr75057.cppm b/clang/test/Modules/pr75057.cppm
new file mode 100644
index 000000000000..96781b3ccacc
--- /dev/null
+++ b/clang/test/Modules/pr75057.cppm
@@ -0,0 +1,66 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// Treat the behavior of using headers as baseline.
+// RUN: %clang_cc1 -std=c++20 %t/use-header.cc -isystem %t -fsyntax-only -verify
+//
+// RUN: %clang_cc1 -std=c++20 %t/a.cppm -isystem %t -emit-module-interface -o %t/a.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use-module.cc -isystem %t -fmodule-file=a=%t/a.pcm -fsyntax-only -verify
+
+// Test again with reduced BMI.
+// RUN: %clang_cc1 -std=c++20 %t/a.cppm -isystem %t -emit-reduced-module-interface -o %t/a.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use-module.cc -isystem %t -fmodule-file=a=%t/a.pcm -fsyntax-only -verify
+
+//--- sys.h
+#ifndef SYS_H
+#define SYS_H
+
+#pragma GCC system_header
+
+template <class C>
+struct [[deprecated]] iterator {};
+
+_Pragma("GCC diagnostic push")
+_Pragma("GCC diagnostic ignored \"-Wdeprecated\"")                         
+_Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+
+template <class C>
+struct reverse_iterator 
+: public iterator<C> {};
+
+_Pragma("GCC diagnostic pop")
+
+template <class T>
+class C {
+public:
+    void i() {
+        reverse_iterator<T> i;
+    }
+};
+
+#endif
+
+//--- use-header.cc
+// expected-no-diagnostics
+// However, we see unexpected warnings
+#include <sys.h>
+
+void use() {
+    C<int>().i();
+}
+
+//--- a.cppm
+module;
+#include <sys.h>
+export module a;
+export using ::iterator;
+export using ::C;
+
+//--- use-module.cc
+// expected-no-diagnostics
+import a;
+
+void use() {
+    C<int>().i();
+}
diff --git a/clang/test/Modules/pr88400.cppm b/clang/test/Modules/pr88400.cppm
new file mode 100644
index 000000000000..ff69137a0b90
--- /dev/null
+++ b/clang/test/Modules/pr88400.cppm
@@ -0,0 +1,61 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/bar.cppm -emit-module-interface -o %t/bar.pcm
+// RUN: %clang_cc1 -std=c++20 %t/foo.cc -fmodule-file=bar=%t/bar.pcm -fsyntax-only -verify
+// RUN: %clang_cc1 -std=c++20 %t/bar.cc -fmodule-file=bar=%t/bar.pcm -fsyntax-only -verify
+//
+// RUN: %clang_cc1 -std=c++20 %t/bar.cppm -emit-reduced-module-interface -o %t/bar.pcm
+// RUN: %clang_cc1 -std=c++20 %t/foo.cc -fmodule-file=bar=%t/bar.pcm -fsyntax-only -verify
+// RUN: %clang_cc1 -std=c++20 %t/bar.cc -fmodule-file=bar=%t/bar.pcm -fsyntax-only -verify
+
+//--- header.h
+#pragma once
+
+namespace N {
+    template<typename T>
+    concept X = true;
+
+    template<X T>
+    class Y {
+    public:
+        template<X U>
+        friend class Y;
+    };
+
+    inline Y<int> x;
+}
+
+//--- bar.cppm
+module;
+
+#include "header.h"
+
+export module bar;
+
+namespace N {
+    // To make sure N::Y won't get elided.
+    using N::x;
+}
+
+//--- foo.cc
+// expected-no-diagnostics
+#include "header.h"
+
+import bar;
+
+void y() {
+    N::Y<int> y{};
+};
+
+//--- bar.cc
+// expected-no-diagnostics
+import bar;
+
+#include "header.h"
+
+void y() {
+    N::Y<int> y{};
+};
+
diff --git a/clang/test/Modules/pr90259.cppm b/clang/test/Modules/pr90259.cppm
new file mode 100644
index 000000000000..17786998a2a7
--- /dev/null
+++ b/clang/test/Modules/pr90259.cppm
@@ -0,0 +1,44 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/mod1.cppm -emit-reduced-module-interface -o %t/mod-mod1.pcm
+// RUN: %clang_cc1 -std=c++20 %t/mod.cppm -fprebuilt-module-path=%t  \
+// RUN:     -emit-reduced-module-interface -o %t/mod.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fprebuilt-module-path=%t -verify -fsyntax-only
+
+//--- mod1.cppm
+export module mod:mod1;
+namespace {
+    int abc = 43;
+}
+namespace mod {
+    static int def = 44;
+}
+export int f() {
+    return abc + mod::def;
+}
+
+//--- mod.cppm
+// expected-no-diagnostics
+export module mod;
+import :mod1;
+
+namespace {
+    double abc = 43.0;
+}
+
+namespace mod {
+    static double def = 44.0;
+}
+
+export double func() {
+    return (double)f() + abc + mod::def;
+}
+
+//--- use.cpp
+// expected-no-diagnostics
+import mod;
+double use() {
+    return func();
+}
diff --git a/clang/test/Modules/search-partitions.cpp b/clang/test/Modules/search-partitions.cpp
index 92732958db94..92f7c637c833 100644
--- a/clang/test/Modules/search-partitions.cpp
+++ b/clang/test/Modules/search-partitions.cpp
@@ -11,7 +11,7 @@
 // RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/partition3.cpp \
 // RUN:  -o %t/A-Part3.pcm
 
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/moduleA.cpp \
+// RUN: %clang_cc1 -std=c++20 %t/moduleA.cpp -fsyntax-only -verify \
 // RUN:  -fprebuilt-module-path=%t
 
 // Test again with reduced BMI
@@ -28,9 +28,7 @@
 // RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface %t/partition3.cpp \
 // RUN:  -o %t/A-Part3.pcm
 
-// RUN: %clang_cc1 -std=c++20 -fsyntax-only %t/moduleA.cpp -fprebuilt-module-path=%t 
-
-// expected-no-diagnostics
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %t/moduleA.cpp -fprebuilt-module-path=%t 
 
 //--- partition1.cpp
 export module A:Part1;
@@ -50,7 +48,7 @@ export module A:Part3;
 int part3();
 
 //--- moduleA.cpp
-
+// expected-no-diagnostics
 export module A;
 
 import :Part1;
diff --git a/clang/test/OpenMP/target_ast_print.cpp b/clang/test/OpenMP/target_ast_print.cpp
index 45907e93321a..4e066bcf5e43 100644
--- a/clang/test/OpenMP/target_ast_print.cpp
+++ b/clang/test/OpenMP/target_ast_print.cpp
@@ -1201,6 +1201,64 @@ foo();
 }
 #endif // OMP52
 
+#ifdef OMP60
+
+///==========================================================================///
+// RUN: %clang_cc1 -DOMP60 -verify -Wno-vla -fopenmp -fopenmp-version=60 -ast-print %s | FileCheck %s --check-prefix OMP60
+// RUN: %clang_cc1 -DOMP60 -fopenmp -fopenmp-version=60 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -DOMP60 -fopenmp -fopenmp-version=60 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP60
+
+// RUN: %clang_cc1 -DOMP60 -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -ast-print %s | FileCheck %s --check-prefix OMP60
+// RUN: %clang_cc1 -DOMP60 -fopenmp-simd -fopenmp-version=60 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -DOMP60 -fopenmp-simd -fopenmp-version=60 -std=c++11 -include-pch %t -fsyntax-only -verify -Wno-vla %s -ast-print | FileCheck %s --check-prefix OMP60
+
+void foo() {}
+template <typename T, int C>
+T tmain(T argc, T *argv) {
+  T i;
+#pragma omp target map(from always: i)
+  foo();
+#pragma omp target map(from, close: i)
+  foo();
+#pragma omp target map(always,close: i)
+  foo();
+  return 0;
+}
+//OMP60: template <typename T, int C> T tmain(T argc, T *argv) {
+//OMP60-NEXT: T i;
+//OMP60-NEXT: #pragma omp target map(always,from: i)
+//OMP60-NEXT:     foo();
+//OMP60-NEXT: #pragma omp target map(close,from: i)
+//OMP60-NEXT:     foo();
+//OMP60-NEXT: #pragma omp target map(always,close,tofrom: i)
+//OMP60-NEXT:     foo();
+//OMP60-NEXT: return 0;
+//OMP60-NEXT:}
+//OMP60:  template<> int tmain<int, 5>(int argc, int *argv) {
+//OMP60-NEXT:  int i;
+//OMP60-NEXT:  #pragma omp target map(always,from: i)
+//OMP60-NEXT:      foo();
+//OMP60-NEXT:  #pragma omp target map(close,from: i)
+//OMP60-NEXT:      foo();
+//OMP60-NEXT:  #pragma omp target map(always,close,tofrom: i)
+//OMP60-NEXT:      foo();
+//OMP60-NEXT:  return 0;
+//OMP60-NEXT:}
+//OMP60:  template<> char tmain<char, 1>(char argc, char *argv) {
+//OMP60-NEXT:  char i;
+//OMP60-NEXT:  #pragma omp target map(always,from: i)
+//OMP60-NEXT:      foo();
+//OMP60-NEXT:  #pragma omp target map(close,from: i)
+//OMP60-NEXT:      foo();
+//OMP60-NEXT:  #pragma omp target map(always,close,tofrom: i)
+//OMP60-NEXT:      foo();
+//OMP60-NEXT:  return 0;
+//OMP60-NEXT:}
+int main (int argc, char **argv) {
+  return tmain<int, 5>(argc, &argc) + tmain<char, 1>(argv[0][0], argv[0]);
+}
+#endif // OMP60
+
 #ifdef OMPX
 
 // RUN: %clang_cc1 -DOMPX -verify -Wno-vla -fopenmp -fopenmp-extensions -ast-print %s | FileCheck %s --check-prefix=OMPX
diff --git a/clang/test/OpenMP/target_map_messages.cpp b/clang/test/OpenMP/target_map_messages.cpp
index a6776ee12c0e..3bd432b47e63 100644
--- a/clang/test/OpenMP/target_map_messages.cpp
+++ b/clang/test/OpenMP/target_map_messages.cpp
@@ -1,34 +1,35 @@
 // -fopenmp, -fno-openmp-extensions
-// RUN: %clang_cc1 -verify=expected,ge50,lt51,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,lt50,lt51,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=40 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,lt50,lt51,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=45 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,ge50,lt51,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,ge50,ge51,omp,ge51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=51 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,ge50,ge51,ge52,omp,ge52-omp,omp52 -fopenmp -fno-openmp-extensions -fopenmp-version=52 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=40 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=45 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,omp,lt51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,ge51,lt60,omp,ge51-omp -fopenmp -fno-openmp-extensions -fopenmp-version=51 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,ge51,ge52,lt60,omp,ge52-omp,omp52 -fopenmp -fno-openmp-extensions -fopenmp-version=52 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,ge52,ge60,omp,ge60-omp,omp60 -fopenmp -fno-openmp-extensions -fopenmp-version=60 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
 // RUN: %clang_cc1 -DCCODE -verify -fopenmp -fno-openmp-extensions -ferror-limit 300 -x c %s -Wno-openmp -Wuninitialized -Wno-vla
 
 // -fopenmp-simd, -fno-openmp-extensions
-// RUN: %clang_cc1 -verify=expected,ge50,lt51,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,lt50,lt51,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=40 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,lt50,lt51,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=45 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,ge50,lt51,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,ge50,ge51,omp,ge51-omp -fopenmp-simd -fno-openmp-extensions -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=40 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=45 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,omp,lt51-omp -fopenmp-simd -fno-openmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,ge51,lt60,omp,ge51-omp -fopenmp-simd -fno-openmp-extensions -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
 // RUN: %clang_cc1 -DCCODE -verify -fopenmp-simd -fno-openmp-extensions -ferror-limit 300 -x c %s -Wno-openmp-mapping -Wuninitialized -Wno-vla
 
 // -fopenmp -fopenmp-extensions
-// RUN: %clang_cc1 -verify=expected,ge50,lt51,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,lt50,lt51,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=40 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,lt50,lt51,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=45 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,ge50,lt51,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,ge50,ge51,ompx,ge51-ompx -fopenmp -fopenmp-extensions -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=40 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=45 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,ompx,lt51-ompx -fopenmp -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,ge51,lt60,ompx,ge51-ompx -fopenmp -fopenmp-extensions -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
 // RUN: %clang_cc1 -DCCODE -verify -fopenmp -fopenmp-extensions -ferror-limit 300 -x c %s -Wno-openmp -Wuninitialized -Wno-vla
 
 // -fopenmp-simd -fopenmp-extensions
-// RUN: %clang_cc1 -verify=expected,ge50,lt51,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,lt50,lt51,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=40 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,lt50,lt51,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=45 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,ge50,lt51,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
-// RUN: %clang_cc1 -verify=expected,ge50,ge51,ompx,ge51-ompx -fopenmp-simd -fopenmp-extensions -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=40 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,lt50,lt51,lt60,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=45 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,lt51,lt60,ompx,lt51-ompx -fopenmp-simd -fopenmp-extensions -fopenmp-version=50 -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
+// RUN: %clang_cc1 -verify=expected,ge50,ge51,lt60,ompx,ge51-ompx -fopenmp-simd -fopenmp-extensions -ferror-limit 300 %s -Wno-openmp-target -Wuninitialized -Wno-vla
 // RUN: %clang_cc1 -DCCODE -verify -fopenmp-simd -fopenmp-extensions -ferror-limit 300 -x c %s -Wno-openmp-mapping -Wuninitialized -Wno-vla
 
 // Check
@@ -113,7 +114,7 @@ struct SA {
     #pragma omp target map(b[true:true])
     {}
 
-    #pragma omp target map(: c,f) // expected-error {{missing map type}}
+    #pragma omp target map(: c,f) // lt60-error {{missing map type}} // ge60-error {{empty modifier-specification-list is not allowed}}
     {}
     #pragma omp target map(always, tofrom: c,f)
     {}
@@ -159,28 +160,28 @@ struct SA {
     // expected-error@+1 {{use of undeclared identifier 'present'}}
     #pragma omp target map(present)
     {}
-    // ge52-omp-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
+    // ge52-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
     // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
     // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
     #pragma omp target map(ompx_hold, tofrom: c,f)
     {}
-    // ge52-omp-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
+    // ge52-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
     // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
     // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
     #pragma omp target map(ompx_hold, tofrom: c[1:2],f)
     {}
-    // ge52-omp-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
+    // ge52-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
     // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
     // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
     #pragma omp target map(ompx_hold, tofrom: c,f[1:2])
     {}
-    // ge52-omp-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
+    // ge52-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
     // expected-error@+3 {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
     // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
     // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
     #pragma omp target map(ompx_hold, tofrom: c[:],f)
     {}
-    // ge52-omp-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
+    // ge52-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
     // expected-error@+3 {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
     // ge51-omp-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
     // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
@@ -193,19 +194,19 @@ struct SA {
     {}
     #pragma omp target map(always, close, always, close, tofrom: a)   // expected-error 2 {{same map type modifier has been specified more than once}}
     {}
+    // ge60-error@+3 {{same map type modifier has been specified more than once}}
     // ge51-error@+2 {{same map type modifier has been specified more than once}}
     // lt51-error@+1 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
     #pragma omp target map(present, present, tofrom: a)
     {}
-    // ge52-omp-error@+5 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
-    // ge52-omp-error@+4 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
+    // ge52-error@+4 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
     // ompx-error@+3 {{same map type modifier has been specified more than once}}
     // ge51-omp-error@+2 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
     // lt51-omp-error@+1 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
     #pragma omp target map(ompx_hold, ompx_hold, tofrom: a)
     {}
-    // ge52-omp-error@+9 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
-    // ge52-omp-error@+8 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
+    // ge60-error@+9 {{same map type modifier has been specified more than once}}
+    // ge52-error@+8 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
     // expected-error@+7 2 {{same map type modifier has been specified more than once}}
     // ge51-error@+6 {{same map type modifier has been specified more than once}}
     // lt51-ompx-error@+5 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'ompx_hold'}}
@@ -219,34 +220,45 @@ struct SA {
     {}
     #pragma omp target map( , , tofrom: a)   // expected-error {{missing map type modifier}} expected-error {{missing map type modifier}}
     {}
-    #pragma omp target map( , , : a)   // expected-error {{missing map type modifier}} expected-error {{missing map type modifier}} expected-error {{missing map type}}
+    #pragma omp target map( , , : a)   // expected-error {{missing map type modifier}} expected-error {{missing map type modifier}} lt60-error {{missing map type}}
     {}
+    // ge60-error@+4 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator'}}
     // ge51-error@+3 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
     // lt51-error@+2 2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
     // expected-error@+1 {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
     #pragma omp target map( d, f, bf: a)
     {}
+    // ge60-error@+5 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'iterator}}
     // expected-error@+4 {{missing map type modifier}}
     // ge51-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
     // lt51-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
-    // expected-error@+1 {{missing map type}}
+    // lt60-error@+1 {{missing map type}}
     #pragma omp target map( , f, : a)
     {}
-    #pragma omp target map(always close: a)   // expected-error {{missing map type}} omp52-error{{missing ',' after map type modifier}}
+    #pragma omp target map(always close: a)   // lt60-error {{missing map type}} ge52-error{{missing ',' after map type modifier}}
     {}
-    #pragma omp target map(always close bf: a)   // omp52-error 2 {{missing ',' after map type modifier}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} 
+    #pragma omp target map(always close bf: a)   // ge52-error 2 {{missing ',' after map type modifier}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
     {}
-    // omp52-error@+4 {{missing ',' after map type modifier}}
+    // ge52-error@+4 {{missing ',' after map type modifier}}
     // ge51-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
     // lt51-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
-    // expected-error@+1 {{missing map type}}
+    // lt60-error@+1 {{missing map type}}
     #pragma omp target map(always tofrom close: a)
     {}
+    // ge60-note@+4 {{map type 'tofrom' is previous specified here}}
+    // ge60-error@+3 {{map type is already specified}}
     // ge51-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
     // lt51-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
     #pragma omp target map(tofrom from: a)
     {}
-    #pragma omp target map(close bf: a)   // omp52-error {{missing ',' after map type modifier}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+    // ge60-note@+5 {{map type 'to' is previous specified here}}
+    // ge60-error@+4 {{map type is already specified}}
+    // ge52-error@+3 {{missing ',' after map type modifier}}
+    // ge51-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
+    // lt51-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
+    #pragma omp target map(to always from: a)
+    {}
+    #pragma omp target map(close bf: a)   // ge52-error {{missing ',' after map type modifier}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
     {}
     #pragma omp target map(([b[I]][bf])f)  // lt50-error {{expected ',' or ']' in lambda capture list}} lt50-error {{expected ')'}} lt50-note {{to match this '('}}
     {}
@@ -266,6 +278,7 @@ struct SA {
     // lt51-omp-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
     #pragma omp target map(iterator(it=0:10, it=0:20), tofrom:a)
     {}
+    // ge60-error@+7 {{expected '(' after 'iterator'}}
     // ge51-ompx-error@+6 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present', 'ompx_hold'}}
     // lt51-ompx-error@+5 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'ompx_hold'}}
     // lt51-error@+4 {{expected '(' after 'iterator'}}
@@ -694,20 +707,20 @@ T tmain(T argc) {
   foo();
 
 #pragma omp target data map(always, tofrom: x)
-#pragma omp target data map(always: x) // expected-error {{missing map type}}
+#pragma omp target data map(always: x) // lt60-error {{missing map type}}
 // ge51-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
 // lt51-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
-// expected-error@+1 {{missing map type}}
+// lt60-error@+1 {{missing map type}}
 #pragma omp target data map(tofrom, always: x)
 #pragma omp target data map(always, tofrom: always, tofrom, x)
 #pragma omp target map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
   foo();
 
 #pragma omp target data map(close, tofrom: x)
-#pragma omp target data map(close: x) // expected-error {{missing map type}}
+#pragma omp target data map(close: x) // lt60-error {{missing map type}}
 // ge51-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
 // lt51-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
-// expected-error@+1 {{missing map type}}
+// lt60-error@+1 {{missing map type}}
 #pragma omp target data map(tofrom, close: x)
 #pragma omp target data map(close, tofrom: close, tofrom, x)
   foo();
@@ -829,19 +842,19 @@ int main(int argc, char **argv) {
   foo();
 
 #pragma omp target data map(always, tofrom: x)
-#pragma omp target data map(always: x) // expected-error {{missing map type}}
+#pragma omp target data map(always: x) // lt60-error {{missing map type}}
 // ge51-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
 // lt51-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
-// expected-error@+1 {{missing map type}}
+// lt60-error@+1 {{missing map type}}
 #pragma omp target data map(tofrom, always: x)
 #pragma omp target data map(always, tofrom: always, tofrom, x)
 #pragma omp target map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
   foo();
 #pragma omp target data map(close, tofrom: x)
-#pragma omp target data map(close: x) // expected-error {{missing map type}}
+#pragma omp target data map(close: x) // lt60-error {{missing map type}}
 // ge51-error@+3 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper', 'present'}}
 // lt51-error@+2 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
-// expected-error@+1 {{missing map type}}
+// lt60-error@+1 {{missing map type}}
 #pragma omp target data map(tofrom, close: x)
   foo();
 // lt51-error@+1 {{incorrect map type modifier, expected one of: 'always', 'close', 'mapper'}}
diff --git a/clang/test/OpenMP/task_depend_messages.cpp b/clang/test/OpenMP/task_depend_messages.cpp
index 388595bef4de..3f39c55527b5 100644
--- a/clang/test/OpenMP/task_depend_messages.cpp
+++ b/clang/test/OpenMP/task_depend_messages.cpp
@@ -62,7 +62,7 @@ int main(int argc, char **argv, char *env[]) {
   #pragma omp task depend(in : argv[ : argc][1 : argc - 1])
   #pragma omp task depend(in : arr[0])
   #pragma omp task depend(depobj:argc) // omp45-error {{expected 'in', 'out', 'inout' or 'mutexinoutset' in OpenMP clause 'depend'}} omp50-error {{expected lvalue expression of 'omp_depend_t' type, not 'int'}} omp51-error {{expected lvalue expression of 'omp_depend_t' type, not 'int'}}
-  #pragma omp task depend(depobj : argv[ : argc][1 : argc - 1]) // omp45-error {{expected 'in', 'out', 'inout' or 'mutexinoutset' in OpenMP clause 'depend'}} omp50-error {{expected lvalue expression of 'omp_depend_t' type, not '<OpenMP array section type>'}} omp51-error {{expected lvalue expression of 'omp_depend_t' type, not '<OpenMP array section type>'}}
+  #pragma omp task depend(depobj : argv[ : argc][1 : argc - 1]) // omp45-error {{expected 'in', 'out', 'inout' or 'mutexinoutset' in OpenMP clause 'depend'}} omp50-error {{expected lvalue expression of 'omp_depend_t' type, not '<array section type>'}} omp51-error {{expected lvalue expression of 'omp_depend_t' type, not '<array section type>'}}
   #pragma omp task depend(depobj : arr[0]) // omp45-error {{expected 'in', 'out', 'inout' or 'mutexinoutset' in OpenMP clause 'depend'}}
   #pragma omp task depend(in : ([ // expected-error {{expected variable name or 'this' in lambda capture list}} expected-error {{expected ')'}} expected-note {{to match this '('}}
   #pragma omp task depend(in : ([] // expected-error {{expected body of lambda expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
diff --git a/clang/test/Parser/altivec.c b/clang/test/Parser/altivec.c
index daee5eae4d84..445369f0dc06 100644
--- a/clang/test/Parser/altivec.c
+++ b/clang/test/Parser/altivec.c
@@ -110,6 +110,12 @@ vector __bool long long v_bll4;      // expected-error {{use of 'long long' with
 #endif
 __vector long double  vv_ld3;        // expected-error {{cannot use 'long double' with '__vector'}}
 vector long double  v_ld4;           // expected-error {{cannot use 'long double' with '__vector'}}
+vector float _Complex v_cf;          // expected-error {{cannot use '_Complex' with '__vector'}}
+vector double _Complex v_cd;         // expected-error {{cannot use '_Complex' with '__vector'}}
+vector long double _Complex v_cld;   // expected-error {{cannot use '_Complex' with '__vector'}}
+__vector float _Complex v_cf2;       // expected-error {{cannot use '_Complex' with '__vector'}}
+__vector double _Complex v_cd2;      // expected-error {{cannot use '_Complex' with '__vector'}}
+__vector long double _Complex v_cld2;// expected-error {{cannot use '_Complex' with '__vector'}}
 vector bool float v_bf;              // expected-error {{cannot use 'float' with '__vector bool'}}
 vector bool double v_bd;             // expected-error {{cannot use 'double' with '__vector bool'}}
 vector bool pixel v_bp;              // expected-error {{cannot use '__pixel' with '__vector bool'}}
diff --git a/clang/test/Parser/cxx-altivec.cpp b/clang/test/Parser/cxx-altivec.cpp
index 6da36663422b..5cb760dababb 100644
--- a/clang/test/Parser/cxx-altivec.cpp
+++ b/clang/test/Parser/cxx-altivec.cpp
@@ -111,6 +111,12 @@ vector __bool long long v_bll4;      // expected-error {{use of 'long long' with
 #endif
 __vector long double  vv_ld3;        // expected-error {{cannot use 'long double' with '__vector'}}
 vector long double  v_ld4;           // expected-error {{cannot use 'long double' with '__vector'}}
+vector float _Complex v_cf;          // expected-error {{cannot use '_Complex' with '__vector'}}
+vector double _Complex v_cd;         // expected-error {{cannot use '_Complex' with '__vector'}}
+vector long double _Complex v_cld;   // expected-error {{cannot use '_Complex' with '__vector'}}
+__vector float _Complex v_cf2;       // expected-error {{cannot use '_Complex' with '__vector'}}
+__vector double _Complex v_cd2;      // expected-error {{cannot use '_Complex' with '__vector'}}
+__vector long double _Complex v_cld2;// expected-error {{cannot use '_Complex' with '__vector'}}
 // FIXME: why is this diagnostic different from the others?
 vector bool v_b;                     // expected-error {{a type specifier is required for all declarations}}
 vector bool float v_bf;              // expected-error {{cannot use 'float' with '__vector bool'}}
diff --git a/clang/test/Parser/cxx1z-decomposition.cpp b/clang/test/Parser/cxx1z-decomposition.cpp
index 90d60df2e47f..4b17f72effb0 100644
--- a/clang/test/Parser/cxx1z-decomposition.cpp
+++ b/clang/test/Parser/cxx1z-decomposition.cpp
@@ -1,6 +1,7 @@
-// RUN: %clang_cc1 -std=c++17 %s -verify=expected,cxx17 -fcxx-exceptions
-// RUN: %clang_cc1 -std=c++2b %s -verify=expected,cxx2b -fcxx-exceptions
-// RUN: not %clang_cc1 -std=c++17 %s -emit-llvm-only -fcxx-exceptions
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-unknown-linux-gnu -verify=expected,cxx17,pre2c -fcxx-exceptions
+// RUN: %clang_cc1 -std=c++2b %s -triple x86_64-unknown-linux-gnu -verify=expected,cxx2b,pre2c,post2b -fcxx-exceptions
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-unknown-linux-gnu -verify=expected,cxx2c,post2b -fcxx-exceptions
+// RUN: not %clang_cc1 -std=c++17 %s -triple x86_64-unknown-linux-gnu -emit-llvm-only -fcxx-exceptions
 
 struct S { int a, b, c; };
 
@@ -58,7 +59,7 @@ namespace OtherDecl {
 namespace GoodSpecifiers {
   void f() {
     int n[1];
-    const volatile auto &[a] = n; // cxx2b-warning {{volatile qualifier in structured binding declaration is deprecated}}
+    const volatile auto &[a] = n; // post2b-warning {{volatile qualifier in structured binding declaration is deprecated}}
   }
 }
 
@@ -97,8 +98,8 @@ namespace BadSpecifiers {
     S [a] = s; // expected-error {{cannot be declared with type 'S'}}
     decltype(auto) [b] = s; // expected-error {{cannot be declared with type 'decltype(auto)'}}
     auto ([c2]) = s; // cxx17-error {{decomposition declaration cannot be declared with parenthese}} \
-                     // cxx2b-error {{use of undeclared identifier 'c2'}} \
-                     // cxx2b-error {{expected body of lambda expression}} \
+                     // post2b-error {{use of undeclared identifier 'c2'}} \
+                     // post2b-error {{expected body of lambda expression}} \
 
     // FIXME: This error is not very good.
     auto [d]() = s; // expected-error {{expected ';'}} expected-error {{expected expression}}
@@ -119,9 +120,6 @@ namespace BadSpecifiers {
     [[]] auto [ok_3] = s;
     alignas(S) auto [ok_4] = s;
 
-    // ... but not after the identifier or declarator.
-    // FIXME: These errors are not very good.
-    auto [bad_attr_1 [[]]] = s; // expected-error {{attribute list cannot appear here}} expected-error 2{{}}
     auto [bad_attr_2] [[]] = s; // expected-error {{expected ';'}} expected-error {{}}
   }
 }
@@ -156,3 +154,50 @@ namespace Init {
     S [goodish4] { 4 }; // expected-error {{cannot be declared with type 'S'}}
   }
 }
+
+
+namespace attributes {
+
+struct S{
+    int a;
+    int b = 0;
+};
+
+void err() {
+    auto [[]] = S{0}; // expected-error {{expected unqualified-id}}
+    auto [ alignas(42) a, foo ] = S{0}; // expected-error {{an attribute list cannot appear here}}
+    auto [ c, [[]] d ] = S{0}; // expected-error {{an attribute list cannot appear here}}
+    auto [ e, alignas(42) f ] = S{0}; // expected-error {{an attribute list cannot appear here}}
+}
+
+void ok() {
+    auto [ a alignas(42) [[]], b alignas(42) [[]]] = S{0}; // expected-error 2{{'alignas' attribute only applies to variables, data members and tag types}} \
+                                                           // pre2c-warning  2{{an attribute specifier sequence attached to a structured binding declaration is a C++2c extension}}
+    auto [ c [[]] alignas(42), d [[]] alignas(42) [[]]] = S{0}; // expected-error 2{{'alignas' attribute only applies to variables, data members and tag types}} \
+                                                                // pre2c-warning  2{{an attribute specifier sequence attached to a structured binding declaration is a C++2c extension}}
+}
+
+
+auto [G1 [[deprecated]], G2 [[deprecated]]] = S{42}; // #deprecated-here
+// pre2c-warning@-1 2{{an attribute specifier sequence attached to a structured binding declaration is a C++2c extension}}
+
+int test() {
+  return G1 + G2; // expected-warning {{'G1' is deprecated}} expected-note@#deprecated-here {{here}} \
+                  // expected-warning {{'G2' is deprecated}} expected-note@#deprecated-here {{here}}
+}
+
+void invalid_attributes() {
+  // pre2c-warning@+1 {{an attribute specifier sequence attached to a structured binding declaration is a C++2c extension}}
+  auto [a alignas(42) // expected-error {{'alignas' attribute only applies to variables, data members and tag types}}
+      [[assume(true), // expected-error {{'assume' attribute cannot be applied to a declaration}}
+        carries_dependency, // expected-error {{'carries_dependency' attribute only applies to parameters, Objective-C methods, and functions}}
+        fallthrough,  // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+        likely, // expected-error {{'likely' attribute cannot be applied to a declaration}}
+        unlikely, // expected-error {{'unlikely' attribute cannot be applied to a declaration}}
+        nodiscard,  // expected-warning {{'nodiscard' attribute only applies to Objective-C methods, enums, structs, unions, classes, functions, function pointers, and typedefs}}
+        noreturn,  // expected-error {{'noreturn' attribute only applies to functions}}
+        no_unique_address]], // expected-error {{'no_unique_address' attribute only applies to non-bit-field non-static data members}}
+    b] = S{0};
+}
+
+}
diff --git a/clang/test/Parser/pragma-unroll.cpp b/clang/test/Parser/pragma-unroll.cpp
index f41bd7a18d5a..19066acddcef 100644
--- a/clang/test/Parser/pragma-unroll.cpp
+++ b/clang/test/Parser/pragma-unroll.cpp
@@ -124,3 +124,32 @@ void test(int *List, int Length) {
 
 #pragma unroll
 /* expected-error {{expected statement}} */ }
+
+using size_t = unsigned long long;
+
+template <bool Flag>
+int FailToBuild(int n) {
+  constexpr int N = 100;
+  auto init = [=]() { return Flag ? n : 0UL; };
+  auto cond = [=](size_t ix) { return Flag ? ix != 0 : ix < 10; };
+  auto iter = [=](size_t ix) {
+    return Flag ? ix & ~(1ULL << __builtin_clzll(ix)) : ix + 1;
+  };
+#pragma unroll Flag ? 0 : N // Ok, allow 0.
+  for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+    n *= n;
+  }
+#pragma GCC unroll Flag ? 0 : N // Ok, allow 0.
+  for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+    n *= n;
+  }
+  return n;
+}
+
+int foo(int n) {
+    return FailToBuild<true>(n);
+}
+
+int bar(int n) {
+    return FailToBuild<false>(n);
+}
diff --git a/clang/test/ParserOpenACC/parse-cache-construct.c b/clang/test/ParserOpenACC/parse-cache-construct.c
index fd161c03c09f..de26fc2b277a 100644
--- a/clang/test/ParserOpenACC/parse-cache-construct.c
+++ b/clang/test/ParserOpenACC/parse-cache-construct.c
@@ -25,11 +25,13 @@ void func() {
   }
 
   for (int i = 0; i < 10; ++i) {
+    // expected-error@+2{{expected expression}}
     // expected-warning@+1{{OpenACC construct 'cache' not yet implemented, pragma ignored}}
     #pragma acc cache()
   }
 
   for (int i = 0; i < 10; ++i) {
+    // expected-error@+3{{expected expression}}
     // expected-error@+2{{invalid OpenACC clause 'clause'}}
     // expected-warning@+1{{OpenACC construct 'cache' not yet implemented, pragma ignored}}
     #pragma acc cache() clause-list
diff --git a/clang/test/ParserOpenACC/parse-cache-construct.cpp b/clang/test/ParserOpenACC/parse-cache-construct.cpp
index f0a35824696d..f1c71e8b5847 100644
--- a/clang/test/ParserOpenACC/parse-cache-construct.cpp
+++ b/clang/test/ParserOpenACC/parse-cache-construct.cpp
@@ -72,14 +72,12 @@ void use() {
     #pragma acc cache(Arrs.MemArr[3].array[1:4])
   }
   for (int i = 0; i < 10; ++i) {
-    // FIXME: Once we have a new array-section type to represent OpenACC as
-    // well, change this error message.
-    // expected-error@+2{{OpenMP array section is not allowed here}}
+    // expected-error@+2{{OpenACC sub-array is not allowed here}}
     // expected-warning@+1{{OpenACC construct 'cache' not yet implemented, pragma ignored}}
     #pragma acc cache(Arrs.MemArr[3:4].array[1:4])
   }
   for (int i = 0; i < 10; ++i) {
-    // expected-error@+2{{OpenMP array section is not allowed here}}
+    // expected-error@+2{{OpenACC sub-array is not allowed here}}
     // expected-warning@+1{{OpenACC construct 'cache' not yet implemented, pragma ignored}}
     #pragma acc cache(Arrs.MemArr[3:4].array[4])
   }
diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c
index 799f22b8c120..8a439a5ccd4b 100644
--- a/clang/test/ParserOpenACC/parse-clauses.c
+++ b/clang/test/ParserOpenACC/parse-clauses.c
@@ -405,7 +405,10 @@ void SelfUpdate() {
 #pragma acc update self
   for(;;){}
 
-  // expected-error@+3{{use of undeclared identifier 'zero'}}
+  // expected-error@+6{{use of undeclared identifier 'zero'}}
+  // expected-error@+5{{expected ','}}
+  // expected-error@+4{{expected expression}}
+  // expected-warning@+3{{OpenACC clause 'self' not yet implemented, clause ignored}}
   // expected-warning@+2{{OpenACC clause 'seq' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC construct 'update' not yet implemented, pragma ignored}}
 #pragma acc update self(zero : s.array[s.value : 5], s.value), seq
@@ -450,11 +453,13 @@ void VarListClauses() {
 #pragma acc serial copy(, seq
   for(;;){}
 
-  // expected-error@+1{{expected expression}}
+  // expected-error@+2{{expected expression}}
+  // expected-warning@+1{{OpenACC clause 'copy' not yet implemented, clause ignored}}
 #pragma acc serial copy()
   for(;;){}
 
-  // expected-error@+2{{expected expression}}
+  // expected-error@+3{{expected expression}}
+  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(), seq
   for(;;){}
@@ -482,36 +487,40 @@ void VarListClauses() {
 #pragma acc serial copy(HasMem.MemArr[3].array[1:4]), seq
   for(;;){}
 
-  // expected-error@+3{{OpenMP array section is not allowed here}}
+  // expected-error@+3{{OpenACC sub-array is not allowed here}}
   // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[1:3].array[1]), seq
   for(;;){}
 
-  // expected-error@+3{{OpenMP array section is not allowed here}}
+  // expected-error@+3{{OpenACC sub-array is not allowed here}}
   // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[1:3].array[1:2]), seq
   for(;;){}
 
-  // expected-error@+2{{expected expression}}
+  // expected-error@+3{{expected expression}}
+  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[:]), seq
   for(;;){}
 
-  // expected-error@+2{{expected expression}}
+  // expected-error@+3{{expected expression}}
+  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[::]), seq
   for(;;){}
 
-  // expected-error@+4{{expected expression}}
-  // expected-error@+3{{expected ']'}}
-  // expected-note@+2{{to match this '['}}
+  // expected-error@+5{{expected expression}}
+  // expected-error@+4{{expected ']'}}
+  // expected-note@+3{{to match this '['}}
+  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[: :]), seq
   for(;;){}
 
-  // expected-error@+2{{expected expression}}
+  // expected-error@+3{{expected expression}}
+  // expected-warning@+2{{OpenACC clause 'copy' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copy(HasMem.MemArr[3:]), seq
   for(;;){}
@@ -582,13 +591,11 @@ void VarListClauses() {
 #pragma acc serial detach(s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+3{{expected ','}}
-  // expected-warning@+2{{OpenACC clause 'private' not yet implemented, clause ignored}}
+  // expected-error@+2{{expected ','}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial private(s.array[s.value] s.array[s.value :5] ), seq
   for(;;){}
 
-  // expected-warning@+2{{OpenACC clause 'private' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial private(s.array[s.value : 5], s.value), seq
   for(;;){}
@@ -691,7 +698,9 @@ void VarListClauses() {
 #pragma acc serial copyout(zero : s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+2{{use of undeclared identifier 'zero'}}
+  // expected-error@+4{{use of undeclared identifier 'zero'}}
+  // expected-error@+3{{expected ','}}
+  // expected-warning@+2{{OpenACC clause 'copyout' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyout(zero s.array[s.value : 5], s.value), seq
   for(;;){}
@@ -714,7 +723,9 @@ void VarListClauses() {
 #pragma acc serial copyout(invalid:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+2{{use of undeclared identifier 'invalid'}}
+  // expected-error@+4{{use of undeclared identifier 'invalid'}}
+  // expected-error@+3{{expected ','}}
+  // expected-warning@+2{{OpenACC clause 'copyout' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyout(invalid s.array[s.value : 5], s.value), seq
   for(;;){}
@@ -740,7 +751,9 @@ void VarListClauses() {
 #pragma acc serial create(zero : s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+2{{use of undeclared identifier 'zero'}}
+  // expected-error@+4{{use of undeclared identifier 'zero'}}
+  // expected-error@+3{{expected ','}}
+  // expected-warning@+2{{OpenACC clause 'create' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial create(zero s.array[s.value : 5], s.value), seq
   for(;;){}
@@ -763,7 +776,9 @@ void VarListClauses() {
 #pragma acc serial create(invalid:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+2{{use of undeclared identifier 'invalid'}}
+  // expected-error@+4{{use of undeclared identifier 'invalid'}}
+  // expected-error@+3{{expected ','}}
+  // expected-warning@+2{{OpenACC clause 'create' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial create(invalid s.array[s.value : 5], s.value), seq
   for(;;){}
@@ -789,7 +804,9 @@ void VarListClauses() {
 #pragma acc serial copyin(readonly : s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+2{{use of undeclared identifier 'readonly'}}
+  // expected-error@+4{{use of undeclared identifier 'readonly'}}
+  // expected-error@+3{{expected ','}}
+  // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyin(readonly s.array[s.value : 5], s.value), seq
   for(;;){}
@@ -812,7 +829,9 @@ void VarListClauses() {
 #pragma acc serial copyin(invalid:s.array[s.value : 5], s.value), seq
   for(;;){}
 
-  // expected-error@+2{{use of undeclared identifier 'invalid'}}
+  // expected-error@+4{{use of undeclared identifier 'invalid'}}
+  // expected-error@+3{{expected ','}}
+  // expected-warning@+2{{OpenACC clause 'copyin' not yet implemented, clause ignored}}
   // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}}
 #pragma acc serial copyin(invalid s.array[s.value : 5], s.value), seq
   for(;;){}
@@ -823,8 +842,9 @@ void ReductionClauseParsing() {
   // expected-error@+1{{expected '('}}
 #pragma acc serial reduction
   for(;;){}
-  // expected-error@+2{{missing reduction operator, expected '+', '*', 'max', 'min', '&', '|', '^', '&&', or '||', follwed by a ':'}}
-  // expected-error@+1{{expected expression}}
+  // expected-error@+3{{missing reduction operator, expected '+', '*', 'max', 'min', '&', '|', '^', '&&', or '||', follwed by a ':'}}
+  // expected-error@+2{{expected expression}}
+  // expected-warning@+1{{OpenACC clause 'reduction' not yet implemented, clause ignored}}
 #pragma acc serial reduction()
   for(;;){}
   // expected-error@+2{{missing reduction operator, expected '+', '*', 'max', 'min', '&', '|', '^', '&&', or '||', follwed by a ':'}}
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 85762b7fed4d..4d10eeafa884 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -326,7 +326,7 @@
 // CHECK-MCPU-A57: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon"
 // CHECK-MCPU-A72: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon"
 // CHECK-MCPU-CORTEX-A73: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon"
-// CHECK-MCPU-CORTEX-R82: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8r" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sb" "-target-feature" "+neon" "-target-feature" "+ssbs"
+// CHECK-MCPU-CORTEX-R82: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8r" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+complxnum" "-target-feature" "+flagm" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+fp16fml" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+predres" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sb" "-target-feature" "+neon" "-target-feature" "+ssbs"
 // CHECK-MCPU-M3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon"
 // CHECK-MCPU-M4: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+neon"
 // CHECK-MCPU-KRYO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon"
diff --git a/clang/test/Preprocessor/hardware_interference.cpp b/clang/test/Preprocessor/hardware_interference.cpp
new file mode 100644
index 000000000000..f3727aadd32a
--- /dev/null
+++ b/clang/test/Preprocessor/hardware_interference.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -E -dM -D__GCC_CONSTRUCTIVE_SIZE=1000 -D__GCC_DESTRUCTIVE_SIZE=1001 %s -verify -Weverything | FileCheck %s
+// RUN: %clang_cc1 -D__GCC_CONSTRUCTIVE_SIZE=1000 -D__GCC_DESTRUCTIVE_SIZE=1001 %s -verify -Weverything
+// RUN: %clang_cc1 -E -dM -U__GCC_CONSTRUCTIVE_SIZE -U__GCC_DESTRUCTIVE_SIZE %s -verify -Weverything | FileCheck --check-prefix DISABLED %s
+// expected-no-diagnostics
+
+// Validate that we can set a new value on the command line without issuing any
+// diagnostics and that we can disabled the macro on the command line without
+// issuing any diagnostics.
+
+// CHECK: #define __GCC_CONSTRUCTIVE_SIZE 1000
+// CHECK: #define __GCC_DESTRUCTIVE_SIZE 1001
+// DISABLED-NOT: __GCC_CONSTRUCTIVE_SIZE
+// DISABLED-NOT: __GCC_DESTRUCTIVE_SIZE
+
+int main() {
+  return 0;
+}
diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c
index cf96870b27ac..f0845985c9ef 100644
--- a/clang/test/Preprocessor/init-aarch64.c
+++ b/clang/test/Preprocessor/init-aarch64.c
@@ -119,6 +119,8 @@
 // AARCH64-NEXT: #define __FP_FAST_FMA 1
 // AARCH64-NEXT: #define __FP_FAST_FMAF 1
 // AARCH64-NEXT: #define __GCC_ASM_FLAG_OUTPUTS__ 1
+// AARCH64-NEXT: #define __GCC_CONSTRUCTIVE_SIZE {{.+}}
+// AARCH64-NEXT: #define __GCC_DESTRUCTIVE_SIZE {{.+}}
 // AARCH64-NEXT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
 // AARCH64-NEXT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1
 // AARCH64-NEXT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
@@ -220,11 +222,11 @@
 // AARCH64-NEXT: #define __LONG_MAX__ 9223372036854775807L
 // AARCH64-NEXT: #define __LONG_WIDTH__ 64
 // AARCH64-NEXT: #define __LP64__ 1
-// AARCH64-NEXT: #define __MEMORY_SCOPE_DEVICE 1 
-// AARCH64-NEXT: #define __MEMORY_SCOPE_SINGLE 4 
-// AARCH64-NEXT: #define __MEMORY_SCOPE_SYSTEM 0 
-// AARCH64-NEXT: #define __MEMORY_SCOPE_WRKGRP 2 
-// AARCH64-NEXT: #define __MEMORY_SCOPE_WVFRNT 3 
+// AARCH64-NEXT: #define __MEMORY_SCOPE_DEVICE 1
+// AARCH64-NEXT: #define __MEMORY_SCOPE_SINGLE 4
+// AARCH64-NEXT: #define __MEMORY_SCOPE_SYSTEM 0
+// AARCH64-NEXT: #define __MEMORY_SCOPE_WRKGRP 2
+// AARCH64-NEXT: #define __MEMORY_SCOPE_WVFRNT 3
 // AARCH64-NEXT: #define __NO_INLINE__ 1
 // AARCH64-NEXT: #define __NO_MATH_ERRNO__ 1
 // AARCH64-NEXT: #define __OBJC_BOOL_IS_BOOL 0
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index c4a55efca6f7..2641fee94023 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -1,3 +1,10 @@
+// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix INTERFERENCE %s
+//
+// We purposefully do not test the values produced, only that the macros are
+// predefined to some value.
+// INTERFERENCE:#define __GCC_CONSTRUCTIVE_SIZE {{.+}}
+// INTERFERENCE:#define __GCC_DESTRUCTIVE_SIZE {{.+}}
+
 // RUN: %clang_cc1 -E -dM -x assembler-with-cpp < /dev/null | FileCheck -match-full-lines -check-prefix ASM %s
 //
 // ASM:#define __ASSEMBLER__ 1
@@ -1697,6 +1704,8 @@
 // WEBASSEMBLY-NEXT:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
 // WEBASSEMBLY-NEXT:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
 // WEBASSEMBLY-NEXT:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// WEBASSEMBLY-NEXT:#define __GCC_CONSTRUCTIVE_SIZE {{.+}}
+// WEBASSEMBLY-NEXT:#define __GCC_DESTRUCTIVE_SIZE {{.+}}
 // WEBASSEMBLY-NEXT:#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
 // WEBASSEMBLY-NEXT:#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
 // WEBASSEMBLY-NEXT:#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
@@ -1806,11 +1815,11 @@
 // WEBASSEMBLY64-NEXT:#define __LONG_MAX__ 9223372036854775807L
 // WEBASSEMBLY64-NEXT:#define __LONG_WIDTH__ 64
 // WEBASSEMBLY64-NEXT:#define __LP64__ 1
-// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_DEVICE 1 
-// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SINGLE 4 
-// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SYSTEM 0 
-// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_WRKGRP 2 
-// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_WVFRNT 3 
+// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_DEVICE 1
+// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SINGLE 4
+// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SYSTEM 0
+// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_WRKGRP 2
+// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_WVFRNT 3
 // WEBASSEMBLY-NEXT:#define __NO_INLINE__ 1
 // WEBASSEMBLY-NEXT:#define __NO_MATH_ERRNO__ 1
 // WEBASSEMBLY-NEXT:#define __OBJC_BOOL_IS_BOOL 0
@@ -2126,11 +2135,11 @@
 // AVR:#define __LDBL_MIN__ 1.17549435e-38L
 // AVR:#define __LONG_LONG_MAX__ 9223372036854775807LL
 // AVR:#define __LONG_MAX__ 2147483647L
-// AVR:#define __MEMORY_SCOPE_DEVICE 1 
-// AVR:#define __MEMORY_SCOPE_SINGLE 4 
-// AVR:#define __MEMORY_SCOPE_SYSTEM 0 
-// AVR:#define __MEMORY_SCOPE_WRKGRP 2 
-// AVR:#define __MEMORY_SCOPE_WVFRNT 3 
+// AVR:#define __MEMORY_SCOPE_DEVICE 1
+// AVR:#define __MEMORY_SCOPE_SINGLE 4
+// AVR:#define __MEMORY_SCOPE_SYSTEM 0
+// AVR:#define __MEMORY_SCOPE_WRKGRP 2
+// AVR:#define __MEMORY_SCOPE_WVFRNT 3
 // AVR:#define __NO_INLINE__ 1
 // AVR:#define __ORDER_BIG_ENDIAN__ 4321
 // AVR:#define __ORDER_LITTLE_ENDIAN__ 1234
@@ -2422,11 +2431,11 @@
 // RISCV32: #define __LITTLE_ENDIAN__ 1
 // RISCV32: #define __LONG_LONG_MAX__ 9223372036854775807LL
 // RISCV32: #define __LONG_MAX__ 2147483647L
-// RISCV32: #define __MEMORY_SCOPE_DEVICE 1 
-// RISCV32: #define __MEMORY_SCOPE_SINGLE 4 
-// RISCV32: #define __MEMORY_SCOPE_SYSTEM 0 
-// RISCV32: #define __MEMORY_SCOPE_WRKGRP 2 
-// RISCV32: #define __MEMORY_SCOPE_WVFRNT 3 
+// RISCV32: #define __MEMORY_SCOPE_DEVICE 1
+// RISCV32: #define __MEMORY_SCOPE_SINGLE 4
+// RISCV32: #define __MEMORY_SCOPE_SYSTEM 0
+// RISCV32: #define __MEMORY_SCOPE_WRKGRP 2
+// RISCV32: #define __MEMORY_SCOPE_WVFRNT 3
 // RISCV32: #define __NO_INLINE__ 1
 // RISCV32: #define __POINTER_WIDTH__ 32
 // RISCV32: #define __PRAGMA_REDEFINE_EXTNAME 1
@@ -2634,11 +2643,11 @@
 // RISCV64: #define __LONG_LONG_MAX__ 9223372036854775807LL
 // RISCV64: #define __LONG_MAX__ 9223372036854775807L
 // RISCV64: #define __LP64__ 1
-// RISCV64: #define __MEMORY_SCOPE_DEVICE 1 
-// RISCV64: #define __MEMORY_SCOPE_SINGLE 4 
-// RISCV64: #define __MEMORY_SCOPE_SYSTEM 0 
-// RISCV64: #define __MEMORY_SCOPE_WRKGRP 2 
-// RISCV64: #define __MEMORY_SCOPE_WVFRNT 3 
+// RISCV64: #define __MEMORY_SCOPE_DEVICE 1
+// RISCV64: #define __MEMORY_SCOPE_SINGLE 4
+// RISCV64: #define __MEMORY_SCOPE_SYSTEM 0
+// RISCV64: #define __MEMORY_SCOPE_WRKGRP 2
+// RISCV64: #define __MEMORY_SCOPE_WVFRNT 3
 // RISCV64: #define __NO_INLINE__ 1
 // RISCV64: #define __POINTER_WIDTH__ 64
 // RISCV64: #define __PRAGMA_REDEFINE_EXTNAME 1
diff --git a/clang/test/Preprocessor/predefined-macros-hlsl.hlsl b/clang/test/Preprocessor/predefined-macros-hlsl.hlsl
index 251362cd03c0..cc5233fbcb2a 100644
--- a/clang/test/Preprocessor/predefined-macros-hlsl.hlsl
+++ b/clang/test/Preprocessor/predefined-macros-hlsl.hlsl
@@ -1,14 +1,19 @@
-// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-amplification | FileCheck -match-full-lines %s --check-prefixes=CHECK,AMPLIFICATION
-// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-compute | FileCheck -match-full-lines %s --check-prefixes=CHECK,COMPUTE
-// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-domain | FileCheck -match-full-lines %s --check-prefixes=CHECK,DOMAIN
-// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-geometry | FileCheck -match-full-lines %s --check-prefixes=CHECK,GEOMETRY
-// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-hull | FileCheck -match-full-lines %s --check-prefixes=CHECK,HULL
-// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-library | FileCheck -match-full-lines %s --check-prefixes=CHECK,LIBRARY
-// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-mesh | FileCheck -match-full-lines %s --check-prefixes=CHECK,MESH
-// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-pixel | FileCheck -match-full-lines %s --check-prefixes=CHECK,PIXEL
-// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-vertex | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX
+// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-amplification | FileCheck -match-full-lines %s --check-prefixes=CHECK,AMPLIFICATION,NOHALF
+// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-compute | FileCheck -match-full-lines %s --check-prefixes=CHECK,COMPUTE,NOHALF
+// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-domain | FileCheck -match-full-lines %s --check-prefixes=CHECK,DOMAIN,NOHALF
+// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-geometry | FileCheck -match-full-lines %s --check-prefixes=CHECK,GEOMETRY,NOHALF
+// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-hull | FileCheck -match-full-lines %s --check-prefixes=CHECK,HULL,NOHALF
+// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-library | FileCheck -match-full-lines %s --check-prefixes=CHECK,LIBRARY,NOHALF
+// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-mesh | FileCheck -match-full-lines %s --check-prefixes=CHECK,MESH,NOHALF
+// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-pixel | FileCheck -match-full-lines %s --check-prefixes=CHECK,PIXEL,NOHALF
+// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.0-vertex | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,NOHALF
+// RUN: %clang_cc1 %s -E -dM -o - -triple dxil-pc-shadermodel6.3-vertex -fnative-half-type | FileCheck -match-full-lines %s --check-prefixes=CHECK,VERTEX,HALF
+
+// HALF: #define __HLSL_ENABLE_16_BIT 1
+// NOHALF-NOT: __HLSL_ENABLE_16_BIT
 
 // CHECK: #define __HLSL_VERSION 2021
+
 // CHECK: #define __SHADER_STAGE_AMPLIFICATION 14
 // CHECK: #define __SHADER_STAGE_COMPUTE 5
 // CHECK: #define __SHADER_STAGE_DOMAIN 4
diff --git a/clang/test/Preprocessor/predefined-win-macros.c b/clang/test/Preprocessor/predefined-win-macros.c
index b830dc39d477..14e2f584bd09 100644
--- a/clang/test/Preprocessor/predefined-win-macros.c
+++ b/clang/test/Preprocessor/predefined-win-macros.c
@@ -3,7 +3,7 @@
 // RUN: %clang_cc1 %s -x c++ -E -dM -triple x86_64-pc-win32 -fms-extensions -fms-compatibility \
 // RUN:     -fms-compatibility-version=19.00 -std=c++14 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS64
 // RUN: %clang_cc1 %s -x c++ -E -dM -triple x86_64-pc-win32 -fms-extensions -fms-compatibility \
-// RUN:     -fms-compatibility-version=19.00 -std=c++14 -o - | grep GCC | count 5
+// RUN:     -fms-compatibility-version=19.00 -std=c++14 -o - | grep GCC | count 7
 // CHECK-MS64: #define _INTEGRAL_MAX_BITS 64
 // CHECK-MS64: #define _ISO_VOLATILE 1
 // CHECK-MS64: #define _MSC_EXTENSIONS 1
@@ -26,7 +26,7 @@
 // RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \
 // RUN:     -fms-compatibility-version=19.00 -std=c++17 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS
 // RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \
-// RUN:     -fms-compatibility-version=19.00 -std=c++17 -o - | grep GCC | count 5
+// RUN:     -fms-compatibility-version=19.00 -std=c++17 -o - | grep GCC | count 7
 // CHECK-MS: #define _INTEGRAL_MAX_BITS 64
 // CHECK-MS: #define _ISO_VOLATILE 1
 // CHECK-MS: #define _MSC_EXTENSIONS 1
@@ -39,6 +39,8 @@
 // CHECK-MS-NOT: GNU
 // CHECK-MS-NOT: GXX
 // CHECK-MS: #define __GCC_ASM_FLAG_OUTPUTS__ 1
+// CHECK-MS: #define __GCC_CONSTRUCTIVE_SIZE {{.+}}
+// CHECK-MS: #define __GCC_DESTRUCTIVE_SIZE {{.+}}
 // CHECK-MS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
 // CHECK-MS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
 // CHECK-MS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
diff --git a/clang/test/Preprocessor/wasm-target-features.c b/clang/test/Preprocessor/wasm-target-features.c
index 32e24ad1b716..72ecc60a6e78 100644
--- a/clang/test/Preprocessor/wasm-target-features.c
+++ b/clang/test/Preprocessor/wasm-target-features.c
@@ -44,6 +44,15 @@
 // EXTENDED-CONST: #define __wasm_extended_const__ 1{{$}}
 
 // RUN: %clang -E -dM %s -o - 2>&1 \
+// RUN:     -target wasm32-unknown-unknown -mhalf-precision \
+// RUN:   | FileCheck %s -check-prefix=HALF-PRECISION
+// RUN: %clang -E -dM %s -o - 2>&1 \
+// RUN:     -target wasm64-unknown-unknown -mhalf-precision \
+// RUN:   | FileCheck %s -check-prefix=HALF-PRECISION
+//
+// HALF-PRECISION: #define __wasm_half_precision__ 1{{$}}
+
+// RUN: %clang -E -dM %s -o - 2>&1 \
 // RUN:     -target wasm32-unknown-unknown -mmultimemory \
 // RUN:   | FileCheck %s -check-prefix=MULTIMEMORY
 // RUN: %clang -E -dM %s -o - 2>&1 \
@@ -135,6 +144,7 @@
 // MVP-NOT: #define __wasm_bulk_memory__ 1{{$}}
 // MVP-NOT: #define __wasm_exception_handling__ 1{{$}}
 // MVP-NOT: #define __wasm_extended_const__ 1{{$}}
+// MVP-NOT: #define __wasm_half_precision__ 1{{$}}
 // MVP-NOT: #define __wasm_multimemory__ 1{{$}}
 // MVP-NOT: #define __wasm_multivalue__ 1{{$}}
 // MVP-NOT: #define __wasm_mutable_globals__ 1{{$}}
@@ -152,7 +162,9 @@
 // RUN:     -target wasm64-unknown-unknown -mcpu=generic \
 // RUN:   | FileCheck %s -check-prefix=GENERIC-INCLUDE
 //
+// GENERIC-INCLUDE-DAG: #define __wasm_multivalue__ 1{{$}}
 // GENERIC-INCLUDE-DAG: #define __wasm_mutable_globals__ 1{{$}}
+// GENERIC-INCLUDE-DAG: #define __wasm_reference_types__ 1{{$}}
 // GENERIC-INCLUDE-DAG: #define __wasm_sign_ext__ 1{{$}}
 //
 // RUN: %clang -E -dM %s -o - 2>&1 \
@@ -166,10 +178,9 @@
 // GENERIC-NOT: #define __wasm_bulk_memory__ 1{{$}}
 // GENERIC-NOT: #define __wasm_exception_handling__ 1{{$}}
 // GENERIC-NOT: #define __wasm_extended_const__ 1{{$}}
+// GENERIC-NOT: #define __wasm_half_precision__ 1{{$}}
 // GENERIC-NOT: #define __wasm_multimemory__ 1{{$}}
-// GENERIC-NOT: #define __wasm_multivalue__ 1{{$}}
 // GENERIC-NOT: #define __wasm_nontrapping_fptoint__ 1{{$}}
-// GENERIC-NOT: #define __wasm_reference_types__ 1{{$}}
 // GENERIC-NOT: #define __wasm_relaxed_simd__ 1{{$}}
 // GENERIC-NOT: #define __wasm_simd128__ 1{{$}}
 // GENERIC-NOT: #define __wasm_tail_call__ 1{{$}}
@@ -183,7 +194,9 @@
 //
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_atomics__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_bulk_memory__ 1{{$}}
+// BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_half_precision__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_multimemory__ 1{{$}}
+// BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_multivalue__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_mutable_globals__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_nontrapping_fptoint__ 1{{$}}
 // BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_reference_types__ 1{{$}}
@@ -200,7 +213,6 @@
 //
 // BLEEDING-EDGE-NOT: #define __wasm_exception_handling__ 1{{$}}
 // BLEEDING-EDGE-NOT: #define __wasm_extended_const__ 1{{$}}
-// BLEEDING-EDGE-NOT: #define __wasm_multivalue__ 1{{$}}
 // BLEEDING-EDGE-NOT: #define __wasm_relaxed_simd__ 1{{$}}
 
 // RUN: %clang -E -dM %s -o - 2>&1 \
diff --git a/clang/test/Sema/constant_builtins_vector.cpp b/clang/test/Sema/constant_builtins_vector.cpp
new file mode 100644
index 000000000000..ddb78696ce62
--- /dev/null
+++ b/clang/test/Sema/constant_builtins_vector.cpp
@@ -0,0 +1,725 @@
+// RUN: %clang_cc1 -verify -std=c++2a -fsyntax-only -Wno-bit-int-extension %s
+// RUN: %clang_cc1 -verify -std=c++2a -fsyntax-only -Wno-bit-int-extension -triple ppc64-unknown-linux %s
+// RUN: %clang_cc1 -verify -std=c++2a -fsyntax-only -Wno-bit-int-extension -triple ppc64le-unknown-linux %s
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define LITTLE_END 1
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define LITTLE_END 0
+#else
+#error "huh?"
+#endif
+
+// We also support _BitInt as long as it is >=8 and a power of 2.
+typedef _BitInt(8) BitInt8;
+typedef _BitInt(32) BitInt32;
+typedef _BitInt(128) BitInt128;
+
+typedef double vector4double __attribute__((__vector_size__(32)));
+typedef float vector4float __attribute__((__vector_size__(16)));
+typedef long long vector4long __attribute__((__vector_size__(32)));
+typedef int vector4int __attribute__((__vector_size__(16)));
+typedef short vector4short __attribute__((__vector_size__(8)));
+typedef char vector4char __attribute__((__vector_size__(4)));
+typedef BitInt8 vector4BitInt8 __attribute__((__vector_size__(4)));
+typedef BitInt32 vector4BitInt32 __attribute__((__vector_size__(16)));
+typedef BitInt128 vector4BitInt128 __attribute__((__vector_size__(64)));
+typedef double vector8double __attribute__((__vector_size__(64)));
+typedef float vector8float __attribute__((__vector_size__(32)));
+typedef long long vector8long __attribute__((__vector_size__(64)));
+typedef int vector8int __attribute__((__vector_size__(32)));
+typedef short vector8short __attribute__((__vector_size__(16)));
+typedef char vector8char __attribute__((__vector_size__(8)));
+typedef BitInt8 vector8BitInt8 __attribute__((__vector_size__(8)));
+typedef BitInt32 vector8BitInt32 __attribute__((__vector_size__(32)));
+typedef BitInt128 vector8BitInt128 __attribute__((__vector_size__(128)));
+
+#define CHECK_NUM(__size, __typeFrom, __typeTo, ...)                            \
+  constexpr vector##__size##__typeTo                                            \
+      from_##vector##__size##__typeFrom##_to_##vector##__size##__typeTo##_var = \
+          __builtin_convertvector((vector##__size##__typeFrom){__VA_ARGS__},    \
+                                  vector##__size##__typeTo);
+#define CHECK_TO_ALL_TYPES(__size, __typeFrom, ...)                            \
+  CHECK_NUM(__size, __typeFrom, double, __VA_ARGS__)                           \
+  CHECK_NUM(__size, __typeFrom, float, __VA_ARGS__)                            \
+  CHECK_NUM(__size, __typeFrom, long, __VA_ARGS__)                             \
+  CHECK_NUM(__size, __typeFrom, int, __VA_ARGS__)                              \
+  CHECK_NUM(__size, __typeFrom, short, __VA_ARGS__)                            \
+  CHECK_NUM(__size, __typeFrom, char, __VA_ARGS__)                             \
+  CHECK_NUM(__size, __typeFrom, BitInt8, __VA_ARGS__)                          \
+  CHECK_NUM(__size, __typeFrom, BitInt32, __VA_ARGS__)                         \
+  CHECK_NUM(__size, __typeFrom, BitInt128, __VA_ARGS__)                        \
+  static_assert(                                                               \
+      __builtin_bit_cast(                                                      \
+          unsigned,                                                            \
+          __builtin_shufflevector(                                             \
+              from_vector##__size##__typeFrom##_to_vector##__size##char_var,   \
+              from_vector##__size##__typeFrom##_to_vector##__size##char_var,   \
+              0, 1, 2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));         \
+  static_assert(                                                               \
+      __builtin_bit_cast(                                                      \
+          unsigned long long,                                                  \
+          __builtin_shufflevector(                                             \
+              from_vector##__size##__typeFrom##_to_vector##__size##short_var,  \
+              from_vector##__size##__typeFrom##_to_vector##__size##short_var,  \
+              0, 1, 2, 3)) ==                                                  \
+      (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+
+#define CHECK_ALL_COMBINATIONS(__size, ...)                                    \
+  CHECK_TO_ALL_TYPES(__size, double, __VA_ARGS__)                              \
+  CHECK_TO_ALL_TYPES(__size, float, __VA_ARGS__)                               \
+  CHECK_TO_ALL_TYPES(__size, long, __VA_ARGS__)                                \
+  CHECK_TO_ALL_TYPES(__size, int, __VA_ARGS__)                                 \
+  CHECK_TO_ALL_TYPES(__size, short, __VA_ARGS__)                               \
+  CHECK_TO_ALL_TYPES(__size, char, __VA_ARGS__)                                \
+  CHECK_TO_ALL_TYPES(__size, BitInt8, __VA_ARGS__)                             \
+  CHECK_TO_ALL_TYPES(__size, BitInt32, __VA_ARGS__)                            \
+  CHECK_TO_ALL_TYPES(__size, BitInt128, __VA_ARGS__)
+
+// The result below is expanded from these macros. Use them to autogenerate the
+// test cases below.
+// CHECK_ALL_COMBINATIONS(4, 0, 1, 2, 3);
+// CHECK_ALL_COMBINATIONS(8, 0, 1, 2, 3, 4, 5, 6, 7);
+
+constexpr vector4double from_vector4double_to_vector4double_var =
+    __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4double_to_vector4float_var =
+    __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4double_to_vector4long_var =
+    __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4double_to_vector4int_var =
+    __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4double_to_vector4short_var =
+    __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4double_to_vector4char_var =
+    __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4double_to_vector4BitInt8_var =
+    __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4double_to_vector4BitInt32_var =
+    __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4double_to_vector4BitInt128_var =
+    __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(
+                  unsigned,
+                  __builtin_shufflevector(from_vector4double_to_vector4char_var,
+                                          from_vector4double_to_vector4char_var,
+                                          0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+                                 __builtin_shufflevector(
+                                     from_vector4double_to_vector4short_var,
+                                     from_vector4double_to_vector4short_var, 0,
+                                     1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4float_to_vector4double_var =
+    __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4float_to_vector4float_var =
+    __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4float_to_vector4long_var =
+    __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4float_to_vector4int_var =
+    __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4float_to_vector4short_var =
+    __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4float_to_vector4char_var =
+    __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4float_to_vector4BitInt8_var =
+    __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4float_to_vector4BitInt32_var =
+    __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4float_to_vector4BitInt128_var =
+    __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector4float_to_vector4char_var,
+                                     from_vector4float_to_vector4char_var, 0, 1,
+                                     2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+                  unsigned long long,
+                  __builtin_shufflevector(from_vector4float_to_vector4short_var,
+                                          from_vector4float_to_vector4short_var,
+                                          0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4long_to_vector4double_var =
+    __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4long_to_vector4float_var =
+    __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4long_to_vector4long_var =
+    __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4long_to_vector4int_var =
+    __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4long_to_vector4short_var =
+    __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4long_to_vector4char_var =
+    __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4long_to_vector4BitInt8_var =
+    __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4long_to_vector4BitInt32_var =
+    __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4long_to_vector4BitInt128_var =
+    __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector4long_to_vector4char_var,
+                                     from_vector4long_to_vector4char_var, 0, 1,
+                                     2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+                  unsigned long long,
+                  __builtin_shufflevector(from_vector4long_to_vector4short_var,
+                                          from_vector4long_to_vector4short_var,
+                                          0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4int_to_vector4double_var =
+    __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4int_to_vector4float_var =
+    __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4int_to_vector4long_var =
+    __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4int_to_vector4int_var =
+    __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4int_to_vector4short_var =
+    __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4int_to_vector4char_var =
+    __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4int_to_vector4BitInt8_var =
+    __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4int_to_vector4BitInt32_var =
+    __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4int_to_vector4BitInt128_var =
+    __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector4int_to_vector4char_var,
+                                     from_vector4int_to_vector4char_var, 0, 1,
+                                     2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+                  unsigned long long,
+                  __builtin_shufflevector(from_vector4int_to_vector4short_var,
+                                          from_vector4int_to_vector4short_var,
+                                          0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4short_to_vector4double_var =
+    __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4short_to_vector4float_var =
+    __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4short_to_vector4long_var =
+    __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4short_to_vector4int_var =
+    __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4short_to_vector4short_var =
+    __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4short_to_vector4char_var =
+    __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4short_to_vector4BitInt8_var =
+    __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4short_to_vector4BitInt32_var =
+    __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4short_to_vector4BitInt128_var =
+    __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector4short_to_vector4char_var,
+                                     from_vector4short_to_vector4char_var, 0, 1,
+                                     2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+                  unsigned long long,
+                  __builtin_shufflevector(from_vector4short_to_vector4short_var,
+                                          from_vector4short_to_vector4short_var,
+                                          0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4char_to_vector4double_var =
+    __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4char_to_vector4float_var =
+    __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4char_to_vector4long_var =
+    __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4char_to_vector4int_var =
+    __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4char_to_vector4short_var =
+    __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4char_to_vector4char_var =
+    __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4char_to_vector4BitInt8_var =
+    __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4char_to_vector4BitInt32_var =
+    __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4char_to_vector4BitInt128_var =
+    __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector4char_to_vector4char_var,
+                                     from_vector4char_to_vector4char_var, 0, 1,
+                                     2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+                  unsigned long long,
+                  __builtin_shufflevector(from_vector4char_to_vector4short_var,
+                                          from_vector4char_to_vector4short_var,
+                                          0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4BitInt8_to_vector4double_var =
+    __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4BitInt8_to_vector4float_var =
+    __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4BitInt8_to_vector4long_var =
+    __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4BitInt8_to_vector4int_var =
+    __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4BitInt8_to_vector4short_var =
+    __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4BitInt8_to_vector4char_var =
+    __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4BitInt8_to_vector4BitInt8_var =
+    __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4BitInt8_to_vector4BitInt32_var =
+    __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4BitInt8_to_vector4BitInt128_var =
+    __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector4BitInt8_to_vector4char_var,
+                                     from_vector4BitInt8_to_vector4char_var, 0,
+                                     1, 2, 3)) ==
+              (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+                                 __builtin_shufflevector(
+                                     from_vector4BitInt8_to_vector4short_var,
+                                     from_vector4BitInt8_to_vector4short_var, 0,
+                                     1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4BitInt32_to_vector4double_var =
+    __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4BitInt32_to_vector4float_var =
+    __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4BitInt32_to_vector4long_var =
+    __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4BitInt32_to_vector4int_var =
+    __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4BitInt32_to_vector4short_var =
+    __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4BitInt32_to_vector4char_var =
+    __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4BitInt32_to_vector4BitInt8_var =
+    __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4BitInt32_to_vector4BitInt32_var =
+    __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4BitInt32_to_vector4BitInt128_var =
+    __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector4BitInt32_to_vector4char_var,
+                                     from_vector4BitInt32_to_vector4char_var, 0,
+                                     1, 2, 3)) ==
+              (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+                                 __builtin_shufflevector(
+                                     from_vector4BitInt32_to_vector4short_var,
+                                     from_vector4BitInt32_to_vector4short_var,
+                                     0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4BitInt128_to_vector4double_var =
+    __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4BitInt128_to_vector4float_var =
+    __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4BitInt128_to_vector4long_var =
+    __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4BitInt128_to_vector4int_var =
+    __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4BitInt128_to_vector4short_var =
+    __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4BitInt128_to_vector4char_var =
+    __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4BitInt128_to_vector4BitInt8_var =
+    __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4BitInt128_to_vector4BitInt32_var =
+    __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4BitInt128_to_vector4BitInt128_var =
+    __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector4BitInt128_to_vector4char_var,
+                                     from_vector4BitInt128_to_vector4char_var,
+                                     0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+                                 __builtin_shufflevector(
+                                     from_vector4BitInt128_to_vector4short_var,
+                                     from_vector4BitInt128_to_vector4short_var,
+                                     0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+;
+constexpr vector8double from_vector8double_to_vector8double_var =
+    __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8double);
+constexpr vector8float from_vector8double_to_vector8float_var =
+    __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8float);
+constexpr vector8long from_vector8double_to_vector8long_var =
+    __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8long);
+constexpr vector8int from_vector8double_to_vector8int_var =
+    __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8int);
+constexpr vector8short from_vector8double_to_vector8short_var =
+    __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8short);
+constexpr vector8char from_vector8double_to_vector8char_var =
+    __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8char);
+constexpr vector8BitInt8 from_vector8double_to_vector8BitInt8_var =
+    __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt8);
+constexpr vector8BitInt32 from_vector8double_to_vector8BitInt32_var =
+    __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt32);
+constexpr vector8BitInt128 from_vector8double_to_vector8BitInt128_var =
+    __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt128);
+static_assert(__builtin_bit_cast(
+                  unsigned,
+                  __builtin_shufflevector(from_vector8double_to_vector8char_var,
+                                          from_vector8double_to_vector8char_var,
+                                          0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+                                 __builtin_shufflevector(
+                                     from_vector8double_to_vector8short_var,
+                                     from_vector8double_to_vector8short_var, 0,
+                                     1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8float_to_vector8double_var =
+    __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8double);
+constexpr vector8float from_vector8float_to_vector8float_var =
+    __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8float);
+constexpr vector8long from_vector8float_to_vector8long_var =
+    __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8long);
+constexpr vector8int from_vector8float_to_vector8int_var =
+    __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8float_to_vector8short_var =
+    __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8short);
+constexpr vector8char from_vector8float_to_vector8char_var =
+    __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8char);
+constexpr vector8BitInt8 from_vector8float_to_vector8BitInt8_var =
+    __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt8);
+constexpr vector8BitInt32 from_vector8float_to_vector8BitInt32_var =
+    __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt32);
+constexpr vector8BitInt128 from_vector8float_to_vector8BitInt128_var =
+    __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector8float_to_vector8char_var,
+                                     from_vector8float_to_vector8char_var, 0, 1,
+                                     2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+                  unsigned long long,
+                  __builtin_shufflevector(from_vector8float_to_vector8short_var,
+                                          from_vector8float_to_vector8short_var,
+                                          0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8long_to_vector8double_var =
+    __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8double);
+constexpr vector8float from_vector8long_to_vector8float_var =
+    __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8float);
+constexpr vector8long from_vector8long_to_vector8long_var =
+    __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8long);
+constexpr vector8int from_vector8long_to_vector8int_var =
+    __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8long_to_vector8short_var =
+    __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8short);
+constexpr vector8char from_vector8long_to_vector8char_var =
+    __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8char);
+constexpr vector8BitInt8 from_vector8long_to_vector8BitInt8_var =
+    __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt8);
+constexpr vector8BitInt32 from_vector8long_to_vector8BitInt32_var =
+    __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt32);
+constexpr vector8BitInt128 from_vector8long_to_vector8BitInt128_var =
+    __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector8long_to_vector8char_var,
+                                     from_vector8long_to_vector8char_var, 0, 1,
+                                     2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+                  unsigned long long,
+                  __builtin_shufflevector(from_vector8long_to_vector8short_var,
+                                          from_vector8long_to_vector8short_var,
+                                          0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8int_to_vector8double_var =
+    __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8double);
+constexpr vector8float from_vector8int_to_vector8float_var =
+    __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8float);
+constexpr vector8long from_vector8int_to_vector8long_var =
+    __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8long);
+constexpr vector8int from_vector8int_to_vector8int_var =
+    __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8int_to_vector8short_var =
+    __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8short);
+constexpr vector8char from_vector8int_to_vector8char_var =
+    __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8char);
+constexpr vector8BitInt8 from_vector8int_to_vector8BitInt8_var =
+    __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt8);
+constexpr vector8BitInt32 from_vector8int_to_vector8BitInt32_var =
+    __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt32);
+constexpr vector8BitInt128 from_vector8int_to_vector8BitInt128_var =
+    __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector8int_to_vector8char_var,
+                                     from_vector8int_to_vector8char_var, 0, 1,
+                                     2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+                  unsigned long long,
+                  __builtin_shufflevector(from_vector8int_to_vector8short_var,
+                                          from_vector8int_to_vector8short_var,
+                                          0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8short_to_vector8double_var =
+    __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8double);
+constexpr vector8float from_vector8short_to_vector8float_var =
+    __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8float);
+constexpr vector8long from_vector8short_to_vector8long_var =
+    __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8long);
+constexpr vector8int from_vector8short_to_vector8int_var =
+    __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8short_to_vector8short_var =
+    __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8short);
+constexpr vector8char from_vector8short_to_vector8char_var =
+    __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8char);
+constexpr vector8BitInt8 from_vector8short_to_vector8BitInt8_var =
+    __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt8);
+constexpr vector8BitInt32 from_vector8short_to_vector8BitInt32_var =
+    __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt32);
+constexpr vector8BitInt128 from_vector8short_to_vector8BitInt128_var =
+    __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector8short_to_vector8char_var,
+                                     from_vector8short_to_vector8char_var, 0, 1,
+                                     2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+                  unsigned long long,
+                  __builtin_shufflevector(from_vector8short_to_vector8short_var,
+                                          from_vector8short_to_vector8short_var,
+                                          0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8char_to_vector8double_var =
+    __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8double);
+constexpr vector8float from_vector8char_to_vector8float_var =
+    __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8float);
+constexpr vector8long from_vector8char_to_vector8long_var =
+    __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8long);
+constexpr vector8int from_vector8char_to_vector8int_var =
+    __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8char_to_vector8short_var =
+    __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8short);
+constexpr vector8char from_vector8char_to_vector8char_var =
+    __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8char);
+constexpr vector8BitInt8 from_vector8char_to_vector8BitInt8_var =
+    __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt8);
+constexpr vector8BitInt32 from_vector8char_to_vector8BitInt32_var =
+    __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt32);
+constexpr vector8BitInt128 from_vector8char_to_vector8BitInt128_var =
+    __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector8char_to_vector8char_var,
+                                     from_vector8char_to_vector8char_var, 0, 1,
+                                     2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+                  unsigned long long,
+                  __builtin_shufflevector(from_vector8char_to_vector8short_var,
+                                          from_vector8char_to_vector8short_var,
+                                          0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8BitInt8_to_vector8double_var =
+    __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8double);
+constexpr vector8float from_vector8BitInt8_to_vector8float_var =
+    __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8float);
+constexpr vector8long from_vector8BitInt8_to_vector8long_var =
+    __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8long);
+constexpr vector8int from_vector8BitInt8_to_vector8int_var =
+    __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8int);
+constexpr vector8short from_vector8BitInt8_to_vector8short_var =
+    __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8short);
+constexpr vector8char from_vector8BitInt8_to_vector8char_var =
+    __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8char);
+constexpr vector8BitInt8 from_vector8BitInt8_to_vector8BitInt8_var =
+    __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt8);
+constexpr vector8BitInt32 from_vector8BitInt8_to_vector8BitInt32_var =
+    __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt32);
+constexpr vector8BitInt128 from_vector8BitInt8_to_vector8BitInt128_var =
+    __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector8BitInt8_to_vector8char_var,
+                                     from_vector8BitInt8_to_vector8char_var, 0,
+                                     1, 2, 3)) ==
+              (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+                                 __builtin_shufflevector(
+                                     from_vector8BitInt8_to_vector8short_var,
+                                     from_vector8BitInt8_to_vector8short_var, 0,
+                                     1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8BitInt32_to_vector8double_var =
+    __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8double);
+constexpr vector8float from_vector8BitInt32_to_vector8float_var =
+    __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8float);
+constexpr vector8long from_vector8BitInt32_to_vector8long_var =
+    __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8long);
+constexpr vector8int from_vector8BitInt32_to_vector8int_var =
+    __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8int);
+constexpr vector8short from_vector8BitInt32_to_vector8short_var =
+    __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8short);
+constexpr vector8char from_vector8BitInt32_to_vector8char_var =
+    __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8char);
+constexpr vector8BitInt8 from_vector8BitInt32_to_vector8BitInt8_var =
+    __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt8);
+constexpr vector8BitInt32 from_vector8BitInt32_to_vector8BitInt32_var =
+    __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt32);
+constexpr vector8BitInt128 from_vector8BitInt32_to_vector8BitInt128_var =
+    __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector8BitInt32_to_vector8char_var,
+                                     from_vector8BitInt32_to_vector8char_var, 0,
+                                     1, 2, 3)) ==
+              (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+                                 __builtin_shufflevector(
+                                     from_vector8BitInt32_to_vector8short_var,
+                                     from_vector8BitInt32_to_vector8short_var,
+                                     0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8BitInt128_to_vector8double_var =
+    __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8double);
+constexpr vector8float from_vector8BitInt128_to_vector8float_var =
+    __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8float);
+constexpr vector8long from_vector8BitInt128_to_vector8long_var =
+    __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8long);
+constexpr vector8int from_vector8BitInt128_to_vector8int_var =
+    __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8int);
+constexpr vector8short from_vector8BitInt128_to_vector8short_var =
+    __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8short);
+constexpr vector8char from_vector8BitInt128_to_vector8char_var =
+    __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8char);
+constexpr vector8BitInt8 from_vector8BitInt128_to_vector8BitInt8_var =
+    __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt8);
+constexpr vector8BitInt32 from_vector8BitInt128_to_vector8BitInt32_var =
+    __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt32);
+constexpr vector8BitInt128 from_vector8BitInt128_to_vector8BitInt128_var =
+    __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+                            vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+                                 __builtin_shufflevector(
+                                     from_vector8BitInt128_to_vector8char_var,
+                                     from_vector8BitInt128_to_vector8char_var,
+                                     0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+                                 __builtin_shufflevector(
+                                     from_vector8BitInt128_to_vector8short_var,
+                                     from_vector8BitInt128_to_vector8short_var,
+                                     0, 1, 2, 3)) ==
+              (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+;
+#undef CHECK_ALL_COMBINATIONS
+#undef CHECK_TO_ALL_TYPES
+#undef CHECK_NUM
+
+// Shuffle vector
+constexpr vector4char vector4charConst1 = {0, 1, 2, 3};
+constexpr vector4char vector4charConst2 = {4, 5, 6, 7};
+constexpr vector8char vector8intConst = {8, 9, 10, 11, 12, 13, 14, 15};
+
+constexpr vector4char vectorShuffle1 =
+    __builtin_shufflevector(vector4charConst1, vector4charConst2, 0, 1, 2, 3);
+static_assert(__builtin_bit_cast(unsigned, vectorShuffle1) ==
+              (LITTLE_END ? 0x03020100 : 0x00010203));
+constexpr vector4char vectorShuffle2 =
+    __builtin_shufflevector(vector4charConst1, vector4charConst2, 4, 5, 6, 7);
+static_assert(__builtin_bit_cast(unsigned, vectorShuffle2) ==
+              (LITTLE_END ? 0x07060504 : 0x04050607));
+constexpr vector4char vectorShuffle3 =
+    __builtin_shufflevector(vector4charConst1, vector4charConst2, 0, 2, 4, 6);
+static_assert(__builtin_bit_cast(unsigned, vectorShuffle3) ==
+              (LITTLE_END ? 0x06040200 : 0x00020406));
+constexpr vector8char vectorShuffle4 = __builtin_shufflevector(
+    vector8intConst, vector8intConst, 0, 2, 4, 6, 8, 10, 12, 14);
+static_assert(__builtin_bit_cast(unsigned long long, vectorShuffle4) ==
+              (LITTLE_END ? 0x0E0C0A080E0C0A08 : 0x080A0C0E080A0C0E));
+constexpr vector4char vectorShuffle5 =
+    __builtin_shufflevector(vector8intConst, vector8intConst, 0, 2, 4, 6);
+static_assert(__builtin_bit_cast(unsigned, vectorShuffle5) ==
+              (LITTLE_END ? 0x0E0C0A08 : 0x080A0C0E));
+constexpr vector8char vectorShuffle6 = __builtin_shufflevector(
+    vector4charConst1, vector4charConst2, 0, 2, 4, 6, 1, 3, 5, 7);
+static_assert(__builtin_bit_cast(unsigned long long, vectorShuffle6) ==
+              (LITTLE_END ? 0x0705030106040200 : 0x0002040601030507));
+
+constexpr vector4char
+    vectorShuffleFail1 = // expected-error {{constexpr variable 'vectorShuffleFail1'\
+ must be initialized by a constant expression}}
+    __builtin_shufflevector( // expected-error {{index for __builtin_shufflevector \
+not within the bounds of the input vectors; index of -1 found at position 0 not \
+permitted in a constexpr context.}}
+        vector4charConst1,
+        vector4charConst2, -1, -1, -1, -1);
diff --git a/clang/test/Sema/convertvector.c b/clang/test/Sema/convertvector.c
index 8ae43c3ba3d4..1ff04af90981 100644
--- a/clang/test/Sema/convertvector.c
+++ b/clang/test/Sema/convertvector.c
@@ -15,3 +15,6 @@ vector8float foo3(double x) {
   return __builtin_convertvector(x, vector8float);  // expected-error {{must be a vector}}
 }
 
+float foo4(float x) {
+  return __builtin_convertvector(x, float); // expected-error {{first argument to __builtin_convertvector must be a vector}}
+}
diff --git a/clang/test/Sema/zvector.c b/clang/test/Sema/zvector.c
index 798b12bd50b7..900c39adc2a3 100644
--- a/clang/test/Sema/zvector.c
+++ b/clang/test/Sema/zvector.c
@@ -22,6 +22,10 @@ vector double fd, fd2;
 
 vector long ll; // expected-error {{cannot use 'long' with '__vector'}}
 vector float ff; // expected-error {{cannot use 'float' with '__vector'}}
+vector long double ld; // expected-error {{cannot use 'long double' with '__vector'}}
+vector float _Complex cf; // expected-error {{cannot use '_Complex' with '__vector'}}
+vector double _Complex cd; // expected-error {{cannot use '_Complex' with '__vector'}}
+vector long double _Complex cld; // expected-error {{cannot use '_Complex' with '__vector'}}
 
 signed char sc_scalar;
 unsigned char uc_scalar;
@@ -53,6 +57,10 @@ __vector bool long long bl3;
 __vector double fd3;
 __vector long ll3; // expected-error {{cannot use 'long' with '__vector'}}
 __vector float ff3; // expected-error {{cannot use 'float' with '__vector'}}
+__vector long double ld3; // expected-error {{cannot use 'long double' with '__vector'}}
+__vector float _Complex cf3; // expected-error {{cannot use '_Complex' with '__vector'}}
+__vector double _Complex cd3; // expected-error {{cannot use '_Complex' with '__vector'}}
+__vector long double _Complex cld3; // expected-error {{cannot use '_Complex' with '__vector'}}
 
 // Likewise for __bool
 vector __bool char bc4;
diff --git a/clang/test/Sema/zvector2.c b/clang/test/Sema/zvector2.c
index a4e0a9e2c3f1..8a440374f335 100644
--- a/clang/test/Sema/zvector2.c
+++ b/clang/test/Sema/zvector2.c
@@ -25,6 +25,10 @@ vector float ff, ff2;
 // Verify that __vector is also recognized
 __vector float ff3;
 
+// With z14 we support vector float, but still no vector _Complex float.
+vector float _Complex cf; // expected-error {{cannot use '_Complex' with '__vector'}}
+__vector float _Complex cf3; // expected-error {{cannot use '_Complex' with '__vector'}}
+
 // Verify operation of vec_step
 int res_ff[vec_step(ff) == 4 ? 1 : -1];
 
diff --git a/clang/test/SemaCXX/PR68885.cpp b/clang/test/SemaCXX/PR68885.cpp
new file mode 100644
index 000000000000..4ff95771e2ca
--- /dev/null
+++ b/clang/test/SemaCXX/PR68885.cpp
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -verify -std=c++20 -fsyntax-only %s
+
+// expected-no-diagnostics
+
+template <decltype(auto) a>
+struct S {
+    static constexpr int i = 42;
+};
+
+template <decltype(auto) a> requires true
+struct S<a> {
+    static constexpr int i = 0;
+};
+
+static constexpr int a = 0;
+
+void test() {
+    static_assert(S<a>::i == 0);
+    static_assert(S<(a)>::i == 0);
+    static_assert(S<((a))>::i == 0);
+}
diff --git a/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp b/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp
index 2f067ea53a50..90404f115c75 100644
--- a/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp
+++ b/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp
@@ -12,14 +12,19 @@ namespace std {
     size_t n;
     initializer_list();
   };
-  // FIXME: This should probably not be necessary.
-  template<typename T> initializer_list(initializer_list<T>) -> initializer_list<T>;
 }
 
 template<typename T> constexpr bool has_type(...) { return false; }
 template<typename T> constexpr bool has_type(T&) { return true; }
 
-std::initializer_list il = {1, 2, 3, 4, 5};
+std::initializer_list il1 = {1, 2, 3, 4, 5};
+auto il2 = std::initializer_list{1, 2, 3, 4};
+auto il3 = std::initializer_list{il1};
+auto il4 = std::initializer_list{il1, il1, il1};
+static_assert(has_type<std::initializer_list<int>>(il1));
+static_assert(has_type<std::initializer_list<int>>(il2));
+static_assert(has_type<std::initializer_list<int>>(il3));
+static_assert(has_type<std::initializer_list<std::initializer_list<int>>>(il4));
 
 template<typename T> struct vector {
   template<typename Iter> vector(Iter, Iter);
diff --git a/clang/test/SemaCXX/cxx2a-consteval.cpp b/clang/test/SemaCXX/cxx2a-consteval.cpp
index 192621225a54..e19807437207 100644
--- a/clang/test/SemaCXX/cxx2a-consteval.cpp
+++ b/clang/test/SemaCXX/cxx2a-consteval.cpp
@@ -260,6 +260,26 @@ int(*test)(int)  = l1;
 
 }
 
+namespace consteval_lambda_in_template {
+struct S {
+    int *value;
+    constexpr S(int v) : value(new int {v}) {}
+    constexpr ~S() { delete value; }
+};
+consteval S fn() { return S(5); }
+
+template <typename T>
+void fn2() {
+    (void)[]() consteval -> int {
+      return *(fn().value);  // OK, immediate context
+    };
+}
+
+void caller() {
+    fn2<int>();
+}
+}
+
 namespace std {
 
 template <typename T> struct remove_reference { using type = T; };
diff --git a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp
index 606715e6aacf..a3e5a0931491 100644
--- a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp
+++ b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp
@@ -160,3 +160,37 @@ namespace GH88929 {
     using E = P...[0]; // expected-error {{unknown type name 'P'}} \
                        // expected-error {{expected ';' after alias declaration}}
 }
+
+namespace GH88925 {
+template <typename...> struct S {};
+
+template <auto...> struct W {};
+
+template <int...> struct sequence {};
+
+template <typename... args, int... indices> auto f(sequence<indices...>) {
+  return S<args...[indices]...>(); // #use
+}
+
+template <auto... args, int... indices> auto g(sequence<indices...>) {
+  return W<args...[indices]...>(); // #nttp-use
+}
+
+void h() {
+  static_assert(__is_same(decltype(f<int>(sequence<0, 0>())), S<int, int>));
+  static_assert(__is_same(decltype(f<int, long>(sequence<0, 0>())), S<int, int>));
+  static_assert(__is_same(decltype(f<int, long>(sequence<0, 1>())), S<int, long>));
+  f<int, long>(sequence<3>());
+  // expected-error@#use {{invalid index 3 for pack 'args' of size 2}}}
+  // expected-note-re@-2 {{function template specialization '{{.*}}' requested here}}
+
+  struct foo {};
+  struct bar {};
+  struct baz {};
+
+  static_assert(__is_same(decltype(g<foo{}, bar{}, baz{}>(sequence<0, 2, 1>())), W<foo{}, baz{}, bar{}>));
+  g<foo{}>(sequence<4>());
+  // expected-error@#nttp-use {{invalid index 4 for pack args of size 1}}
+  // expected-note-re@-2 {{function template specialization '{{.*}}' requested here}}
+}
+}
diff --git a/clang/test/SemaCXX/destructor.cpp b/clang/test/SemaCXX/destructor.cpp
index beac50e449e9..028bc7cc1969 100644
--- a/clang/test/SemaCXX/destructor.cpp
+++ b/clang/test/SemaCXX/destructor.cpp
@@ -565,4 +565,16 @@ struct Foo : public Baz { // expected-error {{cannot override a non-deleted func
 };
 }
 
+namespace GH89544 {
+class Foo {
+  ~Foo() = {}
+  // expected-error@-1 {{initializer on function does not look like a pure-specifier}}
+  // expected-error@-2 {{expected ';' at end of declaration list}}
+};
+
+static_assert(!__is_trivially_constructible(Foo), "");
+static_assert(!__is_trivially_constructible(Foo, const Foo &), "");
+static_assert(!__is_trivially_constructible(Foo, Foo &&), "");
+} // namespace GH89544
+
 #endif // BE_THE_HEADER
diff --git a/clang/test/SemaCXX/identical-type-primary-partial-specialization.cpp b/clang/test/SemaCXX/identical-type-primary-partial-specialization.cpp
new file mode 100644
index 000000000000..ad51ca8252ef
--- /dev/null
+++ b/clang/test/SemaCXX/identical-type-primary-partial-specialization.cpp
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++17 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++20 %s
+
+template <decltype(auto) a>
+struct S { // expected-note {{previous definition is here}}
+    static constexpr int i = 42;
+};
+
+template <decltype(auto) a>
+struct S<a> { // expected-error {{class template partial specialization does not specialize any template argument; to define the primary template, remove the template argument list}} \
+              // expected-error {{redefinition of 'S'}}
+    static constexpr int i = 0;
+};
diff --git a/clang/test/SemaCXX/member-expr.cpp b/clang/test/SemaCXX/member-expr.cpp
index 75c9ef0caa2e..0596e40f6c2f 100644
--- a/clang/test/SemaCXX/member-expr.cpp
+++ b/clang/test/SemaCXX/member-expr.cpp
@@ -40,8 +40,8 @@ namespace C {
 }
 
 void test2(X *xp) {
-  xp->::i = 7; // expected-error{{qualified member access refers to a member in the global namespace}}
-  xp->C::i = 7; // expected-error{{qualified member access refers to a member in namespace 'C'}}
+  xp->::i = 7; // expected-error{{'i' is not a member of class 'X'}}
+  xp->C::i = 7; // expected-error{{'C::i' is not a member of class 'X'}}
 }
 
 
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index dee4a29bd2bf..01991887b284 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -2509,6 +2509,20 @@ void is_convertible()
   static_assert(__is_convertible(FloatWrapper, IntWrapper));
   static_assert(__is_convertible(FloatWrapper, float));
   static_assert(__is_convertible(float, FloatWrapper));
+  static_assert(__is_convertible(IntWrapper, IntWrapper&&));
+  static_assert(__is_convertible(IntWrapper, const IntWrapper&));
+  static_assert(__is_convertible(IntWrapper, int&&));
+  static_assert(__is_convertible(IntWrapper, const int&));
+  static_assert(__is_convertible(int, IntWrapper&&));
+  static_assert(__is_convertible(int, const IntWrapper&));
+  static_assert(__is_convertible(IntWrapper, FloatWrapper&&));
+  static_assert(__is_convertible(IntWrapper, const FloatWrapper&));
+  static_assert(__is_convertible(FloatWrapper, IntWrapper&&));
+  static_assert(__is_convertible(FloatWrapper, const IntWrapper&&));
+  static_assert(__is_convertible(FloatWrapper, float&&));
+  static_assert(__is_convertible(FloatWrapper, const float&));
+  static_assert(__is_convertible(float, FloatWrapper&&));
+  static_assert(__is_convertible(float, const FloatWrapper&));
 }
 
 void is_nothrow_convertible()
@@ -2521,6 +2535,20 @@ void is_nothrow_convertible()
   static_assert(!__is_nothrow_convertible(FloatWrapper, IntWrapper));
   static_assert(!__is_nothrow_convertible(FloatWrapper, float));
   static_assert(__is_nothrow_convertible(float, FloatWrapper));
+  static_assert(__is_nothrow_convertible(IntWrapper, IntWrapper&&));
+  static_assert(__is_nothrow_convertible(IntWrapper, const IntWrapper&));
+  static_assert(__is_nothrow_convertible(IntWrapper, int&&));
+  static_assert(__is_nothrow_convertible(IntWrapper, const int&));
+  static_assert(!__is_nothrow_convertible(int, IntWrapper&&));
+  static_assert(!__is_nothrow_convertible(int, const IntWrapper&));
+  static_assert(!__is_nothrow_convertible(IntWrapper, FloatWrapper&&));
+  static_assert(!__is_nothrow_convertible(IntWrapper, const FloatWrapper&));
+  static_assert(!__is_nothrow_convertible(FloatWrapper, IntWrapper&&));
+  static_assert(!__is_nothrow_convertible(FloatWrapper, const IntWrapper&));
+  static_assert(!__is_nothrow_convertible(FloatWrapper, float&&));
+  static_assert(!__is_nothrow_convertible(FloatWrapper, const float&));
+  static_assert(__is_nothrow_convertible(float, FloatWrapper&&));
+  static_assert(__is_nothrow_convertible(float, const FloatWrapper&));
 }
 
 struct FromInt { FromInt(int); };
diff --git a/clang/test/SemaCXX/unused.cpp b/clang/test/SemaCXX/unused.cpp
index 0af9e5b68b00..1f40c1b1ca90 100644
--- a/clang/test/SemaCXX/unused.cpp
+++ b/clang/test/SemaCXX/unused.cpp
@@ -102,11 +102,21 @@ namespace PR33839 {
     for (auto [x] : a) { // expected-warning {{unused variable '[x]'}}
     }
   }
-  void use() { 
+  void use() {
     f<int>(); // expected-note {{instantiation of}}
     g<true>();
     g<false>();
     h<int>(); // expected-note {{instantiation of}}
   }
 }
+
+namespace maybe_unused_binding {
+
+void test() {
+  struct X { int a, b; } x;
+  auto [a [[maybe_unused]], b] = x; // expected-warning {{an attribute specifier sequence attached to a structured binding declaration is a C++2c extension}}
+}
+
+}
+
 #endif
diff --git a/clang/test/SemaObjC/format-strings-oslog.m b/clang/test/SemaObjC/format-strings-oslog.m
index 20fec93b653b..af5aef3d6179 100644
--- a/clang/test/SemaObjC/format-strings-oslog.m
+++ b/clang/test/SemaObjC/format-strings-oslog.m
@@ -44,15 +44,18 @@ void test_os_log_format(const char *pc, int i, void *p, void *buf) {
 }
 
 // Test os_log_format primitive with ObjC string literal format argument.
-void test_objc(const char *pc, int i, void *p, void *buf, NSString *nss) {
+void test_objc(const char *pc, int i, void *p, void *buf, NSString *nss, id obj) {
   __builtin_os_log_format(buf, @"");
   __builtin_os_log_format(buf, @"%d"); // expected-warning {{more '%' conversions than data arguments}}
   __builtin_os_log_format(buf, @"%d", i);
+
   __builtin_os_log_format(buf, @"%P", p); // expected-warning {{using '%P' format specifier without precision}}
   __builtin_os_log_format(buf, @"%.10P", p);
   __builtin_os_log_format(buf, @"%.*P", p); // expected-warning {{field precision should have type 'int', but argument has type 'void *'}}
   __builtin_os_log_format(buf, @"%.*P", i, p);
   __builtin_os_log_format(buf, @"%.*P", i, i); // expected-warning {{format specifies type 'void *' but the argument has type 'int'}}
+  __builtin_os_log_format(buf, @"%.8P", nss);    // expected-warning {{using '%P' format specifier with an Objective-C pointer results in dumping runtime object structure, not object value}}
+  __builtin_os_log_format(buf, @"%.*P", i, obj); // expected-warning {{using '%P' format specifier with an Objective-C pointer results in dumping runtime object structure, not object value}}
 
   __builtin_os_log_format(buf, @"%{private}s", pc);
   __builtin_os_log_format(buf, @"%@", nss);
diff --git a/clang/test/SemaOpenACC/compute-construct-private-clause.c b/clang/test/SemaOpenACC/compute-construct-private-clause.c
new file mode 100644
index 000000000000..15775279fc86
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-private-clause.c
@@ -0,0 +1,145 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+struct Incomplete;
+enum SomeE{ A };
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  enum SomeE EnumMember;
+  void *PointerMember;
+} Complete;
+
+int GlobalInt;
+float GlobalArray[5];
+void *GlobalPointer;
+Complete GlobalComposite;
+
+void uses(int IntParam, void *PointerParam, float ArrayParam[5], Complete CompositeParam) {
+  int LocalInt;
+  void *LocalPointer;
+  float LocalArray[5];
+  Complete LocalComposite;
+
+  // Check Appertainment:
+#pragma acc parallel private(LocalInt)
+  while(1);
+#pragma acc serial private(LocalInt)
+  while(1);
+  // expected-error@+1{{OpenACC 'private' clause is not valid on 'kernels' directive}}
+#pragma acc kernels private(LocalInt)
+  while(1);
+
+  // Valid cases:
+#pragma acc parallel private(LocalInt, LocalPointer, LocalArray)
+  while(1);
+#pragma acc parallel private(LocalArray)
+  while(1);
+  // TODO OpenACC: Fix array sections, this should be allowed.
+  // expected-error@+1{{expected expression}}
+#pragma acc parallel private(LocalArray[:])
+  while(1);
+#pragma acc parallel private(LocalArray[:5])
+  while(1);
+  // TODO OpenACC: Fix array sections, this should be allowed.
+  // expected-error@+1{{expected expression}}
+#pragma acc parallel private(LocalArray[2:])
+  while(1);
+#pragma acc parallel private(LocalArray[2:5])
+  while(1);
+#pragma acc parallel private(LocalArray[2])
+  while(1);
+#pragma acc parallel private(LocalComposite)
+  while(1);
+#pragma acc parallel private(LocalComposite.EnumMember)
+  while(1);
+#pragma acc parallel private(LocalComposite.ScalarMember)
+  while(1);
+#pragma acc parallel private(LocalComposite.ArrayMember)
+  while(1);
+#pragma acc parallel private(LocalComposite.ArrayMember[5])
+  while(1);
+#pragma acc parallel private(LocalComposite.PointerMember)
+  while(1);
+#pragma acc parallel private(GlobalInt, GlobalArray, GlobalPointer, GlobalComposite)
+  while(1);
+#pragma acc parallel private(GlobalArray[2], GlobalPointer[2], GlobalComposite.CompositeMember.A)
+  while(1);
+#pragma acc parallel private(LocalComposite, GlobalComposite)
+  while(1);
+#pragma acc parallel private(IntParam, PointerParam, ArrayParam, CompositeParam)
+  while(1);
+#pragma acc parallel private(PointerParam[IntParam], ArrayParam[IntParam], CompositeParam.CompositeMember.A)
+  while(1);
+
+#pragma acc parallel private(LocalArray) private(LocalArray[2])
+  while(1);
+
+#pragma acc parallel private(LocalArray, LocalArray[2])
+  while(1);
+
+#pragma acc parallel private(LocalComposite, LocalComposite.ScalarMember)
+  while(1);
+
+#pragma acc parallel private(LocalComposite.CompositeMember.A, LocalComposite.ScalarMember)
+  while(1);
+
+#pragma acc parallel private(LocalComposite.CompositeMember.A) private(LocalComposite.ScalarMember)
+  while(1);
+
+  Complete LocalComposite2;
+#pragma acc parallel private(LocalComposite2.ScalarMember, LocalComposite2.ScalarMember)
+  while(1);
+
+  // Invalid cases, arbitrary expressions.
+  struct Incomplete *I;
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel private(*I)
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel private(GlobalInt + IntParam)
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel private(+GlobalInt)
+  while(1);
+
+  // TODO OpenACC: Fix array sections, this should be allowed.
+  // expected-error@+1{{expected expression}}
+#pragma acc parallel private(PointerParam[:])
+  while(1);
+#pragma acc parallel private(PointerParam[:5])
+  while(1);
+#pragma acc parallel private(PointerParam[:IntParam])
+  while(1);
+  // TODO OpenACC: Fix array sections, this should be allowed.
+  // expected-error@+1{{expected expression}}
+#pragma acc parallel private(PointerParam[2:])
+  while(1);
+#pragma acc parallel private(PointerParam[2:5])
+  while(1);
+#pragma acc parallel private(PointerParam[2])
+  while(1);
+  // TODO OpenACC: Fix array sections, this should be allowed.
+  // expected-error@+1{{expected expression}}
+#pragma acc parallel private(ArrayParam[:])
+  while(1);
+#pragma acc parallel private(ArrayParam[:5])
+  while(1);
+#pragma acc parallel private(ArrayParam[:IntParam])
+  while(1);
+  // TODO OpenACC: Fix array sections, this should be allowed.
+  // expected-error@+1{{expected expression}}
+#pragma acc parallel private(ArrayParam[2:])
+  while(1);
+#pragma acc parallel private(ArrayParam[2:5])
+  while(1);
+#pragma acc parallel private(ArrayParam[2])
+  while(1);
+
+  // expected-error@+1{{OpenACC sub-array is not allowed here}}
+#pragma acc parallel private((float*)ArrayParam[2:5])
+  while(1);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel private((float)ArrayParam[2])
+  while(1);
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-private-clause.cpp b/clang/test/SemaOpenACC/compute-construct-private-clause.cpp
new file mode 100644
index 000000000000..4dd4e0d8029d
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-private-clause.cpp
@@ -0,0 +1,161 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+struct Incomplete;
+enum SomeE{};
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  SomeE EnumMember;
+  char *PointerMember;
+} Complete;
+
+int GlobalInt;
+float GlobalArray[5];
+char *GlobalPointer;
+Complete GlobalComposite;
+
+void uses(int IntParam, char *PointerParam, float ArrayParam[5], Complete CompositeParam, int &IntParamRef) {
+  int LocalInt;
+  char *LocalPointer;
+  float LocalArray[5];
+  Complete LocalComposite;
+
+  // Check Appertainment:
+
+#pragma acc parallel private(LocalInt)
+  while(true);
+#pragma acc serial private(LocalInt)
+  while(true);
+  // expected-error@+1{{OpenACC 'private' clause is not valid on 'kernels' directive}}
+#pragma acc kernels private(LocalInt)
+  while(true);
+
+  // Valid cases:
+#pragma acc parallel private(LocalInt, LocalPointer, LocalArray)
+  while(true);
+#pragma acc parallel private(LocalArray)
+  while(true);
+#pragma acc parallel private(LocalArray[2])
+  while(true);
+#pragma acc parallel private(LocalComposite)
+  while(true);
+#pragma acc parallel private(LocalComposite.EnumMember)
+  while(true);
+#pragma acc parallel private(LocalComposite.ScalarMember)
+  while(true);
+#pragma acc parallel private(LocalComposite.ArrayMember)
+  while(true);
+#pragma acc parallel private(LocalComposite.ArrayMember[5])
+  while(true);
+#pragma acc parallel private(LocalComposite.PointerMember)
+  while(true);
+#pragma acc parallel private(GlobalInt, GlobalArray, GlobalPointer, GlobalComposite)
+  while(true);
+#pragma acc parallel private(GlobalArray[2], GlobalPointer[2], GlobalComposite.CompositeMember.A)
+  while(true);
+#pragma acc parallel private(LocalComposite, GlobalComposite)
+  while(true);
+#pragma acc parallel private(IntParam, PointerParam, ArrayParam, CompositeParam) private(IntParamRef)
+  while(true);
+#pragma acc parallel private(PointerParam[IntParam], ArrayParam[IntParam], CompositeParam.CompositeMember.A)
+  while(true);
+
+
+  // Invalid cases, arbitrary expressions.
+  Incomplete *I;
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel private(*I)
+  while(true);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel private(GlobalInt + IntParam)
+  while(true);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel private(+GlobalInt)
+  while(true);
+}
+
+template<typename T, unsigned I, typename V>
+void TemplUses(T t, T (&arrayT)[I], V TemplComp) {
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel private(+t)
+  while(true);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel private(+I)
+  while(true);
+
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#TEMPL_USES_INST{{in instantiation of}}
+#pragma acc parallel private(I)
+  while(true);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+#pragma acc parallel private(t, I)
+  while(true);
+
+#pragma acc parallel private(arrayT)
+  while(true);
+
+#pragma acc parallel private(TemplComp)
+  while(true);
+
+#pragma acc parallel private(TemplComp.PointerMember[5])
+  while(true);
+
+#pragma acc parallel private(TemplComp.PointerMember[5]) private(TemplComp)
+  while(true);
+
+ int *Pointer;
+#pragma acc parallel private(Pointer[:I])
+  while(true);
+#pragma acc parallel private(Pointer[:t])
+  while(true);
+  // TODO OpenACC: When fixing sub-arrays, this should be permitted}}
+  // expected-error@+1{{expected expression}}
+#pragma acc parallel private(Pointer[1:])
+  while(true);
+}
+
+template<unsigned I, auto &NTTP_REF>
+void NTTP() {
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, or composite variable member}}
+  // expected-note@#NTTP_INST{{in instantiation of}}
+#pragma acc parallel private(I)
+  while(true);
+
+#pragma acc parallel private(NTTP_REF)
+  while(true);
+}
+
+struct S {
+  int ThisMember;
+  int ThisMemberArray[5];
+
+  void foo();
+};
+
+void S::foo() {
+#pragma acc parallel private(ThisMember, this->ThisMemberArray[1])
+  while(true);
+
+#pragma acc parallel private(ThisMemberArray[1:2])
+  while(true);
+
+#pragma acc parallel private(this)
+  while(true);
+
+#pragma acc parallel private(ThisMember, this->ThisMember)
+  while(true);
+}
+
+void Inst() {
+  static constexpr int NTTP_REFed = 1;
+  int i;
+  int Arr[5];
+  Complete C;
+  TemplUses(i, Arr, C); // #TEMPL_USES_INST
+  NTTP<5, NTTP_REFed>(); // #NTTP_INST
+}
diff --git a/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp b/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp
new file mode 100644
index 000000000000..341be3c58ebd
--- /dev/null
+++ b/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp
@@ -0,0 +1,552 @@
+// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s
+
+int Global;
+short GlobalArray[5];
+
+void NormalUses(float *PointerParam) {
+  // CHECK: FunctionDecl{{.*}}NormalUses
+  // CHECK: ParmVarDecl
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel private(Global, GlobalArray[2])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'short' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'short *' <ArrayToPointerDecay>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 2
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(GlobalArray, PointerParam[Global])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(GlobalArray) private(PointerParam[Global])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(GlobalArray, PointerParam[Global : Global])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}} 'PointerParam' 'float *'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+}
+
+// This example is an error typically, but we want to make sure we're properly
+// capturing NTTPs until instantiation time.
+template<unsigned I>
+void UnInstTempl() {
+  // CHECK-NEXT: FunctionTemplateDecl{{.*}} UnInstTempl
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}}referenced 'unsigned int' depth 0 index 0 I
+  // CHECK-NEXT: FunctionDecl{{.*}} UnInstTempl 'void ()'
+  // CHECK-NEXT: CompoundStmt
+#pragma acc parallel private(I)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'unsigned int' NonTypeTemplateParm{{.*}}'I' 'unsigned int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+}
+
+template<auto &NTTP, typename T, typename U>
+void TemplUses(T t, U u, T*PointerParam) {
+  // CHECK-NEXT: FunctionTemplateDecl
+  // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}}referenced 'auto &' depth 0 index 0 NTTP
+  // CHECK-NEXT: TemplateTypeParmDecl{{.*}}typename depth 0 index 1 T
+  // CHECK-NEXT: TemplateTypeParmDecl{{.*}}typename depth 0 index 2 U
+  // CHECK-NEXT: FunctionDecl{{.*}} TemplUses 'void (T, U, T *)'
+  // CHECK-NEXT: ParmVarDecl{{.*}} referenced t 'T'
+  // CHECK-NEXT: ParmVarDecl{{.*}} referenced u 'U'
+  // CHECK-NEXT: ParmVarDecl{{.*}} referenced PointerParam 'T *'
+  // CHECK-NEXT: CompoundStmt
+
+
+#pragma acc parallel private(GlobalArray, PointerParam[Global])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'T' lvalue
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T *' lvalue ParmVar{{.*}}'PointerParam' 'T *'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(t, u)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(t) private(u)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(t) private(NTTP, u)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(u[0])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'<dependent type>' lvalue
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(u[0:t])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+  // CHECK-NEXT: DeclStmt
+  // CHECK-NEXT: VarDecl{{.*}}EndMarker
+  int EndMarker;
+
+  // Check the instantiated versions of the above.
+  // CHECK-NEXT: FunctionDecl{{.*}} used TemplUses 'void (int, int *, int *)' implicit_instantiation
+  // CHECK-NEXT: TemplateArgument decl
+  // CHECK-NEXT: Var{{.*}} 'CEVar' 'const unsigned int'
+  // CHECK-NEXT: TemplateArgument type 'int'
+  // CHECK-NEXT: BuiltinType{{.*}} 'int'
+  // CHECK-NEXT: TemplateArgument type 'int[1]'
+  // CHECK-NEXT: ConstantArrayType{{.*}} 'int[1]' 1
+  // CHECK-NEXT: BuiltinType{{.*}} 'int'
+  // CHECK-NEXT: ParmVarDecl{{.*}} used t 'int'
+  // CHECK-NEXT: ParmVarDecl{{.*}} used u 'int *'
+  // CHECK-NEXT: ParmVarDecl{{.*}} used PointerParam 'int *'
+  // CHECK-NEXT: CompoundStmt
+
+// #pragma acc parallel private(GlobalArray, PointerParam[Global])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'int' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}}'PointerParam' 'int *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+// #pragma acc parallel private(t, u)
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+// #pragma acc parallel private(t) private(u)
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+// #pragma acc parallel private(t) private(NTTP, u)
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP
+  // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+// #pragma acc parallel private(u[0])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'int' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+// #pragma acc parallel private(u[0:t])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+  // CHECK-NEXT: DeclStmt
+  // CHECK-NEXT: VarDecl{{.*}}EndMarker
+}
+
+struct S {
+  // CHECK-NEXT: CXXRecordDecl{{.*}} struct S definition
+  // CHECK: CXXRecordDecl{{.*}} implicit struct S
+  int ThisMember;
+  // CHECK-NEXT: FieldDecl{{.*}} ThisMember 'int'
+  int ThisMemberArray[5];
+  // CHECK-NEXT: FieldDecl{{.*}} ThisMemberArray 'int[5]'
+
+  void foo();
+  // CHECK-NEXT: CXXMethodDecl{{.*}} foo 'void ()'
+
+  template<typename T>
+  void bar() {
+  // CHECK-NEXT: FunctionTemplateDecl{{.*}}bar
+  // CHECK-NEXT: TemplateTypeParmDecl{{.*}}typename depth 0 index 0 T
+  // CHECK-NEXT: CXXMethodDecl{{.*}} bar 'void ()' implicit-inline
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel private(ThisMember, this->ThisMemberArray[1])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: MemberExpr{{.*}} 'int' lvalue ->ThisMember
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' implicit this
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}} 'int' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: MemberExpr{{.*}} 'int[5]' lvalue ->ThisMemberArray
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' this
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(ThisMemberArray[1:2])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr{{.*}}
+  // CHECK-NEXT: MemberExpr{{.*}} 'int[5]' lvalue ->ThisMemberArray
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' implicit this
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 2
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(this)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' this
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+  // Check Instantiations:
+  // CHECK-NEXT: CXXMethodDecl{{.*}} used bar 'void ()' implicit_instantiation implicit-inline
+  // CHECK-NEXT: TemplateArgument type 'int'
+  // CHECK-NEXT: BuiltinType{{.*}} 'int'
+  // CHECK-NEXT: CompoundStmt
+
+// #pragma acc parallel private(ThisMember, this->ThisMemberArray[1])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: MemberExpr{{.*}} 'int' lvalue ->ThisMember
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' implicit this
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}} 'int' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: MemberExpr{{.*}} 'int[5]' lvalue ->ThisMemberArray
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' this
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+// #pragma acc parallel private(ThisMemberArray[1:2])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr{{.*}}
+  // CHECK-NEXT: MemberExpr{{.*}} 'int[5]' lvalue ->ThisMemberArray
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' implicit this
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 2
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+// #pragma acc parallel private(this)
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' this
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+}
+};
+
+void S::foo() {
+  // CHECK: CXXMethodDecl{{.*}} foo 'void ()'
+  // CHECK-NEXT: CompoundStmt
+#pragma acc parallel private(ThisMember, this->ThisMemberArray[1])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: MemberExpr{{.*}} 'int' lvalue ->ThisMember
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' implicit this
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}} 'int' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: MemberExpr{{.*}} 'int[5]' lvalue ->ThisMemberArray
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' this
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(ThisMemberArray[1:2])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr{{.*}}
+  // CHECK-NEXT: MemberExpr{{.*}} 'int[5]' lvalue ->ThisMemberArray
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' implicit this
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 2
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(this)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'S *' this
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+}
+
+template<typename U>
+struct STempl {
+  // CHECK-NEXT: ClassTemplateDecl{{.*}} STempl
+  // CHECK-NEXT: TemplateTypeParmDecl{{.*}} typename depth 0 index 0 U
+  // CHECK-NEXT: CXXRecordDecl{{.*}} struct STempl definition
+  // CHECK: CXXRecordDecl{{.*}} implicit struct STempl
+  U ThisMember;
+  // CHECK-NEXT: FieldDecl{{.*}} ThisMember 'U'
+  U ThisMemberArray[5];
+  // CHECK-NEXT: FieldDecl{{.*}} ThisMemberArray 'U[5]'
+
+  void foo() {
+    // CHECK-NEXT: CXXMethodDecl {{.*}} foo 'void ()'
+    // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel private(ThisMember, this->ThisMemberArray[1])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: MemberExpr{{.*}} 'U' lvalue ->ThisMember
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'STempl<U> *' implicit this
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}} 'U' lvalue
+  // CHECK-NEXT: MemberExpr{{.*}} 'U[5]' lvalue ->ThisMemberArray
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'STempl<U> *' this
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(ThisMemberArray[1:2])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr{{.*}}
+  // CHECK-NEXT: MemberExpr{{.*}} 'U[5]' lvalue ->ThisMemberArray
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'STempl<U> *' implicit this
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 2
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(this)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'STempl<U> *' this
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+}
+
+  template<typename T>
+  void bar() {
+  // CHECK-NEXT: FunctionTemplateDecl{{.*}} bar
+  // CHECK-NEXT: TemplateTypeParmDecl{{.*}} typename depth 1 index 0 T
+  // CHECK-NEXT: CXXMethodDecl{{.*}} bar 'void ()'
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel private(ThisMember, this->ThisMemberArray[1])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: MemberExpr{{.*}} 'U' lvalue ->ThisMember
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'STempl<U> *' implicit this
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}} 'U' lvalue
+  // CHECK-NEXT: MemberExpr{{.*}} 'U[5]' lvalue ->ThisMemberArray
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'STempl<U> *' this
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(ThisMemberArray[1:2])
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr{{.*}}
+  // CHECK-NEXT: MemberExpr{{.*}} 'U[5]' lvalue ->ThisMemberArray
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'STempl<U> *' implicit this
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 2
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+#pragma acc parallel private(this)
+  while(true);
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'STempl<U> *' this
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+}
+
+// Instantiation of the class template.
+
+// CHECK-NEXT: ClassTemplateSpecializationDecl{{.*}}struct STempl
+// CHECK: TemplateArgument type 'int'
+// CHECK-NEXT: BuiltinType {{.*}}'int'
+// CHECK-NEXT: CXXRecordDecl{{.*}} struct STempl
+// CHECK-NEXT: FieldDecl{{.*}}ThisMember 'int'
+// CHECK-NEXT: FieldDecl{{.*}} ThisMemberArray 'int[5]'
+
+// CHECK-NEXT: CXXMethodDecl{{.*}} foo 'void ()'
+// CHECK-NEXT: FunctionTemplateDecl{{.*}}bar
+// CHECK-NEXT: TemplateTypeParmDecl{{.*}} typename depth 0 index 0 T
+// CHECK-NEXT: CXXMethodDecl{{.*}}bar 'void ()'
+// CHECK-NEXT: CXXMethodDecl{{.*}}bar 'void ()'
+// CHECK-NEXT: TemplateArgument type 'int'
+// CHECK-NEXT: BuiltinType{{.*}} 'int'
+// CHECK-NEXT: CompoundStmt
+
+//#pragma acc parallel private(ThisMember, this->ThisMemberArray[1])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: MemberExpr{{.*}} 'int' lvalue ->ThisMember
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'STempl<int> *' implicit this
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}} 'int' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' <ArrayToPointerDecay>
+  // CHECK-NEXT: MemberExpr{{.*}} 'int[5]' lvalue ->ThisMemberArray
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'STempl<int> *' this
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel private(ThisMemberArray[1:2])
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: ArraySectionExpr{{.*}}
+  // CHECK-NEXT: MemberExpr{{.*}} 'int[5]' lvalue ->ThisMemberArray
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'STempl<int> *' implicit this
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 1
+  // CHECK-NEXT: IntegerLiteral{{.*}}'int' 2
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+
+//#pragma acc parallel private(this)
+  // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel
+  // CHECK-NEXT: private clause
+  // CHECK-NEXT: CXXThisExpr{{.*}} 'STempl<int> *' this
+  // CHECK-NEXT: WhileStmt
+  // CHECK-NEXT: CXXBoolLiteralExpr
+  // CHECK-NEXT: NullStmt
+};
+
+void Inst() {
+  static constexpr unsigned CEVar = 1;
+  int i;
+  int Arr[5];
+  TemplUses<CEVar, int, int[1]>({}, {}, &i);
+
+  S s;
+  s.bar<int>();
+  STempl<int> stempl;
+  stempl.bar<int>();
+}
diff --git a/clang/test/SemaOpenCL/vec_step.cl b/clang/test/SemaOpenCL/vec_step.cl
index afb6dc94d92e..c116f09b351f 100644
--- a/clang/test/SemaOpenCL/vec_step.cl
+++ b/clang/test/SemaOpenCL/vec_step.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s
+// RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s -fexperimental-new-constant-interpreter
 
 typedef int int2 __attribute__((ext_vector_type(2)));
 typedef int int3 __attribute__((ext_vector_type(3)));
diff --git a/clang/test/SemaTemplate/instantiate-function-1.cpp b/clang/test/SemaTemplate/instantiate-function-1.cpp
index ceef27437748..a4967264c654 100644
--- a/clang/test/SemaTemplate/instantiate-function-1.cpp
+++ b/clang/test/SemaTemplate/instantiate-function-1.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify %s
 template<typename T, typename U>
 struct X0 {
-  void f(T x, U y) { 
+  void f(T x, U y) {
     (void)(x + y); // expected-error{{invalid operands}}
   }
 };
@@ -41,7 +41,7 @@ template <typename T> struct X4 {
   T f() const {
     return; // expected-error{{non-void function 'f' should return a value}}
   }
-  
+
   T g() const {
     return 1; // expected-error{{void function 'g' should not return a value}}
   }
@@ -64,7 +64,7 @@ template<typename T, typename U, typename V> struct X6 {
     // IfStmt
     if (t > 0)
       return u;
-    else { 
+    else {
       if (t < 0)
         return v; // expected-error{{cannot initialize return object of type}}
     }
@@ -131,12 +131,12 @@ template<typename T> struct Member0 {
     t;
     t.f;
     t->f;
-    
+
     T* tp;
     tp.f; // expected-error{{member reference base type 'T *' is not a structure or union}}
     tp->f;
 
-    this->f;
+    this->f; // expected-error{{reference to non-static member function must be called}}
     this.f; // expected-error{{member reference base type 'Member0<T> *' is not a structure or union}}
   }
 };
@@ -239,11 +239,11 @@ namespace PR9880 {
     static yes_tag check(char[sizeof(&U::luaIndex)]);
     enum { value = sizeof(check<T>(0)) == sizeof(yes_tag) };
   };
-  
+
   class SomeClass {
   public:
     int luaIndex(lua_State* L);
   };
-  
+
   int i = HasIndexMetamethod<SomeClass>::value;
 }
diff --git a/clang/test/TestRunner.sh b/clang/test/TestRunner.sh
deleted file mode 100755
index f96d3d552d2e..000000000000
--- a/clang/test/TestRunner.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-#
-# TestRunner.sh - Backward compatible utility for testing an individual file.
-
-# Find where this script is.
-Dir=$(dirname $(which $0))
-AbsDir=$(cd $Dir; pwd)
-
-# Find 'lit', assuming standard layout.
-lit=$AbsDir/../../../utils/lit/lit.py
-
-# Dispatch to lit.
-$lit "$@"
diff --git a/clang/test/Unit/lit.cfg.py b/clang/test/Unit/lit.cfg.py
index 475069e630d7..37e91d0f8629 100644
--- a/clang/test/Unit/lit.cfg.py
+++ b/clang/test/Unit/lit.cfg.py
@@ -25,13 +25,9 @@ config.test_format = lit.formats.GoogleTest(config.llvm_build_mode, "Tests")
 
 # Propagate the temp directory. Windows requires this because it uses \Windows\
 # if none of these are present.
-if "TMP" in os.environ:
-    config.environment["TMP"] = os.environ["TMP"]
-if "TEMP" in os.environ:
-    config.environment["TEMP"] = os.environ["TEMP"]
-
-if "HOME" in os.environ:
-    config.environment["HOME"] = os.environ["HOME"]
+for v in ["TMP", "TEMP", "HOME", "SystemDrive"]:
+    if v in os.environ:
+        config.environment[v] = os.environ[v]
 
 # Propagate sanitizer options.
 for var in [
diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp
index feb733fe3c9e..01f7c6047726 100644
--- a/clang/tools/clang-format/ClangFormat.cpp
+++ b/clang/tools/clang-format/ClangFormat.cpp
@@ -413,8 +413,9 @@ static bool format(StringRef FileName, bool ErrorOnIncompleteFormat = false) {
   // On Windows, overwriting a file with an open file mapping doesn't work,
   // so read the whole file into memory when formatting in-place.
   ErrorOr<std::unique_ptr<MemoryBuffer>> CodeOrErr =
-      !OutputXML && Inplace ? MemoryBuffer::getFileAsStream(FileName)
-                            : MemoryBuffer::getFileOrSTDIN(FileName);
+      !OutputXML && Inplace
+          ? MemoryBuffer::getFileAsStream(FileName)
+          : MemoryBuffer::getFileOrSTDIN(FileName, /*IsText=*/true);
   if (std::error_code EC = CodeOrErr.getError()) {
     errs() << EC.message() << "\n";
     return true;
@@ -558,7 +559,7 @@ static int dumpConfig() {
     // Read in the code in case the filename alone isn't enough to detect the
     // language.
     ErrorOr<std::unique_ptr<MemoryBuffer>> CodeOrErr =
-        MemoryBuffer::getFileOrSTDIN(FileNames[0]);
+        MemoryBuffer::getFileOrSTDIN(FileNames[0], /*IsText=*/true);
     if (std::error_code EC = CodeOrErr.getError()) {
       llvm::errs() << EC.message() << "\n";
       return 1;
diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp
index 191e944ae91e..21f04a291b2f 100644
--- a/clang/tools/clang-installapi/Options.cpp
+++ b/clang/tools/clang-installapi/Options.cpp
@@ -594,9 +594,7 @@ getInterfaceFile(const StringRef Filename) {
   std::unique_ptr<InterfaceFile> IF;
   switch (identify_magic(Buffer->getBuffer())) {
   case file_magic::macho_dynamically_linked_shared_lib:
-    LLVM_FALLTHROUGH;
   case file_magic::macho_dynamically_linked_shared_lib_stub:
-    LLVM_FALLTHROUGH;
   case file_magic::macho_universal_binary:
     return DylibReader::get(Buffer->getMemBufferRef());
     break;
@@ -699,8 +697,8 @@ InstallAPIContext Options::createContext() {
     }
     Expected<AliasMap> Result = parseAliasList(Buffer.get());
     if (!Result) {
-      Diags->Report(diag::err_cannot_read_alias_list)
-          << ListPath << toString(Result.takeError());
+      Diags->Report(diag::err_cannot_read_input_list)
+          << /*IsFileList=*/false << ListPath << toString(Result.takeError());
       return Ctx;
     }
     Aliases.insert(Result.get().begin(), Result.get().end());
@@ -719,8 +717,9 @@ InstallAPIContext Options::createContext() {
       return Ctx;
     }
     if (auto Err = FileListReader::loadHeaders(std::move(Buffer.get()),
-                                               Ctx.InputHeaders)) {
-      Diags->Report(diag::err_cannot_open_file) << ListPath << std::move(Err);
+                                               Ctx.InputHeaders, FM)) {
+      Diags->Report(diag::err_cannot_read_input_list)
+          << /*IsFileList=*/true << ListPath << std::move(Err);
       return Ctx;
     }
   }
diff --git a/clang/tools/clang-repl/ClangRepl.cpp b/clang/tools/clang-repl/ClangRepl.cpp
index aecf61b97fc7..9cfc70462893 100644
--- a/clang/tools/clang-repl/ClangRepl.cpp
+++ b/clang/tools/clang-repl/ClangRepl.cpp
@@ -215,13 +215,15 @@ int main(int argc, const char **argv) {
   } else
     Interp = ExitOnErr(clang::Interpreter::create(std::move(CI)));
 
+  bool HasError = false;
+
   for (const std::string &input : OptInputs) {
-    if (auto Err = Interp->ParseAndExecute(input))
+    if (auto Err = Interp->ParseAndExecute(input)) {
       llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), "error: ");
+      HasError = true;
+    }
   }
 
-  bool HasError = false;
-
   if (OptInputs.empty()) {
     llvm::LineEditor LE("clang-repl");
     std::string Input;
@@ -241,18 +243,13 @@ int main(int argc, const char **argv) {
         break;
       }
       if (Input == R"(%undo)") {
-        if (auto Err = Interp->Undo()) {
+        if (auto Err = Interp->Undo())
           llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), "error: ");
-          HasError = true;
-        }
       } else if (Input.rfind("%lib ", 0) == 0) {
-        if (auto Err = Interp->LoadDynamicLibrary(Input.data() + 5)) {
+        if (auto Err = Interp->LoadDynamicLibrary(Input.data() + 5))
           llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), "error: ");
-          HasError = true;
-        }
       } else if (auto Err = Interp->ParseAndExecute(Input)) {
         llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), "error: ");
-        HasError = true;
       }
 
       Input = "";
diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index d70b92b0984e..290bf2a42536 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -31,6 +31,9 @@ add_clang_tool(clang
 
   DEPENDS
   intrinsics_gen
+  # These generated headers are included transitively.
+  ARMTargetParserTableGen
+  AArch64TargetParserTableGen
   ${support_plugins}
   GENERATE_DRIVER
   )
diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp
index 5498c3f9d4a2..86afe22fac24 100644
--- a/clang/tools/driver/cc1as_main.cpp
+++ b/clang/tools/driver/cc1as_main.cpp
@@ -426,6 +426,7 @@ static bool ExecuteAssemblerImpl(AssemblerInvocation &Opts,
   assert(MRI && "Unable to create target register info!");
 
   MCTargetOptions MCOptions;
+  MCOptions.MCRelaxAll = Opts.RelaxAll;
   MCOptions.EmitDwarfUnwind = Opts.EmitDwarfUnwind;
   MCOptions.EmitCompactUnwindNonCanonical = Opts.EmitCompactUnwindNonCanonical;
   MCOptions.X86RelaxRelocations = Opts.RelaxELFRelocations;
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 74163f30e19b..eb0ba09c5b91 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2782,6 +2782,11 @@ class OpenACCClauseEnqueue : public OpenACCClauseVisitor<OpenACCClauseEnqueue> {
 public:
   OpenACCClauseEnqueue(EnqueueVisitor &V) : Visitor(V) {}
 
+  void VisitVarList(const OpenACCClauseWithVarList &C) {
+    for (Expr *Var : C.getVarList())
+      Visitor.AddStmt(Var);
+  }
+
 #define VISIT_CLAUSE(CLAUSE_NAME)                                              \
   void Visit##CLAUSE_NAME##Clause(const OpenACC##CLAUSE_NAME##Clause &C);
 #include "clang/Basic/OpenACCClauses.def"
@@ -2807,6 +2812,10 @@ void OpenACCClauseEnqueue::VisitNumGangsClause(const OpenACCNumGangsClause &C) {
   for (Expr *IE : C.getIntExprs())
     Visitor.AddStmt(IE);
 }
+
+void OpenACCClauseEnqueue::VisitPrivateClause(const OpenACCPrivateClause &C) {
+  VisitVarList(C);
+}
 } // namespace
 
 void EnqueueVisitor::EnqueueChildren(const OpenACCClause *C) {
@@ -5713,8 +5722,8 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) {
     return cxstring::createRef("UnaryOperator");
   case CXCursor_ArraySubscriptExpr:
     return cxstring::createRef("ArraySubscriptExpr");
-  case CXCursor_OMPArraySectionExpr:
-    return cxstring::createRef("OMPArraySectionExpr");
+  case CXCursor_ArraySectionExpr:
+    return cxstring::createRef("ArraySectionExpr");
   case CXCursor_OMPArrayShapingExpr:
     return cxstring::createRef("OMPArrayShapingExpr");
   case CXCursor_OMPIteratorExpr:
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
index 454bf7549861..9325a16d2a84 100644
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -423,8 +423,8 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent,
     K = CXCursor_UnexposedExpr;
     break;
 
-  case Stmt::OMPArraySectionExprClass:
-    K = CXCursor_OMPArraySectionExpr;
+  case Stmt::ArraySectionExprClass:
+    K = CXCursor_ArraySectionExpr;
     break;
 
   case Stmt::OMPArrayShapingExprClass:
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
index 87774b00956a..c08deb903f12 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
@@ -1569,8 +1569,9 @@ TEST_P(ASTMatchersTest, IsArrow_MatchesMemberVariablesViaArrow) {
       matches("class Y { void x() { y; } int y; };", memberExpr(isArrow())));
   EXPECT_TRUE(notMatches("class Y { void x() { (*this).y; } int y; };",
                          memberExpr(isArrow())));
-  EXPECT_TRUE(matches("template <class T> class Y { void x() { this->m; } };",
-                      cxxDependentScopeMemberExpr(isArrow())));
+  EXPECT_TRUE(
+      matches("template <class T> class Y { void x() { this->m; } int m; };",
+              memberExpr(isArrow())));
   EXPECT_TRUE(
       notMatches("template <class T> class Y { void x() { (*this).m; } };",
                  cxxDependentScopeMemberExpr(isArrow())));
diff --git a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
index 9c1dc1a76db6..9c4ec07e139a 100644
--- a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
+++ b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
@@ -10,8 +10,8 @@
 #include "clang/AST/TypeLoc.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Frontend/ASTUnit.h"
 #include "clang/Tooling/Tooling.h"
-#include "llvm/ADT/SmallString.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include <cctype>
@@ -43,7 +43,7 @@ std::unique_ptr<ASTUnit> buildASTFromCode(const Twine &Code) {
 }
 
 ExprMatcher declRefTo(StringRef Name) {
-  return declRefExpr(to(namedDecl(hasName(Name))));
+  return declRefExpr(to(namedDecl(hasName(Name)).bind("decl")));
 }
 
 StmtMatcher withEnclosingCompound(ExprMatcher Matcher) {
@@ -57,6 +57,13 @@ bool isMutated(const SmallVectorImpl<BoundNodes> &Results, ASTUnit *AST) {
   return ExprMutationAnalyzer(*S, AST->getASTContext()).isMutated(E);
 }
 
+bool isDeclMutated(const SmallVectorImpl<BoundNodes> &Results, ASTUnit *AST) {
+  const auto *const S = selectFirst<Stmt>("stmt", Results);
+  const auto *const D = selectFirst<Decl>("decl", Results);
+  TraversalKindScope RAII(AST->getASTContext(), TK_AsIs);
+  return ExprMutationAnalyzer(*S, AST->getASTContext()).isMutated(D);
+}
+
 SmallVector<std::string, 1>
 mutatedBy(const SmallVectorImpl<BoundNodes> &Results, ASTUnit *AST) {
   const auto *const S = selectFirst<Stmt>("stmt", Results);
@@ -1176,7 +1183,7 @@ TEST(ExprMutationAnalyzerTest, CastToConstRef) {
 
 // section: comma expressions
 
-TEST(ExprMutationAnalyzerTest, CommaExprWithAnAssigment) {
+TEST(ExprMutationAnalyzerTest, CommaExprWithAnAssignment) {
   const auto AST = buildASTFromCodeWithArgs(
       "void f() { int x; int y; (x, y) = 5; }", {"-Wno-unused-value"});
   const auto Results =
@@ -1267,7 +1274,7 @@ TEST(ExprMutationAnalyzerTest, CommaExprAsReturnAsValue) {
   EXPECT_FALSE(isMutated(Results, AST.get()));
 }
 
-TEST(ExprMutationAnalyzerTest, CommaEpxrAsReturnAsNonConstRef) {
+TEST(ExprMutationAnalyzerTest, CommaExprAsReturnAsNonConstRef) {
   const auto AST = buildASTFromCodeWithArgs(
       "int& f() { int x, y; return (y, x); }", {"-Wno-unused-value"});
   const auto Results =
@@ -1552,6 +1559,21 @@ TEST(ExprMutationAnalyzerTest, UniquePtr) {
 
 // section: complex problems detected on real code
 
+TEST(ExprMutationAnalyzerTest, SelfRef) {
+  std::unique_ptr<ASTUnit> AST{};
+  SmallVector<BoundNodes, 1> Results{};
+
+  AST = buildASTFromCodeWithArgs("void f() { int &x = x; }",
+                                 {"-Wno-unused-value", "-Wno-uninitialized"});
+  Results = match(withEnclosingCompound(declRefTo("x")), AST->getASTContext());
+  EXPECT_FALSE(isDeclMutated(Results, AST.get()));
+
+  AST = buildASTFromCodeWithArgs("void f() { int &x = x; x = 1; }",
+                                 {"-Wno-unused-value", "-Wno-uninitialized"});
+  Results = match(withEnclosingCompound(declRefTo("x")), AST->getASTContext());
+  EXPECT_TRUE(isDeclMutated(Results, AST.get()));
+}
+
 TEST(ExprMutationAnalyzerTest, UnevaluatedContext) {
   const std::string Example =
       "template <typename T>"
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index 215e208615ac..301bec32c0cf 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -3309,6 +3309,28 @@ TEST(TransferTest, ResultObjectLocationPropagatesThroughConditionalOperator) {
       });
 }
 
+TEST(TransferTest, ResultObjectLocationDontVisitNestedRecordDecl) {
+  // This is a crash repro.
+  // We used to crash because when propagating result objects, we would visit
+  // nested record and function declarations, but we don't model fields used
+  // only in these.
+  std::string Code = R"(
+    struct S1 {};
+    struct S2 { S1 s1; };
+    void target() {
+      struct Nested {
+        void f() {
+          S2 s2 = { S1() };
+        }
+      };
+    }
+  )";
+  runDataflow(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {});
+}
+
 TEST(TransferTest, StaticCast) {
   std::string Code = R"(
     void target(int Foo) {
@@ -3348,20 +3370,11 @@ TEST(TransferTest, IntegralCast) {
       Code,
       [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
          ASTContext &ASTCtx) {
-        ASSERT_THAT(Results.keys(), UnorderedElementsAre("p"));
         const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
 
-        const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
-        ASSERT_THAT(FooDecl, NotNull());
-
-        const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
-        ASSERT_THAT(BarDecl, NotNull());
-
-        const auto *FooVal = Env.getValue(*FooDecl);
-        const auto *BarVal = Env.getValue(*BarDecl);
-        EXPECT_TRUE(isa<IntegerValue>(FooVal));
-        EXPECT_TRUE(isa<IntegerValue>(BarVal));
-        EXPECT_EQ(FooVal, BarVal);
+        const auto &FooVal = getValueForDecl<IntegerValue>(ASTCtx, Env, "Foo");
+        const auto &BarVal = getValueForDecl<IntegerValue>(ASTCtx, Env, "Bar");
+        EXPECT_EQ(&FooVal, &BarVal);
       });
 }
 
@@ -3376,17 +3389,10 @@ TEST(TransferTest, IntegraltoBooleanCast) {
       Code,
       [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
          ASTContext &ASTCtx) {
-        ASSERT_THAT(Results.keys(), UnorderedElementsAre("p"));
         const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
 
-        const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
-        ASSERT_THAT(FooDecl, NotNull());
-
-        const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
-        ASSERT_THAT(BarDecl, NotNull());
-
-        const auto *FooVal = Env.getValue(*FooDecl);
-        const auto *BarVal = Env.getValue(*BarDecl);
+        const auto &FooVal = getValueForDecl(ASTCtx, Env, "Foo");
+        const auto &BarVal = getValueForDecl(ASTCtx, Env, "Bar");
         EXPECT_TRUE(isa<IntegerValue>(FooVal));
         EXPECT_TRUE(isa<BoolValue>(BarVal));
       });
@@ -3404,23 +3410,38 @@ TEST(TransferTest, IntegralToBooleanCastFromBool) {
       Code,
       [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
          ASTContext &ASTCtx) {
-        ASSERT_THAT(Results.keys(), UnorderedElementsAre("p"));
         const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
 
-        const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
-        ASSERT_THAT(FooDecl, NotNull());
-
-        const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
-        ASSERT_THAT(BarDecl, NotNull());
-
-        const auto *FooVal = Env.getValue(*FooDecl);
-        const auto *BarVal = Env.getValue(*BarDecl);
-        EXPECT_TRUE(isa<BoolValue>(FooVal));
-        EXPECT_TRUE(isa<BoolValue>(BarVal));
-        EXPECT_EQ(FooVal, BarVal);
+        const auto &FooVal = getValueForDecl<BoolValue>(ASTCtx, Env, "Foo");
+        const auto &BarVal = getValueForDecl<BoolValue>(ASTCtx, Env, "Bar");
+        EXPECT_EQ(&FooVal, &BarVal);
       });
 }
 
+TEST(TransferTest, WidenBoolValueInIntegerVariable) {
+  // This is a crash repro.
+  // This test sets up a case where we perform widening on an integer variable
+  // that contains a `BoolValue` for the previous iteration and an
+  // `IntegerValue` for the current iteration. We used to crash on this because
+  // `widenDistinctValues()` assumed that if the previous iteration had a
+  // `BoolValue`, the current iteration would too.
+  // FIXME: The real fix here is to make sure we never store `BoolValue`s in
+  // integer variables; see also the comment in `widenDistinctValues()`.
+  std::string Code = R"cc(
+    struct S {
+      int i;
+      S *next;
+    };
+    void target(S *s) {
+      for (; s; s = s->next)
+        s->i = false;
+    }
+  )cc";
+  runDataflow(Code,
+              [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &,
+                 ASTContext &) {});
+}
+
 TEST(TransferTest, NullToPointerCast) {
   std::string Code = R"(
     using my_nullptr_t = decltype(nullptr);
@@ -5336,6 +5357,38 @@ TEST(TransferTest, ConditionalOperatorLocation) {
       });
 }
 
+TEST(TransferTest, ConditionalOperatorOnConstantExpr) {
+  // This is a regression test: We used to crash when a `ConstantExpr` was used
+  // in the branches of a conditional operator.
+  std::string Code = R"cc(
+    consteval bool identity(bool B) { return B; }
+    void target(bool Cond) {
+      bool JoinTrueTrue = Cond ? identity(true) : identity(true);
+      bool JoinTrueFalse = Cond ? identity(true) : identity(false);
+      // [[p]]
+    }
+  )cc";
+  runDataflow(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        Environment Env = getEnvironmentAtAnnotation(Results, "p").fork();
+
+        auto &JoinTrueTrue =
+            getValueForDecl<BoolValue>(ASTCtx, Env, "JoinTrueTrue");
+        // FIXME: This test documents the current behavior, namely that we
+        // don't actually use the constant result of the `ConstantExpr` and
+        // instead treat it like a normal function call.
+        EXPECT_EQ(JoinTrueTrue.formula().kind(), Formula::Kind::AtomRef);
+        // EXPECT_TRUE(JoinTrueTrue.formula().literal());
+
+        auto &JoinTrueFalse =
+            getValueForDecl<BoolValue>(ASTCtx, Env, "JoinTrueFalse");
+        EXPECT_EQ(JoinTrueFalse.formula().kind(), Formula::Kind::AtomRef);
+      },
+      LangStandard::lang_cxx20);
+}
+
 TEST(TransferTest, IfStmtBranchExtendsFlowCondition) {
   std::string Code = R"(
     void target(bool Foo) {
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index bc61b9c089e9..32ba6b6853c7 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -24507,16 +24507,25 @@ TEST_F(FormatTest, AlternativeOperators) {
   verifyFormat("int a compl(5);");
   verifyFormat("int a not(5);");
 
-  /* FIXME handle alternate tokens
-   * https://en.cppreference.com/w/cpp/language/operator_alternative
-  // alternative tokens
-  verifyFormat("compl foo();");     //  ~foo();
-  verifyFormat("foo() <%%>;");      // foo();
-  verifyFormat("void foo() <%%>;"); // void foo(){}
-  verifyFormat("int a <:1:>;");     // int a[1];[
+  verifyFormat("compl foo();");     // ~foo();
+  verifyFormat("foo() <%%>");       // foo() {}
+  verifyFormat("void foo() <%%>");  // void foo() {}
+  verifyFormat("int a<:1:>;");      // int a[1];
   verifyFormat("%:define ABC abc"); // #define ABC abc
   verifyFormat("%:%:");             // ##
-  */
+
+  verifyFormat("a = v(not;);\n"
+               "b = v(not+);\n"
+               "c = v(not x);\n"
+               "d = v(not 1);\n"
+               "e = v(not 123.f);");
+
+  verifyNoChange("#define ASSEMBLER_INSTRUCTION_LIST(V)  \\\n"
+                 "  V(and)                               \\\n"
+                 "  V(not)                               \\\n"
+                 "  V(not!)                              \\\n"
+                 "  V(other)",
+                 getLLVMStyleWithColumns(40));
 }
 
 TEST_F(FormatTest, STLWhileNotDefineChed) {
@@ -27354,6 +27363,45 @@ TEST_F(FormatTest, BreakAdjacentStringLiterals) {
   verifyFormat(Code, Style);
 }
 
+TEST_F(FormatTest, AlignUTFCommentsAndStringLiterals) {
+  verifyFormat(
+      "int rus;      // А теперь комментарии, например, на русском, 2-байта\n"
+      "int long_rus; // Верхний коммент еще не превысил границу в 80, однако\n"
+      "              // уже отодвинут. Перенос, при этом, отрабатывает верно");
+
+  auto Style = getLLVMStyle();
+  Style.ColumnLimit = 15;
+  verifyNoChange("#define test  \\\n"
+                 "  /* 测试 */  \\\n"
+                 "  \"aa\"        \\\n"
+                 "  \"bb\"",
+                 Style);
+
+  Style.ColumnLimit = 25;
+  verifyFormat("struct foo {\n"
+               "  int iiiiii; ///< iiiiii\n"
+               "  int b;      ///< ыыы\n"
+               "  int c;      ///< ыыыы\n"
+               "};",
+               Style);
+
+  Style.ColumnLimit = 35;
+  verifyFormat("#define SENSOR_DESC_1             \\\n"
+               "  \"{\"                             \\\n"
+               "  \"unit_of_measurement: \\\"°C\\\",\"  \\\n"
+               "  \"}\"",
+               Style);
+
+  Style.ColumnLimit = 80;
+  Style.AlignArrayOfStructures = FormatStyle::AIAS_Left;
+  verifyFormat("Languages languages = {\n"
+               "    Language{{'e', 'n'}, U\"Test English\" },\n"
+               "    Language{{'l', 'v'}, U\"Test Latviešu\"},\n"
+               "    Language{{'r', 'u'}, U\"Test Русский\" },\n"
+               "};",
+               Style);
+}
+
 } // namespace
 } // namespace test
 } // namespace format
diff --git a/clang/unittests/Format/SortIncludesTest.cpp b/clang/unittests/Format/SortIncludesTest.cpp
index 772eb53806b4..824fa0078cd0 100644
--- a/clang/unittests/Format/SortIncludesTest.cpp
+++ b/clang/unittests/Format/SortIncludesTest.cpp
@@ -6,19 +6,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "FormatTestUtils.h"
+#include "FormatTestBase.h"
 #include "clang/Format/Format.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Debug.h"
 #include "gtest/gtest.h"
 
-#define DEBUG_TYPE "format-test"
+#define DEBUG_TYPE "sort-includes-test"
 
 namespace clang {
 namespace format {
 namespace {
 
-class SortIncludesTest : public ::testing::Test {
+class SortIncludesTest : public test::FormatTestBase {
 protected:
   std::vector<tooling::Range> GetCodeRange(StringRef Code) {
     return std::vector<tooling::Range>(1, tooling::Range(0, Code.size()));
@@ -821,6 +821,122 @@ TEST_F(SortIncludesTest, CalculatesCorrectCursorPositionWithRegrouping) {
   EXPECT_EQ(27u, newCursor(Code, 28)); // Start of last line
 }
 
+TEST_F(SortIncludesTest,
+       CalculatesCorrectCursorPositionWhenNoReplacementsWithRegroupingAndCRLF) {
+  Style.IncludeBlocks = Style.IBS_Regroup;
+  FmtStyle.LineEnding = FormatStyle::LE_CRLF;
+  Style.IncludeCategories = {
+      {"^\"a\"", 0, 0, false}, {"^\"b\"", 1, 1, false}, {".*", 2, 2, false}};
+  std::string Code = "#include \"a\"\r\n" // Start of line: 0
+                     "\r\n"               // Start of line: 14
+                     "#include \"b\"\r\n" // Start of line: 16
+                     "\r\n"               // Start of line: 30
+                     "#include \"c\"\r\n" // Start of line: 32
+                     "\r\n"               // Start of line: 46
+                     "int i;";            // Start of line: 48
+  verifyNoChange(Code);
+  EXPECT_EQ(0u, newCursor(Code, 0));
+  EXPECT_EQ(14u, newCursor(Code, 14));
+  EXPECT_EQ(16u, newCursor(Code, 16));
+  EXPECT_EQ(30u, newCursor(Code, 30));
+  EXPECT_EQ(32u, newCursor(Code, 32));
+  EXPECT_EQ(46u, newCursor(Code, 46));
+  EXPECT_EQ(48u, newCursor(Code, 48));
+}
+
+TEST_F(
+    SortIncludesTest,
+    CalculatesCorrectCursorPositionWhenRemoveLinesReplacementsWithRegroupingAndCRLF) {
+  Style.IncludeBlocks = Style.IBS_Regroup;
+  FmtStyle.LineEnding = FormatStyle::LE_CRLF;
+  Style.IncludeCategories = {{".*", 0, 0, false}};
+  std::string Code = "#include \"a\"\r\n"     // Start of line: 0
+                     "\r\n"                   // Start of line: 14
+                     "#include \"b\"\r\n"     // Start of line: 16
+                     "\r\n"                   // Start of line: 30
+                     "#include \"c\"\r\n"     // Start of line: 32
+                     "\r\n"                   // Start of line: 46
+                     "int i;";                // Start of line: 48
+  std::string Expected = "#include \"a\"\r\n" // Start of line: 0
+                         "#include \"b\"\r\n" // Start of line: 14
+                         "#include \"c\"\r\n" // Start of line: 28
+                         "\r\n"               // Start of line: 42
+                         "int i;";            // Start of line: 44
+  EXPECT_EQ(Expected, sort(Code));
+  EXPECT_EQ(0u, newCursor(Code, 0));
+  EXPECT_EQ(
+      14u,
+      newCursor(Code, 14)); // cursor on empty line in include block is ignored
+  EXPECT_EQ(14u, newCursor(Code, 16));
+  EXPECT_EQ(
+      30u,
+      newCursor(Code, 30)); // cursor on empty line in include block is ignored
+  EXPECT_EQ(28u, newCursor(Code, 32));
+  EXPECT_EQ(42u, newCursor(Code, 46));
+  EXPECT_EQ(44u, newCursor(Code, 48));
+}
+
+// FIXME: the tests below should pass.
+#if 0
+TEST_F(
+    SortIncludesTest,
+    CalculatesCorrectCursorPositionWhenNewLineReplacementsWithRegroupingAndCRLF) {
+  Style.IncludeBlocks = Style.IBS_Regroup;
+  FmtStyle.LineEnding = FormatStyle::LE_CRLF;
+  Style.IncludeCategories = {
+      {"^\"a\"", 0, 0, false}, {"^\"b\"", 1, 1, false}, {".*", 2, 2, false}};
+  std::string Code = "#include \"a\"\r\n"     // Start of line: 0
+                     "#include \"b\"\r\n"     // Start of line: 14
+                     "#include \"c\"\r\n"     // Start of line: 28
+                     "\r\n"                   // Start of line: 42
+                     "int i;";                // Start of line: 44
+  std::string Expected = "#include \"a\"\r\n" // Start of line: 0
+                         "\r\n"               // Start of line: 14
+                         "#include \"b\"\r\n" // Start of line: 16
+                         "\r\n"               // Start of line: 30
+                         "#include \"c\"\r\n" // Start of line: 32
+                         "\r\n"               // Start of line: 46
+                         "int i;";            // Start of line: 48
+  EXPECT_EQ(Expected, sort(Code));
+  EXPECT_EQ(0u, newCursor(Code, 0));
+  EXPECT_EQ(15u, newCursor(Code, 16));
+  EXPECT_EQ(30u, newCursor(Code, 32));
+  EXPECT_EQ(44u, newCursor(Code, 46));
+  EXPECT_EQ(46u, newCursor(Code, 48));
+}
+
+TEST_F(
+    SortIncludesTest,
+    CalculatesCorrectCursorPositionWhenNoNewLineReplacementsWithRegroupingAndCRLF) {
+  Style.IncludeBlocks = Style.IBS_Regroup;
+  FmtStyle.LineEnding = FormatStyle::LE_CRLF;
+  Style.IncludeCategories = {
+      {"^\"a\"", 0, 0, false}, {"^\"b\"", 1, 1, false}, {".*", 2, 2, false}};
+  std::string Code = "#include \"a\"\r\n"     // Start of line: 0
+                     "\r\n"                   // Start of line: 14
+                     "#include \"c\"\r\n"     // Start of line: 16
+                     "\r\n"                   // Start of line: 30
+                     "#include \"b\"\r\n"     // Start of line: 32
+                     "\r\n"                   // Start of line: 46
+                     "int i;";                // Start of line: 48
+  std::string Expected = "#include \"a\"\r\n" // Start of line: 0
+                         "\r\n"               // Start of line: 14
+                         "#include \"b\"\r\n" // Start of line: 16
+                         "\r\n"               // Start of line: 30
+                         "#include \"c\"\r\n" // Start of line: 32
+                         "\r\n"               // Start of line: 46
+                         "int i;";            // Start of line: 48
+  EXPECT_EQ(Expected, sort(Code));
+  EXPECT_EQ(0u, newCursor(Code, 0));
+  EXPECT_EQ(14u, newCursor(Code, 14));
+  EXPECT_EQ(30u, newCursor(Code, 32));
+  EXPECT_EQ(30u, newCursor(Code, 30));
+  EXPECT_EQ(15u, newCursor(Code, 15));
+  EXPECT_EQ(44u, newCursor(Code, 46));
+  EXPECT_EQ(46u, newCursor(Code, 48));
+}
+#endif
+
 TEST_F(SortIncludesTest, DeduplicateIncludes) {
   EXPECT_EQ("#include <a>\n"
             "#include <b>\n"
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 6b8ab441cb46..01daf8dee505 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -438,6 +438,11 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
   EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_StructLBrace);
   EXPECT_TOKEN(Tokens[3], tok::r_brace, TT_StructRBrace);
 
+  Tokens = annotate("struct macro(a) S {};");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_StructLBrace);
+  EXPECT_TOKEN(Tokens[7], tok::r_brace, TT_StructRBrace);
+
   Tokens = annotate("struct EXPORT_MACRO [[nodiscard]] C { int i; };");
   ASSERT_EQ(Tokens.size(), 15u) << Tokens;
   EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_StructLBrace);
@@ -448,6 +453,14 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
   EXPECT_TOKEN(Tokens[12], tok::l_brace, TT_StructLBrace);
   EXPECT_TOKEN(Tokens[16], tok::r_brace, TT_StructRBrace);
 
+  Tokens = annotate("struct macro(a) S {\n"
+                    "  void f(T &t);\n"
+                    "};");
+  ASSERT_EQ(Tokens.size(), 18u) << Tokens;
+  EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_StructLBrace);
+  EXPECT_TOKEN(Tokens[11], tok::amp, TT_PointerOrReference);
+  EXPECT_TOKEN(Tokens[15], tok::r_brace, TT_StructRBrace);
+
   Tokens = annotate("template <typename T> struct S<const T[N]> {};");
   ASSERT_EQ(Tokens.size(), 18u) << Tokens;
   EXPECT_TOKEN(Tokens[7], tok::less, TT_TemplateOpener);
@@ -2915,6 +2928,23 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
   ASSERT_EQ(Tokens.size(), 10u) << Tokens;
   EXPECT_BRACE_KIND(Tokens[6], BK_BracedInit);
   EXPECT_BRACE_KIND(Tokens[7], BK_BracedInit);
+
+  Tokens = annotate("#ifdef DEBUG_ENABLED\n"
+                    "#else\n"
+                    "#endif\n"
+                    "class RenderingServer : Object {\n"
+                    "#ifndef DISABLE_DEPRECATED\n"
+                    "  enum Features {\n"
+                    "    FEATURE_SHADERS,\n"
+                    "    FEATURE_MULTITHREADED,\n"
+                    "  };\n"
+                    "#endif\n"
+                    "};");
+  ASSERT_EQ(Tokens.size(), 29u) << Tokens;
+  EXPECT_BRACE_KIND(Tokens[11], BK_Block);
+  EXPECT_BRACE_KIND(Tokens[17], BK_Block);
+  EXPECT_BRACE_KIND(Tokens[22], BK_Block);
+  EXPECT_BRACE_KIND(Tokens[26], BK_Block);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandsElaboratedTypeSpecifier) {
diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index 0d1365f09291..aafbf1f40949 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -1822,28 +1822,101 @@ void WriteSemanticSpellingSwitch(const std::string &VarName,
   OS << "  }\n";
 }
 
+// Note: these values need to match the values used by LateAttrParseKind in
+// `Attr.td`
+enum class LateAttrParseKind { Never = 0, Standard = 1, ExperimentalExt = 2 };
+
+static LateAttrParseKind getLateAttrParseKind(const Record *Attr) {
+  // This function basically does
+  // `Attr->getValueAsDef("LateParsed")->getValueAsInt("Kind")` but does a bunch
+  // of sanity checking to ensure that `LateAttrParseMode` in `Attr.td` is in
+  // sync with the `LateAttrParseKind` enum in this source file.
+
+  static constexpr StringRef LateParsedStr = "LateParsed";
+  static constexpr StringRef LateAttrParseKindStr = "LateAttrParseKind";
+  static constexpr StringRef KindFieldStr = "Kind";
+
+  auto *LAPK = Attr->getValueAsDef(LateParsedStr);
+
+  // Typecheck the `LateParsed` field.
+  SmallVector<Record *, 1> SuperClasses;
+  LAPK->getDirectSuperClasses(SuperClasses);
+  if (SuperClasses.size() != 1)
+    PrintFatalError(Attr, "Field `" + llvm::Twine(LateParsedStr) +
+                              "`should only have one super class");
+
+  if (SuperClasses[0]->getName().compare(LateAttrParseKindStr) != 0)
+    PrintFatalError(Attr, "Field `" + llvm::Twine(LateParsedStr) +
+                              "`should only have type `" +
+                              llvm::Twine(LateAttrParseKindStr) +
+                              "` but found type `" +
+                              SuperClasses[0]->getName() + "`");
+
+  // Get Kind and verify the enum name matches the name in `Attr.td`.
+  unsigned Kind = LAPK->getValueAsInt(KindFieldStr);
+  switch (LateAttrParseKind(Kind)) {
+#define CASE(X)                                                                \
+  case LateAttrParseKind::X:                                                   \
+    if (LAPK->getName().compare("LateAttrParse" #X) != 0) {                    \
+      PrintFatalError(Attr,                                                    \
+                      "Field `" + llvm::Twine(LateParsedStr) + "` set to `" +  \
+                          LAPK->getName() +                                    \
+                          "` but this converts to `LateAttrParseKind::" +      \
+                          llvm::Twine(#X) + "`");                              \
+    }                                                                          \
+    return LateAttrParseKind::X;
+
+    CASE(Never)
+    CASE(Standard)
+    CASE(ExperimentalExt)
+#undef CASE
+  }
+
+  // The Kind value is completely invalid
+  auto KindValueStr = llvm::utostr(Kind);
+  PrintFatalError(Attr, "Field `" + llvm::Twine(LateParsedStr) + "` set to `" +
+                            LAPK->getName() + "` has unexpected `" +
+                            llvm::Twine(KindFieldStr) + "` value of " +
+                            KindValueStr);
+}
+
 // Emits the LateParsed property for attributes.
-static void emitClangAttrLateParsedList(RecordKeeper &Records, raw_ostream &OS) {
-  OS << "#if defined(CLANG_ATTR_LATE_PARSED_LIST)\n";
-  std::vector<Record*> Attrs = Records.getAllDerivedDefinitions("Attr");
+static void emitClangAttrLateParsedListImpl(RecordKeeper &Records,
+                                            raw_ostream &OS,
+                                            LateAttrParseKind LateParseMode) {
+  std::vector<Record *> Attrs = Records.getAllDerivedDefinitions("Attr");
 
   for (const auto *Attr : Attrs) {
-    bool LateParsed = Attr->getValueAsBit("LateParsed");
+    if (LateAttrParseKind LateParsed = getLateAttrParseKind(Attr);
+        LateParsed != LateParseMode)
+      continue;
 
-    if (LateParsed) {
-      std::vector<FlattenedSpelling> Spellings = GetFlattenedSpellings(*Attr);
+    std::vector<FlattenedSpelling> Spellings = GetFlattenedSpellings(*Attr);
 
-      // FIXME: Handle non-GNU attributes
-      for (const auto &I : Spellings) {
-        if (I.variety() != "GNU")
-          continue;
-        OS << ".Case(\"" << I.name() << "\", " << LateParsed << ")\n";
-      }
+    // FIXME: Handle non-GNU attributes
+    for (const auto &I : Spellings) {
+      if (I.variety() != "GNU")
+        continue;
+      OS << ".Case(\"" << I.name() << "\", 1)\n";
     }
   }
+}
+
+static void emitClangAttrLateParsedList(RecordKeeper &Records,
+                                        raw_ostream &OS) {
+  OS << "#if defined(CLANG_ATTR_LATE_PARSED_LIST)\n";
+  emitClangAttrLateParsedListImpl(Records, OS, LateAttrParseKind::Standard);
   OS << "#endif // CLANG_ATTR_LATE_PARSED_LIST\n\n";
 }
 
+static void emitClangAttrLateParsedExperimentalList(RecordKeeper &Records,
+                                                    raw_ostream &OS) {
+  OS << "#if defined(CLANG_ATTR_LATE_PARSED_EXPERIMENTAL_EXT_LIST)\n";
+  emitClangAttrLateParsedListImpl(Records, OS,
+                                  LateAttrParseKind::ExperimentalExt);
+  OS << "#endif // CLANG_ATTR_LATE_PARSED_EXPERIMENTAL_EXT_LIST\n\n";
+}
+
 static bool hasGNUorCXX11Spelling(const Record &Attribute) {
   std::vector<FlattenedSpelling> Spellings = GetFlattenedSpellings(Attribute);
   for (const auto &I : Spellings) {
@@ -2101,9 +2174,21 @@ bool PragmaClangAttributeSupport::isAttributedSupported(
     return SpecifiedResult;
 
   // Opt-out rules:
-  // An attribute requires delayed parsing (LateParsed is on)
-  if (Attribute.getValueAsBit("LateParsed"))
+
+  // An attribute requires delayed parsing (LateParsed is on).
+  switch (getLateAttrParseKind(&Attribute)) {
+  case LateAttrParseKind::Never:
+    break;
+  case LateAttrParseKind::Standard:
+    return false;
+  case LateAttrParseKind::ExperimentalExt:
+    // This is only late parsed in certain parsing contexts when
+    // `LangOpts.ExperimentalLateParseAttributes` is true. Information about the
+    // parsing context and `LangOpts` is not available in this method so just
+    // opt this attribute out.
     return false;
+  }
+
   // An attribute has no GNU/CXX11 spelling
   if (!hasGNUorCXX11Spelling(Attribute))
     return false;
@@ -2885,8 +2970,27 @@ static void emitAttributes(RecordKeeper &Records, raw_ostream &OS,
         return;
       }
       OS << "\n  : " << SuperName << "(Ctx, CommonInfo, ";
-      OS << "attr::" << R.getName() << ", "
-         << (R.getValueAsBit("LateParsed") ? "true" : "false");
+      OS << "attr::" << R.getName() << ", ";
+
+      // Handle different late parsing modes.
+      OS << "/*IsLateParsed=*/";
+      switch (getLateAttrParseKind(&R)) {
+      case LateAttrParseKind::Never:
+        OS << "false";
+        break;
+      case LateAttrParseKind::ExperimentalExt:
+        // Currently no clients need to know the distinction between `Standard`
+        // and `ExperimentalExt` so treat `ExperimentalExt` just like
+        // `Standard` for now.
+      case LateAttrParseKind::Standard:
+        // Note: This is misleading. `IsLateParsed` doesn't mean the
+        // attribute was actually late parsed. Instead it means the attribute in
+        // `Attr.td` is marked as being late parsed. Maybe it should be called
+        // `IsLateParseable`?
+        OS << "true";
+        break;
+      }
+
       if (Inheritable) {
         OS << ", "
            << (R.getValueAsBit("InheritEvenIfAlreadyPresent") ? "true"
@@ -4843,6 +4947,7 @@ void EmitClangAttrParserStringSwitches(RecordKeeper &Records, raw_ostream &OS) {
   emitClangAttrAcceptsExprPack(Records, OS);
   emitClangAttrTypeArgList(Records, OS);
   emitClangAttrLateParsedList(Records, OS);
+  emitClangAttrLateParsedExperimentalList(Records, OS);
 }
 
 void EmitClangAttrSubjectMatchRulesParserStringSwitches(RecordKeeper &Records,
diff --git a/clang/utils/creduce-clang-crash.py b/clang/utils/creduce-clang-crash.py
index 27361bb88505..4d0c8224d8b4 100755
--- a/clang/utils/creduce-clang-crash.py
+++ b/clang/utils/creduce-clang-crash.py
@@ -15,7 +15,6 @@ import shutil
 import stat
 import sys
 import subprocess
-import pipes
 import shlex
 import tempfile
 import shutil
@@ -61,7 +60,7 @@ def check_cmd(cmd_name, cmd_dir, cmd_path=None):
 
 
 def quote_cmd(cmd):
-    return " ".join(pipes.quote(arg) for arg in cmd)
+    return " ".join(shlex.quote(arg) for arg in cmd)
 
 
 def write_to_script(text, filename):
@@ -220,7 +219,7 @@ fi
         )
 
         for msg in self.expected_output:
-            output += "grep -F %s t.log || exit 1\n" % pipes.quote(msg)
+            output += "grep -F %s t.log || exit 1\n" % shlex.quote(msg)
 
         write_to_script(output, self.testfile)
         self.check_interestingness()
@@ -318,9 +317,17 @@ fi
         interestingness test takes to run.
         """
         print("\nSimplifying the clang command...")
+        new_args = self.clang_args
+
+        # Remove the color diagnostics flag to make it easier to match error
+        # text.
+        new_args = self.try_remove_args(
+            new_args,
+            msg="Removed -fcolor-diagnostics",
+            opts_equal=["-fcolor-diagnostics"],
+        )
 
         # Remove some clang arguments to speed up the interestingness test
-        new_args = self.clang_args
         new_args = self.try_remove_args(
             new_args,
             msg="Removed debug info options",
diff --git a/clang/www/analyzer/alpha_checks.html b/clang/www/analyzer/alpha_checks.html
index f040d1957b0f..2c8eece41fb2 100644
--- a/clang/www/analyzer/alpha_checks.html
+++ b/clang/www/analyzer/alpha_checks.html
@@ -910,63 +910,6 @@ void test() {
 </pre></div></div></td></tr>
 
 
-<tr><td><a id="alpha.unix.Stream"><div class="namedescr expandable"><span class="name">
-alpha.unix.Stream</span><span class="lang">
-(C)</span><div class="descr">
-Check stream handling functions:<div class=functions>fopen<br>
-tmpfile<br>
-fclose<br>
-fread<br>
-fwrite<br>
-fseek<br>
-ftell<br>
-rewind<br>
-fgetpos<br>
-fsetpos<br>
-clearerr<br>
-feof<br>
-ferror<br>
-fileno</div></div></div></a></td>
-<td><div class="exampleContainer expandable">
-<div class="example"><pre>
-void test() {
-  FILE *p = fopen("foo", "r");
-} // warn: opened file is never closed
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-void test() {
-  FILE *p = fopen("foo", "r");
-  fseek(p, 1, SEEK_SET); // warn: stream pointer might be NULL
-  fclose(p);
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-void test() {
-  FILE *p = fopen("foo", "r");
-
-  if (p)
-    fseek(p, 1, 3);
-     // warn: third arg should be SEEK_SET, SEEK_END, or SEEK_CUR
-
-  fclose(p);
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-void test() {
-  FILE *p = fopen("foo", "r");
-  fclose(p);
-  fclose(p); // warn: already closed
-}
-</pre></div><div class="separator"></div>
-<div class="example"><pre>
-void test() {
-  FILE *p = tmpfile();
-  ftell(p); // warn: stream pointer might be NULL
-  fclose(p);
-}
-</pre></div></div></td></tr>
-
-
 <tr><td><a id="alpha.unix.cstring.BufferOverlap"><div class="namedescr expandable"><span class="name">
 alpha.unix.cstring.BufferOverlap</span><span class="lang">
 (C)</span><div class="descr">
diff --git a/clang/www/analyzer/open_projects.html b/clang/www/analyzer/open_projects.html
index 8e06ce2cb528..a7c99c6e25bb 100644
--- a/clang/www/analyzer/open_projects.html
+++ b/clang/www/analyzer/open_projects.html
@@ -47,22 +47,6 @@ mailing list</a> to notify other members of the community.</p>
       <a href="https://en.wikipedia.org/wiki/Taint_checking">tainted</a> index values.
       <p><i>(Difficulty: Medium)</i></p></p>
       </li>
-
-      <li><code>alpha.unix.StreamChecker</code>
-        <p>A SimpleStreamChecker has been presented in the Building a Checker in 24
-        Hours talk
-        (<a href="https://llvm.org/devmtg/2012-11/Zaks-Rose-Checker24Hours.pdf">slides</a>
-        <a href="https://youtu.be/kdxlsP5QVPw">video</a>).</p>
-
-        <p>This alpha checker is an attempt to write a production grade stream checker.
-        However, it was found to have an unacceptably high false positive rate.
-        One of the found problems was that eagerly splitting the state
-        based on whether the system call may fail leads to too many reports.
-        A <em>delayed</em> split where the implication is stored in the state
-        (similarly to nullability implications in <code>TrustNonnullChecker</code>)
-        may produce much better results.</p>
-        <p><i>(Difficulty: Medium)</i></p>
-      </li>
     </ul>
   </li>
 
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 83b71e7c122d..875521bd505d 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -1435,7 +1435,7 @@ accessible?</td>
   </tr>
   <tr class="open" id="233">
     <td><a href="https://cplusplus.github.io/CWG/issues/233.html">233</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>References vs pointers in UDC overload resolution</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -2756,7 +2756,7 @@ of class templates</td>
   </tr>
   <tr id="453">
     <td><a href="https://cplusplus.github.io/CWG/issues/453.html">453</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>References may only bind to &#8220;valid&#8221; objects</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -5812,7 +5812,7 @@ and <I>POD class</I></td>
   </tr>
   <tr class="open" id="1001">
     <td><a href="https://cplusplus.github.io/CWG/issues/1001.html">1001</a></td>
-    <td>drafting</td>
+    <td>review</td>
     <td>Parameter type adjustment in dependent parameter types</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -6034,7 +6034,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="1038">
     <td><a href="https://cplusplus.github.io/CWG/issues/1038.html">1038</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Overload resolution of <TT>&amp;x.static_func</TT></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -9994,7 +9994,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="1698">
     <td><a href="https://cplusplus.github.io/CWG/issues/1698.html">1698</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Files ending in <TT>\</TT></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -10132,7 +10132,7 @@ and <I>POD class</I></td>
   </tr>
   <tr class="open" id="1721">
     <td><a href="https://cplusplus.github.io/CWG/issues/1721.html">1721</a></td>
-    <td>drafting</td>
+    <td>review</td>
     <td>Diagnosing ODR violations for static data members</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -11312,11 +11312,11 @@ and <I>POD class</I></td>
     <td>decltype-qualified enumeration names</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="1918">
+  <tr id="1918">
     <td><a href="https://cplusplus.github.io/CWG/issues/1918.html">1918</a></td>
-    <td>open</td>
+    <td>CD5</td>
     <td><TT>friend</TT> templates with dependent scopes</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="1919">
     <td><a href="https://cplusplus.github.io/CWG/issues/1919.html">1919</a></td>
@@ -11474,11 +11474,11 @@ and <I>POD class</I></td>
     <td>New C incompatibilities</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="1945">
+  <tr id="1945">
     <td><a href="https://cplusplus.github.io/CWG/issues/1945.html">1945</a></td>
-    <td>open</td>
+    <td>CD5</td>
     <td>Friend declarations naming members of class templates in non-templates</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="1946">
     <td><a href="https://cplusplus.github.io/CWG/issues/1946.html">1946</a></td>
@@ -11530,7 +11530,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="1954">
     <td><a href="https://cplusplus.github.io/CWG/issues/1954.html">1954</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td><TT>typeid</TT> null dereference check in subexpressions</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -12098,11 +12098,11 @@ and <I>POD class</I></td>
     <td>C-style casts that cast away constness vs <TT>static_cast</TT></td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2049">
+  <tr id="2049">
     <td><a href="https://cplusplus.github.io/CWG/issues/2049.html">2049</a></td>
-    <td>drafting</td>
+    <td>DRWP</td>
     <td>List initializer in non-type template default argument</td>
-    <td title="Clang 18 implements P2308R1 resolution" align="center">Not Resolved*</td>
+    <td class="full" align="center">Clang 18</td>
   </tr>
   <tr id="2050">
     <td><a href="https://cplusplus.github.io/CWG/issues/2050.html">2050</a></td>
@@ -12130,7 +12130,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2054">
     <td><a href="https://cplusplus.github.io/CWG/issues/2054.html">2054</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Missing description of class SFINAE</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -12418,7 +12418,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2102">
     <td><a href="https://cplusplus.github.io/CWG/issues/2102.html">2102</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Constructor checking in <I>new-expression</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -12698,11 +12698,11 @@ and <I>POD class</I></td>
     <td>Thread storage duration and order of initialization</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2149">
+  <tr id="2149">
     <td><a href="https://cplusplus.github.io/CWG/issues/2149.html">2149</a></td>
-    <td>drafting</td>
+    <td>DR</td>
     <td>Brace elision and array length deduction</td>
-    <td align="center">Not resolved</td>
+    <td class="full" align="center">Clang 3.1</td>
   </tr>
   <tr id="2150">
     <td><a href="https://cplusplus.github.io/CWG/issues/2150.html">2150</a></td>
@@ -13318,7 +13318,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2252">
     <td><a href="https://cplusplus.github.io/CWG/issues/2252.html">2252</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Enumeration list-initialization from the same type</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -14410,7 +14410,7 @@ and <I>POD class</I></td>
   </tr>
   <tr class="open" id="2434">
     <td><a href="https://cplusplus.github.io/CWG/issues/2434.html">2434</a></td>
-    <td>open</td>
+    <td>review</td>
     <td>Mandatory copy elision vs non-class objects</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -14504,11 +14504,11 @@ and <I>POD class</I></td>
     <td>Thunks as an implementation technique for pointers to virtual functions</td>
     <td align="center">Extension</td>
   </tr>
-  <tr class="open" id="2450">
+  <tr id="2450">
     <td><a href="https://cplusplus.github.io/CWG/issues/2450.html">2450</a></td>
-    <td>review</td>
+    <td>DRWP</td>
     <td><I>braced-init-list</I> as a <I>template-argument</I></td>
-    <td title="Clang 18 implements P2308R1 resolution" align="center">Not Resolved*</td>
+    <td class="full" align="center">Clang 18</td>
   </tr>
   <tr id="2451">
     <td><a href="https://cplusplus.github.io/CWG/issues/2451.html">2451</a></td>
@@ -14558,11 +14558,11 @@ and <I>POD class</I></td>
     <td>Value category of expressions denoting non-static member functions</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2459">
+  <tr id="2459">
     <td><a href="https://cplusplus.github.io/CWG/issues/2459.html">2459</a></td>
-    <td>drafting</td>
+    <td>DRWP</td>
     <td>Template parameter initialization</td>
-    <td title="Clang 18 implements P2308R1 resolution" align="center">Not Resolved*</td>
+    <td class="full" align="center">Clang 18</td>
   </tr>
   <tr id="2460">
     <td><a href="https://cplusplus.github.io/CWG/issues/2460.html">2460</a></td>
@@ -14662,7 +14662,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2476">
     <td><a href="https://cplusplus.github.io/CWG/issues/2476.html">2476</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td><I>placeholder-type-specifier</I>s and function declarators</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -14830,7 +14830,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2504">
     <td><a href="https://cplusplus.github.io/CWG/issues/2504.html">2504</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Inheriting constructors from virtual base classes</td>
     <td class="none" align="center">No</td>
   </tr>
@@ -14992,7 +14992,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2531">
     <td><a href="https://cplusplus.github.io/CWG/issues/2531.html">2531</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Static data members redeclared as constexpr</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15002,11 +15002,11 @@ and <I>POD class</I></td>
     <td>Kind of pointer value returned by <TT>new T[0]</TT></td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2533">
+  <tr id="2533">
     <td><a href="https://cplusplus.github.io/CWG/issues/2533.html">2533</a></td>
-    <td>review</td>
+    <td>DR</td>
     <td>Storage duration of implicitly created objects</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2534">
     <td><a href="https://cplusplus.github.io/CWG/issues/2534.html">2534</a></td>
@@ -15082,13 +15082,13 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2546">
     <td><a href="https://cplusplus.github.io/CWG/issues/2546.html">2546</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Defaulted secondary comparison operators defined as deleted</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2547">
     <td><a href="https://cplusplus.github.io/CWG/issues/2547.html">2547</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Defaulted comparison operator function for non-classes</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15142,7 +15142,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2556">
     <td><a href="https://cplusplus.github.io/CWG/issues/2556.html">2556</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Unusable <TT>promise::return_void</TT></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15166,15 +15166,15 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2560">
     <td><a href="https://cplusplus.github.io/CWG/issues/2560.html">2560</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Parameter type determination in a <I>requirement-parameter-list</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2561">
     <td><a href="https://cplusplus.github.io/CWG/issues/2561.html">2561</a></td>
-    <td>review</td>
+    <td>tentatively ready</td>
     <td>Conversion to function pointer for lambda with explicit object parameter</td>
-    <td title="Clang 18 implements 2023-11-09 resolution" align="center">Not Resolved*</td>
+    <td title="Clang does not implement 2024-03-18 resolution" align="center">Not Resolved*</td>
   </tr>
   <tr class="open" id="2562">
     <td><a href="https://cplusplus.github.io/CWG/issues/2562.html">2562</a></td>
@@ -15214,7 +15214,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2568">
     <td><a href="https://cplusplus.github.io/CWG/issues/2568.html">2568</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Access checking during synthesis of defaulted comparison operator</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15226,7 +15226,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2570">
     <td><a href="https://cplusplus.github.io/CWG/issues/2570.html">2570</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Clarify constexpr for defaulted functions</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15334,7 +15334,7 @@ and <I>POD class</I></td>
   </tr>
   <tr class="open" id="2588">
     <td><a href="https://cplusplus.github.io/CWG/issues/2588.html">2588</a></td>
-    <td>drafting</td>
+    <td>tentatively ready</td>
     <td>friend declarations and module linkage</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -15352,7 +15352,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2591">
     <td><a href="https://cplusplus.github.io/CWG/issues/2591.html">2591</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Implicit change of active union member for anonymous union in union</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15376,7 +15376,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2595">
     <td><a href="https://cplusplus.github.io/CWG/issues/2595.html">2595</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>"More constrained" for eligible special member functions</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15406,7 +15406,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2600">
     <td><a href="https://cplusplus.github.io/CWG/issues/2600.html">2600</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Type dependency of placeholder types</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15574,7 +15574,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2628">
     <td><a href="https://cplusplus.github.io/CWG/issues/2628.html">2628</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Implicit deduction guides should propagate constraints</td>
     <td class="none" align="center">No</td>
   </tr>
@@ -15610,7 +15610,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2634">
     <td><a href="https://cplusplus.github.io/CWG/issues/2634.html">2634</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Avoid circularity in specification of scope for friend class declarations</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15628,13 +15628,13 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2637">
     <td><a href="https://cplusplus.github.io/CWG/issues/2637.html">2637</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Injected-class-name as a <I>simple-template-id</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2638">
     <td><a href="https://cplusplus.github.io/CWG/issues/2638.html">2638</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Improve the example for initializing by initializer list</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15748,7 +15748,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2657">
     <td><a href="https://cplusplus.github.io/CWG/issues/2657.html">2657</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Cv-qualification adjustment when binding reference to temporary</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15770,11 +15770,11 @@ and <I>POD class</I></td>
     <td>Confusing term "this parameter"</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2661">
+  <tr id="2661">
     <td><a href="https://cplusplus.github.io/CWG/issues/2661.html">2661</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Missing disambiguation rule for <I>pure-specifier</I> vs. <I>brace-or-equal-initializer</I></td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2662">
     <td><a href="https://cplusplus.github.io/CWG/issues/2662.html">2662</a></td>
@@ -15814,7 +15814,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2668">
     <td><a href="https://cplusplus.github.io/CWG/issues/2668.html">2668</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td><TT>co_await</TT> in a <I>lambda-expression</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -15838,7 +15838,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2672">
     <td><a href="https://cplusplus.github.io/CWG/issues/2672.html">2672</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Lambda body SFINAE is still required, contrary to intent and note</td>
     <td class="full" align="center">Clang 18</td>
   </tr>
@@ -15940,7 +15940,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2689">
     <td><a href="https://cplusplus.github.io/CWG/issues/2689.html">2689</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Are cv-qualified <TT>std::nullptr_t</TT> fundamental types?</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16004,11 +16004,11 @@ and <I>POD class</I></td>
     <td>Inconsistency of <I>throw-expression</I> specification</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2700">
+  <tr id="2700">
     <td><a href="https://cplusplus.github.io/CWG/issues/2700.html">2700</a></td>
-    <td>review</td>
+    <td>DR</td>
     <td><TT>#error</TT> disallows existing implementation practice</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2701">
     <td><a href="https://cplusplus.github.io/CWG/issues/2701.html">2701</a></td>
@@ -16048,7 +16048,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2707">
     <td><a href="https://cplusplus.github.io/CWG/issues/2707.html">2707</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Deduction guides cannot have a trailing <I>requires-clause</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16088,11 +16088,11 @@ and <I>POD class</I></td>
     <td>Initialization of reference-to-aggregate from designated initializer list</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2714">
+  <tr id="2714">
     <td><a href="https://cplusplus.github.io/CWG/issues/2714.html">2714</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Implicit deduction guides omit properties from the parameter-declaration-clause of a constructor</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2715">
     <td><a href="https://cplusplus.github.io/CWG/issues/2715.html">2715</a></td>
@@ -16156,7 +16156,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2725">
     <td><a href="https://cplusplus.github.io/CWG/issues/2725.html">2725</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Overload resolution for non-call of class member access</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16174,7 +16174,7 @@ and <I>POD class</I></td>
   </tr>
   <tr class="open" id="2728">
     <td><a href="https://cplusplus.github.io/CWG/issues/2728.html">2728</a></td>
-    <td>open</td>
+    <td>tentatively ready</td>
     <td>Evaluation of conversions in a <I>delete-expression</I></td>
     <td align="center">Not resolved</td>
   </tr>
@@ -16204,7 +16204,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2733">
     <td><a href="https://cplusplus.github.io/CWG/issues/2733.html">2733</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Applying <TT>[[maybe_unused]]</TT> to a label</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16228,7 +16228,7 @@ and <I>POD class</I></td>
   </tr>
   <tr class="open" id="2737">
     <td><a href="https://cplusplus.github.io/CWG/issues/2737.html">2737</a></td>
-    <td>open</td>
+    <td>review</td>
     <td>Temporary lifetime extension for reference init-captures</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -16258,7 +16258,7 @@ and <I>POD class</I></td>
   </tr>
   <tr class="open" id="2742">
     <td><a href="https://cplusplus.github.io/CWG/issues/2742.html">2742</a></td>
-    <td>open</td>
+    <td>drafting</td>
     <td>Guaranteed copy elision for brace-initialization from prvalue</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -16274,33 +16274,33 @@ and <I>POD class</I></td>
     <td>Multiple objects of the same type at the same address</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2745">
+  <tr id="2745">
     <td><a href="https://cplusplus.github.io/CWG/issues/2745.html">2745</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Dependent odr-use in generic lambdas</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2746">
+  <tr id="2746">
     <td><a href="https://cplusplus.github.io/CWG/issues/2746.html">2746</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Checking of default template arguments</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2747">
     <td><a href="https://cplusplus.github.io/CWG/issues/2747.html">2747</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Cannot depend on an already-deleted splice</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2748">
     <td><a href="https://cplusplus.github.io/CWG/issues/2748.html">2748</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Accessing static data members via null pointer</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2749">
     <td><a href="https://cplusplus.github.io/CWG/issues/2749.html">2749</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Treatment of "pointer to void" for relational comparisons</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16324,19 +16324,19 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2753">
     <td><a href="https://cplusplus.github.io/CWG/issues/2753.html">2753</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Storage reuse for string literal objects and backing arrays</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2754">
     <td><a href="https://cplusplus.github.io/CWG/issues/2754.html">2754</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Using *this in explicit object member functions that are coroutines</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2755">
     <td><a href="https://cplusplus.github.io/CWG/issues/2755.html">2755</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Incorrect wording applied by P2738R1</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16354,43 +16354,43 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2758">
     <td><a href="https://cplusplus.github.io/CWG/issues/2758.html">2758</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>What is "access and ambiguity control"?</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2759">
     <td><a href="https://cplusplus.github.io/CWG/issues/2759.html">2759</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>[[no_unique_address] and common initial sequence</td>
     <td class="unreleased" align="center">Clang 19</td>
   </tr>
   <tr id="2760">
     <td><a href="https://cplusplus.github.io/CWG/issues/2760.html">2760</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Defaulted constructor that is an immediate function</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2761">
     <td><a href="https://cplusplus.github.io/CWG/issues/2761.html">2761</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Implicitly invoking the deleted destructor of an anonymous union member</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2762">
     <td><a href="https://cplusplus.github.io/CWG/issues/2762.html">2762</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Type of implicit object parameter</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2763">
     <td><a href="https://cplusplus.github.io/CWG/issues/2763.html">2763</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Ignorability of [[noreturn]] during constant evaluation</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2764">
     <td><a href="https://cplusplus.github.io/CWG/issues/2764.html">2764</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Use of placeholders affecting name mangling</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16403,7 +16403,8 @@ and <I>POD class</I></td>
   <tr class="open" id="2766">
     <td><a href="https://cplusplus.github.io/CWG/issues/2766.html">2766</a></td>
     <td>open</td>
-    <td>Repeated evaluation of a <I>string-literal</I> may yield different objects</td>
+    <td>Repeated evaluation of a <I>string-literal</I> may yield different
+objects</td>
     <td align="center">Not resolved</td>
   </tr>
   <tr class="open" id="2767">
@@ -16414,7 +16415,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2768">
     <td><a href="https://cplusplus.github.io/CWG/issues/2768.html">2768</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Assignment to enumeration variable with a <I>braced-init-list</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16430,15 +16431,15 @@ and <I>POD class</I></td>
     <td>Trailing <I>requires-clause</I> can refer to function parameters before they are substituted into</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2771">
+  <tr id="2771">
     <td><a href="https://cplusplus.github.io/CWG/issues/2771.html">2771</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Transformation for <I>unqualified-id</I>s in address operator</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2772">
     <td><a href="https://cplusplus.github.io/CWG/issues/2772.html">2772</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Missing Annex C entry for linkage effects of <I>linkage-specification</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16456,7 +16457,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2775">
     <td><a href="https://cplusplus.github.io/CWG/issues/2775.html">2775</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Unclear argument type for copy of exception object</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16466,15 +16467,15 @@ and <I>POD class</I></td>
     <td>Substitution failure and implementation limits</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2777">
+  <tr id="2777">
     <td><a href="https://cplusplus.github.io/CWG/issues/2777.html">2777</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Type of <I>id-expression</I> denoting a template parameter object</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2778">
     <td><a href="https://cplusplus.github.io/CWG/issues/2778.html">2778</a></td>
-    <td>open</td>
+    <td>review</td>
     <td>Trivial destructor does not imply constant destruction</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -16486,7 +16487,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2780">
     <td><a href="https://cplusplus.github.io/CWG/issues/2780.html">2780</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td><TT>reinterpret_cast</TT> to reference to function types</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16504,7 +16505,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2783">
     <td><a href="https://cplusplus.github.io/CWG/issues/2783.html">2783</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Handling of deduction guides in <I>global-module-fragment</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16516,7 +16517,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2785">
     <td><a href="https://cplusplus.github.io/CWG/issues/2785.html">2785</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Type-dependence of <I>requires-expression</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16540,7 +16541,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2789">
     <td><a href="https://cplusplus.github.io/CWG/issues/2789.html">2789</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Overload resolution with implicit and explicit object member functions</td>
     <td class="full" align="center">Clang 18</td>
   </tr>
@@ -16552,19 +16553,19 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2791">
     <td><a href="https://cplusplus.github.io/CWG/issues/2791.html">2791</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Unclear phrasing about "returning to the caller"</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2792">
     <td><a href="https://cplusplus.github.io/CWG/issues/2792.html">2792</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Clean up specification of <TT>noexcept</TT> operator</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2793">
     <td><a href="https://cplusplus.github.io/CWG/issues/2793.html">2793</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Block-scope declaration conflicting with parameter name</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16576,25 +16577,25 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2795">
     <td><a href="https://cplusplus.github.io/CWG/issues/2795.html">2795</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Overlapping empty subobjects with different cv-qualification</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2796">
     <td><a href="https://cplusplus.github.io/CWG/issues/2796.html">2796</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Function pointer conversions for relational operators</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2797">
     <td><a href="https://cplusplus.github.io/CWG/issues/2797.html">2797</a></td>
-    <td>open</td>
+    <td>review</td>
     <td>Meaning of "corresponds" for rewritten operator candidates</td>
     <td align="center">Not resolved</td>
   </tr>
   <tr id="2798">
     <td><a href="https://cplusplus.github.io/CWG/issues/2798.html">2798</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Manifestly constant evaluation of the <TT>static_assert</TT> message</td>
     <td class="full" align="center">Clang 17</td>
   </tr>
@@ -16612,7 +16613,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2801">
     <td><a href="https://cplusplus.github.io/CWG/issues/2801.html">2801</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Reference binding with reference-related types</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16624,7 +16625,7 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2803">
     <td><a href="https://cplusplus.github.io/CWG/issues/2803.html">2803</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Overload resolution for reference binding of similar types</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16642,13 +16643,13 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2806">
     <td><a href="https://cplusplus.github.io/CWG/issues/2806.html">2806</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Make a <I>type-requirement</I> a type-only context</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2807">
     <td><a href="https://cplusplus.github.io/CWG/issues/2807.html">2807</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Destructors declared <TT>consteval</TT></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16660,19 +16661,19 @@ and <I>POD class</I></td>
   </tr>
   <tr id="2809">
     <td><a href="https://cplusplus.github.io/CWG/issues/2809.html">2809</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>An implicit definition does not redeclare a function</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2810">
     <td><a href="https://cplusplus.github.io/CWG/issues/2810.html">2810</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Requiring the absence of diagnostics for templates</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2811">
     <td><a href="https://cplusplus.github.io/CWG/issues/2811.html">2811</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Clarify "use" of main</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
@@ -16682,11 +16683,11 @@ and <I>POD class</I></td>
     <td>Allocation with explicit alignment</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2813">
+  <tr id="2813">
     <td><a href="https://cplusplus.github.io/CWG/issues/2813.html">2813</a></td>
-    <td>review</td>
+    <td>DR</td>
     <td>Class member access with prvalues</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2814">
     <td><a href="https://cplusplus.github.io/CWG/issues/2814.html">2814</a></td>
@@ -16714,57 +16715,57 @@ and <I>POD class</I></td>
   </tr>
   <tr class="open" id="2818">
     <td><a href="https://cplusplus.github.io/CWG/issues/2818.html">2818</a></td>
-    <td>review</td>
+    <td>tentatively ready</td>
     <td>Use of predefined reserved identifiers</td>
     <td align="center">Not resolved</td>
   </tr>
   <tr class="open" id="2819">
     <td><a href="https://cplusplus.github.io/CWG/issues/2819.html">2819</a></td>
-    <td>review</td>
+    <td>tentatively ready</td>
     <td>Cast from null pointer value in a constant expression</td>
-    <td align="center">Not resolved</td>
+    <td title="Clang 19 implements 2023-12-01 resolution" align="center">Not Resolved*</td>
   </tr>
-  <tr class="open" id="2820">
+  <tr id="2820">
     <td><a href="https://cplusplus.github.io/CWG/issues/2820.html">2820</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Value-initialization and default constructors</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2821">
     <td><a href="https://cplusplus.github.io/CWG/issues/2821.html">2821</a></td>
-    <td>open</td>
+    <td>review</td>
     <td>Lifetime, zero-initialization, and dynamic initialization</td>
     <td align="center">Not resolved</td>
   </tr>
   <tr id="2822">
     <td><a href="https://cplusplus.github.io/CWG/issues/2822.html">2822</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Side-effect-free pointer zap</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2823">
     <td><a href="https://cplusplus.github.io/CWG/issues/2823.html">2823</a></td>
-    <td>DR</td>
+    <td>DRWP</td>
     <td>Implicit undefined behavior when dereferencing pointers</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2824">
     <td><a href="https://cplusplus.github.io/CWG/issues/2824.html">2824</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Copy-initialization of arrays</td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2825">
     <td><a href="https://cplusplus.github.io/CWG/issues/2825.html">2825</a></td>
-    <td>tentatively ready</td>
+    <td>DR</td>
     <td>Range-based for statement using a <I>braced-init-list</I></td>
     <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr id="2826">
+  <tr class="open" id="2826">
     <td><a href="https://cplusplus.github.io/CWG/issues/2826.html">2826</a></td>
-    <td>tentatively ready</td>
+    <td>drafting</td>
     <td>Missing definition of "temporary expression"</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td align="center">Not resolved</td>
   </tr>
   <tr class="open" id="2827">
     <td><a href="https://cplusplus.github.io/CWG/issues/2827.html">2827</a></td>
@@ -16772,11 +16773,11 @@ and <I>POD class</I></td>
     <td>Representation of unsigned integral types</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2828">
+  <tr id="2828">
     <td><a href="https://cplusplus.github.io/CWG/issues/2828.html">2828</a></td>
-    <td>review</td>
+    <td>DR</td>
     <td>Ambiguous interpretation of C-style cast</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2829">
     <td><a href="https://cplusplus.github.io/CWG/issues/2829.html">2829</a></td>
@@ -16784,17 +16785,17 @@ and <I>POD class</I></td>
     <td>Redundant case in restricting user-defined conversion sequences</td>
     <td align="center">Not resolved</td>
   </tr>
-  <tr class="open" id="2830">
+  <tr id="2830">
     <td><a href="https://cplusplus.github.io/CWG/issues/2830.html">2830</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Top-level cv-qualification should be ignored for list-initialization</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
-  <tr class="open" id="2831">
+  <tr id="2831">
     <td><a href="https://cplusplus.github.io/CWG/issues/2831.html">2831</a></td>
-    <td>open</td>
+    <td>DR</td>
     <td>Non-templated function definitions and <I>requires-clause</I>s</td>
-    <td align="center">Not resolved</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr class="open" id="2832">
     <td><a href="https://cplusplus.github.io/CWG/issues/2832.html">2832</a></td>
@@ -16810,7 +16811,7 @@ and <I>POD class</I></td>
   </tr>
   <tr class="open" id="2834">
     <td><a href="https://cplusplus.github.io/CWG/issues/2834.html">2834</a></td>
-    <td>open</td>
+    <td>review</td>
     <td>Partial ordering and explicit object parameters</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -16822,7 +16823,7 @@ and <I>POD class</I></td>
   </tr>
   <tr class="open" id="2836">
     <td><a href="https://cplusplus.github.io/CWG/issues/2836.html">2836</a></td>
-    <td>open</td>
+    <td>review</td>
     <td>Conversion rank of <TT>long double</TT> and extended floating-point types</td>
     <td align="center">Not resolved</td>
   </tr>
@@ -16855,6 +16856,276 @@ and <I>POD class</I></td>
     <td>open</td>
     <td>When do const objects start being const?</td>
     <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2842">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2842.html">2842</a></td>
+    <td>open</td>
+    <td>Preferring an <TT>initializer_list</TT> over a single value</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2843">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2843.html">2843</a></td>
+    <td>review</td>
+    <td>Undated reference to Unicode makes C++ a moving target</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2844">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2844.html">2844</a></td>
+    <td>open</td>
+    <td>Enumerating a finite set of built-in candidates</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr id="2845">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2845.html">2845</a></td>
+    <td>DR</td>
+    <td>Make the closure type of a captureless lambda a structural type</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr id="2846">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2846.html">2846</a></td>
+    <td>DR</td>
+    <td>Out-of-class definitions of explicit object member functions</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr class="open" id="2847">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2847.html">2847</a></td>
+    <td>review</td>
+    <td>Constrained explicit specializations of function templates at class scope</td>
+    <td title="Clang 19 implements 2024-03-01 resolution" align="center">Not Resolved*</td>
+  </tr>
+  <tr id="2848">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2848.html">2848</a></td>
+    <td>DR</td>
+    <td>Omitting an empty template argument list for explicit instantiation</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr id="2849">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2849.html">2849</a></td>
+    <td>DR</td>
+    <td>Parameter objects are not temporary objects</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr id="2850">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2850.html">2850</a></td>
+    <td>DR</td>
+    <td>Unclear storage duration for function parameter objects</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr id="2851">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2851.html">2851</a></td>
+    <td>DR</td>
+    <td>Allow floating-point conversions in converted constant expressions</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr class="open" id="2852">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2852.html">2852</a></td>
+    <td>open</td>
+    <td>Complete-class contexts and class-scope lambdas</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr id="2853">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2853.html">2853</a></td>
+    <td>DR</td>
+    <td>Pointer arithmetic with pointer to hypothetical element</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr id="2854">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2854.html">2854</a></td>
+    <td>DR</td>
+    <td>Storage duration of exception objects</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr id="2855">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2855.html">2855</a></td>
+    <td>DR</td>
+    <td>Undefined behavior in postfix increment</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr id="2856">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2856.html">2856</a></td>
+    <td>DR</td>
+    <td>Copy-list-initialization with explicit default constructors</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr id="2857">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2857.html">2857</a></td>
+    <td>DR</td>
+    <td>Argument-dependent lookup with incomplete class types</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr class="open" id="2858">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2858.html">2858</a></td>
+    <td>tentatively ready</td>
+    <td>Declarative <I>nested-name-specifier</I>s and <I>pack-index-specifier</I>s</td>
+    <td title="Clang 19 implements 2024-04-05 resolution" align="center">Not Resolved*</td>
+  </tr>
+  <tr class="open" id="2859">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2859.html">2859</a></td>
+    <td>tentatively ready</td>
+    <td>Value-initialization with multiple default constructors</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr id="2860">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2860.html">2860</a></td>
+    <td>dup</td>
+    <td>Remove and fix the term "vacuous initialization"</td>
+    <td class="unknown" align="center">Unknown</td>
+  </tr>
+  <tr class="open" id="2861">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2861.html">2861</a></td>
+    <td>tentatively ready</td>
+    <td><TT>dynamic_cast</TT> on bad pointer value</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2862">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2862.html">2862</a></td>
+    <td>tentatively ready</td>
+    <td>Unclear boundaries of template declarations</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2863">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2863.html">2863</a></td>
+    <td>tentatively ready</td>
+    <td>Unclear synchronization requirements for object lifetime rules</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2864">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2864.html">2864</a></td>
+    <td>tentatively ready</td>
+    <td>Narrowing floating-point conversions</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2865">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2865.html">2865</a></td>
+    <td>open</td>
+    <td>Regression on result of conditional operator</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2866">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2866.html">2866</a></td>
+    <td>open</td>
+    <td>Observing the effects of <TT>[[no_unique_address]]</TT></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2867">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2867.html">2867</a></td>
+    <td>open</td>
+    <td>Order of initialization for structured bindings</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2868">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2868.html">2868</a></td>
+    <td>open</td>
+    <td>Self-references in trivially copyable objects as function return values</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2869">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2869.html">2869</a></td>
+    <td>open</td>
+    <td><TT>this</TT> in local classes</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2870">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2870.html">2870</a></td>
+    <td>open</td>
+    <td>Combining absent <I>encoding-prefix</I>es</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2871">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2871.html">2871</a></td>
+    <td>tentatively ready</td>
+    <td>User-declared constructor templates inhibiting default constructors</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2872">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2872.html">2872</a></td>
+    <td>open</td>
+    <td>Linkage and unclear "can be referred to"</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2873">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2873.html">2873</a></td>
+    <td>open</td>
+    <td>Taking the address of a function involving template argument deduction</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2874">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2874.html">2874</a></td>
+    <td>open</td>
+    <td>Qualified declarations of partial specializations</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2875">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2875.html">2875</a></td>
+    <td>open</td>
+    <td>Missing support for round-tripping nullptr through indirection/address operators</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2876">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2876.html">2876</a></td>
+    <td>open</td>
+    <td>Disambiguation of <TT>T x = delete("text")</TT></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2877">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2877.html">2877</a></td>
+    <td>open</td>
+    <td>Type-only lookup for <I>using-enum-declarator</I></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2878">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2878.html">2878</a></td>
+    <td>open</td>
+    <td>C-style casts to reference types</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2879">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2879.html">2879</a></td>
+    <td>open</td>
+    <td>Undesired outcomes with <TT>const_cast</TT></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2880">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2880.html">2880</a></td>
+    <td>open</td>
+    <td>Accessibility check for destructor of incomplete class type</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2881">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2881.html">2881</a></td>
+    <td>open</td>
+    <td>Type restrictions for the explicit object parameter of a lambda</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2882">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2882.html">2882</a></td>
+    <td>open</td>
+    <td>Unclear treatment of conversion to <TT>void</TT></td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2883">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2883.html">2883</a></td>
+    <td>open</td>
+    <td>Definition of "odr-usable" ignores lambda scopes</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2884">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2884.html">2884</a></td>
+    <td>open</td>
+    <td>Qualified declarations of partial specializations</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2885">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2885.html">2885</a></td>
+    <td>open</td>
+    <td>Non-eligible trivial default constructors</td>
+    <td align="center">Not resolved</td>
+  </tr>
+  <tr class="open" id="2886">
+    <td><a href="https://cplusplus.github.io/CWG/issues/2886.html">2886</a></td>
+    <td>open</td>
+    <td>Temporaries and trivial potentially-throwing special member functions</td>
+    <td align="center">Not resolved</td>
   </tr></table>
 
 </div>
diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html
index c233171e63c8..260f74ded93c 100755
--- a/clang/www/cxx_status.html
+++ b/clang/www/cxx_status.html
@@ -167,7 +167,7 @@ C++23, informally referred to as C++26.</p>
  <tr>
   <td>Disallow Binding a Returned Glvalue to a Temporary</td>
   <td><a href="https://wg21.link/P2748R5">P2748R5</a></td>
-  <td class="none" align="center">No</td>
+  <td class="unreleased" align="center">Clang 19</td>
  </tr>
  <tr>
   <td>Clarifying rules for brace elision in aggregate initialization</td>
@@ -177,7 +177,7 @@ C++23, informally referred to as C++26.</p>
  <tr>
   <td>Attributes for Structured Bindings</td>
   <td><a href="https://wg21.link/P0609R3">P0609R3</a></td>
-  <td class="none" align="center">No</td>
+  <td class="unreleased" align="center">Clang 19</td>
  </tr>
  <tr>
   <td>Module Declarations Shouldn’t be Macros</td>
@@ -462,7 +462,14 @@ C++23, informally referred to as C++26.</p>
     <tr>
       <td>Lifetime extension in range-based for loops</td>
       <td><a href="https://wg21.link/P2718R0">P2718R0</a></td>
-      <td class="unreleased" align="center">Clang 19</td>
+      <td class="partial" align="center">
+        <details>
+          <summary>Clang 19 (Partial)</summary>
+            The lifetime extension of temporaries bound to member references
+            by default member initializers in aggregate initialization was 
+            not supported now.
+        </details>
+      </td>
     </tr>
     <!--Issaquah 2023 papers-->
     <tr>
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status
index 7c0cf77a1524..47c8b3bae4a1 100755
--- a/clang/www/make_cxx_dr_status
+++ b/clang/www/make_cxx_dr_status
@@ -5,7 +5,7 @@ latest_release = 18
 
 clang_www_dir = os.path.dirname(__file__)
 default_issue_list_path = os.path.join(clang_www_dir, 'cwg_index.html')
-issue_list_url = "https://www.open-std.org/jtc1/sc22/wg21/docs/cwg_index.html"
+issue_list_url = "https://raw.githubusercontent.com/cplusplus/CWG/gh-pages/issues/cwg_index.html"
 output = os.path.join(clang_www_dir, 'cxx_dr_status.html')
 dr_test_dir = os.path.join(clang_www_dir, '../test/CXX/drs')
 
@@ -138,10 +138,10 @@ def availability(issue):
 
   unresolved_status = ''
   proposed_resolution = ''
-  unresolved_status_match = re.search(r' (open|drafting|review)', status)
+  unresolved_status_match = re.search(r' (open|drafting|review|tentatively ready)', status)
   if unresolved_status_match:
     unresolved_status = unresolved_status_match.group(1)
-    proposed_resolution_match = re.search(r' (open|drafting|review) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
+    proposed_resolution_match = re.search(r' (open|drafting|review|tentatively ready) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
     if proposed_resolution_match is None:
       raise AvailabilityError('Issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status))
     proposed_resolution = proposed_resolution_match.group(2)
@@ -236,7 +236,7 @@ for dr in drs:
     avail = 'Extension'
     avail_style = ''
 
-  elif dr.status in ('open', 'drafting', 'review'):
+  elif dr.status in ('open', 'drafting', 'review', 'tentatively ready'):
     row_style = ' class="open"'
     try:
       avail, avail_style, unresolved_status = availability(dr.issue)
diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake
index 6e0d9dbff65a..75b34c8e27e0 100644
--- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake
+++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake
@@ -2,12 +2,6 @@ include(ExternalProject)
 include(CompilerRTUtils)
 include(HandleCompilerRT)
 
-# CMP0114: ExternalProject step targets fully adopt their steps.
-# New in CMake 3.19: https://cmake.org/cmake/help/latest/policy/CMP0114.html
-if(POLICY CMP0114)
-  cmake_policy(SET CMP0114 OLD)
-endif()
-
 function(set_target_output_directories target output_dir)
   # For RUNTIME_OUTPUT_DIRECTORY variable, Multi-configuration generators
   # append a per-configuration subdirectory to the specified directory.
diff --git a/compiler-rt/lib/dfsan/CMakeLists.txt b/compiler-rt/lib/dfsan/CMakeLists.txt
index f6479e9da87b..37c386d2de1f 100644
--- a/compiler-rt/lib/dfsan/CMakeLists.txt
+++ b/compiler-rt/lib/dfsan/CMakeLists.txt
@@ -63,7 +63,8 @@ add_custom_command(OUTPUT ${dfsan_abilist_filename}
                    COMMAND
                     ${CMAKE_COMMAND} -E make_directory ${dfsan_abilist_dir}
                    COMMAND
-                     cat ${CMAKE_CURRENT_SOURCE_DIR}/done_abilist.txt
+                    ${CMAKE_COMMAND} -E cat
+                         ${CMAKE_CURRENT_SOURCE_DIR}/done_abilist.txt
                          ${CMAKE_CURRENT_SOURCE_DIR}/libc_ubuntu1404_abilist.txt
                          > ${dfsan_abilist_filename}
                    DEPENDS done_abilist.txt libc_ubuntu1404_abilist.txt)
diff --git a/compiler-rt/lib/fuzzer/build.sh b/compiler-rt/lib/fuzzer/build.sh
index f7f329c0d19c..f58fd9557ce3 100755
--- a/compiler-rt/lib/fuzzer/build.sh
+++ b/compiler-rt/lib/fuzzer/build.sh
@@ -2,7 +2,7 @@
 LIBFUZZER_SRC_DIR=$(dirname $0)
 CXX="${CXX:-clang}"
 for f in $LIBFUZZER_SRC_DIR/*.cpp; do
-  $CXX -g -O2 -fno-omit-frame-pointer -std=c++14 $f -c &
+  $CXX -g -O2 -fno-omit-frame-pointer -std=c++17 $f -c &
 done
 wait
 rm -f libFuzzer.a
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_markup.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_markup.cpp
index b2a1069a9a61..31d91ef3c739 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_markup.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_markup.cpp
@@ -28,7 +28,7 @@ void MarkupStackTracePrinter::RenderData(InternalScopedString *buffer,
                                          const char *format, const DataInfo *DI,
                                          const char *strip_path_prefix) {
   RenderContext(buffer);
-  buffer->AppendF(kFormatData, DI->start);
+  buffer->AppendF(kFormatData, reinterpret_cast<void *>(DI->start));
 }
 
 bool MarkupStackTracePrinter::RenderNeedsSymbolization(const char *format) {
@@ -43,12 +43,13 @@ void MarkupStackTracePrinter::RenderFrame(InternalScopedString *buffer,
                                           const char *strip_path_prefix) {
   CHECK(!RenderNeedsSymbolization(format));
   RenderContext(buffer);
-  buffer->AppendF(kFormatFrame, frame_no, address);
+  buffer->AppendF(kFormatFrame, frame_no, reinterpret_cast<void *>(address));
 }
 
 bool MarkupSymbolizerTool::SymbolizePC(uptr addr, SymbolizedStack *stack) {
   char buffer[kFormatFunctionMax];
-  internal_snprintf(buffer, sizeof(buffer), kFormatFunction, addr);
+  internal_snprintf(buffer, sizeof(buffer), kFormatFunction,
+                    reinterpret_cast<void *>(addr));
   stack->info.function = internal_strdup(buffer);
   return true;
 }
@@ -118,7 +119,8 @@ static void RenderMmaps(InternalScopedString *buffer,
     // module.base_address == dlpi_addr
     // range.beg == dlpi_addr + p_vaddr
     // relative address == p_vaddr == range.beg - module.base_address
-    buffer->AppendF(kFormatMmap, range.beg, range.end - range.beg, moduleId,
+    buffer->AppendF(kFormatMmap, reinterpret_cast<void *>(range.beg),
+                    range.end - range.beg, static_cast<int>(moduleId),
                     accessBuffer.data(), range.beg - module.base_address());
 
     buffer->Append("\n");
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_markup_constants.h b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_markup_constants.h
index 83643504e128..a43661eaecf2 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_markup_constants.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_markup_constants.h
@@ -33,13 +33,13 @@ constexpr uptr kFormatFunctionMax = 64;  // More than big enough for 64-bit hex.
 constexpr const char *kFormatData = "{{{data:%p}}}";
 
 // One frame in a backtrace (printed on a line by itself).
-constexpr const char *kFormatFrame = "{{{bt:%u:%p}}}";
+constexpr const char *kFormatFrame = "{{{bt:%d:%p}}}";
 
 // Module contextual element.
-constexpr const char *kFormatModule = "{{{module:%d:%s:elf:%s}}}";
+constexpr const char *kFormatModule = "{{{module:%zu:%s:elf:%s}}}";
 
 // mmap for a module segment.
-constexpr const char *kFormatMmap = "{{{mmap:%p:0x%x:load:%d:%s:0x%x}}}";
+constexpr const char *kFormatMmap = "{{{mmap:%p:0x%zx:load:%d:%s:0x%zx}}}";
 
 // Dump trigger element.
 #define FORMAT_DUMPFILE "{{{dumpfile:%s:%s}}}"
diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.def b/compiler-rt/lib/scudo/standalone/allocator_config.def
index 9691a007eed5..dcd130ac449a 100644
--- a/compiler-rt/lib/scudo/standalone/allocator_config.def
+++ b/compiler-rt/lib/scudo/standalone/allocator_config.def
@@ -89,6 +89,7 @@ PRIMARY_REQUIRED(const s32, MaxReleaseToOsIntervalMs)
 // Indicates support for offsetting the start of a region by a random number of
 // pages. This is only used if `EnableContiguousRegions` is enabled.
 PRIMARY_OPTIONAL(const bool, EnableRandomOffset, false)
+PRIMARY_OPTIONAL(const s32, DefaultReleaseToOsIntervalMs, INT32_MIN)
 
 // When `EnableContiguousRegions` is true, all regions will be be arranged in
 // adjacency. This will reduce the fragmentation caused by region allocations
@@ -118,6 +119,7 @@ SECONDARY_CACHE_OPTIONAL(const u32, DefaultMaxEntriesCount, 0)
 SECONDARY_CACHE_OPTIONAL(const uptr, DefaultMaxEntrySize, 0)
 SECONDARY_CACHE_OPTIONAL(const s32, MinReleaseToOsIntervalMs, INT32_MIN)
 SECONDARY_CACHE_OPTIONAL(const s32, MaxReleaseToOsIntervalMs, INT32_MAX)
+SECONDARY_CACHE_OPTIONAL(const s32, DefaultReleaseToOsIntervalMs, INT32_MIN)
 
 #undef SECONDARY_CACHE_OPTIONAL
 #undef SECONDARY_REQUIRED_TEMPLATE_TYPE
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index e7bc90cd0960..927513dea92d 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -173,6 +173,9 @@ public:
         static_cast<u32>(getFlags()->quarantine_max_chunk_size);
 
     Stats.init();
+    // TODO(chiahungduan): Given that we support setting the default value in
+    // the PrimaryConfig and CacheConfig, consider to deprecate the use of
+    // `release_to_os_interval_ms` flag.
     const s32 ReleaseToOsIntervalMs = getFlags()->release_to_os_interval_ms;
     Primary.init(ReleaseToOsIntervalMs);
     Secondary.init(&Stats, ReleaseToOsIntervalMs);
diff --git a/compiler-rt/lib/scudo/standalone/flags.inc b/compiler-rt/lib/scudo/standalone/flags.inc
index f5a2bab5057a..ff0c28e1db7c 100644
--- a/compiler-rt/lib/scudo/standalone/flags.inc
+++ b/compiler-rt/lib/scudo/standalone/flags.inc
@@ -42,7 +42,7 @@ SCUDO_FLAG(bool, may_return_null, true,
            "returning NULL in otherwise non-fatal error scenarios, eg: OOM, "
            "invalid allocation alignments, etc.")
 
-SCUDO_FLAG(int, release_to_os_interval_ms, SCUDO_ANDROID ? INT32_MIN : 5000,
+SCUDO_FLAG(int, release_to_os_interval_ms, 5000,
            "Interval (in milliseconds) at which to attempt release of unused "
            "memory to the OS. Negative values disable the feature.")
 
diff --git a/compiler-rt/lib/scudo/standalone/mem_map_fuchsia.cpp b/compiler-rt/lib/scudo/standalone/mem_map_fuchsia.cpp
index 5f3c8b81c07b..fc793abf44cd 100644
--- a/compiler-rt/lib/scudo/standalone/mem_map_fuchsia.cpp
+++ b/compiler-rt/lib/scudo/standalone/mem_map_fuchsia.cpp
@@ -108,9 +108,9 @@ bool MemMapFuchsia::mapImpl(UNUSED uptr Addr, uptr Size, const char *Name,
   // Create the VMO.
   zx_status_t Status = _zx_vmo_create(Size, 0, &Vmo);
   if (UNLIKELY(Status != ZX_OK)) {
-    if (!IsNoMemError(Status) || !AllowNoMem)
-      dieOnError(Status, "zx_vmo_create", Size);
-    return false;
+    if (AllowNoMem && IsNoMemError(Status))
+      return false;
+    dieOnError(Status, "zx_vmo_create", Size);
   }
 
   if (Name != nullptr)
@@ -123,15 +123,15 @@ bool MemMapFuchsia::mapImpl(UNUSED uptr Addr, uptr Size, const char *Name,
   Status =
       _zx_vmar_map(_zx_vmar_root_self(), MapFlags, 0, Vmo, 0, Size, &MapAddr);
   if (UNLIKELY(Status != ZX_OK)) {
-    if (!IsNoMemError(Status) || !AllowNoMem)
-      dieOnError(Status, "zx_vmar_map", Size);
-
-    Status = _zx_handle_close(Vmo);
-    CHECK_EQ(Status, ZX_OK);
+    if (AllowNoMem && IsNoMemError(Status)) {
+      Status = _zx_handle_close(Vmo);
+      CHECK_EQ(Status, ZX_OK);
 
-    MapAddr = 0;
-    Vmo = ZX_HANDLE_INVALID;
-    return false;
+      MapAddr = 0;
+      Vmo = ZX_HANDLE_INVALID;
+      return false;
+    }
+    dieOnError(Status, "zx_vmar_map", Size);
   }
 
   if (PreCommit) {
@@ -194,9 +194,9 @@ bool MemMapFuchsia::remapImpl(uptr Addr, uptr Size, const char *Name,
       _zx_vmar_map(_zx_vmar_root_self(), MapFlags, Addr - getRootVmarBase(),
                    Vmo, Addr - MapAddr, Size, &MappedAddr);
   if (UNLIKELY(Status != ZX_OK)) {
-    if (!IsNoMemError(Status) || !AllowNoMem)
-      dieOnError(Status, "zx_vmar_map", Size);
-    return false;
+    if (AllowNoMem && IsNoMemError(Status))
+      return false;
+    dieOnError(Status, "zx_vmar_map", Size);
   }
   DCHECK_EQ(Addr, MappedAddr);
 
@@ -234,9 +234,9 @@ bool ReservedMemoryFuchsia::createImpl(UNUSED uptr Addr, uptr Size,
   zx_status_t Status = _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_ALLOW_FAULTS, 0,
                                     getPlaceholderVmo(), 0, Size, &Base);
   if (UNLIKELY(Status != ZX_OK)) {
-    if (!IsNoMemError(Status) || !AllowNoMem)
-      dieOnError(Status, "zx_vmar_map", Size);
-    return false;
+    if (AllowNoMem && IsNoMemError(Status))
+      return false;
+    dieOnError(Status, "zx_vmar_map", Size);
   }
 
   Capacity = Size;
diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
index 1d8a77b73e5c..ebfb8dfe0a31 100644
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -88,6 +88,10 @@ public:
       Sci->MinRegionIndex = NumRegions;
       Sci->ReleaseInfo.LastReleaseAtNs = Time;
     }
+
+    // The default value in the primary config has the higher priority.
+    if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN)
+      ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs();
     setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
   }
 
diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
index 61d57976ae43..bed2ccb8b992 100644
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -147,6 +147,9 @@ public:
     for (uptr I = 0; I < NumClasses; I++)
       getRegionInfo(I)->FLLockCV.bindTestOnly(getRegionInfo(I)->FLLock);
 
+    // The default value in the primary config has the higher priority.
+    if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN)
+      ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs();
     setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
   }
 
@@ -884,9 +887,10 @@ private:
         ScopedLock ML(Region->MMLock);
 
         const bool RegionIsExhausted = Region->Exhausted;
-        if (!RegionIsExhausted)
+        if (!RegionIsExhausted) {
           PopCount = populateFreeListAndPopBlocks(C, ClassId, Region, ToArray,
                                                   MaxBlockCount);
+        }
         ReportRegionExhausted = !RegionIsExhausted && Region->Exhausted;
 
         {
@@ -1019,7 +1023,6 @@ private:
                                           MAP_ALLOWNOMEM))) {
         Printf("Can't reserve pages for size class %zu.\n",
                getSizeByClassId(ClassId));
-        Region->Exhausted = true;
         return 0U;
       }
       initRegion(Region, ClassId,
diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h
index 674af5071775..d8c9f5bcfcaf 100644
--- a/compiler-rt/lib/scudo/standalone/secondary.h
+++ b/compiler-rt/lib/scudo/standalone/secondary.h
@@ -209,6 +209,9 @@ public:
               static_cast<sptr>(Config::getDefaultMaxEntriesCount()));
     setOption(Option::MaxCacheEntrySize,
               static_cast<sptr>(Config::getDefaultMaxEntrySize()));
+    // The default value in the cache config has the higher priority.
+    if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN)
+      ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs();
     setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
   }
 
diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c.inc b/compiler-rt/lib/scudo/standalone/wrappers_c.inc
index 21d5b7add512..59f3fb0962f8 100644
--- a/compiler-rt/lib/scudo/standalone/wrappers_c.inc
+++ b/compiler-rt/lib/scudo/standalone/wrappers_c.inc
@@ -252,13 +252,11 @@ INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, int value) {
       // introduced by interval transition.
       SCUDO_ALLOCATOR.releaseToOS(scudo::ReleaseToOS::Force);
 
-      if (value == 0) {
-        // Will set the release values to their minimum values.
-        value = INT32_MIN;
-      } else {
-        // Will set the release values to their maximum values.
+      // The values allowed on Android are {-1, 0, 1}. "1" means the longest
+      // interval.
+      CHECK(value >= -1 && value <= 1);
+      if (value == 1)
         value = INT32_MAX;
-      }
     }
 
     SCUDO_ALLOCATOR.setOption(scudo::Option::ReleaseInterval,
diff --git a/compiler-rt/test/CMakeLists.txt b/compiler-rt/test/CMakeLists.txt
index edc007aaf477..8805cc8f798f 100644
--- a/compiler-rt/test/CMakeLists.txt
+++ b/compiler-rt/test/CMakeLists.txt
@@ -92,6 +92,9 @@ if(COMPILER_RT_CAN_EXECUTE_TESTS)
   if(COMPILER_RT_BUILD_PROFILE AND COMPILER_RT_HAS_PROFILE)
     compiler_rt_test_runtime(profile)
   endif()
+  if(COMPILER_RT_BUILD_CTX_PROFILE)
+    compiler_rt_test_runtime(ctx_profile)
+  endif()
   if(COMPILER_RT_BUILD_MEMPROF)
     compiler_rt_test_runtime(memprof)
   endif()
diff --git a/compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp b/compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp
index 90c16776a63b..8d400800fe93 100644
--- a/compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp
+++ b/compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp
@@ -5,7 +5,7 @@
 
 // RUN: %clangxx_asan -DPART=0 -c %s -o %t-1.o -flto -mllvm -asan-use-private-alias
 // RUN: %clangxx_asan -DPART=1 -c %s -o %t-2.o -flto -mllvm -asan-use-private-alias
-// RUN: %clangxx_asan_lto %t-1.o %t-2.o -o %t -flto -mlinker-version=133
+// RUN: %clangxx_asan_lto %t-1.o %t-2.o -o %t -flto
 // RUN: %run %t 2>&1 | FileCheck %s
 
 #include <stdio.h>
diff --git a/compiler-rt/test/ctx_profile/CMakeLists.txt b/compiler-rt/test/ctx_profile/CMakeLists.txt
new file mode 100644
index 000000000000..23c6fb16ed1f
--- /dev/null
+++ b/compiler-rt/test/ctx_profile/CMakeLists.txt
@@ -0,0 +1,21 @@
+set(CTX_PROFILE_LIT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+
+set(CTX_PROFILE_TESTSUITES)
+
+# Add unit tests.
+if(COMPILER_RT_INCLUDE_TESTS)
+  foreach(arch ${CTX_PROFILE_SUPPORTED_ARCH})
+    string(TOUPPER ${arch} ARCH_UPPER_CASE)
+    set(CONFIG_NAME ${ARCH_UPPER_CASE}${OS_NAME}Config)
+    configure_lit_site_cfg(
+      ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
+      ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME}/lit.site.cfg.py)
+    list(APPEND CTX_PROFILE_TEST_DEPS CtxProfileUnitTests)
+    list(APPEND CTX_PROFILE_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME})
+  endforeach()
+endif()
+
+add_lit_testsuite(check-ctx_profile "Running the Contextual Profiler tests"
+  ${CTX_PROFILE_TESTSUITES}
+  DEPENDS ${CTX_PROFILE_TEST_DEPS})
+set_target_properties(check-ctx_profile PROPERTIES FOLDER "Compiler-RT Misc")
diff --git a/compiler-rt/test/ctx_profile/Unit/lit.site.cfg.py.in b/compiler-rt/test/ctx_profile/Unit/lit.site.cfg.py.in
new file mode 100644
index 000000000000..3fa9a7a2780e
--- /dev/null
+++ b/compiler-rt/test/ctx_profile/Unit/lit.site.cfg.py.in
@@ -0,0 +1,28 @@
+@LIT_SITE_CFG_IN_HEADER@
+
+import os
+import platform
+import re
+import shlex
+
+# Load common config for all compiler-rt unit tests.
+lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/unittests/lit.common.unit.configured")
+
+# Setup config name.
+config.name = 'CtxProfile-Unit'
+config.target_arch = "@arch@"
+assert config.target_arch == 'x86_64'
+
+config.test_exec_root = os.path.join("@COMPILER_RT_BINARY_DIR@",
+                                     "lib", "ctx_profile", "tests")
+
+config.test_source_root = config.test_exec_root
+
+# When LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=on, the initial value of
+# config.compiler_rt_libdir (COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR) has the
+# host triple as the trailing path component. The value is incorrect for i386
+# tests on x86_64 hosts and vice versa. But, since only x86_64 is enabled as
+# target, and we don't support different environments for building and,
+# respectively, running tests, we we only need to fix up the x86_64 case.
+if config.enable_per_target_runtime_dir and config.target_arch != config.host_arch:
+    config.compiler_rt_libdir = re.sub(r'/i386(?=-[^/]+$)', '/x86_64', config.compiler_rt_libdir)
diff --git a/compiler-rt/test/memprof/CMakeLists.txt b/compiler-rt/test/memprof/CMakeLists.txt
index 3f0ba3812485..fa6a4cd5f0b7 100644
--- a/compiler-rt/test/memprof/CMakeLists.txt
+++ b/compiler-rt/test/memprof/CMakeLists.txt
@@ -43,6 +43,19 @@ foreach(arch ${MEMPROF_TEST_ARCH})
     ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME})
 endforeach()
 
+# Add unit tests.
+if(COMPILER_RT_INCLUDE_TESTS)
+  foreach(arch ${MEMPROF_TEST_ARCH})
+    string(TOUPPER ${arch} ARCH_UPPER_CASE)
+    set(CONFIG_NAME ${ARCH_UPPER_CASE}${OS_NAME}Config)
+    configure_lit_site_cfg(
+      ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
+      ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME}/lit.site.cfg.py)
+    list(APPEND MEMPROF_TEST_DEPS MemProfUnitTests)
+    list(APPEND MEMPROF_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/Unit/${CONFIG_NAME})
+  endforeach()
+endif()
+
 add_lit_testsuite(check-memprof "Running the MemProfiler tests"
   ${MEMPROF_TESTSUITES}
   DEPENDS ${MEMPROF_TEST_DEPS})
diff --git a/compiler-rt/test/memprof/Unit/lit.site.cfg.py.in b/compiler-rt/test/memprof/Unit/lit.site.cfg.py.in
new file mode 100644
index 000000000000..1e2442a1487a
--- /dev/null
+++ b/compiler-rt/test/memprof/Unit/lit.site.cfg.py.in
@@ -0,0 +1,31 @@
+@LIT_SITE_CFG_IN_HEADER@
+
+import os
+import platform
+import re
+import shlex
+
+# Load common config for all compiler-rt unit tests.
+lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/unittests/lit.common.unit.configured")
+
+# Setup config name.
+config.name = 'MemProfiler-Unit'
+config.target_arch = "@arch@"
+assert config.target_arch == 'x86_64'
+
+config.test_exec_root = os.path.join("@COMPILER_RT_BINARY_DIR@",
+                                     "lib", "memprof", "tests")
+
+config.test_source_root = config.test_exec_root
+
+# When LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=on, the initial value of
+# config.compiler_rt_libdir (COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR) has the
+# host triple as the trailing path component. The value is incorrect for i386
+# tests on x86_64 hosts and vice versa. But, since only x86_64 is enabled as
+# target, and we don't support different environments for building and,
+# respectively, running tests, we we only need to fix up the x86_64 case.
+if config.enable_per_target_runtime_dir and config.target_arch != config.host_arch:
+    config.compiler_rt_libdir = re.sub(r'/i386(?=-[^/]+$)', '/x86_64', config.compiler_rt_libdir)
+
+if not config.parallelism_group:
+  config.parallelism_group = 'shadow-memory'
diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cpp
index ed817961c688..e46c2edac4ce 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cpp
@@ -2,6 +2,8 @@
 
 // REQUIRES: has_sancovcc
 // UNSUPPORTED: ubsan,i386-darwin,target={{(powerpc64|s390x|thumb).*}}
+// This test is failing for lsan on darwin on x86_64h.
+// UNSUPPORTED: x86_64h && lsan && darwin
 // XFAIL: tsan
 // XFAIL: android && asan
 
diff --git a/cross-project-tests/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.cpp b/cross-project-tests/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.cpp
index 677418a53705..026028447e61 100644
--- a/cross-project-tests/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.cpp
+++ b/cross-project-tests/debuginfo-tests/llvm-prettyprinters/gdb/mlir-support.cpp
@@ -38,7 +38,7 @@ mlir::Attribute TypeAttr = mlir::TypeAttr::get(IndexType);
 mlir::Attribute ArrayAttr = mlir::ArrayAttr::get(&Context, {UnitAttr});
 mlir::Attribute StringAttr = mlir::StringAttr::get(&Context, "foo");
 mlir::Attribute ElementsAttr = mlir::DenseElementsAttr::get(
-    VectorType.cast<mlir::ShapedType>(), llvm::ArrayRef<float>{2.0f, 3.0f});
+    mlir::cast<mlir::ShapedType>(VectorType), llvm::ArrayRef<float>{2.0f, 3.0f});
 
 int main() {
   // Reference symbols that might otherwise be stripped.
diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake
index e34d3851187a..0af12c8cfd54 100644
--- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake
+++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake
@@ -2,6 +2,10 @@ option(FLANG_EXPERIMENTAL_CUDA_RUNTIME
   "Compile Fortran runtime as CUDA sources (experimental)" OFF
   )
 
+option(FLANG_CUDA_RUNTIME_PTX_WITHOUT_GLOBAL_VARS
+  "Do not compile global variables' definitions when producing PTX library" OFF
+  )
+
 set(FLANG_LIBCUDACXX_PATH "" CACHE PATH "Path to libcu++ package installation")
 
 set(FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD "off" CACHE STRING
@@ -56,6 +60,11 @@ macro(enable_cuda_compilation name files)
     # Add an OBJECT library consisting of CUDA PTX.
     llvm_add_library(${name}PTX OBJECT PARTIAL_SOURCES_INTENDED ${files})
     set_property(TARGET obj.${name}PTX PROPERTY CUDA_PTX_COMPILATION ON)
+    if (FLANG_CUDA_RUNTIME_PTX_WITHOUT_GLOBAL_VARS)
+      target_compile_definitions(obj.${name}PTX
+        PRIVATE FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
+        )
+    endif()
   endif()
 endmacro()
 
diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md
index ac120b4ff09b..351595ac0afd 100644
--- a/flang/docs/FlangDriver.md
+++ b/flang/docs/FlangDriver.md
@@ -179,46 +179,20 @@ like this:
 
 ```
 $ flang -v -o example example.o
-"/usr/bin/ld" [...] example.o [...] "--whole-archive" "-lFortran_main"
-"--no-whole-archive" "-lFortranRuntime" "-lFortranDecimal" [...]
+"/usr/bin/ld" [...] example.o [...] "-lFortranRuntime" "-lFortranDecimal" [...]
 ```
 
 The automatically added libraries are:
 
-* `Fortran_main`: Provides the main entry point `main` that then invokes
-  `_QQmain` with the Fortran program unit.  This library has a dependency to
-  the `FortranRuntime` library.
 * `FortranRuntime`: Provides most of the Flang runtime library.
 * `FortranDecimal`: Provides operations for decimal numbers.
 
-The default is that, when using Flang as the linker, one of the Fortran
-translation units provides the program unit and therefore it is assumed that
-Fortran is the main code part (calling into C/C++ routines via `BIND (C)`
-interfaces).  When composing the linker commandline, Flang uses
-`--whole-archive` and `--no-whole-archive` (Windows: `/WHOLEARCHIVE:`,
-Darwin & AIX: *not implemented yet*) to make sure that all for `Fortran_main`
-is processed by the linker.  This is done to issue a proper error message when
-multiple definitions of `main` occur.  This happens, for instance, when linking
-a code that has a Fortran program unit with a C/C++ code that also defines a
-`main` function.  A user may be required to explicitly provide the C++ runtime
-libraries at link time (e.g., via `-lstdc++` for STL)
-
 If the code is C/C++ based and invokes Fortran routines, one can either use Clang
 or Flang as the linker driver.  If Clang is used, it will automatically all
 required runtime libraries needed by C++ (e.g., for STL) to the linker invocation.
 In this case, one has to explicitly provide the Fortran runtime libraries
-`FortranRuntime` and/or `FortranDecimal`.  An alternative is to use Flang to link
-and use the `-fno-fortran-main` flag.  This flag removes
-`Fortran_main` from the linker stage and hence requires one of the C/C++
-translation units to provide a definition of the `main` function. In this case,
-it may be required to explicitly supply C++ runtime libraries as mentioned above.
-
-When creating shared or static libraries using Flang with `-shared` or `-static`
-flag, Fortran_main is automatically removed from the linker stage (i.e.,
-`-fno-fortran-main` is on by default).  It is assumed that when creating a
-static or shared library, the generated library does not need a `main`
-function, as a final link stage will occur that will provide the `Fortran_main`
-library when creating the final executable.
+`FortranRuntime` and/or `FortranDecimal`.  An alternative is to use Flang to link.
+In this case, it may be required to explicitly supply C++ runtime libraries.
 
 On Darwin, the logical root where the system libraries are located (sysroot)
 must be specified. This can be done with the CMake build flag `DEFAULT_SYSROOT`
diff --git a/flang/docs/Preprocessing.md b/flang/docs/Preprocessing.md
index 3c523472f39b..0b70d857833c 100644
--- a/flang/docs/Preprocessing.md
+++ b/flang/docs/Preprocessing.md
@@ -93,6 +93,9 @@ local:
 * If a `#define` or `#undef` directive appears among continuation
   lines, it may or may not affect text in the continued statement that
   appeared before the directive.
+* A backslash at the end of a free form source line is a continuation
+  marker, with no space skipping or special handling of a leading `&`
+  on the next line.
 
 ## Behavior that few compilers properly support (or none), but should:
 
diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp
index 2338fa1b14a3..3ca92da4f646 100644
--- a/flang/examples/FeatureList/FeatureList.cpp
+++ b/flang/examples/FeatureList/FeatureList.cpp
@@ -775,7 +775,9 @@ class FeatureListAction : public PluginParseTreeAction {
     }
   }
 
-  bool beginSourceFileAction() override { return runPrescan() && runParse(); }
+  bool beginSourceFileAction() override {
+    return runPrescan() && runParse(/*emitMessages=*/true);
+  }
 };
 
 static FrontendPluginRegistry::Add<FeatureListAction> X(
diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h
index 1e678c341d81..6b3e37cd9c25 100644
--- a/flang/include/flang/Common/Fortran-features.h
+++ b/flang/include/flang/Common/Fortran-features.h
@@ -41,20 +41,33 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
     ActualIntegerConvertedToSmallerKind, HollerithOrCharacterAsBOZ,
     BindingAsProcedure, StatementFunctionExtensions,
     UseGenericIntrinsicWhenSpecificDoesntMatch, DataStmtExtensions,
-    RedundantContiguous, InitBlankCommon, EmptyBindCDerivedType,
-    MiscSourceExtensions, AllocateToOtherLength, LongNames, IntrinsicAsSpecific,
-    BenignNameClash, BenignRedundancy, NullMoldAllocatableComponentValue,
-    NopassScalarBase, MiscUseExtensions, ImpliedDoIndexScope,
-    DistinctCommonSizes, OddIndexVariableRestrictions,
-    IndistinguishableSpecifics)
+    RedundantContiguous, RedundantAttribute, InitBlankCommon,
+    EmptyBindCDerivedType, MiscSourceExtensions, AllocateToOtherLength,
+    LongNames, IntrinsicAsSpecific, BenignNameClash, BenignRedundancy,
+    NullMoldAllocatableComponentValue, NopassScalarBase, MiscUseExtensions,
+    ImpliedDoIndexScope, DistinctCommonSizes, OddIndexVariableRestrictions,
+    IndistinguishableSpecifics, SubroutineAndFunctionSpecifics,
+    EmptySequenceType, NonSequenceCrayPointee, BranchIntoConstruct,
+    BadBranchTarget, ConvertedArgument, HollerithPolymorphic, ListDirectedSize)
 
-// Portability and suspicious usage warnings for conforming code
+// Portability and suspicious usage warnings
 ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
     NonTargetPassedToTarget, PointerToPossibleNoncontiguous,
-    ShortCharacterActual, ExprPassedToVolatile, ImplicitInterfaceActual,
-    PolymorphicTransferArg, PointerComponentTransferArg, TransferSizePresence,
-    F202XAllocatableBreakingChange, DimMustBePresent, CommonBlockPadding,
-    LogicalVsCBool, BindCCharLength, ProcDummyArgShapes, ExternalNameConflict)
+    ShortCharacterActual, ShortArrayActual, ExprPassedToVolatile,
+    ImplicitInterfaceActual, PolymorphicTransferArg,
+    PointerComponentTransferArg, TransferSizePresence,
+    F202XAllocatableBreakingChange, OptionalMustBePresent, CommonBlockPadding,
+    LogicalVsCBool, BindCCharLength, ProcDummyArgShapes, ExternalNameConflict,
+    FoldingException, FoldingAvoidsRuntimeCrash, FoldingValueChecks,
+    FoldingFailure, FoldingLimit, Interoperability, Bounds, Preprocessing,
+    Scanning, OpenAccUsage, ProcPointerCompatibility, VoidMold,
+    KnownBadImplicitInterface, EmptyCase, CaseOverflow, CUDAUsage,
+    IgnoreTKRUsage, ExternalInterfaceMismatch, DefinedOperatorArgs, Final,
+    ZeroDoStep, UnusedForallIndex, OpenMPUsage, ModuleFile, DataLength,
+    IgnoredDirective, HomonymousSpecific, HomonymousResult,
+    IgnoredIntrinsicFunctionType, PreviousScalarUse,
+    RedeclaredInaccessibleComponent, ImplicitShared, IndexVarRedefinition,
+    IncompatibleImplicitInterfaces, BadTypeForTarget)
 
 using LanguageFeatures = EnumSet<LanguageFeature, LanguageFeature_enumSize>;
 using UsageWarnings = EnumSet<UsageWarning, UsageWarning_enumSize>;
@@ -77,8 +90,57 @@ public:
     disable_.set(LanguageFeature::LogicalAbbreviations);
     disable_.set(LanguageFeature::XOROperator);
     disable_.set(LanguageFeature::OldStyleParameter);
+    // These warnings are enabled by default, but only because they used
+    // to be unconditional.  TODO: prune this list
+    warnLanguage_.set(LanguageFeature::ExponentMatchingKindParam);
+    warnLanguage_.set(LanguageFeature::RedundantAttribute);
+    warnLanguage_.set(LanguageFeature::SubroutineAndFunctionSpecifics);
+    warnLanguage_.set(LanguageFeature::EmptySequenceType);
+    warnLanguage_.set(LanguageFeature::NonSequenceCrayPointee);
+    warnLanguage_.set(LanguageFeature::BranchIntoConstruct);
+    warnLanguage_.set(LanguageFeature::BadBranchTarget);
+    warnLanguage_.set(LanguageFeature::ConvertedArgument);
+    warnLanguage_.set(LanguageFeature::HollerithPolymorphic);
+    warnLanguage_.set(LanguageFeature::ListDirectedSize);
+    warnUsage_.set(UsageWarning::ShortArrayActual);
+    warnUsage_.set(UsageWarning::FoldingException);
+    warnUsage_.set(UsageWarning::FoldingAvoidsRuntimeCrash);
+    warnUsage_.set(UsageWarning::FoldingValueChecks);
+    warnUsage_.set(UsageWarning::FoldingFailure);
+    warnUsage_.set(UsageWarning::FoldingLimit);
+    warnUsage_.set(UsageWarning::Interoperability);
+    warnUsage_.set(UsageWarning::Bounds);
+    warnUsage_.set(UsageWarning::Preprocessing);
+    warnUsage_.set(UsageWarning::Scanning);
+    warnUsage_.set(UsageWarning::OpenAccUsage);
+    warnUsage_.set(UsageWarning::ProcPointerCompatibility);
+    warnUsage_.set(UsageWarning::VoidMold);
+    warnUsage_.set(UsageWarning::KnownBadImplicitInterface);
+    warnUsage_.set(UsageWarning::EmptyCase);
+    warnUsage_.set(UsageWarning::CaseOverflow);
+    warnUsage_.set(UsageWarning::CUDAUsage);
+    warnUsage_.set(UsageWarning::IgnoreTKRUsage);
+    warnUsage_.set(UsageWarning::ExternalInterfaceMismatch);
+    warnUsage_.set(UsageWarning::DefinedOperatorArgs);
+    warnUsage_.set(UsageWarning::Final);
+    warnUsage_.set(UsageWarning::ZeroDoStep);
+    warnUsage_.set(UsageWarning::UnusedForallIndex);
+    warnUsage_.set(UsageWarning::OpenMPUsage);
+    warnUsage_.set(UsageWarning::ModuleFile);
+    warnUsage_.set(UsageWarning::DataLength);
+    warnUsage_.set(UsageWarning::IgnoredDirective);
+    warnUsage_.set(UsageWarning::HomonymousSpecific);
+    warnUsage_.set(UsageWarning::HomonymousResult);
+    warnUsage_.set(UsageWarning::IgnoredIntrinsicFunctionType);
+    warnUsage_.set(UsageWarning::PreviousScalarUse);
+    warnUsage_.set(UsageWarning::RedeclaredInaccessibleComponent);
+    warnUsage_.set(UsageWarning::ImplicitShared);
+    warnUsage_.set(UsageWarning::IndexVarRedefinition);
+    warnUsage_.set(UsageWarning::IncompatibleImplicitInterfaces);
+    warnUsage_.set(UsageWarning::BadTypeForTarget);
   }
   LanguageFeatureControl(const LanguageFeatureControl &) = default;
+
   void Enable(LanguageFeature f, bool yes = true) { disable_.set(f, !yes); }
   void EnableWarning(LanguageFeature f, bool yes = true) {
     warnLanguage_.set(f, yes);
@@ -88,10 +150,19 @@ public:
   }
   void WarnOnAllNonstandard(bool yes = true) { warnAllLanguage_ = yes; }
   void WarnOnAllUsage(bool yes = true) { warnAllUsage_ = yes; }
+  void DisableAllNonstandardWarnings() {
+    warnAllLanguage_ = false;
+    warnLanguage_.clear();
+  }
+  void DisableAllUsageWarnings() {
+    warnAllUsage_ = false;
+    warnUsage_.clear();
+  }
+
   bool IsEnabled(LanguageFeature f) const { return !disable_.test(f); }
   bool ShouldWarn(LanguageFeature f) const {
     return (warnAllLanguage_ && f != LanguageFeature::OpenMP &&
-               f != LanguageFeature::OpenACC) ||
+               f != LanguageFeature::OpenACC && f != LanguageFeature::CUDA) ||
         warnLanguage_.test(f);
   }
   bool ShouldWarn(UsageWarning w) const {
diff --git a/flang/include/flang/Common/Fortran.h b/flang/include/flang/Common/Fortran.h
index 2a53452a2774..3b965fe60c2f 100644
--- a/flang/include/flang/Common/Fortran.h
+++ b/flang/include/flang/Common/Fortran.h
@@ -114,8 +114,8 @@ static constexpr IgnoreTKRSet ignoreTKRAll{IgnoreTKR::Type, IgnoreTKR::Kind,
     IgnoreTKR::Rank, IgnoreTKR::Device, IgnoreTKR::Managed};
 std::string AsFortran(IgnoreTKRSet);
 
-bool AreCompatibleCUDADataAttrs(
-    std::optional<CUDADataAttr>, std::optional<CUDADataAttr>, IgnoreTKRSet);
+bool AreCompatibleCUDADataAttrs(std::optional<CUDADataAttr>,
+    std::optional<CUDADataAttr>, IgnoreTKRSet, bool allowUnifiedMatchingRule);
 
 static constexpr char blankCommonObjectName[] = "__BLNK__";
 
diff --git a/flang/include/flang/Common/visit.h b/flang/include/flang/Common/visit.h
index 4d0897301e01..d867338be7e0 100644
--- a/flang/include/flang/Common/visit.h
+++ b/flang/include/flang/Common/visit.h
@@ -40,11 +40,17 @@ inline RT_API_ATTRS RESULT Log2VisitHelper(
       return visitor(std::get<(LOW + N)>(std::forward<VARIANT>(u))...); \
     }
       VISIT_CASE_N(1)
+      [[fallthrough]];
       VISIT_CASE_N(2)
+      [[fallthrough]];
       VISIT_CASE_N(3)
+      [[fallthrough]];
       VISIT_CASE_N(4)
+      [[fallthrough]];
       VISIT_CASE_N(5)
+      [[fallthrough]];
       VISIT_CASE_N(6)
+      [[fallthrough]];
       VISIT_CASE_N(7)
 #undef VISIT_CASE_N
     }
@@ -82,7 +88,7 @@ inline RT_API_ATTRS auto visit(VISITOR &&visitor, VARIANT &&...u)
 // Some versions of clang have bugs that cause compilation to hang
 // on these templates.  MSVC and older GCC versions may work but are
 // not well tested.  So enable only for GCC 9 and better.
-#if __GNUC__ < 9
+#if __GNUC__ < 9 && !defined(__clang__)
 #define FLANG_USE_STD_VISIT
 #endif
 
diff --git a/flang/include/flang/Frontend/FrontendAction.h b/flang/include/flang/Frontend/FrontendAction.h
index 266050084f4c..1a800cc681cf 100644
--- a/flang/include/flang/Frontend/FrontendAction.h
+++ b/flang/include/flang/Frontend/FrontendAction.h
@@ -112,7 +112,7 @@ protected:
   bool runPrescan();
   // Parse the current input file. Return False if fatal errors are reported,
   // True otherwise.
-  bool runParse();
+  bool runParse(bool emitMessages);
   // Run semantic checks for the current input file. Return False if fatal
   // errors are reported, True otherwise.
   bool runSemanticChecks();
diff --git a/flang/include/flang/Lower/Mangler.h b/flang/include/flang/Lower/Mangler.h
index 41939abe29e5..99da96b0d6ba 100644
--- a/flang/include/flang/Lower/Mangler.h
+++ b/flang/include/flang/Lower/Mangler.h
@@ -90,7 +90,7 @@ inline std::string mangleArrayLiteral(
   return mangleArrayLiteral(x.values().size() * sizeof(x.values()[0]),
                             x.shape(), Fortran::common::TypeCategory::Derived,
                             /*kind=*/0, /*charLen=*/-1,
-                            eleTy.cast<fir::RecordType>().getName());
+                            mlir::cast<fir::RecordType>(eleTy).getName());
 }
 
 /// Return the compiler-generated name of a static namelist variable descriptor.
diff --git a/flang/include/flang/Optimizer/Analysis/TBAAForest.h b/flang/include/flang/Optimizer/Analysis/TBAAForest.h
index b69e50bbe05c..619ed4939c51 100644
--- a/flang/include/flang/Optimizer/Analysis/TBAAForest.h
+++ b/flang/include/flang/Optimizer/Analysis/TBAAForest.h
@@ -88,7 +88,7 @@ public:
     // name must be used so that we add to the tbaa tree added in the FIR pass
     mlir::Attribute attr = func->getAttr(getInternalFuncNameAttrName());
     if (attr) {
-      return getFuncTree(attr.cast<mlir::StringAttr>());
+      return getFuncTree(mlir::cast<mlir::StringAttr>(attr));
     }
     return getFuncTree(func.getSymNameAttr());
   }
diff --git a/flang/include/flang/Optimizer/Builder/BoxValue.h b/flang/include/flang/Optimizer/Builder/BoxValue.h
index 2fed2d48a7a0..5c7e89dbc08f 100644
--- a/flang/include/flang/Optimizer/Builder/BoxValue.h
+++ b/flang/include/flang/Optimizer/Builder/BoxValue.h
@@ -78,7 +78,7 @@ class CharBoxValue : public AbstractBox {
 public:
   CharBoxValue(mlir::Value addr, mlir::Value len)
       : AbstractBox{addr}, len{len} {
-    if (addr && addr.getType().template isa<fir::BoxCharType>())
+    if (addr && mlir::isa<fir::BoxCharType>(addr.getType()))
       fir::emitFatalError(addr.getLoc(),
                           "BoxChar should not be in CharBoxValue");
   }
@@ -221,7 +221,7 @@ public:
     auto type = getAddr().getType();
     if (auto pointedTy = fir::dyn_cast_ptrEleTy(type))
       type = pointedTy;
-    return type.cast<fir::BaseBoxType>();
+    return mlir::cast<fir::BaseBoxType>(type);
   }
   /// Return the part of the address type after memory and box types. That is
   /// the element type, maybe wrapped in a fir.array type.
@@ -243,22 +243,22 @@ public:
   /// Get the scalar type related to the described entity
   mlir::Type getEleTy() const {
     auto type = getBaseTy();
-    if (auto seqTy = type.dyn_cast<fir::SequenceType>())
+    if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(type))
       return seqTy.getEleTy();
     return type;
   }
 
   /// Is the entity an array or an assumed rank ?
-  bool hasRank() const { return getBaseTy().isa<fir::SequenceType>(); }
+  bool hasRank() const { return mlir::isa<fir::SequenceType>(getBaseTy()); }
   /// Is this an assumed rank ?
   bool hasAssumedRank() const {
-    auto seqTy = getBaseTy().dyn_cast<fir::SequenceType>();
+    auto seqTy = mlir::dyn_cast<fir::SequenceType>(getBaseTy());
     return seqTy && seqTy.hasUnknownShape();
   }
   /// Returns the rank of the entity. Beware that zero will be returned for
   /// both scalars and assumed rank.
   unsigned rank() const {
-    if (auto seqTy = getBaseTy().dyn_cast<fir::SequenceType>())
+    if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(getBaseTy()))
       return seqTy.getDimension();
     return 0;
   }
@@ -267,7 +267,7 @@ public:
   bool isCharacter() const { return fir::isa_char(getEleTy()); }
 
   /// Is this a derived type entity ?
-  bool isDerived() const { return getEleTy().isa<fir::RecordType>(); }
+  bool isDerived() const { return mlir::isa<fir::RecordType>(getEleTy()); }
 
   bool isDerivedWithLenParameters() const {
     return fir::isRecordWithTypeParameters(getEleTy());
@@ -377,11 +377,11 @@ public:
   }
   /// Is this a Fortran pointer ?
   bool isPointer() const {
-    return getBoxTy().getEleTy().isa<fir::PointerType>();
+    return mlir::isa<fir::PointerType>(getBoxTy().getEleTy());
   }
   /// Is this an allocatable ?
   bool isAllocatable() const {
-    return getBoxTy().getEleTy().isa<fir::HeapType>();
+    return mlir::isa<fir::HeapType>(getBoxTy().getEleTy());
   }
   // Replace the fir.ref<fir.box>, keeping any non-deferred parameters.
   MutableBoxValue clone(mlir::Value newBox) const {
@@ -488,7 +488,7 @@ public:
     if (const auto *b = getUnboxed()) {
       if (*b) {
         auto type = b->getType();
-        if (type.template isa<fir::BoxCharType>())
+        if (mlir::isa<fir::BoxCharType>(type))
           fir::emitFatalError(b->getLoc(), "BoxChar should be unboxed");
         type = fir::unwrapSequenceType(fir::unwrapRefType(type));
         if (fir::isa_char(type))
diff --git a/flang/include/flang/Optimizer/Builder/Factory.h b/flang/include/flang/Optimizer/Builder/Factory.h
index ec294d26ac96..4e5c52ac44e0 100644
--- a/flang/include/flang/Optimizer/Builder/Factory.h
+++ b/flang/include/flang/Optimizer/Builder/Factory.h
@@ -43,9 +43,9 @@ template <typename B>
 void genCharacterCopy(mlir::Value src, mlir::Value srcLen, mlir::Value dst,
                       mlir::Value dstLen, B &builder, mlir::Location loc) {
   auto srcTy =
-      fir::dyn_cast_ptrEleTy(src.getType()).template cast<fir::CharacterType>();
+      mlir::cast<fir::CharacterType>(fir::dyn_cast_ptrEleTy(src.getType()));
   auto dstTy =
-      fir::dyn_cast_ptrEleTy(dst.getType()).template cast<fir::CharacterType>();
+      mlir::cast<fir::CharacterType>(fir::dyn_cast_ptrEleTy(dst.getType()));
   if (!srcLen && !dstLen && srcTy.getFKind() == dstTy.getFKind() &&
       srcTy.getLen() == dstTy.getLen()) {
     // same size, so just use load and store
@@ -61,8 +61,8 @@ void genCharacterCopy(mlir::Value src, mlir::Value srcLen, mlir::Value dst,
         fir::CharacterType::getSingleton(ty.getContext(), ty.getFKind())));
   };
   auto toEleTy = [&](fir::ReferenceType ty) {
-    auto seqTy = ty.getEleTy().cast<fir::SequenceType>();
-    return seqTy.getEleTy().cast<fir::CharacterType>();
+    auto seqTy = mlir::cast<fir::SequenceType>(ty.getEleTy());
+    return mlir::cast<fir::CharacterType>(seqTy.getEleTy());
   };
   auto toCoorTy = [&](fir::ReferenceType ty) {
     return fir::ReferenceType::get(toEleTy(ty));
@@ -190,8 +190,8 @@ originateIndices(mlir::Location loc, B &builder, mlir::Type memTy,
   if (origins.empty()) {
     assert(!shapeVal || mlir::isa<fir::ShapeOp>(shapeVal.getDefiningOp()));
     auto ty = fir::dyn_cast_ptrOrBoxEleTy(memTy);
-    assert(ty && ty.isa<fir::SequenceType>());
-    auto seqTy = ty.cast<fir::SequenceType>();
+    assert(ty && mlir::isa<fir::SequenceType>(ty));
+    auto seqTy = mlir::cast<fir::SequenceType>(ty);
     auto one = builder.template create<mlir::arith::ConstantIndexOp>(loc, 1);
     const auto dimension = seqTy.getDimension();
     if (shapeVal) {
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index 035035601e2f..6c36f7e84db6 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -77,12 +77,12 @@ public:
   /// Return the rank of this entity or -1 if it is an assumed rank.
   int getRank() const {
     mlir::Type type = fir::unwrapPassByRefType(fir::unwrapRefType(getType()));
-    if (auto seqTy = type.dyn_cast<fir::SequenceType>()) {
+    if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(type)) {
       if (seqTy.hasUnknownShape())
         return -1;
       return seqTy.getDimension();
     }
-    if (auto exprType = type.dyn_cast<hlfir::ExprType>())
+    if (auto exprType = mlir::dyn_cast<hlfir::ExprType>(type))
       return exprType.getRank();
     return 0;
   }
@@ -99,17 +99,17 @@ public:
 
   bool hasLengthParameters() const {
     mlir::Type eleTy = getFortranElementType();
-    return eleTy.isa<fir::CharacterType>() ||
+    return mlir::isa<fir::CharacterType>(eleTy) ||
            fir::isRecordWithTypeParameters(eleTy);
   }
 
   bool isCharacter() const {
-    return getFortranElementType().isa<fir::CharacterType>();
+    return mlir::isa<fir::CharacterType>(getFortranElementType());
   }
 
   bool hasIntrinsicType() const {
     mlir::Type eleTy = getFortranElementType();
-    return fir::isa_trivial(eleTy) || eleTy.isa<fir::CharacterType>();
+    return fir::isa_trivial(eleTy) || mlir::isa<fir::CharacterType>(eleTy);
   }
 
   bool isDerivedWithLengthParameters() const {
@@ -124,8 +124,8 @@ public:
     if (auto varIface = getIfVariableInterface()) {
       if (auto shape = varIface.getShape()) {
         auto shapeTy = shape.getType();
-        return shapeTy.isa<fir::ShiftType>() ||
-               shapeTy.isa<fir::ShapeShiftType>();
+        return mlir::isa<fir::ShiftType>(shapeTy) ||
+               mlir::isa<fir::ShapeShiftType>(shapeTy);
       }
       return false;
     }
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 604f2bd969ee..b7d060926761 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -663,8 +663,8 @@ static inline mlir::FunctionType genFuncType(mlir::MLIRContext *context,
 //===----------------------------------------------------------------------===//
 static inline mlir::Type getConvertedElementType(mlir::MLIRContext *context,
                                                  mlir::Type eleTy) {
-  if (eleTy.isa<mlir::IntegerType>() && !eleTy.isSignlessInteger()) {
-    const auto intTy{eleTy.dyn_cast<mlir::IntegerType>()};
+  if (mlir::isa<mlir::IntegerType>(eleTy) && !eleTy.isSignlessInteger()) {
+    const auto intTy{mlir::dyn_cast<mlir::IntegerType>(eleTy)};
     auto newEleTy{mlir::IntegerType::get(context, intTy.getWidth())};
     return newEleTy;
   }
diff --git a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
index 1e87bf0f6ad1..a7c4c075d818 100644
--- a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
@@ -180,10 +180,10 @@ struct VecTypeInfo {
 // Returns a VecTypeInfo with element type and length of given fir vector type.
 // Preserves signness of fir vector type if element type of integer.
 static inline VecTypeInfo getVecTypeFromFirType(mlir::Type firTy) {
-  assert(firTy.isa<fir::VectorType>());
+  assert(mlir::isa<fir::VectorType>(firTy));
   VecTypeInfo vecTyInfo;
-  vecTyInfo.eleTy = firTy.dyn_cast<fir::VectorType>().getEleTy();
-  vecTyInfo.len = firTy.dyn_cast<fir::VectorType>().getLen();
+  vecTyInfo.eleTy = mlir::dyn_cast<fir::VectorType>(firTy).getEleTy();
+  vecTyInfo.len = mlir::dyn_cast<fir::VectorType>(firTy).getLen();
   return vecTyInfo;
 }
 
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h b/flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h
index 18a24bad3960..2a0baaefd0ca 100755
--- a/flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h
@@ -22,10 +22,12 @@
 
 namespace fir {
 class FirOpBuilder;
+class GlobalOp;
 } // namespace fir
 
 namespace mlir {
 class Location;
+class Value;
 } // namespace mlir
 
 namespace Fortran::lower {
@@ -37,7 +39,7 @@ namespace fir::runtime {
 /// Create the list of environment variable defaults for the runtime to set. The
 /// form of the generated list is defined in the runtime header file
 /// environment-default-list.h
-void genEnvironmentDefaults(
+mlir::Value genEnvironmentDefaults(
     fir::FirOpBuilder &builder, mlir::Location loc,
     const std::vector<Fortran::lower::EnvironmentDefault> &envDefaults);
 
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Main.h b/flang/include/flang/Optimizer/Builder/Runtime/Main.h
new file mode 100644
index 000000000000..e4c5dc914c70
--- /dev/null
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Main.h
@@ -0,0 +1,30 @@
+//===-- Main.h - generate main runtime API calls ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_BUILDER_RUNTIME_MAIN_H
+#define FORTRAN_OPTIMIZER_BUILDER_RUNTIME_MAIN_H
+
+#include "flang/Lower/EnvironmentDefault.h"
+#include <vector>
+
+namespace mlir {
+class Location;
+} // namespace mlir
+
+namespace fir {
+class FirOpBuilder;
+class GlobalOp;
+} // namespace fir
+
+namespace fir::runtime {
+
+void genMain(fir::FirOpBuilder &builder, mlir::Location loc,
+             const std::vector<Fortran::lower::EnvironmentDefault> &defs);
+}
+
+#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_MAIN_H
diff --git a/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td b/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td
index 544fc3cdf75e..0ef37a37ce94 100644
--- a/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td
+++ b/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td
@@ -21,17 +21,18 @@ include "flang/Optimizer/Dialect/FIROps.td"
 
 def IdenticalTypePred : Constraint<CPred<"$0.getType() == $1.getType()">>;
 def IntegerTypePred : Constraint<CPred<"fir::isa_integer($0.getType())">>;
-def IndexTypePred : Constraint<CPred<"$0.getType().isa<mlir::IndexType>()">>;
+def IndexTypePred : Constraint<CPred<
+                       "mlir::isa<mlir::IndexType>($0.getType())">>;
 
 // Widths are monotonic.
 //   $0.bits >= $1.bits >= $2.bits or $0.bits <= $1.bits <= $2.bits
 def MonotonicTypePred
-    : Constraint<CPred<"(($0.getType().isa<mlir::IntegerType>() && "
-                       "  $1.getType().isa<mlir::IntegerType>() && "
-                       "  $2.getType().isa<mlir::IntegerType>()) || "
-                       " ($0.getType().isa<mlir::FloatType>() && "
-                       "  $1.getType().isa<mlir::FloatType>() && "
-                       "  $2.getType().isa<mlir::FloatType>())) && "
+    : Constraint<CPred<"((mlir::isa<mlir::IntegerType>($0.getType()) && "
+                       "  mlir::isa<mlir::IntegerType>($1.getType()) && "
+                       "  mlir::isa<mlir::IntegerType>($2.getType())) || "
+                       " (mlir::isa<mlir::FloatType>($0.getType()) && "
+                       "  mlir::isa<mlir::FloatType>($1.getType()) && "
+                       "  mlir::isa<mlir::FloatType>($2.getType()))) && "
                        "(($0.getType().getIntOrFloatBitWidth() <= "
                        "  $1.getType().getIntOrFloatBitWidth() && "
                        "  $1.getType().getIntOrFloatBitWidth() <= "
@@ -42,8 +43,8 @@ def MonotonicTypePred
                        "  $2.getType().getIntOrFloatBitWidth()))">>;
 
 def IntPred : Constraint<CPred<
-                       "$0.getType().isa<mlir::IntegerType>() && "
-                       "$1.getType().isa<mlir::IntegerType>()">>;
+                       "mlir::isa<mlir::IntegerType>($0.getType()) && "
+                       "mlir::isa<mlir::IntegerType>($1.getType())">>;
                        
 // If both are int type and the first is smaller than the second.
 //   $0.bits <= $1.bits
@@ -101,8 +102,8 @@ def CombineConvertTruncOptPattern
 def createConstantOp
     : NativeCodeCall<"$_builder.create<mlir::arith::ConstantOp>"
                      "($_loc, $_builder.getIndexType(), "
-                     "rewriter.getIndexAttr($1.dyn_cast<mlir::IntegerAttr>()"
-                     ".getInt()))">;
+                     "rewriter.getIndexAttr("
+                     "mlir::dyn_cast<mlir::IntegerAttr>($1).getInt()))">;
 
 def ForwardConstantConvertPattern
     : Pat<(fir_ConvertOp:$res (Arith_ConstantOp:$cnt $attr)),
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 92790a691e47..dc38e56d93c6 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -2708,14 +2708,14 @@ def fir_ConvertOp : fir_OneResultOp<"convert", [NoMemoryEffect]> {
   let hasCanonicalizer = 1;
 }
 
-def FortranTypeAttr : Attr<And<[CPred<"$_self.isa<mlir::TypeAttr>()">,
-    Or<[CPred<"$_self.cast<mlir::TypeAttr>().getValue().isa<fir::CharacterType,"
-              "fir::ComplexType, fir::IntegerType, fir::LogicalType,"
-              "fir::RealType, fir::RecordType>()">]>]>,
-    "Fortran surface type"> {
+def FortranTypeAttr : Attr<And<[CPred<"mlir::isa<mlir::TypeAttr>($_self)">,
+    Or<[CPred<"mlir::isa<fir::CharacterType, fir::ComplexType, "
+              "fir::IntegerType, fir::LogicalType, fir::RealType, "
+              "fir::RecordType>(mlir::cast<mlir::TypeAttr>($_self).getValue())"
+    >]>]>, "Fortran surface type"> {
   let storageType = [{ ::mlir::TypeAttr }];
   let returnType = "mlir::Type";
-  let convertFromStorage = "$_self.getValue().cast<mlir::Type>()";
+  let convertFromStorage = "mlir::cast<mlir::Type>($_self.getValue())";
 }
 
 def fir_TypeDescOp : fir_OneResultOp<"type_desc", [NoMemoryEffect]> {
@@ -3074,6 +3074,7 @@ def fir_DeclareOp : fir_Op<"declare", [AttrSizedOperandSegments,
     AnyRefOrBox:$memref,
     Optional<AnyShapeOrShiftType>:$shape,
     Variadic<AnyIntegerType>:$typeparams,
+    Optional<fir_DummyScopeType>:$dummy_scope,
     Builtin_StringAttr:$uniq_name,
     OptionalAttr<fir_FortranVariableFlagsAttr>:$fortran_attrs,
     OptionalAttr<fir_CUDADataAttributeAttr>:$cuda_attr
@@ -3083,7 +3084,8 @@ def fir_DeclareOp : fir_Op<"declare", [AttrSizedOperandSegments,
 
   let assemblyFormat = [{
     $memref (`(` $shape^ `)`)? (`typeparams` $typeparams^)?
-     attr-dict `:` functional-type(operands, results)
+    (`dummy_scope` $dummy_scope^)?
+    attr-dict `:` functional-type(operands, results)
   }];
 
   let hasVerifier = 1;
@@ -3247,4 +3249,138 @@ def fir_CUDADeallocateOp : fir_Op<"cuda_deallocate",
   let hasVerifier = 1;
 }
 
+def fir_DummyScopeOp : fir_Op<"dummy_scope",
+    [MemoryEffects<[MemWrite<DebuggingResource>]>]> {
+  let summary = "Define a scope for dummy arguments";
+
+  let description = [{
+    An abstract handle to be used to associate dummy arguments of the same
+    subroutine between each other. By lowering, all [hl]fir.declare
+    operations representing declarations of dummy arguments of a subroutine
+    use the result of this operation. This allows recognizing the references
+    of these dummy arguments as belonging to the same runtime instance
+    of the subroutine even after MLIR inlining. Thus, the Fortran aliasing
+    rules might be applied to those references based on the original
+    declarations of the dummy arguments.
+    For example:
+    ```
+      subroutine test(x, y)
+        real, target :: x, y
+        x = y ! may alias
+        call inner(x, y)
+      contains
+        subroutine inner(x, y)
+          real :: x, y
+          x = y ! may not alias
+        end subroutine inner
+      end subroutine test
+    ```
+    After MLIR inlining this may look like this:
+    ```
+      func.func @_QPtest(
+          %arg0: !fir.ref<f32> {fir.target},
+          %arg1: !fir.ref<f32> {fir.target}) {
+        %0 = fir.declare %arg0 {fortran_attrs = #fir.var_attrs<target>} :
+            (!fir.ref<f32>) -> !fir.ref<f32>
+        %1 = fir.declare %arg1 {fortran_attrs = #fir.var_attrs<target>} :
+            (!fir.ref<f32>) -> !fir.ref<f32>
+        %2 = fir.load %1 : !fir.ref<f32>
+        fir.store %2 to %0 : !fir.ref<f32>
+        %3 = fir.declare %0 : (!fir.ref<f32>) -> !fir.ref<f32>
+        %4 = fir.declare %1 : (!fir.ref<f32>) -> !fir.ref<f32>
+        %5 = fir.load %4 : !fir.ref<f32>
+        fir.store %5 to %3 : !fir.ref<f32>
+        return
+      }
+    ```
+    Without marking %3 and %4 as declaring the dummy arguments
+    of the same runtime instance of `inner` subroutine the FIR
+    AliasAnalysis cannot deduce non-aliasing for the second load/store pair.
+    This information may be preserved by using fir.dummy_scope operation:
+    ```
+      func.func @_QPtest(
+          %arg0: !fir.ref<f32> {fir.target},
+          %arg1: !fir.ref<f32> {fir.target}) {
+        %h1 = fir.dummy_scope : i1
+        %0 = fir.declare %arg0 dummy_scope(%h1)
+            {fortran_attrs = #fir.var_attrs<target>} :
+            (!fir.ref<f32>) -> !fir.ref<f32>
+        %1 = fir.declare %arg1 dummy_scope(%h1)
+            {fortran_attrs = #fir.var_attrs<target>} :
+            (!fir.ref<f32>) -> !fir.ref<f32>
+        %2 = fir.load %1 : !fir.ref<f32>
+        fir.store %2 to %0 : !fir.ref<f32>
+        %h2 = fir.dummy_scope : i1
+        %3 = fir.declare %0 dummy_scope(%h2) : (!fir.ref<f32>) -> !fir.ref<f32>
+        %4 = fir.declare %1 dummy_scope(%h2) : (!fir.ref<f32>) -> !fir.ref<f32>
+        %5 = fir.load %4 : !fir.ref<f32>
+        fir.store %5 to %3 : !fir.ref<f32>
+        return
+      }
+    ```
+    Note that even if `inner` is called and inlined twice inside
+    `test`, the two inlined instances of `inner` must use two different
+    fir.dummy_scope operations for their fir.declare ops. This
+    two distinct fir.dummy_scope must remain distinct during the optimizations.
+    This is guaranteed by the write memory effect on the DebuggingResource.
+  }];
+
+  let results = (outs fir_DummyScopeType);
+  let assemblyFormat = "attr-dict `:` type(results)";
+}
+
+def fir_CUDAAllocOp : fir_Op<"cuda_alloc", [AttrSizedOperandSegments,
+    MemoryEffects<[MemAlloc]>]> {
+  let summary = "Allocate an object on device";
+
+  let description = [{
+    This is a drop in replacement for fir.alloca and fir.allocmem for device
+    object. Any device, managed or unified object declared in an host
+    subprogram needs to be allocated in the device memory through runtime calls.
+    The fir.cuda_alloc is an abstraction to the runtime calls and works together
+    with fir.cuda_free.
+  }];
+
+  let arguments = (ins
+    TypeAttr:$in_type,
+    OptionalAttr<StrAttr>:$uniq_name,
+    OptionalAttr<StrAttr>:$bindc_name,
+    Variadic<AnyIntegerType>:$typeparams,
+    Variadic<AnyIntegerType>:$shape,
+    fir_CUDADataAttributeAttr:$cuda_attr
+  );
+
+  let results = (outs fir_ReferenceType:$ptr);
+
+  let assemblyFormat = [{
+    $in_type (`(` $typeparams^ `:` type($typeparams) `)`)?
+        (`,` $shape^ `:` type($shape) )?  attr-dict `->` qualified(type($ptr))
+  }];
+
+  let builders = [
+    OpBuilder<(ins "mlir::Type":$inType, "llvm::StringRef":$uniqName,
+      "llvm::StringRef":$bindcName,
+      "fir::CUDADataAttributeAttr":$cudaAttr,
+      CArg<"mlir::ValueRange", "{}">:$typeparams,
+      CArg<"mlir::ValueRange", "{}">:$shape,
+      CArg<"llvm::ArrayRef<mlir::NamedAttribute>", "{}">:$attributes)>];
+}
+
+def fir_CUDAFreeOp : fir_Op<"cuda_free", [MemoryEffects<[MemFree]>]> {
+  let summary = "Free a device allocated object";
+
+  let description = [{
+    The fir.cuda_free operation frees the memory allocated by fir.cuda_alloc.
+    This is used for non-allocatable device, managed and unified device
+    variables declare in host subprogram.
+  }];
+
+  let arguments = (ins
+    Arg<AnyReferenceLike, "", [MemFree]>:$devptr,
+    fir_CUDADataAttributeAttr:$cuda_attr
+  );
+
+  let assemblyFormat = "$devptr `:` qualified(type($devptr)) attr-dict";
+}
+
 #endif
diff --git a/flang/include/flang/Optimizer/Dialect/FIRType.h b/flang/include/flang/Optimizer/Dialect/FIRType.h
index 7fcd9c1babf2..b4344435db9f 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRType.h
+++ b/flang/include/flang/Optimizer/Dialect/FIRType.h
@@ -97,35 +97,36 @@ bool isa_fir_or_std_type(mlir::Type t);
 
 /// Is `t` a FIR dialect type that implies a memory (de)reference?
 inline bool isa_ref_type(mlir::Type t) {
-  return t.isa<fir::ReferenceType, fir::PointerType, fir::HeapType,
-               fir::LLVMPointerType>();
+  return mlir::isa<fir::ReferenceType, fir::PointerType, fir::HeapType,
+                   fir::LLVMPointerType>(t);
 }
 
 /// Is `t` a boxed type?
 inline bool isa_box_type(mlir::Type t) {
-  return t.isa<fir::BaseBoxType, fir::BoxCharType, fir::BoxProcType>();
+  return mlir::isa<fir::BaseBoxType, fir::BoxCharType, fir::BoxProcType>(t);
 }
 
 /// Is `t` a type that is always trivially pass-by-reference? Specifically, this
 /// is testing if `t` is a ReferenceType or any box type. Compare this to
 /// conformsWithPassByRef(), which includes pointers and allocatables.
 inline bool isa_passbyref_type(mlir::Type t) {
-  return t.isa<fir::ReferenceType, mlir::FunctionType>() || isa_box_type(t);
+  return mlir::isa<fir::ReferenceType, mlir::FunctionType>(t) ||
+         isa_box_type(t);
 }
 
 /// Is `t` a type that can conform to be pass-by-reference? Depending on the
 /// context, these types may simply demote to pass-by-reference or a reference
 /// to them may have to be passed instead. Functions are always referent.
 inline bool conformsWithPassByRef(mlir::Type t) {
-  return isa_ref_type(t) || isa_box_type(t) || t.isa<mlir::FunctionType>();
+  return isa_ref_type(t) || isa_box_type(t) || mlir::isa<mlir::FunctionType>(t);
 }
 
 /// Is `t` a derived (record) type?
-inline bool isa_derived(mlir::Type t) { return t.isa<fir::RecordType>(); }
+inline bool isa_derived(mlir::Type t) { return mlir::isa<fir::RecordType>(t); }
 
 /// Is `t` type(c_ptr) or type(c_funptr)?
 inline bool isa_builtin_cptr_type(mlir::Type t) {
-  if (auto recTy = t.dyn_cast_or_null<fir::RecordType>())
+  if (auto recTy = mlir::dyn_cast_or_null<fir::RecordType>(t))
     return recTy.getName().ends_with("T__builtin_c_ptr") ||
            recTy.getName().ends_with("T__builtin_c_funptr");
   return false;
@@ -133,7 +134,7 @@ inline bool isa_builtin_cptr_type(mlir::Type t) {
 
 /// Is `t` a FIR dialect aggregate type?
 inline bool isa_aggregate(mlir::Type t) {
-  return t.isa<SequenceType, mlir::TupleType>() || fir::isa_derived(t);
+  return mlir::isa<SequenceType, mlir::TupleType>(t) || fir::isa_derived(t);
 }
 
 /// Extract the `Type` pointed to from a FIR memory reference type. If `t` is
@@ -146,17 +147,17 @@ mlir::Type dyn_cast_ptrOrBoxEleTy(mlir::Type t);
 
 /// Is `t` a FIR Real or MLIR Float type?
 inline bool isa_real(mlir::Type t) {
-  return t.isa<fir::RealType, mlir::FloatType>();
+  return mlir::isa<fir::RealType, mlir::FloatType>(t);
 }
 
 /// Is `t` an integral type?
 inline bool isa_integer(mlir::Type t) {
-  return t.isa<mlir::IndexType, mlir::IntegerType, fir::IntegerType>();
+  return mlir::isa<mlir::IndexType, mlir::IntegerType, fir::IntegerType>(t);
 }
 
 /// Is `t` a vector type?
 inline bool isa_vector(mlir::Type t) {
-  return t.isa<mlir::VectorType, fir::VectorType>();
+  return mlir::isa<mlir::VectorType, fir::VectorType>(t);
 }
 
 mlir::Type parseFirType(FIROpsDialect *, mlir::DialectAsmParser &parser);
@@ -169,22 +170,22 @@ void verifyIntegralType(mlir::Type type);
 
 /// Is `t` a FIR or MLIR Complex type?
 inline bool isa_complex(mlir::Type t) {
-  return t.isa<fir::ComplexType, mlir::ComplexType>();
+  return mlir::isa<fir::ComplexType, mlir::ComplexType>(t);
 }
 
 /// Is `t` a CHARACTER type? Does not check the length.
-inline bool isa_char(mlir::Type t) { return t.isa<fir::CharacterType>(); }
+inline bool isa_char(mlir::Type t) { return mlir::isa<fir::CharacterType>(t); }
 
 /// Is `t` a trivial intrinsic type? CHARACTER is <em>excluded</em> because it
 /// is a dependent type.
 inline bool isa_trivial(mlir::Type t) {
   return isa_integer(t) || isa_real(t) || isa_complex(t) || isa_vector(t) ||
-         t.isa<fir::LogicalType>();
+         mlir::isa<fir::LogicalType>(t);
 }
 
 /// Is `t` a CHARACTER type with a LEN other than 1?
 inline bool isa_char_string(mlir::Type t) {
-  if (auto ct = t.dyn_cast_or_null<fir::CharacterType>())
+  if (auto ct = mlir::dyn_cast_or_null<fir::CharacterType>(t))
     return ct.getLen() != fir::CharacterType::singleton();
   return false;
 }
@@ -198,7 +199,7 @@ bool isa_unknown_size_box(mlir::Type t);
 
 /// Returns true iff `t` is a fir.char type and has an unknown length.
 inline bool characterWithDynamicLen(mlir::Type t) {
-  if (auto charTy = t.dyn_cast<fir::CharacterType>())
+  if (auto charTy = mlir::dyn_cast<fir::CharacterType>(t))
     return charTy.hasDynamicLen();
   return false;
 }
@@ -213,11 +214,11 @@ inline bool sequenceWithNonConstantShape(fir::SequenceType seqTy) {
 bool hasDynamicSize(mlir::Type t);
 
 inline unsigned getRankOfShapeType(mlir::Type t) {
-  if (auto shTy = t.dyn_cast<fir::ShapeType>())
+  if (auto shTy = mlir::dyn_cast<fir::ShapeType>(t))
     return shTy.getRank();
-  if (auto shTy = t.dyn_cast<fir::ShapeShiftType>())
+  if (auto shTy = mlir::dyn_cast<fir::ShapeShiftType>(t))
     return shTy.getRank();
-  if (auto shTy = t.dyn_cast<fir::ShiftType>())
+  if (auto shTy = mlir::dyn_cast<fir::ShiftType>(t))
     return shTy.getRank();
   return 0;
 }
@@ -225,14 +226,14 @@ inline unsigned getRankOfShapeType(mlir::Type t) {
 /// Get the memory reference type of the data pointer from the box type,
 inline mlir::Type boxMemRefType(fir::BaseBoxType t) {
   auto eleTy = t.getEleTy();
-  if (!eleTy.isa<fir::PointerType, fir::HeapType>())
+  if (!mlir::isa<fir::PointerType, fir::HeapType>(eleTy))
     eleTy = fir::ReferenceType::get(t);
   return eleTy;
 }
 
 /// If `t` is a SequenceType return its element type, otherwise return `t`.
 inline mlir::Type unwrapSequenceType(mlir::Type t) {
-  if (auto seqTy = t.dyn_cast<fir::SequenceType>())
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(t))
     return seqTy.getEleTy();
   return t;
 }
@@ -278,7 +279,7 @@ inline fir::SequenceType unwrapUntilSeqType(mlir::Type t) {
       t = ty;
       continue;
     }
-    if (auto seqTy = t.dyn_cast<fir::SequenceType>())
+    if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(t))
       return seqTy;
     return {};
   }
@@ -287,8 +288,8 @@ inline fir::SequenceType unwrapUntilSeqType(mlir::Type t) {
 /// Unwrap the referential and sequential outer types (if any). Returns the
 /// the element if type is fir::RecordType
 inline fir::RecordType unwrapIfDerived(fir::BaseBoxType boxTy) {
-  return fir::unwrapSequenceType(fir::unwrapRefType(boxTy.getEleTy()))
-      .template dyn_cast<fir::RecordType>();
+  return mlir::dyn_cast<fir::RecordType>(
+      fir::unwrapSequenceType(fir::unwrapRefType(boxTy.getEleTy())));
 }
 
 /// Return true iff `boxTy` wraps a fir::RecordType with length parameters
@@ -377,7 +378,7 @@ bool isRecordWithDescriptorMember(mlir::Type ty);
 
 /// Return true iff `ty` is a RecordType with type parameters.
 inline bool isRecordWithTypeParameters(mlir::Type ty) {
-  if (auto recTy = ty.dyn_cast_or_null<fir::RecordType>())
+  if (auto recTy = mlir::dyn_cast_or_null<fir::RecordType>(ty))
     return recTy.isDependentType();
   return false;
 }
@@ -401,14 +402,14 @@ mlir::Type fromRealTypeID(mlir::MLIRContext *context, llvm::Type::TypeID typeID,
 int getTypeCode(mlir::Type ty, const KindMapping &kindMap);
 
 inline bool BaseBoxType::classof(mlir::Type type) {
-  return type.isa<fir::BoxType, fir::ClassType>();
+  return mlir::isa<fir::BoxType, fir::ClassType>(type);
 }
 
 /// Return true iff `ty` is none or fir.array<none>.
 inline bool isNoneOrSeqNone(mlir::Type type) {
-  if (auto seqTy = type.dyn_cast<fir::SequenceType>())
-    return seqTy.getEleTy().isa<mlir::NoneType>();
-  return type.isa<mlir::NoneType>();
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(type))
+    return mlir::isa<mlir::NoneType>(seqTy.getEleTy());
+  return mlir::isa<mlir::NoneType>(type);
 }
 
 /// Return a fir.box<T> or fir.class<T> if the type is polymorphic. If the type
@@ -428,16 +429,16 @@ inline mlir::Type wrapInClassOrBoxType(mlir::Type eleTy,
 /// !fir.array<2xf32> -> !fir.array<2xnone>
 /// !fir.heap<!fir.array<2xf32>> -> !fir.heap<!fir.array<2xnone>>
 inline mlir::Type updateTypeForUnlimitedPolymorphic(mlir::Type ty) {
-  if (auto seqTy = ty.dyn_cast<fir::SequenceType>())
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
     return fir::SequenceType::get(
         seqTy.getShape(), updateTypeForUnlimitedPolymorphic(seqTy.getEleTy()));
-  if (auto heapTy = ty.dyn_cast<fir::HeapType>())
+  if (auto heapTy = mlir::dyn_cast<fir::HeapType>(ty))
     return fir::HeapType::get(
         updateTypeForUnlimitedPolymorphic(heapTy.getEleTy()));
-  if (auto pointerTy = ty.dyn_cast<fir::PointerType>())
+  if (auto pointerTy = mlir::dyn_cast<fir::PointerType>(ty))
     return fir::PointerType::get(
         updateTypeForUnlimitedPolymorphic(pointerTy.getEleTy()));
-  if (!ty.isa<mlir::NoneType, fir::RecordType>())
+  if (!mlir::isa<mlir::NoneType, fir::RecordType>(ty))
     return mlir::NoneType::get(ty.getContext());
   return ty;
 }
@@ -451,18 +452,19 @@ mlir::Type changeElementType(mlir::Type type, mlir::Type newElementType,
 
 /// Is `t` an address to fir.box or class type?
 inline bool isBoxAddress(mlir::Type t) {
-  return fir::isa_ref_type(t) && fir::unwrapRefType(t).isa<fir::BaseBoxType>();
+  return fir::isa_ref_type(t) &&
+         mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(t));
 }
 
 /// Is `t` a fir.box or class address or value type?
 inline bool isBoxAddressOrValue(mlir::Type t) {
-  return fir::unwrapRefType(t).isa<fir::BaseBoxType>();
+  return mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(t));
 }
 
 /// Is this a fir.boxproc address type?
 inline bool isBoxProcAddressType(mlir::Type t) {
   t = fir::dyn_cast_ptrEleTy(t);
-  return t && t.isa<fir::BoxProcType>();
+  return t && mlir::isa<fir::BoxProcType>(t);
 }
 
 /// Return a string representation of `ty`.
diff --git a/flang/include/flang/Optimizer/Dialect/FIRTypes.td b/flang/include/flang/Optimizer/Dialect/FIRTypes.td
index 3b876e4642da..ae984de63db4 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRTypes.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRTypes.td
@@ -576,9 +576,20 @@ def fir_VoidType : FIR_Type<"Void", "void"> {
   let genStorageClass = 0;
 }
 
+def fir_DummyScopeType : FIR_Type<"DummyScope", "dscope"> {
+  let summary = "Dummy scope type";
+
+  let description = [{
+    `fir.dscope` is a type returned by fir.dummy_scope operation.
+    It defines a unique identifier for a runtime instance of a subroutine
+    that is used by the [hl]fir.declare operations representing
+    the dummy arguments' declarations.
+  }];
+}
+
 // Whether a type is a BaseBoxType
 def IsBaseBoxTypePred
-        : CPred<"$_self.isa<::fir::BaseBoxType>()">;
+        : CPred<"mlir::isa<::fir::BaseBoxType>($_self)">;
 def fir_BaseBoxType : Type<IsBaseBoxTypePred, "fir.box or fir.class type">;
 
 // Generalized FIR and standard dialect types representing intrinsic types
diff --git a/flang/include/flang/Optimizer/Dialect/FortranVariableInterface.td b/flang/include/flang/Optimizer/Dialect/FortranVariableInterface.td
index 6405afbf1bfb..3f78a93a2515 100644
--- a/flang/include/flang/Optimizer/Dialect/FortranVariableInterface.td
+++ b/flang/include/flang/Optimizer/Dialect/FortranVariableInterface.td
@@ -75,7 +75,7 @@ def fir_FortranVariableOpInterface : OpInterface<"FortranVariableOpInterface"> {
     /// variable.
     mlir::Type getElementOrSequenceType() {
       mlir::Type type = fir::unwrapPassByRefType(fir::unwrapRefType(getBase().getType()));
-      if (auto boxCharType = type.dyn_cast<fir::BoxCharType>())
+      if (auto boxCharType = mlir::dyn_cast<fir::BoxCharType>(type))
         return boxCharType.getEleTy();
       return type;
     }
@@ -87,13 +87,13 @@ def fir_FortranVariableOpInterface : OpInterface<"FortranVariableOpInterface"> {
 
     /// Is the variable an array?
     bool isArray() {
-      return getElementOrSequenceType().isa<fir::SequenceType>();
+      return mlir::isa<fir::SequenceType>(getElementOrSequenceType());
     }
 
     /// Return the rank of the entity if it is known at compile time.
     std::optional<unsigned> getRank() {
       if (auto sequenceType =
-            getElementOrSequenceType().dyn_cast<fir::SequenceType>()) {
+            mlir::dyn_cast<fir::SequenceType>(getElementOrSequenceType())) {
         if (sequenceType.hasUnknownShape())
           return {};
         return sequenceType.getDimension();
@@ -133,7 +133,7 @@ def fir_FortranVariableOpInterface : OpInterface<"FortranVariableOpInterface"> {
 
     /// Is this a Fortran character variable?
     bool isCharacter() {
-      return getElementType().isa<fir::CharacterType>();
+      return mlir::isa<fir::CharacterType>(getElementType());
     }
 
     /// Is this a Fortran character variable with an explicit length?
@@ -149,7 +149,7 @@ def fir_FortranVariableOpInterface : OpInterface<"FortranVariableOpInterface"> {
 
     /// Is this variable represented as a fir.box or fir.class value?
     bool isBoxValue() {
-      return getBase().getType().isa<fir::BaseBoxType>();
+      return mlir::isa<fir::BaseBoxType>(getBase().getType());
     }
 
     /// Is this variable represented as a fir.box or fir.class address?
diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h b/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h
index aa68d0811c48..3830237f96f3 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h
@@ -40,9 +40,9 @@ namespace hlfir {
 inline mlir::Type getFortranElementType(mlir::Type type) {
   type = fir::unwrapSequenceType(
       fir::unwrapPassByRefType(fir::unwrapRefType(type)));
-  if (auto exprType = type.dyn_cast<hlfir::ExprType>())
+  if (auto exprType = mlir::dyn_cast<hlfir::ExprType>(type))
     return exprType.getEleTy();
-  if (auto boxCharType = type.dyn_cast<fir::BoxCharType>())
+  if (auto boxCharType = mlir::dyn_cast<fir::BoxCharType>(type))
     return boxCharType.getEleTy();
   return type;
 }
@@ -51,12 +51,12 @@ inline mlir::Type getFortranElementType(mlir::Type type) {
 /// fir.array type. Otherwise, returns the Fortran element typeof the entity.
 inline mlir::Type getFortranElementOrSequenceType(mlir::Type type) {
   type = fir::unwrapPassByRefType(fir::unwrapRefType(type));
-  if (auto exprType = type.dyn_cast<hlfir::ExprType>()) {
+  if (auto exprType = mlir::dyn_cast<hlfir::ExprType>(type)) {
     if (exprType.isArray())
       return fir::SequenceType::get(exprType.getShape(), exprType.getEleTy());
     return exprType.getEleTy();
   }
-  if (auto boxCharType = type.dyn_cast<fir::BoxCharType>())
+  if (auto boxCharType = mlir::dyn_cast<fir::BoxCharType>(type))
     return boxCharType.getEleTy();
   return type;
 }
@@ -64,16 +64,16 @@ inline mlir::Type getFortranElementOrSequenceType(mlir::Type type) {
 /// Is this a fir.box or fir.class address type?
 inline bool isBoxAddressType(mlir::Type type) {
   type = fir::dyn_cast_ptrEleTy(type);
-  return type && type.isa<fir::BaseBoxType>();
+  return type && mlir::isa<fir::BaseBoxType>(type);
 }
 
 /// Is this a fir.box or fir.class address or value type?
 inline bool isBoxAddressOrValueType(mlir::Type type) {
-  return fir::unwrapRefType(type).isa<fir::BaseBoxType>();
+  return mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(type));
 }
 
 inline bool isPolymorphicType(mlir::Type type) {
-  if (auto exprType = type.dyn_cast<hlfir::ExprType>())
+  if (auto exprType = mlir::dyn_cast<hlfir::ExprType>(type))
     return exprType.isPolymorphic();
   return fir::isPolymorphicType(type);
 }
@@ -81,14 +81,14 @@ inline bool isPolymorphicType(mlir::Type type) {
 /// Is this an SSA value type for the value of a Fortran procedure
 /// designator ?
 inline bool isFortranProcedureValue(mlir::Type type) {
-  return type.isa<fir::BoxProcType>() ||
-         (type.isa<mlir::TupleType>() &&
+  return mlir::isa<fir::BoxProcType>(type) ||
+         (mlir::isa<mlir::TupleType>(type) &&
           fir::isCharacterProcedureTuple(type, /*acceptRawFunc=*/false));
 }
 
 /// Is this an SSA value type for the value of a Fortran expression?
 inline bool isFortranValueType(mlir::Type type) {
-  return type.isa<hlfir::ExprType>() || fir::isa_trivial(type) ||
+  return mlir::isa<hlfir::ExprType>(type) || fir::isa_trivial(type) ||
          isFortranProcedureValue(type);
 }
 
diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
index 743a6c98ec1a..ee3c26800ae3 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -87,6 +87,7 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments,
     AnyRefOrBox:$memref,
     Optional<AnyShapeOrShiftType>:$shape,
     Variadic<AnyIntegerType>:$typeparams,
+    Optional<fir_DummyScopeType>:$dummy_scope,
     Builtin_StringAttr:$uniq_name,
     OptionalAttr<fir_FortranVariableFlagsAttr>:$fortran_attrs,
     OptionalAttr<fir_CUDADataAttributeAttr>:$cuda_attr
@@ -96,7 +97,8 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments,
 
   let assemblyFormat = [{
     $memref (`(` $shape^ `)`)? (`typeparams` $typeparams^)?
-     attr-dict `:` functional-type(operands, results)
+    (`dummy_scope` $dummy_scope^)?
+    attr-dict `:` functional-type(operands, results)
   }];
 
   let builders = [
diff --git a/flang/include/flang/Optimizer/Support/Utils.h b/flang/include/flang/Optimizer/Support/Utils.h
index 2b4fa50e0e42..2da6f24da40e 100644
--- a/flang/include/flang/Optimizer/Support/Utils.h
+++ b/flang/include/flang/Optimizer/Support/Utils.h
@@ -29,7 +29,9 @@
 namespace fir {
 /// Return the integer value of a arith::ConstantOp.
 inline std::int64_t toInt(mlir::arith::ConstantOp cop) {
-  return cop.getValue().cast<mlir::IntegerAttr>().getValue().getSExtValue();
+  return mlir::cast<mlir::IntegerAttr>(cop.getValue())
+      .getValue()
+      .getSExtValue();
 }
 
 // Reconstruct binding tables for dynamic dispatch.
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index 402f212387e4..470ed8a125ac 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -47,33 +47,26 @@ namespace fir {
 #define GEN_PASS_DECL_POLYMORPHICOPCONVERSION
 #define GEN_PASS_DECL_OPENACCDATAOPERANDCONVERSION
 #define GEN_PASS_DECL_ADDDEBUGINFO
+#define GEN_PASS_DECL_STACKARRAYS
+#define GEN_PASS_DECL_LOOPVERSIONING
 #include "flang/Optimizer/Transforms/Passes.h.inc"
 
 std::unique_ptr<mlir::Pass> createAffineDemotionPass();
 std::unique_ptr<mlir::Pass>
 createArrayValueCopyPass(fir::ArrayValueCopyOptions options = {});
-std::unique_ptr<mlir::Pass> createCharacterConversionPass();
 std::unique_ptr<mlir::Pass> createExternalNameConversionPass();
 std::unique_ptr<mlir::Pass>
 createExternalNameConversionPass(bool appendUnderscore);
 std::unique_ptr<mlir::Pass> createMemDataFlowOptPass();
 std::unique_ptr<mlir::Pass> createPromoteToAffinePass();
-std::unique_ptr<mlir::Pass> createMemoryAllocationPass();
-std::unique_ptr<mlir::Pass> createStackArraysPass();
 std::unique_ptr<mlir::Pass> createAliasTagsPass();
-std::unique_ptr<mlir::Pass> createSimplifyIntrinsicsPass();
 std::unique_ptr<mlir::Pass>
 createAddDebugInfoPass(fir::AddDebugInfoOptions options = {});
-std::unique_ptr<mlir::Pass> createLoopVersioningPass();
 
-std::unique_ptr<mlir::Pass>
-createMemoryAllocationPass(bool dynOnHeap, std::size_t maxStackSize);
 std::unique_ptr<mlir::Pass> createAnnotateConstantOperandsPass();
-std::unique_ptr<mlir::Pass> createSimplifyRegionLitePass();
 std::unique_ptr<mlir::Pass> createAlgebraicSimplificationPass();
 std::unique_ptr<mlir::Pass>
 createAlgebraicSimplificationPass(const mlir::GreedyRewriteConfig &config);
-std::unique_ptr<mlir::Pass> createPolymorphicOpConversionPass();
 
 std::unique_ptr<mlir::Pass> createOMPDescriptorMapInfoGenPass();
 std::unique_ptr<mlir::Pass> createOMPFunctionFilteringPass();
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 88e4321e5b2b..1eaaa32a508a 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -127,7 +127,6 @@ def CharacterConversion : Pass<"character-conversion"> {
     By default the translation is to naively zero-extend or truncate a code
     point to fit the destination size.
   }];
-  let constructor = "::fir::createCharacterConversionPass()";
   let dependentDialects = [ "fir::FIROpsDialect" ];
   let options = [
     Option<"useRuntimeCalls", "use-runtime-calls",
@@ -225,7 +224,6 @@ def SimplifyIntrinsics : Pass<"simplify-intrinsics", "mlir::ModuleOp"> {
     simplified function. The simplified function is added to the current module.
     This function can be inlined by a general purpose inlining pass.
   }];
-  let constructor = "::fir::createSimplifyIntrinsicsPass()";
 
   let options = [
     Option<"enableExperimental", "enable-experimental", "bool",
@@ -249,7 +247,6 @@ def MemoryAllocationOpt : Pass<"memory-allocation-opt", "mlir::func::FuncOp"> {
            "std::size_t", /*default=*/"~static_cast<std::size_t>(0)",
            "Set maximum number of elements of an array allocated on the stack.">
   ];
-  let constructor = "::fir::createMemoryAllocationPass()";
 }
 
 def StackArrays : Pass<"stack-arrays", "mlir::ModuleOp"> {
@@ -259,7 +256,6 @@ def StackArrays : Pass<"stack-arrays", "mlir::ModuleOp"> {
     allocations.
   }];
   let dependentDialects = [ "fir::FIROpsDialect" ];
-  let constructor = "::fir::createStackArraysPass()";
 }
 
 def AddAliasTags : Pass<"fir-add-alias-tags", "mlir::ModuleOp"> {
@@ -287,7 +283,6 @@ def SimplifyRegionLite : Pass<"simplify-region-lite", "mlir::ModuleOp"> {
   let description = [{
     Run region DCE and erase unreachable blocks in regions.
   }];
-  let constructor = "::fir::createSimplifyRegionLitePass()";
 }
 
 def AlgebraicSimplification : Pass<"flang-algebraic-simplification"> {
@@ -303,14 +298,13 @@ def AlgebraicSimplification : Pass<"flang-algebraic-simplification"> {
   let constructor = "::fir::createAlgebraicSimplificationPass()";
 }
 
-def PolymorphicOpConversion : Pass<"fir-polymorphic-op", "::mlir::func::FuncOp"> {
+def PolymorphicOpConversion : Pass<"fir-polymorphic-op", "mlir::ModuleOp"> {
   let summary =
     "Simplify operations on polymorphic types";
   let description = [{
     This pass breaks up the lowering of operations on polymorphic types by 
     introducing an intermediate FIR level that simplifies code geneation. 
   }];
-  let constructor = "::fir::createPolymorphicOpConversionPass()";
   let dependentDialects = [
     "fir::FIROpsDialect", "mlir::func::FuncDialect"
   ];
@@ -324,7 +318,6 @@ def LoopVersioning : Pass<"loop-versioning", "mlir::func::FuncOp"> {
     an array has element sized stride. The element sizes stride allows some
     loops to be vectorized as well as other loop optimizations.
   }];
-  let constructor = "::fir::createLoopVersioningPass()";
   let dependentDialects = [ "fir::FIROpsDialect" ];
 }
 
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index d7c23755c57b..4641f9d20d5b 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -455,7 +455,8 @@ struct SpecificationPart {
 struct InternalSubprogram {
   UNION_CLASS_BOILERPLATE(InternalSubprogram);
   std::variant<common::Indirection<FunctionSubprogram>,
-      common::Indirection<SubroutineSubprogram>>
+      common::Indirection<SubroutineSubprogram>,
+      common::Indirection<CompilerDirective>>
       u;
 };
 
diff --git a/flang/include/flang/Parser/preprocessor.h b/flang/include/flang/Parser/preprocessor.h
index 630d5273d427..c3076435be5f 100644
--- a/flang/include/flang/Parser/preprocessor.h
+++ b/flang/include/flang/Parser/preprocessor.h
@@ -81,6 +81,7 @@ public:
   void Undefine(std::string macro);
   bool IsNameDefined(const CharBlock &);
   bool IsFunctionLikeDefinition(const CharBlock &);
+  bool AnyDefinitions() const { return !definitions_.empty(); }
 
   // When called with partialFunctionLikeMacro not null, MacroReplacement()
   // and ReplaceMacros() handle an unclosed function-like macro reference
diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h
index 96d56d9b43a6..1b0b7e23ce6c 100644
--- a/flang/include/flang/Runtime/descriptor.h
+++ b/flang/include/flang/Runtime/descriptor.h
@@ -456,6 +456,7 @@ public:
     assert(descriptor().rank() <= maxRank);
     assert(descriptor().SizeInBytes() <= byteSize);
     if (DescriptorAddendum * addendum{descriptor().Addendum()}) {
+      (void)addendum;
       assert(hasAddendum);
       assert(addendum->LenParameters() <= maxLengthTypeParameters);
     } else {
diff --git a/flang/include/flang/Runtime/numeric.h b/flang/include/flang/Runtime/numeric.h
index 3d9cb8b5b0ac..7d3f91360c8c 100644
--- a/flang/include/flang/Runtime/numeric.h
+++ b/flang/include/flang/Runtime/numeric.h
@@ -356,10 +356,18 @@ CppTypeFor<TypeCategory::Real, 16> RTDECL(Scale16)(
     CppTypeFor<TypeCategory::Real, 16>, std::int64_t);
 #endif
 
+// SELECTED_CHAR_KIND
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(SelectedCharKind)(
+    const char *, int, const char *, std::size_t);
+
 // SELECTED_INT_KIND
 CppTypeFor<TypeCategory::Integer, 4> RTDECL(SelectedIntKind)(
     const char *, int, void *, int);
 
+// SELECTED_LOGICAL_KIND
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(SelectedLogicalKind)(
+    const char *, int, void *, int);
+
 // SELECTED_REAL_KIND
 CppTypeFor<TypeCategory::Integer, 4> RTDECL(SelectedRealKind)(
     const char *, int, void *, int, void *, int, void *, int);
diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h
index c8ee71945d8b..e6ba71d53e92 100644
--- a/flang/include/flang/Semantics/semantics.h
+++ b/flang/include/flang/Semantics/semantics.h
@@ -312,7 +312,7 @@ public:
     return context_.FindScope(where);
   }
   bool AnyFatalError() const { return context_.AnyFatalError(); }
-  void EmitMessages(llvm::raw_ostream &) const;
+  void EmitMessages(llvm::raw_ostream &);
   void DumpSymbols(llvm::raw_ostream &);
   void DumpSymbolsSources(llvm::raw_ostream &) const;
 
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index da10969ebc70..efb5c9ba1077 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -634,7 +634,7 @@ public:
   void Post(const parser::ErrLabel &errLabel);
   void Post(const parser::EndLabel &endLabel);
   void Post(const parser::EorLabel &eorLabel);
-  void checkLabelUse(const parser::Label &labelUsed);
+  void CheckLabelUse(const parser::Label &labelUsed);
 
 private:
   SemanticsContext &context_;
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index a9d8ebc84e2e..d85436489870 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -104,7 +104,7 @@ void addNestedPassToOps(mlir::PassManager &pm, PassConstructor ctor) {
 void addNestedPassToAllTopLevelOperations(
     mlir::PassManager &pm, PassConstructor ctor) {
   addNestedPassToOps<mlir::func::FuncOp, mlir::omp::DeclareReductionOp,
-      fir::GlobalOp>(pm, ctor);
+      mlir::omp::PrivateClauseOp, fir::GlobalOp>(pm, ctor);
 }
 
 void addNestedPassToAllTopLevelOperationsConditionally(mlir::PassManager &pm,
@@ -163,8 +163,8 @@ inline void addAVC(
 
 inline void addMemoryAllocationOpt(mlir::PassManager &pm) {
   addNestedPassConditionally<mlir::func::FuncOp>(pm, disableFirMao, [&]() {
-    return fir::createMemoryAllocationPass(
-        dynamicArrayStackToHeapAllocation, arrayStackAllocationThreshold);
+    return fir::createMemoryAllocationOpt(
+        {dynamicArrayStackToHeapAllocation, arrayStackAllocationThreshold});
   });
 }
 
@@ -243,22 +243,22 @@ inline void createDefaultFIROptimizerPassPipeline(
   config.enableRegionSimplification = false;
   pm.addPass(mlir::createCSEPass());
   fir::addAVC(pm, pc.OptLevel);
-  pm.addNestedPass<mlir::func::FuncOp>(fir::createCharacterConversionPass());
+  addNestedPassToAllTopLevelOperations(pm, fir::createCharacterConversion);
   pm.addPass(mlir::createCanonicalizerPass(config));
-  pm.addPass(fir::createSimplifyRegionLitePass());
+  pm.addPass(fir::createSimplifyRegionLite());
   if (pc.OptLevel.isOptimizingForSpeed()) {
     // These passes may increase code size.
-    pm.addPass(fir::createSimplifyIntrinsicsPass());
+    pm.addPass(fir::createSimplifyIntrinsics());
     pm.addPass(fir::createAlgebraicSimplificationPass(config));
   }
 
   if (pc.LoopVersioning)
-    pm.addPass(fir::createLoopVersioningPass());
+    pm.addPass(fir::createLoopVersioning());
 
   pm.addPass(mlir::createCSEPass());
 
   if (pc.StackArrays)
-    pm.addPass(fir::createStackArraysPass());
+    pm.addPass(fir::createStackArrays());
   else
     fir::addMemoryAllocationOpt(pm);
 
@@ -267,11 +267,11 @@ inline void createDefaultFIROptimizerPassPipeline(
   llvm::StringMap<mlir::OpPassManager> pipelines;
   pm.addPass(mlir::createInlinerPass(
       pipelines, addCanonicalizerPassWithoutRegionSimplification));
-  pm.addPass(fir::createSimplifyRegionLitePass());
+  pm.addPass(fir::createSimplifyRegionLite());
   pm.addPass(mlir::createCSEPass());
 
   // Polymorphic types
-  pm.addPass(fir::createPolymorphicOpConversionPass());
+  pm.addPass(fir::createPolymorphicOpConversion());
 
   if (pc.AliasAnalysis && !disableFirAliasTags && !useOldAliasTags)
     pm.addPass(fir::createAliasTagsPass());
@@ -281,7 +281,7 @@ inline void createDefaultFIROptimizerPassPipeline(
   pm.addPass(mlir::createConvertSCFToCFPass());
 
   pm.addPass(mlir::createCanonicalizerPass(config));
-  pm.addPass(fir::createSimplifyRegionLitePass());
+  pm.addPass(fir::createSimplifyRegionLite());
   pm.addPass(mlir::createCSEPass());
 }
 
diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h
index b7d84af8c483..cebdd6d181c3 100644
--- a/flang/include/flang/Tools/CrossToolHelpers.h
+++ b/flang/include/flang/Tools/CrossToolHelpers.h
@@ -109,7 +109,7 @@ struct OffloadModuleOpts {
 
 //  Shares assinging of the OpenMP OffloadModuleInterface and its assorted
 //  attributes accross Flang tools (bbc/flang)
-void setOffloadModuleInterfaceAttributes(
+[[maybe_unused]] static void setOffloadModuleInterfaceAttributes(
     mlir::ModuleOp &module, OffloadModuleOpts Opts) {
   // Should be registered by the OpenMPDialect
   if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
@@ -127,10 +127,18 @@ void setOffloadModuleInterfaceAttributes(
   }
 }
 
-void setOpenMPVersionAttribute(mlir::ModuleOp &module, int64_t version) {
+[[maybe_unused]] static void setOpenMPVersionAttribute(
+    mlir::ModuleOp &module, int64_t version) {
   module.getOperation()->setAttr(
       mlir::StringAttr::get(module.getContext(), llvm::Twine{"omp.version"}),
       mlir::omp::VersionAttr::get(module.getContext(), version));
 }
 
+[[maybe_unused]] static int64_t getOpenMPVersionAttribute(
+    mlir::ModuleOp module, int64_t fallback = -1) {
+  if (mlir::Attribute verAttr = module->getAttr("omp.version"))
+    return llvm::cast<mlir::omp::VersionAttr>(verAttr).getVersion();
+  return fallback;
+}
+
 #endif // FORTRAN_TOOLS_CROSS_TOOL_HELPERS_H
diff --git a/flang/include/flang/Tools/PointerModels.h b/flang/include/flang/Tools/PointerModels.h
index 7acaf2f9fda5..c3c0977d6e54 100644
--- a/flang/include/flang/Tools/PointerModels.h
+++ b/flang/include/flang/Tools/PointerModels.h
@@ -20,7 +20,7 @@ struct OpenMPPointerLikeModel
     : public mlir::omp::PointerLikeType::ExternalModel<
           OpenMPPointerLikeModel<T>, T> {
   mlir::Type getElementType(mlir::Type pointer) const {
-    return pointer.cast<T>().getElementType();
+    return mlir::cast<T>(pointer).getElementType();
   }
 };
 
@@ -29,7 +29,7 @@ struct OpenACCPointerLikeModel
     : public mlir::acc::PointerLikeType::ExternalModel<
           OpenACCPointerLikeModel<T>, T> {
   mlir::Type getElementType(mlir::Type pointer) const {
-    return pointer.cast<T>().getElementType();
+    return mlir::cast<T>(pointer).getElementType();
   }
 };
 
diff --git a/flang/lib/Common/Fortran.cpp b/flang/lib/Common/Fortran.cpp
index 27ff31ef78da..170ce8c22509 100644
--- a/flang/lib/Common/Fortran.cpp
+++ b/flang/lib/Common/Fortran.cpp
@@ -97,12 +97,19 @@ std::string AsFortran(IgnoreTKRSet tkr) {
   return result;
 }
 
+/// Check compatibilty of CUDA attribute.
+/// When `allowUnifiedMatchingRule` is enabled, argument `x` represents the
+/// dummy argument attribute while `y` represents the actual argument attribute.
 bool AreCompatibleCUDADataAttrs(std::optional<CUDADataAttr> x,
-    std::optional<CUDADataAttr> y, IgnoreTKRSet ignoreTKR) {
+    std::optional<CUDADataAttr> y, IgnoreTKRSet ignoreTKR,
+    bool allowUnifiedMatchingRule) {
   if (!x && !y) {
     return true;
   } else if (x && y && *x == *y) {
     return true;
+  } else if ((!x && y && *y == CUDADataAttr::Pinned) ||
+      (x && *x == CUDADataAttr::Pinned && !y)) {
+    return true;
   } else if (ignoreTKR.test(IgnoreTKR::Device) &&
       x.value_or(CUDADataAttr::Device) == CUDADataAttr::Device &&
       y.value_or(CUDADataAttr::Device) == CUDADataAttr::Device) {
@@ -111,6 +118,24 @@ bool AreCompatibleCUDADataAttrs(std::optional<CUDADataAttr> x,
       x.value_or(CUDADataAttr::Managed) == CUDADataAttr::Managed &&
       y.value_or(CUDADataAttr::Managed) == CUDADataAttr::Managed) {
     return true;
+  } else if (allowUnifiedMatchingRule) {
+    if (!x) { // Dummy argument has no attribute -> host
+      if (y && (*y == CUDADataAttr::Managed || *y == CUDADataAttr::Unified)) {
+        return true;
+      }
+    } else {
+      if (*x == CUDADataAttr::Device && y &&
+          (*y == CUDADataAttr::Managed || *y == CUDADataAttr::Unified)) {
+        return true;
+      } else if (*x == CUDADataAttr::Managed && y &&
+          *y == CUDADataAttr::Unified) {
+        return true;
+      } else if (*x == CUDADataAttr::Unified && y &&
+          *y == CUDADataAttr::Managed) {
+        return true;
+      }
+    }
+    return false;
   } else {
     return false;
   }
diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp
index 20f7476425ac..ab03ca5ed2d5 100644
--- a/flang/lib/Evaluate/characteristics.cpp
+++ b/flang/lib/Evaluate/characteristics.cpp
@@ -362,8 +362,9 @@ bool DummyDataObject::IsCompatibleWith(const DummyDataObject &actual,
     }
   }
   if (!attrs.test(Attr::Value) &&
-      !common::AreCompatibleCUDADataAttrs(
-          cudaDataAttr, actual.cudaDataAttr, ignoreTKR)) {
+      !common::AreCompatibleCUDADataAttrs(cudaDataAttr, actual.cudaDataAttr,
+          ignoreTKR,
+          /*allowUnifiedMatchingRule=*/false)) {
     if (whyNot) {
       *whyNot = "incompatible CUDA data attributes";
     }
@@ -1754,8 +1755,9 @@ bool DistinguishUtils::Distinguishable(
   } else if (y.attrs.test(Attr::Allocatable) && x.attrs.test(Attr::Pointer) &&
       x.intent != common::Intent::In) {
     return true;
-  } else if (!common::AreCompatibleCUDADataAttrs(
-                 x.cudaDataAttr, y.cudaDataAttr, x.ignoreTKR | y.ignoreTKR)) {
+  } else if (!common::AreCompatibleCUDADataAttrs(x.cudaDataAttr, y.cudaDataAttr,
+                 x.ignoreTKR | y.ignoreTKR,
+                 /*allowUnifiedMatchingRule=*/false)) {
     return true;
   } else if (features_.IsEnabled(
                  common::LanguageFeature::DistinguishableSpecifics) &&
diff --git a/flang/lib/Evaluate/common.cpp b/flang/lib/Evaluate/common.cpp
index c659a5002ba0..c633bff57b1e 100644
--- a/flang/lib/Evaluate/common.cpp
+++ b/flang/lib/Evaluate/common.cpp
@@ -15,21 +15,24 @@ namespace Fortran::evaluate {
 
 void RealFlagWarnings(
     FoldingContext &context, const RealFlags &flags, const char *operation) {
-  if (flags.test(RealFlag::Overflow)) {
-    context.messages().Say("overflow on %s"_warn_en_US, operation);
-  }
-  if (flags.test(RealFlag::DivideByZero)) {
-    if (std::strcmp(operation, "division") == 0) {
-      context.messages().Say("division by zero"_warn_en_US);
-    } else {
-      context.messages().Say("division by zero on %s"_warn_en_US, operation);
+  if (context.languageFeatures().ShouldWarn(
+          common::UsageWarning::FoldingException)) {
+    if (flags.test(RealFlag::Overflow)) {
+      context.messages().Say("overflow on %s"_warn_en_US, operation);
+    }
+    if (flags.test(RealFlag::DivideByZero)) {
+      if (std::strcmp(operation, "division") == 0) {
+        context.messages().Say("division by zero"_warn_en_US);
+      } else {
+        context.messages().Say("division by zero on %s"_warn_en_US, operation);
+      }
+    }
+    if (flags.test(RealFlag::InvalidArgument)) {
+      context.messages().Say("invalid argument on %s"_warn_en_US, operation);
+    }
+    if (flags.test(RealFlag::Underflow)) {
+      context.messages().Say("underflow on %s"_warn_en_US, operation);
     }
-  }
-  if (flags.test(RealFlag::InvalidArgument)) {
-    context.messages().Say("invalid argument on %s"_warn_en_US, operation);
-  }
-  if (flags.test(RealFlag::Underflow)) {
-    context.messages().Say("underflow on %s"_warn_en_US, operation);
   }
 }
 
diff --git a/flang/lib/Evaluate/fold-character.cpp b/flang/lib/Evaluate/fold-character.cpp
index 5d9cc11754a7..877bc2eac1fc 100644
--- a/flang/lib/Evaluate/fold-character.cpp
+++ b/flang/lib/Evaluate/fold-character.cpp
@@ -58,10 +58,13 @@ Expr<Type<TypeCategory::Character, KIND>> FoldIntrinsicFunction(
     return FoldElementalIntrinsic<T, IntT>(context, std::move(funcRef),
         ScalarFunc<T, IntT>([&](const Scalar<IntT> &i) {
           if (i.IsNegative() || i.BGE(Scalar<IntT>{0}.IBSET(8 * KIND))) {
-            context.messages().Say(
-                "%s(I=%jd) is out of range for CHARACTER(KIND=%d)"_warn_en_US,
-                parser::ToUpperCaseLetters(name),
-                static_cast<std::intmax_t>(i.ToInt64()), KIND);
+            if (context.languageFeatures().ShouldWarn(
+                    common::UsageWarning::FoldingValueChecks)) {
+              context.messages().Say(
+                  "%s(I=%jd) is out of range for CHARACTER(KIND=%d)"_warn_en_US,
+                  parser::ToUpperCaseLetters(name),
+                  static_cast<std::intmax_t>(i.ToInt64()), KIND);
+            }
           }
           return CharacterUtils<KIND>::CHAR(i.ToUInt64());
         }));
@@ -103,9 +106,12 @@ Expr<Type<TypeCategory::Character, KIND>> FoldIntrinsicFunction(
             static_cast<std::intmax_t>(n));
       } else if (static_cast<double>(n) * str.size() >
           (1 << 20)) { // sanity limit of 1MiB
-        context.messages().Say(
-            "Result of REPEAT() is too large to compute at compilation time (%g characters)"_port_en_US,
-            static_cast<double>(n) * str.size());
+        if (context.languageFeatures().ShouldWarn(
+                common::UsageWarning::FoldingLimit)) {
+          context.messages().Say(
+              "Result of REPEAT() is too large to compute at compilation time (%g characters)"_port_en_US,
+              static_cast<double>(n) * str.size());
+        }
       } else {
         return Expr<T>{Constant<T>{CharacterUtils<KIND>::REPEAT(str, n)}};
       }
diff --git a/flang/lib/Evaluate/fold-complex.cpp b/flang/lib/Evaluate/fold-complex.cpp
index 3260f82ffe8d..d44cc9c69dd6 100644
--- a/flang/lib/Evaluate/fold-complex.cpp
+++ b/flang/lib/Evaluate/fold-complex.cpp
@@ -29,7 +29,8 @@ Expr<Type<TypeCategory::Complex, KIND>> FoldIntrinsicFunction(
     if (auto callable{GetHostRuntimeWrapper<T, T>(name)}) {
       return FoldElementalIntrinsic<T, T>(
           context, std::move(funcRef), *callable);
-    } else {
+    } else if (context.languageFeatures().ShouldWarn(
+                   common::UsageWarning::FoldingFailure)) {
       context.messages().Say(
           "%s(complex(kind=%d)) cannot be folded on host"_warn_en_US, name,
           KIND);
diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h
index 093f26bea1a4..d5c393140c57 100644
--- a/flang/lib/Evaluate/fold-implementation.h
+++ b/flang/lib/Evaluate/fold-implementation.h
@@ -45,6 +45,12 @@
 
 namespace Fortran::evaluate {
 
+// Don't use Kahan extended precision summation any more when folding
+// transformational intrinsic functions other than SUM, since it is
+// not used in the runtime implementations of those functions and we
+// want results to match.
+static constexpr bool useKahanSummation{false};
+
 // Utilities
 template <typename T> class Folder {
 public:
@@ -1692,7 +1698,9 @@ Expr<TO> FoldOperation(
           if constexpr (TO::category == TypeCategory::Integer) {
             if constexpr (FromCat == TypeCategory::Integer) {
               auto converted{Scalar<TO>::ConvertSigned(*value)};
-              if (converted.overflow) {
+              if (converted.overflow &&
+                  msvcWorkaround.context.languageFeatures().ShouldWarn(
+                      common::UsageWarning::FoldingException)) {
                 ctx.messages().Say(
                     "INTEGER(%d) to INTEGER(%d) conversion overflowed"_warn_en_US,
                     Operand::kind, TO::kind);
@@ -1700,14 +1708,17 @@ Expr<TO> FoldOperation(
               return ScalarConstantToExpr(std::move(converted.value));
             } else if constexpr (FromCat == TypeCategory::Real) {
               auto converted{value->template ToInteger<Scalar<TO>>()};
-              if (converted.flags.test(RealFlag::InvalidArgument)) {
-                ctx.messages().Say(
-                    "REAL(%d) to INTEGER(%d) conversion: invalid argument"_warn_en_US,
-                    Operand::kind, TO::kind);
-              } else if (converted.flags.test(RealFlag::Overflow)) {
-                ctx.messages().Say(
-                    "REAL(%d) to INTEGER(%d) conversion overflowed"_warn_en_US,
-                    Operand::kind, TO::kind);
+              if (msvcWorkaround.context.languageFeatures().ShouldWarn(
+                      common::UsageWarning::FoldingException)) {
+                if (converted.flags.test(RealFlag::InvalidArgument)) {
+                  ctx.messages().Say(
+                      "REAL(%d) to INTEGER(%d) conversion: invalid argument"_warn_en_US,
+                      Operand::kind, TO::kind);
+                } else if (converted.flags.test(RealFlag::Overflow)) {
+                  ctx.messages().Say(
+                      "REAL(%d) to INTEGER(%d) conversion overflowed"_warn_en_US,
+                      Operand::kind, TO::kind);
+                }
               }
               return ScalarConstantToExpr(std::move(converted.value));
             }
@@ -1816,7 +1827,9 @@ Expr<T> FoldOperation(FoldingContext &context, Negate<T> &&x) {
   } else if (auto value{GetScalarConstantValue<T>(operand)}) {
     if constexpr (T::category == TypeCategory::Integer) {
       auto negated{value->Negate()};
-      if (negated.overflow) {
+      if (negated.overflow &&
+          context.languageFeatures().ShouldWarn(
+              common::UsageWarning::FoldingException)) {
         context.messages().Say(
             "INTEGER(%d) negation overflowed"_warn_en_US, T::kind);
       }
@@ -1856,7 +1869,9 @@ Expr<T> FoldOperation(FoldingContext &context, Add<T> &&x) {
   if (auto folded{OperandsAreConstants(x)}) {
     if constexpr (T::category == TypeCategory::Integer) {
       auto sum{folded->first.AddSigned(folded->second)};
-      if (sum.overflow) {
+      if (sum.overflow &&
+          context.languageFeatures().ShouldWarn(
+              common::UsageWarning::FoldingException)) {
         context.messages().Say(
             "INTEGER(%d) addition overflowed"_warn_en_US, T::kind);
       }
@@ -1882,7 +1897,9 @@ Expr<T> FoldOperation(FoldingContext &context, Subtract<T> &&x) {
   if (auto folded{OperandsAreConstants(x)}) {
     if constexpr (T::category == TypeCategory::Integer) {
       auto difference{folded->first.SubtractSigned(folded->second)};
-      if (difference.overflow) {
+      if (difference.overflow &&
+          context.languageFeatures().ShouldWarn(
+              common::UsageWarning::FoldingException)) {
         context.messages().Say(
             "INTEGER(%d) subtraction overflowed"_warn_en_US, T::kind);
       }
@@ -1908,7 +1925,9 @@ Expr<T> FoldOperation(FoldingContext &context, Multiply<T> &&x) {
   if (auto folded{OperandsAreConstants(x)}) {
     if constexpr (T::category == TypeCategory::Integer) {
       auto product{folded->first.MultiplySigned(folded->second)};
-      if (product.SignedMultiplicationOverflowed()) {
+      if (product.SignedMultiplicationOverflowed() &&
+          context.languageFeatures().ShouldWarn(
+              common::UsageWarning::FoldingException)) {
         context.messages().Say(
             "INTEGER(%d) multiplication overflowed"_warn_en_US, T::kind);
       }
@@ -1953,11 +1972,16 @@ Expr<T> FoldOperation(FoldingContext &context, Divide<T> &&x) {
     if constexpr (T::category == TypeCategory::Integer) {
       auto quotAndRem{folded->first.DivideSigned(folded->second)};
       if (quotAndRem.divisionByZero) {
-        context.messages().Say(
-            "INTEGER(%d) division by zero"_warn_en_US, T::kind);
+        if (context.languageFeatures().ShouldWarn(
+                common::UsageWarning::FoldingException)) {
+          context.messages().Say(
+              "INTEGER(%d) division by zero"_warn_en_US, T::kind);
+        }
         return Expr<T>{std::move(x)};
       }
-      if (quotAndRem.overflow) {
+      if (quotAndRem.overflow &&
+          context.languageFeatures().ShouldWarn(
+              common::UsageWarning::FoldingException)) {
         context.messages().Say(
             "INTEGER(%d) division overflowed"_warn_en_US, T::kind);
       }
@@ -1998,22 +2022,26 @@ Expr<T> FoldOperation(FoldingContext &context, Power<T> &&x) {
   if (auto folded{OperandsAreConstants(x)}) {
     if constexpr (T::category == TypeCategory::Integer) {
       auto power{folded->first.Power(folded->second)};
-      if (power.divisionByZero) {
-        context.messages().Say(
-            "INTEGER(%d) zero to negative power"_warn_en_US, T::kind);
-      } else if (power.overflow) {
-        context.messages().Say(
-            "INTEGER(%d) power overflowed"_warn_en_US, T::kind);
-      } else if (power.zeroToZero) {
-        context.messages().Say(
-            "INTEGER(%d) 0**0 is not defined"_warn_en_US, T::kind);
+      if (context.languageFeatures().ShouldWarn(
+              common::UsageWarning::FoldingException)) {
+        if (power.divisionByZero) {
+          context.messages().Say(
+              "INTEGER(%d) zero to negative power"_warn_en_US, T::kind);
+        } else if (power.overflow) {
+          context.messages().Say(
+              "INTEGER(%d) power overflowed"_warn_en_US, T::kind);
+        } else if (power.zeroToZero) {
+          context.messages().Say(
+              "INTEGER(%d) 0**0 is not defined"_warn_en_US, T::kind);
+        }
       }
       return Expr<T>{Constant<T>{power.power}};
     } else {
       if (auto callable{GetHostRuntimeWrapper<T, T, T>("pow")}) {
         return Expr<T>{
             Constant<T>{(*callable)(context, folded->first, folded->second)}};
-      } else {
+      } else if (context.languageFeatures().ShouldWarn(
+                     common::UsageWarning::FoldingFailure)) {
         context.messages().Say(
             "Power for %s cannot be folded on host"_warn_en_US,
             T{}.AsFortran());
@@ -2097,7 +2125,9 @@ Expr<Type<TypeCategory::Real, KIND>> ToReal(
           CHECK(constant);
           Scalar<Result> real{constant->GetScalarValue().value()};
           From converted{From::ConvertUnsigned(real.RawBits()).value};
-          if (original != converted) { // C1601
+          if (original != converted &&
+              context.languageFeatures().ShouldWarn(
+                  common::UsageWarning::FoldingValueChecks)) { // C1601
             context.messages().Say(
                 "Nonzero bits truncated from BOZ literal constant in REAL intrinsic"_warn_en_US);
           }
diff --git a/flang/lib/Evaluate/fold-integer.cpp b/flang/lib/Evaluate/fold-integer.cpp
index 0a6ff12049f3..b76b9d49b582 100644
--- a/flang/lib/Evaluate/fold-integer.cpp
+++ b/flang/lib/Evaluate/fold-integer.cpp
@@ -297,7 +297,9 @@ static Expr<T> FoldCount(FoldingContext &context, FunctionRef<T> &&ref) {
     CountAccumulator<T, maskKind> accumulator{arrayAndMask->array};
     Constant<T> result{DoReduction<T>(arrayAndMask->array, arrayAndMask->mask,
         dim, Scalar<T>{}, accumulator)};
-    if (accumulator.overflow()) {
+    if (accumulator.overflow() &&
+        context.languageFeatures().ShouldWarn(
+            common::UsageWarning::FoldingException)) {
       context.messages().Say(
           "Result of intrinsic function COUNT overflows its result type"_warn_en_US);
     }
@@ -556,7 +558,9 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
   std::string name{intrinsic->name};
   auto FromInt64{[&name, &context](std::int64_t n) {
     Scalar<T> result{n};
-    if (result.ToInt64() != n) {
+    if (result.ToInt64() != n &&
+        context.languageFeatures().ShouldWarn(
+            common::UsageWarning::FoldingException)) {
       context.messages().Say(
           "Result of intrinsic function '%s' (%jd) overflows its result type"_warn_en_US,
           name, std::intmax_t{n});
@@ -567,7 +571,9 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
     return FoldElementalIntrinsic<T, T>(context, std::move(funcRef),
         ScalarFunc<T, T>([&context](const Scalar<T> &i) -> Scalar<T> {
           typename Scalar<T>::ValueWithOverflow j{i.ABS()};
-          if (j.overflow) {
+          if (j.overflow &&
+              context.languageFeatures().ShouldWarn(
+                  common::UsageWarning::FoldingException)) {
             context.messages().Say(
                 "abs(integer(kind=%d)) folding overflowed"_warn_en_US, KIND);
           }
@@ -587,7 +593,9 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
             return FoldElementalIntrinsic<T, TR>(context, std::move(funcRef),
                 ScalarFunc<T, TR>([&](const Scalar<TR> &x) {
                   auto y{x.template ToInteger<Scalar<T>>(mode)};
-                  if (y.flags.test(RealFlag::Overflow)) {
+                  if (y.flags.test(RealFlag::Overflow) &&
+                      context.languageFeatures().ShouldWarn(
+                          common::UsageWarning::FoldingException)) {
                     context.messages().Say(
                         "%s intrinsic folding overflow"_warn_en_US, name);
                   }
@@ -634,7 +642,9 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
         ScalarFunc<T, T, T>([&context](const Scalar<T> &x,
                                 const Scalar<T> &y) -> Scalar<T> {
           auto result{x.DIM(y)};
-          if (result.overflow) {
+          if (result.overflow &&
+              context.languageFeatures().ShouldWarn(
+                  common::UsageWarning::FoldingException)) {
             context.messages().Say("DIM intrinsic folding overflow"_warn_en_US);
           }
           return result.value;
@@ -1111,10 +1121,13 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
             [](FoldingContext &context, const Scalar<T> &x,
                 const Scalar<T> &y) -> Scalar<T> {
               auto quotRem{x.DivideSigned(y)};
-              if (quotRem.divisionByZero) {
-                context.messages().Say("mod() by zero"_warn_en_US);
-              } else if (quotRem.overflow) {
-                context.messages().Say("mod() folding overflowed"_warn_en_US);
+              if (context.languageFeatures().ShouldWarn(
+                      common::UsageWarning::FoldingAvoidsRuntimeCrash)) {
+                if (quotRem.divisionByZero) {
+                  context.messages().Say("mod() by zero"_warn_en_US);
+                } else if (quotRem.overflow) {
+                  context.messages().Say("mod() folding overflowed"_warn_en_US);
+                }
               }
               return quotRem.remainder;
             }));
@@ -1124,7 +1137,9 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
                                            const Scalar<T> &x,
                                            const Scalar<T> &y) -> Scalar<T> {
           auto result{x.MODULO(y)};
-          if (result.overflow) {
+          if (result.overflow &&
+              context.languageFeatures().ShouldWarn(
+                  common::UsageWarning::FoldingException)) {
             context.messages().Say("modulo() folding overflowed"_warn_en_US);
           }
           return result.value;
@@ -1256,7 +1271,9 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
         ScalarFunc<T, T, T>([&context](const Scalar<T> &j,
                                 const Scalar<T> &k) -> Scalar<T> {
           typename Scalar<T>::ValueWithOverflow result{j.SIGN(k)};
-          if (result.overflow) {
+          if (result.overflow &&
+              context.languageFeatures().ShouldWarn(
+                  common::UsageWarning::FoldingException)) {
             context.messages().Say(
                 "sign(integer(kind=%d)) folding overflowed"_warn_en_US, KIND);
           }
@@ -1314,7 +1331,9 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
       auto realBytes{
           context.targetCharacteristics().GetByteSize(TypeCategory::Real,
               context.defaults().GetDefaultKind(TypeCategory::Real))};
-      if (intBytes != realBytes) {
+      if (intBytes != realBytes &&
+          context.languageFeatures().ShouldWarn(
+              common::UsageWarning::FoldingValueChecks)) {
         context.messages().Say(*context.moduleFileName(),
             "NUMERIC_STORAGE_SIZE from ISO_FORTRAN_ENV is not well-defined when default INTEGER and REAL are not consistent due to compiler options"_warn_en_US);
       }
diff --git a/flang/lib/Evaluate/fold-logical.cpp b/flang/lib/Evaluate/fold-logical.cpp
index b7d641711c36..a7c655b72f56 100644
--- a/flang/lib/Evaluate/fold-logical.cpp
+++ b/flang/lib/Evaluate/fold-logical.cpp
@@ -530,7 +530,9 @@ static Expr<Type<TypeCategory::Logical, KIND>> RewriteOutOfRange(
             // Bounds depend on round= value
             if (auto *round{UnwrapExpr<Expr<SomeType>>(args[2])}) {
               if (const Symbol * whole{UnwrapWholeSymbolDataRef(*round)};
-                  whole && semantics::IsOptional(whole->GetUltimate())) {
+                  whole && semantics::IsOptional(whole->GetUltimate()) &&
+                  context.languageFeatures().ShouldWarn(
+                      common::UsageWarning::OptionalMustBePresent)) {
                 if (auto source{args[2]->sourceLocation()}) {
                   context.messages().Say(*source,
                       "ROUND= argument to OUT_OF_RANGE() is an optional dummy argument that must be present at execution"_warn_en_US);
diff --git a/flang/lib/Evaluate/fold-matmul.h b/flang/lib/Evaluate/fold-matmul.h
index 27b6db1fd8bf..a799cfb80a59 100644
--- a/flang/lib/Evaluate/fold-matmul.h
+++ b/flang/lib/Evaluate/fold-matmul.h
@@ -58,18 +58,25 @@ static Expr<T> FoldMatmul(FoldingContext &context, FunctionRef<T> &&funcRef) {
         Element bElt{mb->At(bAt)};
         if constexpr (T::category == TypeCategory::Real ||
             T::category == TypeCategory::Complex) {
-          // Kahan summation
-          auto product{aElt.Multiply(bElt, rounding)};
+          auto product{aElt.Multiply(bElt)};
           overflow |= product.flags.test(RealFlag::Overflow);
-          auto next{correction.Add(product.value, rounding)};
-          overflow |= next.flags.test(RealFlag::Overflow);
-          auto added{sum.Add(next.value, rounding)};
-          overflow |= added.flags.test(RealFlag::Overflow);
-          correction = added.value.Subtract(sum, rounding)
-                           .value.Subtract(next.value, rounding)
-                           .value;
-          sum = std::move(added.value);
+          if constexpr (useKahanSummation) {
+            auto next{correction.Add(product.value, rounding)};
+            overflow |= next.flags.test(RealFlag::Overflow);
+            auto added{sum.Add(next.value, rounding)};
+            overflow |= added.flags.test(RealFlag::Overflow);
+            correction = added.value.Subtract(sum, rounding)
+                             .value.Subtract(next.value, rounding)
+                             .value;
+            sum = std::move(added.value);
+          } else {
+            auto added{sum.Add(product.value)};
+            overflow |= added.flags.test(RealFlag::Overflow);
+            sum = std::move(added.value);
+          }
         } else if constexpr (T::category == TypeCategory::Integer) {
+          // Don't use Kahan summation in numeric MATMUL folding;
+          // the runtime doesn't use it, and results should match.
           auto product{aElt.MultiplySigned(bElt)};
           overflow |= product.SignedMultiplicationOverflowed();
           auto added{sum.AddSigned(product.lower)};
@@ -85,7 +92,9 @@ static Expr<T> FoldMatmul(FoldingContext &context, FunctionRef<T> &&funcRef) {
       elements.push_back(sum);
     }
   }
-  if (overflow) {
+  if (overflow &&
+      context.languageFeatures().ShouldWarn(
+          common::UsageWarning::FoldingException)) {
     context.messages().Say(
         "MATMUL of %s data overflowed during computation"_warn_en_US,
         T::AsFortran());
diff --git a/flang/lib/Evaluate/fold-real.cpp b/flang/lib/Evaluate/fold-real.cpp
index fd37437c643a..1ccf3f979ece 100644
--- a/flang/lib/Evaluate/fold-real.cpp
+++ b/flang/lib/Evaluate/fold-real.cpp
@@ -35,7 +35,8 @@ static Expr<T> FoldTransformationalBessel(
       }
       return Expr<T>{Constant<T>{
           std::move(results), ConstantSubscripts{std::max(n2 - n1 + 1, 0)}}};
-    } else {
+    } else if (context.languageFeatures().ShouldWarn(
+                   common::UsageWarning::FoldingFailure)) {
       context.messages().Say(
           "%s(integer(kind=4), real(kind=%d)) cannot be folded on host"_warn_en_US,
           name, T::kind);
@@ -54,7 +55,7 @@ public:
       : array_{array}, maxAbs_{maxAbs}, rounding_{rounding} {};
   void operator()(
       Scalar<T> &element, const ConstantSubscripts &at, bool /*first*/) {
-    // Kahan summation of scaled elements:
+    // Summation of scaled elements:
     // Naively,
     //   NORM2(A(:)) = SQRT(SUM(A(:)**2))
     // For any T > 0, we have mathematically
@@ -76,24 +77,27 @@ public:
       auto item{array_.At(at)};
       auto scaled{item.Divide(scale).value};
       auto square{scaled.Multiply(scaled).value};
-      auto next{square.Add(correction_, rounding_)};
-      overflow_ |= next.flags.test(RealFlag::Overflow);
-      auto sum{element.Add(next.value, rounding_)};
-      overflow_ |= sum.flags.test(RealFlag::Overflow);
-      correction_ = sum.value.Subtract(element, rounding_)
-                        .value.Subtract(next.value, rounding_)
-                        .value;
-      element = sum.value;
+      if constexpr (useKahanSummation) {
+        auto next{square.Add(correction_, rounding_)};
+        overflow_ |= next.flags.test(RealFlag::Overflow);
+        auto sum{element.Add(next.value, rounding_)};
+        overflow_ |= sum.flags.test(RealFlag::Overflow);
+        correction_ = sum.value.Subtract(element, rounding_)
+                          .value.Subtract(next.value, rounding_)
+                          .value;
+        element = sum.value;
+      } else {
+        auto sum{element.Add(square, rounding_)};
+        overflow_ |= sum.flags.test(RealFlag::Overflow);
+        element = sum.value;
+      }
     }
   }
   bool overflow() const { return overflow_; }
   void Done(Scalar<T> &result) {
-    // result+correction == SUM((data(:)/maxAbs)**2)
-    // result = maxAbs * SQRT(result+correction)
-    auto corrected{result.Add(correction_, rounding_)};
-    overflow_ |= corrected.flags.test(RealFlag::Overflow);
-    correction_ = Scalar<T>{};
-    auto root{corrected.value.SQRT().value};
+    // incoming result = SUM((data(:)/maxAbs)**2)
+    // outgoing result = maxAbs * SQRT(result)
+    auto root{result.SQRT().value};
     auto product{root.Multiply(maxAbs_.At(maxAbsAt_))};
     maxAbs_.IncrementSubscripts(maxAbsAt_);
     overflow_ |= product.flags.test(RealFlag::Overflow);
@@ -127,7 +131,9 @@ static Expr<Type<TypeCategory::Real, KIND>> FoldNorm2(FoldingContext &context,
         context.targetCharacteristics().roundingMode()};
     Constant<T> result{DoReduction<T>(arrayAndMask->array, arrayAndMask->mask,
         dim, identity, norm2Accumulator)};
-    if (norm2Accumulator.overflow()) {
+    if (norm2Accumulator.overflow() &&
+        context.languageFeatures().ShouldWarn(
+            common::UsageWarning::FoldingException)) {
       context.messages().Say(
           "NORM2() of REAL(%d) data overflowed"_warn_en_US, KIND);
     }
@@ -159,7 +165,8 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
     if (auto callable{GetHostRuntimeWrapper<T, T>(name)}) {
       return FoldElementalIntrinsic<T, T>(
           context, std::move(funcRef), *callable);
-    } else {
+    } else if (context.languageFeatures().ShouldWarn(
+                   common::UsageWarning::FoldingFailure)) {
       context.messages().Say(
           "%s(real(kind=%d)) cannot be folded on host"_warn_en_US, name, KIND);
     }
@@ -172,7 +179,8 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
     if (auto callable{GetHostRuntimeWrapper<T, T, T>(localName)}) {
       return FoldElementalIntrinsic<T, T, T>(
           context, std::move(funcRef), *callable);
-    } else {
+    } else if (context.languageFeatures().ShouldWarn(
+                   common::UsageWarning::FoldingFailure)) {
       context.messages().Say(
           "%s(real(kind=%d), real(kind%d)) cannot be folded on host"_warn_en_US,
           name, KIND, KIND);
@@ -183,7 +191,8 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
       if (auto callable{GetHostRuntimeWrapper<T, Int4, T>(name)}) {
         return FoldElementalIntrinsic<T, Int4, T>(
             context, std::move(funcRef), *callable);
-      } else {
+      } else if (context.languageFeatures().ShouldWarn(
+                     common::UsageWarning::FoldingFailure)) {
         context.messages().Say(
             "%s(integer(kind=4), real(kind=%d)) cannot be folded on host"_warn_en_US,
             name, KIND);
@@ -201,7 +210,9 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
           ScalarFunc<T, ComplexT>([&name, &context](
                                       const Scalar<ComplexT> &z) -> Scalar<T> {
             ValueWithRealFlags<Scalar<T>> y{z.ABS()};
-            if (y.flags.test(RealFlag::Overflow)) {
+            if (y.flags.test(RealFlag::Overflow) &&
+                context.languageFeatures().ShouldWarn(
+                    common::UsageWarning::FoldingException)) {
               context.messages().Say(
                   "complex ABS intrinsic folding overflow"_warn_en_US, name);
             }
@@ -223,7 +234,9 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
         ScalarFunc<T, T>(
             [&name, &context, mode](const Scalar<T> &x) -> Scalar<T> {
               ValueWithRealFlags<Scalar<T>> y{x.ToWholeNumber(mode)};
-              if (y.flags.test(RealFlag::Overflow)) {
+              if (y.flags.test(RealFlag::Overflow) &&
+                  context.languageFeatures().ShouldWarn(
+                      common::UsageWarning::FoldingException)) {
                 context.messages().Say(
                     "%s intrinsic folding overflow"_warn_en_US, name);
               }
@@ -234,7 +247,9 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
         ScalarFunc<T, T, T>([&context](const Scalar<T> &x,
                                 const Scalar<T> &y) -> Scalar<T> {
           ValueWithRealFlags<Scalar<T>> result{x.DIM(y)};
-          if (result.flags.test(RealFlag::Overflow)) {
+          if (result.flags.test(RealFlag::Overflow) &&
+              context.languageFeatures().ShouldWarn(
+                  common::UsageWarning::FoldingException)) {
             context.messages().Say("DIM intrinsic folding overflow"_warn_en_US);
           }
           return result.value;
@@ -266,7 +281,9 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
         ScalarFunc<T, T, T>(
             [&](const Scalar<T> &x, const Scalar<T> &y) -> Scalar<T> {
               ValueWithRealFlags<Scalar<T>> result{x.HYPOT(y)};
-              if (result.flags.test(RealFlag::Overflow)) {
+              if (result.flags.test(RealFlag::Overflow) &&
+                  context.languageFeatures().ShouldWarn(
+                      common::UsageWarning::FoldingException)) {
                 context.messages().Say(
                     "HYPOT intrinsic folding overflow"_warn_en_US);
               }
@@ -290,7 +307,9 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
         ScalarFunc<T, T, T>(
             [&context](const Scalar<T> &x, const Scalar<T> &y) -> Scalar<T> {
               auto result{x.MOD(y)};
-              if (result.flags.test(RealFlag::DivideByZero)) {
+              if (result.flags.test(RealFlag::DivideByZero) &&
+                  context.languageFeatures().ShouldWarn(
+                      common::UsageWarning::FoldingAvoidsRuntimeCrash)) {
                 context.messages().Say(
                     "second argument to MOD must not be zero"_warn_en_US);
               }
@@ -302,7 +321,9 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
         ScalarFunc<T, T, T>(
             [&context](const Scalar<T> &x, const Scalar<T> &y) -> Scalar<T> {
               auto result{x.MODULO(y)};
-              if (result.flags.test(RealFlag::DivideByZero)) {
+              if (result.flags.test(RealFlag::DivideByZero) &&
+                  context.languageFeatures().ShouldWarn(
+                      common::UsageWarning::FoldingAvoidsRuntimeCrash)) {
                 context.messages().Say(
                     "second argument to MODULO must not be zero"_warn_en_US);
               }
@@ -316,17 +337,22 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
             return FoldElementalIntrinsic<T, T, TS>(context, std::move(funcRef),
                 ScalarFunc<T, T, TS>([&](const Scalar<T> &x,
                                          const Scalar<TS> &s) -> Scalar<T> {
-                  if (s.IsZero()) {
+                  if (s.IsZero() &&
+                      context.languageFeatures().ShouldWarn(
+                          common::UsageWarning::FoldingValueChecks)) {
                     context.messages().Say(
                         "NEAREST: S argument is zero"_warn_en_US);
                   }
                   auto result{x.NEAREST(!s.IsNegative())};
-                  if (result.flags.test(RealFlag::Overflow)) {
-                    context.messages().Say(
-                        "NEAREST intrinsic folding overflow"_warn_en_US);
-                  } else if (result.flags.test(RealFlag::InvalidArgument)) {
-                    context.messages().Say(
-                        "NEAREST intrinsic folding: bad argument"_warn_en_US);
+                  if (context.languageFeatures().ShouldWarn(
+                          common::UsageWarning::FoldingException)) {
+                    if (result.flags.test(RealFlag::Overflow)) {
+                      context.messages().Say(
+                          "NEAREST intrinsic folding overflow"_warn_en_US);
+                    } else if (result.flags.test(RealFlag::InvalidArgument)) {
+                      context.messages().Say(
+                          "NEAREST intrinsic folding: bad argument"_warn_en_US);
+                    }
                   }
                   return result.value;
                 }));
@@ -362,7 +388,9 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
                                                            template
 #endif
                                                            SCALE(y)};
-                      if (result.flags.test(RealFlag::Overflow)) {
+                      if (result.flags.test(RealFlag::Overflow) &&
+                          context.languageFeatures().ShouldWarn(
+                              common::UsageWarning::FoldingException)) {
                         context.messages().Say(
                             "SCALE intrinsic folding overflow"_warn_en_US);
                       }
@@ -412,8 +440,11 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
                   bool upward{true};
                   switch (x.Compare(Scalar<T>::Convert(y).value)) {
                   case Relation::Unordered:
-                    context.messages().Say(
-                        "IEEE_NEXT_AFTER intrinsic folding: bad argument"_warn_en_US);
+                    if (context.languageFeatures().ShouldWarn(
+                            common::UsageWarning::FoldingValueChecks)) {
+                      context.messages().Say(
+                          "IEEE_NEXT_AFTER intrinsic folding: bad argument"_warn_en_US);
+                    }
                     return x;
                   case Relation::Equal:
                     return x;
@@ -425,7 +456,9 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
                     break;
                   }
                   auto result{x.NEAREST(upward)};
-                  if (result.flags.test(RealFlag::Overflow)) {
+                  if (result.flags.test(RealFlag::Overflow) &&
+                      context.languageFeatures().ShouldWarn(
+                          common::UsageWarning::FoldingException)) {
                     context.messages().Say(
                         "IEEE_NEXT_AFTER intrinsic folding overflow"_warn_en_US);
                   }
@@ -441,12 +474,15 @@ Expr<Type<TypeCategory::Real, KIND>> FoldIntrinsicFunction(
     return FoldElementalIntrinsic<T, T>(context, std::move(funcRef),
         ScalarFunc<T, T>([&](const Scalar<T> &x) -> Scalar<T> {
           auto result{x.NEAREST(upward)};
-          if (result.flags.test(RealFlag::Overflow)) {
-            context.messages().Say(
-                "%s intrinsic folding overflow"_warn_en_US, iName);
-          } else if (result.flags.test(RealFlag::InvalidArgument)) {
-            context.messages().Say(
-                "%s intrinsic folding: bad argument"_warn_en_US, iName);
+          if (context.languageFeatures().ShouldWarn(
+                  common::UsageWarning::FoldingException)) {
+            if (result.flags.test(RealFlag::Overflow)) {
+              context.messages().Say(
+                  "%s intrinsic folding overflow"_warn_en_US, iName);
+            } else if (result.flags.test(RealFlag::InvalidArgument)) {
+              context.messages().Say(
+                  "%s intrinsic folding: bad argument"_warn_en_US, iName);
+            }
           }
           return result.value;
         }));
diff --git a/flang/lib/Evaluate/fold-reduction.h b/flang/lib/Evaluate/fold-reduction.h
index c84d35734ab5..fbdae8f4eee0 100644
--- a/flang/lib/Evaluate/fold-reduction.h
+++ b/flang/lib/Evaluate/fold-reduction.h
@@ -43,17 +43,23 @@ static Expr<T> FoldDotProduct(
       Expr<T> products{Fold(
           context, Expr<T>{std::move(conjgA)} * Expr<T>{Constant<T>{*vb}})};
       Constant<T> &cProducts{DEREF(UnwrapConstantValue<T>(products))};
-      Element correction{}; // Use Kahan summation for greater precision.
+      [[maybe_unused]] Element correction{};
       const auto &rounding{context.targetCharacteristics().roundingMode()};
       for (const Element &x : cProducts.values()) {
-        auto next{correction.Add(x, rounding)};
-        overflow |= next.flags.test(RealFlag::Overflow);
-        auto added{sum.Add(next.value, rounding)};
-        overflow |= added.flags.test(RealFlag::Overflow);
-        correction = added.value.Subtract(sum, rounding)
-                         .value.Subtract(next.value, rounding)
-                         .value;
-        sum = std::move(added.value);
+        if constexpr (useKahanSummation) {
+          auto next{correction.Add(x, rounding)};
+          overflow |= next.flags.test(RealFlag::Overflow);
+          auto added{sum.Add(next.value, rounding)};
+          overflow |= added.flags.test(RealFlag::Overflow);
+          correction = added.value.Subtract(sum, rounding)
+                           .value.Subtract(next.value, rounding)
+                           .value;
+          sum = std::move(added.value);
+        } else {
+          auto added{sum.Add(x, rounding)};
+          overflow |= added.flags.test(RealFlag::Overflow);
+          sum = std::move(added.value);
+        }
       }
     } else if constexpr (T::category == TypeCategory::Logical) {
       Expr<T> conjunctions{Fold(context,
@@ -80,20 +86,28 @@ static Expr<T> FoldDotProduct(
       Expr<T> products{
           Fold(context, Expr<T>{Constant<T>{*va}} * Expr<T>{Constant<T>{*vb}})};
       Constant<T> &cProducts{DEREF(UnwrapConstantValue<T>(products))};
-      Element correction{}; // Use Kahan summation for greater precision.
+      [[maybe_unused]] Element correction{};
       const auto &rounding{context.targetCharacteristics().roundingMode()};
       for (const Element &x : cProducts.values()) {
-        auto next{correction.Add(x, rounding)};
-        overflow |= next.flags.test(RealFlag::Overflow);
-        auto added{sum.Add(next.value, rounding)};
-        overflow |= added.flags.test(RealFlag::Overflow);
-        correction = added.value.Subtract(sum, rounding)
-                         .value.Subtract(next.value, rounding)
-                         .value;
-        sum = std::move(added.value);
+        if constexpr (useKahanSummation) {
+          auto next{correction.Add(x, rounding)};
+          overflow |= next.flags.test(RealFlag::Overflow);
+          auto added{sum.Add(next.value, rounding)};
+          overflow |= added.flags.test(RealFlag::Overflow);
+          correction = added.value.Subtract(sum, rounding)
+                           .value.Subtract(next.value, rounding)
+                           .value;
+          sum = std::move(added.value);
+        } else {
+          auto added{sum.Add(x, rounding)};
+          overflow |= added.flags.test(RealFlag::Overflow);
+          sum = std::move(added.value);
+        }
       }
     }
-    if (overflow) {
+    if (overflow &&
+        context.languageFeatures().ShouldWarn(
+            common::UsageWarning::FoldingException)) {
       context.messages().Say(
           "DOT_PRODUCT of %s data overflowed during computation"_warn_en_US,
           T::AsFortran());
@@ -309,7 +323,9 @@ static Expr<T> FoldProduct(
     ProductAccumulator accumulator{arrayAndMask->array};
     auto result{Expr<T>{DoReduction<T>(
         arrayAndMask->array, arrayAndMask->mask, dim, identity, accumulator)}};
-    if (accumulator.overflow()) {
+    if (accumulator.overflow() &&
+        context.languageFeatures().ShouldWarn(
+            common::UsageWarning::FoldingException)) {
       context.messages().Say(
           "PRODUCT() of %s data overflowed"_warn_en_US, T::AsFortran());
     }
@@ -375,7 +391,9 @@ static Expr<T> FoldSum(FoldingContext &context, FunctionRef<T> &&ref) {
         arrayAndMask->array, context.targetCharacteristics().roundingMode()};
     auto result{Expr<T>{DoReduction<T>(
         arrayAndMask->array, arrayAndMask->mask, dim, identity, accumulator)}};
-    if (accumulator.overflow()) {
+    if (accumulator.overflow() &&
+        context.languageFeatures().ShouldWarn(
+            common::UsageWarning::FoldingException)) {
       context.messages().Say(
           "SUM() of %s data overflowed"_warn_en_US, T::AsFortran());
     }
diff --git a/flang/lib/Evaluate/host.cpp b/flang/lib/Evaluate/host.cpp
index a5817bd0b59a..31bc43838580 100644
--- a/flang/lib/Evaluate/host.cpp
+++ b/flang/lib/Evaluate/host.cpp
@@ -100,9 +100,13 @@ void HostFloatingPointEnvironment::SetUpHostFloatingPointEnvironment(
     break;
   case common::RoundingMode::TiesAwayFromZero:
     fesetround(FE_TONEAREST);
-    context.messages().Say(
-        "TiesAwayFromZero rounding mode is not available when folding constants"
-        " with host runtime; using TiesToEven instead"_warn_en_US);
+    if (context.languageFeatures().ShouldWarn(
+            common::UsageWarning::FoldingFailure)) {
+      context.messages().Say(
+          "TiesAwayFromZero rounding mode is not available when folding "
+          "constants"
+          " with host runtime; using TiesToEven instead"_warn_en_US);
+    }
     break;
   }
   flags_.clear();
diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp
index f07f94b1a022..441a762c930d 100644
--- a/flang/lib/Evaluate/intrinsics.cpp
+++ b/flang/lib/Evaluate/intrinsics.cpp
@@ -777,7 +777,9 @@ static const IntrinsicInterface genericIntrinsicFunction[]{
             {"identity", SameType, Rank::scalar, Optionality::optional},
             {"ordered", AnyLogical, Rank::scalar, Optionality::optional}},
         SameType, Rank::scalar, IntrinsicClass::transformationalFunction},
-    {"repeat", {{"string", SameCharNoLen, Rank::scalar}, {"ncopies", AnyInt}},
+    {"repeat",
+        {{"string", SameCharNoLen, Rank::scalar},
+            {"ncopies", AnyInt, Rank::scalar}},
         SameCharNoLen, Rank::scalar, IntrinsicClass::transformationalFunction},
     {"reshape",
         {{"source", SameType, Rank::array}, {"shape", AnyInt, Rank::shape},
@@ -2281,7 +2283,7 @@ std::optional<SpecificCall> IntrinsicInterface::Match(
                 UnwrapWholeSymbolOrComponentDataRef(actualForDummy[*dimArg])}) {
           if (IsOptional(*whole) || IsAllocatableOrObjectPointer(whole)) {
             if (context.languageFeatures().ShouldWarn(
-                    common::UsageWarning::DimMustBePresent)) {
+                    common::UsageWarning::OptionalMustBePresent)) {
               if (rank == Rank::scalarIfDim || arrayRank.value_or(-1) == 1) {
                 messages.Say(
                     "The actual argument for DIM= is optional, pointer, or allocatable, and it is assumed to be present and equal to 1 at execution time"_warn_en_US);
@@ -2739,16 +2741,21 @@ IntrinsicProcTable::Implementation::HandleC_F_Pointer(
           context.messages().Say(at,
               "FPTR= argument to C_F_POINTER() may not have a deferred type parameter"_err_en_US);
         } else if (type->category() == TypeCategory::Derived) {
-          if (type->IsUnlimitedPolymorphic()) {
-            context.messages().Say(at,
-                "FPTR= argument to C_F_POINTER() should not be unlimited polymorphic"_warn_en_US);
-          } else if (!type->GetDerivedTypeSpec().typeSymbol().attrs().test(
-                         semantics::Attr::BIND_C)) {
-            context.messages().Say(at,
-                "FPTR= argument to C_F_POINTER() should not have a derived type that is not BIND(C)"_warn_en_US);
+          if (context.languageFeatures().ShouldWarn(
+                  common::UsageWarning::Interoperability)) {
+            if (type->IsUnlimitedPolymorphic()) {
+              context.messages().Say(at,
+                  "FPTR= argument to C_F_POINTER() should not be unlimited polymorphic"_warn_en_US);
+            } else if (!type->GetDerivedTypeSpec().typeSymbol().attrs().test(
+                           semantics::Attr::BIND_C)) {
+              context.messages().Say(at,
+                  "FPTR= argument to C_F_POINTER() should not have a derived type that is not BIND(C)"_warn_en_US);
+            }
           }
         } else if (!IsInteroperableIntrinsicType(
-                       *type, &context.languageFeatures())) {
+                       *type, &context.languageFeatures()) &&
+            context.languageFeatures().ShouldWarn(
+                common::UsageWarning::Interoperability)) {
           context.messages().Say(at,
               "FPTR= argument to C_F_POINTER() should not have the non-interoperable intrinsic type %s"_warn_en_US,
               type->AsFortran());
@@ -2848,7 +2855,9 @@ std::optional<SpecificCall> IntrinsicProcTable::Implementation::HandleC_Loc(
         context.messages().Say(arguments[0]->sourceLocation(),
             "C_LOC() argument may not be zero-length character"_err_en_US);
       } else if (typeAndShape->type().category() != TypeCategory::Derived &&
-          !IsInteroperableIntrinsicType(typeAndShape->type())) {
+          !IsInteroperableIntrinsicType(typeAndShape->type()) &&
+          context.languageFeatures().ShouldWarn(
+              common::UsageWarning::Interoperability)) {
         context.messages().Say(arguments[0]->sourceLocation(),
             "C_LOC() argument has non-interoperable intrinsic type, kind, or length"_warn_en_US);
       }
diff --git a/flang/lib/Evaluate/type.cpp b/flang/lib/Evaluate/type.cpp
index a369e07f94a1..ee1e5b398d9b 100644
--- a/flang/lib/Evaluate/type.cpp
+++ b/flang/lib/Evaluate/type.cpp
@@ -731,7 +731,7 @@ bool SomeKind<TypeCategory::Derived>::operator==(
   return PointeeComparison(derivedTypeSpec_, that.derivedTypeSpec_);
 }
 
-int SelectedCharKind(const std::string &s, int defaultKind) { // 16.9.168
+int SelectedCharKind(const std::string &s, int defaultKind) { // F'2023 16.9.180
   auto lower{parser::ToLowerCaseLetters(s)};
   auto n{lower.size()};
   while (n > 0 && lower[0] == ' ') {
diff --git a/flang/lib/Evaluate/variable.cpp b/flang/lib/Evaluate/variable.cpp
index d73ba835a052..247386a365de 100644
--- a/flang/lib/Evaluate/variable.cpp
+++ b/flang/lib/Evaluate/variable.cpp
@@ -214,17 +214,21 @@ std::optional<Expr<SomeCharacter>> Substring::Fold(FoldingContext &context) {
   }
   if (!result) { // error cases
     if (*lbi < 1) {
-      context.messages().Say(
-          "Lower bound (%jd) on substring is less than one"_warn_en_US,
-          static_cast<std::intmax_t>(*lbi));
+      if (context.languageFeatures().ShouldWarn(common::UsageWarning::Bounds)) {
+        context.messages().Say(
+            "Lower bound (%jd) on substring is less than one"_warn_en_US,
+            static_cast<std::intmax_t>(*lbi));
+      }
       *lbi = 1;
       lower_ = AsExpr(Constant<SubscriptInteger>{1});
     }
     if (length && *ubi > *length) {
-      context.messages().Say(
-          "Upper bound (%jd) on substring is greater than character length (%jd)"_warn_en_US,
-          static_cast<std::intmax_t>(*ubi),
-          static_cast<std::intmax_t>(*length));
+      if (context.languageFeatures().ShouldWarn(common::UsageWarning::Bounds)) {
+        context.messages().Say(
+            "Upper bound (%jd) on substring is greater than character length (%jd)"_warn_en_US,
+            static_cast<std::intmax_t>(*ubi),
+            static_cast<std::intmax_t>(*length));
+      }
       *ubi = *length;
       upper_ = AsExpr(Constant<SubscriptInteger>{*ubi});
     }
diff --git a/flang/lib/Frontend/FrontendAction.cpp b/flang/lib/Frontend/FrontendAction.cpp
index bb1c239540d9..765e203ddfbe 100644
--- a/flang/lib/Frontend/FrontendAction.cpp
+++ b/flang/lib/Frontend/FrontendAction.cpp
@@ -153,7 +153,7 @@ bool FrontendAction::runPrescan() {
   return !reportFatalScanningErrors();
 }
 
-bool FrontendAction::runParse() {
+bool FrontendAction::runParse(bool emitMessages) {
   CompilerInstance &ci = this->getInstance();
 
   // Parse. In case of failure, report and return.
@@ -163,9 +163,11 @@ bool FrontendAction::runParse() {
     return false;
   }
 
-  // Report the diagnostics from getParsing
-  ci.getParsing().messages().Emit(llvm::errs(), ci.getAllCookedSources());
-
+  if (emitMessages) {
+    // Report any non-fatal diagnostics from getParsing now rather than
+    // combining them with messages from semantics.
+    ci.getParsing().messages().Emit(llvm::errs(), ci.getAllCookedSources());
+  }
   return true;
 }
 
@@ -174,10 +176,14 @@ bool FrontendAction::runSemanticChecks() {
   std::optional<parser::Program> &parseTree{ci.getParsing().parseTree()};
   assert(parseTree && "Cannot run semantic checks without a parse tree!");
 
+  // Transfer any pending non-fatal messages from parsing to semantics
+  // so that they are merged and all printed in order.
+  auto &semanticsCtx{ci.getSemanticsContext()};
+  semanticsCtx.messages().Annex(std::move(ci.getParsing().messages()));
+
   // Prepare semantics
   ci.setSemantics(std::make_unique<Fortran::semantics::Semantics>(
-      ci.getSemanticsContext(), *parseTree,
-      ci.getInvocation().getDebugModuleDir()));
+      semanticsCtx, *parseTree, ci.getInvocation().getDebugModuleDir()));
   auto &semantics = ci.getSemantics();
 
   // Run semantic checks
@@ -187,7 +193,7 @@ bool FrontendAction::runSemanticChecks() {
     return false;
   }
 
-  // Report the diagnostics from the semantic checks
+  // Report the diagnostics from parsing and the semantic checks
   semantics.EmitMessages(ci.getSemaOutputStream());
 
   return true;
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 87a714d17015..b96e2c87ae05 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -123,12 +123,12 @@ static bool saveMLIRTempFile(const CompilerInvocation &ci,
 bool PrescanAction::beginSourceFileAction() { return runPrescan(); }
 
 bool PrescanAndParseAction::beginSourceFileAction() {
-  return runPrescan() && runParse();
+  return runPrescan() && runParse(/*emitMessages=*/true);
 }
 
 bool PrescanAndSemaAction::beginSourceFileAction() {
-  return runPrescan() && runParse() && runSemanticChecks() &&
-         generateRtTypeTables();
+  return runPrescan() && runParse(/*emitMessages=*/false) &&
+         runSemanticChecks() && generateRtTypeTables();
 }
 
 bool PrescanAndSemaDebugAction::beginSourceFileAction() {
@@ -137,8 +137,8 @@ bool PrescanAndSemaDebugAction::beginSourceFileAction() {
   // from exiting early (i.e. in the presence of semantic errors). We should
   // never do this in actions intended for end-users or otherwise regular
   // compiler workflows!
-  return runPrescan() && runParse() && (runSemanticChecks() || true) &&
-         (generateRtTypeTables() || true);
+  return runPrescan() && runParse(/*emitMessages=*/false) &&
+         (runSemanticChecks() || true) && (generateRtTypeTables() || true);
 }
 
 static void addDependentLibs(mlir::ModuleOp &mlirModule, CompilerInstance &ci) {
@@ -275,8 +275,8 @@ bool CodeGenAction::beginSourceFileAction() {
     ci.getDiagnostics().Report(diagID);
     return false;
   }
-  bool res = runPrescan() && runParse() && runSemanticChecks() &&
-             generateRtTypeTables();
+  bool res = runPrescan() && runParse(/*emitMessages=*/false) &&
+             runSemanticChecks() && generateRtTypeTables();
   if (!res)
     return res;
 
@@ -861,7 +861,6 @@ getOutputStream(CompilerInstance &ci, llvm::StringRef inFile,
     return ci.createDefaultOutputFile(
         /*Binary=*/false, inFile, /*extension=*/"ll");
   case BackendActionTy::Backend_EmitFIR:
-    LLVM_FALLTHROUGH;
   case BackendActionTy::Backend_EmitHLFIR:
     return ci.createDefaultOutputFile(
         /*Binary=*/false, inFile, /*extension=*/"mlir");
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 8e84ea2fc5d5..a1957c0eb1bb 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -162,7 +162,7 @@ static void genRuntimeInitCharacter(fir::FirOpBuilder &builder,
   args.push_back(builder.createConvert(loc, inputTypes[0], box.getAddr()));
   args.push_back(builder.createConvert(loc, inputTypes[1], len));
   if (kind == 0)
-    kind = box.getEleTy().cast<fir::CharacterType>().getFKind();
+    kind = mlir::cast<fir::CharacterType>(box.getEleTy()).getFKind();
   args.push_back(builder.createIntegerConstant(loc, inputTypes[2], kind));
   int rank = box.rank();
   args.push_back(builder.createIntegerConstant(loc, inputTypes[3], rank));
@@ -879,7 +879,7 @@ void Fortran::lower::genDeallocateIfAllocated(
   builder.genIfThen(loc, isAllocated)
       .genThen([&]() {
         if (mlir::Type eleType = box.getEleTy();
-            eleType.isa<fir::RecordType>() && box.isPolymorphic()) {
+            mlir::isa<fir::RecordType>(eleType) && box.isPolymorphic()) {
           mlir::Value declaredTypeDesc = builder.create<fir::TypeDescOp>(
               loc, mlir::TypeAttr::get(eleType));
           genDeallocateBox(converter, box, loc, sym, declaredTypeDesc);
@@ -918,7 +918,7 @@ void Fortran::lower::genDeallocateStmt(
     mlir::Value declaredTypeDesc = {};
     if (box.isPolymorphic()) {
       mlir::Type eleType = box.getEleTy();
-      if (eleType.isa<fir::RecordType>())
+      if (mlir::isa<fir::RecordType>(eleType))
         if (const Fortran::semantics::DerivedTypeSpec *derivedTypeSpec =
                 symbol.GetType()->AsDerived()) {
           declaredTypeDesc =
@@ -1007,7 +1007,7 @@ createMutableProperties(Fortran::lower::AbstractConverter &converter,
   fir::MutableProperties mutableProperties;
   std::string name = converter.mangleName(sym);
   mlir::Type baseAddrTy = converter.genType(sym);
-  if (auto boxType = baseAddrTy.dyn_cast<fir::BaseBoxType>())
+  if (auto boxType = mlir::dyn_cast<fir::BaseBoxType>(baseAddrTy))
     baseAddrTy = boxType.getEleTy();
   // Allocate and set a variable to hold the address.
   // It will be set to null in setUnallocatedStatus.
@@ -1032,9 +1032,9 @@ createMutableProperties(Fortran::lower::AbstractConverter &converter,
   mlir::Type eleTy = baseAddrTy;
   if (auto newTy = fir::dyn_cast_ptrEleTy(eleTy))
     eleTy = newTy;
-  if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>())
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy))
     eleTy = seqTy.getEleTy();
-  if (auto record = eleTy.dyn_cast<fir::RecordType>())
+  if (auto record = mlir::dyn_cast<fir::RecordType>(eleTy))
     if (record.getNumLenParams() != 0)
       TODO(loc, "deferred length type parameters.");
   if (fir::isa_char(eleTy) && nonDeferredParams.empty()) {
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 8b62fe8c022f..fb01789d3f8a 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -36,6 +36,7 @@
 #include "flang/Optimizer/Builder/Runtime/Character.h"
 #include "flang/Optimizer/Builder/Runtime/Derived.h"
 #include "flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h"
+#include "flang/Optimizer/Builder/Runtime/Main.h"
 #include "flang/Optimizer/Builder/Runtime/Ragged.h"
 #include "flang/Optimizer/Builder/Runtime/Stop.h"
 #include "flang/Optimizer/Builder/Todo.h"
@@ -347,20 +348,11 @@ public:
     createGlobalOutsideOfFunctionLowering(
         [&]() { typeInfoConverter.createTypeInfo(*this); });
 
-    // Create the list of any environment defaults for the runtime to set. The
-    // runtime default list is only created if there is a main program to ensure
-    // it only happens once and to provide consistent results if multiple files
-    // are compiled separately.
+    // Generate the `main` entry point if necessary
     if (hasMainProgram)
       createGlobalOutsideOfFunctionLowering([&]() {
-        // FIXME: Ideally, this would create a call to a runtime function
-        // accepting the list of environment defaults. That way, we would not
-        // need to add an extern pointer to the runtime and said pointer would
-        // not need to be generated even if no defaults are specified.
-        // However, generating main or changing when the runtime reads
-        // environment variables is required to do so.
-        fir::runtime::genEnvironmentDefaults(*builder, toLocation(),
-                                             bridge.getEnvironmentDefaults());
+        fir::runtime::genMain(*builder, toLocation(),
+                              bridge.getEnvironmentDefaults());
       });
 
     finalizeOpenACCLowering();
@@ -683,25 +675,28 @@ public:
           auto if_builder = builder->genIfThenElse(loc, isAllocated);
           if_builder.genThen([&]() {
             std::string name = mangleName(sym) + ".alloc";
-            if (auto seqTy = symType.dyn_cast<fir::SequenceType>()) {
-              fir::ExtendedValue read = fir::factory::genMutableBoxRead(
-                  *builder, loc, box, /*mayBePolymorphic=*/false);
-              if (auto read_arr_box = read.getBoxOf<fir::ArrayBoxValue>()) {
-                fir::factory::genInlinedAllocation(
-                    *builder, loc, *new_box, read_arr_box->getLBounds(),
-                    read_arr_box->getExtents(),
-                    /*lenParams=*/std::nullopt, name,
-                    /*mustBeHeap=*/true);
-              } else if (auto read_char_arr_box =
-                             read.getBoxOf<fir::CharArrayBoxValue>()) {
-                fir::factory::genInlinedAllocation(
-                    *builder, loc, *new_box, read_char_arr_box->getLBounds(),
-                    read_char_arr_box->getExtents(),
-                    read_char_arr_box->getLen(), name,
-                    /*mustBeHeap=*/true);
-              } else {
-                TODO(loc, "Unhandled allocatable box type");
-              }
+            fir::ExtendedValue read = fir::factory::genMutableBoxRead(
+                *builder, loc, box, /*mayBePolymorphic=*/false);
+            if (auto read_arr_box = read.getBoxOf<fir::ArrayBoxValue>()) {
+              fir::factory::genInlinedAllocation(
+                  *builder, loc, *new_box, read_arr_box->getLBounds(),
+                  read_arr_box->getExtents(),
+                  /*lenParams=*/std::nullopt, name,
+                  /*mustBeHeap=*/true);
+            } else if (auto read_char_arr_box =
+                           read.getBoxOf<fir::CharArrayBoxValue>()) {
+              fir::factory::genInlinedAllocation(
+                  *builder, loc, *new_box, read_char_arr_box->getLBounds(),
+                  read_char_arr_box->getExtents(), read_char_arr_box->getLen(),
+                  name,
+                  /*mustBeHeap=*/true);
+            } else if (auto read_char_box =
+                           read.getBoxOf<fir::CharBoxValue>()) {
+              fir::factory::genInlinedAllocation(*builder, loc, *new_box,
+                                                 /*lbounds=*/std::nullopt,
+                                                 /*extents=*/std::nullopt,
+                                                 read_char_box->getLen(), name,
+                                                 /*mustBeHeap=*/true);
             } else {
               fir::factory::genInlinedAllocation(
                   *builder, loc, *new_box, box.getMutableProperties().lbounds,
@@ -1132,7 +1127,7 @@ private:
     fir::ExtendedValue lhs = symBoxToExtendedValue(lhs_sb);
     fir::ExtendedValue rhs = symBoxToExtendedValue(rhs_sb);
     mlir::Type symType = genType(sym);
-    if (auto seqTy = symType.dyn_cast<fir::SequenceType>()) {
+    if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(symType)) {
       Fortran::lower::StatementContext stmtCtx;
       Fortran::lower::createSomeArrayAssignment(*this, lhs, rhs, localSymbols,
                                                 stmtCtx);
@@ -1355,7 +1350,7 @@ private:
       return;
     }
     mlir::Type selectorType = selector.getType();
-    bool realSelector = selectorType.isa<mlir::FloatType>();
+    bool realSelector = mlir::isa<mlir::FloatType>(selectorType);
     assert((inArithmeticIfContext || !realSelector) && "invalid selector type");
     mlir::Value zero;
     if (inArithmeticIfContext)
@@ -1630,7 +1625,7 @@ private:
         stmtCtx);
     stmtCtx.finalizeAndReset();
     // Raise an exception if REAL expr is a NaN.
-    if (expr.getType().isa<mlir::FloatType>())
+    if (mlir::isa<mlir::FloatType>(expr.getType()))
       expr = builder->create<mlir::arith::AddFOp>(toLocation(), expr, expr);
     // An empty valueList indicates to genMultiwayBranch that the branch is
     // an ArithmeticIfStmt that has two branches on value 0 or 0.0.
@@ -2648,9 +2643,6 @@ private:
       builder->create<fir::StoreOp>(loc, convArg, value);
     }
 
-    builder->create<fir::FirEndOp>(loc);
-    builder->setInsertionPointToStart(&b);
-
     Fortran::lower::pft::Evaluation *crtEval = &getEval();
     if (crtEval->lowerAsStructured()) {
       crtEval = &crtEval->getFirstNestedEvaluation();
@@ -2662,6 +2654,7 @@ private:
     for (Fortran::lower::pft::Evaluation &e : crtEval->getNestedEvaluations())
       genFIR(e);
 
+    builder->create<fir::FirEndOp>(loc);
     builder->setInsertionPointAfter(op);
     localSymbols.popScope();
   }
@@ -2807,7 +2800,7 @@ private:
     auto caseValue = valueList.begin();
     auto caseBlock = blockList.begin();
     for (mlir::Attribute attr : attrList) {
-      if (attr.isa<mlir::UnitAttr>()) {
+      if (mlir::isa<mlir::UnitAttr>(attr)) {
         genBranch(*caseBlock++);
         break;
       }
@@ -2825,7 +2818,7 @@ private:
                                             rhsVal.second);
       };
       mlir::Block *newBlock = insertBlock(*caseBlock);
-      if (attr.isa<fir::ClosedIntervalAttr>()) {
+      if (mlir::isa<fir::ClosedIntervalAttr>(attr)) {
         mlir::Block *newBlock2 = insertBlock(*caseBlock);
         mlir::Value cond =
             genCond(*caseValue++, mlir::arith::CmpIPredicate::sge);
@@ -2838,12 +2831,12 @@ private:
         continue;
       }
       mlir::arith::CmpIPredicate pred;
-      if (attr.isa<fir::PointIntervalAttr>()) {
+      if (mlir::isa<fir::PointIntervalAttr>(attr)) {
         pred = mlir::arith::CmpIPredicate::eq;
-      } else if (attr.isa<fir::LowerBoundAttr>()) {
+      } else if (mlir::isa<fir::LowerBoundAttr>(attr)) {
         pred = mlir::arith::CmpIPredicate::sge;
       } else {
-        assert(attr.isa<fir::UpperBoundAttr>() && "unexpected predicate");
+        assert(mlir::isa<fir::UpperBoundAttr>(attr) && "unexpected predicate");
         pred = mlir::arith::CmpIPredicate::sle;
       }
       mlir::Value cond = genCond(*caseValue++, pred);
@@ -3105,7 +3098,7 @@ private:
         bool isPointer = fir::isPointerType(baseTy);
         bool isAllocatable = fir::isAllocatableType(baseTy);
         bool isArray =
-            fir::dyn_cast_ptrOrBoxEleTy(baseTy).isa<fir::SequenceType>();
+            mlir::isa<fir::SequenceType>(fir::dyn_cast_ptrOrBoxEleTy(baseTy));
         const fir::BoxValue *selectorBox = selector.getBoxOf<fir::BoxValue>();
         if (std::holds_alternative<Fortran::parser::Default>(guard.u)) {
           // CLASS DEFAULT
@@ -3114,12 +3107,12 @@ private:
                        std::get_if<Fortran::parser::TypeSpec>(&guard.u)) {
           // TYPE IS
           fir::ExactTypeAttr attr =
-              typeGuardAttr.dyn_cast<fir::ExactTypeAttr>();
+              mlir::dyn_cast<fir::ExactTypeAttr>(typeGuardAttr);
           mlir::Value exactValue;
           mlir::Type addrTy = attr.getType();
           if (isArray) {
-            auto seqTy = fir::dyn_cast_ptrOrBoxEleTy(baseTy)
-                             .dyn_cast<fir::SequenceType>();
+            auto seqTy = mlir::dyn_cast<fir::SequenceType>(
+                fir::dyn_cast_ptrOrBoxEleTy(baseTy));
             addrTy = fir::SequenceType::get(seqTy.getShape(), attr.getType());
           }
           if (isPointer)
@@ -3141,7 +3134,7 @@ private:
               addAssocEntitySymbol(selectorBox->clone(exact));
             } else if (intrinsic->category() ==
                        Fortran::common::TypeCategory::Character) {
-              auto charTy = attr.getType().dyn_cast<fir::CharacterType>();
+              auto charTy = mlir::dyn_cast<fir::CharacterType>(attr.getType());
               mlir::Value charLen =
                   fir::factory::CharacterExprHelper(*builder, loc)
                       .readLengthFromBox(fir::getBase(selector), charTy);
@@ -3158,11 +3151,12 @@ private:
         } else if (std::holds_alternative<Fortran::parser::DerivedTypeSpec>(
                        guard.u)) {
           // CLASS IS
-          fir::SubclassAttr attr = typeGuardAttr.dyn_cast<fir::SubclassAttr>();
+          fir::SubclassAttr attr =
+              mlir::dyn_cast<fir::SubclassAttr>(typeGuardAttr);
           mlir::Type addrTy = attr.getType();
           if (isArray) {
-            auto seqTy = fir::dyn_cast_ptrOrBoxEleTy(baseTy)
-                             .dyn_cast<fir::SequenceType>();
+            auto seqTy = mlir::dyn_cast<fir::SequenceType>(
+                fir::dyn_cast_ptrOrBoxEleTy(baseTy));
             addrTy = fir::SequenceType::get(seqTy.getShape(), attr.getType());
           }
           if (isPointer)
@@ -3794,7 +3788,9 @@ private:
           auto needCleanup = fir::getIntIfConstant(cleanup);
           if (needCleanup && *needCleanup)
             temps.push_back(temp);
-          addSymbol(sym, temp, /*forced=*/true);
+          addSymbol(sym,
+                    hlfir::translateToExtendedValue(loc, builder, temp).first,
+                    /*forced=*/true);
           builder.create<fir::CUDADataTransferOp>(loc, addr, temp,
                                                   transferKindAttr);
           ++nbDeviceResidentObject;
@@ -3804,18 +3800,38 @@ private:
     return temps;
   }
 
+  // Check if the insertion point is currently in a device context. HostDevice
+  // subprogram are not considered fully device context so it will return false
+  // for it.
+  static bool isDeviceContext(fir::FirOpBuilder &builder) {
+    if (builder.getRegion().getParentOfType<fir::CUDAKernelOp>())
+      return true;
+    if (auto funcOp =
+            builder.getRegion().getParentOfType<mlir::func::FuncOp>()) {
+      if (auto cudaProcAttr =
+              funcOp.getOperation()->getAttrOfType<fir::CUDAProcAttributeAttr>(
+                  fir::getCUDAAttrName())) {
+        return cudaProcAttr.getValue() != fir::CUDAProcAttribute::Host &&
+               cudaProcAttr.getValue() != fir::CUDAProcAttribute::HostDevice;
+      }
+    }
+    return false;
+  }
+
   void genDataAssignment(
       const Fortran::evaluate::Assignment &assign,
       const Fortran::evaluate::ProcedureRef *userDefinedAssignment) {
     mlir::Location loc = getCurrentLocation();
     fir::FirOpBuilder &builder = getFirOpBuilder();
 
-    bool isCUDATransfer = Fortran::evaluate::HasCUDAAttrs(assign.lhs) ||
-                          Fortran::evaluate::HasCUDAAttrs(assign.rhs);
+    bool isInDeviceContext = isDeviceContext(builder);
+    bool isCUDATransfer = (Fortran::evaluate::HasCUDAAttrs(assign.lhs) ||
+                           Fortran::evaluate::HasCUDAAttrs(assign.rhs)) &&
+                          !isInDeviceContext;
     bool hasCUDAImplicitTransfer =
         Fortran::evaluate::HasCUDAImplicitTransfer(assign.rhs);
     llvm::SmallVector<mlir::Value> implicitTemps;
-    if (hasCUDAImplicitTransfer)
+    if (hasCUDAImplicitTransfer && !isInDeviceContext)
       implicitTemps = genCUDAImplicitDataTransfer(builder, loc, assign);
 
     // Gather some information about the assignment that will impact how it is
@@ -3880,7 +3896,7 @@ private:
         builder.create<hlfir::AssignOp>(loc, rhs, lhs,
                                         isWholeAllocatableAssignment,
                                         keepLhsLengthInAllocatableAssignment);
-      if (hasCUDAImplicitTransfer) {
+      if (hasCUDAImplicitTransfer && !isInDeviceContext) {
         localSymbols.popScope();
         for (mlir::Value temp : implicitTemps)
           builder.create<fir::FreeMemOp>(loc, temp);
@@ -4135,7 +4151,7 @@ private:
               } else if (isDerivedCategory(lhsType->category())) {
                 // Handle parent component.
                 if (Fortran::lower::isParentComponent(assign.lhs)) {
-                  if (!fir::getBase(lhs).getType().isa<fir::BaseBoxType>())
+                  if (!mlir::isa<fir::BaseBoxType>(fir::getBase(lhs).getType()))
                     lhs = fir::getBase(builder->createBox(loc, lhs));
                   lhs = Fortran::lower::updateBoxForParentComponent(*this, lhs,
                                                                     assign.lhs);
@@ -5486,7 +5502,7 @@ Fortran::lower::LoweringBridge::LoweringBridge(
     default:
       break;
     }
-    if (!diag.getLocation().isa<mlir::UnknownLoc>())
+    if (!mlir::isa<mlir::UnknownLoc>(diag.getLocation()))
       os << diag.getLocation() << ": ";
     os << diag << '\n';
     os.flush();
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index 5ad244600328..c1f54ad39287 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -1182,7 +1182,7 @@ private:
       Property prop = Property::BaseAddress;
       if (isValueAttr) {
         bool isBuiltinCptrType = fir::isa_builtin_cptr_type(type);
-        if (isBindC || (!type.isa<fir::SequenceType>() &&
+        if (isBindC || (!mlir::isa<fir::SequenceType>(type) &&
                         !obj.attrs.test(Attrs::Optional) &&
                         (dynamicType.category() !=
                              Fortran::common::TypeCategory::Derived ||
@@ -1190,7 +1190,7 @@ private:
           passBy = PassEntityBy::Value;
           prop = Property::Value;
           if (isBuiltinCptrType) {
-            auto recTy = type.dyn_cast<fir::RecordType>();
+            auto recTy = mlir::dyn_cast<fir::RecordType>(type);
             mlir::Type fieldTy = recTy.getTypeList()[0].second;
             passType = fir::ReferenceType::get(fieldTy);
           } else {
@@ -1714,7 +1714,7 @@ mlir::Type Fortran::lower::getDummyProcedureType(
 }
 
 bool Fortran::lower::isCPtrArgByValueType(mlir::Type ty) {
-  return ty.isa<fir::ReferenceType>() &&
+  return mlir::isa<fir::ReferenceType>(ty) &&
          fir::isa_integer(fir::unwrapRefType(ty));
 }
 
diff --git a/flang/lib/Lower/ConvertArrayConstructor.cpp b/flang/lib/Lower/ConvertArrayConstructor.cpp
index 24aa9beba6bf..a5b5838fe6b6 100644
--- a/flang/lib/Lower/ConvertArrayConstructor.cpp
+++ b/flang/lib/Lower/ConvertArrayConstructor.cpp
@@ -336,7 +336,7 @@ public:
       if (!extent)
         extent = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
       if (missingLengthParameters) {
-        if (declaredType.getEleTy().isa<fir::CharacterType>())
+        if (mlir::isa<fir::CharacterType>(declaredType.getEleTy()))
           emboxLengths.push_back(builder.createIntegerConstant(
               loc, builder.getCharacterLengthType(), 0));
         else
@@ -357,7 +357,7 @@ public:
 
   bool useSimplePushRuntime(hlfir::Entity value) {
     return value.isScalar() &&
-           !arrayConstructorElementType.isa<fir::CharacterType>() &&
+           !mlir::isa<fir::CharacterType>(arrayConstructorElementType) &&
            !fir::isRecordWithAllocatableMember(arrayConstructorElementType) &&
            !fir::isRecordWithTypeParameters(arrayConstructorElementType);
   }
@@ -370,7 +370,7 @@ public:
       auto [addrExv, cleanUp] = hlfir::convertToAddress(
           loc, builder, value, arrayConstructorElementType);
       mlir::Value addr = fir::getBase(addrExv);
-      if (addr.getType().isa<fir::BaseBoxType>())
+      if (mlir::isa<fir::BaseBoxType>(addr.getType()))
         addr = builder.create<fir::BoxAddrOp>(loc, addr);
       fir::runtime::genPushArrayConstructorSimpleScalar(
           loc, builder, arrayConstructorVector, addr);
@@ -564,7 +564,7 @@ struct LengthAndTypeCollector<Character<Kind>> {
 /// lowering an ac-value and must be delayed?
 static bool missingLengthParameters(mlir::Type elementType,
                                     llvm::ArrayRef<mlir::Value> lengths) {
-  return (elementType.isa<fir::CharacterType>() ||
+  return (mlir::isa<fir::CharacterType>(elementType) ||
           fir::isRecordWithTypeParameters(elementType)) &&
          lengths.empty();
 }
@@ -702,7 +702,8 @@ static ArrayCtorLoweringStrategy selectArrayCtorLoweringStrategy(
   // Based on what was gathered and the result of the analysis, select and
   // instantiate the right lowering strategy for the array constructor.
   if (!extent || needToEvaluateOneExprToGetLengthParameters ||
-      analysis.anyArrayExpr || declaredType.getEleTy().isa<fir::RecordType>())
+      analysis.anyArrayExpr ||
+      mlir::isa<fir::RecordType>(declaredType.getEleTy()))
     return RuntimeTempStrategy(
         loc, builder, stmtCtx, symMap, declaredType,
         extent ? std::optional<mlir::Value>(extent) : std::nullopt, lengths,
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index c6f7d3410ad5..e4a0cc8d4730 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -49,15 +49,15 @@ static fir::ExtendedValue toExtendedValue(mlir::Location loc, mlir::Value base,
                                           llvm::ArrayRef<mlir::Value> extents,
                                           llvm::ArrayRef<mlir::Value> lengths) {
   mlir::Type type = base.getType();
-  if (type.isa<fir::BaseBoxType>())
+  if (mlir::isa<fir::BaseBoxType>(type))
     return fir::BoxValue(base, /*lbounds=*/{}, lengths, extents);
   type = fir::unwrapRefType(type);
-  if (type.isa<fir::BaseBoxType>())
+  if (mlir::isa<fir::BaseBoxType>(type))
     return fir::MutableBoxValue(base, lengths, /*mutableProperties*/ {});
-  if (auto seqTy = type.dyn_cast<fir::SequenceType>()) {
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(type)) {
     if (seqTy.getDimension() != extents.size())
       fir::emitFatalError(loc, "incorrect number of extents for array");
-    if (seqTy.getEleTy().isa<fir::CharacterType>()) {
+    if (mlir::isa<fir::CharacterType>(seqTy.getEleTy())) {
       if (lengths.empty())
         fir::emitFatalError(loc, "missing length for character");
       assert(lengths.size() == 1);
@@ -65,7 +65,7 @@ static fir::ExtendedValue toExtendedValue(mlir::Location loc, mlir::Value base,
     }
     return fir::ArrayBoxValue(base, extents);
   }
-  if (type.isa<fir::CharacterType>()) {
+  if (mlir::isa<fir::CharacterType>(type)) {
     if (lengths.empty())
       fir::emitFatalError(loc, "missing length for character");
     assert(lengths.size() == 1);
@@ -193,7 +193,7 @@ static mlir::Value remapActualToDummyDescriptor(
   llvm::SmallVector<mlir::Value> lengths;
   mlir::Type dummyBoxType = caller.getDummyArgumentType(arg);
   mlir::Type dummyBaseType = fir::unwrapPassByRefType(dummyBoxType);
-  if (dummyBaseType.isa<fir::SequenceType>())
+  if (mlir::isa<fir::SequenceType>(dummyBaseType))
     caller.walkDummyArgumentExtents(
         arg, [&](const Fortran::lower::SomeExpr &e, bool isAssumedSizeExtent) {
           extents.emplace_back(lowerSpecExpr(e, isAssumedSizeExtent));
@@ -338,7 +338,7 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
     if (!caller.callerAllocateResult())
       return {};
     mlir::Type type = caller.getResultStorageType();
-    if (type.isa<fir::SequenceType>())
+    if (mlir::isa<fir::SequenceType>(type))
       caller.walkResultExtents(
           [&](const Fortran::lower::SomeExpr &e, bool isAssumedSizeExtent) {
             assert(!isAssumedSizeExtent && "result cannot be assumed-size");
@@ -353,7 +353,7 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
     // Result length parameters should not be provided to box storage
     // allocation and save_results, but they are still useful information to
     // keep in the ExtendedValue if non-deferred.
-    if (!type.isa<fir::BoxType>()) {
+    if (!mlir::isa<fir::BoxType>(type)) {
       if (fir::isa_char(fir::unwrapSequenceType(type)) && lengths.empty()) {
         // Calling an assumed length function. This is only possible if this
         // is a call to a character dummy procedure.
@@ -478,7 +478,7 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
   // FIR.
   if (funcPointer) {
     operands.push_back(
-        funcPointer.getType().isa<fir::BoxProcType>()
+        mlir::isa<fir::BoxProcType>(funcPointer.getType())
             ? builder.create<fir::BoxAddrOp>(loc, funcType, funcPointer)
             : builder.createConvert(loc, funcType, funcPointer));
   }
@@ -492,8 +492,8 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
     // arguments of any type and vice versa.
     mlir::Value cast;
     auto *context = builder.getContext();
-    if (snd.isa<fir::BoxProcType>() &&
-        fst.getType().isa<mlir::FunctionType>()) {
+    if (mlir::isa<fir::BoxProcType>(snd) &&
+        mlir::isa<mlir::FunctionType>(fst.getType())) {
       auto funcTy =
           mlir::FunctionType::get(context, std::nullopt, std::nullopt);
       auto boxProcTy = builder.getBoxProcType(funcTy);
@@ -734,9 +734,9 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
 
   // Call a BIND(C) function that return a char.
   if (caller.characterize().IsBindC() &&
-      funcType.getResults()[0].isa<fir::CharacterType>()) {
+      mlir::isa<fir::CharacterType>(funcType.getResults()[0])) {
     fir::CharacterType charTy =
-        funcType.getResults()[0].dyn_cast<fir::CharacterType>();
+        mlir::dyn_cast<fir::CharacterType>(funcType.getResults()[0]);
     mlir::Value len = builder.createIntegerConstant(
         loc, builder.getCharacterLengthType(), charTy.getLen());
     return {fir::CharBoxValue{callResult, len}, /*resultIsFinalized=*/false};
@@ -890,7 +890,7 @@ extendedValueToHlfirEntity(mlir::Location loc, fir::FirOpBuilder &builder,
   mlir::Type firBaseTy = firBase.getType();
   if (fir::isa_trivial(firBaseTy))
     return hlfir::EntityWithAttributes{firBase};
-  if (auto charTy = firBase.getType().dyn_cast<fir::CharacterType>()) {
+  if (auto charTy = mlir::dyn_cast<fir::CharacterType>(firBase.getType())) {
     // CHAR() intrinsic and BIND(C) procedures returning CHARACTER(1)
     // are lowered to a fir.char<kind,1> that is not in memory.
     // This tends to cause a lot of bugs because the rest of the
@@ -1061,7 +1061,7 @@ static hlfir::Entity fixProcedureDummyMismatch(mlir::Location loc,
                                                fir::FirOpBuilder &builder,
                                                hlfir::Entity actual,
                                                mlir::Type dummyType) {
-  if (actual.getType().isa<fir::BoxProcType>() &&
+  if (mlir::isa<fir::BoxProcType>(actual.getType()) &&
       fir::isCharacterProcedureTuple(dummyType)) {
     mlir::Value length =
         builder.create<fir::UndefOp>(loc, builder.getCharacterLengthType());
@@ -1070,7 +1070,7 @@ static hlfir::Entity fixProcedureDummyMismatch(mlir::Location loc,
     return hlfir::Entity{tuple};
   }
   assert(fir::isCharacterProcedureTuple(actual.getType()) &&
-         dummyType.isa<fir::BoxProcType>() &&
+         mlir::isa<fir::BoxProcType>(dummyType) &&
          "unsupported dummy procedure mismatch with the actual argument");
   mlir::Value boxProc = fir::factory::extractCharacterProcedureTuple(
                             builder, loc, actual, /*openBoxProc=*/false)
@@ -1143,7 +1143,7 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
     assert(actual.isProcedure());
     // Do nothing if this is a procedure argument. It is already a
     // fir.boxproc/fir.tuple<fir.boxproc, len> as it should.
-    if (!actual.getType().isa<fir::BoxProcType>() &&
+    if (!mlir::isa<fir::BoxProcType>(actual.getType()) &&
         actual.getType() != dummyType)
       // The actual argument may be a procedure that returns character (a
       // fir.tuple<fir.boxproc, len>) while the dummy is not. Extract the tuple
@@ -1164,7 +1164,7 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
   // dynamic type matters to determine the contiguity.
   const bool mustSetDynamicTypeToDummyType =
       passingPolymorphicToNonPolymorphic &&
-      (actual.isArray() || dummyType.isa<fir::BaseBoxType>());
+      (actual.isArray() || mlir::isa<fir::BaseBoxType>(dummyType));
 
   // The simple contiguity of the actual is "lost" when passing a polymorphic
   // to a non polymorphic entity because the dummy dynamic type matters for
@@ -1236,7 +1236,7 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
       preparedDummy.pushExprAssociateCleanUp(associate);
     } else if (mustDoCopyInOut) {
       // Copy-in non contiguous variables.
-      assert(entity.getType().isa<fir::BaseBoxType>() &&
+      assert(mlir::isa<fir::BaseBoxType>(entity.getType()) &&
              "expect non simply contiguous variables to be boxes");
       if (actualIsAssumedRank)
         TODO(loc, "copy-in and copy-out of assumed-rank arguments");
@@ -1294,13 +1294,14 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
   // Step 3: now that the dummy argument storage has been prepared, package
   // it according to the interface.
   mlir::Value addr;
-  if (dummyTypeWithActualRank.isa<fir::BoxCharType>()) {
+  if (mlir::isa<fir::BoxCharType>(dummyTypeWithActualRank)) {
     addr = hlfir::genVariableBoxChar(loc, builder, entity);
-  } else if (dummyTypeWithActualRank.isa<fir::BaseBoxType>()) {
+  } else if (mlir::isa<fir::BaseBoxType>(dummyTypeWithActualRank)) {
     entity = hlfir::genVariableBox(loc, builder, entity);
     // Ensures the box has the right attributes and that it holds an
     // addendum if needed.
-    fir::BaseBoxType actualBoxType = entity.getType().cast<fir::BaseBoxType>();
+    fir::BaseBoxType actualBoxType =
+        mlir::cast<fir::BaseBoxType>(entity.getType());
     mlir::Type boxEleType = actualBoxType.getEleTy();
     // For now, assume it is not OK to pass the allocatable/pointer
     // descriptor to a non pointer/allocatable dummy. That is a strict
@@ -1567,7 +1568,7 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals,
         // callee side, and it is illegal to use NULL without a MOLD if any
         // dummy length parameters are assumed.
         mlir::Type boxTy = fir::dyn_cast_ptrEleTy(argTy);
-        assert(boxTy && boxTy.isa<fir::BaseBoxType>() &&
+        assert(boxTy && mlir::isa<fir::BaseBoxType>(boxTy) &&
                "must be a fir.box type");
         mlir::Value boxStorage =
             fir::factory::genNullBoxStorage(builder, loc, boxTy);
@@ -1635,7 +1636,8 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals,
       caller, callSiteType, callContext.resultType,
       callContext.isElementalProcWithArrayArgs());
   // For procedure pointer function result, just return the call.
-  if (callContext.resultType && callContext.resultType->isa<fir::BoxProcType>())
+  if (callContext.resultType &&
+      mlir::isa<fir::BoxProcType>(*callContext.resultType))
     return hlfir::EntityWithAttributes(fir::getBase(result));
 
   /// Clean-up associations and copy-in.
@@ -2115,9 +2117,9 @@ public:
         hlfir::getFortranElementType(*callContext.resultType);
     // Get result length parameters.
     llvm::SmallVector<mlir::Value> typeParams;
-    if (elementType.isa<fir::CharacterType>() ||
+    if (mlir::isa<fir::CharacterType>(elementType) ||
         fir::isRecordWithTypeParameters(elementType)) {
-      auto charType = elementType.dyn_cast<fir::CharacterType>();
+      auto charType = mlir::dyn_cast<fir::CharacterType>(elementType);
       if (charType && charType.hasConstantLen())
         typeParams.push_back(builder.createIntegerConstant(
             loc, builder.getIndexType(), charType.getLen()));
@@ -2523,7 +2525,7 @@ genIntrinsicRef(const Fortran::evaluate::SpecificIntrinsic *intrinsic,
   }
   std::optional<hlfir::EntityWithAttributes> result = genHLFIRIntrinsicRefCore(
       loweredActuals, intrinsic, argLowering, callContext);
-  if (result && result->getType().isa<hlfir::ExprType>()) {
+  if (result && mlir::isa<hlfir::ExprType>(result->getType())) {
     fir::FirOpBuilder *bldr = &callContext.getBuilder();
     callContext.stmtCtx.attachCleanup(
         [=]() { bldr->create<hlfir::DestroyOp>(loc, *result); });
diff --git a/flang/lib/Lower/ConvertConstant.cpp b/flang/lib/Lower/ConvertConstant.cpp
index ed389bbe4ae5..653e874a969c 100644
--- a/flang/lib/Lower/ConvertConstant.cpp
+++ b/flang/lib/Lower/ConvertConstant.cpp
@@ -184,8 +184,8 @@ private:
     if (!attributeElementType || attributes.empty())
       return {};
 
-    assert(symTy.isa<fir::SequenceType>() && "expecting an array global");
-    auto arrTy = symTy.cast<fir::SequenceType>();
+    assert(mlir::isa<fir::SequenceType>(symTy) && "expecting an array global");
+    auto arrTy = mlir::cast<fir::SequenceType>(symTy);
     llvm::SmallVector<int64_t> tensorShape(arrTy.getShape());
     std::reverse(tensorShape.begin(), tensorShape.end());
     auto tensorTy =
@@ -423,14 +423,14 @@ static mlir::Value genStructureComponentInit(
     // address field, which ought to be an intptr_t on the target.
     mlir::Value addr = fir::getBase(
         Fortran::lower::genExtAddrInInitializer(converter, loc, expr));
-    if (addr.getType().isa<fir::BoxProcType>())
+    if (mlir::isa<fir::BoxProcType>(addr.getType()))
       addr = builder.create<fir::BoxAddrOp>(loc, addr);
     assert((fir::isa_ref_type(addr.getType()) ||
-            addr.getType().isa<mlir::FunctionType>()) &&
+            mlir::isa<mlir::FunctionType>(addr.getType())) &&
            "expect reference type for address field");
     assert(fir::isa_derived(componentTy) &&
            "expect C_PTR, C_FUNPTR to be a record");
-    auto cPtrRecTy = componentTy.cast<fir::RecordType>();
+    auto cPtrRecTy = mlir::cast<fir::RecordType>(componentTy);
     llvm::StringRef addrFieldName = Fortran::lower::builtin::cptrFieldName;
     mlir::Type addrFieldTy = cPtrRecTy.getType(addrFieldName);
     auto addrField = builder.create<fir::FieldIndexOp>(
@@ -460,7 +460,7 @@ static mlir::Value genInlinedStructureCtorLitImpl(
     Fortran::lower::AbstractConverter &converter, mlir::Location loc,
     const Fortran::evaluate::StructureConstructor &ctor, mlir::Type type) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-  auto recTy = type.cast<fir::RecordType>();
+  auto recTy = mlir::cast<fir::RecordType>(type);
 
   if (!converter.getLoweringOptions().getLowerToHighLevelFIR()) {
     mlir::Value res = builder.create<fir::UndefOp>(loc, recTy);
@@ -587,7 +587,7 @@ genInlinedArrayLit(Fortran::lower::AbstractConverter &converter,
     } while (con.IncrementSubscripts(subscripts));
   } else if constexpr (T::category == Fortran::common::TypeCategory::Derived) {
     do {
-      mlir::Type eleTy = arrayTy.cast<fir::SequenceType>().getEleTy();
+      mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrayTy).getEleTy();
       mlir::Value elementVal =
           genScalarLit(converter, loc, con.At(subscripts), eleTy,
                        /*outlineInReadOnlyMemory=*/false);
@@ -597,7 +597,7 @@ genInlinedArrayLit(Fortran::lower::AbstractConverter &converter,
   } else {
     llvm::SmallVector<mlir::Attribute> rangeStartIdx;
     uint64_t rangeSize = 0;
-    mlir::Type eleTy = arrayTy.cast<fir::SequenceType>().getEleTy();
+    mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrayTy).getEleTy();
     do {
       auto getElementVal = [&]() {
         return builder.createConvert(loc, eleTy,
@@ -620,12 +620,11 @@ genInlinedArrayLit(Fortran::lower::AbstractConverter &converter,
         llvm::SmallVector<int64_t> rangeBounds;
         llvm::SmallVector<mlir::Attribute> idx = createIdx();
         for (size_t i = 0; i < idx.size(); ++i) {
-          rangeBounds.push_back(rangeStartIdx[i]
-                                    .cast<mlir::IntegerAttr>()
+          rangeBounds.push_back(mlir::cast<mlir::IntegerAttr>(rangeStartIdx[i])
                                     .getValue()
                                     .getSExtValue());
           rangeBounds.push_back(
-              idx[i].cast<mlir::IntegerAttr>().getValue().getSExtValue());
+              mlir::cast<mlir::IntegerAttr>(idx[i]).getValue().getSExtValue());
         }
         array = builder.create<fir::InsertOnRangeOp>(
             loc, arrayTy, array, getElementVal(),
@@ -647,7 +646,7 @@ genOutlineArrayLit(Fortran::lower::AbstractConverter &converter,
                    mlir::Location loc, mlir::Type arrayTy,
                    const Fortran::evaluate::Constant<T> &constant) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-  mlir::Type eleTy = arrayTy.cast<fir::SequenceType>().getEleTy();
+  mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrayTy).getEleTy();
   llvm::StringRef globalName = converter.getUniqueLitName(
       loc, std::make_unique<Fortran::lower::SomeExpr>(toEvExpr(constant)),
       eleTy);
diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp
index fb7807718ff8..9567685aa3d2 100644
--- a/flang/lib/Lower/ConvertExpr.cpp
+++ b/flang/lib/Lower/ConvertExpr.cpp
@@ -267,7 +267,7 @@ arrayLoadExtValue(fir::FirOpBuilder &builder, mlir::Location loc,
     mlir::Type ty = fir::applyPathToType(arrTy, path);
     if (!ty)
       fir::emitFatalError(loc, "path does not apply to type");
-    if (!ty.isa<fir::SequenceType>()) {
+    if (!mlir::isa<fir::SequenceType>(ty)) {
       if (fir::isa_char(ty)) {
         mlir::Value len = newLen;
         if (!len)
@@ -282,7 +282,7 @@ arrayLoadExtValue(fir::FirOpBuilder &builder, mlir::Location loc,
       }
       return newBase;
     }
-    arrTy = ty.cast<fir::SequenceType>();
+    arrTy = mlir::cast<fir::SequenceType>(ty);
   }
 
   auto arrayToExtendedValue =
@@ -412,15 +412,15 @@ static fir::ExtendedValue genLoad(fir::FirOpBuilder &builder,
   return addr.match(
       [](const fir::CharBoxValue &box) -> fir::ExtendedValue { return box; },
       [&](const fir::PolymorphicValue &p) -> fir::ExtendedValue {
-        if (fir::unwrapRefType(fir::getBase(p).getType())
-                .isa<fir::RecordType>())
+        if (mlir::isa<fir::RecordType>(
+                fir::unwrapRefType(fir::getBase(p).getType())))
           return p;
         mlir::Value load = builder.create<fir::LoadOp>(loc, fir::getBase(p));
         return fir::PolymorphicValue(load, p.getSourceBox());
       },
       [&](const fir::UnboxedValue &v) -> fir::ExtendedValue {
-        if (fir::unwrapRefType(fir::getBase(v).getType())
-                .isa<fir::RecordType>())
+        if (mlir::isa<fir::RecordType>(
+                fir::unwrapRefType(fir::getBase(v).getType())))
           return v;
         return builder.create<fir::LoadOp>(loc, fir::getBase(v));
       },
@@ -536,8 +536,8 @@ static mlir::Value
 createBoxProcCharTuple(Fortran::lower::AbstractConverter &converter,
                        mlir::Type argTy, mlir::Value funcAddr,
                        mlir::Value charLen) {
-  auto boxTy =
-      argTy.cast<mlir::TupleType>().getType(0).cast<fir::BoxProcType>();
+  auto boxTy = mlir::cast<fir::BoxProcType>(
+      mlir::cast<mlir::TupleType>(argTy).getType(0));
   mlir::Location loc = converter.getCurrentLocation();
   auto &builder = converter.getFirOpBuilder();
 
@@ -549,7 +549,7 @@ createBoxProcCharTuple(Fortran::lower::AbstractConverter &converter,
   mlir::Type toTy = boxTy.getEleTy();
   if (fir::isa_ref_type(fromTy))
     funcAddr = builder.createConvert(loc, toTy, funcAddr);
-  else if (fromTy.isa<fir::BoxProcType>())
+  else if (mlir::isa<fir::BoxProcType>(fromTy))
     funcAddr = builder.create<fir::BoxAddrOp>(loc, toTy, funcAddr);
 
   auto boxProc = [&]() -> mlir::Value {
@@ -575,7 +575,7 @@ absentBoxToUnallocatedBox(fir::FirOpBuilder &builder, mlir::Location loc,
                           mlir::Value isPresent) {
   mlir::Value box = fir::getBase(exv);
   mlir::Type boxType = box.getType();
-  assert(boxType.isa<fir::BoxType>() && "argument must be a fir.box");
+  assert(mlir::isa<fir::BoxType>(boxType) && "argument must be a fir.box");
   mlir::Value emptyBox =
       fir::factory::createUnallocatedBox(builder, loc, boxType, std::nullopt);
   auto safeToReadBox =
@@ -915,7 +915,7 @@ public:
     if (inInitializer)
       return Fortran::lower::genInlinedStructureCtorLit(converter, loc, ctor);
     mlir::Type ty = translateSomeExprToFIRType(converter, toEvExpr(ctor));
-    auto recTy = ty.cast<fir::RecordType>();
+    auto recTy = mlir::cast<fir::RecordType>(ty);
     auto fieldTy = fir::FieldType::get(ty.getContext());
     mlir::Value res = builder.createTemporary(loc, recTy);
     mlir::Value box = builder.createBox(loc, fir::ExtendedValue{res});
@@ -1172,8 +1172,8 @@ public:
     if (!charBox)
       fir::emitFatalError(loc, "expected scalar character");
     mlir::Value charAddr = charBox->getAddr();
-    auto charType =
-        fir::unwrapPassByRefType(charAddr.getType()).cast<fir::CharacterType>();
+    auto charType = mlir::cast<fir::CharacterType>(
+        fir::unwrapPassByRefType(charAddr.getType()));
     if (charType.hasConstantLen()) {
       // Erase previous constant length from the base type.
       fir::CharacterType::LenType newLen = fir::CharacterType::unknownLen();
@@ -1441,7 +1441,7 @@ public:
     auto fldTy = fir::FieldType::get(&converter.getMLIRContext());
     // FIXME: need to thread the LEN type parameters here.
     for (const Fortran::evaluate::Component *field : list) {
-      auto recTy = ty.cast<fir::RecordType>();
+      auto recTy = mlir::cast<fir::RecordType>(ty);
       const Fortran::semantics::Symbol &sym = getLastSym(*field);
       std::string name = converter.getRecordTypeFieldName(sym);
       coorArgs.push_back(builder.create<fir::FieldIndexOp>(
@@ -1478,7 +1478,7 @@ public:
   mlir::Type genSubType(mlir::Type arrTy, unsigned dims) {
     mlir::Type unwrapTy = fir::dyn_cast_ptrOrBoxEleTy(arrTy);
     assert(unwrapTy && "must be a pointer or box type");
-    auto seqTy = unwrapTy.cast<fir::SequenceType>();
+    auto seqTy = mlir::cast<fir::SequenceType>(unwrapTy);
     llvm::ArrayRef<int64_t> shape = seqTy.getShape();
     assert(shape.size() > 0 && "removing columns for sequence sans shape");
     assert(dims <= shape.size() && "removing more columns than exist");
@@ -1550,9 +1550,9 @@ public:
     }
 
     mlir::Type eleTy = fir::dyn_cast_ptrOrBoxEleTy(base.getType());
-    if (auto classTy = eleTy.dyn_cast<fir::ClassType>())
+    if (auto classTy = mlir::dyn_cast<fir::ClassType>(eleTy))
       eleTy = classTy.getEleTy();
-    auto seqTy = eleTy.cast<fir::SequenceType>();
+    auto seqTy = mlir::cast<fir::SequenceType>(eleTy);
     assert(args.size() == seqTy.getDimension());
     mlir::Type ty = builder.getRefType(seqTy.getEleTy());
     auto addr = builder.create<fir::CoordinateOp>(loc, ty, base, args);
@@ -1571,7 +1571,7 @@ public:
     mlir::Location loc = getLoc();
     mlir::Value addr = fir::getBase(array);
     mlir::Type arrTy = fir::dyn_cast_ptrEleTy(addr.getType());
-    auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+    auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
     mlir::Type seqTy = builder.getRefType(builder.getVarLenSeqTy(eleTy));
     mlir::Type refTy = builder.getRefType(eleTy);
     mlir::Value base = builder.createConvert(loc, seqTy, addr);
@@ -1656,7 +1656,7 @@ public:
     mlir::Location loc = getLoc();
     mlir::Value addr = fir::getBase(exv);
     mlir::Type arrTy = fir::dyn_cast_ptrOrBoxEleTy(addr.getType());
-    mlir::Type eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+    mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
     mlir::Type refTy = builder.getRefType(eleTy);
     mlir::IndexType idxTy = builder.getIndexType();
     llvm::SmallVector<mlir::Value> arrayCoorArgs;
@@ -1766,8 +1766,9 @@ public:
     mlir::Location loc = getLoc();
     ExtValue exv = genBoxArg(expr);
     auto exvTy = fir::getBase(exv).getType();
-    if (exvTy.isa<mlir::FunctionType>()) {
-      auto boxProcTy = builder.getBoxProcType(exvTy.cast<mlir::FunctionType>());
+    if (mlir::isa<mlir::FunctionType>(exvTy)) {
+      auto boxProcTy =
+          builder.getBoxProcType(mlir::cast<mlir::FunctionType>(exvTy));
       return builder.create<fir::EmboxProcOp>(loc, boxProcTy,
                                               fir::getBase(exv));
     }
@@ -1861,7 +1862,7 @@ public:
           // IS_CONTIGUOUS may require an assumed size TYPE(*) to be passed to
           // the intrinsic library utility as a fir.box.
           if (argRules.lowerAs == fir::LowerIntrinsicArgAs::Box &&
-              !fir::getBase(exv).getType().isa<fir::BaseBoxType>()) {
+              !mlir::isa<fir::BaseBoxType>(fir::getBase(exv).getType())) {
             operands.emplace_back(
                 fir::factory::createBoxValue(builder, loc, exv));
             continue;
@@ -2005,7 +2006,7 @@ public:
         fir::getTypeParams(mold);
     mlir::Value charLen;
     mlir::Type elementType = fir::unwrapSequenceType(type);
-    if (auto charType = elementType.dyn_cast<fir::CharacterType>()) {
+    if (auto charType = mlir::dyn_cast<fir::CharacterType>(elementType)) {
       charLen = allocMemTypeParams.empty()
                     ? fir::factory::readCharLen(builder, loc, mold)
                     : allocMemTypeParams[0];
@@ -2017,7 +2018,7 @@ public:
 
     mlir::Value temp = builder.create<fir::AllocMemOp>(
         loc, type, tempName, allocMemTypeParams, extents);
-    if (fir::unwrapSequenceType(type).isa<fir::CharacterType>())
+    if (mlir::isa<fir::CharacterType>(fir::unwrapSequenceType(type)))
       return fir::CharArrayBoxValue{temp, charLen, extents};
     return fir::ArrayBoxValue{temp, extents};
   }
@@ -2166,7 +2167,7 @@ public:
         // We have to initialize the temp if it may have components
         // that need initialization. If there are no components
         // requiring initialization, then the call is a no-op.
-        if (getElementTypeOf(temp).isa<fir::RecordType>()) {
+        if (mlir::isa<fir::RecordType>(getElementTypeOf(temp))) {
           mlir::Value tempBox = fir::getBase(builder.createBox(loc, temp));
           fir::runtime::genDerivedTypeInitialize(builder, loc, tempBox);
         }
@@ -2312,7 +2313,7 @@ public:
     if (!copyOutPair.restrictCopyAndFreeAtRuntime) {
       doCopyOut();
 
-      if (fir::getElementTypeOf(copyOutPair.temp).isa<fir::RecordType>()) {
+      if (mlir::isa<fir::RecordType>(fir::getElementTypeOf(copyOutPair.temp))) {
         // Destroy components of the temporary (if any).
         // If there are no components requiring destruction, then the call
         // is a no-op.
@@ -2330,7 +2331,8 @@ public:
     builder.genIfThen(loc, *copyOutPair.restrictCopyAndFreeAtRuntime)
         .genThen([&]() {
           doCopyOut();
-          if (fir::getElementTypeOf(copyOutPair.temp).isa<fir::RecordType>()) {
+          if (mlir::isa<fir::RecordType>(
+                  fir::getElementTypeOf(copyOutPair.temp))) {
             // Destroy components of the temporary (if any).
             // If there are no components requiring destruction, then the call
             // is a no-op.
@@ -2381,7 +2383,7 @@ public:
     mlir::Value actualArgBase = fir::getBase(actualArg);
     mlir::Value isPresent = builder.create<fir::IsPresentOp>(
         loc, builder.getI1Type(), actualArgBase);
-    if (!actualArgBase.getType().isa<fir::BoxType>())
+    if (!mlir::isa<fir::BoxType>(actualArgBase.getType()))
       return {actualArg, isPresent};
     ExtValue safeToReadBox =
         absentBoxToUnallocatedBox(builder, loc, actualArg, isPresent);
@@ -2408,7 +2410,7 @@ public:
                                       fir::getAdaptToByRefAttr(builder)});
       return fir::CharBoxValue{temp, len};
     }
-    assert((fir::isa_trivial(type) || type.isa<fir::RecordType>()) &&
+    assert((fir::isa_trivial(type) || mlir::isa<fir::RecordType>(type)) &&
            "must be simple scalar");
     return builder.createTemporary(loc, type,
                                    llvm::ArrayRef<mlir::NamedAttribute>{
@@ -2585,7 +2587,7 @@ public:
           // callee side, and it is illegal to use NULL without a MOLD if any
           // dummy length parameters are assumed.
           mlir::Type boxTy = fir::dyn_cast_ptrEleTy(argTy);
-          assert(boxTy && boxTy.isa<fir::BaseBoxType>() &&
+          assert(boxTy && mlir::isa<fir::BaseBoxType>(boxTy) &&
                  "must be a fir.box type");
           mlir::Value boxStorage = builder.createTemporary(loc, boxTy);
           mlir::Value nullBox = fir::factory::createUnallocatedBox(
@@ -2643,10 +2645,11 @@ public:
                 // If a character procedure was passed instead, handle the
                 // mismatch.
                 auto funcTy =
-                    x.getAddr().getType().dyn_cast<mlir::FunctionType>();
+                    mlir::dyn_cast<mlir::FunctionType>(x.getAddr().getType());
                 if (funcTy && funcTy.getNumResults() == 1 &&
-                    funcTy.getResult(0).isa<fir::BoxCharType>()) {
-                  auto boxTy = funcTy.getResult(0).cast<fir::BoxCharType>();
+                    mlir::isa<fir::BoxCharType>(funcTy.getResult(0))) {
+                  auto boxTy =
+                      mlir::cast<fir::BoxCharType>(funcTy.getResult(0));
                   mlir::Value ref = builder.createConvert(
                       loc, builder.getRefType(boxTy.getEleTy()), x.getAddr());
                   auto len = builder.create<fir::UndefOp>(
@@ -2667,7 +2670,7 @@ public:
                 // free-casting the base address to be a !fir.char reference and
                 // setting the LEN argument to undefined. What could go wrong?
                 auto dataPtr = fir::getBase(x);
-                assert(!dataPtr.getType().template isa<fir::BoxType>());
+                assert(!mlir::isa<fir::BoxType>(dataPtr.getType()));
                 return builder.convertWithSemantics(
                     loc, argTy, dataPtr,
                     /*allowCharacterConversion=*/true);
@@ -2742,7 +2745,7 @@ public:
                 loc,
                 fir::ClassType::get(mlir::NoneType::get(builder.getContext())),
                 box);
-          } else if (box.getType().isa<fir::BoxType>() &&
+          } else if (mlir::isa<fir::BoxType>(box.getType()) &&
                      fir::isPolymorphicType(argTy)) {
             box = builder.create<fir::ReboxOp>(loc, argTy, box, mlir::Value{},
                                                /*slice=*/mlir::Value{});
@@ -2791,7 +2794,7 @@ public:
                       : builder.createBox(getLoc(), genTempExtAddr(*expr),
                                           fir::isPolymorphicType(argTy),
                                           fir::isAssumedType(argTy));
-            if (box.getType().isa<fir::BoxType>() &&
+            if (mlir::isa<fir::BoxType>(box.getType()) &&
                 fir::isPolymorphicType(argTy) && !fir::isAssumedType(argTy)) {
               mlir::Type actualTy = argTy;
               if (Fortran::lower::isParentComponent(*expr))
@@ -3030,10 +3033,11 @@ private:
       Fortran::common::ScopedSet(semant, PushVal);
 
 static bool isAdjustedArrayElementType(mlir::Type t) {
-  return fir::isa_char(t) || fir::isa_derived(t) || t.isa<fir::SequenceType>();
+  return fir::isa_char(t) || fir::isa_derived(t) ||
+         mlir::isa<fir::SequenceType>(t);
 }
 static bool elementTypeWasAdjusted(mlir::Type t) {
-  if (auto ty = t.dyn_cast<fir::ReferenceType>())
+  if (auto ty = mlir::dyn_cast<fir::ReferenceType>(t))
     return isAdjustedArrayElementType(ty.getEleTy());
   return false;
 }
@@ -3050,15 +3054,15 @@ static void genScalarUserDefinedAssignmentCall(fir::FirOpBuilder &builder,
   auto prepareUserDefinedArg =
       [](fir::FirOpBuilder &builder, mlir::Location loc,
          const fir::ExtendedValue &value, mlir::Type argType) -> mlir::Value {
-    if (argType.isa<fir::BoxCharType>()) {
+    if (mlir::isa<fir::BoxCharType>(argType)) {
       const fir::CharBoxValue *charBox = value.getCharBox();
       assert(charBox && "argument type mismatch in elemental user assignment");
       return fir::factory::CharacterExprHelper{builder, loc}.createEmbox(
           *charBox);
     }
-    if (argType.isa<fir::BaseBoxType>()) {
+    if (mlir::isa<fir::BaseBoxType>(argType)) {
       mlir::Value box =
-          builder.createBox(loc, value, argType.isa<fir::ClassType>());
+          builder.createBox(loc, value, mlir::isa<fir::ClassType>(argType));
       return builder.createConvert(loc, argType, box);
     }
     // Simple pass by address.
@@ -3170,7 +3174,7 @@ convertToArrayBoxValue(mlir::Location loc, fir::FirOpBuilder &builder,
                        mlir::Value val, mlir::Value len) {
   mlir::Type ty = fir::unwrapRefType(val.getType());
   mlir::IndexType idxTy = builder.getIndexType();
-  auto seqTy = ty.cast<fir::SequenceType>();
+  auto seqTy = mlir::cast<fir::SequenceType>(ty);
   auto undef = builder.create<fir::UndefOp>(loc, idxTy);
   llvm::SmallVector<mlir::Value> extents(seqTy.getDimension(), undef);
   if (fir::isa_char(seqTy.getEleTy()))
@@ -3462,7 +3466,7 @@ public:
         [&](const auto &e) {
           auto f = genarr(e);
           ExtValue exv = f(IterationSpace{});
-          if (fir::getBase(exv).getType().template isa<fir::BaseBoxType>())
+          if (mlir::isa<fir::BaseBoxType>(fir::getBase(exv).getType()))
             return exv;
           fir::emitFatalError(getLoc(), "array must be emboxed");
         },
@@ -3487,10 +3491,9 @@ public:
                                            tempRes, dest.getSlice(),
                                            dest.getTypeparams());
 
-    auto arrTy =
-        fir::dyn_cast_ptrEleTy(tempRes.getType()).cast<fir::SequenceType>();
-    if (auto charTy =
-            arrTy.getEleTy().template dyn_cast<fir::CharacterType>()) {
+    auto arrTy = mlir::cast<fir::SequenceType>(
+        fir::dyn_cast_ptrEleTy(tempRes.getType()));
+    if (auto charTy = mlir::dyn_cast<fir::CharacterType>(arrTy.getEleTy())) {
       if (fir::characterWithDynamicLen(charTy))
         TODO(loc, "CHARACTER does not have constant LEN");
       mlir::Value len = builder.createIntegerConstant(
@@ -3912,17 +3915,18 @@ private:
   mlir::Value convertElementForUpdate(mlir::Location loc, mlir::Type eleTy,
                                       mlir::Value origVal) {
     if (auto origEleTy = fir::dyn_cast_ptrEleTy(origVal.getType()))
-      if (origEleTy.isa<fir::BaseBoxType>()) {
+      if (mlir::isa<fir::BaseBoxType>(origEleTy)) {
         // If origVal is a box variable, load it so it is in the value domain.
         origVal = builder.create<fir::LoadOp>(loc, origVal);
       }
-    if (origVal.getType().isa<fir::BoxType>() && !eleTy.isa<fir::BoxType>()) {
+    if (mlir::isa<fir::BoxType>(origVal.getType()) &&
+        !mlir::isa<fir::BoxType>(eleTy)) {
       if (isPointerAssignment())
         TODO(loc, "lhs of pointer assignment returned unexpected value");
       TODO(loc, "invalid box conversion in elemental computation");
     }
-    if (isPointerAssignment() && eleTy.isa<fir::BoxType>() &&
-        !origVal.getType().isa<fir::BoxType>()) {
+    if (isPointerAssignment() && mlir::isa<fir::BoxType>(eleTy) &&
+        !mlir::isa<fir::BoxType>(origVal.getType())) {
       // This is a pointer assignment and the rhs is a raw reference to a TARGET
       // in memory. Embox the reference so it can be stored to the boxed
       // POINTER variable.
@@ -3930,7 +3934,7 @@ private:
       if (auto eleTy = fir::dyn_cast_ptrEleTy(origVal.getType());
           fir::hasDynamicSize(eleTy))
         TODO(loc, "TARGET of pointer assignment with runtime size/shape");
-      auto memrefTy = fir::boxMemRefType(eleTy.cast<fir::BoxType>());
+      auto memrefTy = fir::boxMemRefType(mlir::cast<fir::BoxType>(eleTy));
       auto castTo = builder.createConvert(loc, memrefTy, origVal);
       origVal = builder.create<fir::EmboxOp>(loc, eleTy, castTo);
     }
@@ -3982,7 +3986,7 @@ private:
         auto arrayOp = builder.create<fir::ArrayAccessOp>(
             loc, resRefTy, innerArg, iterSpace.iterVec(),
             fir::factory::getTypeParams(loc, builder, destination));
-        if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+        if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
           llvm::SmallVector<mlir::Value> substringBounds;
           populateBounds(substringBounds, substring);
           mlir::Value dstLen = fir::factory::genLenOfCharacter(
@@ -3996,7 +4000,7 @@ private:
               loc, destination, builder, arrayOp, exv, eleTy, innerArg);
           return abstractArrayExtValue(amend /*FIXME: typeparams?*/);
         }
-        assert(eleTy.isa<fir::SequenceType>() && "must be an array");
+        assert(mlir::isa<fir::SequenceType>(eleTy) && "must be an array");
         TODO(loc, "array (as element) assignment");
       }
       // By value semantics. The element is being assigned by value.
@@ -4060,7 +4064,7 @@ private:
   llvm::SmallVector<mlir::Value> getShape(ArrayOperand array) {
     if (array.slice)
       return computeSliceShape(array.slice);
-    if (array.memref.getType().isa<fir::BaseBoxType>())
+    if (mlir::isa<fir::BaseBoxType>(array.memref.getType()))
       return fir::factory::readExtents(builder, getLoc(),
                                        fir::BoxValue{array.memref});
     return fir::factory::getExtents(array.shape);
@@ -4133,7 +4137,7 @@ private:
     mlir::Location loc = getLoc();
     return [=, builder = &converter.getFirOpBuilder()](IterSpace iters) {
       mlir::Type arrTy = fir::dyn_cast_ptrOrBoxEleTy(tmp.getType());
-      auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+      auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
       mlir::Type eleRefTy = builder->getRefType(eleTy);
       mlir::IntegerType i1Ty = builder->getI1Type();
       // Adjust indices for any shift of the origin of the array.
@@ -4442,15 +4446,15 @@ private:
       TODO(loc, "polymorphic array temporary");
     if (ccLoadDest)
       return (*ccLoadDest)(shape);
-    auto seqTy = type.dyn_cast<fir::SequenceType>();
+    auto seqTy = mlir::dyn_cast<fir::SequenceType>(type);
     assert(seqTy && "must be an array");
     // TODO: Need to thread the LEN parameters here. For character, they may
     // differ from the operands length (e.g concatenation). So the array loads
     // type parameters are not enough.
-    if (auto charTy = seqTy.getEleTy().dyn_cast<fir::CharacterType>())
+    if (auto charTy = mlir::dyn_cast<fir::CharacterType>(seqTy.getEleTy()))
       if (charTy.hasDynamicLen())
         TODO(loc, "character array expression temp with dynamic length");
-    if (auto recTy = seqTy.getEleTy().dyn_cast<fir::RecordType>())
+    if (auto recTy = mlir::dyn_cast<fir::RecordType>(seqTy.getEleTy()))
       if (recTy.getNumLenParams() > 0)
         TODO(loc, "derived type array expression temp with LEN parameters");
     if (mlir::Type eleTy = fir::unwrapSequenceType(type);
@@ -4827,7 +4831,7 @@ private:
             });
           } else {
             ExtValue exv = asScalarRef(*expr);
-            if (fir::getBase(exv).getType().isa<fir::BaseBoxType>()) {
+            if (mlir::isa<fir::BaseBoxType>(fir::getBase(exv).getType())) {
               operands.emplace_back(
                   [=](IterSpace iters) -> ExtValue { return exv; });
             } else {
@@ -5565,7 +5569,7 @@ private:
   }
 
   static mlir::Type unwrapBoxEleTy(mlir::Type ty) {
-    if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>())
+    if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty))
       return fir::unwrapRefType(boxTy.getEleTy());
     return ty;
   }
@@ -5575,7 +5579,7 @@ private:
     ty = unwrapBoxEleTy(ty);
     mlir::Location loc = getLoc();
     mlir::IndexType idxTy = builder.getIndexType();
-    for (auto extent : ty.cast<fir::SequenceType>().getShape()) {
+    for (auto extent : mlir::cast<fir::SequenceType>(ty).getShape()) {
       auto v = extent == fir::SequenceType::getUnknownExtent()
                    ? builder.create<fir::UndefOp>(loc, idxTy).getResult()
                    : builder.createIntegerConstant(loc, idxTy, extent);
@@ -5638,7 +5642,8 @@ private:
     mlir::Location loc = getLoc();
     mlir::Value memref = fir::getBase(extMemref);
     mlir::Type arrTy = fir::dyn_cast_ptrOrBoxEleTy(memref.getType());
-    assert(arrTy.isa<fir::SequenceType>() && "memory ref must be an array");
+    assert(mlir::isa<fir::SequenceType>(arrTy) &&
+           "memory ref must be an array");
     mlir::Value shape = builder.createShape(loc, extMemref);
     mlir::Value slice;
     if (components.isSlice()) {
@@ -5688,12 +5693,12 @@ private:
                                     components.suffixComponents);
       }
       if (components.hasComponents()) {
-        auto seqTy = arrTy.cast<fir::SequenceType>();
+        auto seqTy = mlir::cast<fir::SequenceType>(arrTy);
         mlir::Type eleTy =
             fir::applyPathToType(seqTy.getEleTy(), components.suffixComponents);
         if (!eleTy)
           fir::emitFatalError(loc, "slicing path is ill-formed");
-        if (auto realTy = eleTy.dyn_cast<fir::RealType>())
+        if (auto realTy = mlir::dyn_cast<fir::RealType>(eleTy))
           eleTy = Fortran::lower::convertReal(realTy.getContext(),
                                               realTy.getFKind());
 
@@ -5713,13 +5718,14 @@ private:
       // value. The value of the box is forwarded in the continuation.
       mlir::Type reduceTy = reduceRank(arrTy, slice);
       mlir::Type boxTy = fir::BoxType::get(reduceTy);
-      if (memref.getType().isa<fir::ClassType>() && !components.hasComponents())
+      if (mlir::isa<fir::ClassType>(memref.getType()) &&
+          !components.hasComponents())
         boxTy = fir::ClassType::get(reduceTy);
       if (components.substring) {
         // Adjust char length to substring size.
         fir::CharacterType charTy =
             fir::factory::CharacterExprHelper::getCharType(reduceTy);
-        auto seqTy = reduceTy.cast<fir::SequenceType>();
+        auto seqTy = mlir::cast<fir::SequenceType>(reduceTy);
         // TODO: Use a constant for fir.char LEN if we can compute it.
         boxTy = fir::BoxType::get(
             fir::SequenceType::get(fir::CharacterType::getUnknownLen(
@@ -5734,7 +5740,7 @@ private:
         nonDeferredLenParams = fir::factory::getNonDeferredLenParams(extMemref);
       }
       mlir::Value embox =
-          memref.getType().isa<fir::BaseBoxType>()
+          mlir::isa<fir::BaseBoxType>(memref.getType())
               ? builder.create<fir::ReboxOp>(loc, boxTy, memref, shape, slice)
                     .getResult()
               : builder
@@ -5745,7 +5751,7 @@ private:
         return fir::BoxValue(embox, lbounds, nonDeferredLenParams);
       };
     }
-    auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+    auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
     if (isReferentiallyOpaque()) {
       // Semantics are an opaque reference to an array.
       // This case forwards a continuation that will generate the address
@@ -5760,12 +5766,12 @@ private:
         mlir::Value coor = builder.create<fir::ArrayCoorOp>(
             loc, refEleTy, memref, shape, slice, indices,
             fir::getTypeParams(extMemref));
-        if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+        if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
           llvm::SmallVector<mlir::Value> substringBounds;
           populateBounds(substringBounds, components.substring);
           if (!substringBounds.empty()) {
             mlir::Value dstLen = fir::factory::genLenOfCharacter(
-                builder, loc, arrTy.cast<fir::SequenceType>(), memref,
+                builder, loc, mlir::cast<fir::SequenceType>(arrTy), memref,
                 fir::getTypeParams(extMemref), iters.iterVec(),
                 substringBounds);
             fir::CharBoxValue dstChar(coor, dstLen);
@@ -5863,7 +5869,7 @@ private:
         mlir::Type eleRefTy = builder.getRefType(eleTy);
         mlir::Value arrayOp = builder.create<fir::ArrayAccessOp>(
             loc, eleRefTy, arrLd, iters.iterVec(), arrLdTypeParams);
-        if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+        if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
           llvm::SmallVector<mlir::Value> substringBounds;
           populateBounds(substringBounds, components.substring);
           if (!substringBounds.empty()) {
@@ -5896,7 +5902,7 @@ private:
     const bool hasOptionalAttr =
         fir::valueHasFirAttribute(base, fir::getOptionalAttrName());
     mlir::Type baseType = fir::unwrapRefType(base.getType());
-    const bool isBox = baseType.isa<fir::BoxType>();
+    const bool isBox = mlir::isa<fir::BoxType>(baseType);
     const bool isAllocOrPtr =
         Fortran::evaluate::IsAllocatableOrPointerObject(expr);
     mlir::Type arrType = fir::unwrapPassByRefType(baseType);
@@ -5989,7 +5995,7 @@ private:
     if (slice) {
       auto slOp = mlir::dyn_cast<fir::SliceOp>(slice.getDefiningOp());
       assert(slOp && "expected slice op");
-      auto seqTy = arrTy.dyn_cast<fir::SequenceType>();
+      auto seqTy = mlir::dyn_cast<fir::SequenceType>(arrTy);
       assert(seqTy && "expected array type");
       mlir::Operation::operand_range triples = slOp.getTriples();
       fir::SequenceType::Shape shape;
@@ -6053,7 +6059,7 @@ private:
     mlir::IndexType idxTy = builder.getIndexType();
     mlir::Value multiplier = builder.createIntegerConstant(loc, idxTy, 1);
     if (fir::hasDynamicSize(eleTy)) {
-      if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+      if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
         // Array of char with dynamic LEN parameter. Downcast to an array
         // of singleton char, and scale by the len type parameter from
         // `exv`.
@@ -6074,7 +6080,7 @@ private:
             });
         fir::CharacterType newEleTy = fir::CharacterType::getSingleton(
             eleTy.getContext(), charTy.getFKind());
-        if (auto seqTy = resTy.dyn_cast<fir::SequenceType>()) {
+        if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(resTy)) {
           assert(eleTy == seqTy.getEleTy());
           resTy = fir::SequenceType::get(seqTy.getShape(), newEleTy);
         }
@@ -6161,7 +6167,7 @@ private:
     if (!eleSz) {
       // Compute the element size at runtime.
       assert(fir::hasDynamicSize(eleTy));
-      if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+      if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
         auto charBytes =
             builder.getKindMap().getCharacterBitsize(charTy.getFKind()) / 8;
         mlir::Value bytes =
@@ -6181,7 +6187,7 @@ private:
     auto computeCoordinate = [&](mlir::Value buff, mlir::Value off) {
       mlir::Type refTy = eleRefTy;
       if (fir::hasDynamicSize(eleTy)) {
-        if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+        if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
           // Scale a simple pointer using dynamic length and offset values.
           auto chTy = fir::CharacterType::getSingleton(charTy.getContext(),
                                                        charTy.getFKind());
@@ -6308,7 +6314,7 @@ private:
         builder.createConvert(loc, idxTy, fir::getBase(asScalar(x.upper())));
     mlir::Value step =
         builder.createConvert(loc, idxTy, fir::getBase(asScalar(x.stride())));
-    auto seqTy = resTy.template cast<fir::SequenceType>();
+    auto seqTy = mlir::cast<fir::SequenceType>(resTy);
     mlir::Type eleTy = fir::unwrapSequenceType(seqTy);
     auto loop =
         builder.create<fir::DoLoopOp>(loc, lo, up, step, /*unordered=*/false,
@@ -6375,7 +6381,7 @@ private:
     auto evExpr = toEvExpr(x);
     mlir::Type resTy = translateSomeExprToFIRType(converter, evExpr);
     mlir::IndexType idxTy = builder.getIndexType();
-    auto seqTy = resTy.template cast<fir::SequenceType>();
+    auto seqTy = mlir::cast<fir::SequenceType>(resTy);
     mlir::Type eleTy = fir::unwrapSequenceType(resTy);
     mlir::Value buffSize = builder.createTemporary(loc, idxTy, ".buff.size");
     mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0);
@@ -6719,7 +6725,7 @@ private:
                 auto fieldTy = fir::FieldType::get(builder.getContext());
                 std::string name =
                     converter.getRecordTypeFieldName(getLastSym(*x));
-                if (auto recTy = ty.dyn_cast<fir::RecordType>()) {
+                if (auto recTy = mlir::dyn_cast<fir::RecordType>(ty)) {
                   ty = recTy.getType(name);
                   auto fld = builder.create<fir::FieldIndexOp>(
                       loc, fieldTy, name, recTy, fir::getTypeParams(arrayExv));
@@ -6728,7 +6734,7 @@ private:
                     // Need an intermediate  dereference if the boxed value
                     // appears in the middle of the component path or if it is
                     // on the right and this is not a pointer assignment.
-                    if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>()) {
+                    if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
                       auto currentFunc = components.getExtendCoorRef();
                       auto loc = getLoc();
                       auto *bldr = &converter.getFirOpBuilder();
@@ -6739,9 +6745,9 @@ private:
                       deref = true;
                     }
                   }
-                } else if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>()) {
+                } else if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
                   ty = fir::unwrapRefType(boxTy.getEleTy());
-                  auto recTy = ty.cast<fir::RecordType>();
+                  auto recTy = mlir::cast<fir::RecordType>(ty);
                   ty = recTy.getType(name);
                   auto fld = builder.create<fir::FieldIndexOp>(
                       loc, fieldTy, name, recTy, fir::getTypeParams(arrayExv));
@@ -6790,7 +6796,7 @@ private:
           auto arrayOp = builder.create<fir::ArrayAccessOp>(
               loc, eleRefTy, innerArg, iters.iterVec(),
               fir::factory::getTypeParams(loc, builder, load));
-          if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+          if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
             mlir::Value dstLen = fir::factory::genLenOfCharacter(
                 builder, loc, load, iters.iterVec(), substringBounds);
             fir::ArrayAmendOp amend = createCharArrayAmend(
@@ -6806,13 +6812,13 @@ private:
             return arrayLoadExtValue(builder, loc, load, iters.iterVec(),
                                      amend);
           }
-          assert(eleTy.isa<fir::SequenceType>());
+          assert(mlir::isa<fir::SequenceType>(eleTy));
           TODO(loc, "array (as element) assignment");
         }
         if (components.hasExtendCoorRef()) {
           auto eleBoxTy =
               fir::applyPathToType(innerArg.getType(), iters.iterVec());
-          if (!eleBoxTy || !eleBoxTy.isa<fir::BoxType>())
+          if (!eleBoxTy || !mlir::isa<fir::BoxType>(eleBoxTy))
             TODO(loc, "assignment in a FORALL involving a designator with a "
                       "POINTER or ALLOCATABLE component part-ref");
           auto arrayOp = builder.create<fir::ArrayAccessOp>(
@@ -6824,7 +6830,7 @@ private:
           // assignment, then insert the dereference of the box before any
           // conversion and store.
           if (!isPointerAssignment()) {
-            if (auto boxTy = eleTy.dyn_cast<fir::BaseBoxType>()) {
+            if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(eleTy)) {
               eleTy = fir::boxMemRefType(boxTy);
               addr = builder.create<fir::BoxAddrOp>(loc, eleTy, addr);
               eleTy = fir::unwrapRefType(eleTy);
@@ -6885,7 +6891,7 @@ private:
       }
       if (components.hasExtendCoorRef()) {
         auto eleBoxTy = fir::applyPathToType(load.getType(), iters.iterVec());
-        if (!eleBoxTy || !eleBoxTy.isa<fir::BoxType>())
+        if (!eleBoxTy || !mlir::isa<fir::BoxType>(eleBoxTy))
           TODO(loc, "assignment in a FORALL involving a designator with a "
                     "POINTER or ALLOCATABLE component part-ref");
         auto access = builder.create<fir::ArrayAccessOp>(
@@ -6897,7 +6903,7 @@ private:
       }
       if (isPointerAssignment()) {
         auto eleTy = fir::applyPathToType(load.getType(), iters.iterVec());
-        if (!eleTy.isa<fir::BoxType>()) {
+        if (!mlir::isa<fir::BoxType>(eleTy)) {
           // Rhs is a regular expression that will need to be boxed before
           // assigning to the boxed variable.
           auto typeParams = fir::factory::getTypeParams(loc, builder, load);
@@ -7615,7 +7621,7 @@ mlir::Value Fortran::lower::addCrayPointerInst(mlir::Location loc,
   auto box = builder.create<fir::EmboxOp>(loc, boxTy, ptrVal, empty, empty,
                                           emptyRange);
   mlir::Value addrof =
-      (ptrTy.isa<fir::ReferenceType>())
+      (mlir::isa<fir::ReferenceType>(ptrTy))
           ? builder.create<fir::BoxAddrOp>(loc, ptrTy, box)
           : builder.create<fir::BoxAddrOp>(loc, builder.getRefType(ptrTy), box);
 
diff --git a/flang/lib/Lower/ConvertExprToHLFIR.cpp b/flang/lib/Lower/ConvertExprToHLFIR.cpp
index 6e57b31d022b..93bdf650f9ff 100644
--- a/flang/lib/Lower/ConvertExprToHLFIR.cpp
+++ b/flang/lib/Lower/ConvertExprToHLFIR.cpp
@@ -138,8 +138,8 @@ public:
     mlir::Location loc = getLoc();
     mlir::Type idxTy = builder.getIndexType();
     llvm::SmallVector<mlir::Value> extents;
-    auto seqTy = hlfir::getFortranElementOrSequenceType(fieldType)
-                     .cast<fir::SequenceType>();
+    auto seqTy = mlir::cast<fir::SequenceType>(
+        hlfir::getFortranElementOrSequenceType(fieldType));
     for (auto extent : seqTy.getShape()) {
       if (extent == fir::SequenceType::getUnknownExtent()) {
         // We have already generated invalid hlfir.declare
@@ -199,7 +199,7 @@ private:
                                    const T &designatorNode) {
     // Get base's shape if its a sequence type with no previously computed
     // result shape
-    if (partInfo.base && resultValueType.isa<fir::SequenceType>() &&
+    if (partInfo.base && mlir::isa<fir::SequenceType>(resultValueType) &&
         !partInfo.resultShape)
       partInfo.resultShape =
           hlfir::genShape(getLoc(), getBuilder(), *partInfo.base);
@@ -209,7 +209,7 @@ private:
       return fir::ClassType::get(resultValueType);
     // Character scalar with dynamic length needs a fir.boxchar to hold the
     // designator length.
-    auto charType = resultValueType.dyn_cast<fir::CharacterType>();
+    auto charType = mlir::dyn_cast<fir::CharacterType>(resultValueType);
     if (charType && charType.hasDynamicLen())
       return fir::BoxCharType::get(charType.getContext(), charType.getFKind());
     // Arrays with non default lower bounds or dynamic length or dynamic extent
@@ -218,7 +218,7 @@ private:
         hasNonDefaultLowerBounds(partInfo))
       return fir::BoxType::get(resultValueType);
     // Non simply contiguous ref require a fir.box to carry the byte stride.
-    if (resultValueType.isa<fir::SequenceType>() &&
+    if (mlir::isa<fir::SequenceType>(resultValueType) &&
         !Fortran::evaluate::IsSimplyContiguous(
             designatorNode, getConverter().getFoldingContext()))
       return fir::BoxType::get(resultValueType);
@@ -398,8 +398,8 @@ private:
       partInfo.typeParams[0] =
           fir::factory::genMaxWithZero(builder, loc, rawLen);
     }
-    auto kind = hlfir::getFortranElementType(baseStringType)
-                    .cast<fir::CharacterType>()
+    auto kind = mlir::cast<fir::CharacterType>(
+                    hlfir::getFortranElementType(baseStringType))
                     .getFKind();
     auto newCharTy = fir::CharacterType::get(
         baseStringType.getContext(), kind,
@@ -579,7 +579,7 @@ private:
       return createVectorSubscriptElementAddrOp(partInfo, baseType,
                                                 resultExtents);
 
-    mlir::Type resultType = baseType.cast<fir::SequenceType>().getEleTy();
+    mlir::Type resultType = mlir::cast<fir::SequenceType>(baseType).getEleTy();
     if (!resultTypeShape.empty()) {
       // Ranked array section. The result shape comes from the array section
       // subscripts.
@@ -612,8 +612,8 @@ private:
   }
   static bool hasNonDefaultLowerBounds(const PartInfo &partInfo) {
     return partInfo.resultShape &&
-           (partInfo.resultShape.getType().isa<fir::ShiftType>() ||
-            partInfo.resultShape.getType().isa<fir::ShapeShiftType>());
+           mlir::isa<fir::ShiftType, fir::ShapeShiftType>(
+               partInfo.resultShape.getType());
   }
 
   mlir::Type visit(const Fortran::evaluate::Component &component,
@@ -705,7 +705,7 @@ private:
     const Fortran::semantics::Symbol &componentSym = component.GetLastSymbol();
     partInfo.componentName = converter.getRecordTypeFieldName(componentSym);
     auto recordType =
-        hlfir::getFortranElementType(baseType).cast<fir::RecordType>();
+        mlir::cast<fir::RecordType>(hlfir::getFortranElementType(baseType));
     if (recordType.isDependentType())
       TODO(getLoc(), "Designate derived type with length parameters in HLFIR");
     mlir::Type fieldType = recordType.getType(partInfo.componentName);
@@ -718,7 +718,7 @@ private:
     if (fir::isRecordWithTypeParameters(fieldEleType))
       TODO(loc,
            "lower a component that is a parameterized derived type to HLFIR");
-    if (auto charTy = fieldEleType.dyn_cast<fir::CharacterType>()) {
+    if (auto charTy = mlir::dyn_cast<fir::CharacterType>(fieldEleType)) {
       mlir::Location loc = getLoc();
       mlir::Type idxTy = builder.getIndexType();
       if (charTy.hasConstantLen())
@@ -811,7 +811,7 @@ private:
       }
     }
     builder.setInsertionPoint(elementalAddrOp);
-    return baseType.cast<fir::SequenceType>().getEleTy();
+    return mlir::cast<fir::SequenceType>(baseType).getEleTy();
   }
 
   /// Yield the designator for the final part-ref inside the
@@ -1665,7 +1665,7 @@ private:
     mlir::Location loc = getLoc();
     fir::FirOpBuilder &builder = getBuilder();
     mlir::Type ty = translateSomeExprToFIRType(converter, toEvExpr(ctor));
-    auto recTy = ty.cast<fir::RecordType>();
+    auto recTy = mlir::cast<fir::RecordType>(ty);
 
     if (recTy.isDependentType())
       TODO(loc, "structure constructor for derived type with length parameters "
diff --git a/flang/lib/Lower/ConvertProcedureDesignator.cpp b/flang/lib/Lower/ConvertProcedureDesignator.cpp
index 2446be3a1908..aa0d7ce54788 100644
--- a/flang/lib/Lower/ConvertProcedureDesignator.cpp
+++ b/flang/lib/Lower/ConvertProcedureDesignator.cpp
@@ -107,11 +107,11 @@ static hlfir::EntityWithAttributes designateProcedurePointerComponent(
                                                 procComponentSym);
   /// Passed argument may be a descriptor. This is a scalar reference, so the
   /// base address can be directly addressed.
-  if (base.getType().isa<fir::BaseBoxType>())
+  if (mlir::isa<fir::BaseBoxType>(base.getType()))
     base = builder.create<fir::BoxAddrOp>(loc, base);
   std::string fieldName = converter.getRecordTypeFieldName(procComponentSym);
   auto recordType =
-      hlfir::getFortranElementType(base.getType()).cast<fir::RecordType>();
+      mlir::cast<fir::RecordType>(hlfir::getFortranElementType(base.getType()));
   mlir::Type fieldType = recordType.getType(fieldName);
   // Note: semantics turns x%p() into x%t%p() when the procedure pointer
   // component is part of parent component t.
@@ -164,7 +164,7 @@ hlfir::EntityWithAttributes Fortran::lower::convertProcedureDesignatorToHLFIR(
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
 
   mlir::Value funcAddr = fir::getBase(procExv);
-  if (!funcAddr.getType().isa<fir::BoxProcType>()) {
+  if (!mlir::isa<fir::BoxProcType>(funcAddr.getType())) {
     mlir::Type boxTy =
         Fortran::lower::getUntypedBoxProcType(&converter.getMLIRContext());
     if (auto host = Fortran::lower::argumentHostAssocs(converter, funcAddr))
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 21db0cac11bf..413563fe95ca 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -389,13 +389,13 @@ static mlir::Value genDefaultInitializerValue(
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   mlir::Type scalarType = symTy;
   fir::SequenceType sequenceType;
-  if (auto ty = symTy.dyn_cast<fir::SequenceType>()) {
+  if (auto ty = mlir::dyn_cast<fir::SequenceType>(symTy)) {
     sequenceType = ty;
     scalarType = ty.getEleTy();
   }
   // Build a scalar default value of the symbol type, looping through the
   // components to build each component initial value.
-  auto recTy = scalarType.cast<fir::RecordType>();
+  auto recTy = mlir::cast<fir::RecordType>(scalarType);
   mlir::Value initialValue = builder.create<fir::UndefOp>(loc, scalarType);
   const Fortran::semantics::DeclTypeSpec *declTy = sym.GetType();
   assert(declTy && "var with default initialization must have a type");
@@ -493,11 +493,11 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
   // with a tensor mlir type. This optimization currently only supports
   // Fortran arrays of integer, real, complex, or logical. The tensor
   // type does not support nested structures.
-  if (symTy.isa<fir::SequenceType>() &&
+  if (mlir::isa<fir::SequenceType>(symTy) &&
       !Fortran::semantics::IsAllocatableOrPointer(sym)) {
-    mlir::Type eleTy = symTy.cast<fir::SequenceType>().getEleTy();
-    if (eleTy.isa<mlir::IntegerType, mlir::FloatType, fir::ComplexType,
-                  fir::LogicalType>()) {
+    mlir::Type eleTy = mlir::cast<fir::SequenceType>(symTy).getEleTy();
+    if (mlir::isa<mlir::IntegerType, mlir::FloatType, fir::ComplexType,
+                  fir::LogicalType>(eleTy)) {
       const auto *details =
           sym.detailsIf<Fortran::semantics::ObjectEntityDetails>();
       if (details->init()) {
@@ -1292,7 +1292,7 @@ static void finalizeCommonBlockDefinition(
     fir::GlobalOp global,
     const Fortran::semantics::MutableSymbolVector &cmnBlkMems) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-  mlir::TupleType commonTy = global.getType().cast<mlir::TupleType>();
+  mlir::TupleType commonTy = mlir::cast<mlir::TupleType>(global.getType());
   auto initFunc = [&](fir::FirOpBuilder &builder) {
     mlir::IndexType idxTy = builder.getIndexType();
     mlir::Value cb = builder.create<fir::ZeroOp>(loc, commonTy);
@@ -1407,7 +1407,7 @@ static bool lowerToBoxValue(const Fortran::semantics::Symbol &sym,
                             mlir::Value dummyArg,
                             Fortran::lower::AbstractConverter &converter) {
   // Only dummy arguments coming as fir.box can be tracked in an BoxValue.
-  if (!dummyArg || !dummyArg.getType().isa<fir::BaseBoxType>())
+  if (!dummyArg || !mlir::isa<fir::BaseBoxType>(dummyArg.getType()))
     return false;
   // Non contiguous arrays must be tracked in an BoxValue.
   if (sym.Rank() > 0 && !Fortran::evaluate::IsSimplyContiguous(
@@ -1905,7 +1905,7 @@ void Fortran::lower::mapSymbolAttributes(
           // Do not keep scalar characters as fir.box (even when optional).
           // Lowering and FIR is not meant to deal with scalar characters as
           // fir.box outside of calls.
-          auto boxTy = dummyArg.getType().dyn_cast<fir::BaseBoxType>();
+          auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(dummyArg.getType());
           mlir::Type refTy = builder.getRefType(boxTy.getEleTy());
           mlir::Type lenType = builder.getCharacterLengthType();
           mlir::Value addr, len;
@@ -1984,8 +1984,8 @@ void Fortran::lower::mapSymbolAttributes(
       // a non pointer/allocatable symbol to be mapped to a MutableBox.
       mlir::Type ty = converter.genType(var);
       bool isPolymorphic = false;
-      if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>()) {
-        isPolymorphic = ty.isa<fir::ClassType>();
+      if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
+        isPolymorphic = mlir::isa<fir::ClassType>(ty);
         ty = boxTy.getEleTy();
       }
       Fortran::lower::genDeclareSymbol(
@@ -2092,7 +2092,7 @@ void Fortran::lower::mapSymbolAttributes(
   mlir::Value addr = preAlloc;
 
   if (arg)
-    if (auto boxTy = arg.getType().dyn_cast<fir::BaseBoxType>()) {
+    if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(arg.getType())) {
       // Contiguous assumed shape that can be tracked without a fir.box.
       mlir::Type refTy = builder.getRefType(boxTy.getEleTy());
       addr = builder.create<fir::BoxAddrOp>(loc, refTy, arg);
@@ -2134,7 +2134,7 @@ void Fortran::lower::mapSymbolAttributes(
       } else if (!len) {
         // Assumed length fir.box (possible for contiguous assumed shapes).
         // Read length from box.
-        assert(arg && arg.getType().isa<fir::BoxType>() &&
+        assert(arg && mlir::isa<fir::BoxType>(arg.getType()) &&
                "must be character dummy fir.box");
         len = charHelp.readLengthFromBox(arg);
       }
diff --git a/flang/lib/Lower/CustomIntrinsicCall.cpp b/flang/lib/Lower/CustomIntrinsicCall.cpp
index 439fc3d915b4..30c6ce7f53b3 100644
--- a/flang/lib/Lower/CustomIntrinsicCall.cpp
+++ b/flang/lib/Lower/CustomIntrinsicCall.cpp
@@ -227,22 +227,23 @@ lowerIshftc(fir::FirOpBuilder &builder, mlir::Location loc,
   args.push_back(getOperand(1, loadOperand));
   auto iPC = isPresentCheck(2);
   assert(iPC.has_value());
-  args.push_back(builder
-                     .genIfOp(loc, {resultType}, *iPC,
-                              /*withElseRegion=*/true)
-                     .genThen([&]() {
-                       fir::ExtendedValue sizeExv = getOperand(2, loadOperand);
-                       mlir::Value size = builder.createConvert(
-                           loc, resultType, fir::getBase(sizeExv));
-                       builder.create<fir::ResultOp>(loc, size);
-                     })
-                     .genElse([&]() {
-                       mlir::Value bitSize = builder.createIntegerConstant(
-                           loc, resultType,
-                           resultType.cast<mlir::IntegerType>().getWidth());
-                       builder.create<fir::ResultOp>(loc, bitSize);
-                     })
-                     .getResults()[0]);
+  args.push_back(
+      builder
+          .genIfOp(loc, {resultType}, *iPC,
+                   /*withElseRegion=*/true)
+          .genThen([&]() {
+            fir::ExtendedValue sizeExv = getOperand(2, loadOperand);
+            mlir::Value size =
+                builder.createConvert(loc, resultType, fir::getBase(sizeExv));
+            builder.create<fir::ResultOp>(loc, size);
+          })
+          .genElse([&]() {
+            mlir::Value bitSize = builder.createIntegerConstant(
+                loc, resultType,
+                mlir::cast<mlir::IntegerType>(resultType).getWidth());
+            builder.create<fir::ResultOp>(loc, bitSize);
+          })
+          .getResults()[0]);
   return genIntrinsicCall(builder, loc, name, resultType, args, stmtCtx);
 }
 
@@ -282,7 +283,7 @@ lowerAssociated(fir::FirOpBuilder &builder, mlir::Location loc,
       builder.create<fir::IsPresentOp>(loc, builder.getI1Type(), targetBase);
   mlir::Type targetType = fir::unwrapRefType(targetBase.getType());
   mlir::Type targetValueType = fir::unwrapPassByRefType(targetType);
-  mlir::Type boxType = targetType.isa<fir::BaseBoxType>()
+  mlir::Type boxType = mlir::isa<fir::BaseBoxType>(targetType)
                            ? targetType
                            : fir::BoxType::get(targetValueType);
   fir::BoxValue targetBox =
diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h
index 3ebf3fd965da..42bd3868196b 100644
--- a/flang/lib/Lower/DirectivesCommon.h
+++ b/flang/lib/Lower/DirectivesCommon.h
@@ -642,14 +642,14 @@ getDataOperandBaseAddr(Fortran::lower::AbstractConverter &converter,
     isPresent =
         builder.create<fir::IsPresentOp>(loc, builder.getI1Type(), rawInput);
 
-  if (auto boxTy =
-          fir::unwrapRefType(symAddr.getType()).dyn_cast<fir::BaseBoxType>()) {
-    if (boxTy.getEleTy().isa<fir::RecordType>())
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(
+          fir::unwrapRefType(symAddr.getType()))) {
+    if (mlir::isa<fir::RecordType>(boxTy.getEleTy()))
       TODO(loc, "derived type");
 
     // Load the box when baseAddr is a `fir.ref<fir.box<T>>` or a
     // `fir.ref<fir.class<T>>` type.
-    if (symAddr.getType().isa<fir::ReferenceType>()) {
+    if (mlir::isa<fir::ReferenceType>(symAddr.getType())) {
       if (Fortran::semantics::IsOptional(sym)) {
         mlir::Value addr =
             builder.genIfOp(loc, {boxTy}, isPresent, /*withElseRegion=*/true)
@@ -722,7 +722,7 @@ genBoundsOpsFromBox(fir::FirOpBuilder &builder, mlir::Location loc,
   mlir::Type idxTy = builder.getIndexType();
   mlir::Type boundTy = builder.getType<BoundsType>();
 
-  assert(info.addr.getType().isa<fir::BaseBoxType>() &&
+  assert(mlir::isa<fir::BaseBoxType>(info.addr.getType()) &&
          "expect fir.box or fir.class");
 
   if (info.isPresent) {
@@ -909,7 +909,8 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc,
       mlir::Value stride = one;
       bool strideInBytes = false;
 
-      if (fir::unwrapRefType(info.addr.getType()).isa<fir::BaseBoxType>()) {
+      if (mlir::isa<fir::BaseBoxType>(
+              fir::unwrapRefType(info.addr.getType()))) {
         if (info.isPresent) {
           stride =
               builder
@@ -1020,8 +1021,8 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc,
           }
         }
 
-        if (info.isPresent &&
-            fir::unwrapRefType(info.addr.getType()).isa<fir::BaseBoxType>()) {
+        if (info.isPresent && mlir::isa<fir::BaseBoxType>(
+                                  fir::unwrapRefType(info.addr.getType()))) {
           extent =
               builder
                   .genIfOp(loc, idxTy, info.isPresent, /*withElseRegion=*/true)
@@ -1157,7 +1158,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds(
         converter.genExprAddr(operandLocation, designator, stmtCtx);
     info.addr = fir::getBase(compExv);
     info.rawInput = info.addr;
-    if (fir::unwrapRefType(info.addr.getType()).isa<fir::SequenceType>())
+    if (mlir::isa<fir::SequenceType>(fir::unwrapRefType(info.addr.getType())))
       bounds = genBaseBoundsOps<BoundsOp, BoundsType>(builder, operandLocation,
                                                       converter, compExv,
                                                       /*isAssumedSize=*/false);
@@ -1199,13 +1200,14 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds(
       fir::ExtendedValue dataExv = converter.getSymbolExtendedValue(*symRef);
       info =
           getDataOperandBaseAddr(converter, builder, *symRef, operandLocation);
-      if (fir::unwrapRefType(info.addr.getType()).isa<fir::BaseBoxType>()) {
+      if (mlir::isa<fir::BaseBoxType>(
+              fir::unwrapRefType(info.addr.getType()))) {
         bounds = genBoundsOpsFromBox<BoundsOp, BoundsType>(
             builder, operandLocation, converter, dataExv, info);
       }
       bool dataExvIsAssumedSize =
           Fortran::semantics::IsAssumedSizeArray(symRef->get().GetUltimate());
-      if (fir::unwrapRefType(info.addr.getType()).isa<fir::SequenceType>())
+      if (mlir::isa<fir::SequenceType>(fir::unwrapRefType(info.addr.getType())))
         bounds = genBaseBoundsOps<BoundsOp, BoundsType>(
             builder, operandLocation, converter, dataExv, dataExvIsAssumedSize);
       asFortran << symRef->get().name().ToString();
diff --git a/flang/lib/Lower/HlfirIntrinsics.cpp b/flang/lib/Lower/HlfirIntrinsics.cpp
index bda04fa9689b..310b62697f71 100644
--- a/flang/lib/Lower/HlfirIntrinsics.cpp
+++ b/flang/lib/Lower/HlfirIntrinsics.cpp
@@ -265,7 +265,7 @@ HlfirTransformationalIntrinsic::computeResultType(mlir::Value argArray,
                                                   mlir::Type stmtResultType) {
   mlir::Type normalisedResult =
       hlfir::getFortranElementOrSequenceType(stmtResultType);
-  if (auto array = normalisedResult.dyn_cast<fir::SequenceType>()) {
+  if (auto array = mlir::dyn_cast<fir::SequenceType>(normalisedResult)) {
     hlfir::ExprType::Shape resultShape =
         hlfir::ExprType::Shape{array.getShape()};
     mlir::Type elementType = array.getEleTy();
@@ -341,7 +341,7 @@ mlir::Value HlfirTransposeLowering::lowerImpl(
   hlfir::ExprType::Shape resultShape;
   mlir::Type normalisedResult =
       hlfir::getFortranElementOrSequenceType(stmtResultType);
-  auto array = normalisedResult.cast<fir::SequenceType>();
+  auto array = mlir::cast<fir::SequenceType>(normalisedResult);
   llvm::ArrayRef<int64_t> arrayShape = array.getShape();
   assert(arrayShape.size() == 2 && "arguments to transpose have a rank of 2");
   mlir::Type elementType = array.getEleTy();
diff --git a/flang/lib/Lower/HostAssociations.cpp b/flang/lib/Lower/HostAssociations.cpp
index 2e2656356719..75a5bed56655 100644
--- a/flang/lib/Lower/HostAssociations.cpp
+++ b/flang/lib/Lower/HostAssociations.cpp
@@ -219,7 +219,7 @@ public:
   static mlir::Type getType(Fortran::lower::AbstractConverter &converter,
                             const Fortran::semantics::Symbol &sym) {
     fir::KindTy kind =
-        converter.genType(sym).cast<fir::CharacterType>().getFKind();
+        mlir::cast<fir::CharacterType>(converter.genType(sym)).getFKind();
     return fir::BoxCharType::get(&converter.getMLIRContext(), kind);
   }
 
@@ -293,7 +293,7 @@ public:
     mlir::Location loc = args.loc;
     mlir::Value box = args.valueInTuple;
     if (Fortran::semantics::IsOptional(sym)) {
-      auto boxTy = box.getType().cast<fir::BaseBoxType>();
+      auto boxTy = mlir::cast<fir::BaseBoxType>(box.getType());
       auto eleTy = boxTy.getEleTy();
       if (!fir::isa_ref_type(eleTy))
         eleTy = builder.getRefType(eleTy);
@@ -381,8 +381,8 @@ public:
                             const Fortran::semantics::Symbol &sym) {
     mlir::Type type = converter.genType(sym);
     bool isPolymorphic = Fortran::semantics::IsPolymorphic(sym);
-    assert((type.isa<fir::SequenceType>() ||
-            (isPolymorphic && type.isa<fir::ClassType>())) &&
+    assert((mlir::isa<fir::SequenceType>(type) ||
+            (isPolymorphic && mlir::isa<fir::ClassType>(type))) &&
            "must be a sequence type");
     if (isPolymorphic)
       return type;
@@ -459,7 +459,7 @@ public:
       // (absent boxes are null descriptor addresses, not descriptors containing
       // a null base address).
       if (Fortran::semantics::IsOptional(sym)) {
-        auto boxTy = box.getType().cast<fir::BaseBoxType>();
+        auto boxTy = mlir::cast<fir::BaseBoxType>(box.getType());
         auto eleTy = boxTy.getEleTy();
         if (!fir::isa_ref_type(eleTy))
           eleTy = builder.getRefType(eleTy);
@@ -527,7 +527,7 @@ walkCaptureCategories(T visitor, Fortran::lower::AbstractConverter &converter,
 // `t` should be the result of getArgumentType, which has a type of
 // `!fir.ref<tuple<...>>`.
 static mlir::TupleType unwrapTupleTy(mlir::Type t) {
-  return fir::dyn_cast_ptrEleTy(t).cast<mlir::TupleType>();
+  return mlir::cast<mlir::TupleType>(fir::dyn_cast_ptrEleTy(t));
 }
 
 static mlir::Value genTupleCoor(fir::FirOpBuilder &builder, mlir::Location loc,
@@ -535,7 +535,7 @@ static mlir::Value genTupleCoor(fir::FirOpBuilder &builder, mlir::Location loc,
                                 mlir::Value offset) {
   // fir.ref<fir.ref> and fir.ptr<fir.ref> are forbidden. Use
   // fir.llvm_ptr if needed.
-  auto ty = varTy.isa<fir::ReferenceType>()
+  auto ty = mlir::isa<fir::ReferenceType>(varTy)
                 ? mlir::Type(fir::LLVMPointerType::get(varTy))
                 : mlir::Type(builder.getRefType(varTy));
   return builder.create<fir::CoordinateOp>(loc, ty, tupleArg, offset);
diff --git a/flang/lib/Lower/IO.cpp b/flang/lib/Lower/IO.cpp
index ac82276bcddb..ed0afad9197d 100644
--- a/flang/lib/Lower/IO.cpp
+++ b/flang/lib/Lower/IO.cpp
@@ -168,7 +168,7 @@ static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
 }
 
 inline int64_t getLength(mlir::Type argTy) {
-  return argTy.cast<fir::SequenceType>().getShape()[0];
+  return mlir::cast<fir::SequenceType>(argTy).getShape()[0];
 }
 
 /// Get (or generate) the MLIR FuncOp for a given IO runtime function.
@@ -656,11 +656,11 @@ static void genNamelistIO(Fortran::lower::AbstractConverter &converter,
 static mlir::func::FuncOp getOutputFunc(mlir::Location loc,
                                         fir::FirOpBuilder &builder,
                                         mlir::Type type, bool isFormatted) {
-  if (fir::unwrapPassByRefType(type).isa<fir::RecordType>())
+  if (mlir::isa<fir::RecordType>(fir::unwrapPassByRefType(type)))
     return getIORuntimeFunc<mkIOKey(OutputDerivedType)>(loc, builder);
   if (!isFormatted)
     return getIORuntimeFunc<mkIOKey(OutputDescriptor)>(loc, builder);
-  if (auto ty = type.dyn_cast<mlir::IntegerType>()) {
+  if (auto ty = mlir::dyn_cast<mlir::IntegerType>(type)) {
     switch (ty.getWidth()) {
     case 1:
       return getIORuntimeFunc<mkIOKey(OutputLogical)>(loc, builder);
@@ -677,14 +677,14 @@ static mlir::func::FuncOp getOutputFunc(mlir::Location loc,
     }
     llvm_unreachable("unknown OutputInteger kind");
   }
-  if (auto ty = type.dyn_cast<mlir::FloatType>()) {
+  if (auto ty = mlir::dyn_cast<mlir::FloatType>(type)) {
     if (auto width = ty.getWidth(); width == 32)
       return getIORuntimeFunc<mkIOKey(OutputReal32)>(loc, builder);
     else if (width == 64)
       return getIORuntimeFunc<mkIOKey(OutputReal64)>(loc, builder);
   }
   auto kindMap = fir::getKindMapping(builder.getModule());
-  if (auto ty = type.dyn_cast<fir::ComplexType>()) {
+  if (auto ty = mlir::dyn_cast<fir::ComplexType>(type)) {
     // COMPLEX(KIND=k) corresponds to a pair of REAL(KIND=k).
     auto width = kindMap.getRealBitsize(ty.getFKind());
     if (width == 32)
@@ -692,7 +692,7 @@ static mlir::func::FuncOp getOutputFunc(mlir::Location loc,
     else if (width == 64)
       return getIORuntimeFunc<mkIOKey(OutputComplex64)>(loc, builder);
   }
-  if (type.isa<fir::LogicalType>())
+  if (mlir::isa<fir::LogicalType>(type))
     return getIORuntimeFunc<mkIOKey(OutputLogical)>(loc, builder);
   if (fir::factory::CharacterExprHelper::isCharacterScalar(type)) {
     // TODO: What would it mean if the default CHARACTER KIND is set to a wide
@@ -731,14 +731,14 @@ static void genOutputItemList(
     mlir::func::FuncOp outputFunc =
         getOutputFunc(loc, builder, itemTy, isFormatted);
     mlir::Type argType = outputFunc.getFunctionType().getInput(1);
-    assert((isFormatted || argType.isa<fir::BoxType>()) &&
+    assert((isFormatted || mlir::isa<fir::BoxType>(argType)) &&
            "expect descriptor for unformatted IO runtime");
     llvm::SmallVector<mlir::Value> outputFuncArgs = {cookie};
     fir::factory::CharacterExprHelper helper{builder, loc};
-    if (argType.isa<fir::BoxType>()) {
+    if (mlir::isa<fir::BoxType>(argType)) {
       mlir::Value box = fir::getBase(converter.genExprBox(loc, *expr, stmtCtx));
       outputFuncArgs.push_back(builder.createConvert(loc, argType, box));
-      if (fir::unwrapPassByRefType(itemTy).isa<fir::RecordType>())
+      if (mlir::isa<fir::RecordType>(fir::unwrapPassByRefType(itemTy)))
         outputFuncArgs.push_back(getNonTbpDefinedIoTableAddr(converter));
     } else if (helper.isCharacterScalar(itemTy)) {
       fir::ExtendedValue exv = converter.genExprAddr(loc, expr, stmtCtx);
@@ -773,29 +773,29 @@ static void genOutputItemList(
 static mlir::func::FuncOp getInputFunc(mlir::Location loc,
                                        fir::FirOpBuilder &builder,
                                        mlir::Type type, bool isFormatted) {
-  if (fir::unwrapPassByRefType(type).isa<fir::RecordType>())
+  if (mlir::isa<fir::RecordType>(fir::unwrapPassByRefType(type)))
     return getIORuntimeFunc<mkIOKey(InputDerivedType)>(loc, builder);
   if (!isFormatted)
     return getIORuntimeFunc<mkIOKey(InputDescriptor)>(loc, builder);
-  if (auto ty = type.dyn_cast<mlir::IntegerType>())
+  if (auto ty = mlir::dyn_cast<mlir::IntegerType>(type))
     return ty.getWidth() == 1
                ? getIORuntimeFunc<mkIOKey(InputLogical)>(loc, builder)
                : getIORuntimeFunc<mkIOKey(InputInteger)>(loc, builder);
-  if (auto ty = type.dyn_cast<mlir::FloatType>()) {
+  if (auto ty = mlir::dyn_cast<mlir::FloatType>(type)) {
     if (auto width = ty.getWidth(); width == 32)
       return getIORuntimeFunc<mkIOKey(InputReal32)>(loc, builder);
     else if (width == 64)
       return getIORuntimeFunc<mkIOKey(InputReal64)>(loc, builder);
   }
   auto kindMap = fir::getKindMapping(builder.getModule());
-  if (auto ty = type.dyn_cast<fir::ComplexType>()) {
+  if (auto ty = mlir::dyn_cast<fir::ComplexType>(type)) {
     auto width = kindMap.getRealBitsize(ty.getFKind());
     if (width == 32)
       return getIORuntimeFunc<mkIOKey(InputComplex32)>(loc, builder);
     else if (width == 64)
       return getIORuntimeFunc<mkIOKey(InputComplex64)>(loc, builder);
   }
-  if (type.isa<fir::LogicalType>())
+  if (mlir::isa<fir::LogicalType>(type))
     return getIORuntimeFunc<mkIOKey(InputLogical)>(loc, builder);
   if (fir::factory::CharacterExprHelper::isCharacterScalar(type)) {
     auto asciiKind = kindMap.defaultCharacterKind();
@@ -830,12 +830,12 @@ createIoRuntimeCallForItem(Fortran::lower::AbstractConverter &converter,
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   mlir::Type argType = inputFunc.getFunctionType().getInput(1);
   llvm::SmallVector<mlir::Value> inputFuncArgs = {cookie};
-  if (argType.isa<fir::BaseBoxType>()) {
+  if (mlir::isa<fir::BaseBoxType>(argType)) {
     mlir::Value box = fir::getBase(item);
-    auto boxTy = box.getType().dyn_cast<fir::BaseBoxType>();
+    auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(box.getType());
     assert(boxTy && "must be previously emboxed");
     inputFuncArgs.push_back(builder.createConvert(loc, argType, box));
-    if (fir::unwrapPassByRefType(boxTy).isa<fir::RecordType>())
+    if (mlir::isa<fir::RecordType>(fir::unwrapPassByRefType(boxTy)))
       inputFuncArgs.push_back(getNonTbpDefinedIoTableAddr(converter));
   } else {
     mlir::Value itemAddr = fir::getBase(item);
@@ -846,16 +846,16 @@ createIoRuntimeCallForItem(Fortran::lower::AbstractConverter &converter,
       mlir::Value len = fir::getLen(item);
       inputFuncArgs.push_back(builder.createConvert(
           loc, inputFunc.getFunctionType().getInput(2), len));
-    } else if (itemTy.isa<mlir::IntegerType>()) {
+    } else if (mlir::isa<mlir::IntegerType>(itemTy)) {
       inputFuncArgs.push_back(builder.create<mlir::arith::ConstantOp>(
           loc, builder.getI32IntegerAttr(
-                   itemTy.cast<mlir::IntegerType>().getWidth() / 8)));
+                   mlir::cast<mlir::IntegerType>(itemTy).getWidth() / 8)));
     }
   }
   auto call = builder.create<fir::CallOp>(loc, inputFunc, inputFuncArgs);
   auto itemAddr = fir::getBase(item);
   auto itemTy = fir::unwrapRefType(itemAddr.getType());
-  if (itemTy.isa<fir::LogicalType>())
+  if (mlir::isa<fir::LogicalType>(itemTy))
     boolRefToLogical(loc, builder, itemAddr);
   return call.getResult(0);
 }
@@ -886,7 +886,7 @@ static void genInputItemList(Fortran::lower::AbstractConverter &converter,
       mlir::func::FuncOp inputFunc = getInputFunc(
           loc, builder, vectorSubscriptBox.getElementType(), isFormatted);
       const bool mustBox =
-          inputFunc.getFunctionType().getInput(1).isa<fir::BoxType>();
+          mlir::isa<fir::BoxType>(inputFunc.getFunctionType().getInput(1));
       if (!checkResult) {
         auto elementalGenerator = [&](const fir::ExtendedValue &element) {
           createIoRuntimeCallForItem(converter, loc, inputFunc, cookie,
@@ -911,9 +911,10 @@ static void genInputItemList(Fortran::lower::AbstractConverter &converter,
     mlir::Type itemTy = converter.genType(*expr);
     mlir::func::FuncOp inputFunc =
         getInputFunc(loc, builder, itemTy, isFormatted);
-    auto itemExv = inputFunc.getFunctionType().getInput(1).isa<fir::BoxType>()
-                       ? converter.genExprBox(loc, *expr, stmtCtx)
-                       : converter.genExprAddr(loc, expr, stmtCtx);
+    auto itemExv =
+        mlir::isa<fir::BoxType>(inputFunc.getFunctionType().getInput(1))
+            ? converter.genExprBox(loc, *expr, stmtCtx)
+            : converter.genExprAddr(loc, expr, stmtCtx);
     ok = createIoRuntimeCallForItem(converter, loc, inputFunc, cookie, itemExv);
   }
 }
@@ -1772,8 +1773,8 @@ static mlir::Value genIOUnitNumber(Fortran::lower::AbstractConverter &converter,
   auto &builder = converter.getFirOpBuilder();
   auto rawUnit = fir::getBase(converter.genExprValue(loc, iounit, stmtCtx));
   unsigned rawUnitWidth =
-      rawUnit.getType().cast<mlir::IntegerType>().getWidth();
-  unsigned runtimeArgWidth = ty.cast<mlir::IntegerType>().getWidth();
+      mlir::cast<mlir::IntegerType>(rawUnit.getType()).getWidth();
+  unsigned runtimeArgWidth = mlir::cast<mlir::IntegerType>(ty).getWidth();
   // The IO runtime supports `int` unit numbers, if the unit number may
   // overflow when passed to the IO runtime, check that the unit number is
   // in range before calling the BeginXXX.
@@ -2331,7 +2332,7 @@ mlir::Value genInquireSpec<Fortran::parser::InquireSpec::IntVar>(
   if (!eleTy)
     fir::emitFatalError(loc,
                         "internal error: expected a memory reference type");
-  auto width = eleTy.cast<mlir::IntegerType>().getWidth();
+  auto width = mlir::cast<mlir::IntegerType>(eleTy).getWidth();
   mlir::IndexType idxTy = builder.getIndexType();
   mlir::Value kind = builder.createIntegerConstant(loc, idxTy, width / 8);
   llvm::SmallVector<mlir::Value> args = {
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index b56bdedc07bf..eae2afc760e6 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -65,7 +65,7 @@ static Op createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc,
                             mlir::acc::DataClause dataClause, mlir::Type retTy,
                             mlir::Value isPresent = {}) {
   mlir::Value varPtrPtr;
-  if (auto boxTy = baseAddr.getType().dyn_cast<fir::BaseBoxType>()) {
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(baseAddr.getType())) {
     if (isPresent) {
       mlir::Type ifRetTy = boxTy.getEleTy();
       if (!fir::isa_ref_type(ifRetTy))
@@ -2658,7 +2658,7 @@ genACCHostDataOp(Fortran::lower::AbstractConverter &converter,
   if (ifCond) {
     if (auto cst =
             mlir::dyn_cast<mlir::arith::ConstantOp>(ifCond.getDefiningOp()))
-      if (auto boolAttr = cst.getValue().dyn_cast<mlir::BoolAttr>()) {
+      if (auto boolAttr = mlir::dyn_cast<mlir::BoolAttr>(cst.getValue())) {
         if (boolAttr.getValue()) {
           // get rid of the if condition if it is always true.
           ifCond = mlir::Value();
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 4c51b61f6bf0..79525d6dfe7a 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -23,10 +23,10 @@ namespace omp {
 
 /// Check for unsupported map operand types.
 static void checkMapType(mlir::Location location, mlir::Type type) {
-  if (auto refType = type.dyn_cast<fir::ReferenceType>())
+  if (auto refType = mlir::dyn_cast<fir::ReferenceType>(type))
     type = refType.getElementType();
-  if (auto boxType = type.dyn_cast_or_null<fir::BoxType>())
-    if (!boxType.getElementType().isa<fir::PointerType>())
+  if (auto boxType = mlir::dyn_cast_or_null<fir::BoxType>(type))
+    if (!mlir::isa<fir::PointerType>(boxType.getElementType()))
       TODO(location, "OMPD_target_data MapOperand BoxType");
 }
 
@@ -814,7 +814,7 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc,
                 llvm::ArrayRef<mlir::Value> members, uint64_t mapType,
                 mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy,
                 bool isVal) {
-  if (auto boxTy = baseAddr.getType().dyn_cast<fir::BaseBoxType>()) {
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(baseAddr.getType())) {
     baseAddr = builder.create<fir::BoxAddrOp>(loc, baseAddr);
     retTy = baseAddr.getType();
   }
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index b419686e8ce4..f63a774fa44b 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -17,7 +17,6 @@
 #include "flang/Lower/SymbolMap.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Semantics/tools.h"
-#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 
 namespace Fortran {
 namespace lower {
@@ -52,10 +51,37 @@ void DataSharingProcessor::processStep2(mlir::Operation *op, bool isLoop) {
 }
 
 void DataSharingProcessor::insertDeallocs() {
-  // TODO Extend delayed privatization to include a `dealloc` region.
   for (const Fortran::semantics::Symbol *sym : privatizedSymbols)
     if (Fortran::semantics::IsAllocatable(sym->GetUltimate())) {
+      if (!useDelayedPrivatization) {
+        converter.createHostAssociateVarCloneDealloc(*sym);
+        return;
+      }
+
+      Fortran::lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym);
+      assert(hsb && "Host symbol box not found");
+      mlir::Type symType = hsb.getAddr().getType();
+      mlir::Location symLoc = hsb.getAddr().getLoc();
+      fir::ExtendedValue symExV = converter.getSymbolExtendedValue(*sym);
+      mlir::omp::PrivateClauseOp privatizer = symToPrivatizer.at(sym);
+
+      symTable->pushScope();
+
+      mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
+
+      mlir::Region &deallocRegion = privatizer.getDeallocRegion();
+      fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+      mlir::Block *deallocEntryBlock = firOpBuilder.createBlock(
+          &deallocRegion, /*insertPt=*/{}, symType, symLoc);
+
+      firOpBuilder.setInsertionPointToEnd(deallocEntryBlock);
+      symTable->addSymbol(*sym,
+                          fir::substBase(symExV, deallocRegion.getArgument(0)));
+
       converter.createHostAssociateVarCloneDealloc(*sym);
+      firOpBuilder.create<mlir::omp::YieldOp>(hsb.getAddr().getLoc());
+
+      symTable->popScope();
     }
 }
 
@@ -339,6 +365,7 @@ void DataSharingProcessor::defaultPrivatize(
     if (!Fortran::semantics::IsProcedure(*sym) &&
         !sym->GetUltimate().has<Fortran::semantics::DerivedTypeDetails>() &&
         !sym->GetUltimate().has<Fortran::semantics::NamelistDetails>() &&
+        !Fortran::semantics::IsImpliedDoIndex(sym->GetUltimate()) &&
         !symbolsInNestedRegions.contains(sym) &&
         !symbolsInParentRegions.contains(sym) &&
         !privatizedSymbols.contains(sym))
@@ -439,6 +466,8 @@ void DataSharingProcessor::doPrivatize(
 
   if (privateSyms)
     privateSyms->push_back(sym);
+
+  symToPrivatizer[sym] = privatizerOp;
 }
 
 } // namespace omp
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
index ef7b14327278..f709a64211a8 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
@@ -18,6 +18,7 @@
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Semantics/symbol.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 
 namespace mlir {
 namespace omp {
@@ -40,6 +41,8 @@ private:
   llvm::SetVector<const Fortran::semantics::Symbol *> defaultSymbols;
   llvm::SetVector<const Fortran::semantics::Symbol *> symbolsInNestedRegions;
   llvm::SetVector<const Fortran::semantics::Symbol *> symbolsInParentRegions;
+  llvm::DenseMap<const Fortran::semantics::Symbol *, mlir::omp::PrivateClauseOp>
+      symToPrivatizer;
   Fortran::lower::AbstractConverter &converter;
   fir::FirOpBuilder &firOpBuilder;
   omp::List<omp::Clause> clauses;
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index f454f5a45a51..c54f100b73da 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -84,7 +84,7 @@ static fir::GlobalOp globalInitialization(
 
   // Create default initialization for non-character scalar.
   if (Fortran::semantics::IsAllocatableOrObjectPointer(&sym)) {
-    mlir::Type baseAddrType = ty.dyn_cast<fir::BoxType>().getEleTy();
+    mlir::Type baseAddrType = mlir::dyn_cast<fir::BoxType>(ty).getEleTy();
     Fortran::lower::createGlobalInitialization(
         firOpBuilder, global, [&](fir::FirOpBuilder &b) {
           mlir::Value nullAddr =
@@ -548,11 +548,12 @@ struct OpWithBodyGenInfo {
           mlir::Operation *)>;
 
   OpWithBodyGenInfo(Fortran::lower::AbstractConverter &converter,
+                    Fortran::lower::SymMap &symTable,
                     Fortran::semantics::SemanticsContext &semaCtx,
                     mlir::Location loc, Fortran::lower::pft::Evaluation &eval,
                     llvm::omp::Directive dir)
-      : converter(converter), semaCtx(semaCtx), loc(loc), eval(eval), dir(dir) {
-  }
+      : converter(converter), symTable(symTable), semaCtx(semaCtx), loc(loc),
+        eval(eval), dir(dir) {}
 
   OpWithBodyGenInfo &setGenNested(bool value) {
     genNested = value;
@@ -589,6 +590,8 @@ struct OpWithBodyGenInfo {
 
   /// [inout] converter to use for the clauses.
   Fortran::lower::AbstractConverter &converter;
+  /// [in] Symbol table
+  Fortran::lower::SymMap &symTable;
   /// [in] Semantics context
   Fortran::semantics::SemanticsContext &semaCtx;
   /// [in] location in source code.
@@ -764,6 +767,7 @@ static void createBodyOfOp(mlir::Operation &op, OpWithBodyGenInfo &info) {
 
 static void genBodyOfTargetDataOp(
     Fortran::lower::AbstractConverter &converter,
+    Fortran::lower::SymMap &symTable,
     Fortran::semantics::SemanticsContext &semaCtx,
     Fortran::lower::pft::Evaluation &eval, bool genNested,
     mlir::omp::TargetDataOp &dataOp, llvm::ArrayRef<mlir::Type> useDeviceTypes,
@@ -778,7 +782,7 @@ static void genBodyOfTargetDataOp(
   for (auto [argIndex, argSymbol] : llvm::enumerate(useDeviceSymbols)) {
     const mlir::BlockArgument &arg = region.front().getArgument(argIndex);
     fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*argSymbol);
-    if (auto refType = arg.getType().dyn_cast<fir::ReferenceType>()) {
+    if (auto refType = mlir::dyn_cast<fir::ReferenceType>(arg.getType())) {
       if (fir::isa_builtin_cptr_type(refType.getElementType())) {
         converter.bindSymbol(*argSymbol, arg);
       } else {
@@ -830,6 +834,7 @@ static void genBodyOfTargetDataOp(
 // all the symbols present in mapSymbols as block arguments to this block.
 static void
 genBodyOfTargetOp(Fortran::lower::AbstractConverter &converter,
+                  Fortran::lower::SymMap &symTable,
                   Fortran::semantics::SemanticsContext &semaCtx,
                   Fortran::lower::pft::Evaluation &eval, bool genNested,
                   mlir::omp::TargetOp &targetOp,
@@ -1267,6 +1272,7 @@ static void genWsloopClauses(
 
 static mlir::omp::BarrierOp
 genBarrierOp(Fortran::lower::AbstractConverter &converter,
+             Fortran::lower::SymMap &symTable,
              Fortran::semantics::SemanticsContext &semaCtx,
              Fortran::lower::pft::Evaluation &eval, mlir::Location loc) {
   return converter.getFirOpBuilder().create<mlir::omp::BarrierOp>(loc);
@@ -1274,6 +1280,7 @@ genBarrierOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::CriticalOp
 genCriticalOp(Fortran::lower::AbstractConverter &converter,
+              Fortran::lower::SymMap &symTable,
               Fortran::semantics::SemanticsContext &semaCtx,
               Fortran::lower::pft::Evaluation &eval, bool genNested,
               mlir::Location loc, const List<Clause> &clauses,
@@ -1298,7 +1305,7 @@ genCriticalOp(Fortran::lower::AbstractConverter &converter,
   }
 
   return genOpWithBody<mlir::omp::CriticalOp>(
-      OpWithBodyGenInfo(converter, semaCtx, loc, eval,
+      OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_critical)
           .setGenNested(genNested),
       nameAttr);
@@ -1306,6 +1313,7 @@ genCriticalOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::DistributeOp
 genDistributeOp(Fortran::lower::AbstractConverter &converter,
+                Fortran::lower::SymMap &symTable,
                 Fortran::semantics::SemanticsContext &semaCtx,
                 Fortran::lower::pft::Evaluation &eval, bool genNested,
                 mlir::Location loc, const List<Clause> &clauses) {
@@ -1315,6 +1323,7 @@ genDistributeOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::FlushOp
 genFlushOp(Fortran::lower::AbstractConverter &converter,
+           Fortran::lower::SymMap &symTable,
            Fortran::semantics::SemanticsContext &semaCtx,
            Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
            const ObjectList &objects, const List<Clause> &clauses) {
@@ -1327,17 +1336,19 @@ genFlushOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::MasterOp
 genMasterOp(Fortran::lower::AbstractConverter &converter,
+            Fortran::lower::SymMap &symTable,
             Fortran::semantics::SemanticsContext &semaCtx,
             Fortran::lower::pft::Evaluation &eval, bool genNested,
             mlir::Location loc) {
   return genOpWithBody<mlir::omp::MasterOp>(
-      OpWithBodyGenInfo(converter, semaCtx, loc, eval,
+      OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_master)
           .setGenNested(genNested));
 }
 
 static mlir::omp::OrderedOp
 genOrderedOp(Fortran::lower::AbstractConverter &converter,
+             Fortran::lower::SymMap &symTable,
              Fortran::semantics::SemanticsContext &semaCtx,
              Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
              const List<Clause> &clauses) {
@@ -1347,6 +1358,7 @@ genOrderedOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::OrderedRegionOp
 genOrderedRegionOp(Fortran::lower::AbstractConverter &converter,
+                   Fortran::lower::SymMap &symTable,
                    Fortran::semantics::SemanticsContext &semaCtx,
                    Fortran::lower::pft::Evaluation &eval, bool genNested,
                    mlir::Location loc, const List<Clause> &clauses) {
@@ -1354,7 +1366,7 @@ genOrderedRegionOp(Fortran::lower::AbstractConverter &converter,
   genOrderedRegionClauses(converter, semaCtx, clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::OrderedRegionOp>(
-      OpWithBodyGenInfo(converter, semaCtx, loc, eval,
+      OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_ordered)
           .setGenNested(genNested),
       clauseOps);
@@ -1383,7 +1395,7 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
   };
 
   OpWithBodyGenInfo genInfo =
-      OpWithBodyGenInfo(converter, semaCtx, loc, eval,
+      OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_parallel)
           .setGenNested(genNested)
           .setOuterCombined(outerCombined)
@@ -1440,13 +1452,14 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::SectionOp
 genSectionOp(Fortran::lower::AbstractConverter &converter,
+             Fortran::lower::SymMap &symTable,
              Fortran::semantics::SemanticsContext &semaCtx,
              Fortran::lower::pft::Evaluation &eval, bool genNested,
              mlir::Location loc, const List<Clause> &clauses) {
   // Currently only private/firstprivate clause is handled, and
   // all privatization is done within `omp.section` operations.
   return genOpWithBody<mlir::omp::SectionOp>(
-      OpWithBodyGenInfo(converter, semaCtx, loc, eval,
+      OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_section)
           .setGenNested(genNested)
           .setClauses(&clauses));
@@ -1454,11 +1467,12 @@ genSectionOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::SectionsOp
 genSectionsOp(Fortran::lower::AbstractConverter &converter,
+              Fortran::lower::SymMap &symTable,
               Fortran::semantics::SemanticsContext &semaCtx,
               Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
               const mlir::omp::SectionsClauseOps &clauseOps) {
   return genOpWithBody<mlir::omp::SectionsOp>(
-      OpWithBodyGenInfo(converter, semaCtx, loc, eval,
+      OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_sections)
           .setGenNested(false),
       clauseOps);
@@ -1466,6 +1480,7 @@ genSectionsOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::SimdOp
 genSimdOp(Fortran::lower::AbstractConverter &converter,
+          Fortran::lower::SymMap &symTable,
           Fortran::semantics::SemanticsContext &semaCtx,
           Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
           const List<Clause> &clauses) {
@@ -1499,8 +1514,8 @@ genSimdOp(Fortran::lower::AbstractConverter &converter,
   };
 
   createBodyOfOp(*loopOp,
-                 OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval,
-                                   llvm::omp::Directive::OMPD_simd)
+                 OpWithBodyGenInfo(converter, symTable, semaCtx, loc,
+                                   *nestedEval, llvm::omp::Directive::OMPD_simd)
                      .setClauses(&clauses)
                      .setDataSharingProcessor(&dsp)
                      .setGenRegionEntryCb(ivCallback));
@@ -1510,6 +1525,7 @@ genSimdOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::SingleOp
 genSingleOp(Fortran::lower::AbstractConverter &converter,
+            Fortran::lower::SymMap &symTable,
             Fortran::semantics::SemanticsContext &semaCtx,
             Fortran::lower::pft::Evaluation &eval, bool genNested,
             mlir::Location loc, const List<Clause> &clauses) {
@@ -1517,7 +1533,7 @@ genSingleOp(Fortran::lower::AbstractConverter &converter,
   genSingleClauses(converter, semaCtx, clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::SingleOp>(
-      OpWithBodyGenInfo(converter, semaCtx, loc, eval,
+      OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_single)
           .setGenNested(genNested)
           .setClauses(&clauses),
@@ -1526,6 +1542,7 @@ genSingleOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::TargetOp
 genTargetOp(Fortran::lower::AbstractConverter &converter,
+            Fortran::lower::SymMap &symTable,
             Fortran::semantics::SemanticsContext &semaCtx,
             Fortran::lower::pft::Evaluation &eval, bool genNested,
             mlir::Location loc, const List<Clause> &clauses,
@@ -1570,13 +1587,15 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
 
         Fortran::lower::AddrAndBoundsInfo info = getDataOperandBaseAddr(
             converter, firOpBuilder, sym, converter.getCurrentLocation());
-        if (fir::unwrapRefType(info.addr.getType()).isa<fir::BaseBoxType>())
+        if (mlir::isa<fir::BaseBoxType>(
+                fir::unwrapRefType(info.addr.getType())))
           bounds =
               Fortran::lower::genBoundsOpsFromBox<mlir::omp::MapBoundsOp,
                                                   mlir::omp::MapBoundsType>(
                   firOpBuilder, converter.getCurrentLocation(), converter,
                   dataExv, info);
-        if (fir::unwrapRefType(info.addr.getType()).isa<fir::SequenceType>()) {
+        if (mlir::isa<fir::SequenceType>(
+                fir::unwrapRefType(info.addr.getType()))) {
           bool dataExvIsAssumedSize =
               Fortran::semantics::IsAssumedSizeArray(sym.GetUltimate());
           bounds = Fortran::lower::genBaseBoundsOps<mlir::omp::MapBoundsOp,
@@ -1591,7 +1610,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
             mlir::omp::VariableCaptureKind::ByRef;
 
         mlir::Type eleType = baseOp.getType();
-        if (auto refType = baseOp.getType().dyn_cast<fir::ReferenceType>())
+        if (auto refType = mlir::dyn_cast<fir::ReferenceType>(baseOp.getType()))
           eleType = refType.getElementType();
 
         // If a variable is specified in declare target link and if device
@@ -1633,13 +1652,14 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
   Fortran::lower::pft::visitAllSymbols(eval, captureImplicitMap);
 
   auto targetOp = firOpBuilder.create<mlir::omp::TargetOp>(loc, clauseOps);
-  genBodyOfTargetOp(converter, semaCtx, eval, genNested, targetOp, mapSyms,
-                    mapLocs, mapTypes, loc);
+  genBodyOfTargetOp(converter, symTable, semaCtx, eval, genNested, targetOp,
+                    mapSyms, mapLocs, mapTypes, loc);
   return targetOp;
 }
 
 static mlir::omp::TargetDataOp
 genTargetDataOp(Fortran::lower::AbstractConverter &converter,
+                Fortran::lower::SymMap &symTable,
                 Fortran::semantics::SemanticsContext &semaCtx,
                 Fortran::lower::pft::Evaluation &eval, bool genNested,
                 mlir::Location loc, const List<Clause> &clauses) {
@@ -1654,14 +1674,16 @@ genTargetDataOp(Fortran::lower::AbstractConverter &converter,
   auto targetDataOp =
       converter.getFirOpBuilder().create<mlir::omp::TargetDataOp>(loc,
                                                                   clauseOps);
-  genBodyOfTargetDataOp(converter, semaCtx, eval, genNested, targetDataOp,
-                        useDeviceTypes, useDeviceLocs, useDeviceSyms, loc);
+  genBodyOfTargetDataOp(converter, symTable, semaCtx, eval, genNested,
+                        targetDataOp, useDeviceTypes, useDeviceLocs,
+                        useDeviceSyms, loc);
   return targetDataOp;
 }
 
 template <typename OpTy>
 static OpTy
 genTargetEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter,
+                               Fortran::lower::SymMap &symTable,
                                Fortran::semantics::SemanticsContext &semaCtx,
                                mlir::Location loc,
                                const List<Clause> &clauses) {
@@ -1689,6 +1711,7 @@ genTargetEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::TaskOp
 genTaskOp(Fortran::lower::AbstractConverter &converter,
+          Fortran::lower::SymMap &symTable,
           Fortran::semantics::SemanticsContext &semaCtx,
           Fortran::lower::pft::Evaluation &eval, bool genNested,
           mlir::Location loc, const List<Clause> &clauses) {
@@ -1697,7 +1720,7 @@ genTaskOp(Fortran::lower::AbstractConverter &converter,
   genTaskClauses(converter, semaCtx, stmtCtx, clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::TaskOp>(
-      OpWithBodyGenInfo(converter, semaCtx, loc, eval,
+      OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_task)
           .setGenNested(genNested)
           .setClauses(&clauses),
@@ -1706,6 +1729,7 @@ genTaskOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::TaskgroupOp
 genTaskgroupOp(Fortran::lower::AbstractConverter &converter,
+               Fortran::lower::SymMap &symTable,
                Fortran::semantics::SemanticsContext &semaCtx,
                Fortran::lower::pft::Evaluation &eval, bool genNested,
                mlir::Location loc, const List<Clause> &clauses) {
@@ -1713,7 +1737,7 @@ genTaskgroupOp(Fortran::lower::AbstractConverter &converter,
   genTaskgroupClauses(converter, semaCtx, clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::TaskgroupOp>(
-      OpWithBodyGenInfo(converter, semaCtx, loc, eval,
+      OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_taskgroup)
           .setGenNested(genNested)
           .setClauses(&clauses),
@@ -1722,6 +1746,7 @@ genTaskgroupOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::TaskloopOp
 genTaskloopOp(Fortran::lower::AbstractConverter &converter,
+              Fortran::lower::SymMap &symTable,
               Fortran::semantics::SemanticsContext &semaCtx,
               Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
               const List<Clause> &clauses) {
@@ -1730,6 +1755,7 @@ genTaskloopOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::TaskwaitOp
 genTaskwaitOp(Fortran::lower::AbstractConverter &converter,
+              Fortran::lower::SymMap &symTable,
               Fortran::semantics::SemanticsContext &semaCtx,
               Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
               const List<Clause> &clauses) {
@@ -1741,6 +1767,7 @@ genTaskwaitOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::TaskyieldOp
 genTaskyieldOp(Fortran::lower::AbstractConverter &converter,
+               Fortran::lower::SymMap &symTable,
                Fortran::semantics::SemanticsContext &semaCtx,
                Fortran::lower::pft::Evaluation &eval, mlir::Location loc) {
   return converter.getFirOpBuilder().create<mlir::omp::TaskyieldOp>(loc);
@@ -1748,6 +1775,7 @@ genTaskyieldOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::TeamsOp
 genTeamsOp(Fortran::lower::AbstractConverter &converter,
+           Fortran::lower::SymMap &symTable,
            Fortran::semantics::SemanticsContext &semaCtx,
            Fortran::lower::pft::Evaluation &eval, bool genNested,
            mlir::Location loc, const List<Clause> &clauses,
@@ -1757,7 +1785,7 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter,
   genTeamsClauses(converter, semaCtx, stmtCtx, clauses, loc, clauseOps);
 
   return genOpWithBody<mlir::omp::TeamsOp>(
-      OpWithBodyGenInfo(converter, semaCtx, loc, eval,
+      OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_teams)
           .setGenNested(genNested)
           .setOuterCombined(outerCombined)
@@ -1767,6 +1795,7 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::WsloopOp
 genWsloopOp(Fortran::lower::AbstractConverter &converter,
+            Fortran::lower::SymMap &symTable,
             Fortran::semantics::SemanticsContext &semaCtx,
             Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
             const List<Clause> &clauses) {
@@ -1805,8 +1834,8 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter,
   };
 
   createBodyOfOp(*loopOp,
-                 OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval,
-                                   llvm::omp::Directive::OMPD_do)
+                 OpWithBodyGenInfo(converter, symTable, semaCtx, loc,
+                                   *nestedEval, llvm::omp::Directive::OMPD_do)
                      .setClauses(&clauses)
                      .setDataSharingProcessor(&dsp)
                      .setReductions(&reductionSyms, &reductionTypes)
@@ -1820,6 +1849,7 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter,
 
 static void
 genCompositeDistributeParallelDo(Fortran::lower::AbstractConverter &converter,
+                                 Fortran::lower::SymMap &symTable,
                                  Fortran::semantics::SemanticsContext &semaCtx,
                                  Fortran::lower::pft::Evaluation &eval,
                                  const List<Clause> &clauses,
@@ -1829,6 +1859,7 @@ genCompositeDistributeParallelDo(Fortran::lower::AbstractConverter &converter,
 
 static void genCompositeDistributeParallelDoSimd(
     Fortran::lower::AbstractConverter &converter,
+    Fortran::lower::SymMap &symTable,
     Fortran::semantics::SemanticsContext &semaCtx,
     Fortran::lower::pft::Evaluation &eval, const List<Clause> &clauses,
     mlir::Location loc) {
@@ -1837,6 +1868,7 @@ static void genCompositeDistributeParallelDoSimd(
 
 static void
 genCompositeDistributeSimd(Fortran::lower::AbstractConverter &converter,
+                           Fortran::lower::SymMap &symTable,
                            Fortran::semantics::SemanticsContext &semaCtx,
                            Fortran::lower::pft::Evaluation &eval,
                            const List<Clause> &clauses, mlir::Location loc) {
@@ -1844,6 +1876,7 @@ genCompositeDistributeSimd(Fortran::lower::AbstractConverter &converter,
 }
 
 static void genCompositeDoSimd(Fortran::lower::AbstractConverter &converter,
+                               Fortran::lower::SymMap &symTable,
                                Fortran::semantics::SemanticsContext &semaCtx,
                                Fortran::lower::pft::Evaluation &eval,
                                const List<Clause> &clauses,
@@ -1860,11 +1893,12 @@ static void genCompositeDoSimd(Fortran::lower::AbstractConverter &converter,
   // When support for vectorization is enabled, then we need to add handling of
   // if clause. Currently if clause can be skipped because we always assume
   // SIMD length = 1.
-  genWsloopOp(converter, semaCtx, eval, loc, clauses);
+  genWsloopOp(converter, symTable, semaCtx, eval, loc, clauses);
 }
 
 static void
 genCompositeTaskloopSimd(Fortran::lower::AbstractConverter &converter,
+                         Fortran::lower::SymMap &symTable,
                          Fortran::semantics::SemanticsContext &semaCtx,
                          Fortran::lower::pft::Evaluation &eval,
                          const List<Clause> &clauses, mlir::Location loc) {
@@ -1985,32 +2019,32 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
   default:
     break;
   case llvm::omp::Directive::OMPD_barrier:
-    genBarrierOp(converter, semaCtx, eval, currentLocation);
+    genBarrierOp(converter, symTable, semaCtx, eval, currentLocation);
     break;
   case llvm::omp::Directive::OMPD_taskwait:
-    genTaskwaitOp(converter, semaCtx, eval, currentLocation, clauses);
+    genTaskwaitOp(converter, symTable, semaCtx, eval, currentLocation, clauses);
     break;
   case llvm::omp::Directive::OMPD_taskyield:
-    genTaskyieldOp(converter, semaCtx, eval, currentLocation);
+    genTaskyieldOp(converter, symTable, semaCtx, eval, currentLocation);
     break;
   case llvm::omp::Directive::OMPD_target_data:
-    genTargetDataOp(converter, semaCtx, eval, /*genNested=*/true,
+    genTargetDataOp(converter, symTable, semaCtx, eval, /*genNested=*/true,
                     currentLocation, clauses);
     break;
   case llvm::omp::Directive::OMPD_target_enter_data:
     genTargetEnterExitUpdateDataOp<mlir::omp::TargetEnterDataOp>(
-        converter, semaCtx, currentLocation, clauses);
+        converter, symTable, semaCtx, currentLocation, clauses);
     break;
   case llvm::omp::Directive::OMPD_target_exit_data:
     genTargetEnterExitUpdateDataOp<mlir::omp::TargetExitDataOp>(
-        converter, semaCtx, currentLocation, clauses);
+        converter, symTable, semaCtx, currentLocation, clauses);
     break;
   case llvm::omp::Directive::OMPD_target_update:
     genTargetEnterExitUpdateDataOp<mlir::omp::TargetUpdateOp>(
-        converter, semaCtx, currentLocation, clauses);
+        converter, symTable, semaCtx, currentLocation, clauses);
     break;
   case llvm::omp::Directive::OMPD_ordered:
-    genOrderedOp(converter, semaCtx, eval, currentLocation, clauses);
+    genOrderedOp(converter, symTable, semaCtx, eval, currentLocation, clauses);
     break;
   }
 }
@@ -2034,7 +2068,8 @@ genOMP(Fortran::lower::AbstractConverter &converter,
                             [&](auto &&s) { return makeClause(s.v, semaCtx); })
                  : List<Clause>{};
   mlir::Location currentLocation = converter.genLocation(verbatim.source);
-  genFlushOp(converter, semaCtx, eval, currentLocation, objects, clauses);
+  genFlushOp(converter, symTable, semaCtx, eval, currentLocation, objects,
+             clauses);
 }
 
 static void
@@ -2187,12 +2222,13 @@ genOMP(Fortran::lower::AbstractConverter &converter,
     switch (leafDir) {
     case llvm::omp::Directive::OMPD_master:
       // 2.16 MASTER construct.
-      genMasterOp(converter, semaCtx, eval, genNested, currentLocation);
+      genMasterOp(converter, symTable, semaCtx, eval, genNested,
+                  currentLocation);
       break;
     case llvm::omp::Directive::OMPD_ordered:
       // 2.17.9 ORDERED construct.
-      genOrderedRegionOp(converter, semaCtx, eval, genNested, currentLocation,
-                         clauses);
+      genOrderedRegionOp(converter, symTable, semaCtx, eval, genNested,
+                         currentLocation, clauses);
       break;
     case llvm::omp::Directive::OMPD_parallel:
       // 2.6 PARALLEL construct.
@@ -2201,41 +2237,43 @@ genOMP(Fortran::lower::AbstractConverter &converter,
       break;
     case llvm::omp::Directive::OMPD_single:
       // 2.8.2 SINGLE construct.
-      genSingleOp(converter, semaCtx, eval, genNested, currentLocation,
-                  clauses);
+      genSingleOp(converter, symTable, semaCtx, eval, genNested,
+                  currentLocation, clauses);
       break;
     case llvm::omp::Directive::OMPD_target:
       // 2.12.5 TARGET construct.
-      genTargetOp(converter, semaCtx, eval, genNested, currentLocation, clauses,
-                  outerCombined);
+      genTargetOp(converter, symTable, semaCtx, eval, genNested,
+                  currentLocation, clauses, outerCombined);
       break;
     case llvm::omp::Directive::OMPD_target_data:
       // 2.12.2 TARGET DATA construct.
-      genTargetDataOp(converter, semaCtx, eval, genNested, currentLocation,
-                      clauses);
+      genTargetDataOp(converter, symTable, semaCtx, eval, genNested,
+                      currentLocation, clauses);
       break;
     case llvm::omp::Directive::OMPD_task:
       // 2.10.1 TASK construct.
-      genTaskOp(converter, semaCtx, eval, genNested, currentLocation, clauses);
+      genTaskOp(converter, symTable, semaCtx, eval, genNested, currentLocation,
+                clauses);
       break;
     case llvm::omp::Directive::OMPD_taskgroup:
       // 2.17.6 TASKGROUP construct.
-      genTaskgroupOp(converter, semaCtx, eval, genNested, currentLocation,
-                     clauses);
+      genTaskgroupOp(converter, symTable, semaCtx, eval, genNested,
+                     currentLocation, clauses);
       break;
     case llvm::omp::Directive::OMPD_teams:
       // 2.7 TEAMS construct.
       // FIXME Pass the outerCombined argument or rename it to better describe
       // what it represents if it must always be `false` in this context.
-      genTeamsOp(converter, semaCtx, eval, genNested, currentLocation, clauses);
+      genTeamsOp(converter, symTable, semaCtx, eval, genNested, currentLocation,
+                 clauses);
       break;
     case llvm::omp::Directive::OMPD_workshare:
       // 2.8.3 WORKSHARE construct.
       // FIXME: Workshare is not a commonly used OpenMP construct, an
       // implementation for this feature will come later. For the codes
       // that use this construct, add a single construct for now.
-      genSingleOp(converter, semaCtx, eval, genNested, currentLocation,
-                  clauses);
+      genSingleOp(converter, symTable, semaCtx, eval, genNested,
+                  currentLocation, clauses);
       break;
     default:
       llvm_unreachable("Unexpected block construct");
@@ -2257,8 +2295,8 @@ genOMP(Fortran::lower::AbstractConverter &converter,
       makeClauses(std::get<Fortran::parser::OmpClauseList>(cd.t), semaCtx);
   const auto &name = std::get<std::optional<Fortran::parser::Name>>(cd.t);
   mlir::Location currentLocation = converter.getCurrentLocation();
-  genCriticalOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
-                clauses, name);
+  genCriticalOp(converter, symTable, semaCtx, eval, /*genNested=*/true,
+                currentLocation, clauses, name);
 }
 
 static void
@@ -2304,26 +2342,27 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
       switch (leafDir) {
       case llvm::omp::Directive::OMPD_distribute_parallel_do:
         // 2.9.4.3 DISTRIBUTE PARALLEL Worksharing-Loop construct.
-        genCompositeDistributeParallelDo(converter, semaCtx, eval, clauses,
-                                         currentLocation);
+        genCompositeDistributeParallelDo(converter, symTable, semaCtx, eval,
+                                         clauses, currentLocation);
         break;
       case llvm::omp::Directive::OMPD_distribute_parallel_do_simd:
         // 2.9.4.4 DISTRIBUTE PARALLEL Worksharing-Loop SIMD construct.
-        genCompositeDistributeParallelDoSimd(converter, semaCtx, eval, clauses,
-                                             currentLocation);
+        genCompositeDistributeParallelDoSimd(converter, symTable, semaCtx, eval,
+                                             clauses, currentLocation);
         break;
       case llvm::omp::Directive::OMPD_distribute_simd:
         // 2.9.4.2 DISTRIBUTE SIMD construct.
-        genCompositeDistributeSimd(converter, semaCtx, eval, clauses,
+        genCompositeDistributeSimd(converter, symTable, semaCtx, eval, clauses,
                                    currentLocation);
         break;
       case llvm::omp::Directive::OMPD_do_simd:
         // 2.9.3.2 Worksharing-Loop SIMD construct.
-        genCompositeDoSimd(converter, semaCtx, eval, clauses, currentLocation);
+        genCompositeDoSimd(converter, symTable, semaCtx, eval, clauses,
+                           currentLocation);
         break;
       case llvm::omp::Directive::OMPD_taskloop_simd:
         // 2.10.3 TASKLOOP SIMD construct.
-        genCompositeTaskloopSimd(converter, semaCtx, eval, clauses,
+        genCompositeTaskloopSimd(converter, symTable, semaCtx, eval, clauses,
                                  currentLocation);
         break;
       default:
@@ -2334,12 +2373,13 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
       switch (leafDir) {
       case llvm::omp::Directive::OMPD_distribute:
         // 2.9.4.1 DISTRIBUTE construct.
-        genDistributeOp(converter, semaCtx, eval, genNested, currentLocation,
-                        clauses);
+        genDistributeOp(converter, symTable, semaCtx, eval, genNested,
+                        currentLocation, clauses);
         break;
       case llvm::omp::Directive::OMPD_do:
         // 2.9.2 Worksharing-Loop construct.
-        genWsloopOp(converter, semaCtx, eval, currentLocation, clauses);
+        genWsloopOp(converter, symTable, semaCtx, eval, currentLocation,
+                    clauses);
         break;
       case llvm::omp::Directive::OMPD_parallel:
         // 2.6 PARALLEL construct.
@@ -2353,16 +2393,17 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
         break;
       case llvm::omp::Directive::OMPD_simd:
         // 2.9.3.1 SIMD construct.
-        genSimdOp(converter, semaCtx, eval, currentLocation, clauses);
+        genSimdOp(converter, symTable, semaCtx, eval, currentLocation, clauses);
         break;
       case llvm::omp::Directive::OMPD_target:
         // 2.12.5 TARGET construct.
-        genTargetOp(converter, semaCtx, eval, genNested, currentLocation,
-                    clauses, /*outerCombined=*/true);
+        genTargetOp(converter, symTable, semaCtx, eval, genNested,
+                    currentLocation, clauses, /*outerCombined=*/true);
         break;
       case llvm::omp::Directive::OMPD_taskloop:
         // 2.10.2 TASKLOOP construct.
-        genTaskloopOp(converter, semaCtx, eval, currentLocation, clauses);
+        genTaskloopOp(converter, symTable, semaCtx, eval, currentLocation,
+                      clauses);
         break;
       case llvm::omp::Directive::OMPD_teams:
         // 2.7 TEAMS construct.
@@ -2370,8 +2411,8 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
         // combined construct in this constext (e.g. target teams distribute).
         // Maybe rename the argument if it represents something else or
         // initialize it properly.
-        genTeamsOp(converter, semaCtx, eval, genNested, currentLocation,
-                   clauses, /*outerCombined=*/true);
+        genTeamsOp(converter, symTable, semaCtx, eval, genNested,
+                   currentLocation, clauses, /*outerCombined=*/true);
         break;
       case llvm::omp::Directive::OMPD_loop:
       case llvm::omp::Directive::OMPD_masked:
@@ -2433,7 +2474,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
   }
 
   // SECTIONS construct.
-  genSectionsOp(converter, semaCtx, eval, currentLocation, clauseOps);
+  genSectionsOp(converter, symTable, semaCtx, eval, currentLocation, clauseOps);
 
   // Generate nested SECTION operations recursively.
   const auto &sectionBlocks =
@@ -2443,8 +2484,8 @@ genOMP(Fortran::lower::AbstractConverter &converter,
   for (const auto &[nblock, neval] :
        llvm::zip(sectionBlocks.v, eval.getNestedEvaluations())) {
     symTable.pushScope();
-    genSectionOp(converter, semaCtx, neval, /*genNested=*/true, currentLocation,
-                 clauses);
+    genSectionOp(converter, symTable, semaCtx, neval, /*genNested=*/true,
+                 currentLocation, clauses);
     symTable.popScope();
     firOpBuilder.restoreInsertionPoint(ip);
   }
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
index 895340549f7c..b3f08eb81c79 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
@@ -138,7 +138,7 @@ ReductionProcessor::getReductionInitValue(mlir::Location loc, mlir::Type type,
     TODO(loc, "Reduction of some types is not supported");
   switch (redId) {
   case ReductionIdentifier::MAX: {
-    if (auto ty = type.dyn_cast<mlir::FloatType>()) {
+    if (auto ty = mlir::dyn_cast<mlir::FloatType>(type)) {
       const llvm::fltSemantics &sem = ty.getFloatSemantics();
       return builder.createRealConstant(
           loc, type, llvm::APFloat::getLargest(sem, /*Negative=*/true));
@@ -148,7 +148,7 @@ ReductionProcessor::getReductionInitValue(mlir::Location loc, mlir::Type type,
     return builder.createIntegerConstant(loc, type, minInt);
   }
   case ReductionIdentifier::MIN: {
-    if (auto ty = type.dyn_cast<mlir::FloatType>()) {
+    if (auto ty = mlir::dyn_cast<mlir::FloatType>(type)) {
       const llvm::fltSemantics &sem = ty.getFloatSemantics();
       return builder.createRealConstant(
           loc, type, llvm::APFloat::getLargest(sem, /*Negative=*/false));
@@ -188,12 +188,12 @@ ReductionProcessor::getReductionInitValue(mlir::Location loc, mlir::Type type,
       return fir::factory::Complex{builder, loc}.createComplex(type, initRe,
                                                                initIm);
     }
-    if (type.isa<mlir::FloatType>())
+    if (mlir::isa<mlir::FloatType>(type))
       return builder.create<mlir::arith::ConstantOp>(
           loc, type,
           builder.getFloatAttr(type, (double)getOperationIdentity(redId, loc)));
 
-    if (type.isa<fir::LogicalType>()) {
+    if (mlir::isa<fir::LogicalType>(type)) {
       mlir::Value intConst = builder.create<mlir::arith::ConstantOp>(
           loc, builder.getI1Type(),
           builder.getIntegerAttr(builder.getI1Type(),
@@ -474,11 +474,11 @@ createReductionCleanupRegion(fir::FirOpBuilder &builder, mlir::Location loc,
 
 // like fir::unwrapSeqOrBoxedSeqType except it also works for non-sequence boxes
 static mlir::Type unwrapSeqOrBoxedType(mlir::Type ty) {
-  if (auto seqTy = ty.dyn_cast<fir::SequenceType>())
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
     return seqTy.getEleTy();
-  if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>()) {
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
     auto eleTy = fir::unwrapRefType(boxTy.getEleTy());
-    if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>())
+    if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy))
       return seqTy.getEleTy();
     return eleTy;
   }
@@ -790,7 +790,7 @@ void ReductionProcessor::addDeclareReduction(
     for (mlir::Value symVal : reductionVars) {
       auto redType = mlir::cast<fir::ReferenceType>(symVal.getType());
       const auto &kindMap = firOpBuilder.getKindMap();
-      if (redType.getEleTy().isa<fir::LogicalType>())
+      if (mlir::isa<fir::LogicalType>(redType.getEleTy()))
         decl = createDeclareReduction(firOpBuilder,
                                       getReductionName(intrinsicOp, kindMap,
                                                        firOpBuilder.getI1Type(),
@@ -811,14 +811,11 @@ void ReductionProcessor::addDeclareReduction(
             *reductionIntrinsic)) {
       ReductionProcessor::ReductionIdentifier redId =
           ReductionProcessor::getReductionType(*reductionIntrinsic);
-      for (const Object &object : objectList) {
-        const Fortran::semantics::Symbol *symbol = object.id();
-        mlir::Value symVal = converter.getSymbolAddress(*symbol);
-        if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
-          symVal = declOp.getBase();
-        auto redType = symVal.getType().cast<fir::ReferenceType>();
+      for (mlir::Value symVal : reductionVars) {
+        auto redType = mlir::cast<fir::ReferenceType>(symVal.getType());
         if (!redType.getEleTy().isIntOrIndexOrFloat())
-          TODO(currentLocation, "User Defined Reduction on non-trivial type");
+          TODO(currentLocation,
+               "Reduction of some types is not supported for intrinsics");
         decl = createDeclareReduction(
             firOpBuilder,
             getReductionName(getRealName(*reductionIntrinsic).ToString(),
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index da3f2be73e50..c38e0c18cac8 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -11,10 +11,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "Utils.h"
-#include "Clauses.h"
 
+#include "Clauses.h"
 #include <flang/Lower/AbstractConverter.h>
 #include <flang/Lower/ConvertType.h>
+#include <flang/Lower/PFTBuilder.h>
 #include <flang/Optimizer/Builder/FIRBuilder.h>
 #include <flang/Parser/parse-tree.h>
 #include <flang/Parser/tools.h>
@@ -47,6 +48,12 @@ int64_t getCollapseValue(const List<Clause> &clauses) {
   return 1;
 }
 
+uint32_t getOpenMPVersion(mlir::ModuleOp mod) {
+  if (mlir::Attribute verAttr = mod->getAttr("omp.version"))
+    return llvm::cast<mlir::omp::VersionAttr>(verAttr).getVersion();
+  llvm_unreachable("Expecting OpenMP version attribute in module");
+}
+
 void genObjectList(const ObjectList &objects,
                    Fortran::lower::AbstractConverter &converter,
                    llvm::SmallVectorImpl<mlir::Value> &operands) {
@@ -81,6 +88,27 @@ mlir::Type getLoopVarType(Fortran::lower::AbstractConverter &converter,
   return converter.getFirOpBuilder().getIntegerType(loopVarTypeSize);
 }
 
+Fortran::semantics::Symbol *
+getIterationVariableSymbol(const Fortran::lower::pft::Evaluation &eval) {
+  return eval.visit(Fortran::common::visitors{
+      [&](const Fortran::parser::DoConstruct &doLoop) {
+        if (const auto &maybeCtrl = doLoop.GetLoopControl()) {
+          using LoopControl = Fortran::parser::LoopControl;
+          if (auto *bounds = std::get_if<LoopControl::Bounds>(&maybeCtrl->u)) {
+            static_assert(
+                std::is_same_v<decltype(bounds->name),
+                               Fortran::parser::Scalar<Fortran::parser::Name>>);
+            return bounds->name.thing.symbol;
+          }
+        }
+        return static_cast<Fortran::semantics::Symbol *>(nullptr);
+      },
+      [](auto &&) {
+        return static_cast<Fortran::semantics::Symbol *>(nullptr);
+      },
+  });
+}
+
 void gatherFuncAndVarSyms(
     const ObjectList &objects, mlir::omp::DeclareTargetCaptureClause clause,
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &symbolAndClause) {
diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h
index b3a9f7f30c98..5e0ebba23bf3 100644
--- a/flang/lib/Lower/OpenMP/Utils.h
+++ b/flang/lib/Lower/OpenMP/Utils.h
@@ -34,6 +34,9 @@ struct OmpObjectList;
 } // namespace parser
 
 namespace lower {
+namespace pft {
+struct Evaluation;
+}
 
 class AbstractConverter;
 
@@ -54,11 +57,15 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc,
 mlir::Type getLoopVarType(Fortran::lower::AbstractConverter &converter,
                           std::size_t loopVarTypeSize);
 
+Fortran::semantics::Symbol *
+getIterationVariableSymbol(const Fortran::lower::pft::Evaluation &eval);
+
 void gatherFuncAndVarSyms(
     const ObjectList &objects, mlir::omp::DeclareTargetCaptureClause clause,
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &symbolAndClause);
 
 int64_t getCollapseValue(const List<Clause> &clauses);
+uint32_t getOpenMPVersion(mlir::ModuleOp mod);
 
 Fortran::semantics::Symbol *
 getOmpObjectSymbol(const Fortran::parser::OmpObject &ompObject);
diff --git a/flang/lib/Lower/VectorSubscripts.cpp b/flang/lib/Lower/VectorSubscripts.cpp
index 7439b9f7df8f..d7a311d32d59 100644
--- a/flang/lib/Lower/VectorSubscripts.cpp
+++ b/flang/lib/Lower/VectorSubscripts.cpp
@@ -105,7 +105,7 @@ private:
   }
 
   mlir::Type gen(const Fortran::evaluate::Component &component) {
-    auto recTy = gen(component.base()).cast<fir::RecordType>();
+    auto recTy = mlir::cast<fir::RecordType>(gen(component.base()));
     const Fortran::semantics::Symbol &componentSymbol =
         component.GetLastSymbol();
     // Parent components will not be found here, they are not part
diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
index c403b9effbfa..f723e8f66e3e 100644
--- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
+++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
@@ -68,7 +68,7 @@ bool AliasAnalysis::Source::isPointerReference(mlir::Type ty) {
   if (!eleTy)
     return false;
 
-  return fir::isPointerType(eleTy) || eleTy.isa<fir::PointerType>();
+  return fir::isPointerType(eleTy) || mlir::isa<fir::PointerType>(eleTy);
 }
 
 bool AliasAnalysis::Source::isTargetOrPointer() const {
@@ -81,7 +81,7 @@ bool AliasAnalysis::Source::isRecordWithPointerComponent() const {
   if (!eleTy)
     return false;
   // TO DO: Look for pointer components
-  return eleTy.isa<fir::RecordType>();
+  return mlir::isa<fir::RecordType>(eleTy);
 }
 
 AliasResult AliasAnalysis::alias(Value lhs, Value rhs) {
diff --git a/flang/lib/Optimizer/Builder/BoxValue.cpp b/flang/lib/Optimizer/Builder/BoxValue.cpp
index 361fa59e2040..a90ce5570de7 100644
--- a/flang/lib/Optimizer/Builder/BoxValue.cpp
+++ b/flang/lib/Optimizer/Builder/BoxValue.cpp
@@ -191,7 +191,7 @@ bool fir::MutableBoxValue::verify() const {
   mlir::Type type = fir::dyn_cast_ptrEleTy(getAddr().getType());
   if (!type)
     return false;
-  auto box = type.dyn_cast<fir::BaseBoxType>();
+  auto box = mlir::dyn_cast<fir::BaseBoxType>(type);
   if (!box)
     return false;
   // A boxed value always takes a memory reference,
@@ -210,7 +210,7 @@ bool fir::MutableBoxValue::verify() const {
 /// Debug verifier for BoxValue ctor. There is no guarantee this will
 /// always be called.
 bool fir::BoxValue::verify() const {
-  if (!addr.getType().isa<fir::BaseBoxType>())
+  if (!mlir::isa<fir::BaseBoxType>(addr.getType()))
     return false;
   if (!lbounds.empty() && lbounds.size() != rank())
     return false;
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index 06339b116cd8..6d0aeb429d35 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -23,6 +23,7 @@ add_flang_library(FIRBuilder
   Runtime/Execute.cpp
   Runtime/Inquiry.cpp
   Runtime/Intrinsics.cpp
+  Runtime/Main.cpp
   Runtime/Numeric.cpp
   Runtime/Pointer.cpp
   Runtime/Ragged.cpp
diff --git a/flang/lib/Optimizer/Builder/Character.cpp b/flang/lib/Optimizer/Builder/Character.cpp
index af0786809cc2..b7a7453efdb3 100644
--- a/flang/lib/Optimizer/Builder/Character.cpp
+++ b/flang/lib/Optimizer/Builder/Character.cpp
@@ -26,11 +26,11 @@
 
 /// Unwrap all the ref and box types and return the inner element type.
 static mlir::Type unwrapBoxAndRef(mlir::Type type) {
-  if (auto boxType = type.dyn_cast<fir::BoxCharType>())
+  if (auto boxType = mlir::dyn_cast<fir::BoxCharType>(type))
     return boxType.getEleTy();
   while (true) {
     type = fir::unwrapRefType(type);
-    if (auto boxTy = type.dyn_cast<fir::BoxType>())
+    if (auto boxTy = mlir::dyn_cast<fir::BoxType>(type))
       type = boxTy.getEleTy();
     else
       break;
@@ -41,19 +41,19 @@ static mlir::Type unwrapBoxAndRef(mlir::Type type) {
 /// Unwrap base fir.char<kind,len> type.
 static fir::CharacterType recoverCharacterType(mlir::Type type) {
   type = fir::unwrapSequenceType(unwrapBoxAndRef(type));
-  if (auto charTy = type.dyn_cast<fir::CharacterType>())
+  if (auto charTy = mlir::dyn_cast<fir::CharacterType>(type))
     return charTy;
   llvm::report_fatal_error("expected a character type");
 }
 
 bool fir::factory::CharacterExprHelper::isCharacterScalar(mlir::Type type) {
   type = unwrapBoxAndRef(type);
-  return !type.isa<fir::SequenceType>() && fir::isa_char(type);
+  return !mlir::isa<fir::SequenceType>(type) && fir::isa_char(type);
 }
 
 bool fir::factory::CharacterExprHelper::isArray(mlir::Type type) {
   type = unwrapBoxAndRef(type);
-  if (auto seqTy = type.dyn_cast<fir::SequenceType>())
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(type))
     return fir::isa_char(seqTy.getEleTy());
   return false;
 }
@@ -92,7 +92,8 @@ getCompileTimeLength(const fir::CharBoxValue &box) {
 /// Detect the precondition that the value `str` does not reside in memory. Such
 /// values will have a type `!fir.array<...x!fir.char<N>>` or `!fir.char<N>`.
 LLVM_ATTRIBUTE_UNUSED static bool needToMaterialize(mlir::Value str) {
-  return str.getType().isa<fir::SequenceType>() || fir::isa_char(str.getType());
+  return mlir::isa<fir::SequenceType>(str.getType()) ||
+         fir::isa_char(str.getType());
 }
 
 /// This is called only if `str` does not reside in memory. Such a bare string
@@ -103,7 +104,7 @@ fir::factory::CharacterExprHelper::materializeValue(mlir::Value str) {
   assert(needToMaterialize(str));
   auto ty = str.getType();
   assert(isCharacterScalar(ty) && "expected scalar character");
-  auto charTy = ty.dyn_cast<fir::CharacterType>();
+  auto charTy = mlir::dyn_cast<fir::CharacterType>(ty);
   if (!charTy || charTy.getLen() == fir::CharacterType::unknownLen()) {
     LLVM_DEBUG(llvm::dbgs() << "cannot materialize: " << str << '\n');
     llvm_unreachable("must be a !fir.char<N> type");
@@ -129,7 +130,7 @@ fir::factory::CharacterExprHelper::toExtendedValue(mlir::Value character,
   if (auto eleType = fir::dyn_cast_ptrEleTy(type))
     type = eleType;
 
-  if (auto arrayType = type.dyn_cast<fir::SequenceType>()) {
+  if (auto arrayType = mlir::dyn_cast<fir::SequenceType>(type)) {
     type = arrayType.getEleTy();
     auto indexType = builder.getIndexType();
     for (auto extent : arrayType.getShape()) {
@@ -145,10 +146,10 @@ fir::factory::CharacterExprHelper::toExtendedValue(mlir::Value character,
       mlir::emitError(loc, "cannot retrieve array extents from type");
   }
 
-  if (auto charTy = type.dyn_cast<fir::CharacterType>()) {
+  if (auto charTy = mlir::dyn_cast<fir::CharacterType>(type)) {
     if (!resultLen && charTy.getLen() != fir::CharacterType::unknownLen())
       resultLen = builder.createIntegerConstant(loc, lenType, charTy.getLen());
-  } else if (auto boxCharType = type.dyn_cast<fir::BoxCharType>()) {
+  } else if (auto boxCharType = mlir::dyn_cast<fir::BoxCharType>(type)) {
     auto refType = builder.getRefType(boxCharType.getEleTy());
     // If the embox is accessible, use its operand to avoid filling
     // the generated fir with embox/unbox.
@@ -168,7 +169,7 @@ fir::factory::CharacterExprHelper::toExtendedValue(mlir::Value character,
     if (!resultLen) {
       resultLen = boxCharLen;
     }
-  } else if (type.isa<fir::BoxType>()) {
+  } else if (mlir::isa<fir::BoxType>(type)) {
     mlir::emitError(loc, "descriptor or derived type not yet handled");
   } else {
     llvm_unreachable("Cannot translate mlir::Value to character ExtendedValue");
@@ -221,7 +222,7 @@ fir::factory::CharacterExprHelper::createEmbox(const fir::CharBoxValue &box) {
 
 fir::CharBoxValue fir::factory::CharacterExprHelper::toScalarCharacter(
     const fir::CharArrayBoxValue &box) {
-  if (box.getBuffer().getType().isa<fir::PointerType>())
+  if (mlir::isa<fir::PointerType>(box.getBuffer().getType()))
     TODO(loc, "concatenating non contiguous character array into a scalar");
 
   // TODO: add a fast path multiplying new length at compile time if the info is
@@ -655,7 +656,7 @@ fir::factory::CharacterExprHelper::createUnboxChar(mlir::Value boxChar) {
 }
 
 bool fir::factory::CharacterExprHelper::isCharacterLiteral(mlir::Type type) {
-  if (auto seqType = type.dyn_cast<fir::SequenceType>())
+  if (auto seqType = mlir::dyn_cast<fir::SequenceType>(type))
     return (seqType.getShape().size() == 1) &&
            fir::isa_char(seqType.getEleTy());
   return false;
@@ -728,9 +729,9 @@ mlir::Value fir::factory::CharacterExprHelper::getLength(mlir::Value memref) {
   if (charType.hasConstantLen())
     return builder.createIntegerConstant(loc, builder.getCharacterLengthType(),
                                          charType.getLen());
-  if (memrefType.isa<fir::BoxType>())
+  if (mlir::isa<fir::BoxType>(memrefType))
     return readLengthFromBox(memref);
-  if (memrefType.isa<fir::BoxCharType>())
+  if (mlir::isa<fir::BoxCharType>(memrefType))
     return createUnboxChar(memref).second;
 
   // Length cannot be deduced from memref.
@@ -742,14 +743,14 @@ fir::factory::extractCharacterProcedureTuple(fir::FirOpBuilder &builder,
                                              mlir::Location loc,
                                              mlir::Value tuple,
                                              bool openBoxProc) {
-  mlir::TupleType tupleType = tuple.getType().cast<mlir::TupleType>();
+  mlir::TupleType tupleType = mlir::cast<mlir::TupleType>(tuple.getType());
   mlir::Value addr = builder.create<fir::ExtractValueOp>(
       loc, tupleType.getType(0), tuple,
       builder.getArrayAttr(
           {builder.getIntegerAttr(builder.getIndexType(), 0)}));
   mlir::Value proc = [&]() -> mlir::Value {
     if (openBoxProc)
-      if (auto addrTy = addr.getType().dyn_cast<fir::BoxProcType>())
+      if (auto addrTy = mlir::dyn_cast<fir::BoxProcType>(addr.getType()))
         return builder.create<fir::BoxAddrOp>(loc, addrTy.getEleTy(), addr);
     return addr;
   }();
@@ -763,7 +764,7 @@ fir::factory::extractCharacterProcedureTuple(fir::FirOpBuilder &builder,
 mlir::Value fir::factory::createCharacterProcedureTuple(
     fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type argTy,
     mlir::Value addr, mlir::Value len) {
-  mlir::TupleType tupleType = argTy.cast<mlir::TupleType>();
+  mlir::TupleType tupleType = mlir::cast<mlir::TupleType>(argTy);
   addr = builder.createConvert(loc, tupleType.getType(0), addr);
   if (len)
     len = builder.createConvert(loc, tupleType.getType(1), len);
@@ -866,7 +867,7 @@ fir::factory::convertCharacterKind(fir::FirOpBuilder &builder,
   auto kindMap = builder.getKindMap();
   mlir::Value boxCharAddr = srcBoxChar.getAddr();
   auto fromTy = boxCharAddr.getType();
-  if (auto charTy = fromTy.dyn_cast<fir::CharacterType>()) {
+  if (auto charTy = mlir::dyn_cast<fir::CharacterType>(fromTy)) {
     // boxchar is a value, not a variable. Turn it into a temporary.
     // As a value, it ought to have a constant LEN value.
     assert(charTy.hasConstantLen() && "must have constant length");
@@ -875,7 +876,7 @@ fir::factory::convertCharacterKind(fir::FirOpBuilder &builder,
     boxCharAddr = tmp;
   }
   auto fromBits = kindMap.getCharacterBitsize(
-      fir::unwrapRefType(fromTy).cast<fir::CharacterType>().getFKind());
+      mlir::cast<fir::CharacterType>(fir::unwrapRefType(fromTy)).getFKind());
   auto toBits = kindMap.getCharacterBitsize(toKind);
   if (toBits < fromBits) {
     // Scale by relative ratio to give a buffer of the same length.
diff --git a/flang/lib/Optimizer/Builder/Complex.cpp b/flang/lib/Optimizer/Builder/Complex.cpp
index e97cb3067808..cbcd4f850014 100644
--- a/flang/lib/Optimizer/Builder/Complex.cpp
+++ b/flang/lib/Optimizer/Builder/Complex.cpp
@@ -14,7 +14,8 @@
 
 mlir::Type
 fir::factory::Complex::getComplexPartType(mlir::Type complexType) const {
-  return builder.getRealType(complexType.cast<fir::ComplexType>().getFKind());
+  return builder.getRealType(
+      mlir::cast<fir::ComplexType>(complexType).getFKind());
 }
 
 mlir::Type fir::factory::Complex::getComplexPartType(mlir::Value cplx) const {
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index a0fbae5b614c..a6da38763726 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -90,7 +90,7 @@ fir::FirOpBuilder::getNamedGlobal(mlir::ModuleOp modOp,
 }
 
 mlir::Type fir::FirOpBuilder::getRefType(mlir::Type eleTy) {
-  assert(!eleTy.isa<fir::ReferenceType>() && "cannot be a reference type");
+  assert(!mlir::isa<fir::ReferenceType>(eleTy) && "cannot be a reference type");
   return fir::ReferenceType::get(eleTy);
 }
 
@@ -147,7 +147,7 @@ mlir::Value
 fir::FirOpBuilder::createRealConstant(mlir::Location loc, mlir::Type fltTy,
                                       llvm::APFloat::integerPart val) {
   auto apf = [&]() -> llvm::APFloat {
-    if (auto ty = fltTy.dyn_cast<fir::RealType>())
+    if (auto ty = mlir::dyn_cast<fir::RealType>(fltTy))
       return llvm::APFloat(kindMap.getFloatSemantics(ty.getFKind()), val);
     if (fltTy.isF16())
       return llvm::APFloat(llvm::APFloat::IEEEhalf(), val);
@@ -169,7 +169,7 @@ fir::FirOpBuilder::createRealConstant(mlir::Location loc, mlir::Type fltTy,
 mlir::Value fir::FirOpBuilder::createRealConstant(mlir::Location loc,
                                                   mlir::Type fltTy,
                                                   const llvm::APFloat &value) {
-  if (fltTy.isa<mlir::FloatType>()) {
+  if (mlir::isa<mlir::FloatType>(fltTy)) {
     auto attr = getFloatAttr(fltTy, value);
     return create<mlir::arith::ConstantOp>(loc, fltTy, attr);
   }
@@ -178,7 +178,7 @@ mlir::Value fir::FirOpBuilder::createRealConstant(mlir::Location loc,
 
 static llvm::SmallVector<mlir::Value>
 elideExtentsAlreadyInType(mlir::Type type, mlir::ValueRange shape) {
-  auto arrTy = type.dyn_cast<fir::SequenceType>();
+  auto arrTy = mlir::dyn_cast<fir::SequenceType>(type);
   if (shape.empty() || !arrTy)
     return {};
   // elide the constant dimensions before construction
@@ -195,7 +195,7 @@ static llvm::SmallVector<mlir::Value>
 elideLengthsAlreadyInType(mlir::Type type, mlir::ValueRange lenParams) {
   if (lenParams.empty())
     return {};
-  if (auto arrTy = type.dyn_cast<fir::SequenceType>())
+  if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(type))
     type = arrTy.getEleTy();
   if (fir::hasDynamicSize(type))
     return lenParams;
@@ -264,7 +264,7 @@ mlir::Value fir::FirOpBuilder::createTemporaryAlloc(
     mlir::Location loc, mlir::Type type, llvm::StringRef name,
     mlir::ValueRange lenParams, mlir::ValueRange shape,
     llvm::ArrayRef<mlir::NamedAttribute> attrs) {
-  assert(!type.isa<fir::ReferenceType>() && "cannot be a reference");
+  assert(!mlir::isa<fir::ReferenceType>(type) && "cannot be a reference");
   // If the alloca is inside an OpenMP Op which will be outlined then pin
   // the alloca here.
   const bool pinned =
@@ -310,7 +310,7 @@ mlir::Value fir::FirOpBuilder::createHeapTemporary(
   llvm::SmallVector<mlir::Value> dynamicLength =
       elideLengthsAlreadyInType(type, lenParams);
 
-  assert(!type.isa<fir::ReferenceType>() && "cannot be a reference");
+  assert(!mlir::isa<fir::ReferenceType>(type) && "cannot be a reference");
   return create<fir::AllocMemOp>(loc, type, /*unique_name=*/llvm::StringRef{},
                                  name, dynamicLength, dynamicShape, attrs);
 }
@@ -376,8 +376,9 @@ mlir::Value fir::FirOpBuilder::convertWithSemantics(
     // imaginary part is zero
     auto eleTy = helper.getComplexPartType(toTy);
     auto cast = createConvert(loc, eleTy, val);
-    llvm::APFloat zero{
-        kindMap.getFloatSemantics(toTy.cast<fir::ComplexType>().getFKind()), 0};
+    llvm::APFloat zero{kindMap.getFloatSemantics(
+                           mlir::cast<fir::ComplexType>(toTy).getFKind()),
+                       0};
     auto imag = createRealConstant(loc, eleTy, zero);
     return helper.createComplex(toTy, cast, imag);
   }
@@ -388,14 +389,14 @@ mlir::Value fir::FirOpBuilder::convertWithSemantics(
     return createConvert(loc, toTy, rp);
   }
   if (allowCharacterConversion) {
-    if (fromTy.isa<fir::BoxCharType>()) {
+    if (mlir::isa<fir::BoxCharType>(fromTy)) {
       // Extract the address of the character string and pass it
       fir::factory::CharacterExprHelper charHelper{*this, loc};
       std::pair<mlir::Value, mlir::Value> unboxchar =
           charHelper.createUnboxChar(val);
       return createConvert(loc, toTy, unboxchar.first);
     }
-    if (auto boxType = toTy.dyn_cast<fir::BoxCharType>()) {
+    if (auto boxType = mlir::dyn_cast<fir::BoxCharType>(toTy)) {
       // Extract the address of the actual argument and create a boxed
       // character value with an undefined length
       // TODO: We should really calculate the total size of the actual
@@ -415,10 +416,10 @@ mlir::Value fir::FirOpBuilder::convertWithSemantics(
             "element types expected to match"));
     return create<fir::BoxAddrOp>(loc, toTy, val);
   }
-  if (fir::isa_ref_type(fromTy) && toTy.isa<fir::BoxProcType>()) {
+  if (fir::isa_ref_type(fromTy) && mlir::isa<fir::BoxProcType>(toTy)) {
     // Call is expecting a boxed procedure, not a reference to other data type.
     // Convert the reference to a procedure and embox it.
-    mlir::Type procTy = toTy.cast<fir::BoxProcType>().getEleTy();
+    mlir::Type procTy = mlir::cast<fir::BoxProcType>(toTy).getEleTy();
     mlir::Value proc = createConvert(loc, procTy, val);
     return create<fir::EmboxProcOp>(loc, toTy, proc);
   }
@@ -428,7 +429,7 @@ mlir::Value fir::FirOpBuilder::convertWithSemantics(
     if (((fir::isPolymorphicType(fromTy) &&
           (fir::isAllocatableType(fromTy) || fir::isPointerType(fromTy)) &&
           fir::isPolymorphicType(toTy)) ||
-         (fir::isPolymorphicType(fromTy) && toTy.isa<fir::BoxType>())) &&
+         (fir::isPolymorphicType(fromTy) && mlir::isa<fir::BoxType>(toTy))) &&
         !(fir::isUnlimitedPolymorphicType(fromTy) && fir::isAssumedType(toTy)))
       return create<fir::ReboxOp>(loc, toTy, val, mlir::Value{},
                                   /*slice=*/mlir::Value{});
@@ -581,7 +582,7 @@ mlir::Value fir::FirOpBuilder::createBox(mlir::Location loc,
                                          bool isPolymorphic,
                                          bool isAssumedType) {
   mlir::Value itemAddr = fir::getBase(exv);
-  if (itemAddr.getType().isa<fir::BaseBoxType>())
+  if (mlir::isa<fir::BaseBoxType>(itemAddr.getType()))
     return itemAddr;
   auto elementType = fir::dyn_cast_ptrEleTy(itemAddr.getType());
   if (!elementType) {
@@ -592,7 +593,7 @@ mlir::Value fir::FirOpBuilder::createBox(mlir::Location loc,
   mlir::Type boxTy;
   mlir::Value tdesc;
   // Avoid to wrap a box/class with box/class.
-  if (elementType.isa<fir::BaseBoxType>()) {
+  if (mlir::isa<fir::BaseBoxType>(elementType)) {
     boxTy = elementType;
   } else {
     boxTy = fir::BoxType::get(elementType);
@@ -709,7 +710,7 @@ mlir::Value fir::FirOpBuilder::genAbsentOp(mlir::Location loc,
     return create<fir::AbsentOp>(loc, argTy);
 
   auto boxProc =
-      create<fir::AbsentOp>(loc, argTy.cast<mlir::TupleType>().getType(0));
+      create<fir::AbsentOp>(loc, mlir::cast<mlir::TupleType>(argTy).getType(0));
   mlir::Value charLen = create<fir::UndefOp>(loc, getCharacterLengthType());
   return fir::factory::createCharacterProcedureTuple(*this, loc, argTy, boxProc,
                                                      charLen);
@@ -958,14 +959,14 @@ static llvm::SmallVector<mlir::Value> getFromBox(mlir::Location loc,
                                                  fir::FirOpBuilder &builder,
                                                  mlir::Type valTy,
                                                  mlir::Value boxVal) {
-  if (auto boxTy = valTy.dyn_cast<fir::BaseBoxType>()) {
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(valTy)) {
     auto eleTy = fir::unwrapAllRefAndSeqType(boxTy.getEleTy());
-    if (auto recTy = eleTy.dyn_cast<fir::RecordType>()) {
+    if (auto recTy = mlir::dyn_cast<fir::RecordType>(eleTy)) {
       if (recTy.getNumLenParams() > 0) {
         // Walk each type parameter in the record and get the value.
         TODO(loc, "generate code to get LEN type parameters");
       }
-    } else if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+    } else if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
       if (charTy.hasDynamicLen()) {
         auto idxTy = builder.getIndexType();
         auto eleSz = builder.create<fir::BoxEleSizeOp>(loc, idxTy, boxVal);
@@ -1012,7 +1013,7 @@ llvm::SmallVector<mlir::Value>
 fir::factory::getTypeParams(mlir::Location loc, fir::FirOpBuilder &builder,
                             fir::ArrayLoadOp load) {
   mlir::Type memTy = load.getMemref().getType();
-  if (auto boxTy = memTy.dyn_cast<fir::BaseBoxType>())
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(memTy))
     return getFromBox(loc, builder, boxTy, load.getMemref());
   return load.getTypeparams();
 }
@@ -1039,7 +1040,7 @@ std::string fir::factory::uniqueCGIdent(llvm::StringRef prefix,
 
 mlir::Value fir::factory::locationToFilename(fir::FirOpBuilder &builder,
                                              mlir::Location loc) {
-  if (auto flc = loc.dyn_cast<mlir::FileLineColLoc>()) {
+  if (auto flc = mlir::dyn_cast<mlir::FileLineColLoc>(loc)) {
     // must be encoded as asciiz, C string
     auto fn = flc.getFilename().str() + '\0';
     return fir::getBase(createStringLiteral(builder, loc, fn));
@@ -1050,7 +1051,7 @@ mlir::Value fir::factory::locationToFilename(fir::FirOpBuilder &builder,
 mlir::Value fir::factory::locationToLineNo(fir::FirOpBuilder &builder,
                                            mlir::Location loc,
                                            mlir::Type type) {
-  if (auto flc = loc.dyn_cast<mlir::FileLineColLoc>())
+  if (auto flc = mlir::dyn_cast<mlir::FileLineColLoc>(loc))
     return builder.createIntegerConstant(loc, type, flc.getLine());
   return builder.createIntegerConstant(loc, type, 0);
 }
@@ -1108,10 +1109,10 @@ fir::ExtendedValue fir::factory::componentToExtendedValue(
   auto fieldTy = component.getType();
   if (auto ty = fir::dyn_cast_ptrEleTy(fieldTy))
     fieldTy = ty;
-  if (fieldTy.isa<fir::BaseBoxType>()) {
+  if (mlir::isa<fir::BaseBoxType>(fieldTy)) {
     llvm::SmallVector<mlir::Value> nonDeferredTypeParams;
     auto eleTy = fir::unwrapSequenceType(fir::dyn_cast_ptrOrBoxEleTy(fieldTy));
-    if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+    if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
       auto lenTy = builder.getCharacterLengthType();
       if (charTy.hasConstantLen())
         nonDeferredTypeParams.emplace_back(
@@ -1120,7 +1121,7 @@ fir::ExtendedValue fir::factory::componentToExtendedValue(
       // on a PDT length parameter. There is no way to make a difference with
       // deferred length here yet.
     }
-    if (auto recTy = eleTy.dyn_cast<fir::RecordType>())
+    if (auto recTy = mlir::dyn_cast<fir::RecordType>(eleTy))
       if (recTy.getNumLenParams() > 0)
         TODO(loc, "allocatable and pointer components non deferred length "
                   "parameters");
@@ -1129,7 +1130,7 @@ fir::ExtendedValue fir::factory::componentToExtendedValue(
                                 /*mutableProperties=*/{});
   }
   llvm::SmallVector<mlir::Value> extents;
-  if (auto seqTy = fieldTy.dyn_cast<fir::SequenceType>()) {
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(fieldTy)) {
     fieldTy = seqTy.getEleTy();
     auto idxTy = builder.getIndexType();
     for (auto extent : seqTy.getShape()) {
@@ -1138,7 +1139,7 @@ fir::ExtendedValue fir::factory::componentToExtendedValue(
       extents.emplace_back(builder.createIntegerConstant(loc, idxTy, extent));
     }
   }
-  if (auto charTy = fieldTy.dyn_cast<fir::CharacterType>()) {
+  if (auto charTy = mlir::dyn_cast<fir::CharacterType>(fieldTy)) {
     auto cstLen = charTy.getLen();
     if (cstLen == fir::CharacterType::unknownLen())
       TODO(loc, "get character component length from length type parameters");
@@ -1148,7 +1149,7 @@ fir::ExtendedValue fir::factory::componentToExtendedValue(
       return fir::CharArrayBoxValue{component, len, extents};
     return fir::CharBoxValue{component, len};
   }
-  if (auto recordTy = fieldTy.dyn_cast<fir::RecordType>())
+  if (auto recordTy = mlir::dyn_cast<fir::RecordType>(fieldTy))
     if (recordTy.getNumLenParams() != 0)
       TODO(loc,
            "lower component ref that is a derived type with length parameter");
@@ -1211,14 +1212,14 @@ void fir::factory::genScalarAssignment(fir::FirOpBuilder &builder,
   assert(lhs.rank() == 0 && rhs.rank() == 0 && "must be scalars");
   auto type = fir::unwrapSequenceType(
       fir::unwrapPassByRefType(fir::getBase(lhs).getType()));
-  if (type.isa<fir::CharacterType>()) {
+  if (mlir::isa<fir::CharacterType>(type)) {
     const fir::CharBoxValue *toChar = lhs.getCharBox();
     const fir::CharBoxValue *fromChar = rhs.getCharBox();
     assert(toChar && fromChar);
     fir::factory::CharacterExprHelper helper{builder, loc};
     helper.createAssign(fir::ExtendedValue{*toChar},
                         fir::ExtendedValue{*fromChar});
-  } else if (type.isa<fir::RecordType>()) {
+  } else if (mlir::isa<fir::RecordType>(type)) {
     fir::factory::genRecordAssignment(builder, loc, lhs, rhs, needFinalization,
                                       isTemporaryLHS);
   } else {
@@ -1239,10 +1240,10 @@ static void genComponentByComponentAssignment(fir::FirOpBuilder &builder,
                                               const fir::ExtendedValue &rhs,
                                               bool isTemporaryLHS) {
   auto lbaseType = fir::unwrapPassByRefType(fir::getBase(lhs).getType());
-  auto lhsType = lbaseType.dyn_cast<fir::RecordType>();
+  auto lhsType = mlir::dyn_cast<fir::RecordType>(lbaseType);
   assert(lhsType && "lhs must be a scalar record type");
   auto rbaseType = fir::unwrapPassByRefType(fir::getBase(rhs).getType());
-  auto rhsType = rbaseType.dyn_cast<fir::RecordType>();
+  auto rhsType = mlir::dyn_cast<fir::RecordType>(rbaseType);
   assert(rhsType && "rhs must be a scalar record type");
   auto fieldIndexType = fir::FieldType::get(lhsType.getContext());
   for (auto [lhsPair, rhsPair] :
@@ -1261,7 +1262,7 @@ static void genComponentByComponentAssignment(fir::FirOpBuilder &builder,
     mlir::Value toCoor = builder.create<fir::CoordinateOp>(
         loc, fieldRefType, fir::getBase(lhs), field);
     std::optional<fir::DoLoopOp> outerLoop;
-    if (auto sequenceType = lFieldTy.dyn_cast<fir::SequenceType>()) {
+    if (auto sequenceType = mlir::dyn_cast<fir::SequenceType>(lFieldTy)) {
       // Create loops to assign array components elements by elements.
       // Note that, since these are components, they either do not overlap,
       // or are the same and exactly overlap. They also have compile time
@@ -1288,10 +1289,9 @@ static void genComponentByComponentAssignment(fir::FirOpBuilder &builder,
                                                    fromCoor, indices);
     }
     if (auto fieldEleTy = fir::unwrapSequenceType(lFieldTy);
-        fieldEleTy.isa<fir::BaseBoxType>()) {
-      assert(fieldEleTy.cast<fir::BaseBoxType>()
-                 .getEleTy()
-                 .isa<fir::PointerType>() &&
+        mlir::isa<fir::BaseBoxType>(fieldEleTy)) {
+      assert(mlir::isa<fir::PointerType>(
+                 mlir::cast<fir::BaseBoxType>(fieldEleTy).getEleTy()) &&
              "allocatable members require deep copy");
       auto fromPointerValue = builder.create<fir::LoadOp>(loc, fromCoor);
       auto castTo = builder.createConvert(loc, fieldEleTy, fromPointerValue);
@@ -1320,11 +1320,11 @@ static bool recordTypeCanBeMemCopied(fir::RecordType recordType) {
   for (auto [_, fieldType] : recordType.getTypeList()) {
     // Derived type component may have user assignment (so far, we cannot tell
     // in FIR, so assume it is always the case, TODO: get the actual info).
-    if (fir::unwrapSequenceType(fieldType).isa<fir::RecordType>())
+    if (mlir::isa<fir::RecordType>(fir::unwrapSequenceType(fieldType)))
       return false;
     // Allocatable components need deep copy.
-    if (auto boxType = fieldType.dyn_cast<fir::BaseBoxType>())
-      if (boxType.getEleTy().isa<fir::HeapType>())
+    if (auto boxType = mlir::dyn_cast<fir::BaseBoxType>(fieldType))
+      if (mlir::isa<fir::HeapType>(boxType.getEleTy()))
         return false;
   }
   // Constant size components without user defined assignment and pointers can
@@ -1353,9 +1353,10 @@ void fir::factory::genRecordAssignment(fir::FirOpBuilder &builder,
   // Box operands may be polymorphic, it is not entirely clear from 10.2.1.3
   // if the assignment is performed on the dynamic of declared type. Use the
   // runtime assuming it is performed on the dynamic type.
-  bool hasBoxOperands = fir::getBase(lhs).getType().isa<fir::BaseBoxType>() ||
-                        fir::getBase(rhs).getType().isa<fir::BaseBoxType>();
-  auto recTy = baseTy.dyn_cast<fir::RecordType>();
+  bool hasBoxOperands =
+      mlir::isa<fir::BaseBoxType>(fir::getBase(lhs).getType()) ||
+      mlir::isa<fir::BaseBoxType>(fir::getBase(rhs).getType());
+  auto recTy = mlir::dyn_cast<fir::RecordType>(baseTy);
   assert(recTy && "must be a record type");
   if ((needFinalization && mayHaveFinalizer(recTy, builder)) ||
       hasBoxOperands || !recordTypeCanBeMemCopied(recTy)) {
@@ -1401,7 +1402,7 @@ mlir::Value fir::factory::genLenOfCharacter(
     llvm::ArrayRef<mlir::Value> path, llvm::ArrayRef<mlir::Value> substring) {
   llvm::SmallVector<mlir::Value> typeParams(arrLoad.getTypeparams());
   return genLenOfCharacter(builder, loc,
-                           arrLoad.getType().cast<fir::SequenceType>(),
+                           mlir::cast<fir::SequenceType>(arrLoad.getType()),
                            arrLoad.getMemref(), typeParams, path, substring);
 }
 
@@ -1429,7 +1430,7 @@ mlir::Value fir::factory::genLenOfCharacter(
     lower = builder.createConvert(loc, idxTy, substring.front());
   auto eleTy = fir::applyPathToType(seqTy, path);
   if (!fir::hasDynamicSize(eleTy)) {
-    if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+    if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
       // Use LEN from the type.
       return builder.createIntegerConstant(loc, idxTy, charTy.getLen());
     }
@@ -1438,9 +1439,9 @@ mlir::Value fir::factory::genLenOfCharacter(
                         "application of path did not result in a !fir.char");
   }
   if (fir::isa_box_type(memref.getType())) {
-    if (memref.getType().isa<fir::BoxCharType>())
+    if (mlir::isa<fir::BoxCharType>(memref.getType()))
       return builder.create<fir::BoxCharLenOp>(loc, idxTy, memref);
-    if (memref.getType().isa<fir::BoxType>())
+    if (mlir::isa<fir::BoxType>(memref.getType()))
       return CharacterExprHelper(builder, loc).readLengthFromBox(memref);
     fir::emitFatalError(loc, "memref has wrong type");
   }
@@ -1457,7 +1458,7 @@ mlir::Value fir::factory::genLenOfCharacter(
 mlir::Value fir::factory::createZeroValue(fir::FirOpBuilder &builder,
                                           mlir::Location loc, mlir::Type type) {
   mlir::Type i1 = builder.getIntegerType(1);
-  if (type.isa<fir::LogicalType>() || type == i1)
+  if (mlir::isa<fir::LogicalType>(type) || type == i1)
     return builder.createConvert(loc, type, builder.createBool(loc, false));
   if (fir::isa_integer(type))
     return builder.createIntegerConstant(loc, type, 0);
@@ -1507,7 +1508,7 @@ mlir::Value fir::factory::genMaxWithZero(fir::FirOpBuilder &builder,
   mlir::Value zero = builder.createIntegerConstant(loc, value.getType(), 0);
   if (mlir::Operation *definingOp = value.getDefiningOp())
     if (auto cst = mlir::dyn_cast<mlir::arith::ConstantOp>(definingOp))
-      if (auto intAttr = cst.getValue().dyn_cast<mlir::IntegerAttr>())
+      if (auto intAttr = mlir::dyn_cast<mlir::IntegerAttr>(cst.getValue()))
         return intAttr.getInt() > 0 ? value : zero;
   mlir::Value valueIsGreater = builder.create<mlir::arith::CmpIOp>(
       loc, mlir::arith::CmpIPredicate::sgt, value, zero);
@@ -1519,8 +1520,8 @@ mlir::Value fir::factory::genCPtrOrCFunptrAddr(fir::FirOpBuilder &builder,
                                                mlir::Location loc,
                                                mlir::Value cPtr,
                                                mlir::Type ty) {
-  assert(ty.isa<fir::RecordType>());
-  auto recTy = ty.dyn_cast<fir::RecordType>();
+  assert(mlir::isa<fir::RecordType>(ty));
+  auto recTy = mlir::dyn_cast<fir::RecordType>(ty);
   assert(recTy.getTypeList().size() == 1);
   auto fieldName = recTy.getTypeList()[0].first;
   mlir::Type fieldTy = recTy.getTypeList()[0].second;
@@ -1582,7 +1583,7 @@ mlir::Value fir::factory::genCPtrOrCFunptrValue(fir::FirOpBuilder &builder,
 mlir::Value fir::factory::createNullBoxProc(fir::FirOpBuilder &builder,
                                             mlir::Location loc,
                                             mlir::Type boxType) {
-  auto boxTy{boxType.dyn_cast<fir::BoxProcType>()};
+  auto boxTy{mlir::dyn_cast<fir::BoxProcType>(boxType)};
   if (!boxTy)
     fir::emitFatalError(loc, "Procedure pointer must be of BoxProcType");
   auto boxEleTy{fir::unwrapRefType(boxTy.getEleTy())};
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index db638ceb4070..44779427ab55 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -38,10 +38,10 @@ hlfir::getExplicitExtentsFromShape(mlir::Value shape,
   } else if (mlir::dyn_cast_or_null<fir::ShiftOp>(shapeOp)) {
     return {};
   } else if (auto s = mlir::dyn_cast_or_null<hlfir::ShapeOfOp>(shapeOp)) {
-    hlfir::ExprType expr = s.getExpr().getType().cast<hlfir::ExprType>();
+    hlfir::ExprType expr = mlir::cast<hlfir::ExprType>(s.getExpr().getType());
     llvm::ArrayRef<int64_t> exprShape = expr.getShape();
     mlir::Type indexTy = builder.getIndexType();
-    fir::ShapeType shapeTy = shape.getType().cast<fir::ShapeType>();
+    fir::ShapeType shapeTy = mlir::cast<fir::ShapeType>(shape.getType());
     result.reserve(shapeTy.getRank());
     for (unsigned i = 0; i < shapeTy.getRank(); ++i) {
       int64_t extent = exprShape[i];
@@ -99,7 +99,7 @@ genLboundsAndExtentsFromBox(mlir::Location loc, fir::FirOpBuilder &builder,
                             hlfir::Entity boxEntity,
                             llvm::SmallVectorImpl<mlir::Value> &lbounds,
                             llvm::SmallVectorImpl<mlir::Value> *extents) {
-  assert(boxEntity.getType().isa<fir::BaseBoxType>() && "must be a box");
+  assert(mlir::isa<fir::BaseBoxType>(boxEntity.getType()) && "must be a box");
   mlir::Type idxTy = builder.getIndexType();
   const int rank = boxEntity.getRank();
   for (int i = 0; i < rank; ++i) {
@@ -154,7 +154,7 @@ static mlir::Value genCharacterVariableLength(mlir::Location loc,
                                               hlfir::Entity var) {
   if (mlir::Value len = tryGettingNonDeferredCharLen(var))
     return len;
-  auto charType = var.getFortranElementType().cast<fir::CharacterType>();
+  auto charType = mlir::cast<fir::CharacterType>(var.getFortranElementType());
   if (charType.hasConstantLen())
     return builder.createIntegerConstant(loc, builder.getIndexType(),
                                          charType.getLen());
@@ -172,7 +172,7 @@ static fir::CharBoxValue genUnboxChar(mlir::Location loc,
   if (auto emboxChar = boxChar.getDefiningOp<fir::EmboxCharOp>())
     return {emboxChar.getMemref(), emboxChar.getLen()};
   mlir::Type refType = fir::ReferenceType::get(
-      boxChar.getType().cast<fir::BoxCharType>().getEleTy());
+      mlir::cast<fir::BoxCharType>(boxChar.getType()).getEleTy());
   auto unboxed = builder.create<fir::UnboxCharOp>(
       loc, refType, builder.getIndexType(), boxChar);
   mlir::Value addr = unboxed.getResult(0);
@@ -252,8 +252,8 @@ hlfir::genAssociateExpr(mlir::Location loc, fir::FirOpBuilder &builder,
   // and the other static).
   mlir::Type varEleTy = getFortranElementType(variableType);
   mlir::Type valueEleTy = getFortranElementType(value.getType());
-  if (varEleTy != valueEleTy && !(valueEleTy.isa<fir::CharacterType>() &&
-                                  varEleTy.isa<fir::CharacterType>())) {
+  if (varEleTy != valueEleTy && !(mlir::isa<fir::CharacterType>(valueEleTy) &&
+                                  mlir::isa<fir::CharacterType>(varEleTy))) {
     assert(value.isScalar() && fir::isa_trivial(value.getType()));
     source = builder.createConvert(loc, fir::unwrapPassByRefType(variableType),
                                    value);
@@ -278,9 +278,9 @@ mlir::Value hlfir::genVariableRawAddress(mlir::Location loc,
   if (var.isMutableBox())
     baseAddr = builder.create<fir::LoadOp>(loc, baseAddr);
   // Get raw address.
-  if (var.getType().isa<fir::BoxCharType>())
+  if (mlir::isa<fir::BoxCharType>(var.getType()))
     baseAddr = genUnboxChar(loc, builder, var.getBase()).getAddr();
-  if (baseAddr.getType().isa<fir::BaseBoxType>())
+  if (mlir::isa<fir::BaseBoxType>(baseAddr.getType()))
     baseAddr = builder.create<fir::BoxAddrOp>(loc, baseAddr);
   return baseAddr;
 }
@@ -289,13 +289,13 @@ mlir::Value hlfir::genVariableBoxChar(mlir::Location loc,
                                       fir::FirOpBuilder &builder,
                                       hlfir::Entity var) {
   assert(var.isVariable() && "only address of variables can be taken");
-  if (var.getType().isa<fir::BoxCharType>())
+  if (mlir::isa<fir::BoxCharType>(var.getType()))
     return var;
   mlir::Value addr = genVariableRawAddress(loc, builder, var);
   llvm::SmallVector<mlir::Value> lengths;
   genLengthParameters(loc, builder, var, lengths);
   assert(lengths.size() == 1);
-  auto charType = var.getFortranElementType().cast<fir::CharacterType>();
+  auto charType = mlir::cast<fir::CharacterType>(var.getFortranElementType());
   auto boxCharType =
       fir::BoxCharType::get(builder.getContext(), charType.getFKind());
   auto scalarAddr =
@@ -309,7 +309,7 @@ hlfir::Entity hlfir::genVariableBox(mlir::Location loc,
                                     hlfir::Entity var) {
   assert(var.isVariable() && "must be a variable");
   var = hlfir::derefPointersAndAllocatables(loc, builder, var);
-  if (var.getType().isa<fir::BaseBoxType>())
+  if (mlir::isa<fir::BaseBoxType>(var.getType()))
     return var;
   // Note: if the var is not a fir.box/fir.class at that point, it has default
   // lower bounds and is not polymorphic.
@@ -317,11 +317,11 @@ hlfir::Entity hlfir::genVariableBox(mlir::Location loc,
       var.isArray() ? hlfir::genShape(loc, builder, var) : mlir::Value{};
   llvm::SmallVector<mlir::Value> typeParams;
   auto maybeCharType =
-      var.getFortranElementType().dyn_cast<fir::CharacterType>();
+      mlir::dyn_cast<fir::CharacterType>(var.getFortranElementType());
   if (!maybeCharType || maybeCharType.hasDynamicLen())
     hlfir::genLengthParameters(loc, builder, var, typeParams);
   mlir::Value addr = var.getBase();
-  if (var.getType().isa<fir::BoxCharType>())
+  if (mlir::isa<fir::BoxCharType>(var.getType()))
     addr = genVariableRawAddress(loc, builder, var);
   mlir::Type boxType = fir::BoxType::get(var.getElementOrSequenceType());
   auto embox =
@@ -348,7 +348,7 @@ hlfir::Entity hlfir::getElementAt(mlir::Location loc,
     return entity;
   llvm::SmallVector<mlir::Value> lenParams;
   genLengthParameters(loc, builder, entity, lenParams);
-  if (entity.getType().isa<hlfir::ExprType>())
+  if (mlir::isa<hlfir::ExprType>(entity.getType()))
     return hlfir::Entity{builder.create<hlfir::ApplyOp>(
         loc, entity, oneBasedIndices, lenParams)};
   // Build hlfir.designate. The lower bounds may need to be added to
@@ -394,7 +394,7 @@ static mlir::Value genUBound(mlir::Location loc, fir::FirOpBuilder &builder,
 llvm::SmallVector<std::pair<mlir::Value, mlir::Value>>
 hlfir::genBounds(mlir::Location loc, fir::FirOpBuilder &builder,
                  Entity entity) {
-  if (entity.getType().isa<hlfir::ExprType>())
+  if (mlir::isa<hlfir::ExprType>(entity.getType()))
     TODO(loc, "bounds of expressions in hlfir");
   auto [exv, cleanup] = translateToExtendedValue(loc, builder, entity);
   assert(!cleanup && "translation of entity should not yield cleanup");
@@ -415,8 +415,8 @@ hlfir::genBounds(mlir::Location loc, fir::FirOpBuilder &builder,
 llvm::SmallVector<std::pair<mlir::Value, mlir::Value>>
 hlfir::genBounds(mlir::Location loc, fir::FirOpBuilder &builder,
                  mlir::Value shape) {
-  assert((shape.getType().isa<fir::ShapeShiftType>() ||
-          shape.getType().isa<fir::ShapeType>()) &&
+  assert((mlir::isa<fir::ShapeShiftType>(shape.getType()) ||
+          mlir::isa<fir::ShapeType>(shape.getType())) &&
          "shape must contain extents");
   auto extents = hlfir::getExplicitExtentsFromShape(shape, builder);
   auto lowers = getExplicitLboundsFromShape(shape);
@@ -474,7 +474,7 @@ static mlir::Value computeVariableExtent(mlir::Location loc,
     if (typeExtent != fir::SequenceType::getUnknownExtent())
       return builder.createIntegerConstant(loc, idxTy, typeExtent);
   }
-  assert(variable.getType().isa<fir::BaseBoxType>() &&
+  assert(mlir::isa<fir::BaseBoxType>(variable.getType()) &&
          "array variable with dynamic extent must be boxed");
   mlir::Value dimVal = builder.createIntegerConstant(loc, idxTy, dim);
   auto dimInfo = builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy,
@@ -496,9 +496,8 @@ llvm::SmallVector<mlir::Value> getVariableExtents(mlir::Location loc,
     variable = hlfir::derefPointersAndAllocatables(loc, builder, variable);
   // Use the type shape information, and/or the fir.box/fir.class shape
   // information if any extents are not static.
-  fir::SequenceType seqTy =
-      hlfir::getFortranElementOrSequenceType(variable.getType())
-          .cast<fir::SequenceType>();
+  fir::SequenceType seqTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(variable.getType()));
   unsigned rank = seqTy.getShape().size();
   for (unsigned dim = 0; dim < rank; ++dim)
     extents.push_back(
@@ -507,7 +506,7 @@ llvm::SmallVector<mlir::Value> getVariableExtents(mlir::Location loc,
 }
 
 static mlir::Value tryRetrievingShapeOrShift(hlfir::Entity entity) {
-  if (entity.getType().isa<hlfir::ExprType>()) {
+  if (mlir::isa<hlfir::ExprType>(entity.getType())) {
     if (auto elemental = entity.getDefiningOp<hlfir::ElementalOp>())
       return elemental.getShape();
     return mlir::Value{};
@@ -523,13 +522,13 @@ mlir::Value hlfir::genShape(mlir::Location loc, fir::FirOpBuilder &builder,
   entity = followShapeInducingSource(entity);
   assert(entity && "what?");
   if (auto shape = tryRetrievingShapeOrShift(entity)) {
-    if (shape.getType().isa<fir::ShapeType>())
+    if (mlir::isa<fir::ShapeType>(shape.getType()))
       return shape;
-    if (shape.getType().isa<fir::ShapeShiftType>())
+    if (mlir::isa<fir::ShapeShiftType>(shape.getType()))
       if (auto s = shape.getDefiningOp<fir::ShapeShiftOp>())
         return builder.create<fir::ShapeOp>(loc, s.getExtents());
   }
-  if (entity.getType().isa<hlfir::ExprType>())
+  if (mlir::isa<hlfir::ExprType>(entity.getType()))
     return builder.create<hlfir::ShapeOfOp>(loc, entity.getBase());
   // There is no shape lying around for this entity. Retrieve the extents and
   // build a new fir.shape.
@@ -563,9 +562,8 @@ mlir::Value hlfir::genExtent(mlir::Location loc, fir::FirOpBuilder &builder,
       entity = hlfir::derefPointersAndAllocatables(loc, builder, entity);
     // Use the type shape information, and/or the fir.box/fir.class shape
     // information if any extents are not static.
-    fir::SequenceType seqTy =
-        hlfir::getFortranElementOrSequenceType(entity.getType())
-            .cast<fir::SequenceType>();
+    fir::SequenceType seqTy = mlir::cast<fir::SequenceType>(
+        hlfir::getFortranElementOrSequenceType(entity.getType()));
     return computeVariableExtent(loc, builder, entity, seqTy, dim);
   }
   TODO(loc, "get extent from HLFIR expr without producer holding the shape");
@@ -584,7 +582,7 @@ mlir::Value hlfir::genLBound(mlir::Location loc, fir::FirOpBuilder &builder,
   }
   if (entity.isMutableBox())
     entity = hlfir::derefPointersAndAllocatables(loc, builder, entity);
-  assert(entity.getType().isa<fir::BaseBoxType>() && "must be a box");
+  assert(mlir::isa<fir::BaseBoxType>(entity.getType()) && "must be a box");
   mlir::Type idxTy = builder.getIndexType();
   mlir::Value dimVal = builder.createIntegerConstant(loc, idxTy, dim);
   auto dimInfo =
@@ -597,7 +595,7 @@ void hlfir::genLengthParameters(mlir::Location loc, fir::FirOpBuilder &builder,
                                 llvm::SmallVectorImpl<mlir::Value> &result) {
   if (!entity.hasLengthParameters())
     return;
-  if (entity.getType().isa<hlfir::ExprType>()) {
+  if (mlir::isa<hlfir::ExprType>(entity.getType())) {
     mlir::Value expr = entity;
     if (auto reassoc = expr.getDefiningOp<hlfir::NoReassocOp>())
       expr = reassoc.getVal();
@@ -654,8 +652,8 @@ static mlir::Value asEmboxShape(mlir::Location loc, fir::FirOpBuilder &builder,
   // fir.shape_shift) since this information is already in the input fir.box,
   // it only accepts fir.shift because local lower bounds may not be reflected
   // in the fir.box.
-  if (fir::getBase(exv).getType().isa<fir::BaseBoxType>() &&
-      !shape.getType().isa<fir::ShiftType>())
+  if (mlir::isa<fir::BaseBoxType>(fir::getBase(exv).getType()) &&
+      !mlir::isa<fir::ShiftType>(shape.getType()))
     return builder.createShape(loc, exv);
   return shape;
 }
@@ -686,7 +684,7 @@ hlfir::Entity hlfir::derefPointersAndAllocatables(mlir::Location loc,
       if (!entity.isPolymorphic() && !entity.hasLengthParameters())
         return hlfir::Entity{builder.create<fir::BoxAddrOp>(loc, boxLoad)};
       mlir::Type elementType = boxLoad.getFortranElementType();
-      if (auto charType = elementType.dyn_cast<fir::CharacterType>()) {
+      if (auto charType = mlir::dyn_cast<fir::CharacterType>(elementType)) {
         mlir::Value base = builder.create<fir::BoxAddrOp>(loc, boxLoad);
         if (charType.hasConstantLen())
           return hlfir::Entity{base};
@@ -716,7 +714,7 @@ mlir::Type hlfir::getVariableElementType(hlfir::Entity variable) {
   mlir::Type eleTy = variable.getFortranElementType();
   if (variable.isPolymorphic())
     return fir::ClassType::get(eleTy);
-  if (auto charType = eleTy.dyn_cast<fir::CharacterType>()) {
+  if (auto charType = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
     if (charType.hasDynamicLen())
       return fir::BoxCharType::get(charType.getContext(), charType.getFKind());
   } else if (fir::isRecordWithTypeParameters(eleTy)) {
@@ -737,7 +735,7 @@ mlir::Type hlfir::getEntityElementType(hlfir::Entity entity) {
 
 static hlfir::ExprType getArrayExprType(mlir::Type elementType,
                                         mlir::Value shape, bool isPolymorphic) {
-  unsigned rank = shape.getType().cast<fir::ShapeType>().getRank();
+  unsigned rank = mlir::cast<fir::ShapeType>(shape.getType()).getRank();
   hlfir::ExprType::Shape typeShape(rank, hlfir::ExprType::getUnknownExtent());
   if (auto shapeOp = shape.getDefiningOp<fir::ShapeOp>())
     for (auto extent : llvm::enumerate(shapeOp.getExtents()))
@@ -859,7 +857,7 @@ translateVariableToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
     return fir::MutableBoxValue(base, getExplicitTypeParams(variable),
                                 fir::MutableProperties{});
 
-  if (base.getType().isa<fir::BaseBoxType>()) {
+  if (mlir::isa<fir::BaseBoxType>(base.getType())) {
     if (!variable.isSimplyContiguous() || variable.isPolymorphic() ||
         variable.isDerivedWithLengthParameters() || variable.isOptional()) {
       llvm::SmallVector<mlir::Value> nonDefaultLbounds =
@@ -874,7 +872,7 @@ translateVariableToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
 
   if (variable.isScalar()) {
     if (variable.isCharacter()) {
-      if (base.getType().isa<fir::BoxCharType>())
+      if (mlir::isa<fir::BoxCharType>(base.getType()))
         return genUnboxChar(loc, builder, base);
       mlir::Value len = genCharacterVariableLength(loc, builder, variable);
       return fir::CharBoxValue{base, len};
@@ -883,7 +881,7 @@ translateVariableToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
   }
   llvm::SmallVector<mlir::Value> extents;
   llvm::SmallVector<mlir::Value> nonDefaultLbounds;
-  if (variable.getType().isa<fir::BaseBoxType>() &&
+  if (mlir::isa<fir::BaseBoxType>(variable.getType()) &&
       !variable.getIfVariableInterface()) {
     // This special case avoids generating two sets of identical
     // fir.box_dim to get both the lower bounds and extents.
@@ -923,7 +921,7 @@ hlfir::translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
     return {static_cast<mlir::Value>(entity), std::nullopt};
   }
 
-  if (entity.getType().isa<hlfir::ExprType>()) {
+  if (mlir::isa<hlfir::ExprType>(entity.getType())) {
     mlir::NamedAttribute byRefAttr = fir::getAdaptToByRefAttr(builder);
     hlfir::AssociateOp associate = hlfir::genAssociateExpr(
         loc, builder, entity, entity.getType(), "", byRefAttr);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index e28d14cd318d..9d72e76e2369 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -786,7 +786,7 @@ mlir::Value genLibSplitComplexArgsCall(fir::FirOpBuilder &builder,
 
   auto getSplitComplexArgsType = [&builder, &args]() -> mlir::FunctionType {
     mlir::Type ctype = args[0].getType();
-    auto fKind = ctype.cast<fir::ComplexType>().getFKind();
+    auto fKind = mlir::cast<fir::ComplexType>(ctype).getFKind();
     mlir::Type ftype;
 
     if (fKind == 2)
@@ -894,8 +894,8 @@ mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc,
   LLVM_DEBUG(llvm::dbgs() << "Generating '" << mathLibFuncName
                           << "' operation with type ";
              mathLibFuncType.dump(); llvm::dbgs() << "\n");
-  auto type = mathLibFuncType.getInput(0).cast<fir::ComplexType>();
-  auto kind = type.getElementType().cast<fir::RealType>().getFKind();
+  auto type = mlir::cast<fir::ComplexType>(mathLibFuncType.getInput(0));
+  auto kind = mlir::cast<fir::RealType>(type.getElementType()).getFKind();
   auto realTy = builder.getRealType(kind);
   auto mComplexTy = mlir::ComplexType::get(realTy);
 
@@ -1394,14 +1394,14 @@ private:
 
   // Floating point can be mlir::FloatType or fir::real
   static unsigned getFloatingPointWidth(mlir::Type t) {
-    if (auto f{t.dyn_cast<mlir::FloatType>()})
+    if (auto f{mlir::dyn_cast<mlir::FloatType>(t)})
       return f.getWidth();
     // FIXME: Get width another way for fir.real/complex
     // - use fir/KindMapping.h and llvm::Type
     // - or use evaluate/type.h
-    if (auto r{t.dyn_cast<fir::RealType>()})
+    if (auto r{mlir::dyn_cast<fir::RealType>(t)})
       return r.getFKind() * 4;
-    if (auto cplx{t.dyn_cast<fir::ComplexType>()})
+    if (auto cplx{mlir::dyn_cast<fir::ComplexType>(t)})
       return cplx.getFKind() * 4;
     llvm_unreachable("not a floating-point type");
   }
@@ -1410,8 +1410,8 @@ private:
     if (from == to)
       return Conversion::None;
 
-    if (auto fromIntTy{from.dyn_cast<mlir::IntegerType>()}) {
-      if (auto toIntTy{to.dyn_cast<mlir::IntegerType>()}) {
+    if (auto fromIntTy{mlir::dyn_cast<mlir::IntegerType>(from)}) {
+      if (auto toIntTy{mlir::dyn_cast<mlir::IntegerType>(to)}) {
         return fromIntTy.getWidth() > toIntTy.getWidth() ? Conversion::Narrow
                                                          : Conversion::Extend;
       }
@@ -1423,8 +1423,8 @@ private:
                  : Conversion::Extend;
     }
 
-    if (auto fromCplxTy{from.dyn_cast<fir::ComplexType>()}) {
-      if (auto toCplxTy{to.dyn_cast<fir::ComplexType>()}) {
+    if (auto fromCplxTy{mlir::dyn_cast<fir::ComplexType>(from)}) {
+      if (auto toCplxTy{mlir::dyn_cast<fir::ComplexType>(to)}) {
         return getFloatingPointWidth(fromCplxTy) >
                        getFloatingPointWidth(toCplxTy)
                    ? Conversion::Narrow
@@ -1550,10 +1550,10 @@ fir::ExtendedValue toExtendedValue(mlir::Value val, fir::FirOpBuilder &builder,
   if (charHelper.isCharacterScalar(type))
     return charHelper.toExtendedValue(val);
 
-  if (auto refType = type.dyn_cast<fir::ReferenceType>())
+  if (auto refType = mlir::dyn_cast<fir::ReferenceType>(type))
     type = refType.getEleTy();
 
-  if (auto arrayType = type.dyn_cast<fir::SequenceType>()) {
+  if (auto arrayType = mlir::dyn_cast<fir::SequenceType>(type)) {
     type = arrayType.getEleTy();
     for (fir::SequenceType::Extent extent : arrayType.getShape()) {
       if (extent == fir::SequenceType::getUnknownExtent())
@@ -1566,7 +1566,8 @@ fir::ExtendedValue toExtendedValue(mlir::Value val, fir::FirOpBuilder &builder,
     // have been used in the interface).
     if (extents.size() + 1 < arrayType.getShape().size())
       mlir::emitError(loc, "cannot retrieve array extents from type");
-  } else if (type.isa<fir::BoxType>() || type.isa<fir::RecordType>()) {
+  } else if (mlir::isa<fir::BoxType>(type) ||
+             mlir::isa<fir::RecordType>(type)) {
     fir::emitFatalError(loc, "not yet implemented: descriptor or derived type");
   }
 
@@ -1580,10 +1581,10 @@ mlir::Value toValue(const fir::ExtendedValue &val, fir::FirOpBuilder &builder,
   if (const fir::CharBoxValue *charBox = val.getCharBox()) {
     mlir::Value buffer = charBox->getBuffer();
     auto buffTy = buffer.getType();
-    if (buffTy.isa<mlir::FunctionType>())
+    if (mlir::isa<mlir::FunctionType>(buffTy))
       fir::emitFatalError(
           loc, "A character's buffer type cannot be a function type.");
-    if (buffTy.isa<fir::BoxCharType>())
+    if (mlir::isa<fir::BoxCharType>(buffTy))
       return buffer;
     return fir::factory::CharacterExprHelper{builder, loc}.createEmboxChar(
         buffer, charBox->getLen());
@@ -1827,27 +1828,27 @@ IntrinsicLibrary::invokeGenerator(SubroutineGenerator generator,
 /// Note: mlir has Type::dump(ostream) methods but it may add "!" that is not
 /// suitable for function names.
 static std::string typeToString(mlir::Type t) {
-  if (auto refT{t.dyn_cast<fir::ReferenceType>()})
+  if (auto refT{mlir::dyn_cast<fir::ReferenceType>(t)})
     return "ref_" + typeToString(refT.getEleTy());
-  if (auto i{t.dyn_cast<mlir::IntegerType>()}) {
+  if (auto i{mlir::dyn_cast<mlir::IntegerType>(t)}) {
     return "i" + std::to_string(i.getWidth());
   }
-  if (auto cplx{t.dyn_cast<fir::ComplexType>()}) {
+  if (auto cplx{mlir::dyn_cast<fir::ComplexType>(t)}) {
     return "z" + std::to_string(cplx.getFKind());
   }
-  if (auto real{t.dyn_cast<fir::RealType>()}) {
+  if (auto real{mlir::dyn_cast<fir::RealType>(t)}) {
     return "r" + std::to_string(real.getFKind());
   }
-  if (auto f{t.dyn_cast<mlir::FloatType>()}) {
+  if (auto f{mlir::dyn_cast<mlir::FloatType>(t)}) {
     return "f" + std::to_string(f.getWidth());
   }
-  if (auto logical{t.dyn_cast<fir::LogicalType>()}) {
+  if (auto logical{mlir::dyn_cast<fir::LogicalType>(t)}) {
     return "l" + std::to_string(logical.getFKind());
   }
-  if (auto character{t.dyn_cast<fir::CharacterType>()}) {
+  if (auto character{mlir::dyn_cast<fir::CharacterType>(t)}) {
     return "c" + std::to_string(character.getFKind());
   }
-  if (auto boxCharacter{t.dyn_cast<fir::BoxCharType>()}) {
+  if (auto boxCharacter{mlir::dyn_cast<fir::BoxCharType>(t)}) {
     return "bc" + std::to_string(boxCharacter.getEleTy().getFKind());
   }
   llvm_unreachable("no mangling for type");
@@ -1907,7 +1908,7 @@ mlir::func::FuncOp IntrinsicLibrary::getWrapper(GeneratorType generator,
     mlir::Location localLoc = localBuilder->getUnknownLoc();
     llvm::SmallVector<mlir::Value> localArguments;
     for (mlir::BlockArgument bArg : function.front().getArguments()) {
-      auto refType = bArg.getType().dyn_cast<fir::ReferenceType>();
+      auto refType = mlir::dyn_cast<fir::ReferenceType>(bArg.getType());
       if (loadRefArguments && refType) {
         auto loaded = localBuilder->create<fir::LoadOp>(localLoc, bArg);
         localArguments.push_back(loaded);
@@ -2060,7 +2061,7 @@ mlir::SymbolRefAttr IntrinsicLibrary::getUnrestrictedIntrinsicSymbolRefAttr(
   if (!funcOp) {
     llvm::SmallVector<mlir::Type> argTypes;
     for (mlir::Type type : signature.getInputs()) {
-      if (auto refType = type.dyn_cast<fir::ReferenceType>())
+      if (auto refType = mlir::dyn_cast<fir::ReferenceType>(type))
         argTypes.push_back(refType.getEleTy());
       else
         argTypes.push_back(type);
@@ -2145,7 +2146,7 @@ mlir::Value IntrinsicLibrary::genAbs(mlir::Type resultType,
     // math::AbsFOp but it does not support all fir floating point types.
     return genRuntimeCall("abs", resultType, args);
   }
-  if (auto intType = type.dyn_cast<mlir::IntegerType>()) {
+  if (auto intType = mlir::dyn_cast<mlir::IntegerType>(type)) {
     // At the time of this implementation there is no abs op in mlir.
     // So, implement abs here without branching.
     mlir::Value shift =
@@ -2379,8 +2380,8 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType,
                                 llvm::ArrayRef<fir::ExtendedValue> args) {
   assert(args.size() == 2);
   mlir::Type ptrTy = fir::getBase(args[0]).getType();
-  if (ptrTy &&
-      (fir::isBoxProcAddressType(ptrTy) || ptrTy.isa<fir::BoxProcType>())) {
+  if (ptrTy && (fir::isBoxProcAddressType(ptrTy) ||
+                mlir::isa<fir::BoxProcType>(ptrTy))) {
     mlir::Value pointerBoxProc =
         fir::isBoxProcAddressType(ptrTy)
             ? builder.create<fir::LoadOp>(loc, fir::getBase(args[0]))
@@ -2392,7 +2393,7 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType,
     mlir::Value target = fir::getBase(args[1]);
     if (fir::isBoxProcAddressType(target.getType()))
       target = builder.create<fir::LoadOp>(loc, target);
-    if (target.getType().isa<fir::BoxProcType>())
+    if (mlir::isa<fir::BoxProcType>(target.getType()))
       target = builder.create<fir::BoxAddrOp>(loc, target);
     mlir::Type intPtrTy = builder.getIntPtrType();
     mlir::Value pointerInt =
@@ -2649,7 +2650,7 @@ static mlir::Value getAddrFromBox(fir::FirOpBuilder &builder,
   mlir::Value argValue = fir::getBase(arg);
   mlir::Value addr{nullptr};
   if (isFunc) {
-    auto funcTy = argValue.getType().cast<fir::BoxProcType>().getEleTy();
+    auto funcTy = mlir::cast<fir::BoxProcType>(argValue.getType()).getEleTy();
     addr = builder.create<fir::BoxAddrOp>(loc, funcTy, argValue);
   } else {
     const auto *box = arg.getBoxOf<fir::BoxValue>();
@@ -3029,7 +3030,7 @@ void IntrinsicLibrary::genDateAndTime(llvm::ArrayRef<fir::ExtendedValue> args) {
 mlir::Value IntrinsicLibrary::genDim(mlir::Type resultType,
                                      llvm::ArrayRef<mlir::Value> args) {
   assert(args.size() == 2);
-  if (resultType.isa<mlir::IntegerType>()) {
+  if (mlir::isa<mlir::IntegerType>(resultType)) {
     mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
     auto diff = builder.create<mlir::arith::SubIOp>(loc, args[0], args[1]);
     auto cmp = builder.create<mlir::arith::CmpIOp>(
@@ -3574,7 +3575,7 @@ IntrinsicLibrary::genReduction(FN func, FD funcDim, llvm::StringRef errMsg,
   if (absentDim || rank == 1) {
     mlir::Type ty = array.getType();
     mlir::Type arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-    auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+    auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
     if (fir::isa_complex(eleTy)) {
       mlir::Value result = builder.createTemporary(loc, eleTy);
       func(builder, loc, array, mask, result);
@@ -3646,7 +3647,7 @@ mlir::Value IntrinsicLibrary::genIbits(mlir::Type resultType,
   mlir::Value pos = builder.createConvert(loc, resultType, args[1]);
   mlir::Value len = builder.createConvert(loc, resultType, args[2]);
   mlir::Value bitSize = builder.createIntegerConstant(
-      loc, resultType, resultType.cast<mlir::IntegerType>().getWidth());
+      loc, resultType, mlir::cast<mlir::IntegerType>(resultType).getWidth());
   auto shiftCount = builder.create<mlir::arith::SubIOp>(loc, bitSize, len);
   mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
   mlir::Value ones = builder.createAllOnesInteger(loc, resultType);
@@ -3686,7 +3687,7 @@ IntrinsicLibrary::genIchar(mlir::Type resultType,
   mlir::Value buffer = charBox->getBuffer();
   mlir::Type bufferTy = buffer.getType();
   mlir::Value charVal;
-  if (auto charTy = bufferTy.dyn_cast<fir::CharacterType>()) {
+  if (auto charTy = mlir::dyn_cast<fir::CharacterType>(bufferTy)) {
     assert(charTy.singleton());
     charVal = buffer;
   } else {
@@ -3759,7 +3760,7 @@ void IntrinsicLibrary::genRaiseExcept(int except, mlir::Value cond) {
 static std::pair<mlir::Value, mlir::Type>
 getFieldRef(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value rec) {
   auto recType =
-      fir::unwrapPassByRefType(rec.getType()).dyn_cast<fir::RecordType>();
+      mlir::dyn_cast<fir::RecordType>(fir::unwrapPassByRefType(rec.getType()));
   assert(recType.getTypeList().size() == 1 && "expected exactly one component");
   auto [fieldName, fieldTy] = recType.getTypeList().front();
   mlir::Value field = builder.create<fir::FieldIndexOp>(
@@ -3808,7 +3809,7 @@ mlir::Value IntrinsicLibrary::genIeeeClass(mlir::Type resultType,
 
   assert(args.size() == 1);
   mlir::Value realVal = args[0];
-  mlir::FloatType realType = realVal.getType().dyn_cast<mlir::FloatType>();
+  mlir::FloatType realType = mlir::dyn_cast<mlir::FloatType>(realVal.getType());
   const unsigned intWidth = realType.getWidth();
   mlir::Type intType = builder.getIntegerType(intWidth);
   mlir::Value intVal =
@@ -4056,8 +4057,10 @@ IntrinsicLibrary::genIeeeCopySign(mlir::Type resultType,
   assert(args.size() == 2);
   mlir::Value xRealVal = args[0];
   mlir::Value yRealVal = args[1];
-  mlir::FloatType xRealType = xRealVal.getType().dyn_cast<mlir::FloatType>();
-  mlir::FloatType yRealType = yRealVal.getType().dyn_cast<mlir::FloatType>();
+  mlir::FloatType xRealType =
+      mlir::dyn_cast<mlir::FloatType>(xRealVal.getType());
+  mlir::FloatType yRealType =
+      mlir::dyn_cast<mlir::FloatType>(yRealVal.getType());
 
   if (yRealType == mlir::FloatType::getBF16(builder.getContext())) {
     // Workaround: CopySignOp and BitcastOp don't work for kind 3 arg Y.
@@ -4106,7 +4109,7 @@ void IntrinsicLibrary::genIeeeGetFlag(llvm::ArrayRef<fir::ExtendedValue> args) {
   mlir::Value flag = fir::getBase(args[0]);
   mlir::Value flagValue = fir::getBase(args[1]);
   mlir::Type resultTy =
-      flagValue.getType().dyn_cast<fir::ReferenceType>().getEleTy();
+      mlir::dyn_cast<fir::ReferenceType>(flagValue.getType()).getEleTy();
   mlir::Type i32Ty = builder.getIntegerType(32);
   mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
   auto [fieldRef, ignore] = getFieldRef(builder, loc, flag);
@@ -4130,7 +4133,7 @@ void IntrinsicLibrary::genIeeeGetHaltingMode(
   mlir::Value flag = fir::getBase(args[0]);
   mlir::Value halting = fir::getBase(args[1]);
   mlir::Type resultTy =
-      halting.getType().dyn_cast<fir::ReferenceType>().getEleTy();
+      mlir::dyn_cast<fir::ReferenceType>(halting.getType()).getEleTy();
   mlir::Type i32Ty = builder.getIntegerType(32);
   mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
   auto [fieldRef, ignore] = getFieldRef(builder, loc, flag);
@@ -4248,7 +4251,7 @@ mlir::Value IntrinsicLibrary::genIeeeLogb(mlir::Type resultType,
   //                 : ieee_copy_sign(X, 1.0) // +infinity or NaN
   assert(args.size() == 1);
   mlir::Value realVal = args[0];
-  mlir::FloatType realType = realVal.getType().dyn_cast<mlir::FloatType>();
+  mlir::FloatType realType = mlir::dyn_cast<mlir::FloatType>(realVal.getType());
   int bitWidth = realType.getWidth();
   mlir::Type intType = builder.getIntegerType(realType.getWidth());
   mlir::Value intVal =
@@ -4545,7 +4548,7 @@ mlir::Value IntrinsicLibrary::genIeeeSignbit(mlir::Type resultType,
   // Check if the sign bit of arg X is set.
   assert(args.size() == 1);
   mlir::Value realVal = args[0];
-  mlir::FloatType realType = realVal.getType().dyn_cast<mlir::FloatType>();
+  mlir::FloatType realType = mlir::dyn_cast<mlir::FloatType>(realVal.getType());
   int bitWidth = realType.getWidth();
   if (realType == mlir::FloatType::getBF16(builder.getContext())) {
     // Workaround: can't bitcast or convert real(3) to integer(2) or real(2).
@@ -4642,7 +4645,7 @@ mlir::Value IntrinsicLibrary::genIeeeValue(mlir::Type resultType,
   // A compiler generated call has one argument:
   //  - arg[0] is an index constant
   assert(args.size() == 1 || args.size() == 2);
-  mlir::FloatType realType = resultType.dyn_cast<mlir::FloatType>();
+  mlir::FloatType realType = mlir::dyn_cast<mlir::FloatType>(resultType);
   int bitWidth = realType.getWidth();
   mlir::Type intType = builder.getIntegerType(bitWidth);
   mlir::Type valueTy = bitWidth <= 64 ? intType : builder.getIntegerType(64);
@@ -4884,7 +4887,7 @@ mlir::Value IntrinsicLibrary::genIshft(mlir::Type resultType,
   //                    : I << abs(SHIFT)
   assert(args.size() == 2);
   mlir::Value bitSize = builder.createIntegerConstant(
-      loc, resultType, resultType.cast<mlir::IntegerType>().getWidth());
+      loc, resultType, mlir::cast<mlir::IntegerType>(resultType).getWidth());
   mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
   mlir::Value shift = builder.createConvert(loc, resultType, args[1]);
   mlir::Value absShift = genAbs(resultType, {shift});
@@ -4920,7 +4923,7 @@ mlir::Value IntrinsicLibrary::genIshftc(mlir::Type resultType,
   // Return:  SHIFT == 0 || SIZE == abs(SHIFT) ? I : (unchanged | left | right)
   assert(args.size() == 3);
   mlir::Value bitSize = builder.createIntegerConstant(
-      loc, resultType, resultType.cast<mlir::IntegerType>().getWidth());
+      loc, resultType, mlir::cast<mlir::IntegerType>(resultType).getWidth());
   mlir::Value I = args[0];
   mlir::Value shift = builder.createConvert(loc, resultType, args[1]);
   mlir::Value size =
@@ -5027,7 +5030,7 @@ IntrinsicLibrary::genLoc(mlir::Type resultType,
   mlir::Value box = fir::getBase(args[0]);
   assert(fir::isa_box_type(box.getType()) &&
          "argument must have been lowered to box type");
-  bool isFunc = box.getType().isa<fir::BoxProcType>();
+  bool isFunc = mlir::isa<fir::BoxProcType>(box.getType());
   if (!isOptional(box)) {
     mlir::Value argAddr = getAddrFromBox(builder, loc, args[0], isFunc);
     return builder.createConvert(loc, resultType, argAddr);
@@ -5156,7 +5159,7 @@ IntrinsicLibrary::genMerge(mlir::Type,
   auto convertToStaticType = [&](mlir::Value polymorphic,
                                  mlir::Value other) -> mlir::Value {
     mlir::Type otherType = other.getType();
-    if (otherType.isa<fir::BaseBoxType>())
+    if (mlir::isa<fir::BaseBoxType>(otherType))
       return builder.create<fir::ReboxOp>(loc, otherType, polymorphic,
                                           /*shape*/ mlir::Value{},
                                           /*slice=*/mlir::Value{});
@@ -5209,7 +5212,7 @@ mlir::Value IntrinsicLibrary::genMergeBits(mlir::Type resultType,
 mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
                                      llvm::ArrayRef<mlir::Value> args) {
   assert(args.size() == 2);
-  if (resultType.isa<mlir::IntegerType>())
+  if (mlir::isa<mlir::IntegerType>(resultType))
     return builder.create<mlir::arith::RemSIOp>(loc, args[0], args[1]);
 
   // Use runtime.
@@ -5231,7 +5234,7 @@ mlir::Value IntrinsicLibrary::genModulo(mlir::Type resultType,
   //  - Otherwise, when A/P < 0 and MOD(A,P) !=0, then MODULO(A, P) =
   //    A-FLOOR(A/P)*P = A-(INT(A/P)-1)*P = A-INT(A/P)*P+P = MOD(A,P)+P
   // Note that A/P < 0 if and only if A and P signs are different.
-  if (resultType.isa<mlir::IntegerType>()) {
+  if (mlir::isa<mlir::IntegerType>(resultType)) {
     auto remainder =
         builder.create<mlir::arith::RemSIOp>(loc, args[0], args[1]);
     auto argXor = builder.create<mlir::arith::XOrIOp>(loc, args[0], args[1]);
@@ -5344,7 +5347,7 @@ void IntrinsicLibrary::genMvbits(llvm::ArrayRef<fir::ExtendedValue> args) {
   mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
   mlir::Value ones = builder.createAllOnesInteger(loc, resultType);
   mlir::Value bitSize = builder.createIntegerConstant(
-      loc, resultType, resultType.cast<mlir::IntegerType>().getWidth());
+      loc, resultType, mlir::cast<mlir::IntegerType>(resultType).getWidth());
   auto shiftCount = builder.create<mlir::arith::SubIOp>(loc, bitSize, len);
   auto mask = builder.create<mlir::arith::ShRUIOp>(loc, ones, shiftCount);
   auto unchangedTmp1 = builder.create<mlir::arith::ShLIOp>(loc, mask, topos);
@@ -5628,7 +5631,7 @@ IntrinsicLibrary::genReshape(mlir::Type resultType,
   assert(fir::BoxValue(shape).rank() == 1);
   mlir::Type shapeTy = shape.getType();
   mlir::Type shapeArrTy = fir::dyn_cast_ptrOrBoxEleTy(shapeTy);
-  auto resultRank = shapeArrTy.cast<fir::SequenceType>().getShape()[0];
+  auto resultRank = mlir::cast<fir::SequenceType>(shapeArrTy).getShape()[0];
 
   if (resultRank == fir::SequenceType::getUnknownExtent())
     TODO(loc, "intrinsic: reshape requires computing rank of result");
@@ -5921,7 +5924,7 @@ void IntrinsicLibrary::genSignalSubroutine(
 mlir::Value IntrinsicLibrary::genSign(mlir::Type resultType,
                                       llvm::ArrayRef<mlir::Value> args) {
   assert(args.size() == 2);
-  if (resultType.isa<mlir::IntegerType>()) {
+  if (mlir::isa<mlir::IntegerType>(resultType)) {
     mlir::Value abs = genAbs(resultType, {args[0]});
     mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
     auto neg = builder.create<mlir::arith::SubIOp>(loc, zero, abs);
diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp
index d4012e9c3d9d..76b920dba869 100644
--- a/flang/lib/Optimizer/Builder/MutableBox.cpp
+++ b/flang/lib/Optimizer/Builder/MutableBox.cpp
@@ -28,7 +28,7 @@ createNewFirBox(fir::FirOpBuilder &builder, mlir::Location loc,
                 const fir::MutableBoxValue &box, mlir::Value addr,
                 mlir::ValueRange lbounds, mlir::ValueRange extents,
                 mlir::ValueRange lengths, mlir::Value tdesc = {}) {
-  if (addr.getType().isa<fir::BaseBoxType>())
+  if (mlir::isa<fir::BaseBoxType>(addr.getType()))
     // The entity is already boxed.
     return builder.createConvert(loc, box.getBoxTy(), addr);
 
@@ -53,20 +53,21 @@ createNewFirBox(fir::FirOpBuilder &builder, mlir::Location loc,
   // error in the embox).
   llvm::SmallVector<mlir::Value> cleanedLengths;
   auto cleanedAddr = addr;
-  if (auto charTy = box.getEleTy().dyn_cast<fir::CharacterType>()) {
+  if (auto charTy = mlir::dyn_cast<fir::CharacterType>(box.getEleTy())) {
     // Cast address to box type so that both input and output type have
     // unknown or constant lengths.
     auto bt = box.getBaseTy();
     auto addrTy = addr.getType();
-    auto type = addrTy.isa<fir::HeapType>()      ? fir::HeapType::get(bt)
-                : addrTy.isa<fir::PointerType>() ? fir::PointerType::get(bt)
-                                                 : builder.getRefType(bt);
+    auto type = mlir::isa<fir::HeapType>(addrTy) ? fir::HeapType::get(bt)
+                : mlir::isa<fir::PointerType>(addrTy)
+                    ? fir::PointerType::get(bt)
+                    : builder.getRefType(bt);
     cleanedAddr = builder.createConvert(loc, type, addr);
     if (charTy.getLen() == fir::CharacterType::unknownLen())
       cleanedLengths.append(lengths.begin(), lengths.end());
   } else if (fir::isUnlimitedPolymorphicType(box.getBoxTy())) {
-    if (auto charTy = fir::dyn_cast_ptrEleTy(addr.getType())
-                          .dyn_cast<fir::CharacterType>()) {
+    if (auto charTy = mlir::dyn_cast<fir::CharacterType>(
+            fir::dyn_cast_ptrEleTy(addr.getType()))) {
       if (charTy.getLen() == fir::CharacterType::unknownLen())
         cleanedLengths.append(lengths.begin(), lengths.end());
     }
@@ -328,18 +329,18 @@ private:
 mlir::Value fir::factory::createUnallocatedBox(
     fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type boxType,
     mlir::ValueRange nonDeferredParams, mlir::Value typeSourceBox) {
-  auto baseAddrType = boxType.dyn_cast<fir::BaseBoxType>().getEleTy();
+  auto baseAddrType = mlir::dyn_cast<fir::BaseBoxType>(boxType).getEleTy();
   if (!fir::isa_ref_type(baseAddrType))
     baseAddrType = builder.getRefType(baseAddrType);
   auto type = fir::unwrapRefType(baseAddrType);
   auto eleTy = fir::unwrapSequenceType(type);
-  if (auto recTy = eleTy.dyn_cast<fir::RecordType>())
+  if (auto recTy = mlir::dyn_cast<fir::RecordType>(eleTy))
     if (recTy.getNumLenParams() > 0)
       TODO(loc, "creating unallocated fir.box of derived type with length "
                 "parameters");
   auto nullAddr = builder.createNullConstant(loc, baseAddrType);
   mlir::Value shape;
-  if (auto seqTy = type.dyn_cast<fir::SequenceType>()) {
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(type)) {
     auto zero = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
     llvm::SmallVector<mlir::Value> extents(seqTy.getDimension(), zero);
     shape = builder.createShape(
@@ -348,7 +349,7 @@ mlir::Value fir::factory::createUnallocatedBox(
   // Provide dummy length parameters if they are dynamic. If a length parameter
   // is deferred. It is set to zero here and will be set on allocation.
   llvm::SmallVector<mlir::Value> lenParams;
-  if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+  if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
     if (charTy.getLen() == fir::CharacterType::unknownLen()) {
       if (!nonDeferredParams.empty()) {
         lenParams.push_back(nonDeferredParams[0]);
@@ -592,7 +593,7 @@ void fir::factory::associateMutableBoxWithRemap(
   auto cast = [&](mlir::Value addr) -> mlir::Value {
     // Cast base addr to new sequence type.
     auto ty = fir::dyn_cast_ptrEleTy(addr.getType());
-    if (auto seqTy = ty.dyn_cast<fir::SequenceType>()) {
+    if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty)) {
       fir::SequenceType::Shape shape(newRank,
                                      fir::SequenceType::getUnknownExtent());
       ty = fir::SequenceType::get(shape, seqTy.getEleTy());
@@ -673,10 +674,10 @@ void fir::factory::disassociateMutableBox(fir::FirOpBuilder &builder,
   if (box.isPolymorphic() && polymorphicSetType) {
     // 7.3.2.3 point 7. The dynamic type of a disassociated pointer is the
     // same as its declared type.
-    auto boxTy = box.getBoxTy().dyn_cast<fir::BaseBoxType>();
+    auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(box.getBoxTy());
     auto eleTy = fir::unwrapPassByRefType(boxTy.getEleTy());
     mlir::Type derivedType = fir::getDerivedType(eleTy);
-    if (auto recTy = derivedType.dyn_cast<fir::RecordType>()) {
+    if (auto recTy = mlir::dyn_cast<fir::RecordType>(derivedType)) {
       fir::runtime::genNullifyDerivedType(builder, loc, box.getAddr(), recTy,
                                           box.rank());
       return;
@@ -690,7 +691,7 @@ getNewLengths(fir::FirOpBuilder &builder, mlir::Location loc,
               const fir::MutableBoxValue &box, mlir::ValueRange lenParams) {
   llvm::SmallVector<mlir::Value> lengths;
   auto idxTy = builder.getIndexType();
-  if (auto charTy = box.getEleTy().dyn_cast<fir::CharacterType>()) {
+  if (auto charTy = mlir::dyn_cast<fir::CharacterType>(box.getEleTy())) {
     if (charTy.getLen() == fir::CharacterType::unknownLen()) {
       if (box.hasNonDeferredLenParams()) {
         lengths.emplace_back(
@@ -717,7 +718,7 @@ static mlir::Value allocateAndInitNewStorage(fir::FirOpBuilder &builder,
   auto lengths = getNewLengths(builder, loc, box, lenParams);
   auto newStorage = builder.create<fir::AllocMemOp>(
       loc, box.getBaseTy(), allocName, lengths, extents);
-  if (box.getEleTy().isa<fir::RecordType>()) {
+  if (mlir::isa<fir::RecordType>(box.getEleTy())) {
     // TODO: skip runtime initialization if this is not required. Currently,
     // there is no way to know here if a derived type needs it or not. But the
     // information is available at compile time and could be reflected here
@@ -742,7 +743,7 @@ void fir::factory::genInlinedAllocation(
                                               lengths, safeExtents);
   MutablePropertyWriter{builder, loc, box}.updateMutableBox(
       heap, lbounds, safeExtents, lengths);
-  if (box.getEleTy().isa<fir::RecordType>()) {
+  if (mlir::isa<fir::RecordType>(box.getEleTy())) {
     // TODO: skip runtime initialization if this is not required. Currently,
     // there is no way to know here if a derived type needs it or not. But the
     // information is available at compile time and could be reflected here
diff --git a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
index 160118e2c050..7f09e8822844 100644
--- a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
@@ -1119,7 +1119,7 @@ PPCIntrinsicLibrary::genVecAbs(mlir::Type resultType,
     funcOp = builder.createFunction(loc, fname, ftype);
     auto callOp{builder.create<fir::CallOp>(loc, funcOp, argBases[0])};
     return callOp.getResult(0);
-  } else if (auto eleTy = vTypeInfo.eleTy.dyn_cast<mlir::IntegerType>()) {
+  } else if (auto eleTy = mlir::dyn_cast<mlir::IntegerType>(vTypeInfo.eleTy)) {
     // vec_abs(arg1) = max(0 - arg1, arg1)
 
     auto newVecTy{mlir::VectorType::get(vTypeInfo.len, eleTy)};
@@ -1173,12 +1173,13 @@ fir::ExtendedValue PPCIntrinsicLibrary::genVecAddAndMulSubXor(
   assert(args.size() == 2);
   auto argBases{getBasesForArgs(args)};
   auto argsTy{getTypesForArgs(argBases)};
-  assert(argsTy[0].isa<fir::VectorType>() && argsTy[1].isa<fir::VectorType>());
+  assert(mlir::isa<fir::VectorType>(argsTy[0]) &&
+         mlir::isa<fir::VectorType>(argsTy[1]));
 
   auto vecTyInfo{getVecTypeFromFir(argBases[0])};
 
-  const auto isInteger{vecTyInfo.eleTy.isa<mlir::IntegerType>()};
-  const auto isFloat{vecTyInfo.eleTy.isa<mlir::FloatType>()};
+  const auto isInteger{mlir::isa<mlir::IntegerType>(vecTyInfo.eleTy)};
+  const auto isFloat{mlir::isa<mlir::FloatType>(vecTyInfo.eleTy)};
   assert((isInteger || isFloat) && "unknown vector type");
 
   auto vargs{convertVecArgs(builder, loc, vecTyInfo, argBases)};
@@ -1212,7 +1213,7 @@ fir::ExtendedValue PPCIntrinsicLibrary::genVecAddAndMulSubXor(
       arg2 = vargs[1];
     } else if (isFloat) {
       // bitcast the arguments to integer
-      auto wd{vecTyInfo.eleTy.dyn_cast<mlir::FloatType>().getWidth()};
+      auto wd{mlir::dyn_cast<mlir::FloatType>(vecTyInfo.eleTy).getWidth()};
       auto ftype{builder.getIntegerType(wd)};
       auto bcVecTy{mlir::VectorType::get(vecTyInfo.len, ftype)};
       arg1 = builder.create<mlir::vector::BitCastOp>(loc, bcVecTy, vargs[0]);
@@ -1450,7 +1451,7 @@ PPCIntrinsicLibrary::genVecCmp(mlir::Type resultType,
 
   mlir::Value res{nullptr};
 
-  if (auto eTy = vecTyInfo.eleTy.dyn_cast<mlir::IntegerType>()) {
+  if (auto eTy = mlir::dyn_cast<mlir::IntegerType>(vecTyInfo.eleTy)) {
     constexpr int firstArg{0};
     constexpr int secondArg{1};
     std::map<VecOp, std::array<int, 2>> argOrder{
@@ -1559,7 +1560,7 @@ PPCIntrinsicLibrary::genVecConvert(mlir::Type resultType,
   case VecOp::Ctf: {
     assert(args.size() == 2);
     auto convArg{builder.createConvert(loc, i32Ty, argBases[1])};
-    auto eTy{vecTyInfo.eleTy.dyn_cast<mlir::IntegerType>()};
+    auto eTy{mlir::dyn_cast<mlir::IntegerType>(vecTyInfo.eleTy)};
     assert(eTy && "Unsupported vector type");
     const auto isUnsigned{eTy.isUnsignedInteger()};
     const auto width{eTy.getWidth()};
@@ -1587,10 +1588,9 @@ PPCIntrinsicLibrary::genVecConvert(mlir::Type resultType,
                       : builder.create<mlir::LLVM::SIToFPOp>(loc, ty, vArg1)};
 
       // construct vector<1./(1<<arg1), 1.0/(1<<arg1)>
-      auto constInt{
+      auto constInt{mlir::dyn_cast_or_null<mlir::IntegerAttr>(
           mlir::dyn_cast<mlir::arith::ConstantOp>(argBases[1].getDefiningOp())
-              .getValue()
-              .dyn_cast_or_null<mlir::IntegerAttr>()};
+              .getValue())};
       assert(constInt && "expected integer constant argument");
       double f{1.0 / (1 << constInt.getInt())};
       llvm::SmallVector<double> vals{f, f};
@@ -1815,7 +1815,7 @@ static mlir::Value addOffsetToAddress(fir::FirOpBuilder &builder,
 static mlir::Value reverseVectorElements(fir::FirOpBuilder &builder,
                                          mlir::Location loc, mlir::Value v,
                                          int64_t len) {
-  assert(v.getType().isa<mlir::VectorType>());
+  assert(mlir::isa<mlir::VectorType>(v.getType()));
   assert(len > 0);
   llvm::SmallVector<int64_t, 16> mask;
   for (int64_t i = 0; i < len; ++i) {
@@ -2144,10 +2144,9 @@ PPCIntrinsicLibrary::genVecPerm(mlir::Type resultType,
   }
   case VecOp::Permi: {
     // arg3 is a constant
-    auto constIntOp{
+    auto constIntOp{mlir::dyn_cast_or_null<mlir::IntegerAttr>(
         mlir::dyn_cast<mlir::arith::ConstantOp>(argBases[2].getDefiningOp())
-            .getValue()
-            .dyn_cast_or_null<mlir::IntegerAttr>()};
+            .getValue())};
     assert(constIntOp && "expected integer constant argument");
     auto constInt{constIntOp.getInt()};
     // arg1, arg2, and result type share same VecTypeInfo
@@ -2321,10 +2320,9 @@ PPCIntrinsicLibrary::genVecShift(mlir::Type resultType,
     }
   } else if (vop == VecOp::Sld || vop == VecOp::Sldw) {
     assert(args.size() == 3);
-    auto constIntOp =
+    auto constIntOp = mlir::dyn_cast_or_null<mlir::IntegerAttr>(
         mlir::dyn_cast<mlir::arith::ConstantOp>(argBases[2].getDefiningOp())
-            .getValue()
-            .dyn_cast_or_null<mlir::IntegerAttr>();
+            .getValue());
     assert(constIntOp && "expected integer constant argument");
 
     // Bitcast to vector<16xi8>
@@ -2797,16 +2795,16 @@ void PPCIntrinsicLibrary::genMmaIntr(llvm::ArrayRef<fir::ExtendedValue> args) {
     auto vType{v.getType()};
     mlir::Type targetType{intrFuncType.getInput(j)};
     if (vType != targetType) {
-      if (targetType.isa<mlir::VectorType>()) {
+      if (mlir::isa<mlir::VectorType>(targetType)) {
         // Perform vector type conversion for arguments passed by value.
-        auto eleTy{vType.dyn_cast<fir::VectorType>().getEleTy()};
-        auto len{vType.dyn_cast<fir::VectorType>().getLen()};
+        auto eleTy{mlir::dyn_cast<fir::VectorType>(vType).getEleTy()};
+        auto len{mlir::dyn_cast<fir::VectorType>(vType).getLen()};
         mlir::VectorType mlirType = mlir::VectorType::get(len, eleTy);
         auto v0{builder.createConvert(loc, mlirType, v)};
         auto v1{builder.create<mlir::vector::BitCastOp>(loc, targetType, v0)};
         intrArgs.push_back(v1);
-      } else if (targetType.isa<mlir::IntegerType>() &&
-                 vType.isa<mlir::IntegerType>()) {
+      } else if (mlir::isa<mlir::IntegerType>(targetType) &&
+                 mlir::isa<mlir::IntegerType>(vType)) {
         auto v0{builder.createConvert(loc, targetType, v)};
         intrArgs.push_back(v0);
       } else {
@@ -2861,7 +2859,7 @@ void PPCIntrinsicLibrary::genVecStore(llvm::ArrayRef<fir::ExtendedValue> args) {
     if (arg1TyInfo.isFloat32()) {
       stTy = mlir::VectorType::get(len, i32ty);
       fname = "llvm.ppc.altivec.stvewx";
-    } else if (arg1TyInfo.eleTy.isa<mlir::IntegerType>()) {
+    } else if (mlir::isa<mlir::IntegerType>(arg1TyInfo.eleTy)) {
       stTy = mlir::VectorType::get(len, mlir::IntegerType::get(context, width));
 
       switch (width) {
diff --git a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
index abff0e150ab4..70a88ff18cb1 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
@@ -27,7 +27,7 @@ mlir::Value fir::runtime::genMoveAlloc(fir::FirOpBuilder &builder,
   if (fir::isPolymorphicType(from.getType()) &&
       !fir::isUnlimitedPolymorphicType(from.getType())) {
     fir::ClassType clTy =
-        fir::dyn_cast_ptrEleTy(from.getType()).dyn_cast<fir::ClassType>();
+        mlir::dyn_cast<fir::ClassType>(fir::dyn_cast_ptrEleTy(from.getType()));
     mlir::Type derivedType = fir::unwrapInnerType(clTy.getEleTy());
     declaredTypeDesc =
         builder.create<fir::TypeDescOp>(loc, mlir::TypeAttr::get(derivedType));
diff --git a/flang/lib/Optimizer/Builder/Runtime/Character.cpp b/flang/lib/Optimizer/Builder/Runtime/Character.cpp
index f3663439fdd5..b16819915d5a 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Character.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Character.cpp
@@ -39,15 +39,15 @@ static void genCharacterSearch(FN func, fir::FirOpBuilder &builder,
 
 /// Helper function to recover the KIND from the FIR type.
 static int discoverKind(mlir::Type ty) {
-  if (auto charTy = ty.dyn_cast<fir::CharacterType>())
+  if (auto charTy = mlir::dyn_cast<fir::CharacterType>(ty))
     return charTy.getFKind();
   if (auto eleTy = fir::dyn_cast_ptrEleTy(ty))
     return discoverKind(eleTy);
-  if (auto arrTy = ty.dyn_cast<fir::SequenceType>())
+  if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(ty))
     return discoverKind(arrTy.getEleTy());
-  if (auto boxTy = ty.dyn_cast<fir::BoxCharType>())
+  if (auto boxTy = mlir::dyn_cast<fir::BoxCharType>(ty))
     return discoverKind(boxTy.getEleTy());
-  if (auto boxTy = ty.dyn_cast<fir::BoxType>())
+  if (auto boxTy = mlir::dyn_cast<fir::BoxType>(ty))
     return discoverKind(boxTy.getEleTy());
   llvm_unreachable("unexpected character type");
 }
diff --git a/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp b/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp
index a11b9339681e..bf5fd6af0eaf 100755
--- a/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp
@@ -13,7 +13,7 @@
 #include "flang/Optimizer/Support/InternalNames.h"
 #include "llvm/ADT/ArrayRef.h"
 
-void fir::runtime::genEnvironmentDefaults(
+mlir::Value fir::runtime::genEnvironmentDefaults(
     fir::FirOpBuilder &builder, mlir::Location loc,
     const std::vector<Fortran::lower::EnvironmentDefault> &envDefaults) {
   std::string envDefaultListPtrName =
@@ -34,14 +34,8 @@ void fir::runtime::genEnvironmentDefaults(
 
   // If no defaults were specified, initialize with a null pointer.
   if (envDefaults.empty()) {
-    builder.createGlobalConstant(
-        loc, envDefaultListRefTy, envDefaultListPtrName,
-        [&](fir::FirOpBuilder &builder) {
-          mlir::Value nullVal =
-              builder.createNullConstant(loc, envDefaultListRefTy);
-          builder.create<fir::HasValueOp>(loc, nullVal);
-        });
-    return;
+    mlir::Value nullVal = builder.createNullConstant(loc, envDefaultListRefTy);
+    return nullVal;
   }
 
   // Create the Item list.
@@ -99,11 +93,7 @@ void fir::runtime::genEnvironmentDefaults(
       envDefaultListBuilder, linkOnce);
 
   // Define the pointer to the list used by the runtime.
-  builder.createGlobalConstant(
-      loc, envDefaultListRefTy, envDefaultListPtrName,
-      [&](fir::FirOpBuilder &builder) {
-        mlir::Value addr = builder.create<fir::AddrOfOp>(
-            loc, envDefaultList.resultType(), envDefaultList.getSymbol());
-        builder.create<fir::HasValueOp>(loc, addr);
-      });
+  mlir::Value addr = builder.create<fir::AddrOfOp>(
+      loc, envDefaultList.resultType(), envDefaultList.getSymbol());
+  return addr;
 }
diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
index 57c47da0f3f8..8b78a1688c73 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
@@ -228,7 +228,8 @@ void fir::runtime::genSystemClock(fir::FirOpBuilder &builder,
     fir::IfOp ifOp{};
     const bool isOptionalArg =
         fir::valueHasFirAttribute(arg, fir::getOptionalAttrName());
-    if (type.dyn_cast<fir::PointerType>() || type.dyn_cast<fir::HeapType>()) {
+    if (mlir::dyn_cast<fir::PointerType>(type) ||
+        mlir::dyn_cast<fir::HeapType>(type)) {
       // Check for a disassociated pointer or an unallocated allocatable.
       assert(!isOptionalArg && "invalid optional argument");
       ifOp = builder.create<fir::IfOp>(loc, builder.genIsNotNullAddr(loc, arg),
@@ -242,7 +243,8 @@ void fir::runtime::genSystemClock(fir::FirOpBuilder &builder,
       builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
     mlir::Type kindTy = func.getFunctionType().getInput(0);
     int integerKind = 8;
-    if (auto intType = fir::unwrapRefType(type).dyn_cast<mlir::IntegerType>())
+    if (auto intType =
+            mlir::dyn_cast<mlir::IntegerType>(fir::unwrapRefType(type)))
       integerKind = intType.getWidth() / 8;
     mlir::Value kind = builder.createIntegerConstant(loc, kindTy, integerKind);
     mlir::Value res =
diff --git a/flang/lib/Optimizer/Builder/Runtime/Main.cpp b/flang/lib/Optimizer/Builder/Runtime/Main.cpp
new file mode 100644
index 000000000000..ab3c4ca81314
--- /dev/null
+++ b/flang/lib/Optimizer/Builder/Runtime/Main.cpp
@@ -0,0 +1,69 @@
+//===-- Main.cpp - generate main runtime API calls --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/Runtime/Main.h"
+#include "flang/Lower/EnvironmentDefault.h"
+#include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h"
+#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Runtime/main.h"
+#include "flang/Runtime/stop.h"
+
+using namespace Fortran::runtime;
+
+/// Create a `int main(...)` that calls the Fortran entry point
+void fir::runtime::genMain(
+    fir::FirOpBuilder &builder, mlir::Location loc,
+    const std::vector<Fortran::lower::EnvironmentDefault> &defs) {
+  auto *context = builder.getContext();
+  auto argcTy = builder.getDefaultIntegerType();
+  auto ptrTy = mlir::LLVM::LLVMPointerType::get(context);
+
+  // void ProgramStart(int argc, char** argv, char** envp,
+  //                   _QQEnvironmentDefaults* env)
+  auto startFn = builder.createFunction(
+      loc, RTNAME_STRING(ProgramStart),
+      mlir::FunctionType::get(context, {argcTy, ptrTy, ptrTy, ptrTy}, {}));
+  // void ProgramStop()
+  auto stopFn =
+      builder.createFunction(loc, RTNAME_STRING(ProgramEndStatement),
+                             mlir::FunctionType::get(context, {}, {}));
+
+  // int main(int argc, char** argv, char** envp)
+  auto mainFn = builder.createFunction(
+      loc, "main",
+      mlir::FunctionType::get(context, {argcTy, ptrTy, ptrTy}, argcTy));
+  // void _QQmain()
+  auto qqMainFn = builder.createFunction(
+      loc, "_QQmain", mlir::FunctionType::get(context, {}, {}));
+
+  mainFn.setPublic();
+
+  auto *block = mainFn.addEntryBlock();
+  mlir::OpBuilder::InsertionGuard insertGuard(builder);
+  builder.setInsertionPointToStart(block);
+
+  // Create the list of any environment defaults for the runtime to set. The
+  // runtime default list is only created if there is a main program to ensure
+  // it only happens once and to provide consistent results if multiple files
+  // are compiled separately.
+  auto env = fir::runtime::genEnvironmentDefaults(builder, loc, defs);
+
+  llvm::SmallVector<mlir::Value, 4> args(block->getArguments());
+  args.push_back(env);
+
+  builder.create<fir::CallOp>(loc, startFn, args);
+  builder.create<fir::CallOp>(loc, qqMainFn);
+  builder.create<fir::CallOp>(loc, stopFn);
+
+  mlir::Value ret = builder.createIntegerConstant(loc, argcTy, 0);
+  builder.create<mlir::func::ReturnOp>(loc, ret);
+}
diff --git a/flang/lib/Optimizer/Builder/Runtime/Ragged.cpp b/flang/lib/Optimizer/Builder/Runtime/Ragged.cpp
index 4d33282a35d9..e5d0fb0fb27a 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Ragged.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Ragged.cpp
@@ -32,7 +32,8 @@ void fir::runtime::genRaggedArrayAllocate(mlir::Location loc,
   // Position of the bufferPointer in the header struct.
   auto one = builder.createIntegerConstant(loc, i32Ty, 1);
   auto eleTy = fir::unwrapSequenceType(fir::unwrapRefType(header.getType()));
-  auto ptrTy = builder.getRefType(eleTy.cast<mlir::TupleType>().getType(1));
+  auto ptrTy =
+      builder.getRefType(mlir::cast<mlir::TupleType>(eleTy).getType(1));
   auto ptr = builder.create<fir::CoordinateOp>(loc, ptrTy, header, one);
   auto heap = builder.create<fir::LoadOp>(loc, ptr);
   auto cmp = builder.genIsNullAddr(loc, heap);
diff --git a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
index 66fbaddcbda1..d4076067bf10 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
@@ -666,7 +666,7 @@ void fir::runtime::genMaxloc(fir::FirOpBuilder &builder, mlir::Location loc,
   mlir::func::FuncOp func;
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
   fir::factory::CharacterExprHelper charHelper{builder, loc};
   if (eleTy.isF32())
     func = fir::runtime::getRuntimeFunc<mkRTKey(MaxlocReal4)>(loc, builder);
@@ -713,7 +713,7 @@ mlir::Value fir::runtime::genMaxval(fir::FirOpBuilder &builder,
   mlir::func::FuncOp func;
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
   auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
 
   if (eleTy.isF32())
@@ -781,7 +781,7 @@ void fir::runtime::genMinloc(fir::FirOpBuilder &builder, mlir::Location loc,
   mlir::func::FuncOp func;
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
   fir::factory::CharacterExprHelper charHelper{builder, loc};
   if (eleTy.isF32())
     func = fir::runtime::getRuntimeFunc<mkRTKey(MinlocReal4)>(loc, builder);
@@ -853,7 +853,7 @@ mlir::Value fir::runtime::genMinval(fir::FirOpBuilder &builder,
   mlir::func::FuncOp func;
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
   auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
 
   if (eleTy.isF32())
@@ -895,7 +895,7 @@ void fir::runtime::genNorm2Dim(fir::FirOpBuilder &builder, mlir::Location loc,
   mlir::func::FuncOp func;
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
   if (eleTy.isF128())
     func = fir::runtime::getRuntimeFunc<ForcedNorm2DimReal16>(loc, builder);
   else
@@ -917,7 +917,7 @@ mlir::Value fir::runtime::genNorm2(fir::FirOpBuilder &builder,
   mlir::func::FuncOp func;
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
   auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
 
   if (eleTy.isF32())
@@ -968,7 +968,7 @@ mlir::Value fir::runtime::genProduct(fir::FirOpBuilder &builder,
   mlir::func::FuncOp func;
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
   auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
 
   if (eleTy.isF32())
@@ -1069,7 +1069,7 @@ mlir::Value fir::runtime::genDotProduct(fir::FirOpBuilder &builder,
   else if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(16)))
     func =
         fir::runtime::getRuntimeFunc<ForcedDotProductInteger16>(loc, builder);
-  else if (eleTy.isa<fir::LogicalType>())
+  else if (mlir::isa<fir::LogicalType>(eleTy))
     func =
         fir::runtime::getRuntimeFunc<mkRTKey(DotProductLogical)>(loc, builder);
   else
@@ -1111,7 +1111,7 @@ mlir::Value fir::runtime::genSum(fir::FirOpBuilder &builder, mlir::Location loc,
   mlir::func::FuncOp func;
   auto ty = arrayBox.getType();
   auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
-  auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
   auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
 
   if (eleTy.isF32())
@@ -1173,7 +1173,7 @@ mlir::Value fir::runtime::genSum(fir::FirOpBuilder &builder, mlir::Location loc,
     mlir::func::FuncOp func;                                                   \
     auto ty = arrayBox.getType();                                              \
     auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);                              \
-    auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();                   \
+    auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();              \
     auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);  \
                                                                                \
     if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(1)))            \
diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 48173033ecbe..5229d40f2250 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -51,9 +51,9 @@ public:
   /// not at all depending on the implementation target's characteristics and
   /// preference.
   bool needsConversion(mlir::Type ty) {
-    if (ty.isa<BoxProcType>())
+    if (mlir::isa<BoxProcType>(ty))
       return true;
-    if (auto funcTy = ty.dyn_cast<mlir::FunctionType>()) {
+    if (auto funcTy = mlir::dyn_cast<mlir::FunctionType>(ty)) {
       for (auto t : funcTy.getInputs())
         if (needsConversion(t))
           return true;
@@ -62,13 +62,13 @@ public:
           return true;
       return false;
     }
-    if (auto tupleTy = ty.dyn_cast<mlir::TupleType>()) {
+    if (auto tupleTy = mlir::dyn_cast<mlir::TupleType>(ty)) {
       for (auto t : tupleTy.getTypes())
         if (needsConversion(t))
           return true;
       return false;
     }
-    if (auto recTy = ty.dyn_cast<RecordType>()) {
+    if (auto recTy = mlir::dyn_cast<RecordType>(ty)) {
       auto visited = visitedTypes.find(ty);
       if (visited != visitedTypes.end())
         return visited->second;
@@ -97,11 +97,11 @@ public:
         visitedTypes.find(ty)->second = result;
       return result;
     }
-    if (auto boxTy = ty.dyn_cast<BaseBoxType>())
+    if (auto boxTy = mlir::dyn_cast<BaseBoxType>(ty))
       return needsConversion(boxTy.getEleTy());
     if (isa_ref_type(ty))
       return needsConversion(unwrapRefType(ty));
-    if (auto t = ty.dyn_cast<SequenceType>())
+    if (auto t = mlir::dyn_cast<SequenceType>(ty))
       return needsConversion(unwrapSequenceType(ty));
     return false;
   }
@@ -246,7 +246,7 @@ public:
           if (typeConverter.needsConversion(ty)) {
             rewriter.startOpModification(func);
             auto toTy =
-                typeConverter.convertType(ty).cast<mlir::FunctionType>();
+                mlir::cast<mlir::FunctionType>(typeConverter.convertType(ty));
             if (!func.empty())
               for (auto e : llvm::enumerate(toTy.getInputs())) {
                 unsigned i = e.index();
@@ -263,7 +263,7 @@ public:
           // Rewrite all `fir.emboxproc` ops to either `fir.convert` or a thunk
           // as required.
           mlir::Type toTy = typeConverter.convertType(
-              embox.getType().cast<BoxProcType>().getEleTy());
+              mlir::cast<BoxProcType>(embox.getType()).getEleTy());
           rewriter.setInsertionPoint(embox);
           if (embox.getHost()) {
             // Create the thunk.
diff --git a/flang/lib/Optimizer/CodeGen/CGOps.cpp b/flang/lib/Optimizer/CodeGen/CGOps.cpp
index c3bcdeaf86db..44d07d26dd2b 100644
--- a/flang/lib/Optimizer/CodeGen/CGOps.cpp
+++ b/flang/lib/Optimizer/CodeGen/CGOps.cpp
@@ -41,24 +41,24 @@ unsigned fir::cg::XEmboxOp::getOutRank() {
 }
 
 unsigned fir::cg::XReboxOp::getOutRank() {
-  if (auto seqTy =
-          fir::dyn_cast_ptrOrBoxEleTy(getType()).dyn_cast<fir::SequenceType>())
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(
+          fir::dyn_cast_ptrOrBoxEleTy(getType())))
     return seqTy.getDimension();
   return 0;
 }
 
 unsigned fir::cg::XReboxOp::getRank() {
-  if (auto seqTy = fir::dyn_cast_ptrOrBoxEleTy(getBox().getType())
-                       .dyn_cast<fir::SequenceType>())
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(
+          fir::dyn_cast_ptrOrBoxEleTy(getBox().getType())))
     return seqTy.getDimension();
   return 0;
 }
 
 unsigned fir::cg::XArrayCoorOp::getRank() {
   auto memrefTy = getMemref().getType();
-  if (memrefTy.isa<fir::BaseBoxType>())
-    if (auto seqty =
-            fir::dyn_cast_ptrOrBoxEleTy(memrefTy).dyn_cast<fir::SequenceType>())
+  if (mlir::isa<fir::BaseBoxType>(memrefTy))
+    if (auto seqty = mlir::dyn_cast<fir::SequenceType>(
+            fir::dyn_cast_ptrOrBoxEleTy(memrefTy)))
       return seqty.getDimension();
   return getShape().size();
 }
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 921eac2f8f4b..b4705aa47992 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -101,7 +101,7 @@ static int64_t getConstantIntValue(mlir::Value val) {
 }
 
 static unsigned getTypeDescFieldId(mlir::Type ty) {
-  auto isArray = fir::dyn_cast_ptrOrBoxEleTy(ty).isa<fir::SequenceType>();
+  auto isArray = mlir::isa<fir::SequenceType>(fir::dyn_cast_ptrOrBoxEleTy(ty));
   return isArray ? kOptTypePtrPosInBox : kDimsPosInBox;
 }
 static unsigned getLenParamFieldId(mlir::Type ty) {
@@ -147,7 +147,7 @@ genAllocationScaleSize(OP op, mlir::Type ity,
                        mlir::ConversionPatternRewriter &rewriter) {
   mlir::Location loc = op.getLoc();
   mlir::Type dataTy = op.getInType();
-  auto seqTy = dataTy.dyn_cast<fir::SequenceType>();
+  auto seqTy = mlir::dyn_cast<fir::SequenceType>(dataTy);
   fir::SequenceType::Extent constSize = 1;
   if (seqTy) {
     int constRows = seqTy.getConstantRows();
@@ -191,13 +191,13 @@ struct AllocaOpConversion : public fir::FIROpConversion<fir::AllocaOp> {
       for (; i < end; ++i)
         lenParams.push_back(operands[i]);
       mlir::Type scalarType = fir::unwrapSequenceType(alloc.getInType());
-      if (auto chrTy = scalarType.dyn_cast<fir::CharacterType>()) {
+      if (auto chrTy = mlir::dyn_cast<fir::CharacterType>(scalarType)) {
         fir::CharacterType rawCharTy = fir::CharacterType::getUnknownLen(
             chrTy.getContext(), chrTy.getFKind());
         llvmObjectType = convertType(rawCharTy);
         assert(end == 1);
         size = integerCast(loc, rewriter, ity, lenParams[0]);
-      } else if (auto recTy = scalarType.dyn_cast<fir::RecordType>()) {
+      } else if (auto recTy = mlir::dyn_cast<fir::RecordType>(scalarType)) {
         mlir::LLVM::LLVMFuncOp memSizeFn =
             getDependentTypeMemSizeFn(recTy, alloc, rewriter);
         if (!memSizeFn)
@@ -265,7 +265,8 @@ struct BoxAddrOpConversion : public fir::FIROpConversion<fir::BoxAddrOp> {
                   mlir::ConversionPatternRewriter &rewriter) const override {
     mlir::Value a = adaptor.getOperands()[0];
     auto loc = boxaddr.getLoc();
-    if (auto argty = boxaddr.getVal().getType().dyn_cast<fir::BaseBoxType>()) {
+    if (auto argty =
+            mlir::dyn_cast<fir::BaseBoxType>(boxaddr.getVal().getType())) {
       TypePair boxTyPair = getBoxTypePair(argty);
       rewriter.replaceOp(boxaddr,
                          getBaseAddrFromBox(loc, boxTyPair, a, rewriter));
@@ -476,24 +477,25 @@ struct StringLitOpConversion : public fir::FIROpConversion<fir::StringLitOp> {
                   mlir::ConversionPatternRewriter &rewriter) const override {
     auto ty = convertType(constop.getType());
     auto attr = constop.getValue();
-    if (attr.isa<mlir::StringAttr>()) {
+    if (mlir::isa<mlir::StringAttr>(attr)) {
       rewriter.replaceOpWithNewOp<mlir::LLVM::ConstantOp>(constop, ty, attr);
       return mlir::success();
     }
 
-    auto charTy = constop.getType().cast<fir::CharacterType>();
+    auto charTy = mlir::cast<fir::CharacterType>(constop.getType());
     unsigned bits = lowerTy().characterBitsize(charTy);
     mlir::Type intTy = rewriter.getIntegerType(bits);
     mlir::Location loc = constop.getLoc();
     mlir::Value cst = rewriter.create<mlir::LLVM::UndefOp>(loc, ty);
-    if (auto arr = attr.dyn_cast<mlir::DenseElementsAttr>()) {
+    if (auto arr = mlir::dyn_cast<mlir::DenseElementsAttr>(attr)) {
       cst = rewriter.create<mlir::LLVM::ConstantOp>(loc, ty, arr);
-    } else if (auto arr = attr.dyn_cast<mlir::ArrayAttr>()) {
+    } else if (auto arr = mlir::dyn_cast<mlir::ArrayAttr>(attr)) {
       for (auto a : llvm::enumerate(arr.getValue())) {
         // convert each character to a precise bitsize
         auto elemAttr = mlir::IntegerAttr::get(
             intTy,
-            a.value().cast<mlir::IntegerAttr>().getValue().zextOrTrunc(bits));
+            mlir::cast<mlir::IntegerAttr>(a.value()).getValue().zextOrTrunc(
+                bits));
         auto elemCst =
             rewriter.create<mlir::LLVM::ConstantOp>(loc, intTy, elemAttr);
         cst = rewriter.create<mlir::LLVM::InsertValueOp>(loc, cst, elemCst,
@@ -528,9 +530,9 @@ struct CallOpConversion : public fir::FIROpConversion<fir::CallOp> {
 } // namespace
 
 static mlir::Type getComplexEleTy(mlir::Type complex) {
-  if (auto cc = complex.dyn_cast<mlir::ComplexType>())
+  if (auto cc = mlir::dyn_cast<mlir::ComplexType>(complex))
     return cc.getElementType();
-  return complex.cast<fir::ComplexType>().getElementType();
+  return mlir::cast<fir::ComplexType>(complex).getElementType();
 }
 
 namespace {
@@ -599,7 +601,7 @@ struct ConstcOpConversion : public fir::FIROpConversion<fir::ConstcOp> {
   }
 
   inline llvm::APFloat getValue(mlir::Attribute attr) const {
-    return attr.cast<fir::RealAttr>().getValue();
+    return mlir::cast<fir::RealAttr>(attr).getValue();
   }
 };
 
@@ -608,7 +610,7 @@ struct ConvertOpConversion : public fir::FIROpConversion<fir::ConvertOp> {
   using FIROpConversion::FIROpConversion;
 
   static bool isFloatingPointTy(mlir::Type ty) {
-    return ty.isa<mlir::FloatType>();
+    return mlir::isa<mlir::FloatType>(ty);
   }
 
   mlir::LogicalResult
@@ -628,7 +630,8 @@ struct ConvertOpConversion : public fir::FIROpConversion<fir::ConvertOp> {
     auto loc = convert.getLoc();
     auto i1Type = mlir::IntegerType::get(convert.getContext(), 1);
 
-    if (fromFirTy.isa<fir::LogicalType>() || toFirTy.isa<fir::LogicalType>()) {
+    if (mlir::isa<fir::LogicalType>(fromFirTy) ||
+        mlir::isa<fir::LogicalType>(toFirTy)) {
       // By specification fir::LogicalType value may be any number,
       // where non-zero value represents .true. and zero value represents
       // .false.
@@ -641,7 +644,8 @@ struct ConvertOpConversion : public fir::FIROpConversion<fir::ConvertOp> {
       // Conversion from narrow logical to wide logical may be implemented
       // as a zero or sign extension of the input, but it may use value
       // normalization as well.
-      if (!fromTy.isa<mlir::IntegerType>() || !toTy.isa<mlir::IntegerType>())
+      if (!mlir::isa<mlir::IntegerType>(fromTy) ||
+          !mlir::isa<mlir::IntegerType>(toTy))
         return mlir::emitError(loc)
                << "unsupported types for logical conversion: " << fromTy
                << " -> " << toTy;
@@ -722,13 +726,13 @@ struct ConvertOpConversion : public fir::FIROpConversion<fir::ConvertOp> {
         rewriter.replaceOp(convert, v);
         return mlir::success();
       }
-      if (toTy.isa<mlir::IntegerType>()) {
+      if (mlir::isa<mlir::IntegerType>(toTy)) {
         rewriter.replaceOpWithNewOp<mlir::LLVM::FPToSIOp>(convert, toTy, op0);
         return mlir::success();
       }
-    } else if (fromTy.isa<mlir::IntegerType>()) {
+    } else if (mlir::isa<mlir::IntegerType>(fromTy)) {
       // Integer to integer conversion.
-      if (toTy.isa<mlir::IntegerType>()) {
+      if (mlir::isa<mlir::IntegerType>(toTy)) {
         auto fromBits = mlir::LLVM::getPrimitiveTypeSizeInBits(fromTy);
         auto toBits = mlir::LLVM::getPrimitiveTypeSizeInBits(toTy);
         assert(fromBits != toBits);
@@ -749,18 +753,18 @@ struct ConvertOpConversion : public fir::FIROpConversion<fir::ConvertOp> {
         return mlir::success();
       }
       // Integer to pointer conversion.
-      if (toTy.isa<mlir::LLVM::LLVMPointerType>()) {
+      if (mlir::isa<mlir::LLVM::LLVMPointerType>(toTy)) {
         rewriter.replaceOpWithNewOp<mlir::LLVM::IntToPtrOp>(convert, toTy, op0);
         return mlir::success();
       }
-    } else if (fromTy.isa<mlir::LLVM::LLVMPointerType>()) {
+    } else if (mlir::isa<mlir::LLVM::LLVMPointerType>(fromTy)) {
       // Pointer to integer conversion.
-      if (toTy.isa<mlir::IntegerType>()) {
+      if (mlir::isa<mlir::IntegerType>(toTy)) {
         rewriter.replaceOpWithNewOp<mlir::LLVM::PtrToIntOp>(convert, toTy, op0);
         return mlir::success();
       }
       // Pointer to pointer conversion.
-      if (toTy.isa<mlir::LLVM::LLVMPointerType>()) {
+      if (mlir::isa<mlir::LLVM::LLVMPointerType>(toTy)) {
         rewriter.replaceOpWithNewOp<mlir::LLVM::BitcastOp>(convert, toTy, op0);
         return mlir::success();
       }
@@ -842,11 +846,11 @@ struct EmboxCharOpConversion : public fir::FIROpConversion<fir::EmboxCharOp> {
     auto llvmStruct = rewriter.create<mlir::LLVM::UndefOp>(loc, llvmStructTy);
 
     mlir::Type lenTy =
-        llvmStructTy.cast<mlir::LLVM::LLVMStructType>().getBody()[1];
+        mlir::cast<mlir::LLVM::LLVMStructType>(llvmStructTy).getBody()[1];
     mlir::Value lenAfterCast = integerCast(loc, rewriter, lenTy, charBufferLen);
 
     mlir::Type addrTy =
-        llvmStructTy.cast<mlir::LLVM::LLVMStructType>().getBody()[0];
+        mlir::cast<mlir::LLVM::LLVMStructType>(llvmStructTy).getBody()[0];
     if (addrTy != charBuffer.getType())
       charBuffer =
           rewriter.create<mlir::LLVM::BitcastOp>(loc, addrTy, charBuffer);
@@ -979,9 +983,10 @@ static mlir::SymbolRefAttr getFree(fir::FreeMemOp op,
 
 static unsigned getDimension(mlir::LLVM::LLVMArrayType ty) {
   unsigned result = 1;
-  for (auto eleTy = ty.getElementType().dyn_cast<mlir::LLVM::LLVMArrayType>();
-       eleTy;
-       eleTy = eleTy.getElementType().dyn_cast<mlir::LLVM::LLVMArrayType>())
+  for (auto eleTy =
+           mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(ty.getElementType());
+       eleTy; eleTy = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(
+                  eleTy.getElementType()))
     ++result;
   return result;
 }
@@ -1052,9 +1057,9 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
 
   static int getCFIAttr(fir::BaseBoxType boxTy) {
     auto eleTy = boxTy.getEleTy();
-    if (eleTy.isa<fir::PointerType>())
+    if (mlir::isa<fir::PointerType>(eleTy))
       return CFI_attribute_pointer;
-    if (eleTy.isa<fir::HeapType>())
+    if (mlir::isa<fir::HeapType>(eleTy))
       return CFI_attribute_allocatable;
     return CFI_attribute_other;
   }
@@ -1082,27 +1087,29 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
     auto i64Ty = mlir::IntegerType::get(rewriter.getContext(), 64);
     if (auto eleTy = fir::dyn_cast_ptrEleTy(boxEleTy))
       boxEleTy = eleTy;
-    if (auto seqTy = boxEleTy.dyn_cast<fir::SequenceType>())
+    if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxEleTy))
       return getSizeAndTypeCode(loc, rewriter, seqTy.getEleTy(), lenParams);
-    if (boxEleTy.isa<mlir::NoneType>()) // unlimited polymorphic or assumed type
+    if (mlir::isa<mlir::NoneType>(
+            boxEleTy)) // unlimited polymorphic or assumed type
       return {rewriter.create<mlir::LLVM::ConstantOp>(loc, i64Ty, 0),
               this->genConstantOffset(loc, rewriter, CFI_type_other)};
     mlir::Value typeCodeVal = this->genConstantOffset(
         loc, rewriter,
         fir::getTypeCode(boxEleTy, this->lowerTy().getKindMap()));
-    if (fir::isa_integer(boxEleTy) || boxEleTy.dyn_cast<fir::LogicalType>() ||
-        fir::isa_real(boxEleTy) || fir::isa_complex(boxEleTy))
+    if (fir::isa_integer(boxEleTy) ||
+        mlir::dyn_cast<fir::LogicalType>(boxEleTy) || fir::isa_real(boxEleTy) ||
+        fir::isa_complex(boxEleTy))
       return {genTypeStrideInBytes(loc, i64Ty, rewriter,
                                    this->convertType(boxEleTy)),
               typeCodeVal};
-    if (auto charTy = boxEleTy.dyn_cast<fir::CharacterType>())
+    if (auto charTy = mlir::dyn_cast<fir::CharacterType>(boxEleTy))
       return {getCharacterByteSize(loc, rewriter, charTy, lenParams),
               typeCodeVal};
     if (fir::isa_ref_type(boxEleTy)) {
       auto ptrTy = ::getLlvmPtrType(rewriter.getContext());
       return {genTypeStrideInBytes(loc, i64Ty, rewriter, ptrTy), typeCodeVal};
     }
-    if (boxEleTy.isa<fir::RecordType>())
+    if (mlir::isa<fir::RecordType>(boxEleTy))
       return {genTypeStrideInBytes(loc, i64Ty, rewriter,
                                    this->convertType(boxEleTy)),
               typeCodeVal};
@@ -1211,8 +1218,8 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
       if (!typeDesc) {
         if (useInputType) {
           mlir::Type innerType = fir::unwrapInnerType(inputType);
-          if (innerType && innerType.template isa<fir::RecordType>()) {
-            auto recTy = innerType.template dyn_cast<fir::RecordType>();
+          if (innerType && mlir::isa<fir::RecordType>(innerType)) {
+            auto recTy = mlir::dyn_cast<fir::RecordType>(innerType);
             typeDesc = getTypeDescriptor(mod, rewriter, loc, recTy);
           } else {
             // Unlimited polymorphic type descriptor with no record type. Set
@@ -1250,7 +1257,7 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
                        mlir::ValueRange lenParams, mlir::Value sourceBox = {},
                        mlir::Type sourceBoxType = {}) const {
     auto loc = box.getLoc();
-    auto boxTy = box.getType().template dyn_cast<fir::BaseBoxType>();
+    auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(box.getType());
     bool useInputType = fir::isPolymorphicType(boxTy) &&
                         !fir::isUnlimitedPolymorphicType(inputType);
     llvm::SmallVector<mlir::Value> typeparams = lenParams;
@@ -1293,8 +1300,8 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
                        mlir::ValueRange lenParams,
                        mlir::Value typeDesc = {}) const {
     auto loc = box.getLoc();
-    auto boxTy = box.getType().dyn_cast<fir::BaseBoxType>();
-    auto inputBoxTy = box.getBox().getType().dyn_cast<fir::BaseBoxType>();
+    auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(box.getType());
+    auto inputBoxTy = mlir::dyn_cast<fir::BaseBoxType>(box.getBox().getType());
     auto inputBoxTyPair = this->getBoxTypePair(inputBoxTy);
     llvm::SmallVector<mlir::Value> typeparams = lenParams;
     if (!box.getSubstr().empty() && fir::hasDynamicSize(boxTy.getEleTy()))
@@ -1343,7 +1350,7 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
     mlir::Type resultTy = llvmBaseObjectType;
     // Fortran is column major, llvm GEP is row major: reverse the indices here.
     for (mlir::Value interiorIndex : llvm::reverse(cstInteriorIndices)) {
-      auto arrayTy = resultTy.dyn_cast<mlir::LLVM::LLVMArrayType>();
+      auto arrayTy = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(resultTy);
       if (!arrayTy)
         fir::emitFatalError(
             loc,
@@ -1355,7 +1362,7 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
         convertSubcomponentIndices(loc, resultTy, componentIndices, &resultTy);
     gepArgs.append(gepIndices.begin(), gepIndices.end());
     if (substringOffset) {
-      if (auto arrayTy = resultTy.dyn_cast<mlir::LLVM::LLVMArrayType>()) {
+      if (auto arrayTy = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(resultTy)) {
         gepArgs.push_back(*substringOffset);
         resultTy = arrayTy.getElementType();
       } else {
@@ -1504,18 +1511,18 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
     unsigned constRows = 0;
     mlir::Value ptrOffset = zero;
     mlir::Type memEleTy = fir::dyn_cast_ptrEleTy(xbox.getMemref().getType());
-    assert(memEleTy.isa<fir::SequenceType>());
-    auto seqTy = memEleTy.cast<fir::SequenceType>();
+    assert(mlir::isa<fir::SequenceType>(memEleTy));
+    auto seqTy = mlir::cast<fir::SequenceType>(memEleTy);
     mlir::Type seqEleTy = seqTy.getEleTy();
     // Adjust the element scaling factor if the element is a dependent type.
     if (fir::hasDynamicSize(seqEleTy)) {
-      if (auto charTy = seqEleTy.dyn_cast<fir::CharacterType>()) {
+      if (auto charTy = mlir::dyn_cast<fir::CharacterType>(seqEleTy)) {
         // The GEP pointer type decays to llvm.ptr<i[width]>.
         // The scaling factor is the runtime value of the length.
         assert(!adaptor.getLenParams().empty());
         prevPtrOff = FIROpConversion::integerCast(
             loc, rewriter, i64Ty, adaptor.getLenParams().back());
-      } else if (seqEleTy.isa<fir::RecordType>()) {
+      } else if (mlir::isa<fir::RecordType>(seqEleTy)) {
         // prevPtrOff = ;
         TODO(loc, "generate call to calculate size of PDT");
       } else {
@@ -1540,7 +1547,7 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
     } else if (hasSubstr) {
       // We have a substring. The step value needs to be the number of bytes
       // per CHARACTER element.
-      auto charTy = seqEleTy.cast<fir::CharacterType>();
+      auto charTy = mlir::cast<fir::CharacterType>(seqEleTy);
       if (fir::hasDynamicSize(charTy)) {
         prevDimByteStride =
             getCharacterByteSize(loc, rewriter, charTy, adaptor.getLenParams());
@@ -1589,7 +1596,7 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
         // Lower bound is normalized to 0 for BIND(C) interoperability.
         mlir::Value lb = zero;
         const bool isaPointerOrAllocatable =
-            eleTy.isa<fir::PointerType>() || eleTy.isa<fir::HeapType>();
+            mlir::isa<fir::PointerType, fir::HeapType>(eleTy);
         // Lower bound is defaults to 1 for POINTER, ALLOCATABLE, and
         // denormalized descriptors.
         if (isaPointerOrAllocatable || !normalizedLowerBound(xbox))
@@ -1695,7 +1702,7 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
     // Create new descriptor and fill its non-shape related data.
     llvm::SmallVector<mlir::Value, 2> lenParams;
     mlir::Type inputEleTy = getInputEleTy(rebox);
-    if (auto charTy = inputEleTy.dyn_cast<fir::CharacterType>()) {
+    if (auto charTy = mlir::dyn_cast<fir::CharacterType>(inputEleTy)) {
       if (charTy.hasConstantLen()) {
         mlir::Value len =
             genConstantIndex(loc, idxTy, rewriter, charTy.getLen());
@@ -1712,15 +1719,15 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
         }
         lenParams.emplace_back(len);
       }
-    } else if (auto recTy = inputEleTy.dyn_cast<fir::RecordType>()) {
+    } else if (auto recTy = mlir::dyn_cast<fir::RecordType>(inputEleTy)) {
       if (recTy.getNumLenParams() != 0)
         TODO(loc, "reboxing descriptor of derived type with length parameters");
     }
 
     // Rebox on polymorphic entities needs to carry over the dynamic type.
     mlir::Value typeDescAddr;
-    if (inputBoxTyPair.fir.isa<fir::ClassType>() &&
-        rebox.getType().isa<fir::ClassType>())
+    if (mlir::isa<fir::ClassType>(inputBoxTyPair.fir) &&
+        mlir::isa<fir::ClassType>(rebox.getType()))
       typeDescAddr =
           loadTypeDescAddress(loc, inputBoxTyPair, loweredBox, rewriter);
 
@@ -1908,7 +1915,7 @@ private:
   /// Return scalar element type of the input box.
   static mlir::Type getInputEleTy(fir::cg::XReboxOp rebox) {
     auto ty = fir::dyn_cast_ptrOrBoxEleTy(rebox.getBox().getType());
-    if (auto seqTy = ty.dyn_cast<fir::SequenceType>())
+    if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
       return seqTy.getEleTy();
     return ty;
   }
@@ -1936,7 +1943,7 @@ struct ValueOpCommon {
     assert(ty && "type is null");
     const auto end = indices.size();
     for (std::remove_const_t<decltype(end)> i = 0; i < end; ++i) {
-      if (auto seq = ty.dyn_cast<mlir::LLVM::LLVMArrayType>()) {
+      if (auto seq = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(ty)) {
         const auto dim = getDimension(seq);
         if (dim > 1) {
           auto ub = std::min(i + dim, end);
@@ -1944,7 +1951,7 @@ struct ValueOpCommon {
           i += dim - 1;
         }
         ty = getArrayElementType(seq);
-      } else if (auto st = ty.dyn_cast<mlir::LLVM::LLVMStructType>()) {
+      } else if (auto st = mlir::dyn_cast<mlir::LLVM::LLVMStructType>(ty)) {
         ty = st.getBody()[indices[i]];
       } else {
         llvm_unreachable("index into invalid type");
@@ -1957,13 +1964,13 @@ struct ValueOpCommon {
                  mlir::ArrayAttr arrAttr) {
     llvm::SmallVector<int64_t> indices;
     for (auto i = arrAttr.begin(), e = arrAttr.end(); i != e; ++i) {
-      if (auto intAttr = i->dyn_cast<mlir::IntegerAttr>()) {
+      if (auto intAttr = mlir::dyn_cast<mlir::IntegerAttr>(*i)) {
         indices.push_back(intAttr.getInt());
       } else {
-        auto fieldName = i->cast<mlir::StringAttr>().getValue();
+        auto fieldName = mlir::cast<mlir::StringAttr>(*i).getValue();
         ++i;
-        auto ty = i->cast<mlir::TypeAttr>().getValue();
-        auto index = ty.cast<fir::RecordType>().getFieldIndex(fieldName);
+        auto ty = mlir::cast<mlir::TypeAttr>(*i).getValue();
+        auto index = mlir::cast<fir::RecordType>(ty).getFieldIndex(fieldName);
         indices.push_back(index);
       }
     }
@@ -1973,7 +1980,7 @@ struct ValueOpCommon {
 private:
   static mlir::Type getArrayElementType(mlir::LLVM::LLVMArrayType ty) {
     auto eleTy = ty.getElementType();
-    while (auto arrTy = eleTy.dyn_cast<mlir::LLVM::LLVMArrayType>())
+    while (auto arrTy = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(eleTy))
       eleTy = arrTy.getElementType();
     return eleTy;
   }
@@ -2041,7 +2048,7 @@ struct InsertOnRangeOpConversion
     auto type = adaptor.getOperands()[0].getType();
 
     // Iteratively extract the array dimensions from the type.
-    while (auto t = type.dyn_cast<mlir::LLVM::LLVMArrayType>()) {
+    while (auto t = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(type)) {
       dims.push_back(t.getNumElements());
       type = t.getElementType();
     }
@@ -2107,7 +2114,8 @@ struct XArrayCoorOpConversion
     mlir::Value offset = genConstantIndex(loc, idxTy, rewriter, 0);
     const bool isShifted = !coor.getShift().empty();
     const bool isSliced = !coor.getSlice().empty();
-    const bool baseIsBoxed = coor.getMemref().getType().isa<fir::BaseBoxType>();
+    const bool baseIsBoxed =
+        mlir::isa<fir::BaseBoxType>(coor.getMemref().getType());
     TypePair baseBoxTyPair =
         baseIsBoxed ? getBoxTypePair(coor.getMemref().getType()) : TypePair{};
     mlir::LLVM::IntegerOverflowFlags nsw =
@@ -2185,7 +2193,8 @@ struct XArrayCoorOpConversion
       // components.
       mlir::Type elementType =
           getLlvmObjectTypeFromBoxType(coor.getMemref().getType());
-      while (auto arrayTy = elementType.dyn_cast<mlir::LLVM::LLVMArrayType>())
+      while (auto arrayTy =
+                 mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(elementType))
         elementType = arrayTy.getElementType();
       args.clear();
       args.push_back(0);
@@ -2275,11 +2284,12 @@ struct CoordinateOpConversion
     }
 
     // Boxed type - get the base pointer from the box
-    if (baseObjectTy.dyn_cast<fir::BaseBoxType>())
+    if (mlir::dyn_cast<fir::BaseBoxType>(baseObjectTy))
       return doRewriteBox(coor, operands, loc, rewriter);
 
     // Reference, pointer or a heap type
-    if (baseObjectTy.isa<fir::ReferenceType, fir::PointerType, fir::HeapType>())
+    if (mlir::isa<fir::ReferenceType, fir::PointerType, fir::HeapType>(
+            baseObjectTy))
       return doRewriteRefOrPtr(coor, llvmObjectTy, operands, loc, rewriter);
 
     return rewriter.notifyMatchFailure(
@@ -2295,7 +2305,7 @@ struct CoordinateOpConversion
   }
 
   static bool hasSubDimensions(mlir::Type type) {
-    return type.isa<fir::SequenceType, fir::RecordType, mlir::TupleType>();
+    return mlir::isa<fir::SequenceType, fir::RecordType, mlir::TupleType>(type);
   }
 
   /// Check whether this form of `!fir.coordinate_of` is supported. These
@@ -2310,14 +2320,14 @@ struct CoordinateOpConversion
     bool ptrEle = false;
     for (; i < numOfCoors; ++i) {
       mlir::Value nxtOpnd = coors[i];
-      if (auto arrTy = type.dyn_cast<fir::SequenceType>()) {
+      if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(type)) {
         subEle = true;
         i += arrTy.getDimension() - 1;
         type = arrTy.getEleTy();
-      } else if (auto recTy = type.dyn_cast<fir::RecordType>()) {
+      } else if (auto recTy = mlir::dyn_cast<fir::RecordType>(type)) {
         subEle = true;
         type = recTy.getType(getFieldNumber(recTy, nxtOpnd));
-      } else if (auto tupTy = type.dyn_cast<mlir::TupleType>()) {
+      } else if (auto tupTy = mlir::dyn_cast<mlir::TupleType>(type)) {
         subEle = true;
         type = tupTy.getType(getConstantIntValue(nxtOpnd));
       } else {
@@ -2335,14 +2345,14 @@ struct CoordinateOpConversion
   static bool arraysHaveKnownShape(mlir::Type type, mlir::ValueRange coors) {
     for (std::size_t i = 0, sz = coors.size(); i < sz; ++i) {
       mlir::Value nxtOpnd = coors[i];
-      if (auto arrTy = type.dyn_cast<fir::SequenceType>()) {
+      if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(type)) {
         if (fir::sequenceWithNonConstantShape(arrTy))
           return false;
         i += arrTy.getDimension() - 1;
         type = arrTy.getEleTy();
-      } else if (auto strTy = type.dyn_cast<fir::RecordType>()) {
+      } else if (auto strTy = mlir::dyn_cast<fir::RecordType>(type)) {
         type = strTy.getType(getFieldNumber(strTy, nxtOpnd));
-      } else if (auto strTy = type.dyn_cast<mlir::TupleType>()) {
+      } else if (auto strTy = mlir::dyn_cast<mlir::TupleType>(type)) {
         type = strTy.getType(getConstantIntValue(nxtOpnd));
       } else {
         return true;
@@ -2357,7 +2367,8 @@ private:
                mlir::Location loc,
                mlir::ConversionPatternRewriter &rewriter) const {
     mlir::Type boxObjTy = coor.getBaseType();
-    assert(boxObjTy.dyn_cast<fir::BaseBoxType>() && "This is not a `fir.box`");
+    assert(mlir::dyn_cast<fir::BaseBoxType>(boxObjTy) &&
+           "This is not a `fir.box`");
     TypePair boxTyPair = getBoxTypePair(boxObjTy);
 
     mlir::Value boxBaseAddr = operands[0];
@@ -2399,7 +2410,7 @@ private:
         mlir::LLVM::IntegerOverflowFlags::nsw;
 
     for (unsigned i = 1, last = operands.size(); i < last; ++i) {
-      if (auto arrTy = cpnTy.dyn_cast<fir::SequenceType>()) {
+      if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(cpnTy)) {
         if (i != 1)
           TODO(loc, "fir.array nested inside other array and/or derived type");
         // Applies byte strides from the box. Ignore lower bound from box
@@ -2421,7 +2432,7 @@ private:
             llvm::ArrayRef<mlir::LLVM::GEPArg>{off});
         i += arrTy.getDimension() - 1;
         cpnTy = arrTy.getEleTy();
-      } else if (auto recTy = cpnTy.dyn_cast<fir::RecordType>()) {
+      } else if (auto recTy = mlir::dyn_cast<fir::RecordType>(cpnTy)) {
         mlir::Value nxtOpnd = operands[i];
         cpnTy = recTy.getType(getFieldNumber(recTy, nxtOpnd));
         auto llvmRecTy = lowerTy().convertType(recTy);
@@ -2456,7 +2467,7 @@ private:
 
     // If only the column is `?`, then we can simply place the column value in
     // the 0-th GEP position.
-    if (auto arrTy = cpnTy.dyn_cast<fir::SequenceType>()) {
+    if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(cpnTy)) {
       if (!hasKnownShape) {
         const unsigned sz = arrTy.getDimension();
         if (arraysHaveKnownShape(arrTy.getEleTy(),
@@ -2500,29 +2511,29 @@ private:
             dims = dimsLeft - 1;
             continue;
           }
-          cpnTy = cpnTy.cast<fir::SequenceType>().getEleTy();
+          cpnTy = mlir::cast<fir::SequenceType>(cpnTy).getEleTy();
           // append array range in reverse (FIR arrays are column-major)
           offs.append(arrIdx.rbegin(), arrIdx.rend());
           arrIdx.clear();
           dims.reset();
           continue;
         }
-        if (auto arrTy = cpnTy.dyn_cast<fir::SequenceType>()) {
+        if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(cpnTy)) {
           int d = arrTy.getDimension() - 1;
           if (d > 0) {
             dims = d;
             arrIdx.push_back(nxtOpnd);
             continue;
           }
-          cpnTy = cpnTy.cast<fir::SequenceType>().getEleTy();
+          cpnTy = mlir::cast<fir::SequenceType>(cpnTy).getEleTy();
           offs.push_back(nxtOpnd);
           continue;
         }
 
         // check if the i-th coordinate relates to a field
-        if (auto recTy = cpnTy.dyn_cast<fir::RecordType>())
+        if (auto recTy = mlir::dyn_cast<fir::RecordType>(cpnTy))
           cpnTy = recTy.getType(getFieldNumber(recTy, nxtOpnd));
-        else if (auto tupTy = cpnTy.dyn_cast<mlir::TupleType>())
+        else if (auto tupTy = mlir::dyn_cast<mlir::TupleType>(cpnTy))
           cpnTy = tupTy.getType(getConstantIntValue(nxtOpnd));
         else
           cpnTy = nullptr;
@@ -2551,7 +2562,7 @@ struct FieldIndexOpConversion : public fir::FIROpConversion<fir::FieldIndexOp> {
   mlir::LogicalResult
   matchAndRewrite(fir::FieldIndexOp field, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
-    auto recTy = field.getOnType().cast<fir::RecordType>();
+    auto recTy = mlir::cast<fir::RecordType>(field.getOnType());
     unsigned index = recTy.getFieldIndex(field.getFieldId());
 
     if (!fir::hasDynamicSize(recTy)) {
@@ -2604,8 +2615,8 @@ struct TypeDescOpConversion : public fir::FIROpConversion<fir::TypeDescOp> {
   matchAndRewrite(fir::TypeDescOp typeDescOp, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
     mlir::Type inTy = typeDescOp.getInType();
-    assert(inTy.isa<fir::RecordType>() && "expecting fir.type");
-    auto recordType = inTy.dyn_cast<fir::RecordType>();
+    assert(mlir::isa<fir::RecordType>(inTy) && "expecting fir.type");
+    auto recordType = mlir::dyn_cast<fir::RecordType>(inTy);
     auto module = typeDescOp.getOperation()->getParentOfType<mlir::ModuleOp>();
     std::string typeDescName =
         fir::NameUniquer::getTypeDescriptorName(recordType.getName());
@@ -2732,7 +2743,7 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
           mlir::Type vecType = mlir::VectorType::get(
               insertOp.getType().getShape(), constant.getType());
           auto denseAttr = mlir::DenseElementsAttr::get(
-              vecType.cast<mlir::ShapedType>(), constant.getValue());
+              mlir::cast<mlir::ShapedType>(vecType), constant.getValue());
           rewriter.setInsertionPointAfter(insertOp);
           rewriter.replaceOpWithNewOp<mlir::arith::ConstantOp>(
               insertOp, seqTyAttr, denseAttr);
@@ -2808,7 +2819,7 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
   matchAndRewrite(fir::LoadOp load, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
     mlir::Type llvmLoadTy = convertObjectType(load.getType());
-    if (auto boxTy = load.getType().dyn_cast<fir::BaseBoxType>()) {
+    if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(load.getType())) {
       // fir.box is a special case because it is considered as an ssa values in
       // fir, but it is lowered as a pointer to a descriptor. So
       // fir.ref<fir.box> and fir.box end up being the same llvm types and
@@ -2921,7 +2932,7 @@ struct SelectCaseOpConversion : public fir::FIROpConversion<fir::SelectCaseOp> {
     llvm::ArrayRef<mlir::Attribute> cases = caseOp.getCases().getValue();
     // Type can be CHARACTER, INTEGER, or LOGICAL (C1145)
     auto ty = caseOp.getSelector().getType();
-    if (ty.isa<fir::CharacterType>()) {
+    if (mlir::isa<fir::CharacterType>(ty)) {
       TODO(caseOp.getLoc(), "fir.select_case codegen with character type");
       return mlir::failure();
     }
@@ -2935,25 +2946,25 @@ struct SelectCaseOpConversion : public fir::FIROpConversion<fir::SelectCaseOp> {
           *caseOp.getCompareOperands(adaptor.getOperands(), t);
       mlir::Value caseArg = *(cmpOps.value().begin());
       mlir::Attribute attr = cases[t];
-      if (attr.isa<fir::PointIntervalAttr>()) {
+      if (mlir::isa<fir::PointIntervalAttr>(attr)) {
         auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
             loc, mlir::LLVM::ICmpPredicate::eq, selector, caseArg);
         genCaseLadderStep(loc, cmp, dest, destOps, rewriter);
         continue;
       }
-      if (attr.isa<fir::LowerBoundAttr>()) {
+      if (mlir::isa<fir::LowerBoundAttr>(attr)) {
         auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
             loc, mlir::LLVM::ICmpPredicate::sle, caseArg, selector);
         genCaseLadderStep(loc, cmp, dest, destOps, rewriter);
         continue;
       }
-      if (attr.isa<fir::UpperBoundAttr>()) {
+      if (mlir::isa<fir::UpperBoundAttr>(attr)) {
         auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
             loc, mlir::LLVM::ICmpPredicate::sle, selector, caseArg);
         genCaseLadderStep(loc, cmp, dest, destOps, rewriter);
         continue;
       }
-      if (attr.isa<fir::ClosedIntervalAttr>()) {
+      if (mlir::isa<fir::ClosedIntervalAttr>(attr)) {
         auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
             loc, mlir::LLVM::ICmpPredicate::sle, caseArg, selector);
         auto *thisBlock = rewriter.getInsertionBlock();
@@ -2969,7 +2980,7 @@ struct SelectCaseOpConversion : public fir::FIROpConversion<fir::SelectCaseOp> {
         rewriter.setInsertionPointToEnd(newBlock2);
         continue;
       }
-      assert(attr.isa<mlir::UnitAttr>());
+      assert(mlir::isa<mlir::UnitAttr>(attr));
       assert((t + 1 == conds) && "unit must be last");
       genBrOp(caseOp, dest, destOps, rewriter);
     }
@@ -2997,13 +3008,13 @@ static void selectMatchAndRewrite(const fir::LLVMTypeConverter &lowering,
     mlir::Block *dest = select.getSuccessor(t);
     auto destOps = select.getSuccessorOperands(adaptor.getOperands(), t);
     const mlir::Attribute &attr = cases[t];
-    if (auto intAttr = attr.template dyn_cast<mlir::IntegerAttr>()) {
+    if (auto intAttr = mlir::dyn_cast<mlir::IntegerAttr>(attr)) {
       destinations.push_back(dest);
       destinationsOperands.push_back(destOps ? *destOps : mlir::ValueRange{});
       caseValues.push_back(intAttr.getInt());
       continue;
     }
-    assert(attr.template dyn_cast_or_null<mlir::UnitAttr>());
+    assert(mlir::dyn_cast_or_null<mlir::UnitAttr>(attr));
     assert((t + 1 == conds) && "unit must be last");
     defaultDestination = dest;
     defaultOperands = destOps ? *destOps : mlir::ValueRange{};
@@ -3071,7 +3082,7 @@ struct StoreOpConversion : public fir::FIROpConversion<fir::StoreOp> {
     mlir::Location loc = store.getLoc();
     mlir::Type storeTy = store.getValue().getType();
     mlir::LLVM::StoreOp newStoreOp;
-    if (auto boxTy = storeTy.dyn_cast<fir::BaseBoxType>()) {
+    if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(storeTy)) {
       // fir.box value is actually in memory, load it first before storing it.
       mlir::Type llvmBoxTy = lowerTy().convertBoxTypeAsStruct(boxTy);
       auto val = rewriter.create<mlir::LLVM::LoadOp>(loc, llvmBoxTy,
@@ -3186,9 +3197,9 @@ struct IsPresentOpConversion : public fir::FIROpConversion<fir::IsPresentOp> {
     mlir::Location loc = isPresent.getLoc();
     auto ptr = adaptor.getOperands()[0];
 
-    if (isPresent.getVal().getType().isa<fir::BoxCharType>()) {
+    if (mlir::isa<fir::BoxCharType>(isPresent.getVal().getType())) {
       [[maybe_unused]] auto structTy =
-          ptr.getType().cast<mlir::LLVM::LLVMStructType>();
+          mlir::cast<mlir::LLVM::LLVMStructType>(ptr.getType());
       assert(!structTy.isOpaque() && !structTy.getBody().empty());
 
       ptr = rewriter.create<mlir::LLVM::ExtractValueOp>(loc, ptr, 0);
@@ -3214,8 +3225,8 @@ struct AbsentOpConversion : public fir::FIROpConversion<fir::AbsentOp> {
     mlir::Type ty = convertType(absent.getType());
     mlir::Location loc = absent.getLoc();
 
-    if (absent.getType().isa<fir::BoxCharType>()) {
-      auto structTy = ty.cast<mlir::LLVM::LLVMStructType>();
+    if (mlir::isa<fir::BoxCharType>(absent.getType())) {
+      auto structTy = mlir::cast<mlir::LLVM::LLVMStructType>(ty);
       assert(!structTy.isOpaque() && !structTy.getBody().empty());
       auto undefStruct = rewriter.create<mlir::LLVM::UndefOp>(loc, ty);
       auto nullField =
diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
index 26871d888815..69e78167b073 100644
--- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
+++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
@@ -20,7 +20,7 @@ static inline mlir::Type getLlvmPtrType(mlir::MLIRContext *context,
 }
 
 static unsigned getTypeDescFieldId(mlir::Type ty) {
-  auto isArray = fir::dyn_cast_ptrOrBoxEleTy(ty).isa<fir::SequenceType>();
+  auto isArray = mlir::isa<fir::SequenceType>(fir::dyn_cast_ptrOrBoxEleTy(ty));
   return isArray ? kOptTypePtrPosInBox : kDimsPosInBox;
 }
 
@@ -37,7 +37,7 @@ ConvertFIRToLLVMPattern::ConvertFIRToLLVMPattern(
 // reference.
 mlir::Type
 ConvertFIRToLLVMPattern::convertObjectType(mlir::Type firType) const {
-  if (auto boxTy = firType.dyn_cast<fir::BaseBoxType>())
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(firType))
     return lowerTy().convertBoxTypeAsStruct(boxTy);
   return lowerTy().convertType(firType);
 }
@@ -69,7 +69,7 @@ ConvertFIRToLLVMPattern::integerCast(mlir::Location loc,
   auto valTy = val.getType();
   // If the value was not yet lowered, lower its type so that it can
   // be used in getPrimitiveTypeSizeInBits.
-  if (!valTy.isa<mlir::IntegerType>())
+  if (!mlir::isa<mlir::IntegerType>(valTy))
     valTy = convertType(valTy);
   auto toSize = mlir::LLVM::getPrimitiveTypeSizeInBits(ty);
   auto fromSize = mlir::LLVM::getPrimitiveTypeSizeInBits(valTy);
@@ -91,7 +91,7 @@ ConvertFIRToLLVMPattern::getBoxTypePair(mlir::Type firBoxTy) const {
 mlir::Value ConvertFIRToLLVMPattern::getValueFromBox(
     mlir::Location loc, TypePair boxTy, mlir::Value box, mlir::Type resultTy,
     mlir::ConversionPatternRewriter &rewriter, int boxValue) const {
-  if (box.getType().isa<mlir::LLVM::LLVMPointerType>()) {
+  if (mlir::isa<mlir::LLVM::LLVMPointerType>(box.getType())) {
     auto pty = getLlvmPtrType(resultTy.getContext());
     auto p = rewriter.create<mlir::LLVM::GEPOp>(
         loc, pty, boxTy.llvm, box,
@@ -133,7 +133,7 @@ llvm::SmallVector<mlir::Value, 3> ConvertFIRToLLVMPattern::getDimsFromBox(
 mlir::Value ConvertFIRToLLVMPattern::loadDimFieldFromBox(
     mlir::Location loc, TypePair boxTy, mlir::Value box, mlir::Value dim,
     int off, mlir::Type ty, mlir::ConversionPatternRewriter &rewriter) const {
-  assert(box.getType().isa<mlir::LLVM::LLVMPointerType>() &&
+  assert(mlir::isa<mlir::LLVM::LLVMPointerType>(box.getType()) &&
          "descriptor inquiry with runtime dim can only be done on descriptor "
          "in memory");
   mlir::LLVM::GEPOp p = genGEP(loc, boxTy.llvm, rewriter, box, 0,
@@ -146,7 +146,7 @@ mlir::Value ConvertFIRToLLVMPattern::loadDimFieldFromBox(
 mlir::Value ConvertFIRToLLVMPattern::getDimFieldFromBox(
     mlir::Location loc, TypePair boxTy, mlir::Value box, int dim, int off,
     mlir::Type ty, mlir::ConversionPatternRewriter &rewriter) const {
-  if (box.getType().isa<mlir::LLVM::LLVMPointerType>()) {
+  if (mlir::isa<mlir::LLVM::LLVMPointerType>(box.getType())) {
     mlir::LLVM::GEPOp p = genGEP(loc, boxTy.llvm, rewriter, box, 0,
                                  static_cast<int>(kDimsPosInBox), dim, off);
     auto loadOp = rewriter.create<mlir::LLVM::LoadOp>(loc, ty, p);
@@ -184,12 +184,12 @@ mlir::Value ConvertFIRToLLVMPattern::getElementSizeFromBox(
 mlir::Type ConvertFIRToLLVMPattern::getBoxEleTy(
     mlir::Type type, llvm::ArrayRef<std::int64_t> indexes) const {
   for (unsigned i : indexes) {
-    if (auto t = type.dyn_cast<mlir::LLVM::LLVMStructType>()) {
+    if (auto t = mlir::dyn_cast<mlir::LLVM::LLVMStructType>(type)) {
       assert(!t.isOpaque() && i < t.getBody().size());
       type = t.getBody()[i];
-    } else if (auto t = type.dyn_cast<mlir::LLVM::LLVMArrayType>()) {
+    } else if (auto t = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(type)) {
       type = t.getElementType();
-    } else if (auto t = type.dyn_cast<mlir::VectorType>()) {
+    } else if (auto t = mlir::dyn_cast<mlir::VectorType>(type)) {
       type = t.getElementType();
     } else {
       fir::emitFatalError(mlir::UnknownLoc::get(type.getContext()),
@@ -243,6 +243,7 @@ ConvertFIRToLLVMPattern::getBlockForAllocaInsert(mlir::Operation *op) const {
     return iface.getAllocaBlock();
   if (auto llvmFuncOp = mlir::dyn_cast<mlir::LLVM::LLVMFuncOp>(op))
     return &llvmFuncOp.front();
+
   return getBlockForAllocaInsert(op->getParentOp());
 }
 
@@ -257,9 +258,10 @@ mlir::Value ConvertFIRToLLVMPattern::genAllocaAndAddrCastWithType(
     mlir::ConversionPatternRewriter &rewriter) const {
   auto thisPt = rewriter.saveInsertionPoint();
   mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp();
-  if (mlir::isa<mlir::omp::DeclareReductionOp>(parentOp)) {
-    // DeclareReductionOp has multiple child regions. We want to get the first
-    // block of whichever of those regions we are currently in
+  if (mlir::isa<mlir::omp::DeclareReductionOp>(parentOp) ||
+      mlir::isa<mlir::omp::PrivateClauseOp>(parentOp)) {
+    // DeclareReductionOp & PrivateClauseOp have multiple child regions. We want
+    // to get the first block of whichever of those regions we are currently in
     mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent();
     rewriter.setInsertionPointToStart(&parentRegion->front());
   } else {
diff --git a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
index 665bf09b8fc3..5bd3ec8d1845 100644
--- a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
+++ b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
@@ -86,10 +86,10 @@ public:
     // If the embox does not include a shape, then do not convert it
     if (auto shapeVal = embox.getShape())
       return rewriteDynamicShape(embox, rewriter, shapeVal);
-    if (embox.getType().isa<fir::ClassType>())
+    if (mlir::isa<fir::ClassType>(embox.getType()))
       TODO(embox.getLoc(), "embox conversion for fir.class type");
-    if (auto boxTy = embox.getType().dyn_cast<fir::BoxType>())
-      if (auto seqTy = boxTy.getEleTy().dyn_cast<fir::SequenceType>())
+    if (auto boxTy = mlir::dyn_cast<fir::BoxType>(embox.getType()))
+      if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxTy.getEleTy()))
         if (!seqTy.hasDynamicExtents())
           return rewriteStaticShape(embox, rewriter, seqTy);
     return mlir::failure();
@@ -281,6 +281,20 @@ public:
   }
 };
 
+class DummyScopeOpConversion
+    : public mlir::OpRewritePattern<fir::DummyScopeOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(fir::DummyScopeOp dummyScopeOp,
+                  mlir::PatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<fir::UndefOp>(dummyScopeOp,
+                                              dummyScopeOp.getType());
+    return mlir::success();
+  }
+};
+
 class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
 public:
   void runOnOperation() override final {
@@ -293,11 +307,11 @@ public:
     target.addIllegalOp<fir::ArrayCoorOp>();
     target.addIllegalOp<fir::ReboxOp>();
     target.addIllegalOp<fir::DeclareOp>();
+    target.addIllegalOp<fir::DummyScopeOp>();
     target.addDynamicallyLegalOp<fir::EmboxOp>([](fir::EmboxOp embox) {
-      return !(embox.getShape() || embox.getType()
-                                       .cast<fir::BaseBoxType>()
-                                       .getEleTy()
-                                       .isa<fir::SequenceType>());
+      return !(embox.getShape() ||
+               mlir::isa<fir::SequenceType>(
+                   mlir::cast<fir::BaseBoxType>(embox.getType()).getEleTy()));
     });
     mlir::RewritePatternSet patterns(&context);
     fir::populatePreCGRewritePatterns(patterns);
@@ -322,5 +336,6 @@ std::unique_ptr<mlir::Pass> fir::createFirCodeGenRewritePass() {
 
 void fir::populatePreCGRewritePatterns(mlir::RewritePatternSet &patterns) {
   patterns.insert<EmboxConversion, ArrayCoorConversion, ReboxConversion,
-                  DeclareOpConversion>(patterns.getContext());
+                  DeclareOpConversion, DummyScopeOpConversion>(
+      patterns.getContext());
 }
diff --git a/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp b/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
index b1b0e9b766a6..a21384e8d594 100644
--- a/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
+++ b/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
@@ -120,7 +120,7 @@ void TBAABuilder::attachTBAATag(AliasAnalysisOpInterface op, Type baseFIRType,
     // with both data and descriptor accesses.
     // Conservatively set any-access tag if there is any descriptor member.
     tbaaTagSym = getAnyAccessTag(func);
-  } else if (baseFIRType.isa<fir::BaseBoxType>()) {
+  } else if (mlir::isa<fir::BaseBoxType>(baseFIRType)) {
     tbaaTagSym = getBoxAccessTag(baseFIRType, accessFIRType, gep, func);
   } else {
     tbaaTagSym = getDataAccessTag(baseFIRType, accessFIRType, gep, func);
diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp
index cea7a1f97f41..652e2bddc1b8 100644
--- a/flang/lib/Optimizer/CodeGen/Target.cpp
+++ b/flang/lib/Optimizer/CodeGen/Target.cpp
@@ -41,9 +41,9 @@ llvm::StringRef Attributes::getIntExtensionAttrName() const {
 static const llvm::fltSemantics &floatToSemantics(const KindMapping &kindMap,
                                                   mlir::Type type) {
   assert(isa_real(type));
-  if (auto ty = type.dyn_cast<fir::RealType>())
+  if (auto ty = mlir::dyn_cast<fir::RealType>(type))
     return kindMap.getFloatSemantics(ty.getFKind());
-  return type.cast<mlir::FloatType>().getFloatSemantics();
+  return mlir::cast<mlir::FloatType>(type).getFloatSemantics();
 }
 
 static void typeTodo(const llvm::fltSemantics *sem, mlir::Location loc,
diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
index 7bf31ec38695..616de78d0026 100644
--- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
+++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
@@ -137,7 +137,7 @@ public:
         if (!hasPortableSignature(dispatch.getFunctionType(), op))
           convertCallOp(dispatch);
       } else if (auto addr = mlir::dyn_cast<fir::AddrOfOp>(op)) {
-        if (addr.getType().isa<mlir::FunctionType>() &&
+        if (mlir::isa<mlir::FunctionType>(addr.getType()) &&
             !hasPortableSignature(addr.getType(), op))
           convertAddrOp(addr);
       }
@@ -601,7 +601,7 @@ public:
   /// Taking the address of a function. Modify the signature as needed.
   void convertAddrOp(fir::AddrOfOp addrOp) {
     rewriter->setInsertionPoint(addrOp);
-    auto addrTy = addrOp.getType().cast<mlir::FunctionType>();
+    auto addrTy = mlir::cast<mlir::FunctionType>(addrOp.getType());
     fir::CodeGenSpecifics::Marshalling newInTyAndAttrs;
     llvm::SmallVector<mlir::Type> newResTys;
     auto loc = addrOp.getLoc();
@@ -705,22 +705,23 @@ public:
   /// return `true`. Otherwise, the signature is not portable and `false` is
   /// returned.
   bool hasPortableSignature(mlir::Type signature, mlir::Operation *op) {
-    assert(signature.isa<mlir::FunctionType>());
-    auto func = signature.dyn_cast<mlir::FunctionType>();
+    assert(mlir::isa<mlir::FunctionType>(signature));
+    auto func = mlir::dyn_cast<mlir::FunctionType>(signature);
     bool hasCCallingConv = isFuncWithCCallingConvention(op);
     for (auto ty : func.getResults())
-      if ((ty.isa<fir::BoxCharType>() && !noCharacterConversion) ||
+      if ((mlir::isa<fir::BoxCharType>(ty) && !noCharacterConversion) ||
           (fir::isa_complex(ty) && !noComplexConversion) ||
-          (ty.isa<mlir::IntegerType>() && hasCCallingConv)) {
+          (mlir::isa<mlir::IntegerType>(ty) && hasCCallingConv)) {
         LLVM_DEBUG(llvm::dbgs() << "rewrite " << signature << " for target\n");
         return false;
       }
     for (auto ty : func.getInputs())
-      if (((ty.isa<fir::BoxCharType>() || fir::isCharacterProcedureTuple(ty)) &&
+      if (((mlir::isa<fir::BoxCharType>(ty) ||
+            fir::isCharacterProcedureTuple(ty)) &&
            !noCharacterConversion) ||
           (fir::isa_complex(ty) && !noComplexConversion) ||
-          (ty.isa<mlir::IntegerType>() && hasCCallingConv) ||
-          (ty.isa<fir::RecordType>() && !noStructConversion)) {
+          (mlir::isa<mlir::IntegerType>(ty) && hasCCallingConv) ||
+          (mlir::isa<fir::RecordType>(ty) && !noStructConversion)) {
         LLVM_DEBUG(llvm::dbgs() << "rewrite " << signature << " for target\n");
         return false;
       }
@@ -740,7 +741,7 @@ public:
   /// Rewrite the signatures and body of the `FuncOp`s in the module for
   /// the immediately subsequent target code gen.
   void convertSignature(mlir::func::FuncOp func) {
-    auto funcTy = func.getFunctionType().cast<mlir::FunctionType>();
+    auto funcTy = mlir::cast<mlir::FunctionType>(func.getFunctionType());
     if (hasPortableSignature(funcTy, func) && !hasHostAssociations(func))
       return;
     llvm::SmallVector<mlir::Type> newResTys;
diff --git a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp
index 8fa423f35806..729ece6fc177 100644
--- a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp
+++ b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp
@@ -103,10 +103,10 @@ LLVMTypeConverter::LLVMTypeConverter(mlir::ModuleOp module, bool applyTBAA,
     for (auto mem : tuple.getTypes()) {
       // Prevent fir.box from degenerating to a pointer to a descriptor in the
       // context of a tuple type.
-      if (auto box = mem.dyn_cast<fir::BaseBoxType>())
+      if (auto box = mlir::dyn_cast<fir::BaseBoxType>(mem))
         members.push_back(convertBoxTypeAsStruct(box));
       else
-        members.push_back(convertType(mem).cast<mlir::Type>());
+        members.push_back(mlir::cast<mlir::Type>(convertType(mem)));
     }
     return mlir::LLVM::LLVMStructType::getLiteral(&getContext(), members,
                                                   /*isPacked=*/false);
@@ -115,6 +115,11 @@ LLVMTypeConverter::LLVMTypeConverter(mlir::ModuleOp module, bool applyTBAA,
     return mlir::LLVM::LLVMStructType::getLiteral(
         none.getContext(), std::nullopt, /*isPacked=*/false);
   });
+  addConversion([&](fir::DummyScopeType dscope) {
+    // DummyScopeType values must not have any uses after PreCGRewrite.
+    // Convert it here to i1 just in case it survives.
+    return mlir::IntegerType::get(&getContext(), 1);
+  });
   // FIXME: https://reviews.llvm.org/D82831 introduced an automatic
   // materialization of conversion around function calls that is not working
   // well with fir lowering to llvm (incorrect llvm.mlir.cast are inserted).
@@ -181,10 +186,10 @@ std::optional<mlir::LogicalResult> LLVMTypeConverter::convertRecordType(
   for (auto mem : derived.getTypeList()) {
     // Prevent fir.box from degenerating to a pointer to a descriptor in the
     // context of a record type.
-    if (auto box = mem.second.dyn_cast<fir::BaseBoxType>())
+    if (auto box = mlir::dyn_cast<fir::BaseBoxType>(mem.second))
       members.push_back(convertBoxTypeAsStruct(box));
     else
-      members.push_back(convertType(mem.second).cast<mlir::Type>());
+      members.push_back(mlir::cast<mlir::Type>(convertType(mem.second)));
   }
   if (mlir::failed(st.setBody(members, /*isPacked=*/false)))
     return mlir::failure();
@@ -196,7 +201,7 @@ std::optional<mlir::LogicalResult> LLVMTypeConverter::convertRecordType(
 // Extended descriptors are required for derived types.
 bool LLVMTypeConverter::requiresExtendedDesc(mlir::Type boxElementType) const {
   auto eleTy = fir::unwrapSequenceType(boxElementType);
-  return eleTy.isa<fir::RecordType>();
+  return mlir::isa<fir::RecordType>(eleTy);
 }
 
 // This corresponds to the descriptor as defined in ISO_Fortran_binding.h and
@@ -211,7 +216,8 @@ mlir::Type LLVMTypeConverter::convertBoxTypeAsStruct(BaseBoxType box,
     ele = removeIndirection;
   auto eleTy = convertType(ele);
   // base_addr*
-  if (ele.isa<SequenceType>() && eleTy.isa<mlir::LLVM::LLVMPointerType>())
+  if (mlir::isa<SequenceType>(ele) &&
+      mlir::isa<mlir::LLVM::LLVMPointerType>(eleTy))
     dataDescFields.push_back(eleTy);
   else
     dataDescFields.push_back(
@@ -236,7 +242,7 @@ mlir::Type LLVMTypeConverter::convertBoxTypeAsStruct(BaseBoxType box,
       getDescFieldTypeModel<kF18AddendumPosInBox>()(&getContext()));
   // [dims]
   if (rank == unknownRank()) {
-    if (auto seqTy = ele.dyn_cast<SequenceType>())
+    if (auto seqTy = mlir::dyn_cast<SequenceType>(ele))
       rank = seqTy.getDimension();
     else
       rank = 0;
@@ -252,7 +258,8 @@ mlir::Type LLVMTypeConverter::convertBoxTypeAsStruct(BaseBoxType box,
     auto rowTy =
         getExtendedDescFieldTypeModel<kOptRowTypePosInBox>()(&getContext());
     dataDescFields.push_back(mlir::LLVM::LLVMArrayType::get(rowTy, 1));
-    if (auto recTy = fir::unwrapSequenceType(ele).dyn_cast<fir::RecordType>())
+    if (auto recTy =
+            mlir::dyn_cast<fir::RecordType>(fir::unwrapSequenceType(ele)))
       if (recTy.getNumLenParams() > 0) {
         // The descriptor design needs to be clarified regarding the number of
         // length parameters in the addendum. Since it can change for
diff --git a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
index e43710f5627e..9ea3a0568f69 100644
--- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
@@ -264,23 +264,23 @@ void fir::FortranVariableFlagsAttr::print(mlir::AsmPrinter &printer) const {
 void fir::printFirAttribute(FIROpsDialect *dialect, mlir::Attribute attr,
                             mlir::DialectAsmPrinter &p) {
   auto &os = p.getStream();
-  if (auto exact = attr.dyn_cast<fir::ExactTypeAttr>()) {
+  if (auto exact = mlir::dyn_cast<fir::ExactTypeAttr>(attr)) {
     os << fir::ExactTypeAttr::getAttrName() << '<';
     p.printType(exact.getType());
     os << '>';
-  } else if (auto sub = attr.dyn_cast<fir::SubclassAttr>()) {
+  } else if (auto sub = mlir::dyn_cast<fir::SubclassAttr>(attr)) {
     os << fir::SubclassAttr::getAttrName() << '<';
     p.printType(sub.getType());
     os << '>';
-  } else if (attr.dyn_cast_or_null<fir::PointIntervalAttr>()) {
+  } else if (mlir::dyn_cast_or_null<fir::PointIntervalAttr>(attr)) {
     os << fir::PointIntervalAttr::getAttrName();
-  } else if (attr.dyn_cast_or_null<fir::ClosedIntervalAttr>()) {
+  } else if (mlir::dyn_cast_or_null<fir::ClosedIntervalAttr>(attr)) {
     os << fir::ClosedIntervalAttr::getAttrName();
-  } else if (attr.dyn_cast_or_null<fir::LowerBoundAttr>()) {
+  } else if (mlir::dyn_cast_or_null<fir::LowerBoundAttr>(attr)) {
     os << fir::LowerBoundAttr::getAttrName();
-  } else if (attr.dyn_cast_or_null<fir::UpperBoundAttr>()) {
+  } else if (mlir::dyn_cast_or_null<fir::UpperBoundAttr>(attr)) {
     os << fir::UpperBoundAttr::getAttrName();
-  } else if (auto a = attr.dyn_cast_or_null<fir::RealAttr>()) {
+  } else if (auto a = mlir::dyn_cast_or_null<fir::RealAttr>(attr)) {
     os << fir::RealAttr::getAttrName() << '<' << a.getFKind() << ", i x";
     llvm::SmallString<40> ss;
     a.getValue().bitcastToAPInt().toStringUnsigned(ss, 16);
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 24af94f9b90a..6773d0adced0 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -57,7 +57,7 @@ static void propagateAttributes(mlir::Operation *fromOp,
 static bool verifyInType(mlir::Type inType,
                          llvm::SmallVectorImpl<llvm::StringRef> &visited,
                          unsigned dynamicExtents = 0) {
-  if (auto st = inType.dyn_cast<fir::SequenceType>()) {
+  if (auto st = mlir::dyn_cast<fir::SequenceType>(inType)) {
     auto shape = st.getShape();
     if (shape.size() == 0)
       return true;
@@ -67,7 +67,7 @@ static bool verifyInType(mlir::Type inType,
       if (dynamicExtents-- == 0)
         return true;
     }
-  } else if (auto rt = inType.dyn_cast<fir::RecordType>()) {
+  } else if (auto rt = mlir::dyn_cast<fir::RecordType>(inType)) {
     // don't recurse if we're already visiting this one
     if (llvm::is_contained(visited, rt.getName()))
       return false;
@@ -84,13 +84,13 @@ static bool verifyInType(mlir::Type inType,
 static bool verifyTypeParamCount(mlir::Type inType, unsigned numParams) {
   auto ty = fir::unwrapSequenceType(inType);
   if (numParams > 0) {
-    if (auto recTy = ty.dyn_cast<fir::RecordType>())
+    if (auto recTy = mlir::dyn_cast<fir::RecordType>(ty))
       return numParams != recTy.getNumLenParams();
-    if (auto chrTy = ty.dyn_cast<fir::CharacterType>())
+    if (auto chrTy = mlir::dyn_cast<fir::CharacterType>(ty))
       return !(numParams == 1 && chrTy.hasDynamicLen());
     return true;
   }
-  if (auto chrTy = ty.dyn_cast<fir::CharacterType>())
+  if (auto chrTy = mlir::dyn_cast<fir::CharacterType>(ty))
     return !chrTy.hasConstantLen();
   return false;
 }
@@ -171,13 +171,13 @@ static void printAllocatableOp(mlir::OpAsmPrinter &p, OP &op) {
 /// Create a legal memory reference as return type
 static mlir::Type wrapAllocaResultType(mlir::Type intype) {
   // FIR semantics: memory references to memory references are disallowed
-  if (intype.isa<fir::ReferenceType>())
+  if (mlir::isa<fir::ReferenceType>(intype))
     return {};
   return fir::ReferenceType::get(intype);
 }
 
 mlir::Type fir::AllocaOp::getAllocatedType() {
-  return getType().cast<fir::ReferenceType>().getEleTy();
+  return mlir::cast<fir::ReferenceType>(getType()).getEleTy();
 }
 
 mlir::Type fir::AllocaOp::getRefTy(mlir::Type ty) {
@@ -270,7 +270,7 @@ mlir::LogicalResult fir::AllocaOp::verify() {
   if (verifyTypeParamCount(getInType(), numLenParams()))
     return emitOpError("LEN params do not correspond to type");
   mlir::Type outType = getType();
-  if (!outType.isa<fir::ReferenceType>())
+  if (!mlir::isa<fir::ReferenceType>(outType))
     return emitOpError("must be a !fir.ref type");
   if (fir::isa_unknown_size_box(fir::dyn_cast_ptrEleTy(outType)))
     return emitOpError("cannot allocate !fir.box of unknown rank or type");
@@ -286,14 +286,14 @@ static mlir::Type wrapAllocMemResultType(mlir::Type intype) {
   // Fortran semantics: C852 an entity cannot be both ALLOCATABLE and POINTER
   // 8.5.3 note 1 prohibits ALLOCATABLE procedures as well
   // FIR semantics: one may not allocate a memory reference value
-  if (intype.isa<fir::ReferenceType, fir::HeapType, fir::PointerType,
-                 mlir::FunctionType>())
+  if (mlir::isa<fir::ReferenceType, fir::HeapType, fir::PointerType,
+                mlir::FunctionType>(intype))
     return {};
   return fir::HeapType::get(intype);
 }
 
 mlir::Type fir::AllocMemOp::getAllocatedType() {
-  return getType().cast<fir::HeapType>().getEleTy();
+  return mlir::cast<fir::HeapType>(getType()).getEleTy();
 }
 
 mlir::Type fir::AllocMemOp::getRefTy(mlir::Type ty) {
@@ -348,7 +348,7 @@ mlir::LogicalResult fir::AllocMemOp::verify() {
   if (verifyTypeParamCount(getInType(), numLenParams()))
     return emitOpError("LEN params do not correspond to type");
   mlir::Type outType = getType();
-  if (!outType.dyn_cast<fir::HeapType>())
+  if (!mlir::dyn_cast<fir::HeapType>(outType))
     return emitOpError("must be a !fir.heap type");
   if (fir::isa_unknown_size_box(fir::dyn_cast_ptrEleTy(outType)))
     return emitOpError("cannot allocate !fir.box of unknown rank or type");
@@ -364,13 +364,13 @@ mlir::LogicalResult fir::AllocMemOp::verify() {
 static bool validTypeParams(mlir::Type dynTy, mlir::ValueRange typeParams) {
   dynTy = fir::unwrapAllRefAndSeqType(dynTy);
   // A box value will contain type parameter values itself.
-  if (dynTy.isa<fir::BoxType>())
+  if (mlir::isa<fir::BoxType>(dynTy))
     return typeParams.size() == 0;
   // Derived type must have all type parameters satisfied.
-  if (auto recTy = dynTy.dyn_cast<fir::RecordType>())
+  if (auto recTy = mlir::dyn_cast<fir::RecordType>(dynTy))
     return typeParams.size() == recTy.getNumLenParams();
   // Characters with non-constant LEN must have a type parameter value.
-  if (auto charTy = dynTy.dyn_cast<fir::CharacterType>())
+  if (auto charTy = mlir::dyn_cast<fir::CharacterType>(dynTy))
     if (charTy.hasDynamicLen())
       return typeParams.size() == 1;
   // Otherwise, any type parameters are invalid.
@@ -379,7 +379,7 @@ static bool validTypeParams(mlir::Type dynTy, mlir::ValueRange typeParams) {
 
 mlir::LogicalResult fir::ArrayCoorOp::verify() {
   auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(getMemref().getType());
-  auto arrTy = eleTy.dyn_cast<fir::SequenceType>();
+  auto arrTy = mlir::dyn_cast<fir::SequenceType>(eleTy);
   if (!arrTy)
     return emitOpError("must be a reference to an array");
   auto arrDim = arrTy.getDimension();
@@ -387,14 +387,14 @@ mlir::LogicalResult fir::ArrayCoorOp::verify() {
   if (auto shapeOp = getShape()) {
     auto shapeTy = shapeOp.getType();
     unsigned shapeTyRank = 0;
-    if (auto s = shapeTy.dyn_cast<fir::ShapeType>()) {
+    if (auto s = mlir::dyn_cast<fir::ShapeType>(shapeTy)) {
       shapeTyRank = s.getRank();
-    } else if (auto ss = shapeTy.dyn_cast<fir::ShapeShiftType>()) {
+    } else if (auto ss = mlir::dyn_cast<fir::ShapeShiftType>(shapeTy)) {
       shapeTyRank = ss.getRank();
     } else {
-      auto s = shapeTy.cast<fir::ShiftType>();
+      auto s = mlir::cast<fir::ShiftType>(shapeTy);
       shapeTyRank = s.getRank();
-      if (!getMemref().getType().isa<fir::BaseBoxType>())
+      if (!mlir::isa<fir::BaseBoxType>(getMemref().getType()))
         return emitOpError("shift can only be provided with fir.box memref");
     }
     if (arrDim && arrDim != shapeTyRank)
@@ -407,7 +407,7 @@ mlir::LogicalResult fir::ArrayCoorOp::verify() {
     if (auto sl = mlir::dyn_cast_or_null<fir::SliceOp>(sliceOp.getDefiningOp()))
       if (!sl.getSubstr().empty())
         return emitOpError("array_coor cannot take a slice with substring");
-    if (auto sliceTy = sliceOp.getType().dyn_cast<fir::SliceType>())
+    if (auto sliceTy = mlir::dyn_cast<fir::SliceType>(sliceOp.getType()))
       if (sliceTy.getRank() != arrDim)
         return emitOpError("rank of dimension in slice mismatched");
   }
@@ -422,13 +422,13 @@ mlir::LogicalResult fir::ArrayCoorOp::verify() {
 //===----------------------------------------------------------------------===//
 
 static mlir::Type adjustedElementType(mlir::Type t) {
-  if (auto ty = t.dyn_cast<fir::ReferenceType>()) {
+  if (auto ty = mlir::dyn_cast<fir::ReferenceType>(t)) {
     auto eleTy = ty.getEleTy();
     if (fir::isa_char(eleTy))
       return eleTy;
     if (fir::isa_derived(eleTy))
       return eleTy;
-    if (eleTy.isa<fir::SequenceType>())
+    if (mlir::isa<fir::SequenceType>(eleTy))
       return eleTy;
   }
   return t;
@@ -448,7 +448,7 @@ std::vector<mlir::Value> fir::ArrayLoadOp::getExtents() {
 
 mlir::LogicalResult fir::ArrayLoadOp::verify() {
   auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(getMemref().getType());
-  auto arrTy = eleTy.dyn_cast<fir::SequenceType>();
+  auto arrTy = mlir::dyn_cast<fir::SequenceType>(eleTy);
   if (!arrTy)
     return emitOpError("must be a reference to an array");
   auto arrDim = arrTy.getDimension();
@@ -456,14 +456,14 @@ mlir::LogicalResult fir::ArrayLoadOp::verify() {
   if (auto shapeOp = getShape()) {
     auto shapeTy = shapeOp.getType();
     unsigned shapeTyRank = 0u;
-    if (auto s = shapeTy.dyn_cast<fir::ShapeType>()) {
+    if (auto s = mlir::dyn_cast<fir::ShapeType>(shapeTy)) {
       shapeTyRank = s.getRank();
-    } else if (auto ss = shapeTy.dyn_cast<fir::ShapeShiftType>()) {
+    } else if (auto ss = mlir::dyn_cast<fir::ShapeShiftType>(shapeTy)) {
       shapeTyRank = ss.getRank();
     } else {
-      auto s = shapeTy.cast<fir::ShiftType>();
+      auto s = mlir::cast<fir::ShiftType>(shapeTy);
       shapeTyRank = s.getRank();
-      if (!getMemref().getType().isa<fir::BaseBoxType>())
+      if (!mlir::isa<fir::BaseBoxType>(getMemref().getType()))
         return emitOpError("shift can only be provided with fir.box memref");
     }
     if (arrDim && arrDim != shapeTyRank)
@@ -474,7 +474,7 @@ mlir::LogicalResult fir::ArrayLoadOp::verify() {
     if (auto sl = mlir::dyn_cast_or_null<fir::SliceOp>(sliceOp.getDefiningOp()))
       if (!sl.getSubstr().empty())
         return emitOpError("array_load cannot take a slice with substring");
-    if (auto sliceTy = sliceOp.getType().dyn_cast<fir::SliceType>())
+    if (auto sliceTy = mlir::dyn_cast<fir::SliceType>(sliceOp.getType()))
       if (sliceTy.getRank() != arrDim)
         return emitOpError("rank of dimension in slice mismatched");
   }
@@ -502,7 +502,7 @@ mlir::LogicalResult fir::ArrayMergeStoreOp::verify() {
         // This is an intra-object merge, where the slice is projecting the
         // subfields that are to be overwritten by the merge operation.
         auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(getMemref().getType());
-        if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>()) {
+        if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy)) {
           auto projTy =
               fir::applyPathToType(seqTy.getEleTy(), sliceOp.getFields());
           if (fir::unwrapSequenceType(getOriginal().getType()) != projTy)
@@ -540,7 +540,7 @@ mlir::Type validArraySubobject(A op) {
 }
 
 mlir::LogicalResult fir::ArrayFetchOp::verify() {
-  auto arrTy = getSequence().getType().cast<fir::SequenceType>();
+  auto arrTy = mlir::cast<fir::SequenceType>(getSequence().getType());
   auto indSize = getIndices().size();
   if (indSize < arrTy.getDimension())
     return emitOpError("number of indices != dimension of array");
@@ -562,7 +562,7 @@ mlir::LogicalResult fir::ArrayFetchOp::verify() {
 //===----------------------------------------------------------------------===//
 
 mlir::LogicalResult fir::ArrayAccessOp::verify() {
-  auto arrTy = getSequence().getType().cast<fir::SequenceType>();
+  auto arrTy = mlir::cast<fir::SequenceType>(getSequence().getType());
   std::size_t indSize = getIndices().size();
   if (indSize < arrTy.getDimension())
     return emitOpError("number of indices != dimension of array");
@@ -584,7 +584,7 @@ mlir::LogicalResult fir::ArrayAccessOp::verify() {
 mlir::LogicalResult fir::ArrayUpdateOp::verify() {
   if (fir::isa_ref_type(getMerge().getType()))
     return emitOpError("does not support reference type for merge");
-  auto arrTy = getSequence().getType().cast<fir::SequenceType>();
+  auto arrTy = mlir::cast<fir::SequenceType>(getSequence().getType());
   auto indSize = getIndices().size();
   if (indSize < arrTy.getDimension())
     return emitOpError("number of indices != dimension of array");
@@ -604,7 +604,7 @@ mlir::LogicalResult fir::ArrayUpdateOp::verify() {
 //===----------------------------------------------------------------------===//
 
 mlir::LogicalResult fir::ArrayModifyOp::verify() {
-  auto arrTy = getSequence().getType().cast<fir::SequenceType>();
+  auto arrTy = mlir::cast<fir::SequenceType>(getSequence().getType());
   auto indSize = getIndices().size();
   if (indSize < arrTy.getDimension())
     return emitOpError("number of indices must match array dimension");
@@ -740,7 +740,7 @@ mlir::ParseResult fir::CallOp::parse(mlir::OpAsmParser &parser,
       parser.parseType(type))
     return mlir::failure();
 
-  auto funcType = type.dyn_cast<mlir::FunctionType>();
+  auto funcType = mlir::dyn_cast<mlir::FunctionType>(type);
   if (!funcType)
     return parser.emitError(parser.getNameLoc(), "expected function type");
   if (isDirect) {
@@ -785,7 +785,7 @@ void fir::CallOp::build(mlir::OpBuilder &builder, mlir::OperationState &result,
 mlir::LogicalResult fir::CharConvertOp::verify() {
   auto unwrap = [&](mlir::Type t) {
     t = fir::unwrapSequenceType(fir::dyn_cast_ptrEleTy(t));
-    return t.dyn_cast<fir::CharacterType>();
+    return mlir::dyn_cast<fir::CharacterType>(t);
   };
   auto inTy = unwrap(getFrom().getType());
   auto outTy = unwrap(getTo().getType());
@@ -832,13 +832,13 @@ static mlir::ParseResult parseCmpOp(mlir::OpAsmParser &parser,
       parser.resolveOperands(ops, type, result.operands))
     return mlir::failure();
 
-  if (!predicateNameAttr.isa<mlir::StringAttr>())
+  if (!mlir::isa<mlir::StringAttr>(predicateNameAttr))
     return parser.emitError(parser.getNameLoc(),
                             "expected string comparison predicate attribute");
 
   // Rewrite string attribute to an enum value.
   llvm::StringRef predicateName =
-      predicateNameAttr.cast<mlir::StringAttr>().getValue();
+      mlir::cast<mlir::StringAttr>(predicateNameAttr).getValue();
   auto predicate = fir::CmpcOp::getPredicateByName(predicateName);
   auto builder = parser.getBuilder();
   mlir::Type i1Type = builder.getI1Type();
@@ -906,7 +906,7 @@ void fir::ConstcOp::print(mlir::OpAsmPrinter &p) {
 }
 
 mlir::LogicalResult fir::ConstcOp::verify() {
-  if (!getType().isa<fir::ComplexType>())
+  if (!mlir::isa<fir::ComplexType>(getType()))
     return emitOpError("must be a !fir.complex type");
   return mlir::success();
 }
@@ -929,15 +929,16 @@ mlir::OpFoldResult fir::ConvertOp::fold(FoldAdaptor adaptor) {
   if (matchPattern(getValue(), mlir::m_Op<fir::ConvertOp>())) {
     auto inner = mlir::cast<fir::ConvertOp>(getValue().getDefiningOp());
     // (convert (convert 'a : logical -> i1) : i1 -> logical) ==> forward 'a
-    if (auto toTy = getType().dyn_cast<fir::LogicalType>())
-      if (auto fromTy = inner.getValue().getType().dyn_cast<fir::LogicalType>())
-        if (inner.getType().isa<mlir::IntegerType>() && (toTy == fromTy))
+    if (auto toTy = mlir::dyn_cast<fir::LogicalType>(getType()))
+      if (auto fromTy =
+              mlir::dyn_cast<fir::LogicalType>(inner.getValue().getType()))
+        if (mlir::isa<mlir::IntegerType>(inner.getType()) && (toTy == fromTy))
           return inner.getValue();
     // (convert (convert 'a : i1 -> logical) : logical -> i1) ==> forward 'a
-    if (auto toTy = getType().dyn_cast<mlir::IntegerType>())
+    if (auto toTy = mlir::dyn_cast<mlir::IntegerType>(getType()))
       if (auto fromTy =
-              inner.getValue().getType().dyn_cast<mlir::IntegerType>())
-        if (inner.getType().isa<fir::LogicalType>() && (toTy == fromTy) &&
+              mlir::dyn_cast<mlir::IntegerType>(inner.getValue().getType()))
+        if (mlir::isa<fir::LogicalType>(inner.getType()) && (toTy == fromTy) &&
             (fromTy.getWidth() == 1))
           return inner.getValue();
   }
@@ -945,7 +946,7 @@ mlir::OpFoldResult fir::ConvertOp::fold(FoldAdaptor adaptor) {
 }
 
 bool fir::ConvertOp::isInteger(mlir::Type ty) {
-  return ty.isa<mlir::IntegerType, mlir::IndexType, fir::IntegerType>();
+  return mlir::isa<mlir::IntegerType, mlir::IndexType, fir::IntegerType>(ty);
 }
 
 bool fir::ConvertOp::isIntegerCompatible(mlir::Type ty) {
@@ -953,13 +954,13 @@ bool fir::ConvertOp::isIntegerCompatible(mlir::Type ty) {
 }
 
 bool fir::ConvertOp::isFloatCompatible(mlir::Type ty) {
-  return ty.isa<mlir::FloatType, fir::RealType>();
+  return mlir::isa<mlir::FloatType, fir::RealType>(ty);
 }
 
 bool fir::ConvertOp::isPointerCompatible(mlir::Type ty) {
-  return ty.isa<fir::ReferenceType, fir::PointerType, fir::HeapType,
-                fir::LLVMPointerType, mlir::MemRefType, mlir::FunctionType,
-                fir::TypeDescType>();
+  return mlir::isa<fir::ReferenceType, fir::PointerType, fir::HeapType,
+                   fir::LLVMPointerType, mlir::MemRefType, mlir::FunctionType,
+                   fir::TypeDescType>(ty);
 }
 
 static std::optional<mlir::Type> getVectorElementType(mlir::Type ty) {
@@ -1026,12 +1027,14 @@ bool fir::ConvertOp::canBeConverted(mlir::Type inType, mlir::Type outType) {
          (isFloatCompatible(inType) && isFloatCompatible(outType)) ||
          (isIntegerCompatible(inType) && isPointerCompatible(outType)) ||
          (isPointerCompatible(inType) && isIntegerCompatible(outType)) ||
-         (inType.isa<fir::BoxType>() && outType.isa<fir::BoxType>()) ||
-         (inType.isa<fir::BoxProcType>() && outType.isa<fir::BoxProcType>()) ||
+         (mlir::isa<fir::BoxType>(inType) &&
+          mlir::isa<fir::BoxType>(outType)) ||
+         (mlir::isa<fir::BoxProcType>(inType) &&
+          mlir::isa<fir::BoxProcType>(outType)) ||
          (fir::isa_complex(inType) && fir::isa_complex(outType)) ||
          (fir::isBoxedRecordType(inType) && fir::isPolymorphicType(outType)) ||
          (fir::isPolymorphicType(inType) && fir::isPolymorphicType(outType)) ||
-         (fir::isPolymorphicType(inType) && outType.isa<BoxType>()) ||
+         (fir::isPolymorphicType(inType) && mlir::isa<BoxType>(outType)) ||
          areVectorsCompatible(inType, outType);
 }
 
@@ -1079,7 +1082,7 @@ mlir::LogicalResult fir::CoordinateOp::verify() {
   const mlir::Type refTy = getRef().getType();
   if (fir::isa_ref_type(refTy)) {
     auto eleTy = fir::dyn_cast_ptrEleTy(refTy);
-    if (auto arrTy = eleTy.dyn_cast<fir::SequenceType>()) {
+    if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(eleTy)) {
       if (arrTy.hasUnknownShape())
         return emitOpError("cannot find coordinate in unknown shape");
       if (arrTy.getConstantRows() < arrTy.getDimension() - 1)
@@ -1094,8 +1097,8 @@ mlir::LogicalResult fir::CoordinateOp::verify() {
   const unsigned numCoors = getCoor().size();
   for (auto coorOperand : llvm::enumerate(getCoor())) {
     auto co = coorOperand.value();
-    if (dimension == 0 && eleTy.isa<fir::SequenceType>()) {
-      dimension = eleTy.cast<fir::SequenceType>().getDimension();
+    if (dimension == 0 && mlir::isa<fir::SequenceType>(eleTy)) {
+      dimension = mlir::cast<fir::SequenceType>(eleTy).getDimension();
       if (dimension == 0)
         return emitOpError("cannot apply to array of unknown rank");
     }
@@ -1104,7 +1107,7 @@ mlir::LogicalResult fir::CoordinateOp::verify() {
         // Recovering a LEN type parameter only makes sense from a boxed
         // value. For a bare reference, the LEN type parameters must be
         // passed as additional arguments to `index`.
-        if (refTy.isa<fir::BoxType>()) {
+        if (mlir::isa<fir::BoxType>(refTy)) {
           if (coorOperand.index() != numCoors - 1)
             return emitOpError("len_param_index must be last argument");
           if (getNumOperands() != 2)
@@ -1117,7 +1120,7 @@ mlir::LogicalResult fir::CoordinateOp::verify() {
       } else if (auto index = mlir::dyn_cast<fir::FieldIndexOp>(defOp)) {
         if (eleTy != index.getOnType())
           emitOpError("field_index type not compatible with reference type");
-        if (auto recTy = eleTy.dyn_cast<fir::RecordType>()) {
+        if (auto recTy = mlir::dyn_cast<fir::RecordType>(eleTy)) {
           eleTy = recTy.getType(index.getFieldName());
           continue;
         }
@@ -1126,21 +1129,21 @@ mlir::LogicalResult fir::CoordinateOp::verify() {
     }
     if (dimension) {
       if (--dimension == 0)
-        eleTy = eleTy.cast<fir::SequenceType>().getEleTy();
+        eleTy = mlir::cast<fir::SequenceType>(eleTy).getEleTy();
     } else {
-      if (auto t = eleTy.dyn_cast<mlir::TupleType>()) {
+      if (auto t = mlir::dyn_cast<mlir::TupleType>(eleTy)) {
         // FIXME: Generally, we don't know which field of the tuple is being
         // referred to unless the operand is a constant. Just assume everything
         // is good in the tuple case for now.
         return mlir::success();
-      } else if (auto t = eleTy.dyn_cast<fir::RecordType>()) {
+      } else if (auto t = mlir::dyn_cast<fir::RecordType>(eleTy)) {
         // FIXME: This is the same as the tuple case.
         return mlir::success();
-      } else if (auto t = eleTy.dyn_cast<fir::ComplexType>()) {
+      } else if (auto t = mlir::dyn_cast<fir::ComplexType>(eleTy)) {
         eleTy = t.getElementType();
-      } else if (auto t = eleTy.dyn_cast<mlir::ComplexType>()) {
+      } else if (auto t = mlir::dyn_cast<mlir::ComplexType>(eleTy)) {
         eleTy = t.getElementType();
-      } else if (auto t = eleTy.dyn_cast<fir::CharacterType>()) {
+      } else if (auto t = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
         if (t.getLen() == fir::CharacterType::singleton())
           return emitOpError("cannot apply to character singleton");
         eleTy = fir::CharacterType::getSingleton(t.getContext(), t.getFKind());
@@ -1216,17 +1219,17 @@ mlir::LogicalResult fir::TypeInfoOp::verify() {
 mlir::LogicalResult fir::EmboxOp::verify() {
   auto eleTy = fir::dyn_cast_ptrEleTy(getMemref().getType());
   bool isArray = false;
-  if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>()) {
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy)) {
     eleTy = seqTy.getEleTy();
     isArray = true;
   }
   if (hasLenParams()) {
     auto lenPs = numLenParams();
-    if (auto rt = eleTy.dyn_cast<fir::RecordType>()) {
+    if (auto rt = mlir::dyn_cast<fir::RecordType>(eleTy)) {
       if (lenPs != rt.getNumLenParams())
         return emitOpError("number of LEN params does not correspond"
                            " to the !fir.type type");
-    } else if (auto strTy = eleTy.dyn_cast<fir::CharacterType>()) {
+    } else if (auto strTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
       if (strTy.getLen() != fir::CharacterType::unknownLen())
         return emitOpError("CHARACTER already has static LEN");
     } else {
@@ -1240,7 +1243,7 @@ mlir::LogicalResult fir::EmboxOp::verify() {
     return emitOpError("shape must not be provided for a scalar");
   if (getSlice() && !isArray)
     return emitOpError("slice must not be provided for a scalar");
-  if (getSourceBox() && !getResult().getType().isa<fir::ClassType>())
+  if (getSourceBox() && !mlir::isa<fir::ClassType>(getResult().getType()))
     return emitOpError("source_box must be used with fir.class result type");
   return mlir::success();
 }
@@ -1251,7 +1254,7 @@ mlir::LogicalResult fir::EmboxOp::verify() {
 
 mlir::LogicalResult fir::EmboxCharOp::verify() {
   auto eleTy = fir::dyn_cast_ptrEleTy(getMemref().getType());
-  if (!eleTy.dyn_cast_or_null<fir::CharacterType>())
+  if (!mlir::dyn_cast_or_null<fir::CharacterType>(eleTy))
     return mlir::failure();
   return mlir::success();
 }
@@ -1263,8 +1266,8 @@ mlir::LogicalResult fir::EmboxCharOp::verify() {
 mlir::LogicalResult fir::EmboxProcOp::verify() {
   // host bindings (optional) must be a reference to a tuple
   if (auto h = getHost()) {
-    if (auto r = h.getType().dyn_cast<fir::ReferenceType>())
-      if (r.getEleTy().isa<mlir::TupleType>())
+    if (auto r = mlir::dyn_cast<fir::ReferenceType>(h.getType()))
+      if (mlir::isa<mlir::TupleType>(r.getEleTy()))
         return mlir::success();
     return mlir::failure();
   }
@@ -1300,7 +1303,7 @@ void fir::TypeDescOp::print(mlir::OpAsmPrinter &p) {
 
 mlir::LogicalResult fir::TypeDescOp::verify() {
   mlir::Type resultTy = getType();
-  if (auto tdesc = resultTy.dyn_cast<fir::TypeDescType>()) {
+  if (auto tdesc = mlir::dyn_cast<fir::TypeDescType>(resultTy)) {
     if (tdesc.getOfTy() != getInType())
       return emitOpError("wrapped type mismatched");
     return mlir::success();
@@ -1527,7 +1530,7 @@ mlir::ParseResult parseFieldLikeOp(mlir::OpAsmParser &parser,
     return mlir::failure();
   result.addAttribute(fir::FieldIndexOp::getFieldAttrName(),
                       builder.getStringAttr(fieldName));
-  if (!recty.dyn_cast<fir::RecordType>())
+  if (!mlir::dyn_cast<fir::RecordType>(recty))
     return mlir::failure();
   result.addAttribute(fir::FieldIndexOp::getTypeAttrName(),
                       mlir::TypeAttr::get(recty));
@@ -1671,7 +1674,7 @@ mlir::LogicalResult fir::InsertOnRangeOp::verify() {
 //===----------------------------------------------------------------------===//
 
 static bool checkIsIntegerConstant(mlir::Attribute attr, std::int64_t conVal) {
-  if (auto iattr = attr.dyn_cast<mlir::IntegerAttr>())
+  if (auto iattr = mlir::dyn_cast<mlir::IntegerAttr>(attr))
     return iattr.getInt() == conVal;
   return false;
 }
@@ -1690,7 +1693,7 @@ struct UndoComplexPattern : public mlir::RewritePattern {
   matchAndRewrite(mlir::Operation *op,
                   mlir::PatternRewriter &rewriter) const override {
     auto insval = mlir::dyn_cast_or_null<fir::InsertValueOp>(op);
-    if (!insval || !insval.getType().isa<fir::ComplexType>())
+    if (!insval || !mlir::isa<fir::ComplexType>(insval.getType()))
       return mlir::failure();
     auto insval2 = mlir::dyn_cast_or_null<fir::InsertValueOp>(
         insval.getAdt().getDefiningOp());
@@ -1819,7 +1822,7 @@ mlir::ParseResult fir::IterWhileOp::parse(mlir::OpAsmParser &parser,
         parser.parseRParen())
       return mlir::failure();
     // Type list must be "(index, i1)".
-    if (typeList.size() != 2 || !typeList[0].isa<mlir::IndexType>() ||
+    if (typeList.size() != 2 || !mlir::isa<mlir::IndexType>(typeList[0]) ||
         !typeList[1].isSignlessInteger(1))
       return mlir::failure();
     result.addTypes(typeList);
@@ -1873,7 +1876,7 @@ mlir::LogicalResult fir::IterWhileOp::verify() {
   auto opNumResults = getNumResults();
   if (getFinalValue()) {
     // Result type must be "(index, i1, ...)".
-    if (!getResult(0).getType().isa<mlir::IndexType>())
+    if (!mlir::isa<mlir::IndexType>(getResult(0).getType()))
       return emitOpError("result #0 expected to be index");
     if (!getResult(1).getType().isSignlessInteger(1))
       return emitOpError("result #1 expected to be i1");
@@ -2316,7 +2319,7 @@ void fir::DTEntryOp::print(mlir::OpAsmPrinter &p) {
 /// Example: return f32 for !fir.box<!fir.heap<!fir.array<?x?xf32>>.
 static mlir::Type getBoxScalarEleTy(mlir::Type boxTy) {
   auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(boxTy);
-  if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>())
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy))
     return seqTy.getEleTy();
   return eleTy;
 }
@@ -2324,8 +2327,8 @@ static mlir::Type getBoxScalarEleTy(mlir::Type boxTy) {
 /// Test if \p t1 and \p t2 are compatible character types (if they can
 /// represent the same type at runtime).
 static bool areCompatibleCharacterTypes(mlir::Type t1, mlir::Type t2) {
-  auto c1 = t1.dyn_cast<fir::CharacterType>();
-  auto c2 = t2.dyn_cast<fir::CharacterType>();
+  auto c1 = mlir::dyn_cast<fir::CharacterType>(t1);
+  auto c2 = mlir::dyn_cast<fir::CharacterType>(t2);
   if (!c1 || !c2)
     return false;
   if (c1.hasDynamicLen() || c2.hasDynamicLen())
@@ -2347,10 +2350,10 @@ mlir::LogicalResult fir::ReboxOp::verify() {
 
   if (auto sliceVal = getSlice()) {
     // Slicing case
-    if (sliceVal.getType().cast<fir::SliceType>().getRank() != inputRank)
+    if (mlir::cast<fir::SliceType>(sliceVal.getType()).getRank() != inputRank)
       return emitOpError("slice operand rank must match box operand rank");
     if (auto shapeVal = getShape()) {
-      if (auto shiftTy = shapeVal.getType().dyn_cast<fir::ShiftType>()) {
+      if (auto shiftTy = mlir::dyn_cast<fir::ShiftType>(shapeVal.getType())) {
         if (shiftTy.getRank() != inputRank)
           return emitOpError("shape operand and input box ranks must match "
                              "when there is a slice");
@@ -2370,12 +2373,12 @@ mlir::LogicalResult fir::ReboxOp::verify() {
     unsigned shapeRank = inputRank;
     if (auto shapeVal = getShape()) {
       auto ty = shapeVal.getType();
-      if (auto shapeTy = ty.dyn_cast<fir::ShapeType>()) {
+      if (auto shapeTy = mlir::dyn_cast<fir::ShapeType>(ty)) {
         shapeRank = shapeTy.getRank();
-      } else if (auto shapeShiftTy = ty.dyn_cast<fir::ShapeShiftType>()) {
+      } else if (auto shapeShiftTy = mlir::dyn_cast<fir::ShapeShiftType>(ty)) {
         shapeRank = shapeShiftTy.getRank();
       } else {
-        auto shiftTy = ty.cast<fir::ShiftType>();
+        auto shiftTy = mlir::cast<fir::ShiftType>(ty);
         shapeRank = shiftTy.getRank();
         if (shapeRank != inputRank)
           return emitOpError("shape operand and input box ranks must match "
@@ -2394,11 +2397,13 @@ mlir::LogicalResult fir::ReboxOp::verify() {
     // the types is a character with dynamic length, the other type can be any
     // character type.
     const bool typeCanMismatch =
-        inputEleTy.isa<fir::RecordType>() || outEleTy.isa<mlir::NoneType>() ||
-        (inputEleTy.isa<mlir::NoneType>() && outEleTy.isa<fir::RecordType>()) ||
-        (getSlice() && inputEleTy.isa<fir::CharacterType>()) ||
+        mlir::isa<fir::RecordType>(inputEleTy) ||
+        mlir::isa<mlir::NoneType>(outEleTy) ||
+        (mlir::isa<mlir::NoneType>(inputEleTy) &&
+         mlir::isa<fir::RecordType>(outEleTy)) ||
+        (getSlice() && mlir::isa<fir::CharacterType>(inputEleTy)) ||
         (getSlice() && fir::isa_complex(inputEleTy) &&
-         outEleTy.isa<mlir::FloatType>()) ||
+         mlir::isa<mlir::FloatType>(outEleTy)) ||
         areCompatibleCharacterTypes(inputEleTy, outEleTy);
     if (!typeCanMismatch)
       return emitOpError(
@@ -2435,7 +2440,7 @@ mlir::LogicalResult fir::SaveResultOp::verify() {
   if (fir::isa_unknown_size_box(resultType))
     return emitOpError("cannot save !fir.box of unknown rank or type");
 
-  if (resultType.isa<fir::BoxType>()) {
+  if (mlir::isa<fir::BoxType>(resultType)) {
     if (getShape() || !getTypeparams().empty())
       return emitOpError(
           "must not have shape or length operands if the value is a fir.box");
@@ -2446,14 +2451,14 @@ mlir::LogicalResult fir::SaveResultOp::verify() {
   unsigned shapeTyRank = 0;
   if (auto shapeVal = getShape()) {
     auto shapeTy = shapeVal.getType();
-    if (auto s = shapeTy.dyn_cast<fir::ShapeType>())
+    if (auto s = mlir::dyn_cast<fir::ShapeType>(shapeTy))
       shapeTyRank = s.getRank();
     else
-      shapeTyRank = shapeTy.cast<fir::ShapeShiftType>().getRank();
+      shapeTyRank = mlir::cast<fir::ShapeShiftType>(shapeTy).getRank();
   }
 
   auto eleTy = resultType;
-  if (auto seqTy = resultType.dyn_cast<fir::SequenceType>()) {
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(resultType)) {
     if (seqTy.getDimension() != shapeTyRank)
       emitOpError("shape operand must be provided and have the value rank "
                   "when the value is a fir.array");
@@ -2464,11 +2469,11 @@ mlir::LogicalResult fir::SaveResultOp::verify() {
           "shape operand should only be provided if the value is a fir.array");
   }
 
-  if (auto recTy = eleTy.dyn_cast<fir::RecordType>()) {
+  if (auto recTy = mlir::dyn_cast<fir::RecordType>(eleTy)) {
     if (recTy.getNumLenParams() != getTypeparams().size())
       emitOpError("length parameters number must match with the value type "
                   "length parameters");
-  } else if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+  } else if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
     if (getTypeparams().size() > 1)
       emitOpError("no more than one length parameter must be provided for "
                   "character value");
@@ -2493,10 +2498,8 @@ static constexpr llvm::StringRef getTargetOffsetAttr() {
 
 template <typename OpT>
 static mlir::LogicalResult verifyIntegralSwitchTerminator(OpT op) {
-  if (!op.getSelector()
-           .getType()
-           .template isa<mlir::IntegerType, mlir::IndexType,
-                         fir::IntegerType>())
+  if (!mlir::isa<mlir::IntegerType, mlir::IndexType, fir::IntegerType>(
+          op.getSelector().getType()))
     return op.emitOpError("must be an integer");
   auto cases =
       op->template getAttrOfType<mlir::ArrayAttr>(op.getCasesAttr()).getValue();
@@ -2508,7 +2511,7 @@ static mlir::LogicalResult verifyIntegralSwitchTerminator(OpT op) {
   if (op.targetOffsetSize() != count)
     return op.emitOpError("incorrect number of successor operand groups");
   for (decltype(count) i = 0; i != count; ++i) {
-    if (!cases[i].template isa<mlir::IntegerAttr, mlir::UnitAttr>())
+    if (!mlir::isa<mlir::IntegerAttr, mlir::UnitAttr>(cases[i]))
       return op.emitOpError("invalid case alternative");
   }
   return mlir::success();
@@ -2571,7 +2574,7 @@ static void printIntegralSwitchTerminator(OpT op, mlir::OpAsmPrinter &p) {
     if (i)
       p << ", ";
     auto &attr = cases[i];
-    if (auto intAttr = attr.template dyn_cast_or_null<mlir::IntegerAttr>())
+    if (auto intAttr = mlir::dyn_cast_or_null<mlir::IntegerAttr>(attr))
       p << intAttr.getValue();
     else
       p.printAttribute(attr);
@@ -2620,7 +2623,7 @@ getMutableSuccessorOperands(unsigned pos, mlir::MutableOperandRange operands,
       *owner->getAttrDictionary().getNamed(offsetAttr);
   return getSubOperands(
       pos, operands,
-      targetOffsetAttr.getValue().cast<mlir::DenseI32ArrayAttr>(),
+      mlir::cast<mlir::DenseI32ArrayAttr>(targetOffsetAttr.getValue()),
       mlir::MutableOperandRange::OperandSegment(pos, targetOffsetAttr));
 }
 
@@ -2742,9 +2745,9 @@ mlir::ParseResult fir::SelectCaseOp::parse(mlir::OpAsmParser &parser,
         parser.parseComma())
       return mlir::failure();
     attrs.push_back(attr);
-    if (attr.dyn_cast_or_null<mlir::UnitAttr>()) {
+    if (mlir::dyn_cast_or_null<mlir::UnitAttr>(attr)) {
       argOffs.push_back(0);
-    } else if (attr.dyn_cast_or_null<fir::ClosedIntervalAttr>()) {
+    } else if (mlir::dyn_cast_or_null<fir::ClosedIntervalAttr>(attr)) {
       mlir::OpAsmParser::UnresolvedOperand oper1;
       mlir::OpAsmParser::UnresolvedOperand oper2;
       if (parser.parseOperand(oper1) || parser.parseComma() ||
@@ -2806,11 +2809,11 @@ void fir::SelectCaseOp::print(mlir::OpAsmPrinter &p) {
     if (i)
       p << ", ";
     p << cases[i] << ", ";
-    if (!cases[i].isa<mlir::UnitAttr>()) {
+    if (!mlir::isa<mlir::UnitAttr>(cases[i])) {
       auto caseArgs = *getCompareOperands(i);
       p.printOperand(*caseArgs.begin());
       p << ", ";
-      if (cases[i].isa<fir::ClosedIntervalAttr>()) {
+      if (mlir::isa<fir::ClosedIntervalAttr>(cases[i])) {
         p.printOperand(*(++caseArgs.begin()));
         p << ", ";
       }
@@ -2848,10 +2851,10 @@ void fir::SelectCaseOp::build(mlir::OpBuilder &builder,
   llvm::SmallVector<int32_t> operOffs;
   int32_t operSize = 0;
   for (auto attr : compareAttrs) {
-    if (attr.isa<fir::ClosedIntervalAttr>()) {
+    if (mlir::isa<fir::ClosedIntervalAttr>(attr)) {
       operOffs.push_back(2);
       operSize += 2;
-    } else if (attr.isa<mlir::UnitAttr>()) {
+    } else if (mlir::isa<mlir::UnitAttr>(attr)) {
       operOffs.push_back(0);
     } else {
       operOffs.push_back(1);
@@ -2900,10 +2903,10 @@ void fir::SelectCaseOp::build(mlir::OpBuilder &builder,
   llvm::SmallVector<mlir::ValueRange> cmpOpers;
   auto iter = cmpOpList.begin();
   for (auto &attr : compareAttrs) {
-    if (attr.isa<fir::ClosedIntervalAttr>()) {
+    if (mlir::isa<fir::ClosedIntervalAttr>(attr)) {
       cmpOpers.push_back(mlir::ValueRange({iter, iter + 2}));
       iter += 2;
-    } else if (attr.isa<mlir::UnitAttr>()) {
+    } else if (mlir::isa<mlir::UnitAttr>(attr)) {
       cmpOpers.push_back(mlir::ValueRange{});
     } else {
       cmpOpers.push_back(mlir::ValueRange({iter, iter + 1}));
@@ -2915,10 +2918,8 @@ void fir::SelectCaseOp::build(mlir::OpBuilder &builder,
 }
 
 mlir::LogicalResult fir::SelectCaseOp::verify() {
-  if (!getSelector()
-           .getType()
-           .isa<mlir::IntegerType, mlir::IndexType, fir::IntegerType,
-                fir::LogicalType, fir::CharacterType>())
+  if (!mlir::isa<mlir::IntegerType, mlir::IndexType, fir::IntegerType,
+                 fir::LogicalType, fir::CharacterType>(getSelector().getType()))
     return emitOpError("must be an integer, character, or logical");
   auto cases =
       getOperation()->getAttrOfType<mlir::ArrayAttr>(getCasesAttr()).getValue();
@@ -2933,9 +2934,11 @@ mlir::LogicalResult fir::SelectCaseOp::verify() {
     return emitOpError("incorrect number of successor operand groups");
   for (decltype(count) i = 0; i != count; ++i) {
     auto &attr = cases[i];
-    if (!(attr.isa<fir::PointIntervalAttr>() ||
-          attr.isa<fir::LowerBoundAttr>() || attr.isa<fir::UpperBoundAttr>() ||
-          attr.isa<fir::ClosedIntervalAttr>() || attr.isa<mlir::UnitAttr>()))
+    if (!(mlir::isa<fir::PointIntervalAttr>(attr) ||
+          mlir::isa<fir::LowerBoundAttr>(attr) ||
+          mlir::isa<fir::UpperBoundAttr>(attr) ||
+          mlir::isa<fir::ClosedIntervalAttr>(attr) ||
+          mlir::isa<mlir::UnitAttr>(attr)))
       return emitOpError("incorrect select case attribute type");
   }
   return mlir::success();
@@ -3111,14 +3114,14 @@ void fir::SelectTypeOp::print(mlir::OpAsmPrinter &p) {
 }
 
 mlir::LogicalResult fir::SelectTypeOp::verify() {
-  if (!(getSelector().getType().isa<fir::BaseBoxType>()))
+  if (!mlir::isa<fir::BaseBoxType>(getSelector().getType()))
     return emitOpError("must be a fir.class or fir.box type");
-  if (auto boxType = getSelector().getType().dyn_cast<fir::BoxType>())
-    if (!boxType.getEleTy().isa<mlir::NoneType>())
+  if (auto boxType = mlir::dyn_cast<fir::BoxType>(getSelector().getType()))
+    if (!mlir::isa<mlir::NoneType>(boxType.getEleTy()))
       return emitOpError("selector must be polymorphic");
   auto typeGuardAttr = getCases();
   for (unsigned idx = 0; idx < typeGuardAttr.size(); ++idx)
-    if (typeGuardAttr[idx].isa<mlir::UnitAttr>() &&
+    if (mlir::isa<mlir::UnitAttr>(typeGuardAttr[idx]) &&
         idx != typeGuardAttr.size() - 1)
       return emitOpError("default must be the last attribute");
   auto count = getNumDest();
@@ -3129,9 +3132,8 @@ mlir::LogicalResult fir::SelectTypeOp::verify() {
   if (targetOffsetSize() != count)
     return emitOpError("incorrect number of successor operand groups");
   for (unsigned i = 0; i != count; ++i) {
-    if (!(typeGuardAttr[i].isa<fir::ExactTypeAttr>() ||
-          typeGuardAttr[i].isa<fir::SubclassAttr>() ||
-          typeGuardAttr[i].isa<mlir::UnitAttr>()))
+    if (!mlir::isa<fir::ExactTypeAttr, fir::SubclassAttr, mlir::UnitAttr>(
+            typeGuardAttr[i]))
       return emitOpError("invalid type-case alternative");
   }
   return mlir::success();
@@ -3175,7 +3177,7 @@ void fir::SelectTypeOp::build(mlir::OpBuilder &builder,
 
 mlir::LogicalResult fir::ShapeOp::verify() {
   auto size = getExtents().size();
-  auto shapeTy = getType().dyn_cast<fir::ShapeType>();
+  auto shapeTy = mlir::dyn_cast<fir::ShapeType>(getType());
   assert(shapeTy && "must be a shape type");
   if (shapeTy.getRank() != size)
     return emitOpError("shape type rank mismatch");
@@ -3198,7 +3200,7 @@ mlir::LogicalResult fir::ShapeShiftOp::verify() {
     return emitOpError("incorrect number of args");
   if (size % 2 != 0)
     return emitOpError("requires a multiple of 2 args");
-  auto shapeTy = getType().dyn_cast<fir::ShapeShiftType>();
+  auto shapeTy = mlir::dyn_cast<fir::ShapeShiftType>(getType());
   assert(shapeTy && "must be a shape shift type");
   if (shapeTy.getRank() * 2 != size)
     return emitOpError("shape type rank mismatch");
@@ -3211,7 +3213,7 @@ mlir::LogicalResult fir::ShapeShiftOp::verify() {
 
 mlir::LogicalResult fir::ShiftOp::verify() {
   auto size = getOrigins().size();
-  auto shiftTy = getType().dyn_cast<fir::ShiftType>();
+  auto shiftTy = mlir::dyn_cast<fir::ShiftType>(getType());
   assert(shiftTy && "must be a shift type");
   if (shiftTy.getRank() != size)
     return emitOpError("shift type rank mismatch");
@@ -3251,7 +3253,7 @@ mlir::LogicalResult fir::SliceOp::verify() {
     return emitOpError("incorrect number of args for triple");
   if (size % 3 != 0)
     return emitOpError("requires a multiple of 3 args");
-  auto sliceTy = getType().dyn_cast<fir::SliceType>();
+  auto sliceTy = mlir::dyn_cast<fir::SliceType>(getType());
   assert(sliceTy && "must be a slice type");
   if (sliceTy.getRank() * 3 != size)
     return emitOpError("slice type rank mismatch");
@@ -3309,8 +3311,8 @@ void fir::StoreOp::build(mlir::OpBuilder &builder, mlir::OperationState &result,
 //===----------------------------------------------------------------------===//
 
 inline fir::CharacterType::KindTy stringLitOpGetKind(fir::StringLitOp op) {
-  auto eleTy = op.getType().cast<fir::SequenceType>().getEleTy();
-  return eleTy.cast<fir::CharacterType>().getFKind();
+  auto eleTy = mlir::cast<fir::SequenceType>(op.getType()).getEleTy();
+  return mlir::cast<fir::CharacterType>(eleTy).getFKind();
 }
 
 bool fir::StringLitOp::isWideValue() { return stringLitOpGetKind(*this) != 1; }
@@ -3390,13 +3392,13 @@ mlir::ParseResult fir::StringLitOp::parse(mlir::OpAsmParser &parser,
   llvm::SMLoc trailingTypeLoc;
   if (parser.parseAttribute(val, "fake", attrs))
     return mlir::failure();
-  if (auto v = val.dyn_cast<mlir::StringAttr>())
+  if (auto v = mlir::dyn_cast<mlir::StringAttr>(val))
     result.attributes.push_back(
         builder.getNamedAttr(fir::StringLitOp::value(), v));
-  else if (auto v = val.dyn_cast<mlir::DenseElementsAttr>())
+  else if (auto v = mlir::dyn_cast<mlir::DenseElementsAttr>(val))
     result.attributes.push_back(
         builder.getNamedAttr(fir::StringLitOp::xlist(), v));
-  else if (auto v = val.dyn_cast<mlir::ArrayAttr>())
+  else if (auto v = mlir::dyn_cast<mlir::ArrayAttr>(val))
     result.attributes.push_back(
         builder.getNamedAttr(fir::StringLitOp::xlist(), v));
   else
@@ -3409,7 +3411,7 @@ mlir::ParseResult fir::StringLitOp::parse(mlir::OpAsmParser &parser,
       parser.parseRParen() || parser.getCurrentLocation(&trailingTypeLoc) ||
       parser.parseColonType(type))
     return mlir::failure();
-  auto charTy = type.dyn_cast<fir::CharacterType>();
+  auto charTy = mlir::dyn_cast<fir::CharacterType>(type);
   if (!charTy)
     return parser.emitError(trailingTypeLoc, "must have character type");
   type = fir::CharacterType::get(builder.getContext(), charTy.getFKind(),
@@ -3421,19 +3423,19 @@ mlir::ParseResult fir::StringLitOp::parse(mlir::OpAsmParser &parser,
 
 void fir::StringLitOp::print(mlir::OpAsmPrinter &p) {
   p << ' ' << getValue() << '(';
-  p << getSize().cast<mlir::IntegerAttr>().getValue() << ") : ";
+  p << mlir::cast<mlir::IntegerAttr>(getSize()).getValue() << ") : ";
   p.printType(getType());
 }
 
 mlir::LogicalResult fir::StringLitOp::verify() {
-  if (getSize().cast<mlir::IntegerAttr>().getValue().isNegative())
+  if (mlir::cast<mlir::IntegerAttr>(getSize()).getValue().isNegative())
     return emitOpError("size must be non-negative");
   if (auto xl = getOperation()->getAttr(fir::StringLitOp::xlist())) {
-    if (auto xList = xl.dyn_cast<mlir::ArrayAttr>()) {
+    if (auto xList = mlir::dyn_cast<mlir::ArrayAttr>(xl)) {
       for (auto a : xList)
-        if (!a.isa<mlir::IntegerAttr>())
+        if (!mlir::isa<mlir::IntegerAttr>(a))
           return emitOpError("values in initializer must be integers");
-    } else if (xl.isa<mlir::DenseElementsAttr>()) {
+    } else if (mlir::isa<mlir::DenseElementsAttr>(xl)) {
       // do nothing
     } else {
       return emitOpError("has unexpected attribute");
@@ -3448,7 +3450,7 @@ mlir::LogicalResult fir::StringLitOp::verify() {
 
 mlir::LogicalResult fir::UnboxProcOp::verify() {
   if (auto eleTy = fir::dyn_cast_ptrEleTy(getRefTuple().getType()))
-    if (eleTy.isa<mlir::TupleType>())
+    if (mlir::isa<mlir::TupleType>(eleTy))
       return mlir::success();
   return emitOpError("second output argument has bad type");
 }
@@ -3527,7 +3529,7 @@ void fir::IfOp::getEntrySuccessorRegions(
 void fir::IfOp::getRegionInvocationBounds(
     llvm::ArrayRef<mlir::Attribute> operands,
     llvm::SmallVectorImpl<mlir::InvocationBounds> &invocationBounds) {
-  if (auto cond = operands[0].dyn_cast_or_null<mlir::BoolAttr>()) {
+  if (auto cond = mlir::dyn_cast_or_null<mlir::BoolAttr>(operands[0])) {
     // If the condition is known, then one region is known to be executed once
     // and the other zero times.
     invocationBounds.emplace_back(0, cond.getValue() ? 1 : 0);
@@ -3646,8 +3648,8 @@ void fir::BoxOffsetOp::build(mlir::OpBuilder &builder,
 //===----------------------------------------------------------------------===//
 
 mlir::ParseResult fir::isValidCaseAttr(mlir::Attribute attr) {
-  if (attr.isa<mlir::UnitAttr, fir::ClosedIntervalAttr, fir::PointIntervalAttr,
-               fir::LowerBoundAttr, fir::UpperBoundAttr>())
+  if (mlir::isa<mlir::UnitAttr, fir::ClosedIntervalAttr, fir::PointIntervalAttr,
+                fir::LowerBoundAttr, fir::UpperBoundAttr>(attr))
     return mlir::success();
   return mlir::failure();
 }
@@ -3657,9 +3659,9 @@ unsigned fir::getCaseArgumentOffset(llvm::ArrayRef<mlir::Attribute> cases,
   unsigned o = 0;
   for (unsigned i = 0; i < dest; ++i) {
     auto &attr = cases[i];
-    if (!attr.dyn_cast_or_null<mlir::UnitAttr>()) {
+    if (!mlir::dyn_cast_or_null<mlir::UnitAttr>(attr)) {
       ++o;
-      if (attr.dyn_cast_or_null<fir::ClosedIntervalAttr>())
+      if (mlir::dyn_cast_or_null<fir::ClosedIntervalAttr>(attr))
         ++o;
     }
   }
@@ -3722,7 +3724,7 @@ fir::GlobalOp fir::createGlobalOp(mlir::Location loc, mlir::ModuleOp module,
 bool fir::hasHostAssociationArgument(mlir::func::FuncOp func) {
   if (auto allArgAttrs = func.getAllArgAttrs())
     for (auto attr : allArgAttrs)
-      if (auto dict = attr.template dyn_cast_or_null<mlir::DictionaryAttr>())
+      if (auto dict = mlir::dyn_cast_or_null<mlir::DictionaryAttr>(attr))
         if (dict.get(fir::getHostAssocAttrName()))
           return true;
   return false;
@@ -3772,7 +3774,7 @@ valueCheckFirAttributes(mlir::Value value,
   };
   // If this is a fir.box that was loaded, the fir attributes will be on the
   // related fir.ref<fir.box> creation.
-  if (value.getType().isa<fir::BoxType>())
+  if (mlir::isa<fir::BoxType>(value.getType()))
     if (auto definingOp = value.getDefiningOp())
       if (auto loadOp = mlir::dyn_cast<fir::LoadOp>(definingOp))
         value = loadOp.getMemref();
@@ -3837,10 +3839,10 @@ bool fir::anyFuncArgsHaveAttr(mlir::func::FuncOp func, llvm::StringRef attr) {
 std::optional<std::int64_t> fir::getIntIfConstant(mlir::Value value) {
   if (auto *definingOp = value.getDefiningOp()) {
     if (auto cst = mlir::dyn_cast<mlir::arith::ConstantOp>(definingOp))
-      if (auto intAttr = cst.getValue().dyn_cast<mlir::IntegerAttr>())
+      if (auto intAttr = mlir::dyn_cast<mlir::IntegerAttr>(cst.getValue()))
         return intAttr.getInt();
     if (auto llConstOp = mlir::dyn_cast<mlir::LLVM::ConstantOp>(definingOp))
-      if (auto attr = llConstOp.getValue().dyn_cast<mlir::IntegerAttr>())
+      if (auto attr = mlir::dyn_cast<mlir::IntegerAttr>(llConstOp.getValue()))
         return attr.getValue().getSExtValue();
   }
   return {};
@@ -4002,15 +4004,15 @@ mlir::LogicalResult fir::CUDAKernelOp::verify() {
 mlir::LogicalResult fir::CUDAAllocateOp::verify() {
   if (getPinned() && getStream())
     return emitOpError("pinned and stream cannot appears at the same time");
-  if (!fir::unwrapRefType(getBox().getType()).isa<fir::BaseBoxType>())
+  if (!mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(getBox().getType())))
     return emitOpError(
         "expect box to be a reference to a class or box type value");
   if (getSource() &&
-      !fir::unwrapRefType(getSource().getType()).isa<fir::BaseBoxType>())
+      !mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(getSource().getType())))
     return emitOpError(
         "expect source to be a reference to/or a class or box type value");
   if (getErrmsg() &&
-      !fir::unwrapRefType(getErrmsg().getType()).isa<fir::BoxType>())
+      !mlir::isa<fir::BoxType>(fir::unwrapRefType(getErrmsg().getType())))
     return emitOpError(
         "expect errmsg to be a reference to/or a box type value");
   if (getErrmsg() && !getHasStat())
@@ -4019,11 +4021,11 @@ mlir::LogicalResult fir::CUDAAllocateOp::verify() {
 }
 
 mlir::LogicalResult fir::CUDADeallocateOp::verify() {
-  if (!fir::unwrapRefType(getBox().getType()).isa<fir::BaseBoxType>())
+  if (!mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(getBox().getType())))
     return emitOpError(
         "expect box to be a reference to class or box type value");
   if (getErrmsg() &&
-      !fir::unwrapRefType(getErrmsg().getType()).isa<fir::BoxType>())
+      !mlir::isa<fir::BoxType>(fir::unwrapRefType(getErrmsg().getType())))
     return emitOpError(
         "expect errmsg to be a reference to/or a box type value");
   if (getErrmsg() && !getHasStat())
diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp
index 5c4cad6d2083..daa3ac905dad 100644
--- a/flang/lib/Optimizer/Dialect/FIRType.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRType.cpp
@@ -61,14 +61,13 @@ TYPE parseTypeSingleton(mlir::AsmParser &parser) {
 /// Is `ty` a standard or FIR integer type?
 static bool isaIntegerType(mlir::Type ty) {
   // TODO: why aren't we using isa_integer? investigatation required.
-  return ty.isa<mlir::IntegerType>() || ty.isa<fir::IntegerType>();
+  return mlir::isa<mlir::IntegerType, fir::IntegerType>(ty);
 }
 
 bool verifyRecordMemberType(mlir::Type ty) {
-  return !(ty.isa<BoxCharType>() || ty.isa<ShapeType>() ||
-           ty.isa<ShapeShiftType>() || ty.isa<ShiftType>() ||
-           ty.isa<SliceType>() || ty.isa<FieldType>() || ty.isa<LenType>() ||
-           ty.isa<ReferenceType>() || ty.isa<TypeDescType>());
+  return !mlir::isa<BoxCharType, ShapeType, ShapeShiftType, ShiftType,
+                    SliceType, FieldType, LenType, ReferenceType, TypeDescType>(
+      ty);
 }
 
 bool verifySameLists(llvm::ArrayRef<RecordType::TypePair> a1,
@@ -194,7 +193,7 @@ bool isa_std_type(mlir::Type t) {
 }
 
 bool isa_fir_or_std_type(mlir::Type t) {
-  if (auto funcType = t.dyn_cast<mlir::FunctionType>())
+  if (auto funcType = mlir::dyn_cast<mlir::FunctionType>(t))
     return llvm::all_of(funcType.getInputs(), isa_fir_or_std_type) &&
            llvm::all_of(funcType.getResults(), isa_fir_or_std_type);
   return isa_fir_type(t) || isa_std_type(t);
@@ -203,7 +202,7 @@ bool isa_fir_or_std_type(mlir::Type t) {
 mlir::Type getDerivedType(mlir::Type ty) {
   return llvm::TypeSwitch<mlir::Type, mlir::Type>(ty)
       .Case<fir::PointerType, fir::HeapType, fir::SequenceType>([](auto p) {
-        if (auto seq = p.getEleTy().template dyn_cast<fir::SequenceType>())
+        if (auto seq = mlir::dyn_cast<fir::SequenceType>(p.getEleTy()))
           return seq.getEleTy();
         return p.getEleTy();
       })
@@ -228,12 +227,12 @@ mlir::Type dyn_cast_ptrOrBoxEleTy(mlir::Type t) {
 
 static bool hasDynamicSize(fir::RecordType recTy) {
   for (auto field : recTy.getTypeList()) {
-    if (auto arr = field.second.dyn_cast<fir::SequenceType>()) {
+    if (auto arr = mlir::dyn_cast<fir::SequenceType>(field.second)) {
       if (sequenceWithNonConstantShape(arr))
         return true;
     } else if (characterWithDynamicLen(field.second)) {
       return true;
-    } else if (auto rec = field.second.dyn_cast<fir::RecordType>()) {
+    } else if (auto rec = mlir::dyn_cast<fir::RecordType>(field.second)) {
       if (hasDynamicSize(rec))
         return true;
     }
@@ -242,14 +241,14 @@ static bool hasDynamicSize(fir::RecordType recTy) {
 }
 
 bool hasDynamicSize(mlir::Type t) {
-  if (auto arr = t.dyn_cast<fir::SequenceType>()) {
+  if (auto arr = mlir::dyn_cast<fir::SequenceType>(t)) {
     if (sequenceWithNonConstantShape(arr))
       return true;
     t = arr.getEleTy();
   }
   if (characterWithDynamicLen(t))
     return true;
-  if (auto rec = t.dyn_cast<fir::RecordType>())
+  if (auto rec = mlir::dyn_cast<fir::RecordType>(t))
     return hasDynamicSize(rec);
   return false;
 }
@@ -269,33 +268,33 @@ mlir::Type extractSequenceType(mlir::Type ty) {
 bool isPointerType(mlir::Type ty) {
   if (auto refTy = fir::dyn_cast_ptrEleTy(ty))
     ty = refTy;
-  if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>())
-    return boxTy.getEleTy().isa<fir::PointerType>();
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty))
+    return mlir::isa<fir::PointerType>(boxTy.getEleTy());
   return false;
 }
 
 bool isAllocatableType(mlir::Type ty) {
   if (auto refTy = fir::dyn_cast_ptrEleTy(ty))
     ty = refTy;
-  if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>())
-    return boxTy.getEleTy().isa<fir::HeapType>();
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty))
+    return mlir::isa<fir::HeapType>(boxTy.getEleTy());
   return false;
 }
 
 bool isBoxNone(mlir::Type ty) {
-  if (auto box = ty.dyn_cast<fir::BoxType>())
-    return box.getEleTy().isa<mlir::NoneType>();
+  if (auto box = mlir::dyn_cast<fir::BoxType>(ty))
+    return mlir::isa<mlir::NoneType>(box.getEleTy());
   return false;
 }
 
 bool isBoxedRecordType(mlir::Type ty) {
   if (auto refTy = fir::dyn_cast_ptrEleTy(ty))
     ty = refTy;
-  if (auto boxTy = ty.dyn_cast<fir::BoxType>()) {
-    if (boxTy.getEleTy().isa<fir::RecordType>())
+  if (auto boxTy = mlir::dyn_cast<fir::BoxType>(ty)) {
+    if (mlir::isa<fir::RecordType>(boxTy.getEleTy()))
       return true;
     mlir::Type innerType = boxTy.unwrapInnerType();
-    return innerType && innerType.isa<fir::RecordType>();
+    return innerType && mlir::isa<fir::RecordType>(innerType);
   }
   return false;
 }
@@ -303,13 +302,13 @@ bool isBoxedRecordType(mlir::Type ty) {
 bool isScalarBoxedRecordType(mlir::Type ty) {
   if (auto refTy = fir::dyn_cast_ptrEleTy(ty))
     ty = refTy;
-  if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>()) {
-    if (boxTy.getEleTy().isa<fir::RecordType>())
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
+    if (mlir::isa<fir::RecordType>(boxTy.getEleTy()))
       return true;
-    if (auto heapTy = boxTy.getEleTy().dyn_cast<fir::HeapType>())
-      return heapTy.getEleTy().isa<fir::RecordType>();
-    if (auto ptrTy = boxTy.getEleTy().dyn_cast<fir::PointerType>())
-      return ptrTy.getEleTy().isa<fir::RecordType>();
+    if (auto heapTy = mlir::dyn_cast<fir::HeapType>(boxTy.getEleTy()))
+      return mlir::isa<fir::RecordType>(heapTy.getEleTy());
+    if (auto ptrTy = mlir::dyn_cast<fir::PointerType>(boxTy.getEleTy()))
+      return mlir::isa<fir::RecordType>(ptrTy.getEleTy());
   }
   return false;
 }
@@ -363,10 +362,10 @@ bool isPolymorphicType(mlir::Type ty) {
 bool isUnlimitedPolymorphicType(mlir::Type ty) {
   // CLASS(*)
   if (auto clTy = mlir::dyn_cast<fir::ClassType>(fir::unwrapRefType(ty))) {
-    if (clTy.getEleTy().isa<mlir::NoneType>())
+    if (mlir::isa<mlir::NoneType>(clTy.getEleTy()))
       return true;
     mlir::Type innerType = clTy.unwrapInnerType();
-    return innerType && innerType.isa<mlir::NoneType>();
+    return innerType && mlir::isa<mlir::NoneType>(innerType);
   }
   // TYPE(*)
   return isAssumedType(ty);
@@ -376,7 +375,7 @@ mlir::Type unwrapInnerType(mlir::Type ty) {
   return llvm::TypeSwitch<mlir::Type, mlir::Type>(ty)
       .Case<fir::PointerType, fir::HeapType, fir::SequenceType>([](auto t) {
         mlir::Type eleTy = t.getEleTy();
-        if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>())
+        if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy))
           return seqTy.getEleTy();
         return eleTy;
       })
@@ -385,13 +384,14 @@ mlir::Type unwrapInnerType(mlir::Type ty) {
 }
 
 bool isRecordWithAllocatableMember(mlir::Type ty) {
-  if (auto recTy = ty.dyn_cast<fir::RecordType>())
+  if (auto recTy = mlir::dyn_cast<fir::RecordType>(ty))
     for (auto [field, memTy] : recTy.getTypeList()) {
       if (fir::isAllocatableType(memTy))
         return true;
       // A record type cannot recursively include itself as a direct member.
       // There must be an intervening `ptr` type, so recursion is safe here.
-      if (memTy.isa<fir::RecordType>() && isRecordWithAllocatableMember(memTy))
+      if (mlir::isa<fir::RecordType>(memTy) &&
+          isRecordWithAllocatableMember(memTy))
         return true;
     }
   return false;
@@ -399,11 +399,12 @@ bool isRecordWithAllocatableMember(mlir::Type ty) {
 
 bool isRecordWithDescriptorMember(mlir::Type ty) {
   ty = unwrapSequenceType(ty);
-  if (auto recTy = ty.dyn_cast<fir::RecordType>())
+  if (auto recTy = mlir::dyn_cast<fir::RecordType>(ty))
     for (auto [field, memTy] : recTy.getTypeList()) {
       if (mlir::isa<fir::BaseBoxType>(memTy))
         return true;
-      if (memTy.isa<fir::RecordType>() && isRecordWithDescriptorMember(memTy))
+      if (mlir::isa<fir::RecordType>(memTy) &&
+          isRecordWithDescriptorMember(memTy))
         return true;
     }
   return false;
@@ -412,7 +413,7 @@ bool isRecordWithDescriptorMember(mlir::Type ty) {
 mlir::Type unwrapAllRefAndSeqType(mlir::Type ty) {
   while (true) {
     mlir::Type nt = unwrapSequenceType(unwrapRefType(ty));
-    if (auto vecTy = nt.dyn_cast<fir::VectorType>())
+    if (auto vecTy = mlir::dyn_cast<fir::VectorType>(nt))
       nt = vecTy.getEleTy();
     if (nt == ty)
       return ty;
@@ -421,11 +422,11 @@ mlir::Type unwrapAllRefAndSeqType(mlir::Type ty) {
 }
 
 mlir::Type unwrapSeqOrBoxedSeqType(mlir::Type ty) {
-  if (auto seqTy = ty.dyn_cast<fir::SequenceType>())
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
     return seqTy.getEleTy();
-  if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>()) {
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
     auto eleTy = unwrapRefType(boxTy.getEleTy());
-    if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>())
+    if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy))
       return seqTy.getEleTy();
   }
   return ty;
@@ -433,7 +434,7 @@ mlir::Type unwrapSeqOrBoxedSeqType(mlir::Type ty) {
 
 unsigned getBoxRank(mlir::Type boxTy) {
   auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(boxTy);
-  if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>())
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy))
     return seqTy.getDimension();
   return 0;
 }
@@ -441,7 +442,7 @@ unsigned getBoxRank(mlir::Type boxTy) {
 /// Return the ISO_C_BINDING intrinsic module value of type \p ty.
 int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) {
   unsigned width = 0;
-  if (mlir::IntegerType intTy = ty.dyn_cast<mlir::IntegerType>()) {
+  if (mlir::IntegerType intTy = mlir::dyn_cast<mlir::IntegerType>(ty)) {
     switch (intTy.getWidth()) {
     case 8:
       return CFI_type_int8_t;
@@ -456,7 +457,7 @@ int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) {
     }
     llvm_unreachable("unsupported integer type");
   }
-  if (fir::LogicalType logicalTy = ty.dyn_cast<fir::LogicalType>()) {
+  if (fir::LogicalType logicalTy = mlir::dyn_cast<fir::LogicalType>(ty)) {
     switch (kindMap.getLogicalBitsize(logicalTy.getFKind())) {
     case 8:
       return CFI_type_Bool;
@@ -469,7 +470,7 @@ int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) {
     }
     llvm_unreachable("unsupported logical type");
   }
-  if (mlir::FloatType floatTy = ty.dyn_cast<mlir::FloatType>()) {
+  if (mlir::FloatType floatTy = mlir::dyn_cast<mlir::FloatType>(ty)) {
     switch (floatTy.getWidth()) {
     case 16:
       return floatTy.isBF16() ? CFI_type_bfloat : CFI_type_half_float;
@@ -485,13 +486,14 @@ int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) {
     llvm_unreachable("unsupported real type");
   }
   if (fir::isa_complex(ty)) {
-    if (mlir::ComplexType complexTy = ty.dyn_cast<mlir::ComplexType>()) {
+    if (mlir::ComplexType complexTy = mlir::dyn_cast<mlir::ComplexType>(ty)) {
       mlir::FloatType floatTy =
-          complexTy.getElementType().cast<mlir::FloatType>();
+          mlir::cast<mlir::FloatType>(complexTy.getElementType());
       if (floatTy.isBF16())
         return CFI_type_bfloat_Complex;
       width = floatTy.getWidth();
-    } else if (fir::ComplexType complexTy = ty.dyn_cast<fir::ComplexType>()) {
+    } else if (fir::ComplexType complexTy =
+                   mlir::dyn_cast<fir::ComplexType>(ty)) {
       auto FKind = complexTy.getFKind();
       if (FKind == 3)
         return CFI_type_bfloat_Complex;
@@ -511,7 +513,7 @@ int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) {
     }
     llvm_unreachable("unsupported complex size");
   }
-  if (fir::CharacterType charTy = ty.dyn_cast<fir::CharacterType>()) {
+  if (fir::CharacterType charTy = mlir::dyn_cast<fir::CharacterType>(ty)) {
     switch (kindMap.getCharacterBitsize(charTy.getFKind())) {
     case 8:
       return CFI_type_char;
@@ -524,7 +526,7 @@ int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) {
   }
   if (fir::isa_ref_type(ty))
     return CFI_type_cptr;
-  if (ty.isa<fir::RecordType>())
+  if (mlir::isa<fir::RecordType>(ty))
     return CFI_type_struct;
   llvm_unreachable("unsupported type");
 }
@@ -542,12 +544,12 @@ std::string getTypeAsString(mlir::Type ty, const fir::KindMapping &kindMap,
         name << "idx";
       } else if (ty.isIntOrIndex()) {
         name << 'i' << ty.getIntOrFloatBitWidth();
-      } else if (ty.isa<mlir::FloatType>()) {
+      } else if (mlir::isa<mlir::FloatType>(ty)) {
         name << 'f' << ty.getIntOrFloatBitWidth();
       } else if (fir::isa_complex(ty)) {
         name << 'z';
         if (auto cplxTy = mlir::dyn_cast_or_null<mlir::ComplexType>(ty)) {
-          auto floatTy = cplxTy.getElementType().cast<mlir::FloatType>();
+          auto floatTy = mlir::cast<mlir::FloatType>(cplxTy.getElementType());
           name << floatTy.getWidth();
         } else if (auto cplxTy = mlir::dyn_cast_or_null<fir::ComplexType>(ty)) {
           name << kindMap.getRealBitsize(cplxTy.getFKind());
@@ -644,7 +646,7 @@ static llvm::SmallPtrSet<detail::RecordTypeStorage const *, 4>
 } // namespace
 
 void fir::verifyIntegralType(mlir::Type type) {
-  if (isaIntegerType(type) || type.isa<mlir::IndexType>())
+  if (isaIntegerType(type) || mlir::isa<mlir::IndexType>(type))
     return;
   llvm::report_fatal_error("expected integral type");
 }
@@ -656,9 +658,9 @@ void fir::printFirType(FIROpsDialect *, mlir::Type ty,
 }
 
 bool fir::isa_unknown_size_box(mlir::Type t) {
-  if (auto boxTy = t.dyn_cast<fir::BaseBoxType>()) {
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(t)) {
     auto valueType = fir::unwrapPassByRefType(boxTy);
-    if (auto seqTy = valueType.dyn_cast<fir::SequenceType>())
+    if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(valueType))
       if (seqTy.hasUnknownShape())
         return true;
   }
@@ -684,18 +686,18 @@ void fir::BoxProcType::print(mlir::AsmPrinter &printer) const {
 mlir::LogicalResult
 BoxProcType::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
                     mlir::Type eleTy) {
-  if (eleTy.isa<mlir::FunctionType>())
+  if (mlir::isa<mlir::FunctionType>(eleTy))
     return mlir::success();
-  if (auto refTy = eleTy.dyn_cast<ReferenceType>())
-    if (refTy.isa<mlir::FunctionType>())
+  if (auto refTy = mlir::dyn_cast<ReferenceType>(eleTy))
+    if (mlir::isa<mlir::FunctionType>(refTy))
       return mlir::success();
   return emitError() << "invalid type for boxproc" << eleTy << '\n';
 }
 
 static bool cannotBePointerOrHeapElementType(mlir::Type eleTy) {
-  return eleTy.isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
+  return mlir::isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
                    SliceType, FieldType, LenType, HeapType, PointerType,
-                   ReferenceType, TypeDescType>();
+                   ReferenceType, TypeDescType>(eleTy);
 }
 
 //===----------------------------------------------------------------------===//
@@ -705,7 +707,7 @@ static bool cannotBePointerOrHeapElementType(mlir::Type eleTy) {
 mlir::LogicalResult
 fir::BoxType::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
                      mlir::Type eleTy) {
-  if (eleTy.isa<fir::BaseBoxType>())
+  if (mlir::isa<fir::BaseBoxType>(eleTy))
     return emitError() << "invalid element type\n";
   // TODO
   return mlir::success();
@@ -774,10 +776,10 @@ void fir::CharacterType::print(mlir::AsmPrinter &printer) const {
 mlir::LogicalResult
 fir::ClassType::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
                        mlir::Type eleTy) {
-  if (eleTy.isa<fir::RecordType, fir::SequenceType, fir::HeapType,
+  if (mlir::isa<fir::RecordType, fir::SequenceType, fir::HeapType,
                 fir::PointerType, mlir::NoneType, mlir::IntegerType,
                 mlir::FloatType, fir::CharacterType, fir::LogicalType,
-                fir::ComplexType, mlir::ComplexType>())
+                fir::ComplexType, mlir::ComplexType>(eleTy))
     return mlir::success();
   return emitError() << "invalid element type\n";
 }
@@ -1048,8 +1050,8 @@ void fir::ReferenceType::print(mlir::AsmPrinter &printer) const {
 mlir::LogicalResult fir::ReferenceType::verify(
     llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
     mlir::Type eleTy) {
-  if (eleTy.isa<ShapeType, ShapeShiftType, SliceType, FieldType, LenType,
-                ReferenceType, TypeDescType>())
+  if (mlir::isa<ShapeType, ShapeShiftType, SliceType, FieldType, LenType,
+                ReferenceType, TypeDescType>(eleTy))
     return emitError() << "cannot build a reference to type: " << eleTy << '\n';
   return mlir::success();
 }
@@ -1124,9 +1126,9 @@ mlir::LogicalResult fir::SequenceType::verify(
     llvm::ArrayRef<int64_t> shape, mlir::Type eleTy,
     mlir::AffineMapAttr layoutMap) {
   // DIMENSION attribute can only be applied to an intrinsic or record type
-  if (eleTy.isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
+  if (mlir::isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
                 ShiftType, SliceType, FieldType, LenType, HeapType, PointerType,
-                ReferenceType, TypeDescType, SequenceType>())
+                ReferenceType, TypeDescType, SequenceType>(eleTy))
     return emitError() << "cannot build an array of this element type: "
                        << eleTy << '\n';
   return mlir::success();
@@ -1197,9 +1199,9 @@ void fir::TypeDescType::print(mlir::AsmPrinter &printer) const {
 mlir::LogicalResult fir::TypeDescType::verify(
     llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
     mlir::Type eleTy) {
-  if (eleTy.isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
+  if (mlir::isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
                 ShiftType, SliceType, FieldType, LenType, ReferenceType,
-                TypeDescType>())
+                TypeDescType>(eleTy))
     return emitError() << "cannot build a type descriptor of type: " << eleTy
                        << '\n';
   return mlir::success();
@@ -1236,10 +1238,10 @@ bool fir::VectorType::isValidElementType(mlir::Type t) {
 }
 
 bool fir::isCharacterProcedureTuple(mlir::Type ty, bool acceptRawFunc) {
-  mlir::TupleType tuple = ty.dyn_cast<mlir::TupleType>();
+  mlir::TupleType tuple = mlir::dyn_cast<mlir::TupleType>(ty);
   return tuple && tuple.size() == 2 &&
-         (tuple.getType(0).isa<fir::BoxProcType>() ||
-          (acceptRawFunc && tuple.getType(0).isa<mlir::FunctionType>())) &&
+         (mlir::isa<fir::BoxProcType>(tuple.getType(0)) ||
+          (acceptRawFunc && mlir::isa<mlir::FunctionType>(tuple.getType(0)))) &&
          fir::isa_integer(tuple.getType(1));
 }
 
@@ -1247,7 +1249,8 @@ bool fir::hasAbstractResult(mlir::FunctionType ty) {
   if (ty.getNumResults() == 0)
     return false;
   auto resultType = ty.getResult(0);
-  return resultType.isa<fir::SequenceType, fir::BaseBoxType, fir::RecordType>();
+  return mlir::isa<fir::SequenceType, fir::BaseBoxType, fir::RecordType>(
+      resultType);
 }
 
 /// Convert llvm::Type::TypeID to mlir::Type. \p kind is provided for error
@@ -1337,7 +1340,7 @@ void FIROpsDialect::registerTypes() {
            fir::ComplexType, FieldType, HeapType, fir::IntegerType, LenType,
            LogicalType, LLVMPointerType, PointerType, RealType, RecordType,
            ReferenceType, SequenceType, ShapeType, ShapeShiftType, ShiftType,
-           SliceType, TypeDescType, fir::VectorType>();
+           SliceType, TypeDescType, fir::VectorType, fir::DummyScopeType>();
   fir::ReferenceType::attachInterface<
       OpenMPPointerLikeModel<fir::ReferenceType>>(*getContext());
   fir::ReferenceType::attachInterface<
diff --git a/flang/lib/Optimizer/Dialect/FortranVariableInterface.cpp b/flang/lib/Optimizer/Dialect/FortranVariableInterface.cpp
index 94f1689dfb05..70b1a2f3d844 100644
--- a/flang/lib/Optimizer/Dialect/FortranVariableInterface.cpp
+++ b/flang/lib/Optimizer/Dialect/FortranVariableInterface.cpp
@@ -18,7 +18,7 @@ mlir::LogicalResult
 fir::FortranVariableOpInterface::verifyDeclareLikeOpImpl(mlir::Value memref) {
   const unsigned numExplicitTypeParams = getExplicitTypeParams().size();
   mlir::Type memType = memref.getType();
-  const bool sourceIsBoxValue = memType.isa<fir::BaseBoxType>();
+  const bool sourceIsBoxValue = mlir::isa<fir::BaseBoxType>(memType);
   const bool sourceIsBoxAddress = fir::isBoxAddress(memType);
   const bool sourceIsBox = sourceIsBoxValue || sourceIsBoxAddress;
   if (isCharacter()) {
@@ -29,7 +29,8 @@ fir::FortranVariableOpInterface::verifyDeclareLikeOpImpl(mlir::Value memref) {
       return emitOpError("must be provided exactly one type parameter when its "
                          "base is a character that is not a box");
 
-  } else if (auto recordType = getElementType().dyn_cast<fir::RecordType>()) {
+  } else if (auto recordType =
+                 mlir::dyn_cast<fir::RecordType>(getElementType())) {
     if (numExplicitTypeParams < recordType.getNumLenParams() && !sourceIsBox)
       return emitOpError("must be provided all the derived type length "
                          "parameters when the base is not a box");
@@ -45,16 +46,16 @@ fir::FortranVariableOpInterface::verifyDeclareLikeOpImpl(mlir::Value memref) {
       if (sourceIsBoxAddress)
         return emitOpError("for box address must not have a shape operand");
       unsigned shapeRank = 0;
-      if (auto shapeType = shape.getType().dyn_cast<fir::ShapeType>()) {
+      if (auto shapeType = mlir::dyn_cast<fir::ShapeType>(shape.getType())) {
         shapeRank = shapeType.getRank();
       } else if (auto shapeShiftType =
-                     shape.getType().dyn_cast<fir::ShapeShiftType>()) {
+                     mlir::dyn_cast<fir::ShapeShiftType>(shape.getType())) {
         shapeRank = shapeShiftType.getRank();
       } else {
         if (!sourceIsBoxValue)
           emitOpError("of array entity with a raw address base must have a "
                       "shape operand that is a shape or shapeshift");
-        shapeRank = shape.getType().cast<fir::ShiftType>().getRank();
+        shapeRank = mlir::cast<fir::ShiftType>(shape.getType()).getRank();
       }
 
       std::optional<unsigned> rank = getRank();
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIRDialect.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIRDialect.cpp
index 08b2b0538c73..0b61c0edce62 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIRDialect.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIRDialect.cpp
@@ -84,7 +84,8 @@ bool hlfir::isFortranVariableType(mlir::Type type) {
   return llvm::TypeSwitch<mlir::Type, bool>(type)
       .Case<fir::ReferenceType, fir::PointerType, fir::HeapType>([](auto p) {
         mlir::Type eleType = p.getEleTy();
-        return eleType.isa<fir::BaseBoxType>() || !fir::hasDynamicSize(eleType);
+        return mlir::isa<fir::BaseBoxType>(eleType) ||
+               !fir::hasDynamicSize(eleType);
       })
       .Case<fir::BaseBoxType, fir::BoxCharType>([](auto) { return true; })
       .Case<fir::VectorType>([](auto) { return true; })
@@ -93,15 +94,15 @@ bool hlfir::isFortranVariableType(mlir::Type type) {
 
 bool hlfir::isFortranScalarCharacterType(mlir::Type type) {
   return isFortranScalarCharacterExprType(type) ||
-         type.isa<fir::BoxCharType>() ||
-         fir::unwrapPassByRefType(fir::unwrapRefType(type))
-             .isa<fir::CharacterType>();
+         mlir::isa<fir::BoxCharType>(type) ||
+         mlir::isa<fir::CharacterType>(
+             fir::unwrapPassByRefType(fir::unwrapRefType(type)));
 }
 
 bool hlfir::isFortranScalarCharacterExprType(mlir::Type type) {
-  if (auto exprType = type.dyn_cast<hlfir::ExprType>())
+  if (auto exprType = mlir::dyn_cast<hlfir::ExprType>(type))
     return exprType.isScalar() &&
-           exprType.getElementType().isa<fir::CharacterType>();
+           mlir::isa<fir::CharacterType>(exprType.getElementType());
   return false;
 }
 
@@ -121,8 +122,8 @@ bool hlfir::isFortranScalarNumericalType(mlir::Type type) {
 bool hlfir::isFortranNumericalArrayObject(mlir::Type type) {
   if (isBoxAddressType(type))
     return false;
-  if (auto arrayTy =
-          getFortranElementOrSequenceType(type).dyn_cast<fir::SequenceType>())
+  if (auto arrayTy = mlir::dyn_cast<fir::SequenceType>(
+          getFortranElementOrSequenceType(type)))
     return isFortranScalarNumericalType(arrayTy.getEleTy());
   return false;
 }
@@ -130,8 +131,8 @@ bool hlfir::isFortranNumericalArrayObject(mlir::Type type) {
 bool hlfir::isFortranNumericalOrLogicalArrayObject(mlir::Type type) {
   if (isBoxAddressType(type))
     return false;
-  if (auto arrayTy =
-          getFortranElementOrSequenceType(type).dyn_cast<fir::SequenceType>()) {
+  if (auto arrayTy = mlir::dyn_cast<fir::SequenceType>(
+          getFortranElementOrSequenceType(type))) {
     mlir::Type eleTy = arrayTy.getEleTy();
     return isFortranScalarNumericalType(eleTy) ||
            mlir::isa<fir::LogicalType>(eleTy);
@@ -142,7 +143,8 @@ bool hlfir::isFortranNumericalOrLogicalArrayObject(mlir::Type type) {
 bool hlfir::isFortranArrayObject(mlir::Type type) {
   if (isBoxAddressType(type))
     return false;
-  return !!getFortranElementOrSequenceType(type).dyn_cast<fir::SequenceType>();
+  return !!mlir::dyn_cast<fir::SequenceType>(
+      getFortranElementOrSequenceType(type));
 }
 
 bool hlfir::isPassByRefOrIntegerType(mlir::Type type) {
@@ -151,7 +153,7 @@ bool hlfir::isPassByRefOrIntegerType(mlir::Type type) {
 }
 
 bool hlfir::isI1Type(mlir::Type type) {
-  if (mlir::IntegerType integer = type.dyn_cast<mlir::IntegerType>())
+  if (mlir::IntegerType integer = mlir::dyn_cast<mlir::IntegerType>(type))
     if (integer.getWidth() == 1)
       return true;
   return false;
@@ -160,8 +162,8 @@ bool hlfir::isI1Type(mlir::Type type) {
 bool hlfir::isFortranLogicalArrayObject(mlir::Type type) {
   if (isBoxAddressType(type))
     return false;
-  if (auto arrayTy =
-          getFortranElementOrSequenceType(type).dyn_cast<fir::SequenceType>()) {
+  if (auto arrayTy = mlir::dyn_cast<fir::SequenceType>(
+          getFortranElementOrSequenceType(type))) {
     mlir::Type eleTy = arrayTy.getEleTy();
     return mlir::isa<fir::LogicalType>(eleTy);
   }
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
index 8bad4e445082..4b586ad1d3a4 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
@@ -74,8 +74,8 @@ getIntrinsicEffects(mlir::Operation *self,
 /// Is this a fir.[ref/ptr/heap]<fir.[box/class]<fir.heap<T>>> type?
 static bool isAllocatableBoxRef(mlir::Type type) {
   fir::BaseBoxType boxType =
-      fir::dyn_cast_ptrEleTy(type).dyn_cast_or_null<fir::BaseBoxType>();
-  return boxType && boxType.getEleTy().isa<fir::HeapType>();
+      mlir::dyn_cast_or_null<fir::BaseBoxType>(fir::dyn_cast_ptrEleTy(type));
+  return boxType && mlir::isa<fir::HeapType>(boxType.getEleTy());
 }
 
 mlir::LogicalResult hlfir::AssignOp::verify() {
@@ -84,7 +84,7 @@ mlir::LogicalResult hlfir::AssignOp::verify() {
     return emitOpError("lhs must be an allocatable when `realloc` is set");
   if (mustKeepLhsLengthInAllocatableAssignment() &&
       !(isAllocatableAssignment() &&
-        hlfir::getFortranElementType(lhsType).isa<fir::CharacterType>()))
+        mlir::isa<fir::CharacterType>(hlfir::getFortranElementType(lhsType))))
     return emitOpError("`realloc` must be set and lhs must be a character "
                        "allocatable when `keep_lhs_length_if_realloc` is set");
   return mlir::success();
@@ -99,13 +99,13 @@ mlir::LogicalResult hlfir::AssignOp::verify() {
 mlir::Type hlfir::DeclareOp::getHLFIRVariableType(mlir::Type inputType,
                                                   bool hasExplicitLowerBounds) {
   mlir::Type type = fir::unwrapRefType(inputType);
-  if (type.isa<fir::BaseBoxType>())
+  if (mlir::isa<fir::BaseBoxType>(type))
     return inputType;
-  if (auto charType = type.dyn_cast<fir::CharacterType>())
+  if (auto charType = mlir::dyn_cast<fir::CharacterType>(type))
     if (charType.hasDynamicLen())
       return fir::BoxCharType::get(charType.getContext(), charType.getFKind());
 
-  auto seqType = type.dyn_cast<fir::SequenceType>();
+  auto seqType = mlir::dyn_cast<fir::SequenceType>(type);
   bool hasDynamicExtents =
       seqType && fir::sequenceWithNonConstantShape(seqType);
   mlir::Type eleType = seqType ? seqType.getEleTy() : type;
@@ -117,7 +117,8 @@ mlir::Type hlfir::DeclareOp::getHLFIRVariableType(mlir::Type inputType,
 }
 
 static bool hasExplicitLowerBounds(mlir::Value shape) {
-  return shape && shape.getType().isa<fir::ShapeShiftType, fir::ShiftType>();
+  return shape &&
+         mlir::isa<fir::ShapeShiftType, fir::ShiftType>(shape.getType());
 }
 
 void hlfir::DeclareOp::build(mlir::OpBuilder &builder,
@@ -132,7 +133,8 @@ void hlfir::DeclareOp::build(mlir::OpBuilder &builder,
   mlir::Type hlfirVariableType =
       getHLFIRVariableType(inputType, hasExplicitLbs);
   build(builder, result, {hlfirVariableType, inputType}, memref, shape,
-        typeparams, nameAttr, fortran_attrs, cuda_attr);
+        typeparams, /*dummy_scope=*/nullptr, nameAttr, fortran_attrs,
+        cuda_attr);
 }
 
 mlir::LogicalResult hlfir::DeclareOp::verify() {
@@ -288,7 +290,7 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
   bool hasBoxComponent;
   if (getComponent()) {
     auto component = getComponent().value();
-    auto recType = baseElementType.dyn_cast<fir::RecordType>();
+    auto recType = mlir::dyn_cast<fir::RecordType>(baseElementType);
     if (!recType)
       return emitOpError(
           "component must be provided only when the memref is a derived type");
@@ -300,14 +302,14 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
     }
     mlir::Type fieldType = recType.getType(fieldIdx);
     mlir::Type componentBaseType = getFortranElementOrSequenceType(fieldType);
-    hasBoxComponent = fieldType.isa<fir::BaseBoxType>();
-    if (componentBaseType.isa<fir::SequenceType>() &&
-        baseType.isa<fir::SequenceType>() &&
+    hasBoxComponent = mlir::isa<fir::BaseBoxType>(fieldType);
+    if (mlir::isa<fir::SequenceType>(componentBaseType) &&
+        mlir::isa<fir::SequenceType>(baseType) &&
         (numSubscripts == 0 || subscriptsRank > 0))
       return emitOpError("indices must be provided and must not contain "
                          "triplets when both memref and component are arrays");
     if (numSubscripts != 0) {
-      if (!componentBaseType.isa<fir::SequenceType>())
+      if (!mlir::isa<fir::SequenceType>(componentBaseType))
         return emitOpError("indices must not be provided if component appears "
                            "and is not an array component");
       if (!getComponentShape())
@@ -315,9 +317,9 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
             "component_shape must be provided when indexing a component");
       mlir::Type compShapeType = getComponentShape().getType();
       unsigned componentRank =
-          componentBaseType.cast<fir::SequenceType>().getDimension();
-      auto shapeType = compShapeType.dyn_cast<fir::ShapeType>();
-      auto shapeShiftType = compShapeType.dyn_cast<fir::ShapeShiftType>();
+          mlir::cast<fir::SequenceType>(componentBaseType).getDimension();
+      auto shapeType = mlir::dyn_cast<fir::ShapeType>(compShapeType);
+      auto shapeShiftType = mlir::dyn_cast<fir::ShapeShiftType>(compShapeType);
       if (!((shapeType && shapeType.getRank() == componentRank) ||
             (shapeShiftType && shapeShiftType.getRank() == componentRank)))
         return emitOpError("component_shape must be a fir.shape or "
@@ -325,33 +327,33 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
       if (numSubscripts > componentRank)
         return emitOpError("indices number must match array component rank");
     }
-    if (auto baseSeqType = baseType.dyn_cast<fir::SequenceType>())
+    if (auto baseSeqType = mlir::dyn_cast<fir::SequenceType>(baseType))
       // This case must come first to cover "array%array_comp(i, j)" that has
       // subscripts for the component but whose rank come from the base.
       outputRank = baseSeqType.getDimension();
     else if (numSubscripts != 0)
       outputRank = subscriptsRank;
     else if (auto componentSeqType =
-                 componentBaseType.dyn_cast<fir::SequenceType>())
+                 mlir::dyn_cast<fir::SequenceType>(componentBaseType))
       outputRank = componentSeqType.getDimension();
     outputElementType = fir::unwrapSequenceType(componentBaseType);
   } else {
     outputElementType = baseElementType;
     unsigned baseTypeRank =
-        baseType.isa<fir::SequenceType>()
-            ? baseType.cast<fir::SequenceType>().getDimension()
+        mlir::isa<fir::SequenceType>(baseType)
+            ? mlir::cast<fir::SequenceType>(baseType).getDimension()
             : 0;
     if (numSubscripts != 0) {
       if (baseTypeRank != numSubscripts)
         return emitOpError("indices number must match memref rank");
       outputRank = subscriptsRank;
-    } else if (auto baseSeqType = baseType.dyn_cast<fir::SequenceType>()) {
+    } else if (auto baseSeqType = mlir::dyn_cast<fir::SequenceType>(baseType)) {
       outputRank = baseSeqType.getDimension();
     }
   }
 
   if (!getSubstring().empty()) {
-    if (!outputElementType.isa<fir::CharacterType>())
+    if (!mlir::isa<fir::CharacterType>(outputElementType))
       return emitOpError("memref or component must have character type if "
                          "substring indices are provided");
     if (getSubstring().size() != 2)
@@ -361,16 +363,16 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
     if (!fir::isa_complex(outputElementType))
       return emitOpError("memref or component must have complex type if "
                          "complex_part is provided");
-    if (auto firCplx = outputElementType.dyn_cast<fir::ComplexType>())
+    if (auto firCplx = mlir::dyn_cast<fir::ComplexType>(outputElementType))
       outputElementType = firCplx.getElementType();
     else
       outputElementType =
-          outputElementType.cast<mlir::ComplexType>().getElementType();
+          mlir::cast<mlir::ComplexType>(outputElementType).getElementType();
   }
   mlir::Type resultBaseType =
       getFortranElementOrSequenceType(getResult().getType());
   unsigned resultRank = 0;
-  if (auto resultSeqType = resultBaseType.dyn_cast<fir::SequenceType>())
+  if (auto resultSeqType = mlir::dyn_cast<fir::SequenceType>(resultBaseType))
     resultRank = resultSeqType.getDimension();
   if (resultRank != outputRank)
     return emitOpError("result type rank is not consistent with operands, "
@@ -380,10 +382,10 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
   // result type must match the one that was inferred here, except the character
   // length may differ because of substrings.
   if (resultElementType != outputElementType &&
-      !(resultElementType.isa<fir::CharacterType>() &&
-        outputElementType.isa<fir::CharacterType>()) &&
-      !(resultElementType.isa<mlir::FloatType>() &&
-        outputElementType.isa<fir::RealType>()))
+      !(mlir::isa<fir::CharacterType>(resultElementType) &&
+        mlir::isa<fir::CharacterType>(outputElementType)) &&
+      !(mlir::isa<mlir::FloatType>(resultElementType) &&
+        mlir::isa<fir::RealType>(outputElementType)))
     return emitOpError(
                "result element type is not consistent with operands, expected ")
            << outputElementType;
@@ -401,22 +403,22 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
       return emitOpError("shape must be provided if and only if the result is "
                          "an array that is not a box address");
     if (resultRank != 0) {
-      auto shapeType = getShape().getType().dyn_cast<fir::ShapeType>();
+      auto shapeType = mlir::dyn_cast<fir::ShapeType>(getShape().getType());
       auto shapeShiftType =
-          getShape().getType().dyn_cast<fir::ShapeShiftType>();
+          mlir::dyn_cast<fir::ShapeShiftType>(getShape().getType());
       if (!((shapeType && shapeType.getRank() == resultRank) ||
             (shapeShiftType && shapeShiftType.getRank() == resultRank)))
         return emitOpError("shape must be a fir.shape or fir.shapeshift with "
                            "the rank of the result");
     }
     auto numLenParam = getTypeparams().size();
-    if (outputElementType.isa<fir::CharacterType>()) {
+    if (mlir::isa<fir::CharacterType>(outputElementType)) {
       if (numLenParam != 1)
         return emitOpError("must be provided one length parameter when the "
                            "result is a character");
     } else if (fir::isRecordWithTypeParameters(outputElementType)) {
       if (numLenParam !=
-          outputElementType.cast<fir::RecordType>().getNumLenParams())
+          mlir::cast<fir::RecordType>(outputElementType).getNumLenParams())
         return emitOpError("must be provided the same number of length "
                            "parameters as in the result derived type");
     } else if (numLenParam != 0) {
@@ -434,18 +436,18 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
 mlir::LogicalResult hlfir::ParentComponentOp::verify() {
   mlir::Type baseType =
       hlfir::getFortranElementOrSequenceType(getMemref().getType());
-  auto maybeInputSeqType = baseType.dyn_cast<fir::SequenceType>();
+  auto maybeInputSeqType = mlir::dyn_cast<fir::SequenceType>(baseType);
   unsigned inputTypeRank =
       maybeInputSeqType ? maybeInputSeqType.getDimension() : 0;
   unsigned shapeRank = 0;
   if (mlir::Value shape = getShape())
-    if (auto shapeType = shape.getType().dyn_cast<fir::ShapeType>())
+    if (auto shapeType = mlir::dyn_cast<fir::ShapeType>(shape.getType()))
       shapeRank = shapeType.getRank();
   if (inputTypeRank != shapeRank)
     return emitOpError(
         "must be provided a shape if and only if the base is an array");
   mlir::Type outputBaseType = hlfir::getFortranElementOrSequenceType(getType());
-  auto maybeOutputSeqType = outputBaseType.dyn_cast<fir::SequenceType>();
+  auto maybeOutputSeqType = mlir::dyn_cast<fir::SequenceType>(outputBaseType);
   unsigned outputTypeRank =
       maybeOutputSeqType ? maybeOutputSeqType.getDimension() : 0;
   if (inputTypeRank != outputTypeRank)
@@ -459,23 +461,23 @@ mlir::LogicalResult hlfir::ParentComponentOp::verify() {
           return emitOpError(
               "result type extents are inconsistent with memref type");
   fir::RecordType baseRecType =
-      hlfir::getFortranElementType(baseType).dyn_cast<fir::RecordType>();
-  fir::RecordType outRecType =
-      hlfir::getFortranElementType(outputBaseType).dyn_cast<fir::RecordType>();
+      mlir::dyn_cast<fir::RecordType>(hlfir::getFortranElementType(baseType));
+  fir::RecordType outRecType = mlir::dyn_cast<fir::RecordType>(
+      hlfir::getFortranElementType(outputBaseType));
   if (!baseRecType || !outRecType)
     return emitOpError("result type and input type must be derived types");
 
   // Note: result should not be a fir.class: its dynamic type is being set to
   // the parent type and allowing fir.class would break the operation codegen:
   // it would keep the input dynamic type.
-  if (getType().isa<fir::ClassType>())
+  if (mlir::isa<fir::ClassType>(getType()))
     return emitOpError("result type must not be polymorphic");
 
   // The array results are known to not be dis-contiguous in most cases (the
   // exception being if the parent type was extended by a type without any
   // components): require a fir.box to be used for the result to carry the
   // strides.
-  if (!getType().isa<fir::BoxType>() &&
+  if (!mlir::isa<fir::BoxType>(getType()) &&
       (outputTypeRank != 0 || fir::isRecordWithTypeParameters(outRecType)))
     return emitOpError("result type must be a fir.box if the result is an "
                        "array or has length parameters");
@@ -496,9 +498,8 @@ verifyLogicalReductionOp(LogicalReductionOp reductionOp) {
   mlir::Value mask = reductionOp->getMask();
   mlir::Value dim = reductionOp->getDim();
 
-  fir::SequenceType maskTy =
-      hlfir::getFortranElementOrSequenceType(mask.getType())
-          .cast<fir::SequenceType>();
+  fir::SequenceType maskTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(mask.getType()));
   mlir::Type logicalTy = maskTy.getEleTy();
   llvm::ArrayRef<int64_t> maskShape = maskTy.getShape();
 
@@ -576,9 +577,8 @@ mlir::LogicalResult hlfir::CountOp::verify() {
   mlir::Value mask = getMask();
   mlir::Value dim = getDim();
 
-  fir::SequenceType maskTy =
-      hlfir::getFortranElementOrSequenceType(mask.getType())
-          .cast<fir::SequenceType>();
+  fir::SequenceType maskTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(mask.getType()));
   llvm::ArrayRef<int64_t> maskShape = maskTy.getShape();
 
   mlir::Type resultType = results[0];
@@ -613,13 +613,14 @@ void hlfir::CountOp::getEffects(
 //===----------------------------------------------------------------------===//
 
 static unsigned getCharacterKind(mlir::Type t) {
-  return hlfir::getFortranElementType(t).cast<fir::CharacterType>().getFKind();
+  return mlir::cast<fir::CharacterType>(hlfir::getFortranElementType(t))
+      .getFKind();
 }
 
 static std::optional<fir::CharacterType::LenType>
 getCharacterLengthIfStatic(mlir::Type t) {
   if (auto charType =
-          hlfir::getFortranElementType(t).dyn_cast<fir::CharacterType>())
+          mlir::dyn_cast<fir::CharacterType>(hlfir::getFortranElementType(t)))
     if (charType.hasConstantLen())
       return charType.getLen();
   return std::nullopt;
@@ -672,15 +673,13 @@ verifyArrayAndMaskForReductionOp(NumericalReductionOp reductionOp) {
   mlir::Value array = reductionOp->getArray();
   mlir::Value mask = reductionOp->getMask();
 
-  fir::SequenceType arrayTy =
-      hlfir::getFortranElementOrSequenceType(array.getType())
-          .cast<fir::SequenceType>();
+  fir::SequenceType arrayTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(array.getType()));
   llvm::ArrayRef<int64_t> arrayShape = arrayTy.getShape();
 
   if (mask) {
-    fir::SequenceType maskSeq =
-        hlfir::getFortranElementOrSequenceType(mask.getType())
-            .dyn_cast<fir::SequenceType>();
+    fir::SequenceType maskSeq = mlir::dyn_cast<fir::SequenceType>(
+        hlfir::getFortranElementOrSequenceType(mask.getType()));
     llvm::ArrayRef<int64_t> maskShape;
 
     if (maskSeq)
@@ -720,9 +719,8 @@ verifyNumericalReductionOp(NumericalReductionOp reductionOp) {
 
   mlir::Value array = reductionOp->getArray();
   mlir::Value dim = reductionOp->getDim();
-  fir::SequenceType arrayTy =
-      hlfir::getFortranElementOrSequenceType(array.getType())
-          .cast<fir::SequenceType>();
+  fir::SequenceType arrayTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(array.getType()));
   mlir::Type numTy = arrayTy.getEleTy();
   llvm::ArrayRef<int64_t> arrayShape = arrayTy.getShape();
 
@@ -790,13 +788,12 @@ verifyCharacterReductionOp(CharacterReductionOp reductionOp) {
 
   mlir::Value array = reductionOp->getArray();
   mlir::Value dim = reductionOp->getDim();
-  fir::SequenceType arrayTy =
-      hlfir::getFortranElementOrSequenceType(array.getType())
-          .cast<fir::SequenceType>();
+  fir::SequenceType arrayTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(array.getType()));
   mlir::Type numTy = arrayTy.getEleTy();
   llvm::ArrayRef<int64_t> arrayShape = arrayTy.getShape();
 
-  auto resultExpr = results[0].cast<hlfir::ExprType>();
+  auto resultExpr = mlir::cast<hlfir::ExprType>(results[0]);
   mlir::Type resultType = resultExpr.getEleTy();
   assert(mlir::isa<fir::CharacterType>(resultType) &&
          "result must be character");
@@ -881,9 +878,8 @@ verifyResultForMinMaxLoc(NumericalReductionOp reductionOp) {
 
   mlir::Value array = reductionOp->getArray();
   mlir::Value dim = reductionOp->getDim();
-  fir::SequenceType arrayTy =
-      hlfir::getFortranElementOrSequenceType(array.getType())
-          .cast<fir::SequenceType>();
+  fir::SequenceType arrayTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(array.getType()));
   llvm::ArrayRef<int64_t> arrayShape = arrayTy.getShape();
 
   mlir::Type resultType = results[0];
@@ -993,12 +989,10 @@ void hlfir::SumOp::getEffects(
 mlir::LogicalResult hlfir::DotProductOp::verify() {
   mlir::Value lhs = getLhs();
   mlir::Value rhs = getRhs();
-  fir::SequenceType lhsTy =
-      hlfir::getFortranElementOrSequenceType(lhs.getType())
-          .cast<fir::SequenceType>();
-  fir::SequenceType rhsTy =
-      hlfir::getFortranElementOrSequenceType(rhs.getType())
-          .cast<fir::SequenceType>();
+  fir::SequenceType lhsTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(lhs.getType()));
+  fir::SequenceType rhsTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(rhs.getType()));
   llvm::ArrayRef<int64_t> lhsShape = lhsTy.getShape();
   llvm::ArrayRef<int64_t> rhsShape = rhsTy.getShape();
   std::size_t lhsRank = lhsShape.size();
@@ -1051,19 +1045,17 @@ void hlfir::DotProductOp::getEffects(
 mlir::LogicalResult hlfir::MatmulOp::verify() {
   mlir::Value lhs = getLhs();
   mlir::Value rhs = getRhs();
-  fir::SequenceType lhsTy =
-      hlfir::getFortranElementOrSequenceType(lhs.getType())
-          .cast<fir::SequenceType>();
-  fir::SequenceType rhsTy =
-      hlfir::getFortranElementOrSequenceType(rhs.getType())
-          .cast<fir::SequenceType>();
+  fir::SequenceType lhsTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(lhs.getType()));
+  fir::SequenceType rhsTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(rhs.getType()));
   llvm::ArrayRef<int64_t> lhsShape = lhsTy.getShape();
   llvm::ArrayRef<int64_t> rhsShape = rhsTy.getShape();
   std::size_t lhsRank = lhsShape.size();
   std::size_t rhsRank = rhsShape.size();
   mlir::Type lhsEleTy = lhsTy.getEleTy();
   mlir::Type rhsEleTy = rhsTy.getEleTy();
-  hlfir::ExprType resultTy = getResult().getType().cast<hlfir::ExprType>();
+  hlfir::ExprType resultTy = mlir::cast<hlfir::ExprType>(getResult().getType());
   llvm::ArrayRef<int64_t> resultShape = resultTy.getShape();
   mlir::Type resultEleTy = resultTy.getEleTy();
 
@@ -1180,13 +1172,12 @@ void hlfir::MatmulOp::getEffects(
 
 mlir::LogicalResult hlfir::TransposeOp::verify() {
   mlir::Value array = getArray();
-  fir::SequenceType arrayTy =
-      hlfir::getFortranElementOrSequenceType(array.getType())
-          .cast<fir::SequenceType>();
+  fir::SequenceType arrayTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(array.getType()));
   llvm::ArrayRef<int64_t> inShape = arrayTy.getShape();
   std::size_t rank = inShape.size();
   mlir::Type eleTy = arrayTy.getEleTy();
-  hlfir::ExprType resultTy = getResult().getType().cast<hlfir::ExprType>();
+  hlfir::ExprType resultTy = mlir::cast<hlfir::ExprType>(getResult().getType());
   llvm::ArrayRef<int64_t> resultShape = resultTy.getShape();
   std::size_t resultRank = resultShape.size();
   mlir::Type resultEleTy = resultTy.getEleTy();
@@ -1224,19 +1215,17 @@ void hlfir::TransposeOp::getEffects(
 mlir::LogicalResult hlfir::MatmulTransposeOp::verify() {
   mlir::Value lhs = getLhs();
   mlir::Value rhs = getRhs();
-  fir::SequenceType lhsTy =
-      hlfir::getFortranElementOrSequenceType(lhs.getType())
-          .cast<fir::SequenceType>();
-  fir::SequenceType rhsTy =
-      hlfir::getFortranElementOrSequenceType(rhs.getType())
-          .cast<fir::SequenceType>();
+  fir::SequenceType lhsTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(lhs.getType()));
+  fir::SequenceType rhsTy = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(rhs.getType()));
   llvm::ArrayRef<int64_t> lhsShape = lhsTy.getShape();
   llvm::ArrayRef<int64_t> rhsShape = rhsTy.getShape();
   std::size_t lhsRank = lhsShape.size();
   std::size_t rhsRank = rhsShape.size();
   mlir::Type lhsEleTy = lhsTy.getEleTy();
   mlir::Type rhsEleTy = rhsTy.getEleTy();
-  hlfir::ExprType resultTy = getResult().getType().cast<hlfir::ExprType>();
+  hlfir::ExprType resultTy = mlir::cast<hlfir::ExprType>(getResult().getType());
   llvm::ArrayRef<int64_t> resultShape = resultTy.getShape();
   mlir::Type resultEleTy = resultTy.getEleTy();
 
@@ -1381,7 +1370,7 @@ void hlfir::AsExprOp::build(mlir::OpBuilder &builder,
   hlfir::ExprType::Shape typeShape;
   bool isPolymorphic = fir::isPolymorphicType(var.getType());
   mlir::Type type = getFortranElementOrSequenceType(var.getType());
-  if (auto seqType = type.dyn_cast<fir::SequenceType>()) {
+  if (auto seqType = mlir::dyn_cast<fir::SequenceType>(type)) {
     typeShape.append(seqType.getShape().begin(), seqType.getShape().end());
     type = seqType.getEleTy();
   }
@@ -1427,7 +1416,7 @@ static void buildElemental(mlir::OpBuilder &builder,
                           isUnordered ? builder.getUnitAttr() : nullptr);
   mlir::Region *bodyRegion = odsState.addRegion();
   bodyRegion->push_back(new mlir::Block{});
-  if (auto shapeType = shape.getType().dyn_cast<fir::ShapeType>()) {
+  if (auto shapeType = mlir::dyn_cast<fir::ShapeType>(shape.getType())) {
     unsigned dim = shapeType.getRank();
     mlir::Type indexType = builder.getIndexType();
     for (unsigned d = 0; d < dim; ++d)
@@ -1468,7 +1457,7 @@ void hlfir::ApplyOp::build(mlir::OpBuilder &builder,
                            mlir::ValueRange indices,
                            mlir::ValueRange typeparams) {
   mlir::Type resultType = expr.getType();
-  if (auto exprType = resultType.dyn_cast<hlfir::ExprType>())
+  if (auto exprType = mlir::dyn_cast<hlfir::ExprType>(resultType))
     resultType = exprType.getElementExprType();
   build(builder, odsState, resultType, expr, indices, typeparams);
 }
@@ -1517,20 +1506,20 @@ void hlfir::CopyInOp::build(mlir::OpBuilder &builder,
 
 void hlfir::ShapeOfOp::build(mlir::OpBuilder &builder,
                              mlir::OperationState &result, mlir::Value expr) {
-  hlfir::ExprType exprTy = expr.getType().cast<hlfir::ExprType>();
+  hlfir::ExprType exprTy = mlir::cast<hlfir::ExprType>(expr.getType());
   mlir::Type type = fir::ShapeType::get(builder.getContext(), exprTy.getRank());
   build(builder, result, type, expr);
 }
 
 std::size_t hlfir::ShapeOfOp::getRank() {
   mlir::Type resTy = getResult().getType();
-  fir::ShapeType shape = resTy.cast<fir::ShapeType>();
+  fir::ShapeType shape = mlir::cast<fir::ShapeType>(resTy);
   return shape.getRank();
 }
 
 mlir::LogicalResult hlfir::ShapeOfOp::verify() {
   mlir::Value expr = getExpr();
-  hlfir::ExprType exprTy = expr.getType().cast<hlfir::ExprType>();
+  hlfir::ExprType exprTy = mlir::cast<hlfir::ExprType>(expr.getType());
   std::size_t exprRank = exprTy.getShape().size();
 
   if (exprRank == 0)
@@ -1549,7 +1538,8 @@ hlfir::ShapeOfOp::canonicalize(ShapeOfOp shapeOf,
   // if extent information is available at compile time, immediately fold the
   // hlfir.shape_of into a fir.shape
   mlir::Location loc = shapeOf.getLoc();
-  hlfir::ExprType expr = shapeOf.getExpr().getType().cast<hlfir::ExprType>();
+  hlfir::ExprType expr =
+      mlir::cast<hlfir::ExprType>(shapeOf.getExpr().getType());
 
   mlir::Value shape = hlfir::genExprShape(rewriter, loc, expr);
   if (!shape)
@@ -1574,7 +1564,7 @@ void hlfir::GetExtentOp::build(mlir::OpBuilder &builder,
 }
 
 mlir::LogicalResult hlfir::GetExtentOp::verify() {
-  fir::ShapeType shapeTy = getShape().getType().cast<fir::ShapeType>();
+  fir::ShapeType shapeTy = mlir::cast<fir::ShapeType>(getShape().getType());
   std::uint64_t rank = shapeTy.getRank();
   llvm::APInt dim = getDim();
   if (dim.sge(rank))
@@ -1709,10 +1699,11 @@ mlir::LogicalResult hlfir::ElementalAddrOp::verify() {
     return emitOpError("body region must be terminated by an hlfir.yield");
   mlir::Type elementAddrType = yieldOp.getEntity().getType();
   if (!hlfir::isFortranVariableType(elementAddrType) ||
-      hlfir::getFortranElementOrSequenceType(elementAddrType)
-          .isa<fir::SequenceType>())
+      mlir::isa<fir::SequenceType>(
+          hlfir::getFortranElementOrSequenceType(elementAddrType)))
     return emitOpError("body must compute the address of a scalar entity");
-  unsigned shapeRank = getShape().getType().cast<fir::ShapeType>().getRank();
+  unsigned shapeRank =
+      mlir::cast<fir::ShapeType>(getShape().getType()).getRank();
   if (shapeRank != getIndices().size())
     return emitOpError("body number of indices must match shape rank");
   return mlir::success();
@@ -1817,8 +1808,8 @@ static bool yieldsLogical(mlir::Region &region, bool mustBeScalarI1) {
   if (mustBeScalarI1)
     return hlfir::isI1Type(yieldType);
   return hlfir::isMaskArgument(yieldType) &&
-         hlfir::getFortranElementOrSequenceType(yieldType)
-             .isa<fir::SequenceType>();
+         mlir::isa<fir::SequenceType>(
+             hlfir::getFortranElementOrSequenceType(yieldType));
 }
 
 mlir::LogicalResult hlfir::ForallMaskOp::verify() {
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
index 1c4f82e2de81..d4e4835ee726 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
@@ -77,7 +77,7 @@ static mlir::Value packageBufferizedExpr(mlir::Location loc,
 /// currently enforced by the verifiers that only accept HLFIR value or
 /// variable types which do not include tuples.
 static hlfir::Entity getBufferizedExprStorage(mlir::Value bufferizedExpr) {
-  auto tupleType = bufferizedExpr.getType().dyn_cast<mlir::TupleType>();
+  auto tupleType = mlir::dyn_cast<mlir::TupleType>(bufferizedExpr.getType());
   if (!tupleType)
     return hlfir::Entity{bufferizedExpr};
   assert(tupleType.size() == 2 && "unexpected tuple type");
@@ -90,7 +90,7 @@ static hlfir::Entity getBufferizedExprStorage(mlir::Value bufferizedExpr) {
 /// Helper to extract the clean-up flag from a tuple created by
 /// packageBufferizedExpr.
 static mlir::Value getBufferizedExprMustFreeFlag(mlir::Value bufferizedExpr) {
-  auto tupleType = bufferizedExpr.getType().dyn_cast<mlir::TupleType>();
+  auto tupleType = mlir::dyn_cast<mlir::TupleType>(bufferizedExpr.getType());
   if (!tupleType)
     return bufferizedExpr;
   assert(tupleType.size() == 2 && "unexpected tuple type");
@@ -218,7 +218,7 @@ struct ShapeOfOpConversion
     } else {
       // everything else failed so try to create a shape from static type info
       hlfir::ExprType exprTy =
-          adaptor.getExpr().getType().dyn_cast_or_null<hlfir::ExprType>();
+          mlir::dyn_cast_or_null<hlfir::ExprType>(adaptor.getExpr().getType());
       if (exprTy)
         shape = hlfir::genExprShape(builder, loc, exprTy);
     }
@@ -480,10 +480,10 @@ struct AssociateOpConversion
           assert(mlir::isa<fir::ClassType>(sourceVar.getType()) &&
                  fir::isAllocatableType(sourceVar.getType()));
           assert(sourceVar.getType() == assocType);
-        } else if ((sourceVar.getType().isa<fir::BaseBoxType>() &&
-                    !assocType.isa<fir::BaseBoxType>()) ||
-                   ((sourceVar.getType().isa<fir::BoxCharType>() &&
-                     !assocType.isa<fir::BoxCharType>()))) {
+        } else if ((mlir::isa<fir::BaseBoxType>(sourceVar.getType()) &&
+                    !mlir::isa<fir::BaseBoxType>(assocType)) ||
+                   ((mlir::isa<fir::BoxCharType>(sourceVar.getType()) &&
+                     !mlir::isa<fir::BoxCharType>(assocType)))) {
           sourceVar = builder.create<fir::BoxAddrOp>(loc, assocType, sourceVar);
         } else {
           sourceVar = builder.createConvert(loc, assocType, sourceVar);
@@ -590,13 +590,13 @@ static void genBufferDestruction(mlir::Location loc, fir::FirOpBuilder &builder,
       // for MERGE with polymorphic results.
       if (mustFinalize)
         TODO(loc, "finalizing polymorphic temporary in HLFIR");
-    } else if (var.getType().isa<fir::BaseBoxType, fir::BoxCharType>()) {
+    } else if (mlir::isa<fir::BaseBoxType, fir::BoxCharType>(var.getType())) {
       if (mustFinalize && !mlir::isa<fir::BaseBoxType>(var.getType()))
         fir::emitFatalError(loc, "non-finalizable variable");
 
       addr = builder.create<fir::BoxAddrOp>(loc, heapType, var);
     } else {
-      if (!var.getType().isa<fir::HeapType>())
+      if (!mlir::isa<fir::HeapType>(var.getType()))
         addr = builder.create<fir::ConvertOp>(loc, heapType, var);
 
       if (mustFinalize || deallocComponents) {
@@ -831,7 +831,7 @@ struct ElementalOpConversion
       // the assign, insert an hlfir.destroy to mark the expression end-of-life.
       // If the expression creation allocated a buffer on the heap inside the
       // loop, this will ensure the buffer properly deallocated.
-      if (elementValue.getType().isa<hlfir::ExprType>() &&
+      if (mlir::isa<hlfir::ExprType>(elementValue.getType()) &&
           wasCreatedInCurrentBlock(elementValue, builder))
         builder.create<hlfir::DestroyOp>(loc, elementValue);
     }
@@ -926,11 +926,12 @@ public:
                         hlfir::EndAssociateOp, hlfir::SetLengthOp>();
 
     target.markUnknownOpDynamicallyLegal([](mlir::Operation *op) {
-      return llvm::all_of(
-                 op->getResultTypes(),
-                 [](mlir::Type ty) { return !ty.isa<hlfir::ExprType>(); }) &&
+      return llvm::all_of(op->getResultTypes(),
+                          [](mlir::Type ty) {
+                            return !mlir::isa<hlfir::ExprType>(ty);
+                          }) &&
              llvm::all_of(op->getOperandTypes(), [](mlir::Type ty) {
-               return !ty.isa<hlfir::ExprType>();
+               return !mlir::isa<hlfir::ExprType>(ty);
              });
     });
     if (mlir::failed(
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
index cd534bae4ad2..3570e0011ca7 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
@@ -34,7 +34,7 @@ using namespace mlir;
 static mlir::Value genAllocatableTempFromSourceBox(mlir::Location loc,
                                                    fir::FirOpBuilder &builder,
                                                    mlir::Value sourceBox) {
-  assert(sourceBox.getType().isa<fir::BaseBoxType>() &&
+  assert(mlir::isa<fir::BaseBoxType>(sourceBox.getType()) &&
          "must be a base box type");
   // Use the runtime to make a quick and dirty temp with the rhs value.
   // Overkill for scalar rhs that could be done in much more clever ways.
@@ -44,7 +44,7 @@ static mlir::Value genAllocatableTempFromSourceBox(mlir::Location loc,
   // This has the huge benefit of dealing with all cases, including
   // polymorphic entities.
   mlir::Type fromHeapType = fir::HeapType::get(fir::unwrapRefType(
-      sourceBox.getType().cast<fir::BaseBoxType>().getEleTy()));
+      mlir::cast<fir::BaseBoxType>(sourceBox.getType()).getEleTy()));
   mlir::Type fromBoxHeapType = fir::BoxType::get(fromHeapType);
   mlir::Value fromMutableBox =
       fir::factory::genNullBoxStorage(builder, loc, fromBoxHeapType);
@@ -69,7 +69,7 @@ public:
     auto module = assignOp->getParentOfType<mlir::ModuleOp>();
     fir::FirOpBuilder builder(rewriter, module);
 
-    if (rhs.getType().isa<hlfir::ExprType>()) {
+    if (mlir::isa<hlfir::ExprType>(rhs.getType())) {
       mlir::emitError(loc, "hlfir must be bufferized with --bufferize-hlfir "
                            "pass before being converted to FIR");
       return mlir::failure();
@@ -328,8 +328,8 @@ public:
       cudaAttr = fir::CUDADataAttributeAttr::get(rewriter.getContext(), *attr);
     auto firDeclareOp = rewriter.create<fir::DeclareOp>(
         loc, memref.getType(), memref, declareOp.getShape(),
-        declareOp.getTypeparams(), declareOp.getUniqName(), fortranAttrs,
-        cudaAttr);
+        declareOp.getTypeparams(), declareOp.getDummyScope(),
+        declareOp.getUniqName(), fortranAttrs, cudaAttr);
 
     // Propagate other attributes from hlfir.declare to fir.declare.
     // OpenACC's acc.declare is one example. Right now, the propagation
@@ -343,16 +343,15 @@ public:
     auto firBase = firDeclareOp.getResult();
     mlir::Value hlfirBase;
     mlir::Type hlfirBaseType = declareOp.getBase().getType();
-    if (hlfirBaseType.isa<fir::BaseBoxType>()) {
+    if (mlir::isa<fir::BaseBoxType>(hlfirBaseType)) {
       fir::FirOpBuilder builder(rewriter, declareOp.getOperation());
       // Helper to generate the hlfir fir.box with the local lower bounds and
       // type parameters.
       auto genHlfirBox = [&]() -> mlir::Value {
-        if (!firBase.getType().isa<fir::BaseBoxType>()) {
+        if (!mlir::isa<fir::BaseBoxType>(firBase.getType())) {
           llvm::SmallVector<mlir::Value> typeParams;
-          auto maybeCharType =
-              fir::unwrapSequenceType(fir::unwrapPassByRefType(hlfirBaseType))
-                  .dyn_cast<fir::CharacterType>();
+          auto maybeCharType = mlir::dyn_cast<fir::CharacterType>(
+              fir::unwrapSequenceType(fir::unwrapPassByRefType(hlfirBaseType)));
           if (!maybeCharType || maybeCharType.hasDynamicLen())
             typeParams.append(declareOp.getTypeparams().begin(),
                               declareOp.getTypeparams().end());
@@ -399,7 +398,7 @@ public:
                         })
                         .getResults()[0];
       }
-    } else if (hlfirBaseType.isa<fir::BoxCharType>()) {
+    } else if (mlir::isa<fir::BoxCharType>(hlfirBaseType)) {
       assert(declareOp.getTypeparams().size() == 1 &&
              "must contain character length");
       hlfirBase = rewriter.create<fir::EmboxCharOp>(
@@ -480,11 +479,12 @@ public:
         // - scalar%scalar_component [substring|complex_part] or
         // - scalar%static_size_array_comp
         // - scalar%array(indices) [substring| complex part]
-        mlir::Type componentType = baseEleTy.cast<fir::RecordType>().getType(
-            designate.getComponent().value());
+        mlir::Type componentType =
+            mlir::cast<fir::RecordType>(baseEleTy).getType(
+                designate.getComponent().value());
         mlir::Type coorTy = fir::ReferenceType::get(componentType);
         base = builder.create<fir::CoordinateOp>(loc, coorTy, base, fieldIndex);
-        if (componentType.isa<fir::BaseBoxType>()) {
+        if (mlir::isa<fir::BaseBoxType>(componentType)) {
           auto variableInterface = mlir::cast<fir::FortranVariableOpInterface>(
               designate.getOperation());
           if (variableInterface.isAllocatable() ||
@@ -500,14 +500,14 @@ public:
       } else {
         // array%component[(indices) substring|complex part] cases.
         // Component ref of array bases are dealt with below in embox/rebox.
-        assert(designateResultType.isa<fir::BaseBoxType>());
+        assert(mlir::isa<fir::BaseBoxType>(designateResultType));
       }
     }
 
-    if (designateResultType.isa<fir::BaseBoxType>()) {
+    if (mlir::isa<fir::BaseBoxType>(designateResultType)) {
       // Generate embox or rebox.
       mlir::Type eleTy = fir::unwrapPassByRefType(designateResultType);
-      bool isScalarDesignator = !eleTy.isa<fir::SequenceType>();
+      bool isScalarDesignator = !mlir::isa<fir::SequenceType>(eleTy);
       mlir::Value sourceBox;
       if (isScalarDesignator) {
         // The base box will be used for emboxing the scalar element.
@@ -583,7 +583,7 @@ public:
         assert(sliceFields.empty() && substring.empty());
       llvm::SmallVector<mlir::Type> resultType{designateResultType};
       mlir::Value resultBox;
-      if (base.getType().isa<fir::BaseBoxType>())
+      if (mlir::isa<fir::BaseBoxType>(base.getType()))
         resultBox =
             builder.create<fir::ReboxOp>(loc, resultType, base, shape, slice);
       else
@@ -598,7 +598,8 @@ public:
     // first element of a contiguous array section with compile time constant
     // shape. The base may be an array, or a scalar.
     mlir::Type resultAddressType = designateResultType;
-    if (auto boxCharType = designateResultType.dyn_cast<fir::BoxCharType>())
+    if (auto boxCharType =
+            mlir::dyn_cast<fir::BoxCharType>(designateResultType))
       resultAddressType = fir::ReferenceType::get(boxCharType.getEleTy());
 
     // Array element indexing.
@@ -620,7 +621,7 @@ public:
     // Scalar complex part ref
     if (designate.getComplexPart()) {
       // Sequence types should have already been handled by this point
-      assert(!designateResultType.isa<fir::SequenceType>());
+      assert(!mlir::isa<fir::SequenceType>(designateResultType));
       auto index = builder.createIntegerConstant(loc, builder.getIndexType(),
                                                  *designate.getComplexPart());
       auto coorTy = fir::ReferenceType::get(resultEleTy);
@@ -628,7 +629,7 @@ public:
     }
 
     // Cast/embox the computed scalar address if needed.
-    if (designateResultType.isa<fir::BoxCharType>()) {
+    if (mlir::isa<fir::BoxCharType>(designateResultType)) {
       assert(designate.getTypeparams().size() == 1 &&
              "must have character length");
       auto emboxChar = builder.create<fir::EmboxCharOp>(
@@ -671,13 +672,13 @@ public:
                   mlir::PatternRewriter &rewriter) const override {
     mlir::Location loc = parentComponent.getLoc();
     mlir::Type resultType = parentComponent.getType();
-    if (!parentComponent.getType().isa<fir::BoxType>()) {
+    if (!mlir::isa<fir::BoxType>(parentComponent.getType())) {
       mlir::Value baseAddr = parentComponent.getMemref();
       // Scalar parent component ref without any length type parameters. The
       // input may be a fir.class if it is polymorphic, since this is a scalar
       // and the output will be monomorphic, the base address can be extracted
       // from the fir.class.
-      if (baseAddr.getType().isa<fir::BaseBoxType>())
+      if (mlir::isa<fir::BaseBoxType>(baseAddr.getType()))
         baseAddr = rewriter.create<fir::BoxAddrOp>(loc, baseAddr);
       rewriter.replaceOpWithNewOp<fir::ConvertOp>(parentComponent, resultType,
                                                   baseAddr);
@@ -686,7 +687,7 @@ public:
     // Array parent component ref or PDTs.
     hlfir::Entity base{parentComponent.getMemref()};
     mlir::Value baseAddr = base.getBase();
-    if (!baseAddr.getType().isa<fir::BaseBoxType>()) {
+    if (!mlir::isa<fir::BaseBoxType>(baseAddr.getType())) {
       // Embox cannot directly be used to address parent components: it expects
       // the output type to match the input type when there are no slices. When
       // the types have at least one component, a slice to the first element can
@@ -748,7 +749,7 @@ public:
     // the hlfir.shape_of operation which led to the creation of this get_extent
     // operation should now have been lowered to a fir.shape operation
     if (auto s = mlir::dyn_cast_or_null<fir::ShapeOp>(shapeOp)) {
-      fir::ShapeType shapeTy = shape.getType().cast<fir::ShapeType>();
+      fir::ShapeType shapeTy = mlir::cast<fir::ShapeType>(shape.getType());
       llvm::APInt dim = getExtentOp.getDim();
       uint64_t dimVal = dim.getLimitedValue(shapeTy.getRank());
       mlir::Value extent = s.getExtents()[dimVal];
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
index 0142fb0cfb0b..e9dbb7095d0e 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
@@ -185,7 +185,7 @@ protected:
 // the width for use in runtime intrinsic calls.
 static unsigned getKindForType(mlir::Type ty) {
   mlir::Type eltty = hlfir::getFortranElementType(ty);
-  unsigned width = eltty.cast<mlir::IntegerType>().getWidth();
+  unsigned width = mlir::cast<mlir::IntegerType>(eltty).getWidth();
   return width / 8;
 }
 
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
index 84101353a740..63b52c0cd0bc 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
@@ -1090,7 +1090,7 @@ void OrderedAssignmentRewriter::generateSaveEntity(
         mlir::Value loopExtent =
             computeLoopNestIterationNumber(loc, builder, loopNest);
         auto sequenceType =
-            builder.getVarLenSeqTy(entityType).cast<fir::SequenceType>();
+            mlir::cast<fir::SequenceType>(builder.getVarLenSeqTy(entityType));
         temp = insertSavedEntity(region,
                                  fir::factory::HomogeneousScalarStack{
                                      loc, builder, sequenceType, loopExtent,
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index 685c73d67625..8d68c7021608 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -249,7 +249,7 @@ static bool areIdenticalOrDisjointSlices(mlir::Value ref1, mlir::Value ref2) {
     auto isPositiveConstant = [](mlir::Value v) -> bool {
       if (auto conOp =
               mlir::dyn_cast<mlir::arith::ConstantOp>(v.getDefiningOp()))
-        if (auto iattr = conOp.getValue().dyn_cast<mlir::IntegerAttr>())
+        if (auto iattr = mlir::dyn_cast<mlir::IntegerAttr>(conOp.getValue()))
           return iattr.getInt() > 0;
       return false;
     };
@@ -601,7 +601,7 @@ mlir::LogicalResult VariableAssignBufferization::matchAndRewrite(
   // TODO: ExprType check is here to avoid conflicts with
   // ElementalAssignBufferization pattern. We need to combine
   // these matchers into a single one that applies to AssignOp.
-  if (rhs.getType().isa<hlfir::ExprType>())
+  if (mlir::isa<hlfir::ExprType>(rhs.getType()))
     return rewriter.notifyMatchFailure(assign, "RHS is not in memory");
 
   if (!rhs.isArray())
@@ -834,7 +834,7 @@ public:
 
     unsigned rank = mlir::cast<hlfir::ExprType>(mloc.getType()).getShape()[0];
     mlir::Type arrayType = array.getType();
-    if (!arrayType.isa<fir::BoxType>())
+    if (!mlir::isa<fir::BoxType>(arrayType))
       return rewriter.notifyMatchFailure(
           mloc, "Currently requires a boxed type input");
     mlir::Type elementType = hlfir::getFortranElementType(arrayType);
@@ -850,7 +850,7 @@ public:
 
     auto init = [isMax](fir::FirOpBuilder builder, mlir::Location loc,
                         mlir::Type elementType) {
-      if (auto ty = elementType.dyn_cast<mlir::FloatType>()) {
+      if (auto ty = mlir::dyn_cast<mlir::FloatType>(elementType)) {
         const llvm::fltSemantics &sem = ty.getFloatSemantics();
         llvm::APFloat limit = llvm::APFloat::getInf(sem, /*Negative=*/isMax);
         return builder.createRealConstant(loc, elementType, limit);
@@ -901,7 +901,7 @@ public:
 
       // Compare with the max reduction value
       mlir::Value cmp;
-      if (elementType.isa<mlir::FloatType>()) {
+      if (mlir::isa<mlir::FloatType>(elementType)) {
         // For FP reductions we want the first smallest value to be used, that
         // is not NaN. A OGL/OLT condition will usually work for this unless all
         // the values are Nan or Inf. This follows the same logic as
@@ -918,7 +918,7 @@ public:
             loc, mlir::arith::CmpFPredicate::OEQ, elem, elem);
         cmpNan = builder.create<mlir::arith::AndIOp>(loc, cmpNan, cmpNan2);
         cmp = builder.create<mlir::arith::OrIOp>(loc, cmp, cmpNan);
-      } else if (elementType.isa<mlir::IntegerType>()) {
+      } else if (mlir::isa<mlir::IntegerType>(elementType)) {
         cmp = builder.create<mlir::arith::CmpIOp>(
             loc,
             isMax ? mlir::arith::CmpIPredicate::sgt
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
index 2751575ce982..b761563eba0f 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
@@ -103,7 +103,8 @@ public:
     // by hlfir.elemental)
     target.addDynamicallyLegalOp<hlfir::TransposeOp>(
         [](hlfir::TransposeOp transpose) {
-          return transpose.getType().cast<hlfir::ExprType>().isPolymorphic();
+          return mlir::cast<hlfir::ExprType>(transpose.getType())
+              .isPolymorphic();
         });
     target.markUnknownOpDynamicallyLegal(
         [](mlir::Operation *) { return true; });
diff --git a/flang/lib/Optimizer/Transforms/AbstractResult.cpp b/flang/lib/Optimizer/Transforms/AbstractResult.cpp
index eb4dd637bb16..85472cdc5103 100644
--- a/flang/lib/Optimizer/Transforms/AbstractResult.cpp
+++ b/flang/lib/Optimizer/Transforms/AbstractResult.cpp
@@ -65,14 +65,14 @@ static mlir::FunctionType getCPtrFunctionType(mlir::FunctionType funcTy) {
   auto resultType = funcTy.getResult(0);
   assert(fir::isa_builtin_cptr_type(resultType));
   llvm::SmallVector<mlir::Type> outputTypes;
-  auto recTy = resultType.dyn_cast<fir::RecordType>();
+  auto recTy = mlir::dyn_cast<fir::RecordType>(resultType);
   outputTypes.emplace_back(recTy.getTypeList()[0].second);
   return mlir::FunctionType::get(funcTy.getContext(), funcTy.getInputs(),
                                  outputTypes);
 }
 
 static bool mustEmboxResult(mlir::Type resultType, bool shouldBoxResult) {
-  return resultType.isa<fir::SequenceType, fir::RecordType>() &&
+  return mlir::isa<fir::SequenceType, fir::RecordType>(resultType) &&
          shouldBoxResult;
 }
 
@@ -114,7 +114,7 @@ public:
     bool isResultBuiltinCPtr = fir::isa_builtin_cptr_type(result.getType());
     Op newOp;
     if (isResultBuiltinCPtr) {
-      auto recTy = result.getType().template dyn_cast<fir::RecordType>();
+      auto recTy = mlir::dyn_cast<fir::RecordType>(result.getType());
       newResultTypes.emplace_back(recTy.getTypeList()[0].second);
     }
 
@@ -261,7 +261,7 @@ public:
   mlir::LogicalResult
   matchAndRewrite(fir::AddrOfOp addrOf,
                   mlir::PatternRewriter &rewriter) const override {
-    auto oldFuncTy = addrOf.getType().cast<mlir::FunctionType>();
+    auto oldFuncTy = mlir::cast<mlir::FunctionType>(addrOf.getType());
     mlir::FunctionType newFuncTy;
     // TODO: This should be generalized for derived types, and it is
     // architecture and OS dependent.
@@ -296,7 +296,7 @@ public:
     auto loc = func.getLoc();
     auto *context = &getContext();
     // Convert function type itself if it has an abstract result.
-    auto funcTy = func.getFunctionType().cast<mlir::FunctionType>();
+    auto funcTy = mlir::cast<mlir::FunctionType>(func.getFunctionType());
     if (hasAbstractResult(funcTy)) {
       // TODO: This should be generalized for derived types, and it is
       // architecture and OS dependent.
@@ -343,11 +343,11 @@ public:
     return mlir::TypeSwitch<mlir::Type, bool>(type)
         .Case([](fir::BoxProcType boxProc) {
           return fir::hasAbstractResult(
-              boxProc.getEleTy().cast<mlir::FunctionType>());
+              mlir::cast<mlir::FunctionType>(boxProc.getEleTy()));
         })
         .Case([](fir::PointerType pointer) {
           return fir::hasAbstractResult(
-              pointer.getEleTy().cast<mlir::FunctionType>());
+              mlir::cast<mlir::FunctionType>(pointer.getEleTy()));
         })
         .Default([](auto &&) { return false; });
   }
@@ -411,7 +411,7 @@ public:
       return !hasAbstractResult(call.getFunctionType());
     });
     target.addDynamicallyLegalOp<fir::AddrOfOp>([](fir::AddrOfOp addrOf) {
-      if (auto funTy = addrOf.getType().dyn_cast<mlir::FunctionType>())
+      if (auto funTy = mlir::dyn_cast<mlir::FunctionType>(addrOf.getType()))
         return !hasAbstractResult(funTy);
       return true;
     });
diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index 68584bef055b..908c8fc96f63 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -11,6 +11,7 @@
 /// This pass populates some debug information for the module and functions.
 //===----------------------------------------------------------------------===//
 
+#include "DebugTypeGenerator.h"
 #include "flang/Common/Version.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Todo.h"
@@ -69,7 +70,7 @@ void AddDebugInfoPass::runOnOperation() {
   // In that case, 'inputFilename' may be empty. Location embedded in the
   // module will be used to get file name and its directory.
   if (inputFilename.empty()) {
-    if (auto fileLoc = module.getLoc().dyn_cast<mlir::FileLineColLoc>()) {
+    if (auto fileLoc = mlir::dyn_cast<mlir::FileLineColLoc>(module.getLoc())) {
       fileName = llvm::sys::path::filename(fileLoc.getFilename().getValue());
       filePath = llvm::sys::path::parent_path(fileLoc.getFilename().getValue());
     } else
@@ -94,26 +95,39 @@ void AddDebugInfoPass::runOnOperation() {
     mlir::Location l = funcOp->getLoc();
     // If fused location has already been created then nothing to do
     // Otherwise, create a fused location.
-    if (l.dyn_cast<mlir::FusedLoc>())
+    if (mlir::dyn_cast<mlir::FusedLoc>(l))
       return;
 
     unsigned int CC = (funcOp.getName() == fir::NameUniquer::doProgramEntry())
                           ? llvm::dwarf::getCallingConvention("DW_CC_program")
                           : llvm::dwarf::getCallingConvention("DW_CC_normal");
 
-    if (auto funcLoc = l.dyn_cast<mlir::FileLineColLoc>()) {
+    if (auto funcLoc = mlir::dyn_cast<mlir::FileLineColLoc>(l)) {
       fileName = llvm::sys::path::filename(funcLoc.getFilename().getValue());
       filePath = llvm::sys::path::parent_path(funcLoc.getFilename().getValue());
     }
 
-    mlir::StringAttr funcName =
+    mlir::StringAttr fullName =
         mlir::StringAttr::get(context, funcOp.getName());
-    mlir::LLVM::DIBasicTypeAttr bT = mlir::LLVM::DIBasicTypeAttr::get(
-        context, llvm::dwarf::DW_TAG_base_type, "void", /*sizeInBits=*/0,
-        /*encoding=*/1);
-    // FIXME: Provide proper type for subroutine
+    auto result = fir::NameUniquer::deconstruct(funcOp.getName());
+    mlir::StringAttr funcName =
+        mlir::StringAttr::get(context, result.second.name);
+
+    llvm::SmallVector<mlir::LLVM::DITypeAttr> types;
+    fir::DebugTypeGenerator typeGen(module);
+    for (auto resTy : funcOp.getResultTypes()) {
+      auto tyAttr =
+          typeGen.convertType(resTy, fileAttr, cuAttr, funcOp.getLoc());
+      types.push_back(tyAttr);
+    }
+    for (auto inTy : funcOp.getArgumentTypes()) {
+      auto tyAttr = typeGen.convertType(fir::unwrapRefType(inTy), fileAttr,
+                                        cuAttr, funcOp.getLoc());
+      types.push_back(tyAttr);
+    }
+
     mlir::LLVM::DISubroutineTypeAttr subTypeAttr =
-        mlir::LLVM::DISubroutineTypeAttr::get(context, CC, {bT, bT});
+        mlir::LLVM::DISubroutineTypeAttr::get(context, CC, types);
     mlir::LLVM::DIFileAttr funcFileAttr =
         mlir::LLVM::DIFileAttr::get(context, fileName, filePath);
 
@@ -130,11 +144,13 @@ void AddDebugInfoPass::runOnOperation() {
       subprogramFlags =
           subprogramFlags | mlir::LLVM::DISubprogramFlags::Definition;
     }
-    // FIXME: Provide proper line and scopeline.
+    unsigned line = 1;
+    if (auto funcLoc = mlir::dyn_cast<mlir::FileLineColLoc>(l))
+      line = funcLoc.getLine();
+
     auto spAttr = mlir::LLVM::DISubprogramAttr::get(
-        context, id, compilationUnit, fileAttr, funcName, funcName,
-        funcFileAttr, /*line=*/1, /*scopeline=*/1, subprogramFlags,
-        subTypeAttr);
+        context, id, compilationUnit, fileAttr, funcName, fullName,
+        funcFileAttr, line, line, subprogramFlags, subTypeAttr);
     funcOp->setLoc(builder.getFusedLoc({funcOp->getLoc()}, spAttr));
   });
 }
diff --git a/flang/lib/Optimizer/Transforms/AffineDemotion.cpp b/flang/lib/Optimizer/Transforms/AffineDemotion.cpp
index da29ae880700..b4523a060f5a 100644
--- a/flang/lib/Optimizer/Transforms/AffineDemotion.cpp
+++ b/flang/lib/Optimizer/Transforms/AffineDemotion.cpp
@@ -98,14 +98,15 @@ public:
   mlir::LogicalResult
   matchAndRewrite(fir::ConvertOp op,
                   mlir::PatternRewriter &rewriter) const override {
-    if (op.getRes().getType().isa<mlir::MemRefType>()) {
+    if (mlir::isa<mlir::MemRefType>(op.getRes().getType())) {
       // due to index calculation moving to affine maps we still need to
       // add converts for sequence types this has a side effect of losing
       // some information about arrays with known dimensions by creating:
       // fir.convert %arg0 : (!fir.ref<!fir.array<5xi32>>) ->
       // !fir.ref<!fir.array<?xi32>>
-      if (auto refTy = op.getValue().getType().dyn_cast<fir::ReferenceType>())
-        if (auto arrTy = refTy.getEleTy().dyn_cast<fir::SequenceType>()) {
+      if (auto refTy =
+              mlir::dyn_cast<fir::ReferenceType>(op.getValue().getType()))
+        if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(refTy.getEleTy())) {
           fir::SequenceType::Shape flatShape = {
               fir::SequenceType::getUnknownExtent()};
           auto flatArrTy = fir::SequenceType::get(flatShape, arrTy.getEleTy());
@@ -158,7 +159,7 @@ public:
     mlir::ConversionTarget target(*context);
     target.addIllegalOp<memref::AllocOp>();
     target.addDynamicallyLegalOp<fir::ConvertOp>([](fir::ConvertOp op) {
-      if (op.getRes().getType().isa<mlir::MemRefType>())
+      if (mlir::isa<mlir::MemRefType>(op.getRes().getType()))
         return false;
       return true;
     });
diff --git a/flang/lib/Optimizer/Transforms/AffinePromotion.cpp b/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
index 64531cb1868e..7d0131ac6fa4 100644
--- a/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
+++ b/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
@@ -111,7 +111,7 @@ private:
 
   bool analyzeReference(mlir::Value memref, mlir::Operation *op) {
     if (auto acoOp = memref.getDefiningOp<ArrayCoorOp>()) {
-      if (acoOp.getMemref().getType().isa<fir::BoxType>()) {
+      if (mlir::isa<fir::BoxType>(acoOp.getMemref().getType())) {
         // TODO: Look if and how fir.box can be promoted to affine.
         LLVM_DEBUG(llvm::dbgs() << "AffineLoopAnalysis: cannot promote loop, "
                                    "array memory operation uses fir.box\n";
@@ -222,7 +222,7 @@ private:
       return affineBinaryOp(mlir::AffineExprKind::Mod, op.getLhs(),
                             op.getRhs());
     if (auto op = value.getDefiningOp<mlir::arith::ConstantOp>())
-      if (auto intConstant = op.getValue().dyn_cast<IntegerAttr>())
+      if (auto intConstant = mlir::dyn_cast<IntegerAttr>(op.getValue()))
         return toAffineExpr(intConstant.getInt());
     if (auto blockArg = mlir::dyn_cast<mlir::BlockArgument>(value)) {
       affineArgs.push_back(value);
@@ -331,15 +331,16 @@ static mlir::AffineMap createArrayIndexAffineMap(unsigned dimensions,
 
 static std::optional<int64_t> constantIntegerLike(const mlir::Value value) {
   if (auto definition = value.getDefiningOp<mlir::arith::ConstantOp>())
-    if (auto stepAttr = definition.getValue().dyn_cast<IntegerAttr>())
+    if (auto stepAttr = mlir::dyn_cast<IntegerAttr>(definition.getValue()))
       return stepAttr.getInt();
   return {};
 }
 
 static mlir::Type coordinateArrayElement(fir::ArrayCoorOp op) {
   if (auto refType =
-          op.getMemref().getType().dyn_cast_or_null<ReferenceType>()) {
-    if (auto seqType = refType.getEleTy().dyn_cast_or_null<SequenceType>()) {
+          mlir::dyn_cast_or_null<ReferenceType>(op.getMemref().getType())) {
+    if (auto seqType =
+            mlir::dyn_cast_or_null<SequenceType>(refType.getEleTy())) {
       return seqType.getEleTy();
     }
   }
diff --git a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
index a08d58383d3a..ebc186222525 100644
--- a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
+++ b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
@@ -461,9 +461,9 @@ void ArrayCopyAnalysisBase::arrayMentions(
 }
 
 static bool hasPointerType(mlir::Type type) {
-  if (auto boxTy = type.dyn_cast<BoxType>())
+  if (auto boxTy = mlir::dyn_cast<BoxType>(type))
     type = boxTy.getEleTy();
-  return type.isa<fir::PointerType>();
+  return mlir::isa<fir::PointerType>(type);
 }
 
 // This is a NF performance hack. It makes a simple test that the slices of the
@@ -512,7 +512,7 @@ static bool mutuallyExclusiveSliceRange(ArrayLoadOp ld, ArrayMergeStoreOp st) {
     auto isPositiveConstant = [](mlir::Value v) -> bool {
       if (auto conOp =
               mlir::dyn_cast<mlir::arith::ConstantOp>(v.getDefiningOp()))
-        if (auto iattr = conOp.getValue().dyn_cast<mlir::IntegerAttr>())
+        if (auto iattr = mlir::dyn_cast<mlir::IntegerAttr>(conOp.getValue()))
           return iattr.getInt() > 0;
       return false;
     };
@@ -725,8 +725,8 @@ static bool
 conservativeCallConflict(llvm::ArrayRef<mlir::Operation *> reaches) {
   return llvm::any_of(reaches, [](mlir::Operation *op) {
     if (auto call = mlir::dyn_cast<fir::CallOp>(op))
-      if (auto callee =
-              call.getCallableForCallee().dyn_cast<mlir::SymbolRefAttr>()) {
+      if (auto callee = mlir::dyn_cast<mlir::SymbolRefAttr>(
+              call.getCallableForCallee())) {
         auto module = op->getParentOfType<mlir::ModuleOp>();
         return isInternalProcedure(
             module.lookupSymbol<mlir::func::FuncOp>(callee));
@@ -891,9 +891,9 @@ static mlir::Value getOrReadExtentsAndShapeOp(
   if (arrLoad->hasAttr(fir::getOptionalAttrName()))
     fir::emitFatalError(
         loc, "shapes from array load of OPTIONAL arrays must not be used");
-  if (auto boxTy = arrLoad.getMemref().getType().dyn_cast<BoxType>()) {
+  if (auto boxTy = mlir::dyn_cast<BoxType>(arrLoad.getMemref().getType())) {
     auto rank =
-        dyn_cast_ptrOrBoxEleTy(boxTy).cast<SequenceType>().getDimension();
+        mlir::cast<SequenceType>(dyn_cast_ptrOrBoxEleTy(boxTy)).getDimension();
     auto idxTy = rewriter.getIndexType();
     for (decltype(rank) dim = 0; dim < rank; ++dim) {
       auto dimVal = rewriter.create<mlir::arith::ConstantIndexOp>(loc, dim);
@@ -929,7 +929,7 @@ static mlir::Type toRefType(mlir::Type ty) {
 static llvm::SmallVector<mlir::Value>
 getTypeParamsIfRawData(mlir::Location loc, FirOpBuilder &builder,
                        ArrayLoadOp arrLoad, mlir::Type ty) {
-  if (ty.isa<BoxType>())
+  if (mlir::isa<BoxType>(ty))
     return {};
   return fir::factory::getTypeParams(loc, builder, arrLoad);
 }
@@ -947,8 +947,8 @@ static mlir::Value genCoorOp(mlir::PatternRewriter &rewriter,
     originated = factory::originateIndices(loc, rewriter, alloc.getType(),
                                            shape, indices);
   auto seqTy = dyn_cast_ptrOrBoxEleTy(alloc.getType());
-  assert(seqTy && seqTy.isa<SequenceType>());
-  const auto dimension = seqTy.cast<SequenceType>().getDimension();
+  assert(seqTy && mlir::isa<SequenceType>(seqTy));
+  const auto dimension = mlir::cast<SequenceType>(seqTy).getDimension();
   auto module = load->getParentOfType<mlir::ModuleOp>();
   FirOpBuilder builder(rewriter, module);
   auto typeparams = getTypeParamsIfRawData(loc, builder, load, alloc.getType());
@@ -967,7 +967,7 @@ static mlir::Value getCharacterLen(mlir::Location loc, FirOpBuilder &builder,
                                    ArrayLoadOp load, CharacterType charTy) {
   auto charLenTy = builder.getCharacterLengthType();
   if (charTy.hasDynamicLen()) {
-    if (load.getMemref().getType().isa<BoxType>()) {
+    if (mlir::isa<BoxType>(load.getMemref().getType())) {
       // The loaded array is an emboxed value. Get the CHARACTER length from
       // the box value.
       auto eleSzInBytes =
@@ -1027,7 +1027,7 @@ void genArrayCopy(mlir::Location loc, mlir::PatternRewriter &rewriter,
       getTypeParamsIfRawData(loc, builder, arrLoad, dst.getType()));
   auto eleTy = unwrapSequenceType(unwrapPassByRefType(dst.getType()));
   // Copy from (to) object to (from) temp copy of same object.
-  if (auto charTy = eleTy.dyn_cast<CharacterType>()) {
+  if (auto charTy = mlir::dyn_cast<CharacterType>(eleTy)) {
     auto len = getCharacterLen(loc, builder, arrLoad, charTy);
     CharBoxValue toChar(toAddr, len);
     CharBoxValue fromChar(fromAddr, len);
@@ -1049,8 +1049,8 @@ genArrayLoadTypeParameters(mlir::Location loc, mlir::PatternRewriter &rewriter,
     auto eleTy =
         unwrapSequenceType(unwrapPassByRefType(load.getMemref().getType()));
     if (hasDynamicSize(eleTy)) {
-      if (auto charTy = eleTy.dyn_cast<CharacterType>()) {
-        assert(load.getMemref().getType().isa<BoxType>());
+      if (auto charTy = mlir::dyn_cast<CharacterType>(eleTy)) {
+        assert(mlir::isa<BoxType>(load.getMemref().getType()));
         auto module = load->getParentOfType<mlir::ModuleOp>();
         FirOpBuilder builder(rewriter, module);
         return {getCharacterLen(loc, builder, load, charTy)};
@@ -1067,7 +1067,7 @@ findNonconstantExtents(mlir::Type memrefTy,
                        llvm::ArrayRef<mlir::Value> extents) {
   llvm::SmallVector<mlir::Value> nce;
   auto arrTy = unwrapPassByRefType(memrefTy);
-  auto seqTy = arrTy.cast<SequenceType>();
+  auto seqTy = mlir::cast<SequenceType>(arrTy);
   for (auto [s, x] : llvm::zip(seqTy.getShape(), extents))
     if (s == SequenceType::getUnknownExtent())
       nce.emplace_back(x);
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index fc08d67540ce..5a542f237f8f 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -22,6 +22,7 @@ add_flang_library(FIRTransforms
   OMPMarkDeclareTarget.cpp
   VScaleAttr.cpp
   FunctionAttr.cpp
+  DebugTypeGenerator.cpp
 
   DEPENDS
   FIRDialect
diff --git a/flang/lib/Optimizer/Transforms/CharacterConversion.cpp b/flang/lib/Optimizer/Transforms/CharacterConversion.cpp
index 2e8fc42487a5..44baad73aa25 100644
--- a/flang/lib/Optimizer/Transforms/CharacterConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CharacterConversion.cpp
@@ -60,8 +60,8 @@ public:
     // For each code point in the `from` string, convert naively to the `to`
     // string code point. Conversion is done blindly on size only, not value.
     auto getCharBits = [&](mlir::Type t) {
-      auto chrTy = fir::unwrapSequenceType(fir::dyn_cast_ptrEleTy(t))
-                       .cast<fir::CharacterType>();
+      auto chrTy = mlir::cast<fir::CharacterType>(
+          fir::unwrapSequenceType(fir::dyn_cast_ptrEleTy(t)));
       return kindMap.getCharacterBitsize(chrTy.getFKind());
     };
     auto fromBits = getCharBits(conv.getFrom().getType());
@@ -102,6 +102,9 @@ public:
 class CharacterConversion
     : public fir::impl::CharacterConversionBase<CharacterConversion> {
 public:
+  using fir::impl::CharacterConversionBase<
+      CharacterConversion>::CharacterConversionBase;
+
   void runOnOperation() override {
     CharacterConversionOptions clOpts{useRuntimeCalls.getValue()};
     if (clOpts.runtimeName.empty()) {
@@ -130,7 +133,3 @@ public:
   }
 };
 } // end anonymous namespace
-
-std::unique_ptr<mlir::Pass> fir::createCharacterConversionPass() {
-  return std::make_unique<CharacterConversion>();
-}
diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
new file mode 100644
index 000000000000..e5b4050dfb24
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
@@ -0,0 +1,67 @@
+//===-- DebugTypeGenerator.cpp -- type conversion ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "flang-debug-type-generator"
+
+#include "DebugTypeGenerator.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/Support/Debug.h"
+
+namespace fir {
+
+DebugTypeGenerator::DebugTypeGenerator(mlir::ModuleOp m)
+    : module(m), kindMapping(getKindMapping(m)) {
+  LLVM_DEBUG(llvm::dbgs() << "DITypeAttr generator\n");
+}
+
+static mlir::LLVM::DITypeAttr genPlaceholderType(mlir::MLIRContext *context) {
+  return mlir::LLVM::DIBasicTypeAttr::get(
+      context, llvm::dwarf::DW_TAG_base_type, "void", 32, 1);
+}
+
+static mlir::LLVM::DITypeAttr genBasicType(mlir::MLIRContext *context,
+                                           mlir::StringAttr name,
+                                           unsigned bitSize,
+                                           unsigned decoding) {
+  return mlir::LLVM::DIBasicTypeAttr::get(
+      context, llvm::dwarf::DW_TAG_base_type, name, bitSize, decoding);
+}
+
+mlir::LLVM::DITypeAttr
+DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr,
+                                mlir::LLVM::DIScopeAttr scope,
+                                mlir::Location loc) {
+  mlir::MLIRContext *context = module.getContext();
+  if (Ty.isInteger()) {
+    return genBasicType(context, mlir::StringAttr::get(context, "integer"),
+                        Ty.getIntOrFloatBitWidth(), llvm::dwarf::DW_ATE_signed);
+  } else if (mlir::isa<mlir::FloatType>(Ty)) {
+    return genBasicType(context, mlir::StringAttr::get(context, "real"),
+                        Ty.getIntOrFloatBitWidth(), llvm::dwarf::DW_ATE_float);
+  } else if (auto realTy = mlir::dyn_cast_or_null<fir::RealType>(Ty)) {
+    return genBasicType(context, mlir::StringAttr::get(context, "real"),
+                        kindMapping.getRealBitsize(realTy.getFKind()),
+                        llvm::dwarf::DW_ATE_float);
+  } else if (auto logTy = mlir::dyn_cast_or_null<fir::LogicalType>(Ty)) {
+    return genBasicType(context,
+                        mlir::StringAttr::get(context, logTy.getMnemonic()),
+                        kindMapping.getLogicalBitsize(logTy.getFKind()),
+                        llvm::dwarf::DW_ATE_boolean);
+  } else {
+    // FIXME: These types are currently unhandled. We are generating a
+    // placeholder type to allow us to test supported bits.
+    return genPlaceholderType(context);
+  }
+}
+
+} // namespace fir
diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h
new file mode 100644
index 000000000000..5a2bb201db47
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h
@@ -0,0 +1,40 @@
+//===-- DebugTypeGenerator.h -- type conversion ------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_TRANSFORMS_DEBUGTYPEGENERATOR_H
+#define FORTRAN_OPTIMIZER_TRANSFORMS_DEBUGTYPEGENERATOR_H
+
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Dialect/Support/FIRContext.h"
+#include "flang/Optimizer/Dialect/Support/KindMapping.h"
+#include "llvm/Support/Debug.h"
+
+namespace fir {
+
+/// This converts FIR/mlir type to DITypeAttr.
+class DebugTypeGenerator {
+public:
+  DebugTypeGenerator(mlir::ModuleOp module);
+
+  mlir::LLVM::DITypeAttr convertType(mlir::Type Ty,
+                                     mlir::LLVM::DIFileAttr fileAttr,
+                                     mlir::LLVM::DIScopeAttr scope,
+                                     mlir::Location loc);
+
+private:
+  mlir::ModuleOp module;
+  KindMapping kindMapping;
+};
+
+} // namespace fir
+
+#endif // FORTRAN_OPTIMIZER_TRANSFORMS_DEBUGTYPEGENERATOR_H
diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
index 7cbd2dd1f897..38cdc2b1388d 100644
--- a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
+++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
@@ -147,7 +147,7 @@ struct ArgsUsageInLoop {
 
 static fir::SequenceType getAsSequenceType(mlir::Value *v) {
   mlir::Type argTy = fir::unwrapPassByRefType(fir::unwrapRefType(v->getType()));
-  return argTy.dyn_cast<fir::SequenceType>();
+  return mlir::dyn_cast<fir::SequenceType>(argTy);
 }
 
 /// if a value comes from a fir.declare, follow it to the original source,
@@ -556,7 +556,3 @@ void LoopVersioningPass::runOnOperation() {
 
   LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n");
 }
-
-std::unique_ptr<mlir::Pass> fir::createLoopVersioningPass() {
-  return std::make_unique<LoopVersioningPass>();
-}
diff --git a/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp b/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp
index 166a6b10def2..ada67b4201e1 100644
--- a/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp
+++ b/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp
@@ -28,17 +28,6 @@ namespace fir {
 static constexpr std::size_t unlimitedArraySize = ~static_cast<std::size_t>(0);
 
 namespace {
-struct MemoryAllocationOptions {
-  // Always move dynamic array allocations to the heap. This may result in more
-  // heap fragmentation, so may impact performance negatively.
-  bool dynamicArrayOnHeap = false;
-
-  // Number of elements in array threshold for moving to heap. In environments
-  // with limited stack size, moving large arrays to the heap can avoid running
-  // out of stack space.
-  std::size_t maxStackArraySize = unlimitedArraySize;
-};
-
 class ReturnAnalysis {
 public:
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ReturnAnalysis)
@@ -68,14 +57,15 @@ private:
 
 /// Return `true` if this allocation is to remain on the stack (`fir.alloca`).
 /// Otherwise the allocation should be moved to the heap (`fir.allocmem`).
-static inline bool keepStackAllocation(fir::AllocaOp alloca, mlir::Block *entry,
-                                       const MemoryAllocationOptions &options) {
+static inline bool
+keepStackAllocation(fir::AllocaOp alloca, mlir::Block *entry,
+                    const fir::MemoryAllocationOptOptions &options) {
   // Limitation: only arrays allocated on the stack in the entry block are
   // considered for now.
   // TODO: Generalize the algorithm and placement of the freemem nodes.
   if (alloca->getBlock() != entry)
     return true;
-  if (auto seqTy = alloca.getInType().dyn_cast<fir::SequenceType>()) {
+  if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(alloca.getInType())) {
     if (fir::hasDynamicSize(seqTy)) {
       // Move all arrays with runtime determined size to the heap.
       if (options.dynamicArrayOnHeap)
@@ -168,6 +158,9 @@ public:
     options = {dynOnHeap, maxStackSize};
   }
 
+  MemoryAllocationOpt(const fir::MemoryAllocationOptOptions &options)
+      : options{options} {}
+
   /// Override `options` if command-line options have been set.
   inline void useCommandLineOptions() {
     if (dynamicArrayOnHeap)
@@ -211,15 +204,6 @@ public:
   }
 
 private:
-  MemoryAllocationOptions options;
+  fir::MemoryAllocationOptOptions options;
 };
 } // namespace
-
-std::unique_ptr<mlir::Pass> fir::createMemoryAllocationPass() {
-  return std::make_unique<MemoryAllocationOpt>();
-}
-
-std::unique_ptr<mlir::Pass>
-fir::createMemoryAllocationPass(bool dynOnHeap, std::size_t maxStackSize) {
-  return std::make_unique<MemoryAllocationOpt>(dynOnHeap, maxStackSize);
-}
diff --git a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
index 93efea434cb1..76c12d2de5c4 100644
--- a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
@@ -29,7 +29,6 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Support/CommandLine.h"
-#include <mutex>
 
 namespace fir {
 #define GEN_PASS_DEF_POLYMORPHICOPCONVERSION
@@ -48,9 +47,8 @@ class SelectTypeConv : public OpConversionPattern<fir::SelectTypeOp> {
 public:
   using OpConversionPattern<fir::SelectTypeOp>::OpConversionPattern;
 
-  SelectTypeConv(mlir::MLIRContext *ctx, std::mutex *moduleMutex)
-      : mlir::OpConversionPattern<fir::SelectTypeOp>(ctx),
-        moduleMutex(moduleMutex) {}
+  SelectTypeConv(mlir::MLIRContext *ctx)
+      : mlir::OpConversionPattern<fir::SelectTypeOp>(ctx) {}
 
   mlir::LogicalResult
   matchAndRewrite(fir::SelectTypeOp selectType, OpAdaptor adaptor,
@@ -72,9 +70,6 @@ private:
 
   llvm::SmallSet<llvm::StringRef, 4> collectAncestors(fir::TypeInfoOp dt,
                                                       mlir::ModuleOp mod) const;
-
-  // Mutex used to guard insertion of mlir::func::FuncOp in the module.
-  std::mutex *moduleMutex;
 };
 
 /// Lower `fir.dispatch` operation. A virtual call to a method in a dispatch
@@ -97,8 +92,8 @@ struct DispatchOpConv : public OpConversionPattern<fir::DispatchOp> {
     // Get derived type information.
     mlir::Type declaredType =
         fir::getDerivedType(dispatch.getObject().getType().getEleTy());
-    assert(declaredType.isa<fir::RecordType>() && "expecting fir.type");
-    auto recordType = declaredType.dyn_cast<fir::RecordType>();
+    assert(mlir::isa<fir::RecordType>(declaredType) && "expecting fir.type");
+    auto recordType = mlir::dyn_cast<fir::RecordType>(declaredType);
 
     // Lookup for the binding table.
     auto bindingsIter = bindingTables.find(recordType.getName());
@@ -157,7 +152,7 @@ struct DispatchOpConv : public OpConversionPattern<fir::DispatchOp> {
 
     // Load the bindings descriptor.
     auto bindingsCompName = Fortran::semantics::bindingDescCompName;
-    fir::RecordType typeDescRecTy = typeDescTy.cast<fir::RecordType>();
+    fir::RecordType typeDescRecTy = mlir::cast<fir::RecordType>(typeDescTy);
     mlir::Value field = rewriter.create<fir::FieldIndexOp>(
         loc, fieldTy, bindingsCompName, typeDescRecTy, mlir::ValueRange{});
     mlir::Type coorTy =
@@ -168,8 +163,8 @@ struct DispatchOpConv : public OpConversionPattern<fir::DispatchOp> {
 
     // Load the correct binding.
     mlir::Value bindings = rewriter.create<fir::BoxAddrOp>(loc, bindingBox);
-    fir::RecordType bindingTy =
-        fir::unwrapIfDerived(bindingBox.getType().cast<fir::BaseBoxType>());
+    fir::RecordType bindingTy = fir::unwrapIfDerived(
+        mlir::cast<fir::BaseBoxType>(bindingBox.getType()));
     mlir::Type bindingAddrTy = fir::ReferenceType::get(bindingTy);
     mlir::Value bindingIdxVal = rewriter.create<mlir::arith::ConstantOp>(
         loc, rewriter.getIndexType(), rewriter.getIndexAttr(bindingIdx));
@@ -181,7 +176,7 @@ struct DispatchOpConv : public OpConversionPattern<fir::DispatchOp> {
     mlir::Value procField = rewriter.create<fir::FieldIndexOp>(
         loc, fieldTy, procCompName, bindingTy, mlir::ValueRange{});
     fir::RecordType procTy =
-        bindingTy.getType(procCompName).cast<fir::RecordType>();
+        mlir::cast<fir::RecordType>(bindingTy.getType(procCompName));
     mlir::Type procRefTy = fir::ReferenceType::get(procTy);
     mlir::Value procRef = rewriter.create<fir::CoordinateOp>(
         loc, procRefTy, bindingAddr, procField);
@@ -223,19 +218,18 @@ class PolymorphicOpConversion
     : public fir::impl::PolymorphicOpConversionBase<PolymorphicOpConversion> {
 public:
   mlir::LogicalResult initialize(mlir::MLIRContext *ctx) override {
-    moduleMutex = new std::mutex();
     return mlir::success();
   }
 
   void runOnOperation() override {
     auto *context = &getContext();
-    auto mod = getOperation()->getParentOfType<ModuleOp>();
+    mlir::ModuleOp mod = getOperation();
     mlir::RewritePatternSet patterns(context);
 
     BindingTables bindingTables;
     buildBindingTables(bindingTables, mod);
 
-    patterns.insert<SelectTypeConv>(context, moduleMutex);
+    patterns.insert<SelectTypeConv>(context);
     patterns.insert<DispatchOpConv>(context, bindingTables);
     mlir::ConversionTarget target(*context);
     target.addLegalDialect<mlir::affine::AffineDialect,
@@ -253,9 +247,6 @@ public:
       signalPassFailure();
     }
   }
-
-private:
-  std::mutex *moduleMutex;
 };
 } // namespace
 
@@ -298,13 +289,13 @@ mlir::LogicalResult SelectTypeConv::matchAndRewrite(
   //   before in the list to respect point 3. above. Otherwise it is just
   //   added in order at the end.
   for (unsigned t = 0; t < typeGuardNum; ++t) {
-    if (auto a = typeGuards[t].dyn_cast<fir::ExactTypeAttr>()) {
+    if (auto a = mlir::dyn_cast<fir::ExactTypeAttr>(typeGuards[t])) {
       orderedTypeGuards.push_back(t);
       continue;
     }
 
-    if (auto a = typeGuards[t].dyn_cast<fir::SubclassAttr>()) {
-      if (auto recTy = a.getType().dyn_cast<fir::RecordType>()) {
+    if (auto a = mlir::dyn_cast<fir::SubclassAttr>(typeGuards[t])) {
+      if (auto recTy = mlir::dyn_cast<fir::RecordType>(a.getType())) {
         auto dt = mod.lookupSymbol<fir::TypeInfoOp>(recTy.getName());
         assert(dt && "dispatch table not found");
         llvm::SmallSet<llvm::StringRef, 4> ancestors =
@@ -313,8 +304,8 @@ mlir::LogicalResult SelectTypeConv::matchAndRewrite(
           auto it = orderedClassIsGuards.begin();
           while (it != orderedClassIsGuards.end()) {
             fir::SubclassAttr sAttr =
-                typeGuards[*it].dyn_cast<fir::SubclassAttr>();
-            if (auto ty = sAttr.getType().dyn_cast<fir::RecordType>()) {
+                mlir::dyn_cast<fir::SubclassAttr>(typeGuards[*it]);
+            if (auto ty = mlir::dyn_cast<fir::RecordType>(sAttr.getType())) {
               if (ancestors.contains(ty.getName()))
                 break;
             }
@@ -339,7 +330,7 @@ mlir::LogicalResult SelectTypeConv::matchAndRewrite(
     auto *dest = selectType.getSuccessor(idx);
     std::optional<mlir::ValueRange> destOps =
         selectType.getSuccessorOperands(operands, idx);
-    if (typeGuards[idx].dyn_cast<mlir::UnitAttr>())
+    if (mlir::dyn_cast<mlir::UnitAttr>(typeGuards[idx]))
       rewriter.replaceOpWithNewOp<mlir::cf::BranchOp>(
           selectType, dest, destOps.value_or(mlir::ValueRange{}));
     else if (mlir::failed(genTypeLadderStep(loc, selector, typeGuards[idx],
@@ -357,9 +348,9 @@ mlir::LogicalResult SelectTypeConv::genTypeLadderStep(
     fir::KindMapping &kindMap) const {
   mlir::Value cmp;
   // TYPE IS type guard comparison are all done inlined.
-  if (auto a = attr.dyn_cast<fir::ExactTypeAttr>()) {
+  if (auto a = mlir::dyn_cast<fir::ExactTypeAttr>(attr)) {
     if (fir::isa_trivial(a.getType()) ||
-        a.getType().isa<fir::CharacterType>()) {
+        mlir::isa<fir::CharacterType>(a.getType())) {
       // For type guard statement with Intrinsic type spec the type code of
       // the descriptor is compared.
       int code = fir::getTypeCode(a.getType(), kindMap);
@@ -383,10 +374,10 @@ mlir::LogicalResult SelectTypeConv::genTypeLadderStep(
       cmp = res;
     }
     // CLASS IS type guard statement is done with a runtime call.
-  } else if (auto a = attr.dyn_cast<fir::SubclassAttr>()) {
+  } else if (auto a = mlir::dyn_cast<fir::SubclassAttr>(attr)) {
     // Retrieve the type descriptor from the type guard statement record type.
-    assert(a.getType().isa<fir::RecordType>() && "expect fir.record type");
-    fir::RecordType recTy = a.getType().dyn_cast<fir::RecordType>();
+    assert(mlir::isa<fir::RecordType>(a.getType()) && "expect fir.record type");
+    fir::RecordType recTy = mlir::dyn_cast<fir::RecordType>(a.getType());
     std::string typeDescName =
         fir::NameUniquer::getTypeDescriptorName(recTy.getName());
     auto typeDescGlobal = mod.lookupSymbol<fir::GlobalOp>(typeDescName);
@@ -408,7 +399,6 @@ mlir::LogicalResult SelectTypeConv::genTypeLadderStep(
     {
       // Since conversion is done in parallel for each fir.select_type
       // operation, the runtime function insertion must be threadsafe.
-      std::lock_guard<std::mutex> lock(*moduleMutex);
       callee =
           fir::createFuncOp(rewriter.getUnknownLoc(), mod, fctName,
                             rewriter.getFunctionType({descNoneTy, typeDescTy},
@@ -438,8 +428,8 @@ mlir::Value
 SelectTypeConv::genTypeDescCompare(mlir::Location loc, mlir::Value selector,
                                    mlir::Type ty, mlir::ModuleOp mod,
                                    mlir::PatternRewriter &rewriter) const {
-  assert(ty.isa<fir::RecordType>() && "expect fir.record type");
-  fir::RecordType recTy = ty.dyn_cast<fir::RecordType>();
+  assert(mlir::isa<fir::RecordType>(ty) && "expect fir.record type");
+  fir::RecordType recTy = mlir::dyn_cast<fir::RecordType>(ty);
   std::string typeDescName =
       fir::NameUniquer::getTypeDescriptorName(recTy.getName());
   auto typeDescGlobal = mod.lookupSymbol<fir::GlobalOp>(typeDescName);
@@ -471,7 +461,3 @@ SelectTypeConv::collectAncestors(fir::TypeInfoOp dt, mlir::ModuleOp mod) const {
   }
   return ancestors;
 }
-
-std::unique_ptr<mlir::Pass> fir::createPolymorphicOpConversionPass() {
-  return std::make_unique<PolymorphicOpConversion>();
-}
diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
index f7820b6b8170..c61179a7460e 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
@@ -72,6 +72,9 @@ class SimplifyIntrinsicsPass
       mlir::Type elementType)>;
 
 public:
+  using fir::impl::SimplifyIntrinsicsBase<
+      SimplifyIntrinsicsPass>::SimplifyIntrinsicsBase;
+
   /// Generate a new function implementing a simplified version
   /// of a Fortran runtime function defined by \p basename name.
   /// \p typeGenerator is a callback that generates the new function's type.
@@ -212,8 +215,8 @@ static unsigned getDimCount(mlir::Value val) {
   // the first ConvertOp that has non-opaque box type that we meet
   // going through the ConvertOp chain.
   if (mlir::Value emboxVal = findBoxDef(val))
-    if (auto boxTy = emboxVal.getType().dyn_cast<fir::BoxType>())
-      if (auto seqTy = boxTy.getEleTy().dyn_cast<fir::SequenceType>())
+    if (auto boxTy = mlir::dyn_cast<fir::BoxType>(emboxVal.getType()))
+      if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxTy.getEleTy()))
         return seqTy.getDimension();
   return 0;
 }
@@ -234,9 +237,9 @@ static std::optional<mlir::Type> getArgElementType(mlir::Value val) {
     val = defOp->getOperand(0);
     // The convert operation is expected to convert from one
     // box type to another box type.
-    auto boxType = val.getType().cast<fir::BoxType>();
+    auto boxType = mlir::cast<fir::BoxType>(val.getType());
     auto elementType = fir::unwrapSeqOrBoxedSeqType(boxType);
-    if (!elementType.isa<mlir::NoneType>())
+    if (!mlir::isa<mlir::NoneType>(elementType))
       return elementType;
   } while (true);
 }
@@ -378,7 +381,7 @@ static void genRuntimeSumBody(fir::FirOpBuilder &builder,
   // end function RTNAME(Sum)<T>x<rank>_simplified
   auto zero = [](fir::FirOpBuilder builder, mlir::Location loc,
                  mlir::Type elementType) {
-    if (auto ty = elementType.dyn_cast<mlir::FloatType>()) {
+    if (auto ty = mlir::dyn_cast<mlir::FloatType>(elementType)) {
       const llvm::fltSemantics &sem = ty.getFloatSemantics();
       return builder.createRealConstant(loc, elementType,
                                         llvm::APFloat::getZero(sem));
@@ -389,9 +392,9 @@ static void genRuntimeSumBody(fir::FirOpBuilder &builder,
   auto genBodyOp = [](fir::FirOpBuilder builder, mlir::Location loc,
                       mlir::Type elementType, mlir::Value elem1,
                       mlir::Value elem2) -> mlir::Value {
-    if (elementType.isa<mlir::FloatType>())
+    if (mlir::isa<mlir::FloatType>(elementType))
       return builder.create<mlir::arith::AddFOp>(loc, elem1, elem2);
-    if (elementType.isa<mlir::IntegerType>())
+    if (mlir::isa<mlir::IntegerType>(elementType))
       return builder.create<mlir::arith::AddIOp>(loc, elem1, elem2);
 
     llvm_unreachable("unsupported type");
@@ -411,7 +414,7 @@ static void genRuntimeMaxvalBody(fir::FirOpBuilder &builder,
                                  mlir::Type elementType) {
   auto init = [](fir::FirOpBuilder builder, mlir::Location loc,
                  mlir::Type elementType) {
-    if (auto ty = elementType.dyn_cast<mlir::FloatType>()) {
+    if (auto ty = mlir::dyn_cast<mlir::FloatType>(elementType)) {
       const llvm::fltSemantics &sem = ty.getFloatSemantics();
       return builder.createRealConstant(
           loc, elementType, llvm::APFloat::getLargest(sem, /*Negative=*/true));
@@ -424,7 +427,7 @@ static void genRuntimeMaxvalBody(fir::FirOpBuilder &builder,
   auto genBodyOp = [](fir::FirOpBuilder builder, mlir::Location loc,
                       mlir::Type elementType, mlir::Value elem1,
                       mlir::Value elem2) -> mlir::Value {
-    if (elementType.isa<mlir::FloatType>()) {
+    if (mlir::isa<mlir::FloatType>(elementType)) {
       // arith.maxf later converted to llvm.intr.maxnum does not work
       // correctly for NaNs and -0.0 (see maxnum/minnum pattern matching
       // in LLVM's InstCombine pass). Moreover, llvm.intr.maxnum
@@ -436,7 +439,7 @@ static void genRuntimeMaxvalBody(fir::FirOpBuilder &builder,
           loc, mlir::arith::CmpFPredicate::OGT, elem1, elem2);
       return builder.create<mlir::arith::SelectOp>(loc, compare, elem1, elem2);
     }
-    if (elementType.isa<mlir::IntegerType>())
+    if (mlir::isa<mlir::IntegerType>(elementType))
       return builder.create<mlir::arith::MaxSIOp>(loc, elem1, elem2);
 
     llvm_unreachable("unsupported type");
@@ -659,7 +662,7 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder,
                                     mlir::Type resultElemTy, bool isDim) {
   auto init = [isMax](fir::FirOpBuilder builder, mlir::Location loc,
                       mlir::Type elementType) {
-    if (auto ty = elementType.dyn_cast<mlir::FloatType>()) {
+    if (auto ty = mlir::dyn_cast<mlir::FloatType>(elementType)) {
       const llvm::fltSemantics &sem = ty.getFloatSemantics();
       llvm::APFloat limit = llvm::APFloat::getInf(sem, /*Negative=*/isMax);
       return builder.createRealConstant(loc, elementType, limit);
@@ -741,7 +744,7 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder,
     mlir::Value elem = builder.create<fir::LoadOp>(loc, addr);
 
     mlir::Value cmp;
-    if (elementType.isa<mlir::FloatType>()) {
+    if (mlir::isa<mlir::FloatType>(elementType)) {
       // For FP reductions we want the first smallest value to be used, that
       // is not NaN. A OGL/OLT condition will usually work for this unless all
       // the values are Nan or Inf. This follows the same logic as
@@ -758,7 +761,7 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder,
           loc, mlir::arith::CmpFPredicate::OEQ, elem, elem);
       cmpNan = builder.create<mlir::arith::AndIOp>(loc, cmpNan, cmpNan2);
       cmp = builder.create<mlir::arith::OrIOp>(loc, cmp, cmpNan);
-    } else if (elementType.isa<mlir::IntegerType>()) {
+    } else if (mlir::isa<mlir::IntegerType>(elementType)) {
       cmp = builder.create<mlir::arith::CmpIOp>(
           loc,
           isMax ? mlir::arith::CmpIPredicate::sgt
@@ -836,7 +839,7 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder,
 
     builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
     mlir::Value basicValue;
-    if (elementType.isa<mlir::IntegerType>()) {
+    if (mlir::isa<mlir::IntegerType>(elementType)) {
       basicValue = builder.createIntegerConstant(loc, elementType, 0);
     } else {
       basicValue = builder.createRealConstant(loc, elementType, 0);
@@ -918,7 +921,7 @@ static void genRuntimeDotBody(fir::FirOpBuilder &builder,
   mlir::IndexType idxTy = builder.getIndexType();
 
   mlir::Value zero =
-      resultElementType.isa<mlir::FloatType>()
+      mlir::isa<mlir::FloatType>(resultElementType)
           ? builder.createRealConstant(loc, resultElementType, 0.0)
           : builder.createIntegerConstant(loc, resultElementType, 0);
 
@@ -975,10 +978,10 @@ static void genRuntimeDotBody(fir::FirOpBuilder &builder,
   // Convert to the result type.
   elem2 = builder.create<fir::ConvertOp>(loc, resultElementType, elem2);
 
-  if (resultElementType.isa<mlir::FloatType>())
+  if (mlir::isa<mlir::FloatType>(resultElementType))
     sumVal = builder.create<mlir::arith::AddFOp>(
         loc, builder.create<mlir::arith::MulFOp>(loc, elem1, elem2), sumVal);
-  else if (resultElementType.isa<mlir::IntegerType>())
+  else if (mlir::isa<mlir::IntegerType>(resultElementType))
     sumVal = builder.create<mlir::arith::AddIOp>(
         loc, builder.create<mlir::arith::MulIOp>(loc, elem1, elem2), sumVal);
   else
@@ -1053,8 +1056,8 @@ void SimplifyIntrinsicsPass::simplifyIntOrFloatReduction(
 
   mlir::Type resultType = call.getResult(0).getType();
 
-  if (!resultType.isa<mlir::FloatType>() &&
-      !resultType.isa<mlir::IntegerType>())
+  if (!mlir::isa<mlir::FloatType>(resultType) &&
+      !mlir::isa<mlir::IntegerType>(resultType))
     return;
 
   auto argType = getArgElementType(args[0]);
@@ -1100,7 +1103,8 @@ void SimplifyIntrinsicsPass::simplifyLogicalDim0Reduction(
   fir::FirOpBuilder builder{getSimplificationBuilder(call, kindMap)};
 
   // Treating logicals as integers makes things a lot easier
-  fir::LogicalType logicalType = {elementType.dyn_cast<fir::LogicalType>()};
+  fir::LogicalType logicalType = {
+      mlir::dyn_cast<fir::LogicalType>(elementType)};
   fir::KindTy kind = logicalType.getFKind();
   mlir::Type intElementType = builder.getIntegerType(kind * 8);
 
@@ -1135,7 +1139,8 @@ void SimplifyIntrinsicsPass::simplifyLogicalDim1Reduction(
   fir::FirOpBuilder builder{getSimplificationBuilder(call, kindMap)};
 
   // Treating logicals as integers makes things a lot easier
-  fir::LogicalType logicalType = {elementType.dyn_cast<fir::LogicalType>()};
+  fir::LogicalType logicalType = {
+      mlir::dyn_cast<fir::LogicalType>(elementType)};
   fir::KindTy kind = logicalType.getFKind();
   mlir::Type intElementType = builder.getIntegerType(kind * 8);
 
@@ -1179,7 +1184,7 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction(
   auto inputBox = findBoxDef(args[1]);
   mlir::Type inputType = hlfir::getFortranElementType(inputBox.getType());
 
-  if (inputType.isa<fir::CharacterType>())
+  if (mlir::isa<fir::CharacterType>(inputType))
     return;
 
   int maskRank;
@@ -1190,7 +1195,8 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction(
   } else {
     maskRank = getDimCount(mask);
     mlir::Type maskElemTy = hlfir::getFortranElementType(maskDef.getType());
-    fir::LogicalType logicalFirType = {maskElemTy.dyn_cast<fir::LogicalType>()};
+    fir::LogicalType logicalFirType = {
+        mlir::dyn_cast<fir::LogicalType>(maskElemTy)};
     kind = logicalFirType.getFKind();
     // Convert fir::LogicalType to mlir::Type
     logicalElemType = logicalFirType;
@@ -1299,7 +1305,8 @@ void SimplifyIntrinsicsPass::runOnOperation() {
           std::string fmfString{builder.getFastMathFlagsString()};
 
           mlir::Type type = call.getResult(0).getType();
-          if (!type.isa<mlir::FloatType>() && !type.isa<mlir::IntegerType>())
+          if (!mlir::isa<mlir::FloatType>(type) &&
+              !mlir::isa<mlir::IntegerType>(type))
             return;
 
           // Try to find the element types of the boxed arguments.
@@ -1311,11 +1318,9 @@ void SimplifyIntrinsicsPass::runOnOperation() {
 
           // Support only floating point and integer arguments
           // now (e.g. logical is skipped here).
-          if (!arg1Type->isa<mlir::FloatType>() &&
-              !arg1Type->isa<mlir::IntegerType>())
+          if (!mlir::isa<mlir::FloatType, mlir::IntegerType>(*arg1Type))
             return;
-          if (!arg2Type->isa<mlir::FloatType>() &&
-              !arg2Type->isa<mlir::IntegerType>())
+          if (!mlir::isa<mlir::FloatType, mlir::IntegerType>(*arg2Type))
             return;
 
           auto typeGenerator = [&type](fir::FirOpBuilder &builder) {
@@ -1387,6 +1392,3 @@ void SimplifyIntrinsicsPass::getDependentDialects(
   // LLVM::LinkageAttr creation requires that LLVM dialect is loaded.
   registry.insert<mlir::LLVM::LLVMDialect>();
 }
-std::unique_ptr<mlir::Pass> fir::createSimplifyIntrinsicsPass() {
-  return std::make_unique<SimplifyIntrinsicsPass>();
-}
diff --git a/flang/lib/Optimizer/Transforms/SimplifyRegionLite.cpp b/flang/lib/Optimizer/Transforms/SimplifyRegionLite.cpp
index 3fe6bed12cf4..7d1f86f8cee9 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyRegionLite.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyRegionLite.cpp
@@ -45,7 +45,3 @@ void SimplifyRegionLitePass::runOnOperation() {
   (void)mlir::eraseUnreachableBlocks(rewriter, regions);
   (void)mlir::runRegionDCE(rewriter, regions);
 }
-
-std::unique_ptr<mlir::Pass> fir::createSimplifyRegionLitePass() {
-  return std::make_unique<SimplifyRegionLitePass>();
-}
diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp
index 1c213abefe6f..16bbb1c35646 100644
--- a/flang/lib/Optimizer/Transforms/StackArrays.cpp
+++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp
@@ -351,7 +351,7 @@ void AllocationAnalysis::visitOperation(mlir::Operation *op,
     }
 
     auto retTy = allocmem.getAllocatedType();
-    if (!retTy.isa<fir::SequenceType>()) {
+    if (!mlir::isa<fir::SequenceType>(retTy)) {
       LLVM_DEBUG(llvm::dbgs()
                  << "--Allocation is not for an array: skipping\n");
       return;
@@ -776,7 +776,3 @@ void StackArraysPass::runOnFunc(mlir::Operation *func) {
     signalPassFailure();
   }
 }
-
-std::unique_ptr<mlir::Pass> fir::createStackArraysPass() {
-  return std::make_unique<StackArraysPass>();
-}
diff --git a/flang/lib/Parser/Fortran-parsers.cpp b/flang/lib/Parser/Fortran-parsers.cpp
index 2bdb8e38db95..ff01974b549a 100644
--- a/flang/lib/Parser/Fortran-parsers.cpp
+++ b/flang/lib/Parser/Fortran-parsers.cpp
@@ -123,7 +123,8 @@ TYPE_PARSER(first(
 TYPE_CONTEXT_PARSER("internal subprogram"_en_US,
     (construct<InternalSubprogram>(indirect(functionSubprogram)) ||
         construct<InternalSubprogram>(indirect(subroutineSubprogram))) /
-        forceEndOfStmt)
+            forceEndOfStmt ||
+        construct<InternalSubprogram>(indirect(compilerDirective)))
 
 // R511 internal-subprogram-part -> contains-stmt [internal-subprogram]...
 TYPE_CONTEXT_PARSER("internal subprogram part"_en_US,
diff --git a/flang/lib/Parser/preprocessor.cpp b/flang/lib/Parser/preprocessor.cpp
index 2fba28b0c0c7..ce95dc4b7aae 100644
--- a/flang/lib/Parser/preprocessor.cpp
+++ b/flang/lib/Parser/preprocessor.cpp
@@ -593,8 +593,11 @@ void Preprocessor::Directive(const TokenSequence &dir, Prescanner &prescanner) {
           "# missing or invalid name"_err_en_US);
     } else {
       if (dir.IsAnythingLeft(++j)) {
-        prescanner.Say(dir.GetIntervalProvenanceRange(j, tokens - j),
-            "#undef: excess tokens at end of directive"_port_en_US);
+        if (prescanner.features().ShouldWarn(
+                common::UsageWarning::Portability)) {
+          prescanner.Say(dir.GetIntervalProvenanceRange(j, tokens - j),
+              "#undef: excess tokens at end of directive"_port_en_US);
+        }
       } else {
         definitions_.erase(nameToken);
       }
@@ -607,8 +610,11 @@ void Preprocessor::Directive(const TokenSequence &dir, Prescanner &prescanner) {
           "#%s: missing name"_err_en_US, dirName);
     } else {
       if (dir.IsAnythingLeft(++j)) {
-        prescanner.Say(dir.GetIntervalProvenanceRange(j, tokens - j),
-            "#%s: excess tokens at end of directive"_port_en_US, dirName);
+        if (prescanner.features().ShouldWarn(
+                common::UsageWarning::Portability)) {
+          prescanner.Say(dir.GetIntervalProvenanceRange(j, tokens - j),
+              "#%s: excess tokens at end of directive"_port_en_US, dirName);
+        }
       }
       doThen = IsNameDefined(nameToken) == (dirName == "ifdef");
     }
@@ -627,8 +633,10 @@ void Preprocessor::Directive(const TokenSequence &dir, Prescanner &prescanner) {
     }
   } else if (dirName == "else") {
     if (dir.IsAnythingLeft(j)) {
-      prescanner.Say(dir.GetIntervalProvenanceRange(j, tokens - j),
-          "#else: excess tokens at end of directive"_port_en_US);
+      if (prescanner.features().ShouldWarn(common::UsageWarning::Portability)) {
+        prescanner.Say(dir.GetIntervalProvenanceRange(j, tokens - j),
+            "#else: excess tokens at end of directive"_port_en_US);
+      }
     } else if (ifStack_.empty()) {
       prescanner.Say(dir.GetTokenProvenanceRange(dirOffset),
           "#else: not nested within #if, #ifdef, or #ifndef"_err_en_US);
@@ -654,8 +662,10 @@ void Preprocessor::Directive(const TokenSequence &dir, Prescanner &prescanner) {
     }
   } else if (dirName == "endif") {
     if (dir.IsAnythingLeft(j)) {
-      prescanner.Say(dir.GetIntervalProvenanceRange(j, tokens - j),
-          "#endif: excess tokens at end of directive"_port_en_US);
+      if (prescanner.features().ShouldWarn(common::UsageWarning::Portability)) {
+        prescanner.Say(dir.GetIntervalProvenanceRange(j, tokens - j),
+            "#endif: excess tokens at end of directive"_port_en_US);
+      }
     } else if (ifStack_.empty()) {
       prescanner.Say(dir.GetTokenProvenanceRange(dirOffset),
           "#endif: no #if, #ifdef, or #ifndef"_err_en_US);
@@ -702,8 +712,11 @@ void Preprocessor::Directive(const TokenSequence &dir, Prescanner &prescanner) {
         ++k;
       }
       if (k >= pathTokens) {
-        prescanner.Say(dir.GetIntervalProvenanceRange(j, tokens - j),
-            "#include: expected '>' at end of included file"_port_en_US);
+        if (prescanner.features().ShouldWarn(
+                common::UsageWarning::Portability)) {
+          prescanner.Say(dir.GetIntervalProvenanceRange(j, tokens - j),
+              "#include: expected '>' at end of included file"_port_en_US);
+        }
       }
       TokenSequence braced{path, 1, k - 1};
       include = braced.ToString();
@@ -729,8 +742,10 @@ void Preprocessor::Directive(const TokenSequence &dir, Prescanner &prescanner) {
     }
     k = path.SkipBlanks(k + 1);
     if (k < pathTokens && path.TokenAt(k).ToString() != "!") {
-      prescanner.Say(dir.GetIntervalProvenanceRange(j, tokens - j),
-          "#include: extra stuff ignored after file name"_port_en_US);
+      if (prescanner.features().ShouldWarn(common::UsageWarning::Portability)) {
+        prescanner.Say(dir.GetIntervalProvenanceRange(j, tokens - j),
+            "#include: extra stuff ignored after file name"_port_en_US);
+      }
     }
     std::string buf;
     llvm::raw_string_ostream error{buf};
diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp
index 96db3955299f..c08a28cb4344 100644
--- a/flang/lib/Parser/prescan.cpp
+++ b/flang/lib/Parser/prescan.cpp
@@ -29,15 +29,18 @@ Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
     Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
     : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
       allSources_{preprocessor_.allSources()}, features_{lfc},
+      backslashFreeFormContinuation_{preprocessor.AnyDefinitions()},
       encoding_{allSources_.encoding()} {}
 
 Prescanner::Prescanner(const Prescanner &that)
     : messages_{that.messages_}, cooked_{that.cooked_},
       preprocessor_{that.preprocessor_}, allSources_{that.allSources_},
-      features_{that.features_}, inFixedForm_{that.inFixedForm_},
+      features_{that.features_},
+      backslashFreeFormContinuation_{that.backslashFreeFormContinuation_},
+      inFixedForm_{that.inFixedForm_},
       fixedFormColumnLimit_{that.fixedFormColumnLimit_},
-      encoding_{that.encoding_}, prescannerNesting_{that.prescannerNesting_ +
-                                     1},
+      encoding_{that.encoding_},
+      prescannerNesting_{that.prescannerNesting_ + 1},
       skipLeadingAmpersand_{that.skipLeadingAmpersand_},
       compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
       compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
@@ -206,8 +209,10 @@ void Prescanner::Statement() {
     case LineClassification::Kind::IncludeDirective:
     case LineClassification::Kind::DefinitionDirective:
     case LineClassification::Kind::PreprocessorDirective:
-      Say(preprocessed->GetProvenanceRange(),
-          "Preprocessed line resembles a preprocessor directive"_warn_en_US);
+      if (features_.ShouldWarn(common::UsageWarning::Preprocessing)) {
+        Say(preprocessed->GetProvenanceRange(),
+            "Preprocessed line resembles a preprocessor directive"_warn_en_US);
+      }
       preprocessed->ToLowerCase()
           .CheckBadFortranCharacters(messages_, *this)
           .CheckBadParentheses(messages_)
@@ -316,10 +321,12 @@ void Prescanner::LabelField(TokenSequence &token) {
     ++column_;
   }
   if (badColumn && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
-    Say(GetProvenance(start + *badColumn - 1),
-        *badColumn == 6
-            ? "Statement should not begin with a continuation line"_warn_en_US
-            : "Character in fixed-form label field must be a digit"_warn_en_US);
+    if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
+      Say(GetProvenance(start + *badColumn - 1),
+          *badColumn == 6
+              ? "Statement should not begin with a continuation line"_warn_en_US
+              : "Character in fixed-form label field must be a digit"_warn_en_US);
+    }
     token.clear();
     if (*badColumn < 6) {
       at_ = start;
@@ -796,8 +803,10 @@ void Prescanner::Hollerith(
   while (count-- > 0) {
     if (PadOutCharacterLiteral(tokens)) {
     } else if (*at_ == '\n') {
-      Say(GetProvenanceRange(start, at_),
-          "Possible truncated Hollerith literal"_warn_en_US);
+      if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
+        Say(GetProvenanceRange(start, at_),
+            "Possible truncated Hollerith literal"_warn_en_US);
+      }
       break;
     } else {
       NextChar();
@@ -955,8 +964,10 @@ void Prescanner::FortranInclude(const char *firstQuote) {
     const char *garbage{p};
     for (; *p != '\n' && *p != '!'; ++p) {
     }
-    Say(GetProvenanceRange(garbage, p),
-        "excess characters after path name"_warn_en_US);
+    if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
+      Say(GetProvenanceRange(garbage, p),
+          "excess characters after path name"_warn_en_US);
+    }
   }
   std::string buf;
   llvm::raw_string_ostream error{buf};
@@ -1226,9 +1237,14 @@ bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
     } else {
       return FreeFormContinuation();
     }
-  } else {
-    return false;
+  } else if (*at_ == '\\' && at_ + 2 == nextLine_ &&
+      backslashFreeFormContinuation_ && !inFixedForm_ && nextLine_ < limit_) {
+    // cpp-like handling of \ at end of a free form source line
+    BeginSourceLine(nextLine_);
+    NextLine();
+    return true;
   }
+  return false;
 }
 
 std::optional<Prescanner::LineClassification>
diff --git a/flang/lib/Parser/prescan.h b/flang/lib/Parser/prescan.h
index 581980001bcc..4eb3713bd3e3 100644
--- a/flang/lib/Parser/prescan.h
+++ b/flang/lib/Parser/prescan.h
@@ -43,6 +43,7 @@ public:
   Messages &messages() { return messages_; }
   const Preprocessor &preprocessor() const { return preprocessor_; }
   Preprocessor &preprocessor() { return preprocessor_; }
+  common::LanguageFeatureControl &features() { return features_; }
 
   Prescanner &set_fixedForm(bool yes) {
     inFixedForm_ = yes;
@@ -197,6 +198,7 @@ private:
   Preprocessor &preprocessor_;
   AllSources &allSources_;
   common::LanguageFeatureControl features_;
+  bool backslashFreeFormContinuation_{false};
   bool inFixedForm_{false};
   int fixedFormColumnLimit_{72};
   Encoding encoding_{Encoding::UTF_8};
diff --git a/flang/lib/Semantics/check-acc-structure.cpp b/flang/lib/Semantics/check-acc-structure.cpp
index 44aaa1fdd803..18704b53c66f 100644
--- a/flang/lib/Semantics/check-acc-structure.cpp
+++ b/flang/lib/Semantics/check-acc-structure.cpp
@@ -409,12 +409,16 @@ void AccStructureChecker::CheckMultipleOccurrenceInDeclare(
               if (const auto *name = getDesignatorNameIfDataRef(designator)) {
                 if (declareSymbols.contains(&name->symbol->GetUltimate())) {
                   if (declareSymbols[&name->symbol->GetUltimate()] == clause) {
-                    context_.Say(GetContext().clauseSource,
-                        "'%s' in the %s clause is already present in the same "
-                        "clause in this module"_warn_en_US,
-                        name->symbol->name(),
-                        parser::ToUpperCaseLetters(
-                            llvm::acc::getOpenACCClauseName(clause).str()));
+                    if (context_.languageFeatures().ShouldWarn(
+                            common::UsageWarning::OpenAccUsage)) {
+                      context_.Say(GetContext().clauseSource,
+                          "'%s' in the %s clause is already present in the "
+                          "same "
+                          "clause in this module"_warn_en_US,
+                          name->symbol->name(),
+                          parser::ToUpperCaseLetters(
+                              llvm::acc::getOpenACCClauseName(clause).str()));
+                    }
                   } else {
                     context_.Say(GetContext().clauseSource,
                         "'%s' in the %s clause is already present in another "
@@ -780,7 +784,10 @@ void AccStructureChecker::Enter(const parser::AccClause::If &x) {
 }
 
 void AccStructureChecker::Enter(const parser::OpenACCEndConstruct &x) {
-  context_.Say(x.source, "Misplaced OpenACC end directive"_warn_en_US);
+  if (context_.languageFeatures().ShouldWarn(
+          common::UsageWarning::OpenAccUsage)) {
+    context_.Say(x.source, "Misplaced OpenACC end directive"_warn_en_US);
+  }
 }
 
 void AccStructureChecker::Enter(const parser::Module &) {
diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index ce82cccf26d5..94afcbb68b34 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -161,7 +161,10 @@ static void CheckCharacterActual(evaluate::Expr<evaluate::SomeType> &actual,
                                     actualOffset->offset()) /
                       actualType.type().kind();
                 }
-                if (actualChars < dummyChars) {
+                if (actualChars < dummyChars &&
+                    (extentErrors ||
+                        context.ShouldWarn(
+                            common::UsageWarning::ShortCharacterActual))) {
                   auto msg{
                       "Actual argument has fewer characters remaining in storage sequence (%jd) than %s (%jd)"_warn_en_US};
                   if (extentErrors) {
@@ -177,7 +180,10 @@ static void CheckCharacterActual(evaluate::Expr<evaluate::SomeType> &actual,
                       foldingContext,
                       evaluate::GetSize(evaluate::Shape(actualType.shape()))))};
                   actualSize &&
-                  *actualSize * *actualLength < *dummySize * *dummyLength) {
+                  *actualSize * *actualLength < *dummySize * *dummyLength &&
+                  (extentErrors ||
+                      context.ShouldWarn(
+                          common::UsageWarning::ShortCharacterActual))) {
                 auto msg{
                     "Actual argument array has fewer characters (%jd) than %s array (%jd)"_warn_en_US};
                 if (extentErrors) {
@@ -255,12 +261,15 @@ static void ConvertIntegerActual(evaluate::Expr<evaluate::SomeType> &actual,
                 common::LanguageFeature::ActualIntegerConvertedToSmallerKind)) {
           msg =
               "Actual argument scalar expression of type INTEGER(%d) cannot beimplicitly converted to smaller dummy argument type INTEGER(%d)"_err_en_US;
-        } else {
+        } else if (semanticsContext.ShouldWarn(
+                       common::LanguageFeature::ConvertedArgument)) {
           msg =
               "Actual argument scalar expression of type INTEGER(%d) was converted to smaller dummy argument type INTEGER(%d)"_port_en_US;
         }
-        messages.Say(std::move(msg.value()), actualType.type().kind(),
-            dummyType.type().kind());
+        if (msg) {
+          messages.Say(std::move(msg.value()), actualType.type().kind(),
+              dummyType.type().kind());
+        }
       }
     }
     actualType = dummyType;
@@ -336,7 +345,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
     if (const auto *constantChar{
             evaluate::UnwrapConstantValue<evaluate::Ascii>(actual)};
         constantChar && constantChar->wasHollerith() &&
-        dummy.type.type().IsUnlimitedPolymorphic()) {
+        dummy.type.type().IsUnlimitedPolymorphic() &&
+        context.ShouldWarn(common::LanguageFeature::HollerithPolymorphic)) {
       messages.Say(
           "passing Hollerith to unlimited polymorphic as if it were CHARACTER"_port_en_US);
     }
@@ -589,7 +599,10 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
                       *actualSymTypeBytes;
                 }
               }
-              if (actualElements && *actualElements < *dummySize) {
+              if (actualElements && *actualElements < *dummySize &&
+                  (extentErrors ||
+                      context.ShouldWarn(
+                          common::UsageWarning::ShortArrayActual))) {
                 auto msg{
                     "Actual argument has fewer elements remaining in storage sequence (%jd) than %s array (%jd)"_warn_en_US};
                 if (extentErrors) {
@@ -604,7 +617,9 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
         } else { // actualRank > 0 || actualIsAssumedRank
           if (auto actualSize{evaluate::ToInt64(evaluate::Fold(foldingContext,
                   evaluate::GetSize(evaluate::Shape(actualType.shape()))))};
-              actualSize && *actualSize < *dummySize) {
+              actualSize && *actualSize < *dummySize &&
+              (extentErrors ||
+                  context.ShouldWarn(common::UsageWarning::ShortArrayActual))) {
             auto msg{
                 "Actual argument array has fewer elements (%jd) than %s array (%jd)"_warn_en_US};
             if (extentErrors) {
@@ -706,7 +721,7 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
         // Extension (Intel, NAG, XLF): a NULL() pointer is an acceptable
         // actual argument for an INTENT(IN) allocatable dummy, and it
         // is treated as an unassociated allocatable.
-        if (context.languageFeatures().ShouldWarn(
+        if (context.ShouldWarn(
                 common::LanguageFeature::NullActualForAllocatable)) {
           messages.Say(
               "Allocatable %s is associated with a null pointer"_port_en_US,
@@ -897,8 +912,9 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
         actualDataAttr = common::CUDADataAttr::Device;
       }
     }
-    if (!common::AreCompatibleCUDADataAttrs(
-            dummyDataAttr, actualDataAttr, dummy.ignoreTKR)) {
+    if (!common::AreCompatibleCUDADataAttrs(dummyDataAttr, actualDataAttr,
+            dummy.ignoreTKR,
+            /*allowUnifiedMatchingRule=*/true)) {
       auto toStr{[](std::optional<common::CUDADataAttr> x) {
         return x ? "ATTRIBUTES("s +
                 parser::ToUpperCaseLetters(common::EnumToString(*x)) + ")"s
@@ -1160,8 +1176,8 @@ static void CheckExplicitInterfaceArg(evaluate::ActualArgument &arg,
                     evaluate::IsNullPointer(*expr)) {
                   if (object.intent == common::Intent::In) {
                     // Extension (Intel, NAG, XLF); see CheckExplicitDataArg.
-                    if (context.languageFeatures().ShouldWarn(common::
-                                LanguageFeature::NullActualForAllocatable)) {
+                    if (context.ShouldWarn(common::LanguageFeature::
+                                NullActualForAllocatable)) {
                       messages.Say(
                           "Allocatable %s is associated with NULL()"_port_en_US,
                           dummyName);
@@ -1390,6 +1406,11 @@ static void CheckAssociated(evaluate::ActualArguments &arguments,
                     msg =
                         "Procedures '%s' and '%s' may not be completely compatible: %s"_warn_en_US;
                     whyNot = std::move(*warning);
+                  } else if (msg &&
+                      msg->severity() != parser::Severity::Error &&
+                      !semanticsContext.ShouldWarn(
+                          common::UsageWarning::ProcPointerCompatibility)) {
+                    msg.reset();
                   }
                   if (msg) {
                     msg->set_severity(parser::Severity::Warning);
@@ -1736,7 +1757,7 @@ static void CheckTransfer(evaluate::ActualArguments &arguments,
               messages.Say(
                   "Element size of MOLD= array may not be zero when SOURCE= is not empty"_err_en_US);
             }
-          } else {
+          } else if (context.ShouldWarn(common::UsageWarning::VoidMold)) {
             messages.Say(
                 "Element size of MOLD= array may not be zero unless SOURCE= is empty"_warn_en_US);
           }
@@ -1929,6 +1950,7 @@ bool CheckArguments(const characteristics::Procedure &proc,
   bool explicitInterface{proc.HasExplicitInterface()};
   evaluate::FoldingContext foldingContext{context.foldingContext()};
   parser::ContextualMessages &messages{foldingContext.messages()};
+  bool allowArgumentConversions{true};
   if (!explicitInterface || treatingExternalAsImplicit) {
     parser::Messages buffer;
     {
@@ -1945,16 +1967,22 @@ bool CheckArguments(const characteristics::Procedure &proc,
       }
       return false; // don't pile on
     }
+    allowArgumentConversions = false;
   }
   if (explicitInterface) {
     auto buffer{CheckExplicitInterface(proc, actuals, context, &scope,
-        intrinsic, /*allowArgumentConversions=*/true, /*extentErrors=*/true,
-        ignoreImplicitVsExplicit)};
+        intrinsic, allowArgumentConversions,
+        /*extentErrors=*/true, ignoreImplicitVsExplicit)};
     if (!buffer.empty()) {
       if (treatingExternalAsImplicit) {
-        if (auto *msg{messages.Say(
-                "If the procedure's interface were explicit, this reference would be in error"_warn_en_US)}) {
-          buffer.AttachTo(*msg, parser::Severity::Because);
+        if (context.ShouldWarn(
+                common::UsageWarning::KnownBadImplicitInterface)) {
+          if (auto *msg{messages.Say(
+                  "If the procedure's interface were explicit, this reference would be in error"_warn_en_US)}) {
+            buffer.AttachTo(*msg, parser::Severity::Because);
+          }
+        } else {
+          buffer.clear();
         }
       }
       if (auto *msgs{messages.messages()}) {
diff --git a/flang/lib/Semantics/check-case.cpp b/flang/lib/Semantics/check-case.cpp
index 5bc166ef2126..d296460127e1 100644
--- a/flang/lib/Semantics/check-case.cpp
+++ b/flang/lib/Semantics/check-case.cpp
@@ -49,8 +49,10 @@ private:
               for (const auto &range : ranges) {
                 auto pair{ComputeBounds(range)};
                 if (pair.first && pair.second && *pair.first > *pair.second) {
-                  context_.Say(stmt.source,
-                      "CASE has lower bound greater than upper bound"_warn_en_US);
+                  if (context_.ShouldWarn(common::UsageWarning::EmptyCase)) {
+                    context_.Say(stmt.source,
+                        "CASE has lower bound greater than upper bound"_warn_en_US);
+                  }
                 } else {
                   if constexpr (T::category == TypeCategory::Logical) { // C1148
                     if ((pair.first || pair.second) &&
@@ -93,9 +95,11 @@ private:
               x->v = converted;
               return value;
             } else {
-              context_.Say(expr.source,
-                  "CASE value (%s) overflows type (%s) of SELECT CASE expression"_warn_en_US,
-                  folded.AsFortran(), caseExprType_.AsFortran());
+              if (context_.ShouldWarn(common::UsageWarning::CaseOverflow)) {
+                context_.Say(expr.source,
+                    "CASE value (%s) overflows type (%s) of SELECT CASE expression"_warn_en_US,
+                    folded.AsFortran(), caseExprType_.AsFortran());
+              }
               hasErrors_ = true;
               return std::nullopt;
             }
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index a9e57de7e2f2..96ab90239263 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -296,8 +296,10 @@ private:
     return false;
   }
   void WarnOnIoStmt(const parser::CharBlock &source) {
-    context_.Say(
-        source, "I/O statement might not be supported on device"_warn_en_US);
+    if (context_.ShouldWarn(common::UsageWarning::CUDAUsage)) {
+      context_.Say(
+          source, "I/O statement might not be supported on device"_warn_en_US);
+    }
   }
   template <typename A>
   void WarnIfNotInternal(const A &stmt, const parser::CharBlock &source) {
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index 901ac20f8aae..c1d9538e557f 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -289,6 +289,14 @@ void CheckHelper::Check(const Symbol &symbol) {
     messages_.Say(
         "An entity may not have the ASYNCHRONOUS attribute unless it is a variable"_err_en_US);
   }
+  if (symbol.attrs().HasAny({Attr::INTENT_IN, Attr::INTENT_INOUT,
+          Attr::INTENT_OUT, Attr::OPTIONAL, Attr::VALUE}) &&
+      !IsDummy(symbol)) {
+    messages_.Say(
+        "Only a dummy argument may have an INTENT, VALUE, or OPTIONAL attribute"_err_en_US);
+  } else if (symbol.attrs().test(Attr::VALUE)) {
+    CheckValue(symbol, derived);
+  }
 
   if (isDone) {
     return; // following checks do not apply
@@ -411,9 +419,6 @@ void CheckHelper::Check(const Symbol &symbol) {
       // The non-dummy case is a hard error that's caught elsewhere.
     }
   }
-  if (symbol.attrs().test(Attr::VALUE)) {
-    CheckValue(symbol, derived);
-  }
   if (IsDummy(symbol)) {
     if (IsNamedConstant(symbol)) {
       messages_.Say(
@@ -527,13 +532,10 @@ void CheckHelper::CheckBindCFunctionResult(const Symbol &symbol) { // C1553
 
 void CheckHelper::CheckValue(
     const Symbol &symbol, const DerivedTypeSpec *derived) { // C863 - C865
-  if (!IsDummy(symbol)) {
-    messages_.Say(
-        "VALUE attribute may apply only to a dummy argument"_err_en_US);
-  }
   if (IsProcedure(symbol)) {
     messages_.Say(
         "VALUE attribute may apply only to a dummy data object"_err_en_US);
+    return; // don't pile on
   }
   if (IsAssumedSizeArray(symbol)) {
     messages_.Say(
@@ -766,19 +768,25 @@ void CheckHelper::CheckObjectEntity(
       if (IsPassedViaDescriptor(symbol)) {
         if (IsAllocatableOrObjectPointer(&symbol)) {
           if (inExplicitInterface) {
-            WarnIfNotInModuleFile(
-                "!DIR$ IGNORE_TKR should not apply to an allocatable or pointer"_warn_en_US);
+            if (context_.ShouldWarn(common::UsageWarning::IgnoreTKRUsage)) {
+              WarnIfNotInModuleFile(
+                  "!DIR$ IGNORE_TKR should not apply to an allocatable or pointer"_warn_en_US);
+            }
           } else {
             messages_.Say(
                 "!DIR$ IGNORE_TKR may not apply to an allocatable or pointer"_err_en_US);
           }
         } else if (ignoreTKR.test(common::IgnoreTKR::Rank)) {
           if (ignoreTKR.count() == 1 && evaluate::IsAssumedRank(symbol)) {
-            WarnIfNotInModuleFile(
-                "!DIR$ IGNORE_TKR(R) is not meaningful for an assumed-rank array"_warn_en_US);
+            if (context_.ShouldWarn(common::UsageWarning::IgnoreTKRUsage)) {
+              WarnIfNotInModuleFile(
+                  "!DIR$ IGNORE_TKR(R) is not meaningful for an assumed-rank array"_warn_en_US);
+            }
           } else if (inExplicitInterface) {
-            WarnIfNotInModuleFile(
-                "!DIR$ IGNORE_TKR(R) should not apply to a dummy argument passed via descriptor"_warn_en_US);
+            if (context_.ShouldWarn(common::UsageWarning::IgnoreTKRUsage)) {
+              WarnIfNotInModuleFile(
+                  "!DIR$ IGNORE_TKR(R) should not apply to a dummy argument passed via descriptor"_warn_en_US);
+            }
           } else {
             messages_.Say(
                 "!DIR$ IGNORE_TKR(R) may not apply to a dummy argument passed via descriptor"_err_en_US);
@@ -786,14 +794,6 @@ void CheckHelper::CheckObjectEntity(
         }
       }
     }
-  } else if (symbol.attrs().test(Attr::INTENT_IN) ||
-      symbol.attrs().test(Attr::INTENT_OUT) ||
-      symbol.attrs().test(Attr::INTENT_INOUT)) {
-    messages_.Say(
-        "INTENT attributes may apply only to a dummy argument"_err_en_US); // C843
-  } else if (IsOptional(symbol)) {
-    messages_.Say(
-        "OPTIONAL attribute may apply only to a dummy argument"_err_en_US); // C849
   } else if (!details.ignoreTKR().empty()) {
     messages_.Say(
         "!DIR$ IGNORE_TKR directive may apply only to a dummy data argument"_err_en_US);
@@ -891,25 +891,31 @@ void CheckHelper::CheckObjectEntity(
   bool inDeviceSubprogram{IsCUDADeviceContext(&symbol.owner())};
   if (inDeviceSubprogram) {
     if (IsSaved(symbol)) {
-      WarnIfNotInModuleFile(
-          "'%s' should not have the SAVE attribute or initialization in a device subprogram"_warn_en_US,
-          symbol.name());
+      if (context_.ShouldWarn(common::UsageWarning::CUDAUsage)) {
+        WarnIfNotInModuleFile(
+            "'%s' should not have the SAVE attribute or initialization in a device subprogram"_warn_en_US,
+            symbol.name());
+      }
     }
     if (IsPointer(symbol)) {
-      WarnIfNotInModuleFile(
-          "Pointer '%s' may not be associated in a device subprogram"_warn_en_US,
-          symbol.name());
+      if (context_.ShouldWarn(common::UsageWarning::CUDAUsage)) {
+        WarnIfNotInModuleFile(
+            "Pointer '%s' may not be associated in a device subprogram"_warn_en_US,
+            symbol.name());
+      }
     }
     if (details.isDummy() &&
         details.cudaDataAttr().value_or(common::CUDADataAttr::Device) !=
             common::CUDADataAttr::Device &&
         details.cudaDataAttr().value_or(common::CUDADataAttr::Device) !=
             common::CUDADataAttr::Managed) {
-      WarnIfNotInModuleFile(
-          "Dummy argument '%s' may not have ATTRIBUTES(%s) in a device subprogram"_warn_en_US,
-          symbol.name(),
-          parser::ToUpperCaseLetters(
-              common::EnumToString(*details.cudaDataAttr())));
+      if (context_.ShouldWarn(common::UsageWarning::CUDAUsage)) {
+        WarnIfNotInModuleFile(
+            "Dummy argument '%s' may not have ATTRIBUTES(%s) in a device subprogram"_warn_en_US,
+            symbol.name(),
+            parser::ToUpperCaseLetters(
+                common::EnumToString(*details.cudaDataAttr())));
+      }
     }
   }
   if (details.cudaDataAttr()) {
@@ -959,17 +965,23 @@ void CheckHelper::CheckObjectEntity(
       break;
     case common::CUDADataAttr::Pinned:
       if (inDeviceSubprogram) {
-        WarnIfNotInModuleFile(
-            "Object '%s' with ATTRIBUTES(PINNED) may not be declared in a device subprogram"_warn_en_US,
-            symbol.name());
+        if (context_.ShouldWarn(common::UsageWarning::CUDAUsage)) {
+          WarnIfNotInModuleFile(
+              "Object '%s' with ATTRIBUTES(PINNED) may not be declared in a device subprogram"_warn_en_US,
+              symbol.name());
+        }
       } else if (IsPointer(symbol)) {
-        WarnIfNotInModuleFile(
-            "Object '%s' with ATTRIBUTES(PINNED) may not be a pointer"_warn_en_US,
-            symbol.name());
+        if (context_.ShouldWarn(common::UsageWarning::CUDAUsage)) {
+          WarnIfNotInModuleFile(
+              "Object '%s' with ATTRIBUTES(PINNED) may not be a pointer"_warn_en_US,
+              symbol.name());
+        }
       } else if (!IsAllocatable(symbol)) {
-        WarnIfNotInModuleFile(
-            "Object '%s' with ATTRIBUTES(PINNED) should also be allocatable"_warn_en_US,
-            symbol.name());
+        if (context_.ShouldWarn(common::UsageWarning::CUDAUsage)) {
+          WarnIfNotInModuleFile(
+              "Object '%s' with ATTRIBUTES(PINNED) should also be allocatable"_warn_en_US,
+              symbol.name());
+        }
       }
       break;
     case common::CUDADataAttr::Shared:
@@ -984,7 +996,10 @@ void CheckHelper::CheckObjectEntity(
       }
       break;
     case common::CUDADataAttr::Unified:
-      if ((!subpDetails || inDeviceSubprogram) && !isComponent) {
+      if (((!subpDetails &&
+               symbol.owner().kind() != Scope::Kind::MainProgram) ||
+              inDeviceSubprogram) &&
+          !isComponent) {
         messages_.Say(
             "Object '%s' with ATTRIBUTES(UNIFIED) must be declared in a host subprogram"_err_en_US,
             symbol.name());
@@ -1211,9 +1226,8 @@ void CheckHelper::CheckProcEntity(
   const Symbol *interface{details.procInterface()};
   if (details.isDummy()) {
     if (!symbol.attrs().test(Attr::POINTER) && // C843
-        (symbol.attrs().test(Attr::INTENT_IN) ||
-            symbol.attrs().test(Attr::INTENT_OUT) ||
-            symbol.attrs().test(Attr::INTENT_INOUT))) {
+        symbol.attrs().HasAny(
+            {Attr::INTENT_IN, Attr::INTENT_OUT, Attr::INTENT_INOUT})) {
       messages_.Say("A dummy procedure without the POINTER attribute"
                     " may not have an INTENT attribute"_err_en_US);
     }
@@ -1237,14 +1251,6 @@ void CheckHelper::CheckProcEntity(
         messages_.Say("A dummy procedure may not be ELEMENTAL"_err_en_US);
       }
     }
-  } else if (symbol.attrs().test(Attr::INTENT_IN) ||
-      symbol.attrs().test(Attr::INTENT_OUT) ||
-      symbol.attrs().test(Attr::INTENT_INOUT)) {
-    messages_.Say("INTENT attributes may apply only to a dummy "
-                  "argument"_err_en_US); // C843
-  } else if (IsOptional(symbol)) {
-    messages_.Say("OPTIONAL attribute may apply only to a dummy "
-                  "argument"_err_en_US); // C849
   } else if (IsPointer(symbol)) {
     CheckPointerInitialization(symbol);
     if (interface) {
@@ -1489,12 +1495,16 @@ void CheckHelper::CheckExternal(const Symbol &symbol) {
             if (chars->HasExplicitInterface()) {
               std::string whyNot;
               if (!chars->IsCompatibleWith(*globalChars,
-                      /*ignoreImplicitVsExplicit=*/false, &whyNot)) {
+                      /*ignoreImplicitVsExplicit=*/false, &whyNot) &&
+                  context_.ShouldWarn(
+                      common::UsageWarning::ExternalInterfaceMismatch)) {
                 msg = WarnIfNotInModuleFile(
                     "The global subprogram '%s' is not compatible with its local procedure declaration (%s)"_warn_en_US,
                     global->name(), whyNot);
               }
-            } else if (!globalChars->CanBeCalledViaImplicitInterface()) {
+            } else if (!globalChars->CanBeCalledViaImplicitInterface() &&
+                context_.ShouldWarn(
+                    common::UsageWarning::ExternalInterfaceMismatch)) {
               msg = messages_.Say(
                   "The global subprogram '%s' may not be referenced via the implicit interface '%s'"_err_en_US,
                   global->name(), symbol.name());
@@ -1516,7 +1526,9 @@ void CheckHelper::CheckExternal(const Symbol &symbol) {
         if (auto previousChars{Characterize(previous)}) {
           std::string whyNot;
           if (!chars->IsCompatibleWith(*previousChars,
-                  /*ignoreImplicitVsExplicit=*/false, &whyNot)) {
+                  /*ignoreImplicitVsExplicit=*/false, &whyNot) &&
+              context_.ShouldWarn(
+                  common::UsageWarning::ExternalInterfaceMismatch)) {
             if (auto *msg{WarnIfNotInModuleFile(
                     "The external interface '%s' is not compatible with an earlier definition (%s)"_warn_en_US,
                     symbol.name(), whyNot)}) {
@@ -1938,7 +1950,9 @@ std::optional<parser::MessageFixedText> CheckHelper::CheckNumberOfArgs(
     const GenericKind &kind, std::size_t nargs) {
   if (!kind.IsIntrinsicOperator()) {
     if (nargs < 1 || nargs > 2) {
-      return "%s function '%s' should have 1 or 2 dummy arguments"_warn_en_US;
+      if (context_.ShouldWarn(common::UsageWarning::DefinedOperatorArgs)) {
+        return "%s function '%s' should have 1 or 2 dummy arguments"_warn_en_US;
+      }
     }
     return std::nullopt;
   }
@@ -1995,8 +2009,10 @@ bool CheckHelper::CheckDefinedOperatorArg(const SourceName &opName,
         "In %s function '%s', dummy argument '%s' may not be INTENT(OUT)"_err_en_US;
   } else if (dataObject->intent != common::Intent::In &&
       !dataObject->attrs.test(DummyDataObject::Attr::Value)) {
-    msg =
-        "In %s function '%s', dummy argument '%s' should have INTENT(IN) or VALUE attribute"_warn_en_US;
+    if (context_.ShouldWarn(common::UsageWarning::DefinedOperatorArgs)) {
+      msg =
+          "In %s function '%s', dummy argument '%s' should have INTENT(IN) or VALUE attribute"_warn_en_US;
+    }
   }
   if (msg) {
     bool isFatal{msg->IsFatal()};
@@ -2058,8 +2074,10 @@ bool CheckHelper::CheckDefinedAssignmentArg(
               " may not have INTENT(IN)"_err_en_US;
       } else if (dataObject->intent != common::Intent::Out &&
           dataObject->intent != common::Intent::InOut) {
-        msg = "In defined assignment subroutine '%s', first dummy argument '%s'"
-              " should have INTENT(OUT) or INTENT(INOUT)"_warn_en_US;
+        if (context_.ShouldWarn(common::UsageWarning::DefinedOperatorArgs)) {
+          msg =
+              "In defined assignment subroutine '%s', first dummy argument '%s' should have INTENT(OUT) or INTENT(INOUT)"_warn_en_US;
+        }
       }
     } else if (pos == 1) {
       if (dataObject->intent == common::Intent::Out) {
@@ -2067,9 +2085,10 @@ bool CheckHelper::CheckDefinedAssignmentArg(
               " argument '%s' may not have INTENT(OUT)"_err_en_US;
       } else if (dataObject->intent != common::Intent::In &&
           !dataObject->attrs.test(DummyDataObject::Attr::Value)) {
-        msg =
-            "In defined assignment subroutine '%s', second dummy"
-            " argument '%s' should have INTENT(IN) or VALUE attribute"_warn_en_US;
+        if (context_.ShouldWarn(common::UsageWarning::DefinedOperatorArgs)) {
+          msg =
+              "In defined assignment subroutine '%s', second dummy argument '%s' should have INTENT(IN) or VALUE attribute"_warn_en_US;
+        }
       } else if (dataObject->attrs.test(DummyDataObject::Attr::Pointer)) {
         msg =
             "In defined assignment subroutine '%s', second dummy argument '%s' must not be a pointer"_err_en_US;
@@ -2123,7 +2142,8 @@ void CheckHelper::WarnMissingFinal(const Symbol &symbol) {
   while (const auto *derivedDetails{
       derivedSym ? derivedSym->detailsIf<DerivedTypeDetails>() : nullptr}) {
     if (!derivedDetails->finals().empty() &&
-        !derivedDetails->GetFinalForRank(rank)) {
+        !derivedDetails->GetFinalForRank(rank) &&
+        context_.ShouldWarn(common::UsageWarning::Final)) {
       if (auto *msg{derivedSym == initialDerivedSym
                   ? WarnIfNotInModuleFile(symbol.name(),
                         "'%s' of derived type '%s' does not have a FINAL subroutine for its rank (%d)"_warn_en_US,
diff --git a/flang/lib/Semantics/check-do-forall.cpp b/flang/lib/Semantics/check-do-forall.cpp
index 51f536f3d772..c1eab090a4bb 100644
--- a/flang/lib/Semantics/check-do-forall.cpp
+++ b/flang/lib/Semantics/check-do-forall.cpp
@@ -540,7 +540,8 @@ private:
     CheckDoExpression(bounds.upper);
     if (bounds.step) {
       CheckDoExpression(*bounds.step);
-      if (IsZero(*bounds.step)) {
+      if (IsZero(*bounds.step) &&
+          context_.ShouldWarn(common::UsageWarning::ZeroDoStep)) {
         context_.Say(bounds.step->thing.value().source,
             "DO step expression should not be zero"_warn_en_US);
       }
@@ -791,7 +792,8 @@ private:
           },
           assignment.u);
       for (const Symbol &index : indexVars) {
-        if (symbols.count(index) == 0) {
+        if (symbols.count(index) == 0 &&
+            context_.ShouldWarn(common::UsageWarning::UnusedForallIndex)) {
           context_.Say("FORALL index variable '%s' not used on left-hand side"
                        " of assignment"_warn_en_US,
               index.name());
diff --git a/flang/lib/Semantics/check-io.cpp b/flang/lib/Semantics/check-io.cpp
index ad89a9be5a29..8f8a4e800b48 100644
--- a/flang/lib/Semantics/check-io.cpp
+++ b/flang/lib/Semantics/check-io.cpp
@@ -795,10 +795,12 @@ void IoChecker::Leave(const parser::ReadStmt &readStmt) {
   CheckForProhibitedSpecifier(IoSpecKind::Rec, IoSpecKind::End); // C1220
   if (specifierSet_.test(IoSpecKind::Size)) {
     // F'2023 C1214 - allow with a warning
-    if (specifierSet_.test(IoSpecKind::Nml)) {
-      context_.Say("If NML appears, SIZE should not appear"_port_en_US);
-    } else if (flags_.test(Flag::StarFmt)) {
-      context_.Say("If FMT=* appears, SIZE should not appear"_port_en_US);
+    if (context_.ShouldWarn(common::LanguageFeature::ListDirectedSize)) {
+      if (specifierSet_.test(IoSpecKind::Nml)) {
+        context_.Say("If NML appears, SIZE should not appear"_port_en_US);
+      } else if (flags_.test(Flag::StarFmt)) {
+        context_.Say("If FMT=* appears, SIZE should not appear"_port_en_US);
+      }
     }
   }
   CheckForRequiredSpecifier(IoSpecKind::Eor,
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 8a16299db319..ab76fe59911b 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1020,10 +1020,12 @@ void OmpStructureChecker::CheckThreadprivateOrDeclareTargetVar(
                         ContextDirectiveAsFortran());
                   else if (GetContext().directive ==
                       llvm::omp::Directive::OMPD_declare_target)
-                    context_.Say(name->source,
-                        "The entity with PARAMETER attribute is used in a %s "
-                        "directive"_warn_en_US,
-                        ContextDirectiveAsFortran());
+                    if (context_.ShouldWarn(
+                            common::UsageWarning::OpenMPUsage)) {
+                      context_.Say(name->source,
+                          "The entity with PARAMETER attribute is used in a %s directive"_warn_en_US,
+                          ContextDirectiveAsFortran());
+                    }
                 } else if (FindCommonBlockContaining(*name->symbol)) {
                   context_.Say(name->source,
                       "A variable in a %s directive cannot be an element of a "
@@ -1190,7 +1192,7 @@ void OmpStructureChecker::Leave(const parser::OmpDeclareTargetWithClause &x) {
       context_.Say(x.source,
           "If the DECLARE TARGET directive has a clause, it must contain at lease one ENTER clause or LINK clause"_err_en_US);
     }
-    if (toClause) {
+    if (toClause && context_.ShouldWarn(common::UsageWarning::OpenMPUsage)) {
       context_.Say(toClause->source,
           "The usage of TO clause on DECLARE TARGET directive has been deprecated. Use ENTER clause instead."_warn_en_US);
     }
@@ -2964,9 +2966,11 @@ void OmpStructureChecker::Enter(const parser::OmpClause::UseDevicePtr &x) {
       if (const auto *name{parser::Unwrap<parser::Name>(ompObject)}) {
         if (name->symbol) {
           if (!(IsBuiltinCPtr(*(name->symbol)))) {
-            context_.Say(itr->second->source,
-                "Use of non-C_PTR type '%s' in USE_DEVICE_PTR is deprecated, use USE_DEVICE_ADDR instead"_warn_en_US,
-                name->ToString());
+            if (context_.ShouldWarn(common::UsageWarning::OpenMPUsage)) {
+              context_.Say(itr->second->source,
+                  "Use of non-C_PTR type '%s' in USE_DEVICE_PTR is deprecated, use USE_DEVICE_ADDR instead"_warn_en_US,
+                  name->ToString());
+            }
           } else {
             useDevicePtrNameList.push_back(*name);
           }
@@ -3023,16 +3027,20 @@ void OmpStructureChecker::Enter(const parser::OmpClause::IsDevicePtr &x) {
             "Variable '%s' in IS_DEVICE_PTR clause must be of type C_PTR"_err_en_US,
             source.ToString());
       } else if (!(IsDummy(*symbol))) {
-        context_.Say(itr->second->source,
-            "Variable '%s' in IS_DEVICE_PTR clause must be a dummy argument. "
-            "This semantic check is deprecated from OpenMP 5.2 and later."_warn_en_US,
-            source.ToString());
+        if (context_.ShouldWarn(common::UsageWarning::OpenMPUsage)) {
+          context_.Say(itr->second->source,
+              "Variable '%s' in IS_DEVICE_PTR clause must be a dummy argument. "
+              "This semantic check is deprecated from OpenMP 5.2 and later."_warn_en_US,
+              source.ToString());
+        }
       } else if (IsAllocatableOrPointer(*symbol) || IsValue(*symbol)) {
-        context_.Say(itr->second->source,
-            "Variable '%s' in IS_DEVICE_PTR clause must be a dummy argument "
-            "that does not have the ALLOCATABLE, POINTER or VALUE attribute. "
-            "This semantic check is deprecated from OpenMP 5.2 and later."_warn_en_US,
-            source.ToString());
+        if (context_.ShouldWarn(common::UsageWarning::OpenMPUsage)) {
+          context_.Say(itr->second->source,
+              "Variable '%s' in IS_DEVICE_PTR clause must be a dummy argument "
+              "that does not have the ALLOCATABLE, POINTER or VALUE attribute. "
+              "This semantic check is deprecated from OpenMP 5.2 and later."_warn_en_US,
+              source.ToString());
+        }
       }
     }
   }
diff --git a/flang/lib/Semantics/data-to-inits.cpp b/flang/lib/Semantics/data-to-inits.cpp
index 2ebc4e561a33..64050874bcde 100644
--- a/flang/lib/Semantics/data-to-inits.cpp
+++ b/flang/lib/Semantics/data-to-inits.cpp
@@ -462,9 +462,12 @@ bool DataInitializationCompiler<DSV>::InitElement(
       } else if (status == evaluate::InitialImage::OutOfRange) {
         OutOfRangeError();
       } else if (status == evaluate::InitialImage::LengthMismatch) {
-        exprAnalyzer_.Say(
-            "DATA statement value '%s' for '%s' has the wrong length"_warn_en_US,
-            folded.AsFortran(), DescribeElement());
+        if (exprAnalyzer_.context().ShouldWarn(
+                common::UsageWarning::DataLength)) {
+          exprAnalyzer_.Say(
+              "DATA statement value '%s' for '%s' has the wrong length"_warn_en_US,
+              folded.AsFortran(), DescribeElement());
+        }
         return true;
       } else if (status == evaluate::InitialImage::TooManyElems) {
         exprAnalyzer_.Say("DATA statement has too many elements"_err_en_US);
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index a270e4b385e8..f677973ca275 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -789,9 +789,11 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::RealLiteralConstant &x) {
   auto kind{AnalyzeKindParam(x.kind, defaultKind)};
   if (letterKind && expoLetter != 'e') {
     if (kind != *letterKind) {
-      Say("Explicit kind parameter on real constant disagrees with "
-          "exponent letter '%c'"_warn_en_US,
-          expoLetter);
+      if (context_.ShouldWarn(
+              common::LanguageFeature::ExponentMatchingKindParam)) {
+        Say("Explicit kind parameter on real constant disagrees with exponent letter '%c'"_warn_en_US,
+            expoLetter);
+      }
     } else if (x.kind &&
         context_.ShouldWarn(
             common::LanguageFeature::ExponentMatchingKindParam)) {
@@ -2776,7 +2778,9 @@ void ExpressionAnalyzer::CheckBadExplicitType(
       if (const auto *typeAndShape{result->GetTypeAndShape()}) {
         if (auto declared{
                 typeAndShape->Characterize(intrinsic, GetFoldingContext())}) {
-          if (!declared->type().IsTkCompatibleWith(typeAndShape->type())) {
+          if (!declared->type().IsTkCompatibleWith(typeAndShape->type()) &&
+              context_.ShouldWarn(
+                  common::UsageWarning::IgnoredIntrinsicFunctionType)) {
             if (auto *msg{Say(
                     "The result type '%s' of the intrinsic function '%s' is not the explicit declared type '%s'"_warn_en_US,
                     typeAndShape->AsFortran(), intrinsic.name(),
@@ -2989,8 +2993,8 @@ void ExpressionAnalyzer::Analyze(const parser::CallStmt &callStmt) {
   for (const auto &arg : actualArgList) {
     analyzer.Analyze(arg, true /* is subroutine call */);
   }
-  auto chevrons{AnalyzeChevrons(callStmt)};
-  if (!analyzer.fatalErrors() && chevrons) {
+  if (auto chevrons{AnalyzeChevrons(callStmt)};
+      chevrons && !analyzer.fatalErrors()) {
     if (std::optional<CalleeAndArguments> callee{
             GetCalleeAndArguments(std::get<parser::ProcedureDesignator>(call.t),
                 analyzer.GetActuals(), true /* subroutine */)}) {
@@ -3149,7 +3153,9 @@ std::optional<characteristics::Procedure> ExpressionAnalyzer::CheckCall(
           iter != implicitInterfaces_.end()) {
         std::string whyNot;
         if (!chars->IsCompatibleWith(iter->second.second,
-                /*ignoreImplicitVsExplicit=*/false, &whyNot)) {
+                /*ignoreImplicitVsExplicit=*/false, &whyNot) &&
+            context_.ShouldWarn(
+                common::UsageWarning::IncompatibleImplicitInterfaces)) {
           if (auto *msg{Say(callSite,
                   "Reference to the procedure '%s' has an implicit interface that is distinct from another reference: %s"_warn_en_US,
                   name, whyNot)}) {
@@ -3833,8 +3839,10 @@ bool ExpressionAnalyzer::CheckIntrinsicKind(
     return true;
   } else if (foldingContext_.targetCharacteristics().CanSupportType(
                  category, kind)) {
-    Say("%s(KIND=%jd) is not an enabled type for this target"_warn_en_US,
-        ToUpperCase(EnumToString(category)), kind);
+    if (context_.ShouldWarn(common::UsageWarning::BadTypeForTarget)) {
+      Say("%s(KIND=%jd) is not an enabled type for this target"_warn_en_US,
+          ToUpperCase(EnumToString(category)), kind);
+    }
     return true;
   } else {
     Say("%s(KIND=%jd) is not a supported type"_err_en_US,
@@ -3860,8 +3868,10 @@ bool ExpressionAnalyzer::CheckIntrinsicSize(
     return true;
   } else if (foldingContext_.targetCharacteristics().CanSupportType(
                  category, kind)) {
-    Say("%s*%jd is not an enabled type for this target"_warn_en_US,
-        ToUpperCase(EnumToString(category)), size);
+    if (context_.ShouldWarn(common::UsageWarning::BadTypeForTarget)) {
+      Say("%s*%jd is not an enabled type for this target"_warn_en_US,
+          ToUpperCase(EnumToString(category)), size);
+    }
     return true;
   } else {
     Say("%s*%jd is not a supported type"_err_en_US,
diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index 4a531c3c0f99..e9aebe5b08f2 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -202,7 +202,7 @@ static void HarvestInitializerSymbols(
           HarvestInitializerSymbols(set, *dtSym.scope());
         }
       } else {
-        CHECK(dtSym.has<UseErrorDetails>());
+        CHECK(dtSym.has<UseDetails>() || dtSym.has<UseErrorDetails>());
       }
     } else if (IsNamedConstant(*symbol) || scope.IsDerivedType()) {
       if (const auto *object{symbol->detailsIf<ObjectEntityDetails>()}) {
@@ -1397,13 +1397,17 @@ Scope *ModFileReader::Read(SourceName name, std::optional<bool> isIntrinsic,
   std::optional<ModuleCheckSumType> checkSum{
       VerifyHeader(sourceFile->content())};
   if (!checkSum) {
-    Say(name, ancestorName, "File has invalid checksum: %s"_warn_en_US,
-        sourceFile->path());
+    if (context_.ShouldWarn(common::UsageWarning::ModuleFile)) {
+      Say(name, ancestorName, "File has invalid checksum: %s"_warn_en_US,
+          sourceFile->path());
+    }
     return nullptr;
   } else if (requiredHash && *requiredHash != *checkSum) {
-    Say(name, ancestorName,
-        "File is not the right module file for %s"_warn_en_US,
-        "'"s + name.ToString() + "': "s + sourceFile->path());
+    if (context_.ShouldWarn(common::UsageWarning::ModuleFile)) {
+      Say(name, ancestorName,
+          "File is not the right module file for %s"_warn_en_US,
+          "'"s + name.ToString() + "': "s + sourceFile->path());
+    }
     return nullptr;
   }
   llvm::raw_null_ostream NullStream;
diff --git a/flang/lib/Semantics/pointer-assignment.cpp b/flang/lib/Semantics/pointer-assignment.cpp
index 60a496a63cb3..077072060e9b 100644
--- a/flang/lib/Semantics/pointer-assignment.cpp
+++ b/flang/lib/Semantics/pointer-assignment.cpp
@@ -266,8 +266,11 @@ bool PointerAssignmentChecker::Check(const evaluate::FunctionRef<T> &f) {
           " that is a not a pointer"_err_en_US;
   } else if (isContiguous_ &&
       !funcResult->attrs.test(FunctionResult::Attr::Contiguous)) {
-    msg = "CONTIGUOUS %s is associated with the result of reference to"
-          " function '%s' that is not known to be contiguous"_warn_en_US;
+    if (context_.ShouldWarn(
+            common::UsageWarning::PointerToPossibleNoncontiguous)) {
+      msg =
+          "CONTIGUOUS %s is associated with the result of reference to function '%s' that is not known to be contiguous"_warn_en_US;
+    }
   } else if (lhsType_) {
     const auto *frTypeAndShape{funcResult->GetTypeAndShape()};
     CHECK(frTypeAndShape);
diff --git a/flang/lib/Semantics/program-tree.cpp b/flang/lib/Semantics/program-tree.cpp
index bf773f3810c8..250f5801b39e 100644
--- a/flang/lib/Semantics/program-tree.cpp
+++ b/flang/lib/Semantics/program-tree.cpp
@@ -10,6 +10,7 @@
 #include "flang/Common/idioms.h"
 #include "flang/Parser/char-block.h"
 #include "flang/Semantics/scope.h"
+#include "flang/Semantics/semantics.h"
 
 namespace Fortran::semantics {
 
@@ -76,7 +77,8 @@ static void GetGenerics(
 }
 
 template <typename T>
-static ProgramTree BuildSubprogramTree(const parser::Name &name, const T &x) {
+static ProgramTree BuildSubprogramTree(
+    const parser::Name &name, SemanticsContext &context, const T &x) {
   const auto &spec{std::get<parser::SpecificationPart>(x.t)};
   const auto &exec{std::get<parser::ExecutionPart>(x.t)};
   const auto &subps{
@@ -89,7 +91,11 @@ static ProgramTree BuildSubprogramTree(const parser::Name &name, const T &x) {
     for (const auto &subp :
         std::get<std::list<parser::InternalSubprogram>>(subps->t)) {
       common::visit(
-          [&](const auto &y) { node.AddChild(ProgramTree::Build(y.value())); },
+          [&](const auto &y) {
+            if (auto child{ProgramTree::Build(y.value(), context)}) {
+              node.AddChild(std::move(*child));
+            }
+          },
           subp.u);
     }
   }
@@ -97,13 +103,14 @@ static ProgramTree BuildSubprogramTree(const parser::Name &name, const T &x) {
 }
 
 static ProgramTree BuildSubprogramTree(
-    const parser::Name &name, const parser::BlockData &x) {
+    const parser::Name &name, SemanticsContext &, const parser::BlockData &x) {
   const auto &spec{std::get<parser::SpecificationPart>(x.t)};
   return ProgramTree{name, spec};
 }
 
 template <typename T>
-static ProgramTree BuildModuleTree(const parser::Name &name, const T &x) {
+static ProgramTree BuildModuleTree(
+    const parser::Name &name, SemanticsContext &context, const T &x) {
   const auto &spec{std::get<parser::SpecificationPart>(x.t)};
   const auto &subps{std::get<std::optional<parser::ModuleSubprogramPart>>(x.t)};
   ProgramTree node{name, spec};
@@ -112,28 +119,42 @@ static ProgramTree BuildModuleTree(const parser::Name &name, const T &x) {
     for (const auto &subp :
         std::get<std::list<parser::ModuleSubprogram>>(subps->t)) {
       common::visit(
-          [&](const auto &y) { node.AddChild(ProgramTree::Build(y.value())); },
+          [&](const auto &y) {
+            if (auto child{ProgramTree::Build(y.value(), context)}) {
+              node.AddChild(std::move(*child));
+            }
+          },
           subp.u);
     }
   }
   return node;
 }
 
-ProgramTree ProgramTree::Build(const parser::ProgramUnit &x) {
-  return common::visit([](const auto &y) { return Build(y.value()); }, x.u);
+ProgramTree ProgramTree::Build(
+    const parser::ProgramUnit &x, SemanticsContext &context) {
+  return common::visit(
+      [&](const auto &y) {
+        auto node{Build(y.value(), context)};
+        CHECK(node.has_value());
+        return std::move(*node);
+      },
+      x.u);
 }
 
-ProgramTree ProgramTree::Build(const parser::MainProgram &x) {
+std::optional<ProgramTree> ProgramTree::Build(
+    const parser::MainProgram &x, SemanticsContext &context) {
   const auto &stmt{
       std::get<std::optional<parser::Statement<parser::ProgramStmt>>>(x.t)};
   const auto &end{std::get<parser::Statement<parser::EndProgramStmt>>(x.t)};
   static parser::Name emptyName;
-  auto result{stmt ? BuildSubprogramTree(stmt->statement.v, x).set_stmt(*stmt)
-                   : BuildSubprogramTree(emptyName, x)};
-  return result.set_endStmt(end);
+  auto result{stmt
+          ? BuildSubprogramTree(stmt->statement.v, context, x).set_stmt(*stmt)
+          : BuildSubprogramTree(emptyName, context, x)};
+  return std::move(result.set_endStmt(end));
 }
 
-ProgramTree ProgramTree::Build(const parser::FunctionSubprogram &x) {
+std::optional<ProgramTree> ProgramTree::Build(
+    const parser::FunctionSubprogram &x, SemanticsContext &context) {
   const auto &stmt{std::get<parser::Statement<parser::FunctionStmt>>(x.t)};
   const auto &end{std::get<parser::Statement<parser::EndFunctionStmt>>(x.t)};
   const auto &name{std::get<parser::Name>(stmt.statement.t)};
@@ -144,13 +165,14 @@ ProgramTree ProgramTree::Build(const parser::FunctionSubprogram &x) {
       bindingSpec = &*suffix->binding;
     }
   }
-  return BuildSubprogramTree(name, x)
+  return BuildSubprogramTree(name, context, x)
       .set_stmt(stmt)
       .set_endStmt(end)
       .set_bindingSpec(bindingSpec);
 }
 
-ProgramTree ProgramTree::Build(const parser::SubroutineSubprogram &x) {
+std::optional<ProgramTree> ProgramTree::Build(
+    const parser::SubroutineSubprogram &x, SemanticsContext &context) {
   const auto &stmt{std::get<parser::Statement<parser::SubroutineStmt>>(x.t)};
   const auto &end{std::get<parser::Statement<parser::EndSubroutineStmt>>(x.t)};
   const auto &name{std::get<parser::Name>(stmt.statement.t)};
@@ -159,48 +181,58 @@ ProgramTree ProgramTree::Build(const parser::SubroutineSubprogram &x) {
           stmt.statement.t)}) {
     bindingSpec = &*binding;
   }
-  return BuildSubprogramTree(name, x)
+  return BuildSubprogramTree(name, context, x)
       .set_stmt(stmt)
       .set_endStmt(end)
       .set_bindingSpec(bindingSpec);
 }
 
-ProgramTree ProgramTree::Build(const parser::SeparateModuleSubprogram &x) {
+std::optional<ProgramTree> ProgramTree::Build(
+    const parser::SeparateModuleSubprogram &x, SemanticsContext &context) {
   const auto &stmt{std::get<parser::Statement<parser::MpSubprogramStmt>>(x.t)};
   const auto &end{
       std::get<parser::Statement<parser::EndMpSubprogramStmt>>(x.t)};
   const auto &name{stmt.statement.v};
-  return BuildSubprogramTree(name, x).set_stmt(stmt).set_endStmt(end);
+  return BuildSubprogramTree(name, context, x).set_stmt(stmt).set_endStmt(end);
 }
 
-ProgramTree ProgramTree::Build(const parser::Module &x) {
+std::optional<ProgramTree> ProgramTree::Build(
+    const parser::Module &x, SemanticsContext &context) {
   const auto &stmt{std::get<parser::Statement<parser::ModuleStmt>>(x.t)};
   const auto &end{std::get<parser::Statement<parser::EndModuleStmt>>(x.t)};
   const auto &name{stmt.statement.v};
-  return BuildModuleTree(name, x).set_stmt(stmt).set_endStmt(end);
+  return BuildModuleTree(name, context, x).set_stmt(stmt).set_endStmt(end);
 }
 
-ProgramTree ProgramTree::Build(const parser::Submodule &x) {
+std::optional<ProgramTree> ProgramTree::Build(
+    const parser::Submodule &x, SemanticsContext &context) {
   const auto &stmt{std::get<parser::Statement<parser::SubmoduleStmt>>(x.t)};
   const auto &end{std::get<parser::Statement<parser::EndSubmoduleStmt>>(x.t)};
   const auto &name{std::get<parser::Name>(stmt.statement.t)};
-  return BuildModuleTree(name, x).set_stmt(stmt).set_endStmt(end);
+  return BuildModuleTree(name, context, x).set_stmt(stmt).set_endStmt(end);
 }
 
-ProgramTree ProgramTree::Build(const parser::BlockData &x) {
+std::optional<ProgramTree> ProgramTree::Build(
+    const parser::BlockData &x, SemanticsContext &context) {
   const auto &stmt{std::get<parser::Statement<parser::BlockDataStmt>>(x.t)};
   const auto &end{std::get<parser::Statement<parser::EndBlockDataStmt>>(x.t)};
   static parser::Name emptyName;
-  auto result{stmt.statement.v ? BuildSubprogramTree(*stmt.statement.v, x)
-                               : BuildSubprogramTree(emptyName, x)};
-  return result.set_stmt(stmt).set_endStmt(end);
+  auto result{stmt.statement.v
+          ? BuildSubprogramTree(*stmt.statement.v, context, x)
+          : BuildSubprogramTree(emptyName, context, x)};
+  return std::move(result.set_stmt(stmt).set_endStmt(end));
 }
 
-ProgramTree ProgramTree::Build(const parser::CompilerDirective &) {
-  DIE("ProgramTree::Build() called for CompilerDirective");
+std::optional<ProgramTree> ProgramTree::Build(
+    const parser::CompilerDirective &x, SemanticsContext &context) {
+  if (context.ShouldWarn(common::UsageWarning::IgnoredDirective)) {
+    context.Say(x.source, "Compiler directive ignored here"_warn_en_US);
+  }
+  return std::nullopt;
 }
 
-ProgramTree ProgramTree::Build(const parser::OpenACCRoutineConstruct &) {
+std::optional<ProgramTree> ProgramTree::Build(
+    const parser::OpenACCRoutineConstruct &, SemanticsContext &) {
   DIE("ProgramTree::Build() called for OpenACCRoutineConstruct");
 }
 
diff --git a/flang/lib/Semantics/program-tree.h b/flang/lib/Semantics/program-tree.h
index d49b0405d8b1..ab00261a964a 100644
--- a/flang/lib/Semantics/program-tree.h
+++ b/flang/lib/Semantics/program-tree.h
@@ -26,6 +26,7 @@
 namespace Fortran::semantics {
 
 class Scope;
+class SemanticsContext;
 
 class ProgramTree {
 public:
@@ -34,16 +35,25 @@ public:
       std::list<common::Reference<const parser::GenericSpec>>;
 
   // Build the ProgramTree rooted at one of these program units.
-  static ProgramTree Build(const parser::ProgramUnit &);
-  static ProgramTree Build(const parser::MainProgram &);
-  static ProgramTree Build(const parser::FunctionSubprogram &);
-  static ProgramTree Build(const parser::SubroutineSubprogram &);
-  static ProgramTree Build(const parser::SeparateModuleSubprogram &);
-  static ProgramTree Build(const parser::Module &);
-  static ProgramTree Build(const parser::Submodule &);
-  static ProgramTree Build(const parser::BlockData &);
-  static ProgramTree Build(const parser::CompilerDirective &);
-  static ProgramTree Build(const parser::OpenACCRoutineConstruct &);
+  static ProgramTree Build(const parser::ProgramUnit &, SemanticsContext &);
+  static std::optional<ProgramTree> Build(
+      const parser::MainProgram &, SemanticsContext &);
+  static std::optional<ProgramTree> Build(
+      const parser::FunctionSubprogram &, SemanticsContext &);
+  static std::optional<ProgramTree> Build(
+      const parser::SubroutineSubprogram &, SemanticsContext &);
+  static std::optional<ProgramTree> Build(
+      const parser::SeparateModuleSubprogram &, SemanticsContext &);
+  static std::optional<ProgramTree> Build(
+      const parser::Module &, SemanticsContext &);
+  static std::optional<ProgramTree> Build(
+      const parser::Submodule &, SemanticsContext &);
+  static std::optional<ProgramTree> Build(
+      const parser::BlockData &, SemanticsContext &);
+  static std::optional<ProgramTree> Build(
+      const parser::CompilerDirective &, SemanticsContext &);
+  static std::optional<ProgramTree> Build(
+      const parser::OpenACCRoutineConstruct &, SemanticsContext &);
 
   ENUM_CLASS(Kind, // kind of node
       Program, Function, Subroutine, MpSubprogram, Module, Submodule, BlockData)
diff --git a/flang/lib/Semantics/resolve-labels.cpp b/flang/lib/Semantics/resolve-labels.cpp
index d04b8f3eb548..63fc2e1168b8 100644
--- a/flang/lib/Semantics/resolve-labels.cpp
+++ b/flang/lib/Semantics/resolve-labels.cpp
@@ -935,7 +935,8 @@ void CheckBranchesIntoDoBody(const SourceStmtList &branches,
       const auto &fromPosition{branch.parserCharBlock};
       const auto &toPosition{branchTarget.parserCharBlock};
       for (const auto &body : loopBodies) {
-        if (!InBody(fromPosition, body) && InBody(toPosition, body)) {
+        if (!InBody(fromPosition, body) && InBody(toPosition, body) &&
+            context.ShouldWarn(common::LanguageFeature::BranchIntoConstruct)) {
           context
               .Say(
                   fromPosition, "branch into loop body from outside"_warn_en_US)
@@ -1062,11 +1063,16 @@ void CheckScopeConstraints(const SourceStmtList &stmts,
           break;
         }
       }
-      context.Say(position,
-          isFatal
-              ? "Label '%u' is in a construct that prevents its use as a branch target here"_err_en_US
-              : "Label '%u' is in a construct that should not be used as a branch target here"_warn_en_US,
-          SayLabel(label));
+      if (isFatal) {
+        context.Say(position,
+            "Label '%u' is in a construct that prevents its use as a branch target here"_err_en_US,
+            SayLabel(label));
+      } else if (context.ShouldWarn(
+                     common::LanguageFeature::BranchIntoConstruct)) {
+        context.Say(position,
+            "Label '%u' is in a construct that should not be used as a branch target here"_warn_en_US,
+            SayLabel(label));
+      }
     }
   }
 }
@@ -1087,7 +1093,8 @@ void CheckBranchTargetConstraints(const SourceStmtList &stmts,
             .Attach(stmt.parserCharBlock, "Control flow use of '%u'"_en_US,
                 SayLabel(label));
       } else if (!branchTarget.labeledStmtClassificationSet.test(
-                     TargetStatementEnum::Branch)) { // warning
+                     TargetStatementEnum::Branch) &&
+          context.ShouldWarn(common::LanguageFeature::BadBranchTarget)) {
         context
             .Say(branchTarget.parserCharBlock,
                 "Label '%u' is not a branch target"_warn_en_US, SayLabel(label))
@@ -1140,15 +1147,21 @@ void CheckAssignTargetConstraints(const SourceStmtList &stmts,
             TargetStatementEnum::Branch) &&
         !target.labeledStmtClassificationSet.test(
             TargetStatementEnum::Format)) {
-      context
-          .Say(target.parserCharBlock,
-              target.labeledStmtClassificationSet.test(
-                  TargetStatementEnum::CompatibleBranch)
-                  ? "Label '%u' is not a branch target or FORMAT"_warn_en_US
-                  : "Label '%u' is not a branch target or FORMAT"_err_en_US,
-              SayLabel(label))
-          .Attach(stmt.parserCharBlock, "ASSIGN statement use of '%u'"_en_US,
-              SayLabel(label));
+      parser::Message *msg{nullptr};
+      if (!target.labeledStmtClassificationSet.test(
+              TargetStatementEnum::CompatibleBranch)) {
+        msg = &context.Say(target.parserCharBlock,
+            "Label '%u' is not a branch target or FORMAT"_err_en_US,
+            SayLabel(label));
+      } else if (context.ShouldWarn(common::LanguageFeature::BadBranchTarget)) {
+        msg = &context.Say(target.parserCharBlock,
+            "Label '%u' is not a branch target or FORMAT"_warn_en_US,
+            SayLabel(label));
+      }
+      if (msg) {
+        msg->Attach(stmt.parserCharBlock, "ASSIGN statement use of '%u'"_en_US,
+            SayLabel(label));
+      }
     }
   }
 }
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index b941f257a95e..61394b0f41de 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -1779,7 +1779,6 @@ void AttrsVisitor::SetBindNameOn(Symbol &symbol) {
       !symbol.attrs().test(Attr::BIND_C)) {
     return;
   }
-
   std::optional<std::string> label{
       evaluate::GetScalarConstantValue<evaluate::Ascii>(bindName_)};
   // 18.9.2(2): discard leading and trailing blanks
@@ -1798,16 +1797,18 @@ void AttrsVisitor::SetBindNameOn(Symbol &symbol) {
   } else {
     label = symbol.name().ToString();
   }
-  // Check if a symbol has two Bind names.
+  // Checks whether a symbol has two Bind names.
   std::string oldBindName;
-  if (symbol.GetBindName()) {
-    oldBindName = *symbol.GetBindName();
+  if (const auto *bindName{symbol.GetBindName()}) {
+    oldBindName = *bindName;
   }
   symbol.SetBindName(std::move(*label));
   if (!oldBindName.empty()) {
     if (const std::string * newBindName{symbol.GetBindName()}) {
       if (oldBindName != *newBindName) {
-        Say(symbol.name(), "The entity '%s' has multiple BIND names"_err_en_US);
+        Say(symbol.name(),
+            "The entity '%s' has multiple BIND names ('%s' and '%s')"_err_en_US,
+            symbol.name(), oldBindName, *newBindName);
       }
     }
   }
@@ -1838,9 +1839,11 @@ bool AttrsVisitor::Pre(const parser::Pass &x) {
 bool AttrsVisitor::IsDuplicateAttr(Attr attrName) {
   CHECK(attrs_);
   if (attrs_->test(attrName)) {
-    Say(currStmtSource().value(),
-        "Attribute '%s' cannot be used more than once"_warn_en_US,
-        AttrToString(attrName));
+    if (context().ShouldWarn(common::LanguageFeature::RedundantAttribute)) {
+      Say(currStmtSource().value(),
+          "Attribute '%s' cannot be used more than once"_warn_en_US,
+          AttrToString(attrName));
+    }
     return true;
   }
   return false;
@@ -3602,9 +3605,11 @@ void InterfaceVisitor::CheckGenericProcedures(Symbol &generic) {
   ResolveSpecificsInGeneric(generic, true);
   auto &details{generic.get<GenericDetails>()};
   if (auto *proc{details.CheckSpecific()}) {
-    Say(proc->name().begin() > generic.name().begin() ? proc->name()
-                                                      : generic.name(),
-        "'%s' should not be the name of both a generic interface and a procedure unless it is a specific procedure of the generic"_warn_en_US);
+    if (context().ShouldWarn(common::UsageWarning::HomonymousSpecific)) {
+      Say(proc->name().begin() > generic.name().begin() ? proc->name()
+                                                        : generic.name(),
+          "'%s' should not be the name of both a generic interface and a procedure unless it is a specific procedure of the generic"_warn_en_US);
+    }
   }
   auto &specifics{details.specificProcs()};
   if (specifics.empty()) {
@@ -3618,14 +3623,17 @@ void InterfaceVisitor::CheckGenericProcedures(Symbol &generic) {
   bool isBoth{false};
   for (const Symbol &specific : specifics) {
     if (isFunction != specific.test(Symbol::Flag::Function)) { // C1514
-      auto &msg{Say(generic.name(),
-          "Generic interface '%s' has both a function and a subroutine"_warn_en_US)};
-      if (isFunction) {
-        msg.Attach(firstSpecific.name(), "Function declaration"_en_US);
-        msg.Attach(specific.name(), "Subroutine declaration"_en_US);
-      } else {
-        msg.Attach(firstSpecific.name(), "Subroutine declaration"_en_US);
-        msg.Attach(specific.name(), "Function declaration"_en_US);
+      if (context().ShouldWarn(
+              common::LanguageFeature::SubroutineAndFunctionSpecifics)) {
+        auto &msg{Say(generic.name(),
+            "Generic interface '%s' has both a function and a subroutine"_warn_en_US)};
+        if (isFunction) {
+          msg.Attach(firstSpecific.name(), "Function declaration"_en_US);
+          msg.Attach(specific.name(), "Subroutine declaration"_en_US);
+        } else {
+          msg.Attach(firstSpecific.name(), "Subroutine declaration"_en_US);
+          msg.Attach(specific.name(), "Function declaration"_en_US);
+        }
       }
       isFunction = false;
       isBoth = true;
@@ -3766,9 +3774,12 @@ bool SubprogramVisitor::Pre(const parser::PrefixSpec::Attributes &attrs) {
             (*current == common::CUDASubprogramAttrs::HostDevice &&
                 (attr == common::CUDASubprogramAttrs::Host ||
                     attr == common::CUDASubprogramAttrs::Device))) {
-          Say(currStmtSource().value(),
-              "ATTRIBUTES(%s) appears more than once"_warn_en_US,
-              common::EnumToString(attr));
+          if (context().ShouldWarn(
+                  common::LanguageFeature::RedundantAttribute)) {
+            Say(currStmtSource().value(),
+                "ATTRIBUTES(%s) appears more than once"_warn_en_US,
+                common::EnumToString(attr));
+          }
         } else if ((attr == common::CUDASubprogramAttrs::Host ||
                        attr == common::CUDASubprogramAttrs::Device) &&
             (*current == common::CUDASubprogramAttrs::Host ||
@@ -3950,11 +3961,13 @@ void SubprogramVisitor::Post(const parser::FunctionStmt &stmt) {
   }
   // C1560.
   if (info.resultName && !distinctResultName) {
-    Say(info.resultName->source,
-        "The function name should not appear in RESULT; references to '%s' "
-        "inside the function will be considered as references to the "
-        "result only"_warn_en_US,
-        name.source);
+    if (context().ShouldWarn(common::UsageWarning::HomonymousResult)) {
+      Say(info.resultName->source,
+          "The function name should not appear in RESULT; references to '%s' "
+          "inside the function will be considered as references to the "
+          "result only"_warn_en_US,
+          name.source);
+    }
     // RESULT name was ignored above, the only side effect from doing so will be
     // the inability to make recursive calls. The related parser::Name is still
     // resolved to the created function result symbol because every parser::Name
@@ -4368,8 +4381,10 @@ bool SubprogramVisitor::HandlePreviousCalls(
       if (symbol.attrs().test(Attr::EXTERNAL) &&
           !symbol.implicitAttrs().test(Attr::EXTERNAL)) {
         // Warn if external statement previously declared.
-        Say(name,
-            "EXTERNAL attribute was already specified on '%s'"_warn_en_US);
+        if (context().ShouldWarn(common::LanguageFeature::RedundantAttribute)) {
+          Say(name,
+              "EXTERNAL attribute was already specified on '%s'"_warn_en_US);
+        }
       } else if (symbol.test(other)) {
         Say2(name,
             subpFlag == Symbol::Flag::Function
@@ -4819,8 +4834,11 @@ bool DeclarationVisitor::Pre(const parser::ExternalStmt &x) {
       if (auto *details{symbol->detailsIf<SubprogramDetails>()}) {
         if (details->isInterface()) {
           // Warn if interface previously declared.
-          Say(name,
-              "EXTERNAL attribute was already specified on '%s'"_warn_en_US);
+          if (context().ShouldWarn(
+                  common::LanguageFeature::RedundantAttribute)) {
+            Say(name,
+                "EXTERNAL attribute was already specified on '%s'"_warn_en_US);
+          }
         }
       } else {
         SayWithDecl(
@@ -4865,12 +4883,15 @@ void DeclarationVisitor::DeclareIntrinsic(const parser::Name &name) {
     if (symbol.GetType()) {
       // These warnings are worded so that they should make sense in either
       // order.
-      Say(symbol.name(),
-          "Explicit type declaration ignored for intrinsic function '%s'"_warn_en_US,
-          symbol.name())
-          .Attach(name.source,
-              "INTRINSIC statement for explicitly-typed '%s'"_en_US,
-              name.source);
+      if (context().ShouldWarn(
+              common::UsageWarning::IgnoredIntrinsicFunctionType)) {
+        Say(symbol.name(),
+            "Explicit type declaration ignored for intrinsic function '%s'"_warn_en_US,
+            symbol.name())
+            .Attach(name.source,
+                "INTRINSIC statement for explicitly-typed '%s'"_en_US,
+                name.source);
+      }
     }
     if (!symbol.test(Symbol::Flag::Function) &&
         !symbol.test(Symbol::Flag::Subroutine)) {
@@ -4936,9 +4957,11 @@ Symbol &DeclarationVisitor::HandleAttributeStmt(
     }
   } else if (symbol && symbol->has<UseDetails>()) {
     if (symbol->GetUltimate().attrs().test(attr)) {
-      Say(currStmtSource().value(),
-          "Use-associated '%s' already has '%s' attribute"_warn_en_US,
-          name.source, EnumToString(attr));
+      if (context().ShouldWarn(common::LanguageFeature::RedundantAttribute)) {
+        Say(currStmtSource().value(),
+            "Use-associated '%s' already has '%s' attribute"_warn_en_US,
+            name.source, EnumToString(attr));
+      }
     } else {
       Say(currStmtSource().value(),
           "Cannot change %s attribute on use-associated '%s'"_err_en_US,
@@ -4986,7 +5009,9 @@ Symbol &DeclarationVisitor::DeclareUnknownEntity(
     if (symbol.attrs().test(Attr::EXTERNAL)) {
       ConvertToProcEntity(symbol);
     }
-    SetBindNameOn(symbol);
+    if (attrs.test(Attr::BIND_C)) {
+      SetBindNameOn(symbol);
+    }
     return symbol;
   }
 }
@@ -5067,8 +5092,10 @@ Symbol &DeclarationVisitor::DeclareObjectEntity(
           context().SetError(symbol);
         }
       } else if (MustBeScalar(symbol)) {
-        Say(name,
-            "'%s' appeared earlier as a scalar actual argument to a specification function"_warn_en_US);
+        if (context().ShouldWarn(common::UsageWarning::PreviousScalarUse)) {
+          Say(name,
+              "'%s' appeared earlier as a scalar actual argument to a specification function"_warn_en_US);
+        }
       } else if (details->init() || symbol.test(Symbol::Flag::InDataStmt)) {
         Say(name, "'%s' was initialized earlier as a scalar"_err_en_US);
       } else {
@@ -5446,8 +5473,10 @@ bool DeclarationVisitor::Pre(const parser::DerivedTypeDef &x) {
     details.set_sequence(true);
     if (componentDefs.empty()) {
       // F'2023 C745 - not enforced by any compiler
-      Say(stmt.source,
-          "A sequence type should have at least one component"_warn_en_US);
+      if (context().ShouldWarn(common::LanguageFeature::EmptySequenceType)) {
+        Say(stmt.source,
+            "A sequence type should have at least one component"_warn_en_US);
+      }
     }
     if (!details.paramNames().empty()) { // C740
       Say(stmt.source,
@@ -5551,13 +5580,17 @@ bool DeclarationVisitor::Pre(const parser::PrivateStmt &) {
   } else if (!derivedTypeInfo_.privateComps) {
     derivedTypeInfo_.privateComps = true;
   } else { // C738
-    Say("PRIVATE should not appear more than once in derived type components"_warn_en_US);
+    if (context().ShouldWarn(common::LanguageFeature::RedundantAttribute)) {
+      Say("PRIVATE should not appear more than once in derived type components"_warn_en_US);
+    }
   }
   return false;
 }
 bool DeclarationVisitor::Pre(const parser::SequenceStmt &) {
   if (derivedTypeInfo_.sequence) { // C738
-    Say("SEQUENCE should not appear more than once in derived type components"_warn_en_US);
+    if (context().ShouldWarn(common::LanguageFeature::RedundantAttribute)) {
+      Say("SEQUENCE should not appear more than once in derived type components"_warn_en_US);
+    }
   }
   derivedTypeInfo_.sequence = true;
   return false;
@@ -6081,7 +6114,9 @@ void DeclarationVisitor::Post(const parser::BasedPointer &bp) {
     }
     if (const auto *pointeeType{pointee->GetType()}) {
       if (const auto *derived{pointeeType->AsDerived()}) {
-        if (!IsSequenceOrBindCType(derived)) {
+        if (!IsSequenceOrBindCType(derived) &&
+            context().ShouldWarn(
+                common::LanguageFeature::NonSequenceCrayPointee)) {
           Say(pointeeName,
               "Type of Cray pointee '%s' is a derived type that is neither SEQUENCE nor BIND(C)"_warn_en_US);
         }
@@ -6229,9 +6264,11 @@ void DeclarationVisitor::CheckSaveStmts() {
       // error was reported
     } else if (specPartState_.saveInfo.saveAll) {
       // C889 - note that pgi, ifort, xlf do not enforce this constraint
-      Say2(name,
-          "Explicit SAVE of '%s' is redundant due to global SAVE statement"_warn_en_US,
-          *specPartState_.saveInfo.saveAll, "Global SAVE statement"_en_US);
+      if (context().ShouldWarn(common::LanguageFeature::RedundantAttribute)) {
+        Say2(name,
+            "Explicit SAVE of '%s' is redundant due to global SAVE statement"_warn_en_US,
+            *specPartState_.saveInfo.saveAll, "Global SAVE statement"_en_US);
+      }
     } else if (!IsSaved(*symbol)) {
       SetExplicitAttr(*symbol, Attr::SAVE);
     }
@@ -6273,7 +6310,8 @@ Attrs DeclarationVisitor::HandleSaveName(const SourceName &name, Attrs attrs) {
 void DeclarationVisitor::AddSaveName(
     std::set<SourceName> &set, const SourceName &name) {
   auto pair{set.insert(name)};
-  if (!pair.second) {
+  if (!pair.second &&
+      context().ShouldWarn(common::LanguageFeature::RedundantAttribute)) {
     Say2(name, "SAVE attribute was already specified on '%s'"_warn_en_US,
         *pair.first, "Previous specification of SAVE attribute"_en_US);
   }
@@ -6725,8 +6763,11 @@ bool DeclarationVisitor::OkToAddComponent(
               " '%s'"_err_en_US;
       } else if (CheckAccessibleSymbol(currScope(), *prev)) {
         // inaccessible component -- redeclaration is ok
-        msg = "Component '%s' is inaccessibly declared in or as a "
-              "parent of this derived type"_warn_en_US;
+        if (context().ShouldWarn(
+                common::UsageWarning::RedeclaredInaccessibleComponent)) {
+          msg =
+              "Component '%s' is inaccessibly declared in or as a parent of this derived type"_warn_en_US;
+        }
       } else if (prev->test(Symbol::Flag::ParentComp)) {
         msg = "'%s' is a parent type of this type and so cannot be"
               " a component"_err_en_US;
@@ -6858,8 +6899,10 @@ bool ConstructVisitor::Pre(const parser::LocalitySpec::LocalInit &x) {
 bool ConstructVisitor::Pre(const parser::LocalitySpec::Shared &x) {
   for (const auto &name : x.v) {
     if (!FindSymbol(name)) {
-      Say(name,
-          "Variable '%s' with SHARED locality implicitly declared"_warn_en_US);
+      if (context().ShouldWarn(common::UsageWarning::ImplicitShared)) {
+        Say(name,
+            "Variable '%s' with SHARED locality implicitly declared"_warn_en_US);
+      }
     }
     Symbol &prev{FindOrDeclareEnclosingEntity(name)};
     if (PassesSharedLocalityChecks(name, prev)) {
@@ -8321,12 +8364,16 @@ Symbol &ModuleVisitor::SetAccess(
   Attrs &attrs{symbol->attrs()};
   if (attrs.HasAny({Attr::PUBLIC, Attr::PRIVATE})) {
     // PUBLIC/PRIVATE already set: make it a fatal error if it changed
-    Attr prev = attrs.test(Attr::PUBLIC) ? Attr::PUBLIC : Attr::PRIVATE;
-    Say(name,
-        WithSeverity(
-            "The accessibility of '%s' has already been specified as %s"_warn_en_US,
-            attr != prev ? parser::Severity::Error : parser::Severity::Warning),
-        MakeOpName(name), EnumToString(prev));
+    Attr prev{attrs.test(Attr::PUBLIC) ? Attr::PUBLIC : Attr::PRIVATE};
+    if (attr != prev ||
+        context().ShouldWarn(common::LanguageFeature::RedundantAttribute)) {
+      Say(name,
+          WithSeverity(
+              "The accessibility of '%s' has already been specified as %s"_warn_en_US,
+              attr != prev ? parser::Severity::Error
+                           : parser::Severity::Warning),
+          MakeOpName(name), EnumToString(prev));
+    }
   } else {
     attrs.set(attr);
   }
@@ -8885,8 +8932,8 @@ void ResolveNamesVisitor::Post(const parser::CompilerDirective &x) {
         }
       }
     }
-  } else {
-    Say(x.source, "Compiler directive was ignored"_warn_en_US);
+  } else if (context().ShouldWarn(common::UsageWarning::IgnoredDirective)) {
+    Say(x.source, "Unrecognized compiler directive was ignored"_warn_en_US);
   }
 }
 
@@ -8901,7 +8948,7 @@ bool ResolveNamesVisitor::Pre(const parser::ProgramUnit &x) {
     ResolveAccParts(context(), x, &topScope_);
     return false;
   }
-  auto root{ProgramTree::Build(x)};
+  auto root{ProgramTree::Build(x, context())};
   SetScope(topScope_);
   ResolveSpecificationParts(root);
   FinishSpecificationParts(root);
diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp
index e58a8f3b22c0..6ccd915c4dcb 100644
--- a/flang/lib/Semantics/semantics.cpp
+++ b/flang/lib/Semantics/semantics.cpp
@@ -443,8 +443,10 @@ void SemanticsContext::CheckIndexVarRedefine(const parser::CharBlock &location,
 
 void SemanticsContext::WarnIndexVarRedefine(
     const parser::CharBlock &location, const Symbol &variable) {
-  CheckIndexVarRedefine(location, variable,
-      "Possible redefinition of %s variable '%s'"_warn_en_US);
+  if (ShouldWarn(common::UsageWarning::IndexVarRedefinition)) {
+    CheckIndexVarRedefine(location, variable,
+        "Possible redefinition of %s variable '%s'"_warn_en_US);
+  }
 }
 
 void SemanticsContext::CheckIndexVarRedefine(
@@ -593,7 +595,10 @@ bool Semantics::Perform() {
       ModFileWriter{context_}.WriteAll();
 }
 
-void Semantics::EmitMessages(llvm::raw_ostream &os) const {
+void Semantics::EmitMessages(llvm::raw_ostream &os) {
+  // Resolve the CharBlock locations of the Messages to ProvenanceRanges
+  // so messages from parsing and semantics are intermixed in source order.
+  context_.messages().ResolveProvenances(context_.allCookedSources());
   context_.messages().Emit(os, context_.allCookedSources());
 }
 
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index df435906af68..2d0caff82eb2 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -1485,45 +1485,45 @@ const Symbol *IsFunctionResultWithSameNameAsFunction(const Symbol &symbol) {
 }
 
 void LabelEnforce::Post(const parser::GotoStmt &gotoStmt) {
-  checkLabelUse(gotoStmt.v);
+  CheckLabelUse(gotoStmt.v);
 }
 void LabelEnforce::Post(const parser::ComputedGotoStmt &computedGotoStmt) {
   for (auto &i : std::get<std::list<parser::Label>>(computedGotoStmt.t)) {
-    checkLabelUse(i);
+    CheckLabelUse(i);
   }
 }
 
 void LabelEnforce::Post(const parser::ArithmeticIfStmt &arithmeticIfStmt) {
-  checkLabelUse(std::get<1>(arithmeticIfStmt.t));
-  checkLabelUse(std::get<2>(arithmeticIfStmt.t));
-  checkLabelUse(std::get<3>(arithmeticIfStmt.t));
+  CheckLabelUse(std::get<1>(arithmeticIfStmt.t));
+  CheckLabelUse(std::get<2>(arithmeticIfStmt.t));
+  CheckLabelUse(std::get<3>(arithmeticIfStmt.t));
 }
 
 void LabelEnforce::Post(const parser::AssignStmt &assignStmt) {
-  checkLabelUse(std::get<parser::Label>(assignStmt.t));
+  CheckLabelUse(std::get<parser::Label>(assignStmt.t));
 }
 
 void LabelEnforce::Post(const parser::AssignedGotoStmt &assignedGotoStmt) {
   for (auto &i : std::get<std::list<parser::Label>>(assignedGotoStmt.t)) {
-    checkLabelUse(i);
+    CheckLabelUse(i);
   }
 }
 
 void LabelEnforce::Post(const parser::AltReturnSpec &altReturnSpec) {
-  checkLabelUse(altReturnSpec.v);
+  CheckLabelUse(altReturnSpec.v);
 }
 
 void LabelEnforce::Post(const parser::ErrLabel &errLabel) {
-  checkLabelUse(errLabel.v);
+  CheckLabelUse(errLabel.v);
 }
 void LabelEnforce::Post(const parser::EndLabel &endLabel) {
-  checkLabelUse(endLabel.v);
+  CheckLabelUse(endLabel.v);
 }
 void LabelEnforce::Post(const parser::EorLabel &eorLabel) {
-  checkLabelUse(eorLabel.v);
+  CheckLabelUse(eorLabel.v);
 }
 
-void LabelEnforce::checkLabelUse(const parser::Label &labelUsed) {
+void LabelEnforce::CheckLabelUse(const parser::Label &labelUsed) {
   if (labels_.find(labelUsed) == labels_.end()) {
     SayWithConstruct(context_, currentStatementSourcePosition_,
         parser::MessageFormattedText{
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index bdd0e07bbfd4..bc81e1b1887b 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -103,7 +103,6 @@ append(${NO_LTO_FLAGS} CMAKE_CXX_FLAGS)
 add_definitions(-U_GLIBCXX_ASSERTIONS)
 add_definitions(-U_LIBCPP_ENABLE_ASSERTIONS)
 
-add_subdirectory(FortranMain)
 add_subdirectory(Float128Math)
 
 set(sources
@@ -193,6 +192,7 @@ set(supported_files
   environment.cpp
   extrema.cpp
   external-unit.cpp
+  file.cpp
   findloc.cpp
   format.cpp
   inquiry.cpp
diff --git a/flang/runtime/FortranMain/CMakeLists.txt b/flang/runtime/FortranMain/CMakeLists.txt
deleted file mode 100644
index deb7bd10acf5..000000000000
--- a/flang/runtime/FortranMain/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-add_flang_library(Fortran_main STATIC INSTALL_WITH_TOOLCHAIN
-  Fortran_main.c
-)
-if (DEFINED MSVC)
-  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
-  add_flang_library(Fortran_main.static STATIC INSTALL_WITH_TOOLCHAIN
-    Fortran_main.c
-  )
-  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL)
-  add_flang_library(Fortran_main.dynamic STATIC INSTALL_WITH_TOOLCHAIN
-    Fortran_main.c
-  )
-  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDebug)
-  add_flang_library(Fortran_main.static_dbg STATIC INSTALL_WITH_TOOLCHAIN
-    Fortran_main.c
-  )
-  set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDebugDLL)
-  add_flang_library(Fortran_main.dynamic_dbg STATIC INSTALL_WITH_TOOLCHAIN
-    Fortran_main.c
-  )
-  add_dependencies(Fortran_main Fortran_main.static Fortran_main.dynamic
-    Fortran_main.static_dbg Fortran_main.dynamic_dbg)
-endif()
diff --git a/flang/runtime/FortranMain/Fortran_main.c b/flang/runtime/FortranMain/Fortran_main.c
deleted file mode 100644
index 5d3eaced001e..000000000000
--- a/flang/runtime/FortranMain/Fortran_main.c
+++ /dev/null
@@ -1,23 +0,0 @@
-//===-- runtime/FortranMain/Fortran_main.c --------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "flang/Runtime/main.h"
-#include "flang/Runtime/stop.h"
-
-/* main entry into PROGRAM */
-void _QQmain(void);
-
-extern const struct EnvironmentDefaultList *_QQEnvironmentDefaults;
-
-/* C main stub */
-int main(int argc, const char *argv[], const char *envp[]) {
-  RTNAME(ProgramStart)(argc, argv, envp, _QQEnvironmentDefaults);
-  _QQmain();
-  RTNAME(ProgramEndStatement)();
-  return 0;
-}
diff --git a/flang/runtime/edit-output.cpp b/flang/runtime/edit-output.cpp
index a06ed258f0f1..13ab91fc56ea 100644
--- a/flang/runtime/edit-output.cpp
+++ b/flang/runtime/edit-output.cpp
@@ -446,6 +446,7 @@ RT_API_ATTRS bool RealOutputEditing<KIND>::EditFOutput(const DataEdit &edit) {
       fracDigits = sizeof buffer_ - 2; // sign & NUL
     }
   }
+  bool emitTrailingZeroes{!(flags & decimal::Minimize)};
   // Multiple conversions may be needed to get the right number of
   // effective rounded fractional digits.
   bool canIncrease{true};
@@ -526,11 +527,18 @@ RT_API_ATTRS bool RealOutputEditing<KIND>::EditFOutput(const DataEdit &edit) {
     }
     int digitsBeforePoint{std::max(0, std::min(expo, convertedDigits))};
     int zeroesBeforePoint{std::max(0, expo - digitsBeforePoint)};
+    if (zeroesBeforePoint > 0 && (flags & decimal::Minimize)) {
+      // If a minimized result looks like an integer, emit all of
+      // its digits rather than clipping some to zeroes.
+      // This can happen with HUGE(0._2) == 65504._2.
+      flags &= ~decimal::Minimize;
+      continue;
+    }
     int zeroesAfterPoint{std::min(fracDigits, std::max(0, -expo))};
     int digitsAfterPoint{convertedDigits - digitsBeforePoint};
-    int trailingZeroes{flags & decimal::Minimize
-            ? 0
-            : std::max(0, fracDigits - (zeroesAfterPoint + digitsAfterPoint))};
+    int trailingZeroes{emitTrailingZeroes
+            ? std::max(0, fracDigits - (zeroesAfterPoint + digitsAfterPoint))
+            : 0};
     if (digitsBeforePoint + zeroesBeforePoint + zeroesAfterPoint +
             digitsAfterPoint + trailingZeroes ==
         0) {
@@ -822,6 +830,11 @@ RT_API_ATTRS bool EditLogicalOutput(
   case 'Z':
     return EditBOZOutput<4>(io, edit,
         reinterpret_cast<const unsigned char *>(&truth), sizeof truth);
+  case 'A': { // legacy extension
+    int truthBits{truth};
+    return EditCharacterOutput(
+        io, edit, reinterpret_cast<char *>(&truthBits), sizeof truthBits);
+  }
   default:
     io.GetIoErrorHandler().SignalError(IostatErrorInFormat,
         "Data edit descriptor '%c' may not be used with a LOGICAL data item",
diff --git a/flang/runtime/environment.cpp b/flang/runtime/environment.cpp
index b2c9665a28df..52b1d99ba536 100644
--- a/flang/runtime/environment.cpp
+++ b/flang/runtime/environment.cpp
@@ -23,9 +23,11 @@ extern char **environ;
 
 namespace Fortran::runtime {
 
+#ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
 RT_OFFLOAD_VAR_GROUP_BEGIN
 RT_VAR_ATTRS ExecutionEnvironment executionEnvironment;
 RT_OFFLOAD_VAR_GROUP_END
+#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
 
 static void SetEnvironmentDefaults(const EnvironmentDefaultList *envDefaults) {
   if (!envDefaults) {
diff --git a/flang/runtime/file.cpp b/flang/runtime/file.cpp
index acd5d33d4bb8..79db17e70acd 100644
--- a/flang/runtime/file.cpp
+++ b/flang/runtime/file.cpp
@@ -457,22 +457,22 @@ std::int64_t SizeInBytes(const char *path) {
   return -1;
 }
 #else // defined(RT_DEVICE_COMPILATION)
-bool IsATerminal(int fd) {
+RT_API_ATTRS bool IsATerminal(int fd) {
   Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
 }
-bool IsExtant(const char *path) {
+RT_API_ATTRS bool IsExtant(const char *path) {
   Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
 }
-bool MayRead(const char *path) {
+RT_API_ATTRS bool MayRead(const char *path) {
   Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
 }
-bool MayWrite(const char *path) {
+RT_API_ATTRS bool MayWrite(const char *path) {
   Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
 }
-bool MayReadAndWrite(const char *path) {
+RT_API_ATTRS bool MayReadAndWrite(const char *path) {
   Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
 }
-std::int64_t SizeInBytes(const char *path) {
+RT_API_ATTRS std::int64_t SizeInBytes(const char *path) {
   Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
 }
 #endif // defined(RT_DEVICE_COMPILATION)
diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp
index b9eed2101ecf..af092de70f78 100644
--- a/flang/runtime/namelist.cpp
+++ b/flang/runtime/namelist.cpp
@@ -596,7 +596,7 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
   return true;
 }
 
-bool IsNamelistNameOrSlash(IoStatementState &io) {
+RT_API_ATTRS bool IsNamelistNameOrSlash(IoStatementState &io) {
   if (auto *listInput{
           io.get_if<ListDirectedStatementState<Direction::Input>>()}) {
     if (listInput->inNamelistSequence()) {
diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp
index abd3e500029f..52b5a56894d8 100644
--- a/flang/runtime/numeric.cpp
+++ b/flang/runtime/numeric.cpp
@@ -9,6 +9,7 @@
 #include "flang/Runtime/numeric.h"
 #include "numeric-templates.h"
 #include "terminator.h"
+#include "tools.h"
 #include "flang/Common/float128.h"
 #include <cfloat>
 #include <climits>
@@ -18,30 +19,30 @@
 namespace Fortran::runtime {
 
 template <typename RES>
-inline RT_API_ATTRS RES getIntArgValue(const char *source, int line, void *arg,
-    int kind, std::int64_t defaultValue, int resKind) {
+inline RT_API_ATTRS RES GetIntArgValue(const char *source, int line,
+    const void *arg, int kind, std::int64_t defaultValue, int resKind) {
   RES res;
   if (!arg) {
     res = static_cast<RES>(defaultValue);
   } else if (kind == 1) {
     res = static_cast<RES>(
-        *static_cast<CppTypeFor<TypeCategory::Integer, 1> *>(arg));
+        *static_cast<const CppTypeFor<TypeCategory::Integer, 1> *>(arg));
   } else if (kind == 2) {
     res = static_cast<RES>(
-        *static_cast<CppTypeFor<TypeCategory::Integer, 2> *>(arg));
+        *static_cast<const CppTypeFor<TypeCategory::Integer, 2> *>(arg));
   } else if (kind == 4) {
     res = static_cast<RES>(
-        *static_cast<CppTypeFor<TypeCategory::Integer, 4> *>(arg));
+        *static_cast<const CppTypeFor<TypeCategory::Integer, 4> *>(arg));
   } else if (kind == 8) {
     res = static_cast<RES>(
-        *static_cast<CppTypeFor<TypeCategory::Integer, 8> *>(arg));
+        *static_cast<const CppTypeFor<TypeCategory::Integer, 8> *>(arg));
 #ifdef __SIZEOF_INT128__
   } else if (kind == 16) {
     if (resKind != 16) {
       Terminator{source, line}.Crash("Unexpected integer kind in runtime");
     }
     res = static_cast<RES>(
-        *static_cast<CppTypeFor<TypeCategory::Integer, 16> *>(arg));
+        *static_cast<const CppTypeFor<TypeCategory::Integer, 16> *>(arg));
 #endif
   } else {
     Terminator{source, line}.Crash("Unexpected integer kind in runtime");
@@ -112,6 +113,22 @@ inline RT_API_ATTRS CppTypeFor<TypeCategory::Integer, 4> SelectedIntKind(T x) {
   return -1;
 }
 
+// SELECTED_LOGICAL_KIND (F'2023 16.9.182)
+template <typename T>
+inline RT_API_ATTRS CppTypeFor<TypeCategory::Integer, 4> SelectedLogicalKind(
+    T x) {
+  if (x <= 2) {
+    return 1;
+  } else if (x <= 4) {
+    return 2;
+  } else if (x <= 9) {
+    return 4;
+  } else if (x <= 18) {
+    return 8;
+  }
+  return -1;
+}
+
 // SELECTED_REAL_KIND (16.9.170)
 template <typename P, typename R, typename D>
 inline RT_API_ATTRS CppTypeFor<TypeCategory::Integer, 4> SelectedRealKind(
@@ -717,40 +734,72 @@ CppTypeFor<TypeCategory::Real, 10> RTDEF(Scale10)(
 }
 #endif
 
+// SELECTED_CHAR_KIND
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(SelectedCharKind)(
+    const char *source, int line, const char *x, std::size_t length) {
+  static const char *keywords[]{
+      "ASCII", "DEFAULT", "UCS-2", "ISO_10646", "UCS-4", nullptr};
+  switch (IdentifyValue(x, length, keywords)) {
+  case 0: // ASCII
+  case 1: // DEFAULT
+    return 1;
+  case 2: // UCS-2
+    return 2;
+  case 3: // ISO_10646
+  case 4: // UCS-4
+    return 4;
+  default:
+    return -1;
+  }
+}
 // SELECTED_INT_KIND
 CppTypeFor<TypeCategory::Integer, 4> RTDEF(SelectedIntKind)(
     const char *source, int line, void *x, int xKind) {
 #ifdef __SIZEOF_INT128__
   CppTypeFor<TypeCategory::Integer, 16> r =
-      getIntArgValue<CppTypeFor<TypeCategory::Integer, 16>>(
+      GetIntArgValue<CppTypeFor<TypeCategory::Integer, 16>>(
           source, line, x, xKind, /*defaultValue*/ 0, /*resKind*/ 16);
 #else
-  std::int64_t r = getIntArgValue<std::int64_t>(
+  std::int64_t r = GetIntArgValue<std::int64_t>(
       source, line, x, xKind, /*defaultValue*/ 0, /*resKind*/ 8);
 #endif
   return SelectedIntKind(r);
 }
 
+// SELECTED_LOGICAL_KIND
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(SelectedLogicalKind)(
+    const char *source, int line, void *x, int xKind) {
+#ifdef __SIZEOF_INT128__
+  CppTypeFor<TypeCategory::Integer, 16> r =
+      GetIntArgValue<CppTypeFor<TypeCategory::Integer, 16>>(
+          source, line, x, xKind, /*defaultValue*/ 0, /*resKind*/ 16);
+#else
+  std::int64_t r = GetIntArgValue<std::int64_t>(
+      source, line, x, xKind, /*defaultValue*/ 0, /*resKind*/ 8);
+#endif
+  return SelectedLogicalKind(r);
+}
+
 // SELECTED_REAL_KIND
 CppTypeFor<TypeCategory::Integer, 4> RTDEF(SelectedRealKind)(const char *source,
     int line, void *precision, int pKind, void *range, int rKind, void *radix,
     int dKind) {
 #ifdef __SIZEOF_INT128__
   CppTypeFor<TypeCategory::Integer, 16> p =
-      getIntArgValue<CppTypeFor<TypeCategory::Integer, 16>>(
+      GetIntArgValue<CppTypeFor<TypeCategory::Integer, 16>>(
           source, line, precision, pKind, /*defaultValue*/ 0, /*resKind*/ 16);
   CppTypeFor<TypeCategory::Integer, 16> r =
-      getIntArgValue<CppTypeFor<TypeCategory::Integer, 16>>(
+      GetIntArgValue<CppTypeFor<TypeCategory::Integer, 16>>(
           source, line, range, rKind, /*defaultValue*/ 0, /*resKind*/ 16);
   CppTypeFor<TypeCategory::Integer, 16> d =
-      getIntArgValue<CppTypeFor<TypeCategory::Integer, 16>>(
+      GetIntArgValue<CppTypeFor<TypeCategory::Integer, 16>>(
           source, line, radix, dKind, /*defaultValue*/ 2, /*resKind*/ 16);
 #else
-  std::int64_t p = getIntArgValue<std::int64_t>(
+  std::int64_t p = GetIntArgValue<std::int64_t>(
       source, line, precision, pKind, /*defaultValue*/ 0, /*resKind*/ 8);
-  std::int64_t r = getIntArgValue<std::int64_t>(
+  std::int64_t r = GetIntArgValue<std::int64_t>(
       source, line, range, rKind, /*defaultValue*/ 0, /*resKind*/ 8);
-  std::int64_t d = getIntArgValue<std::int64_t>(
+  std::int64_t d = GetIntArgValue<std::int64_t>(
       source, line, radix, dKind, /*defaultValue*/ 2, /*resKind*/ 8);
 #endif
   return SelectedRealKind(p, r, d);
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 0e38cffdf907..3b42f45d5588 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -19,11 +19,13 @@
 
 namespace Fortran::runtime::io {
 
+#ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
 RT_OFFLOAD_VAR_GROUP_BEGIN
 RT_VAR_ATTRS ExternalFileUnit *defaultInput{nullptr}; // unit 5
 RT_VAR_ATTRS ExternalFileUnit *defaultOutput{nullptr}; // unit 6
 RT_VAR_ATTRS ExternalFileUnit *errorOutput{nullptr}; // unit 0 extension
 RT_OFFLOAD_VAR_GROUP_END
+#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
 
 RT_OFFLOAD_API_GROUP_BEGIN
 
diff --git a/flang/runtime/utf.cpp b/flang/runtime/utf.cpp
index 9945dc6509ec..f4b38d5225ce 100644
--- a/flang/runtime/utf.cpp
+++ b/flang/runtime/utf.cpp
@@ -10,6 +10,7 @@
 
 namespace Fortran::runtime {
 
+#ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
 // clang-format off
 RT_OFFLOAD_VAR_GROUP_BEGIN
 const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{
@@ -40,6 +41,7 @@ const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{
 };
 RT_OFFLOAD_VAR_GROUP_END
 // clang-format on
+#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
 
 RT_OFFLOAD_API_GROUP_BEGIN
 // Non-minimal encodings are accepted.
diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt
index 7d96a72e5f36..7e036ad539df 100644
--- a/flang/test/CMakeLists.txt
+++ b/flang/test/CMakeLists.txt
@@ -62,7 +62,6 @@ set(FLANG_TEST_DEPENDS
   llvm-readobj
   split-file
   FortranRuntime
-  Fortran_main
   FortranDecimal
 )
 if (LLVM_ENABLE_PLUGINS AND NOT WIN32)
diff --git a/flang/test/Driver/bbc-mlir-pass-pipeline.f90 b/flang/test/Driver/bbc-mlir-pass-pipeline.f90
index 2ee832e3c57a..2cc25b3c473f 100644
--- a/flang/test/Driver/bbc-mlir-pass-pipeline.f90
+++ b/flang/test/Driver/bbc-mlir-pass-pipeline.f90
@@ -17,9 +17,16 @@ end program
 ! CHECK-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 ! CHECK-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 
+! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+! CHECK-NEXT: 'fir.global' Pipeline
+! CHECK-NEXT:   CharacterConversion
 ! CHECK-NEXT: 'func.func' Pipeline
 ! CHECK-NEXT:   ArrayValueCopy
 ! CHECK-NEXT:   CharacterConversion
+! CHECK-NEXT: 'omp.declare_reduction' Pipeline
+! CHECK-NEXT:   CharacterConversion
+! CHECK-NEXT: 'omp.private' Pipeline
+! CHECK-NEXT:   CharacterConversion
 
 ! CHECK-NEXT: Canonicalizer
 ! CHECK-NEXT: SimplifyRegionLite
@@ -38,14 +45,17 @@ end program
 ! CHECK-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 ! CHECK-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 
-! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! CHECK-NEXT: PolymorphicOpConversion
+
+! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
 ! CHECK-NEXT: 'fir.global' Pipeline
 ! CHECK-NEXT:   CFGConversion
 ! CHECK-NEXT: 'func.func' Pipeline
-! CHECK-NEXT:   PolymorphicOpConversion
 ! CHECK-NEXT:   CFGConversion
 ! CHECK-NEXT: 'omp.declare_reduction' Pipeline
 ! CHECK-NEXT:   CFGConversion
+! CHECK-NEXT: 'omp.private' Pipeline
+! CHECK-NEXT:   CFGConversion
 
 ! CHECK-NEXT: SCFToControlFlow
 ! CHECK-NEXT: Canonicalizer
diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90
index b5bb0f1c1b25..706b2cb6c245 100644
--- a/flang/test/Driver/driver-help-hidden.f90
+++ b/flang/test/Driver/driver-help-hidden.f90
@@ -63,7 +63,6 @@
 ! CHECK-NEXT:                         Select Windows run-time library
 ! CHECK-NEXT: -fno-automatic          Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE
 ! CHECK-NEXT: -fno-color-diagnostics  Disable colors in diagnostics
-! CHECK-NEXT: -fno-fortran-main       Do not include Fortran_main.a (provided by Flang) when linking
 ! CHECK-NEXT: -fno-integrated-as      Disable the integrated assembler
 ! CHECK-NEXT: -fno-lto                Disable LTO mode (default)
 ! CHECK-NEXT: -fno-ppc-native-vector-element-order
diff --git a/flang/test/Driver/driver-help.f90 b/flang/test/Driver/driver-help.f90
index 0b0a493baf07..4c3609db80b9 100644
--- a/flang/test/Driver/driver-help.f90
+++ b/flang/test/Driver/driver-help.f90
@@ -1,298 +1,15 @@
-
-!--------------------------
-! FLANG DRIVER (flang)
-!--------------------------
 ! RUN: %flang -help 2>&1 | FileCheck %s --check-prefix=HELP
 ! RUN: not %flang -helps 2>&1 | FileCheck %s --check-prefix=ERROR
 
-!----------------------------------------
-! FLANG FRONTEND DRIVER (flang -fc1)
-!----------------------------------------
 ! RUN: %flang_fc1 -help 2>&1 | FileCheck %s --check-prefix=HELP-FC1
 ! RUN: not %flang_fc1 -helps 2>&1 | FileCheck %s --check-prefix=ERROR
 
 ! HELP:USAGE: flang
 ! HELP-EMPTY:
 ! HELP-NEXT:OPTIONS:
-! HELP-NEXT: -###                    Print (but do not run) the commands to run for this compilation
-! HELP-NEXT: -cpp                    Enable predefined and command line preprocessor macros
-! HELP-NEXT: -c                      Only run preprocess, compile, and assemble steps
-! HELP-NEXT: -dM                     Print macro definitions in -E mode instead of normal output
-! HELP-NEXT: -dumpmachine            Display the compiler's target processor
-! HELP-NEXT: -dumpversion            Display the version of the compiler
-! HELP-NEXT: -D <macro>=<value>      Define <macro> to <value> (or 1 if <value> omitted)
-! HELP-NEXT: -emit-llvm              Use the LLVM representation for assembler and object files
-! HELP-NEXT: -E                      Only run the preprocessor
-! HELP-NEXT: -falternative-parameter-statement
-! HELP-NEXT:                         Enable the old style PARAMETER statement
-! HELP-NEXT: -fapprox-func           Allow certain math function calls to be replaced with an approximately equivalent calculation
-! HELP-NEXT: -fbackslash             Specify that backslash in string introduces an escape character
-! HELP-NEXT: -fcolor-diagnostics     Enable colors in diagnostics
-! HELP-NEXT: -fconvert=<value>       Set endian conversion of data for unformatted files
-! HELP-NEXT: -fdefault-double-8      Set the default double precision kind to an 8 byte wide type
-! HELP-NEXT: -fdefault-integer-8     Set the default integer and logical kind to an 8 byte wide type
-! HELP-NEXT: -fdefault-real-8        Set the default real kind to an 8 byte wide type
-! HELP-NEXT: -ffast-math             Allow aggressive, lossy floating-point optimizations
-! HELP-NEXT: -ffixed-form            Process source files in fixed form
-! HELP-NEXT: -ffixed-line-length=<value>
-! HELP-NEXT:                         Use <value> as character line width in fixed mode
-! HELP-NEXT: -ffp-contract=<value>   Form fused FP ops (e.g. FMAs)
-! HELP-NEXT: -ffree-form             Process source files in free form
-! HELP-NEXT: -fhonor-infinities      Specify that floating-point optimizations are not allowed that assume arguments and results are not +-inf.
-! HELP-NEXT: -fhonor-nans            Specify that floating-point optimizations are not allowed that assume arguments and results are not NANs.
-! HELP-NEXT: -fimplicit-none         No implicit typing allowed unless overridden by IMPLICIT statements
-! HELP-NEXT: -finput-charset=<value> Specify the default character set for source files
-! HELP-NEXT: -fintegrated-as         Enable the integrated assembler
-! HELP-NEXT: -fintrinsic-modules-path <dir>
-! HELP-NEXT:                         Specify where to find the compiled intrinsic modules
-! HELP-NEXT: -flarge-sizes           Use INTEGER(KIND=8) for the result type in size-related intrinsics
-! HELP-NEXT: -flogical-abbreviations Enable logical abbreviations
-! HELP-NEXT: -flto=auto              Enable LTO in 'full' mode
-! HELP-NEXT: -flto=jobserver         Enable LTO in 'full' mode
-! HELP-NEXT: -flto=<value>           Set LTO mode
-! HELP-NEXT: -flto                   Enable LTO in 'full' mode
-! HELP-NEXT: -fms-runtime-lib=<value>
-! HELP-NEXT:                         Select Windows run-time library
-! HELP-NEXT: -fno-automatic          Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE
-! HELP-NEXT: -fno-color-diagnostics  Disable colors in diagnostics
-! HELP-NEXT: -fno-fortran-main       Do not include Fortran_main.a (provided by Flang) when linking
-! HELP-NEXT: -fno-integrated-as      Disable the integrated assembler
-! HELP-NEXT: -fno-lto                Disable LTO mode (default)
-! HELP-NEXT: -fno-ppc-native-vector-element-order
-! HELP-NEXT:                         Specifies PowerPC non-native vector element order
-! HELP-NEXT:  -fno-rtlib-add-rpath Do not add -rpath with architecture-specific resource directory to the linker flags. When --hip-link is specified, do not add -rpath with HIP runtime library directory to the linker flags
-! HELP-NEXT: -fno-signed-zeros       Allow optimizations that ignore the sign of floating point zeros
-! HELP-NEXT: -fno-stack-arrays       Allocate array temporaries on the heap (default)
-! HELP-NEXT: -fno-version-loops-for-stride
-! HELP-NEXT:                         Do not create unit-strided loops (default)
-! HELP-NEXT: -fomit-frame-pointer    Omit the frame pointer from functions that don't need it. Some stack unwinding cases, such as profilers and sanitizers, may prefer specifying -fno-omit-frame-pointer. On many targets, -O1 and higher omit the frame pointer by default. -m[no-]omit-leaf-frame-pointer takes precedence for leaf functions
-! HELP-NEXT: -fopenacc               Enable OpenACC
-! HELP-NEXT: -fopenmp-target-debug   Enable debugging in the OpenMP offloading device RTL
-! HELP-NEXT: -fopenmp-targets=<value>
-! HELP-NEXT:                         Specify comma-separated list of triples OpenMP offloading targets to be supported
-! HELP-NEXT: -fopenmp-version=<value>
-! HELP-NEXT:                         Set OpenMP version (e.g. 45 for OpenMP 4.5, 51 for OpenMP 5.1). Default value is 11 for Flang
-! HELP-NEXT: -fopenmp                Parse OpenMP pragmas and generate parallel code.
-! HELP-NEXT: -foptimization-record-file=<file>
-! HELP-NEXT:                         Specify the output name of the file containing the optimization remarks. Implies -fsave-optimization-record. On Darwin platforms, this cannot be used with multiple -arch <arch> options.
-! HELP-NEXT: -foptimization-record-passes=<regex>
-! HELP-NEXT:                         Only include passes which match a specified regular expression in the generated optimization record (by default, include all passes)
-! HELP-NEXT: -fpass-plugin=<dsopath> Load pass plugin from a dynamic shared object file (only with new pass manager).
-! HELP-NEXT: -fppc-native-vector-element-order
-! HELP-NEXT:                         Specifies PowerPC native vector element order (default)
-! HELP-NEXT: -freciprocal-math       Allow division operations to be reassociated
-! HELP-NEXT: -fropi                  Generate read-only position independent code (ARM only)
-! HELP-NEXT: -frtlib-add-rpath Add -rpath with architecture-specific resource directory to the linker flags. When --hip-link is specified, also add -rpath with HIP runtime library directory to the linker flags
-! HELP-NEXT: -frwpi                  Generate read-write position independent code (ARM only)
-! HELP-NEXT: -fsave-optimization-record=<format>
-! HELP-NEXT:                         Generate an optimization record file in a specific format
-! HELP-NEXT: -fsave-optimization-record
-! HELP-NEXT:                         Generate a YAML optimization record file
-! HELP-NEXT: -fstack-arrays          Attempt to allocate array temporaries on the stack, no matter their size
-! HELP-NEXT: -fsyntax-only           Run the preprocessor, parser and semantic analysis stages
-! HELP-NEXT: -funderscoring          Appends one trailing underscore to external names
-! HELP-NEXT: -fveclib=<value>        Use the given vector functions library
-! HELP-NEXT: -fversion-loops-for-stride
-! HELP-NEXT:                         Create unit-strided versions of loops
-! HELP-NEXT: -fxor-operator          Enable .XOR. as a synonym of .NEQV.
-! HELP-NEXT: --gcc-install-dir=<value>
-! HELP-NEXT:                         Use GCC installation in the specified directory. The directory ends with path components like 'lib{,32,64}/gcc{,-cross}/$triple/$version'. Note: executables (e.g. ld) used by the compiler are not overridden by the selected GCC installation
-! HELP-NEXT: --gcc-toolchain=<value> Specify a directory where Flang can find 'lib{,32,64}/gcc{,-cross}/$triple/$version'. Flang will use the GCC installation with the largest version
-! HELP-NEXT: -gline-directives-only  Emit debug line info directives only
-! HELP-NEXT: -gline-tables-only      Emit debug line number tables only
-! HELP-NEXT: -gpulibc                Link the LLVM C Library for GPUs
-! HELP-NEXT: -g                      Generate source-level debug information
-! HELP-NEXT: --help-hidden           Display help for hidden options
-! HELP-NEXT: -help                   Display available options
-! HELP-NEXT: -isysroot <dir>         Set the system root directory (usually /)
-! HELP-NEXT: -I <dir>                Add directory to the end of the list of include search paths
-! HELP-NEXT: -L <dir>                Add directory to library search path
-! HELP-NEXT: -march=<value>          For a list of available architectures for the target use '-mcpu=help'
-! HELP-NEXT: -mcode-object-version=<value>
-! HELP-NEXT:                         Specify code object ABI version. Defaults to 5. (AMDGPU only)
-! HELP-NEXT: -mcpu=<value>           For a list of available CPUs for the target use '-mcpu=help'
-! HELP-NEXT: -mllvm=<arg>            Alias for -mllvm
-! HELP-NEXT: -mllvm <value>          Additional arguments to forward to LLVM's option processing
-! HELP-NEXT: -mmlir <value>          Additional arguments to forward to MLIR's option processing
-! HELP-NEXT: -mno-outline-atomics    Don't generate local calls to out-of-line atomic operations
-! HELP-NEXT: -module-dir <dir>       Put MODULE files in <dir>
-! HELP-NEXT: -moutline-atomics       Generate local calls to out-of-line atomic operations
-! HELP-NEXT: -mrvv-vector-bits=<value>
-! HELP-NEXT:                         Specify the size in bits of an RVV vector register
-! HELP-NEXT: -msve-vector-bits=<value>
-! HELP-NEXT:                          Specify the size in bits of an SVE vector register. Defaults to the vector length agnostic value of "scalable". (AArch64 only)
-! HELP-NEXT: --no-offload-arch=<value>
-! HELP-NEXT:                         Remove CUDA/HIP offloading device architecture (e.g. sm_35, gfx906) from the list of devices to compile for. 'all' resets the list to its default value.
-! HELP-NEXT: -nocpp                  Disable predefined and command line preprocessor macros
-! HELP-NEXT: -nogpulib               Do not link device library for CUDA/HIP device compilation
-! HELP-NEXT: --offload-arch=<value>  Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g. sm_35). If 'native' is used the compiler will detect locally installed architectures. For HIP offloading, the device architecture can be followed by target ID features delimited by a colon (e.g. gfx908:xnack+:sramecc-). May be specified more than once.
-! HELP-NEXT: --offload-device-only   Only compile for the offloading device.
-! HELP-NEXT: --offload-host-device   Compile for both the offloading host and device (default).
-! HELP-NEXT: --offload-host-only     Only compile for the offloading host.
-! HELP-NEXT: -o <file>               Write output to <file>
-! HELP-NEXT: -pedantic               Warn on language extensions
-! HELP-NEXT: -print-effective-triple Print the effective target triple
-! HELP-NEXT: -print-target-triple    Print the normalized target triple
-! HELP-NEXT: -pthread                Support POSIX threads in generated code
-! HELP-NEXT: -P                      Disable linemarker output in -E mode
-! HELP-NEXT:  --rocm-path=<value> ROCm installation path, used for finding and automatically linking required bitcode libraries.
-! HELP-NEXT: -Rpass-analysis=<value> Report transformation analysis from optimization passes whose name matches the given POSIX regular expression
-! HELP-NEXT: -Rpass-missed=<value>   Report missed transformations by optimization passes whose name matches the given POSIX regular expression
-! HELP-NEXT: -Rpass=<value>          Report transformations performed by optimization passes whose name matches the given POSIX regular expression
-! HELP-NEXT: -R<remark>              Enable the specified remark
-! HELP-NEXT: -save-temps=<value>     Save intermediate compilation results.
-! HELP-NEXT: -save-temps             Alias for --save-temps=cwd
-! HELP-NEXT: -std=<value>            Language standard to compile for
-! HELP-NEXT: -S                      Only run preprocess and compilation steps
-! HELP-NEXT: --target=<value>        Generate code for the given target
-! HELP-NEXT: -U <macro>              Undefine macro <macro>
-! HELP-NEXT: --version               Print version information
-! HELP-NEXT: -v                      Show commands to run and use verbose output
-! HELP-NEXT: -Wl,<arg>               Pass the comma separated arguments in <arg> to the linker
-! HELP-NEXT: -W<warning>             Enable the specified warning
-! HELP-NEXT: -Xflang <arg>           Pass <arg> to the flang compiler
-! HELP-NEXT: -x <language>           Treat subsequent input files as having type <language>
-
 
 ! HELP-FC1:USAGE: flang
 ! HELP-FC1-EMPTY:
 ! HELP-FC1-NEXT:OPTIONS:
-! HELP-FC1-NEXT: -cpp                    Enable predefined and command line preprocessor macros
-! HELP-FC1-NEXT: --dependent-lib=<value> Add dependent library
-! HELP-FC1-NEXT: -dM                     Print macro definitions in -E mode instead of normal output
-! HELP-FC1-NEXT: -D <macro>=<value>      Define <macro> to <value> (or 1 if <value> omitted)
-! HELP-FC1-NEXT: -emit-fir               Build the parse tree, then lower it to FIR
-! HELP-FC1-NEXT: -emit-hlfir             Build the parse tree, then lower it to HLFIR
-! HELP-FC1-NEXT: -emit-llvm-bc           Build ASTs then convert to LLVM, emit .bc file
-! HELP-FC1-NEXT: -emit-llvm              Use the LLVM representation for assembler and object files
-! HELP-FC1-NEXT: -emit-obj               Emit native object files
-! HELP-FC1-NEXT: -E                      Only run the preprocessor
-! HELP-FC1-NEXT: -falternative-parameter-statement
-! HELP-FC1-NEXT:                         Enable the old style PARAMETER statement
-! HELP-FC1-NEXT: -fapprox-func           Allow certain math function calls to be replaced with an approximately equivalent calculation
-! HELP-FC1-NEXT: -fbackslash             Specify that backslash in string introduces an escape character
-! HELP-FC1-NEXT: -fcolor-diagnostics     Enable colors in diagnostics
-! HELP-FC1-NEXT: -fconvert=<value>       Set endian conversion of data for unformatted files
-! HELP-FC1-NEXT: -fdebug-dump-all        Dump symbols and the parse tree after the semantic checks
-! HELP-FC1-NEXT: -fdebug-dump-parse-tree-no-sema
-! HELP-FC1-NEXT:                         Dump the parse tree (skips the semantic checks)
-! HELP-FC1-NEXT: -fdebug-dump-parse-tree Dump the parse tree
-! HELP-FC1-NEXT: -fdebug-dump-parsing-log
-! HELP-FC1-NEXT:                         Run instrumented parse and dump the parsing log
-! HELP-FC1-NEXT: -fdebug-dump-pft        Dump the pre-fir parse tree
-! HELP-FC1-NEXT: -fdebug-dump-provenance Dump provenance
-! HELP-FC1-NEXT: -fdebug-dump-symbols    Dump symbols after the semantic analysis
-! HELP-FC1-NEXT: -fdebug-measure-parse-tree
-! HELP-FC1-NEXT:                         Measure the parse tree
-! HELP-FC1-NEXT: -fdebug-module-writer   Enable debug messages while writing module files
-! HELP-FC1-NEXT: -fdebug-pass-manager    Prints debug information for the new pass manager
-! HELP-FC1-NEXT: -fdebug-pre-fir-tree    Dump the pre-FIR tree
-! HELP-FC1-NEXT: -fdebug-unparse-no-sema Unparse and stop (skips the semantic checks)
-! HELP-FC1-NEXT: -fdebug-unparse-with-symbols
-! HELP-FC1-NEXT:                         Unparse and stop.
-! HELP-FC1-NEXT: -fdebug-unparse         Unparse and stop.
-! HELP-FC1-NEXT: -fdefault-double-8      Set the default double precision kind to an 8 byte wide type
-! HELP-FC1-NEXT: -fdefault-integer-8     Set the default integer and logical kind to an 8 byte wide type
-! HELP-FC1-NEXT: -fdefault-real-8        Set the default real kind to an 8 byte wide type
-! HELP-FC1-NEXT: -fembed-offload-object=<value>
-! HELP-FC1-NEXT:                         Embed Offloading device-side binary into host object file as a section.
-! HELP-FC1-NEXT: -ffast-math             Allow aggressive, lossy floating-point optimizations
-! HELP-FC1-NEXT: -ffixed-form            Process source files in fixed form
-! HELP-FC1-NEXT: -ffixed-line-length=<value>
-! HELP-FC1-NEXT:                         Use <value> as character line width in fixed mode
-! HELP-FC1-NEXT: -ffp-contract=<value>   Form fused FP ops (e.g. FMAs)
-! HELP-FC1-NEXT: -ffree-form             Process source files in free form
-! HELP-FC1-NEXT: -fget-definition <value> <value> <value>
-! HELP-FC1-NEXT:                         Get the symbol definition from <line> <start-column> <end-column>
-! HELP-FC1-NEXT: -fget-symbols-sources   Dump symbols and their source code locations
-! HELP-FC1-NEXT: -fimplicit-none         No implicit typing allowed unless overridden by IMPLICIT statements
-! HELP-FC1-NEXT: -finput-charset=<value> Specify the default character set for source files
-! HELP-FC1-NEXT: -fintrinsic-modules-path <dir>
-! HELP-FC1-NEXT:                         Specify where to find the compiled intrinsic modules
-! HELP-FC1-NEXT: -flarge-sizes           Use INTEGER(KIND=8) for the result type in size-related intrinsics
-! HELP-FC1-NEXT: -flogical-abbreviations Enable logical abbreviations
-! HELP-FC1-NEXT: -flto=<value>           Set LTO mode
-! HELP-FC1-NEXT: -flto                   Enable LTO in 'full' mode
-! HELP-FC1-NEXT: -fno-analyzed-objects-for-unparse
-! HELP-FC1-NEXT:                         Do not use the analyzed objects when unparsing
-! HELP-FC1-NEXT: -fno-automatic          Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE
-! HELP-FC1-NEXT: -fno-debug-pass-manager Disables debug printing for the new pass manager
-! HELP-FC1-NEXT: -fno-ppc-native-vector-element-order
-! HELP-FC1-NEXT:                         Specifies PowerPC non-native vector element order
-! HELP-FC1-NEXT: -fno-reformat           Dump the cooked character stream in -E mode
-! HELP-FC1-NEXT: -fno-signed-zeros       Allow optimizations that ignore the sign of floating point zeros
-! HELP-FC1-NEXT: -fno-stack-arrays       Allocate array temporaries on the heap (default)
-! HELP-FC1-NEXT: -fno-version-loops-for-stride
-! HELP-FC1-NEXT:                         Do not create unit-strided loops (default)
-! HELP-FC1-NEXT: -fopenacc               Enable OpenACC
-! HELP-FC1-NEXT: -fopenmp-host-ir-file-path <value>
-! HELP-FC1-NEXT:                         Path to the IR file produced by the frontend for the host.
-! HELP-FC1-NEXT: -fopenmp-is-target-device
-! HELP-FC1-NEXT:                         Generate code only for an OpenMP target device.
-! HELP-FC1-NEXT: -fopenmp-target-debug   Enable debugging in the OpenMP offloading device RTL
-! HELP-FC1-NEXT: -fopenmp-version=<value>
-! HELP-FC1-NEXT:                         Set OpenMP version (e.g. 45 for OpenMP 4.5, 51 for OpenMP 5.1). Default value is 11 for Flang
-! HELP-FC1-NEXT: -fopenmp                Parse OpenMP pragmas and generate parallel code.
-! HELP-FC1-NEXT: -fpass-plugin=<dsopath> Load pass plugin from a dynamic shared object file (only with new pass manager).
-! HELP-FC1-NEXT: -fppc-native-vector-element-order
-! HELP-FC1-NEXT:                         Specifies PowerPC native vector element order (default)
-! HELP-FC1-NEXT: -freciprocal-math       Allow division operations to be reassociated
-! HELP-FC1-NEXT: -fstack-arrays          Attempt to allocate array temporaries on the stack, no matter their size
-! HELP-FC1-NEXT: -fsyntax-only           Run the preprocessor, parser and semantic analysis stages
-! HELP-FC1-NEXT: -funderscoring          Appends one trailing underscore to external names
-! HELP-FC1-NEXT: -fveclib=<value>        Use the given vector functions library
-! HELP-FC1-NEXT: -fversion-loops-for-stride
-! HELP-FC1-NEXT:                         Create unit-strided versions of loops
-! HELP-FC1-NEXT: -fxor-operator          Enable .XOR. as a synonym of .NEQV.
-! HELP-FC1-NEXT: -gpulibc                Link the LLVM C Library for GPUs
-! HELP-FC1-NEXT: -help                   Display available options
-! HELP-FC1-NEXT: -init-only              Only execute frontend initialization
-! HELP-FC1-NEXT: -I <dir>                Add directory to the end of the list of include search paths
-! HELP-FC1-NEXT: -load <dsopath>         Load the named plugin (dynamic shared object)
-! HELP-FC1-NEXT: -mcode-object-version=<value>
-! HELP-FC1-NEXT:                         Specify code object ABI version. Defaults to 5. (AMDGPU only)
-! HELP-FC1-NEXT: -menable-no-infs        Allow optimization to assume there are no infinities.
-! HELP-FC1-NEXT: -menable-no-nans        Allow optimization to assume there are no NaNs.
-! HELP-FC1-NEXT: -mframe-pointer=<value> Specify which frame pointers to retain.
-! HELP-FC1-NEXT: -mllvm <value>          Additional arguments to forward to LLVM's option processing
-! HELP-FC1-NEXT: -mmlir <value>          Additional arguments to forward to MLIR's option processing
-! HELP-FC1-NEXT: -module-dir <dir>       Put MODULE files in <dir>
-! HELP-FC1-NEXT: -module-suffix <suffix> Use <suffix> as the suffix for module files (the default value is `.mod`)
-! HELP-FC1-NEXT: -mreassociate           Allow reassociation transformations for floating-point instructions
-! HELP-FC1-NEXT: -mrelocation-model <value>
-! HELP-FC1-NEXT:                         The relocation model to use
-! HELP-FC1-NEXT: -mvscale-max=<value>    Specify the vscale maximum. Defaults to the vector length agnostic value of "0". (AArch64/RISC-V only)
-! HELP-FC1-NEXT: -mvscale-min=<value>    Specify the vscale minimum. Defaults to "1". (AArch64/RISC-V only)
-! HELP-FC1-NEXT: -nocpp                  Disable predefined and command line preprocessor macros
-! HELP-FC1-NEXT: -nogpulib               Do not link device library for CUDA/HIP device compilation
-! HELP-FC1-NEXT: -opt-record-file <value>
-! HELP-FC1-NEXT:                         File name to use for YAML optimization record output
-! HELP-FC1-NEXT: -opt-record-format <value>
-! HELP-FC1-NEXT:                         The format used for serializing remarks (default: YAML)
-! HELP-FC1-NEXT: -opt-record-passes <value>
-! HELP-FC1-NEXT:                         Only record remark information for passes whose names match the given regular expression
-! HELP-FC1-NEXT: -o <file>               Write output to <file>
-! HELP-FC1-NEXT: -pedantic               Warn on language extensions
-! HELP-FC1-NEXT: -pic-is-pie             File is for a position independent executable
-! HELP-FC1-NEXT: -pic-level <value>      Value for __PIC__
-! HELP-FC1-NEXT: -plugin <name>          Use the named plugin action instead of the default action (use "help" to list available options)
-! HELP-FC1-NEXT: -pthread                Support POSIX threads in generated code
-! HELP-FC1-NEXT: -P                      Disable linemarker output in -E mode
-! HELP-FC1-NEXT: -Rpass-analysis=<value> Report transformation analysis from optimization passes whose name matches the given POSIX regular expression
-! HELP-FC1-NEXT: -Rpass-missed=<value>   Report missed transformations by optimization passes whose name matches the given POSIX regular expression
-! HELP-FC1-NEXT: -Rpass=<value>          Report transformations performed by optimization passes whose name matches the given POSIX regular expression
-! HELP-FC1-NEXT: -R<remark>              Enable the specified remark
-! HELP-FC1-NEXT: -save-temps=<value>     Save intermediate compilation results.
-! HELP-FC1-NEXT: -save-temps             Alias for --save-temps=cwd
-! HELP-FC1-NEXT: -std=<value>            Language standard to compile for
-! HELP-FC1-NEXT: -S                      Only run preprocess and compilation steps
-! HELP-FC1-NEXT: -target-cpu <value>     Target a specific cpu type
-! HELP-FC1-NEXT: -target-feature <value> Target specific attributes
-! HELP-FC1-NEXT: -test-io                Run the InputOuputTest action. Use for development and testing only.
-! HELP-FC1-NEXT: -triple <value>         Specify target triple (e.g. i686-apple-darwin9)
-! HELP-FC1-NEXT: -U <macro>              Undefine macro <macro>
-! HELP-FC1-NEXT: -version                Print the compiler version
-! HELP-FC1-NEXT: -W<warning>             Enable the specified warning
-! HELP-FC1-NEXT: -x <language>           Treat subsequent input files as having type <language>
 
 ! ERROR: error: unknown argument '-helps'; did you mean '-help'
diff --git a/flang/test/Driver/dynamic-linker.f90 b/flang/test/Driver/dynamic-linker.f90
index 7c3f1b5a53fe..6d5c443ab75c 100644
--- a/flang/test/Driver/dynamic-linker.f90
+++ b/flang/test/Driver/dynamic-linker.f90
@@ -16,7 +16,6 @@
 ! GNU-LINKER-OPTIONS-SAME: "-shared"
 ! GNU-LINKER-OPTIONS-SAME: "-static"
 ! GNU-LINKER-OPTIONS-SAME: "-rpath" "/path/to/dir"
-! GNU-LINKER-OPTIONS-NOT: "-lFortran_main.a"
 
 ! RDYNAMIC-LINKER-OPTION: "{{.*}}ld"
 ! RDYNAMIC-LINKER-OPTION-SAME: "-export-dynamic"
@@ -25,4 +24,3 @@
 ! MSVC-LINKER-OPTIONS: "{{.*}}link{{(.exe)?}}"
 ! MSVC-LINKER-OPTIONS-SAME: "-dll"
 ! MSVC-LINKER-OPTIONS-SAME: "-rpath" "/path/to/dir"
-! MSVC-LINKER-OPTIONS-NOT: "/WHOLEARCHIVE:Fortran_main"
diff --git a/flang/test/Driver/emit-mlir.f90 b/flang/test/Driver/emit-mlir.f90
index 191ee13396ef..2345a7cf53e4 100644
--- a/flang/test/Driver/emit-mlir.f90
+++ b/flang/test/Driver/emit-mlir.f90
@@ -15,9 +15,15 @@
 ! CHECK-LABEL: func @_QQmain() {
 ! CHECK-NEXT:  return
 ! CHECK-NEXT: }
-! CHECK-NEXT: fir.global @_QQEnvironmentDefaults constant : !fir.ref<tuple<i[[int_size:.*]], !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>> {
-! CHECK-NEXT:  %[[VAL_0:.*]] = fir.zero_bits !fir.ref<tuple<i[[int_size]], !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
-! CHECK-NEXT: fir.has_value  %[[VAL_0]] : !fir.ref<tuple<i[[int_size]], !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
+! CHECK-NEXT: func.func private @_FortranAProgramStart(i32, !llvm.ptr, !llvm.ptr, !llvm.ptr)
+! CHECK-NEXT: func.func private @_FortranAProgramEndStatement()
+! CHECK-NEXT: func.func @main(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr) -> i32 {
+! CHECK-NEXT: %c0_i32 = arith.constant 0 : i32
+! CHECK-NEXT: %0 = fir.zero_bits !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
+! CHECK-NEXT: fir.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %0) {{.*}} : (i32, !llvm.ptr, !llvm.ptr, !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>) 
+! CHECK-NEXT: fir.call @_QQmain() fastmath<contract> : () -> ()
+! CHECK-NEXT: fir.call @_FortranAProgramEndStatement() {{.*}} : () -> ()
+! CHECK-NEXT: return %c0_i32 : i32
 ! CHECK-NEXT: }
 ! CHECK-NEXT: }
 
diff --git a/flang/test/Driver/linker-flags.f90 b/flang/test/Driver/linker-flags.f90
index 4d3d528b5e99..02e217494f81 100644
--- a/flang/test/Driver/linker-flags.f90
+++ b/flang/test/Driver/linker-flags.f90
@@ -11,7 +11,6 @@
 ! RUN: %flang -### --target=x86_64-unknown-dragonfly %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
 ! RUN: %flang -### --target=x86_64-unknown-haiku %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,HAIKU,HAIKU-F128%f128-lib
 ! RUN: %flang -### --target=x86_64-windows-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MINGW,MINGW-F128%f128-lib
-! RUN: %flang -### --target=aarch64-unknown-linux-gnu %S/Inputs/hello.f90 -lFortran_main 2>&1 | FileCheck %s --check-prefixes=DEPRECATED
 
 ! NOTE: Clang's driver library, clangDriver, usually adds 'oldnames' on Windows,
 !       but it is not needed when compiling Fortran code and they might bring in
@@ -29,7 +28,6 @@
 !       executable and may find the GNU linker from MinGW or Cygwin.
 ! UNIX-LABEL:  "{{.*}}ld{{(\.exe)?}}"
 ! UNIX-SAME: "[[object_file]]"
-! UNIX-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive"
 ! UNIX-F128NONE-NOT: FortranFloat128Math
 ! SOLARIS-F128NONE-NOT: FortranFloat128Math
 ! UNIX-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
@@ -38,7 +36,6 @@
 
 ! DARWIN-LABEL:  "{{.*}}ld{{(\.exe)?}}"
 ! DARWIN-SAME: "[[object_file]]"
-! DARWIN-SAME: -lFortran_main
 ! DARWIN-F128NONE-NOT: FortranFloat128Math
 ! DARWIN-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
 ! DARWIN-SAME: -lFortranRuntime
@@ -46,14 +43,12 @@
 
 ! HAIKU-LABEL:  "{{.*}}ld{{(\.exe)?}}"
 ! HAIKU-SAME: "[[object_file]]"
-! HAIKU-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive"
 ! HAIKU-F128NONE-NOT: FortranFloat128Math
 ! HAIKU-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
 ! HAIKU-SAME: "-lFortranRuntime" "-lFortranDecimal"
 
 ! MINGW-LABEL:  "{{.*}}ld{{(\.exe)?}}"
 ! MINGW-SAME: "[[object_file]]"
-! MINGW-SAME: -lFortran_main
 ! MINGW-F128NONE-NOT: FortranFloat128Math
 ! MINGW-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
 ! MINGW-SAME: -lFortranRuntime
@@ -66,6 +61,3 @@
 ! MSVC-LABEL: link
 ! MSVC-SAME: /subsystem:console
 ! MSVC-SAME: "[[object_file]]"
-
-! Check that we warn when using -lFortran_main
-! DEPRECATED: warning: argument '-lFortran_main' is deprecated, see the Flang driver documentation for correct usage [-Wdeprecated]
diff --git a/flang/test/Driver/message-merging.f90 b/flang/test/Driver/message-merging.f90
new file mode 100644
index 000000000000..85fed4669eb5
--- /dev/null
+++ b/flang/test/Driver/message-merging.f90
@@ -0,0 +1,7 @@
+!RUN: %flang -fsyntax-only -pedantic -I %S/Inputs/ %s 2>&1 | FileCheck %s
+!CHECK: warning: SAVE attribute was already specified on 'x'
+!CHECK: portability: #include: extra stuff ignored after file name
+save x
+save x
+#include <empty.h>    crud after header name
+end
diff --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90
index 06957604d7aa..a9980e3c932c 100644
--- a/flang/test/Driver/mlir-debug-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90
@@ -39,9 +39,16 @@ end program
 ! ALL-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 ! ALL-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+! ALL-NEXT: 'fir.global' Pipeline
+! ALL-NEXT:   CharacterConversion
 ! ALL-NEXT: 'func.func' Pipeline
 ! ALL-NEXT:   ArrayValueCopy
 ! ALL-NEXT:   CharacterConversion
+! ALL-NEXT: 'omp.declare_reduction' Pipeline
+! ALL-NEXT:   CharacterConversion
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT:   CharacterConversion
 
 ! ALL-NEXT: Canonicalizer
 ! ALL-NEXT: SimplifyRegionLite
@@ -58,14 +65,17 @@ end program
 ! ALL-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 ! ALL-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: PolymorphicOpConversion
+
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
 ! ALL-NEXT:   'fir.global' Pipeline
 ! ALL-NEXT:     CFGConversion
 ! ALL-NEXT:   'func.func' Pipeline
-! ALL-NEXT:     PolymorphicOpConversion
 ! ALL-NEXT:     CFGConversion
 ! ALL-NEXT:   'omp.declare_reduction' Pipeline
 ! ALL-NEXT:     CFGConversion
+! ALL-NEXT:   'omp.private' Pipeline
+! ALL-NEXT:     CFGConversion
 ! ALL-NEXT: SCFToControlFlow
 ! ALL-NEXT: Canonicalizer
 ! ALL-NEXT: SimplifyRegionLite
@@ -74,13 +84,15 @@ end program
 ! ALL-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 ! ALL-NEXT: BoxedProcedurePass
 
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
 ! ALL-NEXT:   'fir.global' Pipeline
 ! ALL-NEXT:     AbstractResultOpt
 ! ALL-NEXT:   'func.func' Pipeline
 ! ALL-NEXT:     AbstractResultOpt
 ! ALL-NEXT:   'omp.declare_reduction' Pipeline
 ! ALL-NEXT:     AbstractResultOpt
+! ALL-NEXT:   'omp.private' Pipeline
+! ALL-NEXT:     AbstractResultOpt
 
 ! ALL-NEXT: CodeGenRewrite
 ! ALL-NEXT:   (S) 0 num-dce'd - Number of operations eliminated
diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90
index 0272739aba4d..4ebac7c3fb65 100644
--- a/flang/test/Driver/mlir-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-pass-pipeline.f90
@@ -1,8 +1,8 @@
 ! Test the MLIR pass pipeline
 
-! RUN: %flang_fc1 -S -mmlir --mlir-pass-statistics -mmlir --mlir-pass-statistics-display=pipeline -o /dev/null %s 2>&1 | FileCheck --check-prefixes=ALL,NOTO2 %s
+! RUN: %flang_fc1 -S -mmlir --mlir-pass-statistics -mmlir --mlir-pass-statistics-display=pipeline -o /dev/null %s 2>&1 | FileCheck --check-prefixes=ALL %s
 ! -O0 is the default:
-! RUN: %flang_fc1 -S -mmlir --mlir-pass-statistics -mmlir --mlir-pass-statistics-display=pipeline %s -O0 -o /dev/null 2>&1 | FileCheck --check-prefixes=ALL,NOTO2 %s
+! RUN: %flang_fc1 -S -mmlir --mlir-pass-statistics -mmlir --mlir-pass-statistics-display=pipeline %s -O0 -o /dev/null 2>&1 | FileCheck --check-prefixes=ALL %s
 ! RUN: %flang_fc1 -S -mmlir --mlir-pass-statistics -mmlir --mlir-pass-statistics-display=pipeline %s -O2 -o /dev/null 2>&1 | FileCheck --check-prefixes=ALL,O2 %s
 
 ! REQUIRES: asserts
@@ -28,9 +28,16 @@ end program
 ! ALL-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 ! ALL-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+! ALL-NEXT: 'fir.global' Pipeline
+! ALL-NEXT:   CharacterConversion
 ! ALL-NEXT: 'func.func' Pipeline
 ! ALL-NEXT:   ArrayValueCopy
 ! ALL-NEXT:   CharacterConversion
+! ALL-NEXT: 'omp.declare_reduction' Pipeline
+! ALL-NEXT:   CharacterConversion
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT:   CharacterConversion
 
 ! ALL-NEXT: Canonicalizer
 ! ALL-NEXT: SimplifyRegionLite
@@ -49,17 +56,18 @@ end program
 ! ALL-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 ! ALL-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 
-! O2-NEXT:    'func.func' Pipeline
-! O2-NEXT:      PolymorphicOpConversion
+! ALL-NEXT: PolymorphicOpConversion
 ! O2-NEXT:  AddAliasTags
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
 ! ALL-NEXT:    'fir.global' Pipeline
 ! ALL-NEXT:      CFGConversion
 ! ALL-NEXT:    'func.func' Pipeline
-! NOTO2-NEXT:      PolymorphicOpConversion
 ! ALL-NEXT:      CFGConversion
 ! ALL-NEXT:   'omp.declare_reduction' Pipeline
 ! ALL-NEXT:      CFGConversion
+! ALL-NEXT:   'omp.private' Pipeline
+! ALL-NEXT:      CFGConversion
 
 ! ALL-NEXT: SCFToControlFlow
 ! ALL-NEXT: Canonicalizer
@@ -69,13 +77,15 @@ end program
 ! ALL-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 ! ALL-NEXT: BoxedProcedurePass
 
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
 ! ALL-NEXT:   'fir.global' Pipeline
 ! ALL-NEXT:    AbstractResultOpt
 ! ALL-NEXT:  'func.func' Pipeline
 ! ALL-NEXT:    AbstractResultOpt
 ! ALL-NEXT:  'omp.declare_reduction' Pipeline
 ! ALL-NEXT:    AbstractResultOpt
+! ALL-NEXT:  'omp.private' Pipeline
+! ALL-NEXT:    AbstractResultOpt
 
 ! ALL-NEXT: CodeGenRewrite
 ! ALL-NEXT:   (S) 0 num-dce'd - Number of operations eliminated
diff --git a/flang/test/Driver/msvc-dependent-lib-flags.f90 b/flang/test/Driver/msvc-dependent-lib-flags.f90
index 6cfc969e92b2..765917f07d8e 100644
--- a/flang/test/Driver/msvc-dependent-lib-flags.f90
+++ b/flang/test/Driver/msvc-dependent-lib-flags.f90
@@ -7,7 +7,6 @@
 ! MSVC-SAME: --dependent-lib=clang_rt.builtins.lib
 ! MSVC-SAME: -D_MT
 ! MSVC-SAME: --dependent-lib=libcmt
-! MSVC-SAME: --dependent-lib=Fortran_main.static.lib
 ! MSVC-SAME: --dependent-lib=FortranRuntime.static.lib
 ! MSVC-SAME: --dependent-lib=FortranDecimal.static.lib
 
@@ -16,7 +15,6 @@
 ! MSVC-DEBUG-SAME: -D_MT
 ! MSVC-DEBUG-SAME: -D_DEBUG
 ! MSVC-DEBUG-SAME: --dependent-lib=libcmtd
-! MSVC-DEBUG-SAME: --dependent-lib=Fortran_main.static_dbg.lib
 ! MSVC-DEBUG-SAME: --dependent-lib=FortranRuntime.static_dbg.lib
 ! MSVC-DEBUG-SAME: --dependent-lib=FortranDecimal.static_dbg.lib
 
@@ -25,7 +23,6 @@
 ! MSVC-DLL-SAME: -D_MT
 ! MSVC-DLL-SAME: -D_DLL
 ! MSVC-DLL-SAME: --dependent-lib=msvcrt
-! MSVC-DLL-SAME: --dependent-lib=Fortran_main.dynamic.lib
 ! MSVC-DLL-SAME: --dependent-lib=FortranRuntime.dynamic.lib
 ! MSVC-DLL-SAME: --dependent-lib=FortranDecimal.dynamic.lib
 
@@ -35,6 +32,5 @@
 ! MSVC-DLL-DEBUG-SAME: -D_DEBUG
 ! MSVC-DLL-DEBUG-SAME: -D_DLL
 ! MSVC-DLL-DEBUG-SAME: --dependent-lib=msvcrtd
-! MSVC-DLL-DEBUG-SAME: --dependent-lib=Fortran_main.dynamic_dbg.lib
 ! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranRuntime.dynamic_dbg.lib
 ! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranDecimal.dynamic_dbg.lib
diff --git a/flang/test/Driver/no-duplicate-main.f90 b/flang/test/Driver/no-duplicate-main.f90
index 88f4430828e0..b0bb6c2a2fef 100644
--- a/flang/test/Driver/no-duplicate-main.f90
+++ b/flang/test/Driver/no-duplicate-main.f90
@@ -4,8 +4,6 @@
 ! RUN: %flang -o %t -c %s
 ! RUN: not %flang -o %t.exe %t %t.c-object 2>&1
 
-! RUN: %flang -fno-fortran-main -o %t.exe %t %t.c-object 2>&1
-
 ! TODO: potentially add further checks to ensure that proper
 !       linker error messages are detected and checked via
 !       FileCheck.
diff --git a/flang/test/Driver/prescanner-diag.f90 b/flang/test/Driver/prescanner-diag.f90
index 7c2f8d4d7ef4..5064af13835f 100644
--- a/flang/test/Driver/prescanner-diag.f90
+++ b/flang/test/Driver/prescanner-diag.f90
@@ -5,12 +5,12 @@
 ! on some DiagnosticsEngine).
 
 ! Test with -E (i.e. PrintPreprocessedAction, stops after prescanning)
-! RUN: %flang -E -I %S/Inputs/ %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -E -I %S/Inputs/ %s 2>&1 | FileCheck %s
+! RUN: %flang -pedantic -E -I %S/Inputs/ %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -pedantic -E -I %S/Inputs/ %s 2>&1 | FileCheck %s
 
 ! Test with -fsyntax-only (i.e. ParseSyntaxOnlyAction, stops after semantic checks)
-! RUN: %flang -fsyntax-only -I %S/Inputs/ %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -fsyntax-only -I %S/Inputs/ %s 2>&1 | FileCheck %s
+! RUN: %flang -pedantic -fsyntax-only -I %S/Inputs/ %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -pedantic -fsyntax-only -I %S/Inputs/ %s 2>&1 | FileCheck %s
 
 ! CHECK: prescanner-diag.f90:[[#@LINE+3]]:10: portability: #include: extra stuff ignored after file name
 ! CHECK: prescanner-diag.f90:[[#@LINE+3]]:10: portability: #include: extra stuff ignored after file name
diff --git a/flang/test/Evaluate/fold-out_of_range.f90 b/flang/test/Evaluate/fold-out_of_range.f90
index 30665b9021a9..81551255135d 100644
--- a/flang/test/Evaluate/fold-out_of_range.f90
+++ b/flang/test/Evaluate/fold-out_of_range.f90
@@ -1,4 +1,4 @@
-! RUN: %python %S/test_folding.py %s %flang_fc1
+! RUN: %python %S/test_folding.py %s %flang_fc1 -pedantic
 ! UNSUPPORTED: target=powerpc{{.*}}, target=aarch{{.*}}, target=arm{{.*}}, system-windows, system-solaris
 ! Tests folding of OUT_OF_RANGE().
 module m
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index d4826dd2e476..02fb84ed8c87 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -34,9 +34,16 @@ func.func @_QQmain() {
 // PASSES-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 // PASSES-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+// PASSES-NEXT: 'fir.global' Pipeline
+// PASSES-NEXT:   CharacterConversion
 // PASSES-NEXT: 'func.func' Pipeline
 // PASSES-NEXT:   ArrayValueCopy
 // PASSES-NEXT:   CharacterConversion
+// PASSES-NEXT: 'omp.declare_reduction' Pipeline
+// PASSES-NEXT:   CharacterConversion
+// PASSES-NEXT: 'omp.private' Pipeline
+// PASSES-NEXT:   CharacterConversion
 
 // PASSES-NEXT: Canonicalizer
 // PASSES-NEXT: SimplifyRegionLite
@@ -55,18 +62,18 @@ func.func @_QQmain() {
 // PASSES-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 // PASSES-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 
-// PASSES-NEXT: 'func.func' Pipeline
-// PASSES-NEXT:   PolymorphicOpConversion
-
+// PASSES-NEXT: PolymorphicOpConversion
 // PASSES-NEXT: AddAliasTags
 
-// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
 // PASSES-NEXT: 'fir.global' Pipeline
 // PASSES-NEXT:   CFGConversion
 // PASSES-NEXT: 'func.func' Pipeline
 // PASSES-NEXT:   CFGConversion
 // PASSES-NEXT: 'omp.declare_reduction' Pipeline
 // PASSES-NEXT:   CFGConversion
+// PASSES-NEXT: 'omp.private' Pipeline
+// PASSES-NEXT:   CFGConversion
 
 // PASSES-NEXT: SCFToControlFlow
 // PASSES-NEXT: Canonicalizer
@@ -76,13 +83,15 @@ func.func @_QQmain() {
 // PASSES-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 // PASSES-NEXT: BoxedProcedurePass
 
-// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
-// PASSES-NEXT:   'fir.global' Pipeline
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+// PASSES-NEXT:  'fir.global' Pipeline
 // PASSES-NEXT:    AbstractResultOpt
 // PASSES-NEXT:  'func.func' Pipeline
 // PASSES-NEXT:    AbstractResultOpt
 // PASSES-NEXT:  'omp.declare_reduction' Pipeline
 // PASSES-NEXT:    AbstractResultOpt
+// PASSES-NEXT:  'omp.private' Pipeline
+// PASSES-NEXT:    AbstractResultOpt
 
 // PASSES-NEXT: CodeGenRewrite
 // PASSES-NEXT:   (S) 0 num-dce'd - Number of operations eliminated
diff --git a/flang/test/Fir/cuf.mlir b/flang/test/Fir/cuf.mlir
index 71f0652067fa..8e2346def43e 100644
--- a/flang/test/Fir/cuf.mlir
+++ b/flang/test/Fir/cuf.mlir
@@ -74,3 +74,15 @@ func.func @_QPsub1() {
 
 // CHECK: fir.cuda_allocate %{{.*}} : !fir.ref<!fir.box<none>> errmsg(%{{.*}} : !fir.box<none>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
 // CHECK: fir.cuda_deallocate %{{.*}} : !fir.ref<!fir.box<none>> errmsg(%{{.*}} : !fir.box<none>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
+
+// -----
+
+func.func @_QPsub1() {
+  %0 = fir.cuda_alloc f32 {bindc_name = "r", cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub1Er"} -> !fir.ref<f32>
+  fir.cuda_free %0 : !fir.ref<f32> {cuda_attr = #fir.cuda<device>}
+  return
+}
+
+// CHECK: fir.cuda_alloc
+// CHECK: fir.cuda_free
+
diff --git a/flang/test/Fir/dummy-scope-codegen.fir b/flang/test/Fir/dummy-scope-codegen.fir
new file mode 100644
index 000000000000..caef3c1b2578
--- /dev/null
+++ b/flang/test/Fir/dummy-scope-codegen.fir
@@ -0,0 +1,9 @@
+// RUN: fir-opt --cg-rewrite %s -o - | FileCheck %s
+
+func.func @dummy_scope(%arg0: !fir.ref<f32>) {
+  %scope = fir.dummy_scope : !fir.dscope
+  %0 = fir.declare %arg0 dummy_scope %scope {uniq_name = "x"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+  return
+}
+// CHECK-LABEL: func.func @dummy_scope(
+// CHECK-NEXT: return
diff --git a/flang/test/Fir/dummy_scope.fir b/flang/test/Fir/dummy_scope.fir
new file mode 100644
index 000000000000..58985923a8f4
--- /dev/null
+++ b/flang/test/Fir/dummy_scope.fir
@@ -0,0 +1,34 @@
+// RUN: fir-opt %s | fir-opt | FileCheck %s
+// RUN: fir-opt %s | fir-opt -cse | FileCheck %s
+
+// CHECK-LABEL:   func.func @dummy_scope(
+// CHECK-SAME:                           %[[VAL_0:.*]]: !fir.ref<f32>) {
+// CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_2:.*]] = fir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "x"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+// CHECK:           return
+// CHECK:         }
+func.func @dummy_scope(%arg0: !fir.ref<f32>) {
+  %scope = fir.dummy_scope : !fir.dscope
+  %0 = fir.declare %arg0 dummy_scope %scope {uniq_name = "x"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+  return
+}
+
+// CHECK-LABEL:   func.func @dummy_scopes(
+// CHECK-SAME:                            %[[VAL_0:.*]]: !fir.ref<f32>) {
+// CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_2:.*]] = fir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "x"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+// CHECK:           %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_4:.*]] = fir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {uniq_name = "innerEx"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+// CHECK:           %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_6:.*]] = fir.declare %[[VAL_0]] dummy_scope %[[VAL_5]] {uniq_name = "innerEx"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+// CHECK:           return
+// CHECK:         }
+func.func @dummy_scopes(%arg0: !fir.ref<f32>) {
+  %scope_out = fir.dummy_scope : !fir.dscope
+  %0 = fir.declare %arg0 dummy_scope %scope_out {uniq_name = "x"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+  %scope_in1 = fir.dummy_scope : !fir.dscope
+  %1 = fir.declare %arg0 dummy_scope %scope_in1 {uniq_name = "innerEx"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+  %scope_in2 = fir.dummy_scope : !fir.dscope
+  %2 = fir.declare %arg0 dummy_scope %scope_in2 {uniq_name = "innerEx"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+  return
+}
diff --git a/flang/test/HLFIR/declare-codegen.fir b/flang/test/HLFIR/declare-codegen.fir
index 3e80a52be452..9f51d0fbc7af 100644
--- a/flang/test/HLFIR/declare-codegen.fir
+++ b/flang/test/HLFIR/declare-codegen.fir
@@ -200,3 +200,13 @@ func.func @test_optional_declare(%arg0: !fir.box<!fir.array<?xi32>>) {
 // CHECK:    %[[VAL_7:.*]] = fir.absent !fir.box<!fir.array<?xi32>>
 // CHECK:    fir.result %[[VAL_7]] : !fir.box<!fir.array<?xi32>>
 // CHECK:  }
+
+func.func @dummy_scope(%arg0: !fir.ref<f32>) {
+  %scope = fir.dummy_scope : !fir.dscope
+  %0:2 = hlfir.declare %arg0 dummy_scope %scope {uniq_name = "x"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+  return
+}
+// CHECK-LABEL:   func.func @dummy_scope(
+// CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<f32>) {
+// CHECK:         %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:         %[[VAL_1:.*]] = fir.declare %[[VAL_0]] dummy_scope %[[SCOPE]] {uniq_name = "x"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
diff --git a/flang/test/HLFIR/dummy_scope.fir b/flang/test/HLFIR/dummy_scope.fir
new file mode 100644
index 000000000000..6b5c61e21f1d
--- /dev/null
+++ b/flang/test/HLFIR/dummy_scope.fir
@@ -0,0 +1,34 @@
+// RUN: fir-opt %s | fir-opt | FileCheck %s
+// RUN: fir-opt %s | fir-opt -cse | FileCheck %s
+
+// CHECK-LABEL:   func.func @dummy_scope(
+// CHECK-SAME:                           %[[VAL_0:.*]]: !fir.ref<f32>) {
+// CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "x"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+// CHECK:           return
+// CHECK:         }
+func.func @dummy_scope(%arg0: !fir.ref<f32>) {
+  %scope = fir.dummy_scope : !fir.dscope
+  %0:2 = hlfir.declare %arg0 dummy_scope %scope {uniq_name = "x"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+  return
+}
+
+// CHECK-LABEL:   func.func @dummy_scopes(
+// CHECK-SAME:                            %[[VAL_0:.*]]: !fir.ref<f32>) {
+// CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "x"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+// CHECK:           %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_3]] {uniq_name = "innerEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+// CHECK:           %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_5]] {uniq_name = "innerEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+// CHECK:           return
+// CHECK:         }
+func.func @dummy_scopes(%arg0: !fir.ref<f32>) {
+  %scope_out = fir.dummy_scope : !fir.dscope
+  %0:2 = hlfir.declare %arg0 dummy_scope %scope_out {uniq_name = "x"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+  %scope_in1 = fir.dummy_scope : !fir.dscope
+  %1:2 = hlfir.declare %arg0 dummy_scope %scope_in1 {uniq_name = "innerEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+  %scope_in2 = fir.dummy_scope : !fir.dscope
+  %2:2 = hlfir.declare %arg0 dummy_scope %scope_in2 {uniq_name = "innerEx"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+  return
+}
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index 4ebd736315bc..0a2608639bce 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -98,7 +98,7 @@ end
 
 ! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<10xi32> {bindc_name = ".tmp", uniq_name = ""}
 ! CHECK: %[[DECL_TEMP:.*]]:2 = hlfir.declare %[[TEMP]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>)
-! CHECK: %[[ADEV_TEMP:.*]]:2 = hlfir.declare %21#0 {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Eadev"} : (!fir.heap<!fir.array<10xi32>>) -> (!fir.heap<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>)
+! CHECK: %[[ADEV_TEMP:.*]]:2 = hlfir.declare %[[DECL_TEMP]]#1(%{{.*}}) {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Eadev"} : (!fir.heap<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>)
 ! CHECK: fir.cuda_data_transfer %[[ADEV]]#1 to %[[DECL_TEMP]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>
 ! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32>
 ! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[BHOST]]#0 : !hlfir.expr<10xi32>, !fir.ref<!fir.array<10xi32>>
@@ -119,3 +119,43 @@ end
 ! CHECK: %[[T:.*]]:2 = hlfir.declare %7 {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub3Et"} : (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>) -> (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>)
 ! CHECK: %[[TMP_DECL:.*]]:2 = hlfir.declare %0 {uniq_name = ".tmp"} : (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>) -> (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>)
 ! CHECK: fir.cuda_data_transfer %[[T]]#1 to %[[TMP_DECL]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>
+
+
+! Check that fir.cuda_data_transfer are not generated within cuf kernel
+subroutine sub4()
+  integer, parameter :: n = 10
+  real, device :: adev(n)
+  real :: ahost(n)
+  real :: b
+  integer :: i
+
+  adev = ahost
+  !$cuf kernel do <<<*,*>>>
+  do i = 1, n
+    adev(i) = adev(i) + b
+  enddo
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub4()
+! CHECK: fir.cuda_data_transfer
+! CHECK: fir.cuda_kernel<<<*, *>>>
+! CHECK-NOT: fir.cuda_data_transfer
+! CHECK: hlfir.assign
+
+attributes(global) subroutine sub5(a)
+  integer, device :: a
+  integer :: i
+  a = i
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub5
+! CHECK-NOT: fir.cuda_data_transfer
+
+attributes(host,device) subroutine sub6(a)
+  integer, device :: a
+  integer :: i
+  a = i
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub6
+! CHECK: fir.cuda_data_transfer
diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
index 9b728cd19eb5..d80542f76c92 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
@@ -24,6 +24,7 @@ subroutine sub1()
 ! CHECK-NOT: fir.do_loop
 ! CHECK: %[[ARG0_I32:.*]] = fir.convert %[[ARG0]] : (index) -> i32
 ! CHECK: fir.store %[[ARG0_I32]] to %[[IV]]#1 : !fir.ref<i32>
+! CHECK: hlfir.assign
 
 
   !$cuf kernel do <<< *, * >>>
diff --git a/flang/test/Lower/OpenMP/FIR/array-bounds.f90 b/flang/test/Lower/OpenMP/FIR/array-bounds.f90
deleted file mode 100644
index c2bb7a94712b..000000000000
--- a/flang/test/Lower/OpenMP/FIR/array-bounds.f90
+++ /dev/null
@@ -1,121 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes=HOST,ALL
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefixes=DEVICE,ALL
-
-!ALL-LABEL: func.func @_QPread_write_section(
-!ALL:  %[[ITER:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFread_write_sectionEi"}
-!ALL:  %[[READ:.*]] = fir.address_of(@_QFread_write_sectionEsp_read) : !fir.ref<!fir.array<10xi32>>
-!ALL:  %[[C10:.*]] = arith.constant 10 : index
-!ALL:  %[[WRITE:.*]] = fir.address_of(@_QFread_write_sectionEsp_write) : !fir.ref<!fir.array<10xi32>>
-!ALL:  %[[C10_0:.*]] = arith.constant 10 : index
-!ALL:  %[[C1:.*]] = arith.constant 1 : index
-!ALL:  %[[C2:.*]] = arith.constant 1 : index
-!ALL:  %[[C3:.*]] = arith.constant 4 : index
-!ALL:  %[[BOUNDS0:.*]] = omp.map.bounds   lower_bound(%[[C2]] : index) upper_bound(%[[C3]] : index) extent(%[[C10]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index)
-!ALL:  %[[MAP0:.*]] = omp.map.info var_ptr(%[[READ]] : !fir.ref<!fir.array<10xi32>>, !fir.array<10xi32>)   map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS0]]) -> !fir.ref<!fir.array<10xi32>> {name = "sp_read(2:5)"}
-!ALL:  %[[C4:.*]] = arith.constant 1 : index
-!ALL:  %[[C5:.*]] = arith.constant 1 : index
-!ALL:  %[[C6:.*]] = arith.constant 4 : index
-!ALL:  %[[BOUNDS1:.*]] = omp.map.bounds   lower_bound(%[[C5]] : index) upper_bound(%[[C6]] : index) extent(%[[C10_0]] : index) stride(%[[C4]] : index) start_idx(%[[C4]] : index)
-!ALL:  %[[MAP1:.*]] = omp.map.info var_ptr(%[[WRITE]] : !fir.ref<!fir.array<10xi32>>, !fir.array<10xi32>)   map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS1]]) -> !fir.ref<!fir.array<10xi32>> {name = "sp_write(2:5)"}
-!ALL:  %[[MAP2:.*]] = omp.map.info var_ptr(%[[ITER]] : !fir.ref<i32>, i32)   map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref<i32> {name = "i"}
-!ALL: omp.target map_entries(%[[MAP0]] -> %{{.*}}, %[[MAP1]] -> %{{.*}}, %[[MAP2]] -> %{{.*}} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>, !fir.ref<i32>) {
-
-subroutine read_write_section()
-    integer :: sp_read(10) = (/1,2,3,4,5,6,7,8,9,10/)
-    integer :: sp_write(10) = (/0,0,0,0,0,0,0,0,0,0/)
-
-!$omp target map(tofrom:sp_read(2:5)) map(tofrom:sp_write(2:5))
-    do i = 2, 5
-        sp_write(i) = sp_read(i)
-    end do
-!$omp end target
-end subroutine read_write_section
-
-module assumed_array_routines
-contains
-!ALL-LABEL: func.func @_QMassumed_array_routinesPassumed_shape_array(
-!ALL-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "arr_read_write"})
-!ALL: %[[INTERMEDIATE_ALLOCA:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
-!ALL: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_shape_arrayEi"}
-!ALL: %[[C0:.*]] = arith.constant 1 : index
-!ALL: %[[C1:.*]] = arith.constant 0 : index
-!ALL: %[[DIMS0:.*]]:3 = fir.box_dims %arg0, %[[C1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-!ALL: %[[C3:.*]] = arith.constant 1 : index
-!ALL: %[[C4:.*]] = arith.constant 4 : index
-!ALL: %[[C0_1:.*]] = arith.constant 0 : index
-!ALL: %[[DIMS1:.*]]:3 = fir.box_dims %arg0, %[[C0_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-!ALL: %[[BOUNDS:.*]] = omp.map.bounds   lower_bound(%[[C3]] : index) upper_bound(%[[C4]] : index) extent(%[[DIMS1]]#1 : index) stride(%[[DIMS0]]#2 : index) start_idx(%[[C0]] : index) {stride_in_bytes = true}
-!ALL: %[[BOXADDRADDR:.*]] = fir.box_offset %0 base_addr : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-!ALL: %[[MAP_MEMBER:.*]] = omp.map.info var_ptr(%0 : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.array<?xi32>) var_ptr_ptr(%[[BOXADDRADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
-!ALL: %[[MAP:.*]] = omp.map.info var_ptr(%0 : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.box<!fir.array<?xi32>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBER]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.array<?xi32>> {name = "arr_read_write(2:5)"}
-!ALL: %[[MAP2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref<i32>, i32)   map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref<i32> {name = "i"}
-!ALL: omp.target map_entries(%[[MAP_MEMBER]] -> %{{.*}}, %[[MAP]] -> %{{.*}}, %[[MAP2]] -> %{{.*}} : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>, !fir.ref<!fir.array<?xi32>>, !fir.ref<i32>) {
-    subroutine assumed_shape_array(arr_read_write)
-        integer, intent(inout) :: arr_read_write(:)
-
-        !$omp target map(tofrom:arr_read_write(2:5))
-            do i = 2, 5
-                arr_read_write(i) = i
-            end do
-        !$omp end target
-    end subroutine assumed_shape_array
-
-!ALL-LABEL:   func.func @_QMassumed_array_routinesPassumed_size_array(
-!ALL-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xi32>> {fir.bindc_name = "arr_read_write"})
-!ALL: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEi"}
-!ALL: %[[C0:.*]] = arith.constant 1 : index
-!ALL: %[[C1:.*]] = arith.constant 1 : index
-!ALL: %[[C2:.*]] = arith.constant 4 : index
-!ALL: %[[DIFF:.*]] = arith.subi %[[C2]], %[[C1]] : index
-!ALL: %[[EXT:.*]] = arith.addi %[[DIFF]], %[[C0]] : index
-!ALL: %[[BOUNDS:.*]]  = omp.map.bounds   lower_bound(%[[C1]] : index) upper_bound(%[[C2]] : index) extent(%[[EXT]] : index) stride(%[[C0]] : index) start_idx(%[[C0]] : index)
-!ALL: %[[MAP:.*]] = omp.map.info var_ptr(%[[ARG0]] : !fir.ref<!fir.array<?xi32>>, !fir.array<?xi32>)   map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<?xi32>> {name = "arr_read_write(2:5)"}
-!ALL: %[[MAP2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref<i32>, i32)   map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref<i32> {name = "i"}
-!ALL: omp.target map_entries(%[[MAP]] -> %{{.*}}, %[[MAP2]] -> %{{.*}} : !fir.ref<!fir.array<?xi32>>, !fir.ref<i32>) {
-    subroutine assumed_size_array(arr_read_write)
-        integer, intent(inout) :: arr_read_write(*)
-
-    !$omp target map(tofrom:arr_read_write(2:5))
-        do i = 2, 5
-            arr_read_write(i) = i
-        end do
-    !$omp end target
-    end subroutine assumed_size_array
-end module assumed_array_routines
-
-!DEVICE-NOT:func.func @_QPcall_assumed_shape_and_size_array() {
-
-!HOST-LABEL:func.func @_QPcall_assumed_shape_and_size_array() {
-!HOST:%{{.*}} = arith.constant 20 : index
-!HOST:%[[ALLOCA:.*]] = fir.alloca !fir.array<20xi32> {bindc_name = "arr_read_write", uniq_name = "_QFcall_assumed_shape_and_size_arrayEarr_read_write"}
-!HOST:%{{.*}} = arith.constant 1 : i64
-!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index
-!HOST:%{{.*}} = arith.constant 1 : i64
-!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index
-!HOST:%{{.*}} = arith.constant 10 : i64
-!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index
-!HOST:%[[SHAPE0:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-!HOST:%[[SLICE0:.*]] = fir.slice %{{.*}}, %{{.*}}, %{{.*}} : (index, index, index) -> !fir.slice<1>
-!HOST:%[[ARG0EMB:.*]] = fir.embox %[[ALLOCA]](%[[SHAPE0]]) [%[[SLICE0]]] : (!fir.ref<!fir.array<20xi32>>, !fir.shape<1>, !fir.slice<1>) -> !fir.box<!fir.array<10xi32>>
-!HOST:%[[ARG0:.*]] = fir.convert %[[ARG0EMB]] : (!fir.box<!fir.array<10xi32>>) -> !fir.box<!fir.array<?xi32>>
-!HOST:fir.call @_QMassumed_array_routinesPassumed_shape_array(%[[ARG0]]) fastmath<contract> : (!fir.box<!fir.array<?xi32>>) -> ()
-!HOST:%{{.*}} = arith.constant 10 : i64
-!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index
-!HOST:%{{.*}} = arith.constant 1 : i64
-!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index
-!HOST:%{{.*}} = arith.constant 20 : i64
-!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index
-!HOST:%[[SHAPE1:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-!HOST:%[[SLICE1:.*]] = fir.slice %{{.*}}, %{{.*}}, %{{.*}} : (index, index, index) -> !fir.slice<1>
-!HOST:%[[ARG1EMB:.*]] = fir.embox %[[ALLOCA]](%[[SHAPE1]]) [%[[SLICE1]]] : (!fir.ref<!fir.array<20xi32>>, !fir.shape<1>, !fir.slice<1>) -> !fir.box<!fir.array<11xi32>>
-!HOST:%[[ADDROF:.*]] = fir.box_addr %[[ARG1EMB]] : (!fir.box<!fir.array<11xi32>>) -> !fir.ref<!fir.array<11xi32>>
-!HOST:%[[ARG1:.*]] = fir.convert %[[ADDROF]] : (!fir.ref<!fir.array<11xi32>>) -> !fir.ref<!fir.array<?xi32>>
-!HOST:fir.call @_QMassumed_array_routinesPassumed_size_array(%[[ARG1]]) fastmath<contract> : (!fir.ref<!fir.array<?xi32>>) -> ()
-!HOST:return
-!HOST:}
-subroutine call_assumed_shape_and_size_array
-    use assumed_array_routines
-    integer :: arr_read_write(20)
-    call assumed_shape_array(arr_read_write(1:10))
-    call assumed_size_array(arr_read_write(10:20))
-end subroutine call_assumed_shape_and_size_array
diff --git a/flang/test/Lower/OpenMP/FIR/atomic-capture.f90 b/flang/test/Lower/OpenMP/FIR/atomic-capture.f90
deleted file mode 100644
index 9b94214b9da8..000000000000
--- a/flang/test/Lower/OpenMP/FIR/atomic-capture.f90
+++ /dev/null
@@ -1,119 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-! This test checks the lowering of atomic capture
-
-program OmpAtomicCapture
-    use omp_lib                                                                                                       
-    integer :: x, y
-
-!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
-!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: omp.atomic.capture   memory_order(release) {
-!CHECK: omp.atomic.read %[[X]] = %[[Y]] : !fir.ref<i32>
-!CHECK: omp.atomic.update %[[Y]] : !fir.ref<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[ARG]] : i32
-!CHECK: omp.yield(%[[result]] : i32)
-!CHECK: }
-!CHECK: }
-
-    !$omp atomic capture release
-        x = y
-        y = x + y
-    !$omp end atomic
-
-
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: omp.atomic.capture   hint(uncontended) {
-!CHECK: omp.atomic.update %[[Y]] : !fir.ref<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[result:.*]] = arith.muli %[[temp]], %[[ARG]] : i32
-!CHECK: omp.yield(%[[result]] : i32)
-!CHECK: }
-!CHECK: omp.atomic.read %[[X]] = %[[Y]] : !fir.ref<i32>
-!CHECK: }
-
-    !$omp atomic hint(omp_sync_hint_uncontended) capture
-        y = x * y 
-        x = y
-    !$omp end atomic
-
-!CHECK: %[[constant_20:.*]] = arith.constant 20 : i32
-!CHECK: %[[constant_8:.*]] = arith.constant 8 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.subi %[[constant_8]], %[[temp]] : i32
-!CHECK: %[[result_noreassoc:.*]] = fir.no_reassoc %[[result]] : i32
-!CHECK: %[[result:.*]] = arith.addi %[[constant_20]], %[[result_noreassoc]] : i32
-!CHECK: omp.atomic.capture   memory_order(acquire) hint(nonspeculative) {
-!CHECK: omp.atomic.read %[[X]] = %[[Y]] : !fir.ref<i32>
-!CHECK: omp.atomic.write %[[Y]] = %[[result]] : !fir.ref<i32>, i32
-!CHECK: }
-
-    !$omp atomic hint(omp_lock_hint_nonspeculative) capture acquire
-        x = y
-        y = 2 * 10 + (8 - x) 
-    !$omp end atomic
-
-
-!CHECK: %[[constant_20:.*]] = arith.constant 20 : i32
-!CHECK: %[[constant_8:.*]] = arith.constant 8 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.subi %[[constant_8]], %[[temp]] : i32
-!CHECK: %[[result_noreassoc:.*]] = fir.no_reassoc %[[result]] : i32
-!CHECK: %[[result:.*]] = arith.addi %[[constant_20]], %[[result_noreassoc]] : i32
-!CHECK: omp.atomic.capture {
-!CHECK: omp.atomic.read %[[X]] = %[[Y]] : !fir.ref<i32>
-!CHECK: omp.atomic.write %[[Y]] = %[[result]] : !fir.ref<i32>, i32
-!CHECK: }
-
-    !$omp atomic capture
-        x = y
-        y = 2 * 10 + (8 - x) 
-    !$omp end atomic 
-end program
-
-
-
-subroutine pointers_in_atomic_capture()
-!CHECK: %[[A:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "a", uniq_name = "_QFpointers_in_atomic_captureEa"}
-!CHECK: {{.*}} = fir.zero_bits !fir.ptr<i32>
-!CHECK: {{.*}} = fir.embox {{.*}} : (!fir.ptr<i32>) -> !fir.box<!fir.ptr<i32>>
-!CHECK: fir.store {{.*}} to %[[A]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: %[[B:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "b", uniq_name = "_QFpointers_in_atomic_captureEb"}
-!CHECK: {{.*}} = fir.zero_bits !fir.ptr<i32>
-!CHECK: {{.*}} = fir.embox {{.*}} : (!fir.ptr<i32>) -> !fir.box<!fir.ptr<i32>>
-!CHECK: fir.store {{.*}} to %[[B]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: %[[C:.*]] = fir.alloca i32 {bindc_name = "c", fir.target, uniq_name = "_QFpointers_in_atomic_captureEc"}
-!CHECK: %[[D:.*]] = fir.alloca i32 {bindc_name = "d", fir.target, uniq_name = "_QFpointers_in_atomic_captureEd"}
-!CHECK: {{.*}} = fir.embox {{.*}} : (!fir.ref<i32>) -> !fir.box<!fir.ptr<i32>>
-!CHECK: fir.store {{.*}} to %[[A]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: {{.*}} = fir.embox {{.*}} : (!fir.ref<i32>) -> !fir.box<!fir.ptr<i32>>
-!CHECK: fir.store {{.*}} to %[[B]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: %[[loaded_A:.*]] = fir.load %[[A]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: %[[loaded_A_addr:.*]] = fir.box_addr %[[loaded_A]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
-!CHECK: %[[loaded_B:.*]] = fir.load %[[B]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: %[[loaded_B_addr:.*]] = fir.box_addr %[[loaded_B]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
-!CHECK: %[[PRIVATE_LOADED_B:.*]] = fir.load %[[B]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: %[[PRIVATE_LOADED_B_addr:.*]] = fir.box_addr %[[PRIVATE_LOADED_B]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
-!CHECK: %[[loaded_value:.*]] = fir.load %[[PRIVATE_LOADED_B_addr]] : !fir.ptr<i32>
-!CHECK: omp.atomic.capture   {
-!CHECK: omp.atomic.update %[[loaded_A_addr]] : !fir.ptr<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[result:.*]] = arith.addi %[[ARG]], %[[loaded_value]] : i32
-!CHECK: omp.yield(%[[result]] : i32)
-!CHECK: }
-!CHECK: omp.atomic.read %[[loaded_B_addr]] = %[[loaded_A_addr]] : !fir.ptr<i32>, i32
-!CHECK: }
-    integer, pointer :: a, b
-    integer, target :: c, d
-    a=>c
-    b=>d
-
-    !$omp atomic capture
-        a = a + b
-        b = a
-    !$omp end atomic
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/atomic-read.f90 b/flang/test/Lower/OpenMP/FIR/atomic-read.f90
deleted file mode 100644
index 7698c3d7490f..000000000000
--- a/flang/test/Lower/OpenMP/FIR/atomic-read.f90
+++ /dev/null
@@ -1,80 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! RUN: bbc --use-desc-for-alloc=false -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-! This test checks the lowering of atomic read
-
-!CHECK: func @_QQmain() attributes {fir.bindc_name = "ompatomic"} {
-!CHECK: %[[VAR_A:.*]] = fir.alloca !fir.char<1> {bindc_name = "a", uniq_name = "_QFEa"}
-!CHECK: %[[VAR_B:.*]] = fir.alloca !fir.char<1> {bindc_name = "b", uniq_name = "_QFEb"}
-!CHECK: %[[VAR_C:.*]] = fir.alloca !fir.logical<4> {bindc_name = "c", uniq_name = "_QFEc"}
-!CHECK: %[[VAR_D:.*]] = fir.alloca !fir.logical<4> {bindc_name = "d", uniq_name = "_QFEd"}
-!CHECK: %[[VAR_E:.*]] = fir.alloca !fir.char<1,8> {bindc_name = "e", uniq_name = "_QFEe"}
-!CHECK: %[[VAR_F:.*]] = fir.alloca !fir.char<1,8> {bindc_name = "f", uniq_name = "_QFEf"}
-!CHECK: %[[VAR_G:.*]] = fir.alloca f32 {bindc_name = "g", uniq_name = "_QFEg"}
-!CHECK: %[[VAR_H:.*]] = fir.alloca f32 {bindc_name = "h", uniq_name = "_QFEh"}
-!CHECK: %[[VAR_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
-!CHECK: %[[VAR_Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
-!CHECK: omp.atomic.read %[[VAR_X]] = %[[VAR_Y]] memory_order(acquire)  hint(uncontended) : !fir.ref<i32>, i32
-!CHECK: omp.atomic.read %[[VAR_A]] = %[[VAR_B]] memory_order(relaxed) : !fir.ref<!fir.char<1>>, !fir.char<1>
-!CHECK: omp.atomic.read %[[VAR_C]] = %[[VAR_D]] memory_order(seq_cst)  hint(contended) : !fir.ref<!fir.logical<4>>, !fir.logical<4>
-!CHECK: omp.atomic.read %[[VAR_E]] = %[[VAR_F]] hint(speculative) : !fir.ref<!fir.char<1,8>>, !fir.char<1,8>
-!CHECK: omp.atomic.read %[[VAR_G]] = %[[VAR_H]] hint(nonspeculative) : !fir.ref<f32>, f32
-!CHECK: omp.atomic.read %[[VAR_G]] = %[[VAR_H]] : !fir.ref<f32>, f32
-!CHECK: return
-!CHECK: }
-
-program OmpAtomic
-
-    use omp_lib
-    integer :: x, y
-    character :: a, b
-    logical :: c, d
-    character(8) :: e, f
-    real g, h
-    !$omp atomic acquire read hint(omp_sync_hint_uncontended)
-       x = y
-    !$omp atomic relaxed read hint(omp_sync_hint_none)
-       a = b
-    !$omp atomic read seq_cst hint(omp_sync_hint_contended)
-       c = d
-    !$omp atomic read hint(omp_sync_hint_speculative)
-       e = f
-    !$omp atomic read hint(omp_sync_hint_nonspeculative)
-       g = h
-    !$omp atomic read
-       g = h
-end program OmpAtomic
-
-! Test lowering atomic read for pointer variables.
-! Please notice to use %[[VAL_4]] and %[[VAL_1]] for operands of atomic
-! operation, instead of %[[VAL_3]] and %[[VAL_0]].
-
-!CHECK-LABEL: func.func @_QPatomic_read_pointer() {
-!CHECK:         %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "x", uniq_name = "_QFatomic_read_pointerEx"}
-!CHECK:         %[[VAL_1:.*]] = fir.alloca !fir.ptr<i32> {uniq_name = "_QFatomic_read_pointerEx.addr"}
-!CHECK:         %[[VAL_2:.*]] = fir.zero_bits !fir.ptr<i32>
-!CHECK:         fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
-!CHECK:         %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "y", uniq_name = "_QFatomic_read_pointerEy"}
-!CHECK:         %[[VAL_4:.*]] = fir.alloca !fir.ptr<i32> {uniq_name = "_QFatomic_read_pointerEy.addr"}
-!CHECK:         %[[VAL_5:.*]] = fir.zero_bits !fir.ptr<i32>
-!CHECK:         fir.store %[[VAL_5]] to %[[VAL_4]] : !fir.ref<!fir.ptr<i32>>
-!CHECK:         %[[VAL_6:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
-!CHECK:         %[[VAL_7:.*]] = fir.load %[[VAL_4]] : !fir.ref<!fir.ptr<i32>>
-!CHECK:         omp.atomic.read %[[VAL_7]] = %[[VAL_6]]   : !fir.ptr<i32>, i32
-!CHECK:         %[[VAL_8:.*]] = fir.load %[[VAL_4]] : !fir.ref<!fir.ptr<i32>>
-!CHECK:         %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ptr<i32>
-!CHECK:         %[[VAL_10:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
-!CHECK:         fir.store %[[VAL_9]] to %[[VAL_10]] : !fir.ptr<i32>
-!CHECK:         return
-!CHECK:       }
-
-subroutine atomic_read_pointer()
-  integer, pointer :: x, y
-
-  !$omp atomic read
-    y = x
-
-  x = y
-end
-
diff --git a/flang/test/Lower/OpenMP/FIR/atomic-update.f90 b/flang/test/Lower/OpenMP/FIR/atomic-update.f90
deleted file mode 100644
index ae201807c337..000000000000
--- a/flang/test/Lower/OpenMP/FIR/atomic-update.f90
+++ /dev/null
@@ -1,141 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! This test checks lowering of atomic and atomic update constructs
-! RUN: bbc --use-desc-for-alloc=false -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -mllvm --use-desc-for-alloc=false -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-program OmpAtomicUpdate
-    use omp_lib
-    integer :: x, y, z
-    integer, pointer :: a, b
-    integer, target :: c, d
-    integer(1) :: i1
-
-    a=>c
-    b=>d
-
-!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "ompatomicupdate"} {
-!CHECK: %[[A:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "a", uniq_name = "_QFEa"}
-!CHECK: %[[A_ADDR:.*]] = fir.alloca !fir.ptr<i32> {uniq_name = "_QFEa.addr"}
-!CHECK: %{{.*}} = fir.zero_bits !fir.ptr<i32>
-!CHECK: fir.store %{{.*}} to %[[A_ADDR]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %[[B:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "b", uniq_name = "_QFEb"}
-!CHECK: %[[B_ADDR:.*]] = fir.alloca !fir.ptr<i32> {uniq_name = "_QFEb.addr"}
-!CHECK: %{{.*}} = fir.zero_bits !fir.ptr<i32>
-!CHECK: fir.store %{{.*}} to %[[B_ADDR]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %[[C_ADDR:.*]] = fir.address_of(@_QFEc) : !fir.ref<i32>
-!CHECK: %[[D_ADDR:.*]] = fir.address_of(@_QFEd) : !fir.ref<i32>
-!CHECK: %[[I1:.*]] = fir.alloca i8 {bindc_name = "i1", uniq_name = "_QFEi1"}
-!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
-!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
-!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFEz"}
-!CHECK: %{{.*}} = fir.convert %[[C_ADDR]] : (!fir.ref<i32>) -> !fir.ptr<i32>
-!CHECK: fir.store %{{.*}} to %[[A_ADDR]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %{{.*}} = fir.convert %[[D_ADDR]] : (!fir.ref<i32>) -> !fir.ptr<i32>
-!CHECK: fir.store {{.*}} to %[[B_ADDR]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %[[LOADED_A:.*]] = fir.load %[[A_ADDR]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %[[LOADED_B:.*]] = fir.load %[[B_ADDR]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %{{.*}} = fir.load %[[LOADED_B]] : !fir.ptr<i32>
-!CHECK:  omp.atomic.update   %[[LOADED_A]] : !fir.ptr<i32> {
-!CHECK:  ^bb0(%[[ARG:.*]]: i32):
-!CHECK:    %[[RESULT:.*]] = arith.addi %[[ARG]], %{{.*}} : i32
-!CHECK:    omp.yield(%[[RESULT]] : i32)
-!CHECK: }
-    !$omp atomic update
-        a = a + b 
-
-!CHECK: {{.*}} = arith.constant 1 : i32
-!CHECK: omp.atomic.update   %[[Y]] : !fir.ref<i32> {
-!CHECK:  ^bb0(%[[ARG:.*]]: i32):
-!CHECK:    %[[RESULT:.*]] = arith.addi %[[ARG]], {{.*}} : i32
-!CHECK:    omp.yield(%[[RESULT]] : i32)
-!CHECK:  }
-!CHECK:  %[[LOADED_X:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK:  omp.atomic.update   %[[Z]] : !fir.ref<i32> {
-!CHECK:  ^bb0(%[[ARG:.*]]: i32):
-!CHECK:    %[[RESULT:.*]] = arith.muli %[[LOADED_X]], %[[ARG]] : i32
-!CHECK:    omp.yield(%[[RESULT]] : i32)
-!CHECK:  }
-    !$omp atomic 
-        y = y + 1
-    !$omp atomic update
-        z = x * z 
-
-!CHECK:  %{{.*}} = arith.constant 1 : i32
-!CHECK:  omp.atomic.update   memory_order(relaxed) hint(uncontended) %[[X]] : !fir.ref<i32> {
-!CHECK:  ^bb0(%[[ARG:.*]]: i32):
-!CHECK:    %[[RESULT:.*]] = arith.subi %[[ARG]], {{.*}} : i32
-!CHECK:    omp.yield(%[[RESULT]] : i32)
-!CHECK:  }
-!CHECK:  %[[LOADED_X:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK:  %[[LOADED_Z:.*]] = fir.load %[[Z]] : !fir.ref<i32>
-!CHECK:  omp.atomic.update   memory_order(relaxed) %[[Y]] : !fir.ref<i32> {
-!CHECK:  ^bb0(%[[ARG:.*]]: i32):
-!CHECK:    %{{.*}} = arith.cmpi sgt, %[[ARG]], %[[LOADED_X]] : i32
-!CHECK:    %{{.*}} = arith.select %{{.*}}, %[[ARG]], %[[LOADED_X]] : i32
-!CHECK:    %{{.*}} = arith.cmpi sgt, %{{.*}}, %[[LOADED_Z]] : i32
-!CHECK:    %[[RESULT:.*]] = arith.select %{{.*}}, %{{.*}}, %[[LOADED_Z]] : i32
-!CHECK:    omp.yield(%[[RESULT]] : i32)
-!CHECK:  }
-!CHECK:  %[[LOADED_X:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK:  omp.atomic.update   memory_order(relaxed) hint(contended) %[[Z]] : !fir.ref<i32> {
-!CHECK:  ^bb0(%[[ARG:.*]]: i32):
-!CHECK:    %[[RESULT:.*]] = arith.addi %[[ARG]], %[[LOADED_X]] : i32
-!CHECK:    omp.yield(%[[RESULT]] : i32)
-!CHECK:  }
-    !$omp atomic relaxed update hint(omp_sync_hint_uncontended)
-        x = x - 1
-    !$omp atomic update relaxed 
-        y = max(y, x, z)
-    !$omp atomic relaxed hint(omp_sync_hint_contended)
-        z = z + x
-
-!CHECK:  %{{.*}} = arith.constant 10 : i32
-!CHECK:  omp.atomic.update   memory_order(release) hint(contended) %[[Z]] : !fir.ref<i32> {
-!CHECK:  ^bb0(%[[ARG:.*]]: i32):
-!CHECK:   %[[RESULT:.*]] = arith.muli {{.*}}, %[[ARG]] : i32
-!CHECK:    omp.yield(%[[RESULT]] : i32)
-!CHECK:  }
-!CHECK:  %[[LOADED_Z:.*]] = fir.load %[[Z]] : !fir.ref<i32>
-!CHECK:  omp.atomic.update   memory_order(release) hint(speculative) %[[X]] : !fir.ref<i32> {
-!CHECK:  ^bb0(%[[ARG:.*]]: i32):
-!CHECK:    %[[RESULT:.*]] = arith.divsi %[[ARG]], %[[LOADED_Z]] : i32
-!CHECK:    omp.yield(%[[RESULT]] : i32)
-!CHECK:  }
-
-    !$omp atomic release update hint(omp_lock_hint_contended)
-        z = z * 10
-    !$omp atomic hint(omp_lock_hint_speculative) update release
-        x = x / z
-
-!CHECK:  %{{.*}} = arith.constant 10 : i32
-!CHECK:  omp.atomic.update   memory_order(seq_cst) hint(nonspeculative) %[[Y]] : !fir.ref<i32> {
-!CHECK:  ^bb0(%[[ARG:.*]]: i32):
-!CHECK:    %[[RESULT:.*]] = arith.addi %{{.*}}, %[[ARG]] : i32
-!CHECK:   omp.yield(%[[RESULT]] : i32)
-!CHECK:  }
-!CHECK:  %[[LOADED_Y:.*]] = fir.load %[[Y]] : !fir.ref<i32>
-!CHECK:  omp.atomic.update   memory_order(seq_cst) %[[Z]] : !fir.ref<i32> {
-!CHECK:  ^bb0(%[[ARG:.*]]: i32):
-!CHECK:    %[[RESULT:.*]] = arith.addi %[[LOADED_Y]], %[[ARG]] : i32
-!CHECK:    omp.yield(%[[RESULT]] : i32)
-!CHECK:  }
-    !$omp atomic hint(omp_sync_hint_nonspeculative) seq_cst
-        y = 10 + y
-    !$omp atomic seq_cst update
-        z = y + z
-
-!CHECK:  %[[C1_VAL:.*]] = arith.constant 1 : i32
-!CHECK:  omp.atomic.update   %[[I1]] : !fir.ref<i8> {
-!CHECK:  ^bb0(%[[VAL:.*]]: i8):
-!CHECK:    %[[CVT_VAL:.*]] = fir.convert %[[VAL]] : (i8) -> i32
-!CHECK:    %[[ADD_VAL:.*]] = arith.addi %[[CVT_VAL]], %[[C1_VAL]] : i32
-!CHECK:    %[[UPDATED_VAL:.*]] = fir.convert %[[ADD_VAL]] : (i32) -> i8
-!CHECK:    omp.yield(%[[UPDATED_VAL]] : i8)
-!CHECK:  }
-    !$omp atomic
-      i1 = i1 + 1
-    !$omp end atomic
-!CHECK:  return
-!CHECK: }
-end program OmpAtomicUpdate
diff --git a/flang/test/Lower/OpenMP/FIR/atomic-write.f90 b/flang/test/Lower/OpenMP/FIR/atomic-write.f90
deleted file mode 100644
index 142481b7a1d2..000000000000
--- a/flang/test/Lower/OpenMP/FIR/atomic-write.f90
+++ /dev/null
@@ -1,77 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! RUN: bbc --use-desc-for-alloc=false -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-! This test checks the lowering of atomic write
-
-!CHECK: func @_QQmain() attributes {fir.bindc_name = "ompatomicwrite"} {
-!CHECK: %[[VAR_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
-!CHECK: %[[VAR_Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
-!CHECK: %[[VAR_Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFEz"}
-!CHECK: %[[CONST_44:.*]] = arith.constant 44 : i32
-!CHECK: omp.atomic.write %[[VAR_X]] = %[[CONST_44]] hint(uncontended) memory_order(seq_cst) : !fir.ref<i32>, i32
-!CHECK: %[[CONST_7:.*]] = arith.constant 7 : i32
-!CHECK: {{.*}} = fir.load %[[VAR_Y]] : !fir.ref<i32>
-!CHECK: %[[VAR_7y:.*]] = arith.muli %[[CONST_7]], {{.*}} : i32
-!CHECK: omp.atomic.write %[[VAR_X]] = %[[VAR_7y]] memory_order(relaxed) : !fir.ref<i32>, i32
-!CHECK: %[[CONST_10:.*]] = arith.constant 10 : i32
-!CHECK: {{.*}} = fir.load %[[VAR_X]] : !fir.ref<i32>
-!CHECK: {{.*}} = arith.muli %[[CONST_10]], {{.*}} : i32
-!CHECK: {{.*}} = fir.load %[[VAR_Z]] : !fir.ref<i32>
-!CHECK: %[[CONST_2:.*]] = arith.constant 2 : i32
-!CHECK: {{.*}} = arith.divsi {{.*}}, %[[CONST_2]] : i32
-!CHECK: {{.*}} = arith.addi {{.*}}, {{.*}} : i32
-!CHECK: omp.atomic.write %[[VAR_Y]] = {{.*}} hint(speculative) memory_order(release) : !fir.ref<i32>, i32
-!CHECK: return
-!CHECK: }
-
-program OmpAtomicWrite
-    use omp_lib
-    integer :: x, y, z
-    !$omp atomic seq_cst write hint(omp_sync_hint_uncontended)
-        x = 8*4 + 12
-
-    !$omp atomic write relaxed
-        x = 7 * y
-
-    !$omp atomic write release hint(omp_sync_hint_speculative)
-        y = 10*x + z/2
-end program OmpAtomicWrite
-
-! Test lowering atomic read for pointer variables.
-! Please notice to use %[[VAL_1]] for operands of atomic operation, instead
-! of %[[VAL_0]].
-
-!CHECK-LABEL: func.func @_QPatomic_write_pointer() {
-!CHECK:         %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "x", uniq_name = "_QFatomic_write_pointerEx"}
-!CHECK:         %[[VAL_1:.*]] = fir.alloca !fir.ptr<i32> {uniq_name = "_QFatomic_write_pointerEx.addr"}
-!CHECK:         %[[VAL_2:.*]] = fir.zero_bits !fir.ptr<i32>
-!CHECK:         fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
-!CHECK:         %[[VAL_3:.*]] = arith.constant 1 : i32
-!CHECK:         %[[VAL_4:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
-!CHECK:         omp.atomic.write %[[VAL_4]] = %[[VAL_3]]   : !fir.ptr<i32>, i32
-!CHECK:         %[[VAL_5:.*]] = arith.constant 2 : i32
-!CHECK:         %[[VAL_6:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
-!CHECK:         fir.store %[[VAL_5]] to %[[VAL_6]] : !fir.ptr<i32>
-!CHECK:         return
-!CHECK:       }
-
-subroutine atomic_write_pointer()
-  integer, pointer :: x
-
-  !$omp atomic write
-    x = 1
-
-  x = 2
-end
-
-!CHECK-LABEL: func.func @_QPatomic_write_typed_assign
-!CHECK: %[[VAR:.*]] = fir.alloca f32 {bindc_name = "r2", uniq_name = "{{.*}}r2"}
-!CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK: omp.atomic.write %[[VAR]] = %[[CST]]   : !fir.ref<f32>, f32
-
-subroutine atomic_write_typed_assign
-  real :: r2
-  !$omp atomic write
-  r2 = 0
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/copyin.f90 b/flang/test/Lower/OpenMP/FIR/copyin.f90
deleted file mode 100644
index e256404d3d55..000000000000
--- a/flang/test/Lower/OpenMP/FIR/copyin.f90
+++ /dev/null
@@ -1,358 +0,0 @@
-! This test checks lowering of `COPYIN` clause.
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-! CHECK-LABEL: func.func @_QPcopyin_scalar_array() {
-! CHECK:         %[[VAL_0:.*]] = fir.address_of(@_QFcopyin_scalar_arrayEx1) : !fir.ref<i32>
-! CHECK:         %[[VAL_1:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK:         %[[VAL_2:.*]] = fir.address_of(@_QFcopyin_scalar_arrayEx2) : !fir.ref<!fir.array<10xi64>>
-! CHECK:         %[[VAL_3:.*]] = arith.constant 10 : index
-! CHECK:         %[[VAL_4:.*]] = omp.threadprivate %[[VAL_2]] : !fir.ref<!fir.array<10xi64>> -> !fir.ref<!fir.array<10xi64>>
-! CHECK:         omp.parallel   {
-! CHECK:           %[[VAL_5:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-! CHECK:           fir.store %[[VAL_6]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK:           %[[VAL_7:.*]] = omp.threadprivate %[[VAL_2]] : !fir.ref<!fir.array<10xi64>> -> !fir.ref<!fir.array<10xi64>>
-! CHECK:           %[[VAL_8:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_9:.*]] = fir.array_load %[[VAL_7]](%[[VAL_8]]) : (!fir.ref<!fir.array<10xi64>>, !fir.shape<1>) -> !fir.array<10xi64>
-! CHECK:           %[[VAL_10:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_11:.*]] = fir.array_load %[[VAL_4]](%[[VAL_10]]) : (!fir.ref<!fir.array<10xi64>>, !fir.shape<1>) -> !fir.array<10xi64>
-! CHECK:           %[[VAL_12:.*]] = arith.constant 1 : index
-! CHECK:           %[[VAL_13:.*]] = arith.constant 0 : index
-! CHECK:           %[[VAL_14:.*]] = arith.subi %[[VAL_3]], %[[VAL_12]] : index
-! CHECK:           %[[VAL_15:.*]] = fir.do_loop %[[VAL_16:.*]] = %[[VAL_13]] to %[[VAL_14]] step %[[VAL_12]] unordered iter_args(%[[VAL_17:.*]] = %[[VAL_9]]) -> (!fir.array<10xi64>) {
-! CHECK:             %[[VAL_18:.*]] = fir.array_fetch %[[VAL_11]], %[[VAL_16]] : (!fir.array<10xi64>, index) -> i64
-! CHECK:             %[[VAL_19:.*]] = fir.array_update %[[VAL_17]], %[[VAL_18]], %[[VAL_16]] : (!fir.array<10xi64>, i64, index) -> !fir.array<10xi64>
-! CHECK:             fir.result %[[VAL_19]] : !fir.array<10xi64>
-! CHECK:           }
-! CHECK:           fir.array_merge_store %[[VAL_9]], %[[VAL_20:.*]] to %[[VAL_7]] : !fir.array<10xi64>, !fir.array<10xi64>, !fir.ref<!fir.array<10xi64>>
-! CHECK:           omp.barrier
-! CHECK:           fir.call @_QPsub1(%[[VAL_5]], %[[VAL_7]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.array<10xi64>>) -> ()
-! CHECK:           omp.terminator
-! CHECK:         }
-! CHECK:         return
-! CHECK:       }
-
-subroutine copyin_scalar_array()
-  integer(kind=4), save :: x1
-  integer(kind=8), save :: x2(10)
-  !$omp threadprivate(x1, x2)
-
-  !$omp parallel copyin(x1) copyin(x2)
-    call sub1(x1, x2)
-  !$omp end parallel
-
-end
-
-! CHECK-LABEL: func.func @_QPcopyin_char_chararray() {
-! CHECK:         %[[VAL_0:.*]] = fir.address_of(@_QFcopyin_char_chararrayEx3) : !fir.ref<!fir.char<1,5>>
-! CHECK:         %[[VAL_1:.*]] = arith.constant 5 : index
-! CHECK:         %[[VAL_2:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<!fir.char<1,5>> -> !fir.ref<!fir.char<1,5>>
-! CHECK:         %[[VAL_3:.*]] = fir.address_of(@_QFcopyin_char_chararrayEx4) : !fir.ref<!fir.array<10x!fir.char<1,5>>>
-! CHECK:         %[[VAL_4:.*]] = arith.constant 5 : index
-! CHECK:         %[[VAL_5:.*]] = arith.constant 10 : index
-! CHECK:         %[[VAL_6:.*]] = omp.threadprivate %[[VAL_3]] : !fir.ref<!fir.array<10x!fir.char<1,5>>> -> !fir.ref<!fir.array<10x!fir.char<1,5>>>
-! CHECK:         omp.parallel   {
-! CHECK:           %[[VAL_7:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<!fir.char<1,5>> -> !fir.ref<!fir.char<1,5>>
-! CHECK:           %[[VAL_8:.*]] = arith.constant 1 : i64
-! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_1]] : (index) -> i64
-! CHECK:           %[[VAL_10:.*]] = arith.muli %[[VAL_8]], %[[VAL_9]] : i64
-! CHECK:           %[[VAL_11:.*]] = arith.constant false
-! CHECK:           %[[VAL_12:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-! CHECK:           fir.call @llvm.memmove.p0.p0.i64(%[[VAL_12]], %[[VAL_13]], %[[VAL_10]], %[[VAL_11]]) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
-! CHECK:           %[[VAL_14:.*]] = omp.threadprivate %[[VAL_3]] : !fir.ref<!fir.array<10x!fir.char<1,5>>> -> !fir.ref<!fir.array<10x!fir.char<1,5>>>
-! CHECK:           %[[VAL_15:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_16:.*]] = fir.array_load %[[VAL_14]](%[[VAL_15]]) : (!fir.ref<!fir.array<10x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.array<10x!fir.char<1,5>>
-! CHECK:           %[[VAL_17:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK:           %[[VAL_18:.*]] = fir.array_load %[[VAL_6]](%[[VAL_17]]) : (!fir.ref<!fir.array<10x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.array<10x!fir.char<1,5>>
-! CHECK:           %[[VAL_19:.*]] = arith.constant 1 : index
-! CHECK:           %[[VAL_20:.*]] = arith.constant 0 : index
-! CHECK:           %[[VAL_21:.*]] = arith.subi %[[VAL_5]], %[[VAL_19]] : index
-! CHECK:           %[[VAL_22:.*]] = fir.do_loop %[[VAL_23:.*]] = %[[VAL_20]] to %[[VAL_21]] step %[[VAL_19]] unordered iter_args(%[[VAL_24:.*]] = %[[VAL_16]]) -> (!fir.array<10x!fir.char<1,5>>) {
-! CHECK:             %[[VAL_25:.*]] = fir.array_access %[[VAL_18]], %[[VAL_23]] : (!fir.array<10x!fir.char<1,5>>, index) -> !fir.ref<!fir.char<1,5>>
-! CHECK:             %[[VAL_26:.*]] = fir.array_access %[[VAL_24]], %[[VAL_23]] : (!fir.array<10x!fir.char<1,5>>, index) -> !fir.ref<!fir.char<1,5>>
-! CHECK:             %[[VAL_27:.*]] = arith.constant 5 : index
-! CHECK:             %[[VAL_28:.*]] = arith.constant 1 : i64
-! CHECK:             %[[VAL_29:.*]] = fir.convert %[[VAL_27]] : (index) -> i64
-! CHECK:             %[[VAL_30:.*]] = arith.muli %[[VAL_28]], %[[VAL_29]] : i64
-! CHECK:             %[[VAL_31:.*]] = arith.constant false
-! CHECK:             %[[VAL_32:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-! CHECK:             %[[VAL_33:.*]] = fir.convert %[[VAL_25]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-! CHECK:             fir.call @llvm.memmove.p0.p0.i64(%[[VAL_32]], %[[VAL_33]], %[[VAL_30]], %[[VAL_31]]) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
-! CHECK:             %[[VAL_34:.*]] = fir.array_amend %[[VAL_24]], %[[VAL_26]] : (!fir.array<10x!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) -> !fir.array<10x!fir.char<1,5>>
-! CHECK:             fir.result %[[VAL_34]] : !fir.array<10x!fir.char<1,5>>
-! CHECK:           }
-! CHECK:           fir.array_merge_store %[[VAL_16]], %[[VAL_35:.*]] to %[[VAL_14]] : !fir.array<10x!fir.char<1,5>>, !fir.array<10x!fir.char<1,5>>, !fir.ref<!fir.array<10x!fir.char<1,5>>>
-! CHECK:           omp.barrier
-! CHECK:           %[[VAL_37:.*]] = fir.emboxchar %[[VAL_7]], %[[VAL_1]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
-! CHECK:           %[[VAL_38:.*]] = fir.convert %[[VAL_14]] : (!fir.ref<!fir.array<10x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK:           %[[VAL_39:.*]] = fir.emboxchar %[[VAL_38]], %[[VAL_4]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
-! CHECK:           fir.call @_QPsub2(%[[VAL_37]], %[[VAL_39]]) {{.*}}: (!fir.boxchar<1>, !fir.boxchar<1>) -> ()
-! CHECK:           omp.terminator
-! CHECK:         }
-! CHECK:         return
-! CHECK:       }
-
-subroutine copyin_char_chararray()
-  character(5), save :: x3, x4(10)
-  !$omp threadprivate(x3, x4)
-
-  !$omp parallel copyin(x3) copyin(x4)
-    call sub2(x3, x4)
-  !$omp end parallel
-
-end
-
-! CHECK-LABEL: func.func @_QPcopyin_derived_type() {
-! CHECK:         %[[VAL_0:.*]] = fir.address_of(@_QFcopyin_derived_typeEx5) : !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>
-! CHECK:         %[[VAL_1:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>> -> !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>
-! CHECK:         omp.parallel   {
-! CHECK:           %[[VAL_2:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>> -> !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>
-! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>
-! CHECK:           fir.store %[[VAL_3]] to %[[VAL_2]] : !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>
-! CHECK:           omp.barrier
-! CHECK:           fir.call @_QPsub3(%[[VAL_2]]) {{.*}}: (!fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>) -> ()
-! CHECK:           omp.terminator
-! CHECK:         }
-! CHECK:         return
-! CHECK:       }
-
-subroutine copyin_derived_type()
-  type my_type
-    integer :: t_i
-    integer :: t_arr(5)
-  end type my_type
-  type(my_type), save :: x5
-  !$omp threadprivate(x5)
-
-  !$omp parallel copyin(x5)
-    call sub3(x5)
-  !$omp end parallel
-
-end
-
-! CHECK-LABEL: func.func @_QPcombined_parallel_worksharing_loop() {
-! CHECK:         %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFcombined_parallel_worksharing_loopEi"}
-! CHECK:         %[[VAL_1:.*]] = fir.address_of(@_QFcombined_parallel_worksharing_loopEx6) : !fir.ref<i32>
-! CHECK:         %[[VAL_2:.*]] = omp.threadprivate %[[VAL_1]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK:         omp.parallel   {
-! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:           %[[VAL_4:.*]] = omp.threadprivate %[[VAL_1]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK:           %[[VAL_5:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
-! CHECK:           fir.store %[[VAL_5]] to %[[VAL_4]] : !fir.ref<i32>
-! CHECK:           omp.barrier
-! CHECK:           %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:           %[[VAL_7:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
-! CHECK:           %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:           omp.wsloop {
-! CHECK-NEXT:        omp.loop_nest (%[[VAL_9:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK:               fir.store %[[VAL_9]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK:               fir.call @_QPsub4(%[[VAL_4]]) {{.*}}: (!fir.ref<i32>) -> ()
-! CHECK:               omp.yield
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           omp.terminator
-! CHECK:         }
-! CHECK:         return
-! CHECK:       }
-
-subroutine combined_parallel_worksharing_loop()
-  integer, save :: x6
-  !$omp threadprivate(x6)
-
-  !$omp parallel do copyin(x6)
-    do i=1, x6
-      call sub4(x6)
-    end do
-  !$omp end parallel do
-
-end
-
-! CHECK-LABEL: func.func @_QPcombined_parallel_sections() {
-! CHECK:         %[[VAL_0:.*]] = fir.address_of(@_QFcombined_parallel_sectionsEx7) : !fir.ref<i32>
-! CHECK:         %[[VAL_1:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK:         omp.parallel   {
-! CHECK:           %[[VAL_2:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-! CHECK:           fir.store %[[VAL_3]] to %[[VAL_2]] : !fir.ref<i32>
-! CHECK:           omp.barrier
-! CHECK:           omp.sections   {
-! CHECK:             omp.section {
-! CHECK:               fir.call @_QPsub5(%[[VAL_2]]) {{.*}}: (!fir.ref<i32>) -> ()
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.section {
-! CHECK:               fir.call @_QPsub6(%[[VAL_2]]) {{.*}}: (!fir.ref<i32>) -> ()
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           omp.terminator
-! CHECK:         }
-! CHECK:         return
-! CHECK:       }
-
-subroutine combined_parallel_sections()
-  integer, save :: x7
-  !$omp threadprivate(x7)
-
-  !$omp parallel sections copyin(x7)
-    !$omp section
-      call sub5(x7)
-    !$omp section
-      call sub6(x7)
-  !$omp end parallel sections
-
-end
-
-
-!CHECK: func.func @_QPcommon_1() {
-!CHECK: %[[val_0:.*]] = fir.address_of(@c_) : !fir.ref<!fir.array<4xi8>>
-!CHECK: %[[val_1:.*]] = omp.threadprivate %[[val_0]] : !fir.ref<!fir.array<4xi8>> -> !fir.ref<!fir.array<4xi8>>
-!CHECK: %[[val_2:.*]] = fir.convert %[[val_1]] : (!fir.ref<!fir.array<4xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_3:.*]] = fir.coordinate_of %[[val_2]], %[[val_c0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_4:.*]] = fir.convert %[[val_3]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_5:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFcommon_1Ey"}
-!CHECK: omp.parallel {
-!CHECK: %[[val_6:.*]] = omp.threadprivate %[[val_0]] : !fir.ref<!fir.array<4xi8>> -> !fir.ref<!fir.array<4xi8>>
-!CHECK: %[[val_7:.*]] = fir.convert %[[val_6]] : (!fir.ref<!fir.array<4xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0_0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_8:.*]] = fir.coordinate_of %[[val_7]], %[[val_c0_0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_9:.*]] = fir.convert %[[val_8]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_10:.*]] = fir.load %[[val_4]] : !fir.ref<i32>
-!CHECK: fir.store %[[val_10]] to %[[val_9]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: omp.sections   {
-!CHECK: omp.section {
-!CHECK: %[[val_11:.*]] = fir.load %[[val_9]] : !fir.ref<i32>
-!CHECK: %[[val_c1_i32:.*]] = arith.constant 1 : i32
-!CHECK: %[[val_12:.*]] = arith.addi %[[val_11]], %[[val_c1_i32]] : i32
-!CHECK: fir.store %[[val_12]] to %[[val_5]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.section {
-!CHECK: %[[val_11:.*]] = fir.load %[[val_5]] : !fir.ref<i32>
-!CHECK: %[[val_12:.*]] = fir.load %[[val_5]] : !fir.ref<i32>
-!CHECK: %[[val_13:.*]] = arith.muli %[[val_11]], %[[val_12]] : i32
-!CHECK: fir.store %[[val_13]] to %[[val_9]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-subroutine common_1()
-  integer :: x
-  integer :: y
-  common /c/ x
-  !$omp threadprivate(/c/)
-
-  !$omp parallel sections copyin(/c/)
-      !$omp section
-        y = x + 1
-      !$omp section
-        x = y * y
-  !$omp end parallel sections
-end subroutine
-
-!CHECK: func.func @_QPcommon_2() {
-!CHECK: %[[val_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFcommon_2Ei"}
-!CHECK: %[[val_1:.*]] = fir.address_of(@d_) : !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_2:.*]] = omp.threadprivate %[[val_1]] : !fir.ref<!fir.array<8xi8>> -> !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_3:.*]] = fir.convert %[[val_2]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_4:.*]] = fir.coordinate_of %[[val_3]], %[[val_c0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_5:.*]] = fir.convert %[[val_4]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_6:.*]] = fir.convert %[[val_2]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4:.*]] = arith.constant 4 : index
-!CHECK: %[[val_7:.*]] = fir.coordinate_of %[[val_6]], %[[val_c4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_8:.*]] = fir.convert %[[val_7]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: omp.parallel {
-!CHECK: %[[val_9:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK: %[[val_10:.*]] = omp.threadprivate %[[val_1]] : !fir.ref<!fir.array<8xi8>> -> !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_11:.*]] = fir.convert %[[val_10]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0_0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_12:.*]] = fir.coordinate_of %[[val_11]], %[[val_c0_0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_13:.*]] = fir.convert %[[val_12]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_14:.*]] = fir.convert %[[val_10]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4_1:.*]] = arith.constant 4 : index
-!CHECK: %[[val_15:.*]] = fir.coordinate_of %[[val_14]], %[[val_c4_1]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_16:.*]] = fir.convert %[[val_15]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_17:.*]] = fir.load %[[val_5]] : !fir.ref<i32>
-!CHECK: fir.store %[[val_17]] to %[[val_13]] : !fir.ref<i32>
-!CHECK: %[[val_18:.*]] = fir.load %[[val_8]] : !fir.ref<i32>
-!CHECK: fir.store %[[val_18]] to %[[val_16]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: %[[val_c1_i32:.*]] = arith.constant 1 : i32
-!CHECK: %[[val_19:.*]] = fir.load %[[val_13]] : !fir.ref<i32>
-!CHECK: %[[val_c1_i32_2:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[arg:.*]]) : i32 = (%[[val_c1_i32]]) to (%[[val_19]]) inclusive step (%[[val_c1_i32_2]]) {
-!CHECK: fir.store %[[arg]] to %[[val_9]] : !fir.ref<i32>
-!CHECK: %[[val_20:.*]] = fir.load %[[val_16]] : !fir.ref<i32>
-!CHECK: %[[val_21:.*]] = fir.load %[[val_9]] : !fir.ref<i32>
-!CHECK: %[[val_22:.*]] = arith.addi %[[val_20]], %[[val_21]] : i32
-!CHECK: fir.store %[[val_22]] to %[[val_16]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-subroutine common_2()
-  integer :: x
-  integer :: y
-  common /d/ x, y
-  !$omp threadprivate(/d/)
-  
-  !$omp parallel do copyin(/d/)
-     do i = 1, x
-        y = y + i
-     end do
-  !$omp end parallel do
-end subroutine
-
-!CHECK: func.func @_QPcommon_3() {
-!CHECK: %[[val_0:.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_1:.*]] = omp.threadprivate %[[val_0]] : !fir.ref<!fir.array<8xi8>> -> !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_2:.*]] = fir.convert %[[val_1]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4:.*]] = arith.constant 4 : index
-!CHECK: %[[val_3:.*]] = fir.coordinate_of %[[val_2]], %[[val_c4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_4:.*]] = fir.convert %[[val_3]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: omp.parallel {
-!CHECK: %[[val_5:.*]] = omp.threadprivate %[[val_0]] : !fir.ref<!fir.array<8xi8>> -> !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_6:.*]] = fir.convert %[[val_5]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4_0:.*]] = arith.constant 4 : index
-!CHECK: %[[val_7:.*]] = fir.coordinate_of %[[val_6]], %[[val_c4_0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_8:.*]] = fir.convert %[[val_7]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_9:.*]] = fir.load %[[val_4]] : !fir.ref<i32>
-!CHECK: fir.store %[[val_9]] to %[[val_8]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: omp.sections {
-!CHECK: omp.section {
-!CHECK: %[[val_10:.*]] = fir.load %[[val_8]] : !fir.ref<i32>
-!CHECK: %[[val_c3_i32:.*]] = arith.constant 3 : i32
-!CHECK: %[[val_11:.*]] = arith.addi %[[val_10]], %[[val_c3_i32]] : i32
-!CHECK: fir.store %[[val_11]] to %[[val_8]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-subroutine common_3()
-  integer :: x
-  integer :: y
-  common /blk/ x, y
-  !$omp threadprivate (/blk/)
-
-  !$omp parallel sections copyin(/blk/)
-        !$omp section
-            y = y + 3
-  !$omp end parallel sections
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/critical.f90 b/flang/test/Lower/OpenMP/FIR/critical.f90
deleted file mode 100644
index fa33fb0fe58b..000000000000
--- a/flang/test/Lower/OpenMP/FIR/critical.f90
+++ /dev/null
@@ -1,38 +0,0 @@
-! REQUIRES: openmp_runtime
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefix="OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | tco | FileCheck %s --check-prefix="LLVMIR"
-
-!OMPDialect: omp.critical.declare @help2
-!OMPDialect: omp.critical.declare @help1 hint(contended)
-
-subroutine omp_critical()
-  use omp_lib
-  integer :: x, y
-!OMPDialect: omp.critical(@help1)
-!LLVMIR: call void @__kmpc_critical_with_hint({{.*}}, {{.*}}, {{.*}} @{{.*}}help1.var, i32 2)
-!$OMP CRITICAL(help1) HINT(omp_lock_hint_contended)
-  x = x + y
-!OMPDialect: omp.terminator
-!LLVMIR: call void @__kmpc_end_critical({{.*}}, {{.*}}, {{.*}} @{{.*}}help1.var)
-!$OMP END CRITICAL(help1)
-
-! Test that the same name can be used again
-! Also test with the zero hint expression
-!OMPDialect: omp.critical(@help2)
-!LLVMIR: call void @__kmpc_critical_with_hint({{.*}}, {{.*}}, {{.*}} @{{.*}}help2.var, i32 0)
-!$OMP CRITICAL(help2) HINT(omp_lock_hint_none)
-  x = x - y
-!OMPDialect: omp.terminator
-!LLVMIR: call void @__kmpc_end_critical({{.*}}, {{.*}}, {{.*}} @{{.*}}help2.var)
-!$OMP END CRITICAL(help2)
-
-!OMPDialect: omp.critical
-!LLVMIR: call void @__kmpc_critical({{.*}}, {{.*}}, {{.*}} @{{.*}}_.var)
-!$OMP CRITICAL
-  y = x + y
-!OMPDialect: omp.terminator
-!LLVMIR: call void @__kmpc_end_critical({{.*}}, {{.*}}, {{.*}} @{{.*}}_.var)
-!$OMP END CRITICAL
-end subroutine omp_critical
diff --git a/flang/test/Lower/OpenMP/FIR/declare-target-data.f90 b/flang/test/Lower/OpenMP/FIR/declare-target-data.f90
deleted file mode 100644
index bb3bbc8dfa83..000000000000
--- a/flang/test/Lower/OpenMP/FIR/declare-target-data.f90
+++ /dev/null
@@ -1,88 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s 
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
-
-module test_0
-    implicit none
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_int {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : i32
-INTEGER :: data_int = 10
-!$omp declare target link(data_int)
-
-!CHECK-DAG: fir.global @_QMtest_0Earray_1d({{.*}}) {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : !fir.array<3xi32>
-INTEGER :: array_1d(3) = (/1,2,3/)
-!$omp declare target link(array_1d)
-
-!CHECK-DAG: fir.global @_QMtest_0Earray_2d({{.*}}) {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : !fir.array<2x2xi32> 
-INTEGER :: array_2d(2,2) = reshape((/1,2,3,4/), (/2,2/))
-!$omp declare target link(array_2d)
-
-!CHECK-DAG: fir.global @_QMtest_0Ept1 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : !fir.box<!fir.ptr<i32>>
-INTEGER, POINTER :: pt1
-!$omp declare target link(pt1)
-
-!CHECK-DAG: fir.global @_QMtest_0Ept2_tar {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} target : i32
-INTEGER, TARGET :: pt2_tar = 5 
-!$omp declare target link(pt2_tar)
-
-!CHECK-DAG: fir.global @_QMtest_0Ept2 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : !fir.box<!fir.ptr<i32>>
-INTEGER, POINTER :: pt2 => pt2_tar
-!$omp declare target link(pt2)
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_int_to {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : i32
-INTEGER :: data_int_to = 5
-!$omp declare target to(data_int_to)
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_int_enter {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>} : i32
-INTEGER :: data_int_enter = 5
-!$omp declare target enter(data_int_enter)
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_int_clauseless {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : i32
-INTEGER :: data_int_clauseless = 1
-!$omp declare target(data_int_clauseless)
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_extended_to_1 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : f32
-!CHECK-DAG: fir.global @_QMtest_0Edata_extended_to_2 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : f32
-REAL :: data_extended_to_1 = 2
-REAL :: data_extended_to_2 = 3
-!$omp declare target to(data_extended_to_1, data_extended_to_2)
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_extended_enter_1 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>} : f32
-!CHECK-DAG: fir.global @_QMtest_0Edata_extended_enter_2 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>} : f32
-REAL :: data_extended_enter_1 = 2
-REAL :: data_extended_enter_2 = 3
-!$omp declare target enter(data_extended_enter_1, data_extended_enter_2)
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_extended_link_1 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : f32
-!CHECK-DAG: fir.global @_QMtest_0Edata_extended_link_2 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : f32
-REAL :: data_extended_link_1 = 2
-REAL :: data_extended_link_2 = 3
-!$omp declare target link(data_extended_link_1, data_extended_link_2)
-
-contains
-end module test_0
-
-PROGRAM commons
-    !CHECK-DAG: fir.global @numbers_ {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : tuple<f32, f32> {
-    REAL :: one = 1
-    REAL :: two = 2
-    COMMON /numbers/ one, two
-    !$omp declare target(/numbers/)
-    
-    !CHECK-DAG: fir.global @numbers_link_ {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : tuple<f32, f32> {
-    REAL :: one_link = 1
-    REAL :: two_link = 2
-    COMMON /numbers_link/ one_link, two_link
-    !$omp declare target link(/numbers_link/)
-
-    !CHECK-DAG: fir.global @numbers_to_ {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : tuple<f32, f32> {
-    REAL :: one_to = 1
-    REAL :: two_to = 2
-    COMMON /numbers_to/ one_to, two_to
-    !$omp declare target to(/numbers_to/)
-
-    !CHECK-DAG: fir.global @numbers_enter_ {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>} : tuple<f32, f32> {
-    REAL :: one_enter = 1
-    REAL :: two_enter = 2
-    COMMON /numbers_enter/ one_enter, two_enter
-    !$omp declare target enter(/numbers_enter/)
-END
diff --git a/flang/test/Lower/OpenMP/FIR/declare-target-func-and-subr.f90 b/flang/test/Lower/OpenMP/FIR/declare-target-func-and-subr.f90
deleted file mode 100644
index 36d4d7db64e5..000000000000
--- a/flang/test/Lower/OpenMP/FIR/declare-target-func-and-subr.f90
+++ /dev/null
@@ -1,178 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefixes ALL,HOST
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-device %s -o - | FileCheck %s --check-prefixes ALL,DEVICE
-
-! Check specification valid forms of declare target with functions 
-! utilising device_type and to clauses as well as the default 
-! zero clause declare target
-
-! DEVICE-LABEL: func.func @_QPfunc_t_device()
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}
-FUNCTION FUNC_T_DEVICE() RESULT(I)
-!$omp declare target to(FUNC_T_DEVICE) device_type(nohost)
-    INTEGER :: I
-    I = 1
-END FUNCTION FUNC_T_DEVICE
-
-! DEVICE-LABEL: func.func @_QPfunc_enter_device()
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}
-FUNCTION FUNC_ENTER_DEVICE() RESULT(I)
-!$omp declare target enter(FUNC_ENTER_DEVICE) device_type(nohost)
-    INTEGER :: I
-    I = 1
-END FUNCTION FUNC_ENTER_DEVICE
-
-! HOST-LABEL: func.func @_QPfunc_t_host()
-! HOST-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>{{.*}}
-FUNCTION FUNC_T_HOST() RESULT(I)
-!$omp declare target to(FUNC_T_HOST) device_type(host)
-    INTEGER :: I
-    I = 1
-END FUNCTION FUNC_T_HOST
-
-! HOST-LABEL: func.func @_QPfunc_enter_host()
-! HOST-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (enter)>{{.*}}
-FUNCTION FUNC_ENTER_HOST() RESULT(I)
-!$omp declare target enter(FUNC_ENTER_HOST) device_type(host)
-    INTEGER :: I
-    I = 1
-END FUNCTION FUNC_ENTER_HOST
-
-! ALL-LABEL: func.func @_QPfunc_t_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-FUNCTION FUNC_T_ANY() RESULT(I)
-!$omp declare target to(FUNC_T_ANY) device_type(any)
-    INTEGER :: I
-    I = 1
-END FUNCTION FUNC_T_ANY
-
-! ALL-LABEL: func.func @_QPfunc_enter_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}
-FUNCTION FUNC_ENTER_ANY() RESULT(I)
-!$omp declare target enter(FUNC_ENTER_ANY) device_type(any)
-    INTEGER :: I
-    I = 1
-END FUNCTION FUNC_ENTER_ANY
-
-! ALL-LABEL: func.func @_QPfunc_default_t_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-FUNCTION FUNC_DEFAULT_T_ANY() RESULT(I)
-!$omp declare target to(FUNC_DEFAULT_T_ANY)
-    INTEGER :: I
-    I = 1
-END FUNCTION FUNC_DEFAULT_T_ANY
-
-! ALL-LABEL: func.func @_QPfunc_default_enter_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}
-FUNCTION FUNC_DEFAULT_ENTER_ANY() RESULT(I)
-!$omp declare target enter(FUNC_DEFAULT_ENTER_ANY)
-    INTEGER :: I
-    I = 1
-END FUNCTION FUNC_DEFAULT_ENTER_ANY
-
-! ALL-LABEL: func.func @_QPfunc_default_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-FUNCTION FUNC_DEFAULT_ANY() RESULT(I)
-!$omp declare target
-    INTEGER :: I
-    I = 1
-END FUNCTION FUNC_DEFAULT_ANY
-
-! ALL-LABEL: func.func @_QPfunc_default_extendedlist()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-FUNCTION FUNC_DEFAULT_EXTENDEDLIST() RESULT(I)
-!$omp declare target(FUNC_DEFAULT_EXTENDEDLIST)
-    INTEGER :: I
-    I = 1
-END FUNCTION FUNC_DEFAULT_EXTENDEDLIST
-
-!! -----
-
-! Check specification valid forms of declare target with subroutines 
-! utilising device_type and to clauses as well as the default 
-! zero clause declare target
-
-! DEVICE-LABEL: func.func @_QPsubr_t_device()
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}
-SUBROUTINE SUBR_T_DEVICE()
-!$omp declare target to(SUBR_T_DEVICE) device_type(nohost)
-END
-
-! DEVICE-LABEL: func.func @_QPsubr_enter_device()
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}
-SUBROUTINE SUBR_ENTER_DEVICE()
-!$omp declare target enter(SUBR_ENTER_DEVICE) device_type(nohost)
-END
-
-! HOST-LABEL: func.func @_QPsubr_t_host()
-! HOST-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>{{.*}}
-SUBROUTINE SUBR_T_HOST()
-!$omp declare target to(SUBR_T_HOST) device_type(host)
-END
-
-! HOST-LABEL: func.func @_QPsubr_enter_host()
-! HOST-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (enter)>{{.*}}
-SUBROUTINE SUBR_ENTER_HOST()
-!$omp declare target enter(SUBR_ENTER_HOST) device_type(host)
-END
-
-! ALL-LABEL: func.func @_QPsubr_t_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-SUBROUTINE SUBR_T_ANY()
-!$omp declare target to(SUBR_T_ANY) device_type(any)
-END
-
-! ALL-LABEL: func.func @_QPsubr_enter_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}
-SUBROUTINE SUBR_ENTER_ANY()
-!$omp declare target enter(SUBR_ENTER_ANY) device_type(any)
-END
-
-! ALL-LABEL: func.func @_QPsubr_default_t_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-SUBROUTINE SUBR_DEFAULT_T_ANY()
-!$omp declare target to(SUBR_DEFAULT_T_ANY)
-END
-
-! ALL-LABEL: func.func @_QPsubr_default_enter_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}
-SUBROUTINE SUBR_DEFAULT_ENTER_ANY()
-!$omp declare target enter(SUBR_DEFAULT_ENTER_ANY)
-END
-
-! ALL-LABEL: func.func @_QPsubr_default_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-SUBROUTINE SUBR_DEFAULT_ANY()
-!$omp declare target
-END
-
-! ALL-LABEL: func.func @_QPsubr_default_extendedlist()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-SUBROUTINE SUBR_DEFAULT_EXTENDEDLIST()
-!$omp declare target(SUBR_DEFAULT_EXTENDEDLIST)
-END
-
-!! -----
-
-! DEVICE-LABEL: func.func @_QPrecursive_declare_target
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}
-RECURSIVE FUNCTION RECURSIVE_DECLARE_TARGET(INCREMENT) RESULT(K)
-!$omp declare target to(RECURSIVE_DECLARE_TARGET) device_type(nohost)
-    INTEGER :: INCREMENT, K
-    IF (INCREMENT == 10) THEN
-        K = INCREMENT
-    ELSE
-        K = RECURSIVE_DECLARE_TARGET(INCREMENT + 1)
-    END IF
-END FUNCTION RECURSIVE_DECLARE_TARGET
-
-! DEVICE-LABEL: func.func @_QPrecursive_declare_target_enter
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}
-RECURSIVE FUNCTION RECURSIVE_DECLARE_TARGET_ENTER(INCREMENT) RESULT(K)
-!$omp declare target enter(RECURSIVE_DECLARE_TARGET_ENTER) device_type(nohost)
-    INTEGER :: INCREMENT, K
-    IF (INCREMENT == 10) THEN
-        K = INCREMENT
-    ELSE
-        K = RECURSIVE_DECLARE_TARGET_ENTER(INCREMENT + 1)
-    END IF
-END FUNCTION RECURSIVE_DECLARE_TARGET_ENTER
diff --git a/flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap-enter.f90 b/flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap-enter.f90
deleted file mode 100644
index 8e88d1b0f52a..000000000000
--- a/flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap-enter.f90
+++ /dev/null
@@ -1,192 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s  --check-prefix=DEVICE
-!RUN: bbc -emit-fir -fopenmp %s -o - | FileCheck %s
-!RUN: bbc -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEVICE
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
-function implicitly_captured_twice() result(k)
-   integer :: i
-   i = 10
-   k = i
-end function implicitly_captured_twice
-
-! CHECK-LABEL: func.func @_QPtarget_function_twice_host
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (enter)>{{.*}}}
-function target_function_twice_host() result(i)
-!$omp declare target enter(target_function_twice_host) device_type(host)
-   integer :: i
-   i = implicitly_captured_twice()
-end function target_function_twice_host
-
-! DEVICE-LABEL: func.func @_QPtarget_function_twice_device
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-function target_function_twice_device() result(i)
-!$omp declare target enter(target_function_twice_device) device_type(nohost)
-   integer :: i
-   i = implicitly_captured_twice()
-end function target_function_twice_device
-
-!! -----
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_nest
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-function implicitly_captured_nest() result(k)
-   integer :: i
-   i = 10
-   k = i
-end function implicitly_captured_nest
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_one
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter){{.*}}}
-function implicitly_captured_one() result(k)
-   k = implicitly_captured_nest()
-end function implicitly_captured_one
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_two
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-function implicitly_captured_two() result(k)
-   integer :: i
-   i = 10
-   k = i
-end function implicitly_captured_two
-
-! DEVICE-LABEL: func.func @_QPtarget_function_test
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-function target_function_test() result(j)
-!$omp declare target enter(target_function_test) device_type(nohost)
-   integer :: i, j
-   i = implicitly_captured_one()
-   j = implicitly_captured_two() + i
-end function target_function_test
-
-!! -----
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_nest_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
-function implicitly_captured_nest_twice() result(k)
-   integer :: i
-   i = 10
-   k = i
-end function implicitly_captured_nest_twice
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_one_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
-function implicitly_captured_one_twice() result(k)
-   k = implicitly_captured_nest_twice()
-end function implicitly_captured_one_twice
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_two_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
-function implicitly_captured_two_twice() result(k)
-   integer :: i
-   i = 10
-   k = i
-end function implicitly_captured_two_twice
-
-! DEVICE-LABEL: func.func @_QPtarget_function_test_device
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-function target_function_test_device() result(j)
-   !$omp declare target enter(target_function_test_device) device_type(nohost)
-   integer :: i, j
-   i = implicitly_captured_one_twice()
-   j = implicitly_captured_two_twice() + i
-end function target_function_test_device
-
-! CHECK-LABEL: func.func @_QPtarget_function_test_host
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (enter)>{{.*}}}
-function target_function_test_host() result(j)
-   !$omp declare target enter(target_function_test_host) device_type(host)
-   integer :: i, j
-   i = implicitly_captured_one_twice()
-   j = implicitly_captured_two_twice() + i
-end function target_function_test_host
-
-!! -----
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_with_dev_type_recursive
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
-recursive function implicitly_captured_with_dev_type_recursive(increment) result(k)
-!$omp declare target enter(implicitly_captured_with_dev_type_recursive) device_type(host)
-   integer :: increment, k
-   if (increment == 10) then
-      k = increment
-   else
-      k = implicitly_captured_with_dev_type_recursive(increment + 1)
-   end if
-end function implicitly_captured_with_dev_type_recursive
-
-! DEVICE-LABEL: func.func @_QPtarget_function_with_dev_type_recurse
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-function target_function_with_dev_type_recurse() result(i)
-!$omp declare target enter(target_function_with_dev_type_recurse) device_type(nohost)
-   integer :: i
-   i = implicitly_captured_with_dev_type_recursive(0)
-end function target_function_with_dev_type_recurse
-
-!! -----
-
-module test_module
-contains
-! CHECK-LABEL: func.func @_QMtest_modulePimplicitly_captured_nest_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
-   function implicitly_captured_nest_twice() result(i)
-      integer :: i
-      i = 10
-   end function implicitly_captured_nest_twice
-
-! CHECK-LABEL: func.func @_QMtest_modulePimplicitly_captured_one_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
-   function implicitly_captured_one_twice() result(k)
-      !$omp declare target enter(implicitly_captured_one_twice) device_type(host)
-      k = implicitly_captured_nest_twice()
-   end function implicitly_captured_one_twice
-
-! DEVICE-LABEL: func.func @_QMtest_modulePimplicitly_captured_two_twice
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-   function implicitly_captured_two_twice() result(y)
-      integer :: y
-      y = 5
-   end function implicitly_captured_two_twice
-
-! DEVICE-LABEL: func.func @_QMtest_modulePtarget_function_test_device
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-   function target_function_test_device() result(j)
-      !$omp declare target enter(target_function_test_device) device_type(nohost)
-      integer :: i, j
-      i = implicitly_captured_one_twice()
-      j = implicitly_captured_two_twice() + i
-   end function target_function_test_device
-end module test_module
-
-!! -----
-
-program mb
-   interface
-      subroutine caller_recursive
-         !$omp declare target enter(caller_recursive) device_type(nohost)
-      end subroutine
-
-      recursive subroutine implicitly_captured_recursive(increment)
-         integer :: increment
-      end subroutine
-   end interface
-end program
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_recursive
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-recursive subroutine implicitly_captured_recursive(increment)
-   integer :: increment
-   if (increment == 10) then
-      return
-   else
-      call implicitly_captured_recursive(increment + 1)
-   end if
-end subroutine
-
-! DEVICE-LABEL: func.func @_QPcaller_recursive
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-subroutine caller_recursive
-!$omp declare target enter(caller_recursive) device_type(nohost)
-   call implicitly_captured_recursive(0)
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap.f90 b/flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap.f90
deleted file mode 100644
index a90b04246e6d..000000000000
--- a/flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap.f90
+++ /dev/null
@@ -1,218 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s  --check-prefix=DEVICE
-!RUN: bbc -emit-fir -fopenmp %s -o - | FileCheck %s
-!RUN: bbc -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEVICE
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-function implicitly_captured(toggle) result(k)
-   integer :: i, j, k
-   logical :: toggle
-   i = 10
-   j = 5
-   if (toggle) then
-      k = i
-   else
-      k = j
-   end if
-end function implicitly_captured
-
-
-! CHECK-LABEL: func.func @_QPtarget_function
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-function target_function(toggle) result(i)
-!$omp declare target
-   integer :: i
-   logical :: toggle
-   i = implicitly_captured(toggle)
-end function target_function
-
-!! -----
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-function implicitly_captured_twice() result(k)
-   integer :: i
-   i = 10
-   k = i
-end function implicitly_captured_twice
-
-! CHECK-LABEL: func.func @_QPtarget_function_twice_host
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>{{.*}}}
-function target_function_twice_host() result(i)
-!$omp declare target to(target_function_twice_host) device_type(host)
-   integer :: i
-   i = implicitly_captured_twice()
-end function target_function_twice_host
-
-! DEVICE-LABEL: func.func @_QPtarget_function_twice_device
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-function target_function_twice_device() result(i)
-!$omp declare target to(target_function_twice_device) device_type(nohost)
-   integer :: i
-   i = implicitly_captured_twice()
-end function target_function_twice_device
-
-!! -----
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_nest
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-function implicitly_captured_nest() result(k)
-   integer :: i
-   i = 10
-   k = i
-end function implicitly_captured_nest
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_one
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to){{.*}}}
-function implicitly_captured_one() result(k)
-   k = implicitly_captured_nest()
-end function implicitly_captured_one
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_two
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-function implicitly_captured_two() result(k)
-   integer :: i
-   i = 10
-   k = i
-end function implicitly_captured_two
-
-! DEVICE-LABEL: func.func @_QPtarget_function_test
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-function target_function_test() result(j)
-!$omp declare target to(target_function_test) device_type(nohost)
-   integer :: i, j
-   i = implicitly_captured_one()
-   j = implicitly_captured_two() + i
-end function target_function_test
-
-!! -----
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_nest_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-function implicitly_captured_nest_twice() result(k)
-   integer :: i
-   i = 10
-   k = i
-end function implicitly_captured_nest_twice
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_one_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-function implicitly_captured_one_twice() result(k)
-   k = implicitly_captured_nest_twice()
-end function implicitly_captured_one_twice
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_two_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-function implicitly_captured_two_twice() result(k)
-   integer :: i
-   i = 10
-   k = i
-end function implicitly_captured_two_twice
-
-! DEVICE-LABEL: func.func @_QPtarget_function_test_device
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-function target_function_test_device() result(j)
-   !$omp declare target to(target_function_test_device) device_type(nohost)
-   integer :: i, j
-   i = implicitly_captured_one_twice()
-   j = implicitly_captured_two_twice() + i
-end function target_function_test_device
-
-! CHECK-LABEL: func.func @_QPtarget_function_test_host
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>{{.*}}}
-function target_function_test_host() result(j)
-   !$omp declare target to(target_function_test_host) device_type(host)
-   integer :: i, j
-   i = implicitly_captured_one_twice()
-   j = implicitly_captured_two_twice() + i
-end function target_function_test_host
-
-!! -----
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_with_dev_type_recursive
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-recursive function implicitly_captured_with_dev_type_recursive(increment) result(k)
-!$omp declare target to(implicitly_captured_with_dev_type_recursive) device_type(host)
-   integer :: increment, k
-   if (increment == 10) then
-      k = increment
-   else
-      k = implicitly_captured_with_dev_type_recursive(increment + 1)
-   end if
-end function implicitly_captured_with_dev_type_recursive
-
-! DEVICE-LABEL: func.func @_QPtarget_function_with_dev_type_recurse
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-function target_function_with_dev_type_recurse() result(i)
-!$omp declare target to(target_function_with_dev_type_recurse) device_type(nohost)
-   integer :: i
-   i = implicitly_captured_with_dev_type_recursive(0)
-end function target_function_with_dev_type_recurse
-
-!! -----
-
-module test_module
-contains
-! CHECK-LABEL: func.func @_QMtest_modulePimplicitly_captured_nest_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-   function implicitly_captured_nest_twice() result(i)
-      integer :: i
-      i = 10
-   end function implicitly_captured_nest_twice
-
-! CHECK-LABEL: func.func @_QMtest_modulePimplicitly_captured_one_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-   function implicitly_captured_one_twice() result(k)
-      !$omp declare target to(implicitly_captured_one_twice) device_type(host)
-      k = implicitly_captured_nest_twice()
-   end function implicitly_captured_one_twice
-
-! DEVICE-LABEL: func.func @_QMtest_modulePimplicitly_captured_two_twice
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-   function implicitly_captured_two_twice() result(y)
-      integer :: y
-      y = 5
-   end function implicitly_captured_two_twice
-
-! DEVICE-LABEL: func.func @_QMtest_modulePtarget_function_test_device
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-   function target_function_test_device() result(j)
-      !$omp declare target to(target_function_test_device) device_type(nohost)
-      integer :: i, j
-      i = implicitly_captured_one_twice()
-      j = implicitly_captured_two_twice() + i
-   end function target_function_test_device
-end module test_module
-
-!! -----
-
-program mb
-   interface
-      subroutine caller_recursive
-         !$omp declare target to(caller_recursive) device_type(nohost)
-      end subroutine
-
-      recursive subroutine implicitly_captured_recursive(increment)
-         integer :: increment
-      end subroutine
-   end interface
-end program
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_recursive
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-recursive subroutine implicitly_captured_recursive(increment)
-   integer :: increment
-   if (increment == 10) then
-      return
-   else
-      call implicitly_captured_recursive(increment + 1)
-   end if
-end subroutine
-
-! DEVICE-LABEL: func.func @_QPcaller_recursive
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-subroutine caller_recursive
-!$omp declare target to(caller_recursive) device_type(nohost)
-   call implicitly_captured_recursive(0)
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/default-clause.f90 b/flang/test/Lower/OpenMP/FIR/default-clause.f90
deleted file mode 100644
index 14c0d375896a..000000000000
--- a/flang/test/Lower/OpenMP/FIR/default-clause.f90
+++ /dev/null
@@ -1,281 +0,0 @@
-! This test checks lowering of OpenMP parallel directive
-! with `DEFAULT` clause present.
-
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-
-!CHECK: func @_QQmain() attributes {fir.bindc_name = "default_clause_lowering"} {
-!CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFEw"}
-!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
-!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
-!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFEz"}
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
-!CHECK: %[[const:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[const]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
-!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFEw"}
-!CHECK: %[[const:.*]] = arith.constant 2 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.muli %[[const]], %[[temp]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_W]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 45 : i32
-!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
-!CHECK: fir.store %[[result]] to %[[Z]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-
-program default_clause_lowering
-    integer :: x, y, z, w
-
-    !$omp parallel default(private) firstprivate(x) shared(z)
-        x = y * 2
-        z = w + 45
-    !$omp end parallel
-
-!CHECK: omp.parallel {
-!CHECK: %[[temp:.*]] = fir.load %[[Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-
-    !$omp parallel default(shared)
-        x = y
-    !$omp end parallel
-
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-
-    !$omp parallel default(none) private(x, y)
-        x = y
-    !$omp end parallel
-
-!CHECK: omp.parallel   {
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-
-    !$omp parallel default(firstprivate) firstprivate(y)
-        x = y
-    !$omp end parallel
-
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFEw"}
-!CHECK: %[[temp:.*]] = fir.load %[[W]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_W]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 2 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.muli %[[const]], %[[temp]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_W]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 45 : i32
-!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
-!CHECK: fir.store %[[result]] to %[[Z]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-
-    !$omp parallel default(firstprivate) private(x) shared(z)
-        x = y * 2
-        z = w + 45
-    !$omp end parallel
-
-!CHECK: omp.parallel   {
-!CHECK: omp.parallel   {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFEw"}
-!CHECK: %[[temp:.*]] = fir.load %[[W]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_W]] : !fir.ref<i32>
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_W]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-    !$omp parallel
-        !$omp parallel default(private)
-            x = y
-        !$omp end parallel
-
-        !$omp parallel default(firstprivate)
-            w = x
-        !$omp end parallel
-    !$omp end parallel
-
-end program default_clause_lowering
-
-subroutine nested_default_clause_tests
-    integer :: x, y, z, w, k, a
-    
-!CHECK: %[[K:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFnested_default_clause_testsEk"}
-!CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: omp.parallel   {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: %[[PRIVATE_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned, uniq_name = "_QFnested_default_clause_testsEk"}
-!CHECK: omp.parallel {
-!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[const:.*]] = arith.constant 20 : i32
-!CHECK: fir.store %[[const]] to %[[INNER_PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 10 : i32
-!CHECK: fir.store %[[const]] to %[[INNER_PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.parallel   {
-!CHECK: %[[INNER_PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: %[[INNER_PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Z]]
-!CHECK: fir.store %[[temp]] to %[[INNER_PRIVATE_Z]] : !fir.ref<i32>
-!CHECK: %[[INNER_PRIVATE_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned, uniq_name = "_QFnested_default_clause_testsEk"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_K]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[INNER_PRIVATE_K]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 30 : i32
-!CHECK: fir.store %[[const]] to %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 40 : i32
-!CHECK: fir.store %[[const]] to %[[INNER_PRIVATE_W]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 50 : i32
-!CHECK: fir.store %[[const]] to %[[INNER_PRIVATE_Z]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 40 : i32
-!CHECK: fir.store %[[const]] to %[[INNER_PRIVATE_K]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-    !$omp parallel  firstprivate(x) private(y) shared(w) default(private)  
-        !$omp parallel default(private)
-           y = 20
-           x = 10 
-        !$omp end parallel 
-
-        !$omp parallel default(firstprivate) shared(y) private(w) 
-            y = 30
-            w = 40 
-            z = 50
-            k = 40
-        !$omp end parallel
-    !$omp end parallel
-    
-    
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"} 
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_INNER_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_INNER_X]] : !fir.ref<i32>
-!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[INNER_PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[temp:.*]] = fir.load %[[INNER_PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_INNER_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_INNER_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"} 
-!CHECK: %[[PRIVATE_INNER_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[temp_1:.*]] = fir.load %[[PRIVATE_INNER_X]] : !fir.ref<i32>
-!CHECK: %[[temp_2:.*]] = fir.load %[[PRIVATE_Z]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.addi %{{.*}}, %{{.*}} : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_INNER_W]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-    !$omp parallel default(private)
-        !$omp parallel default(firstprivate)
-            x = y
-        !$omp end parallel
-
-        !$omp parallel default(private) shared(z)
-            w = x + z
-        !$omp end parallel
-    !$omp end parallel    
-    
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: omp.parallel {
-!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[INNER_PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[INNER_PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[temp:.*]] = fir.load %[[INNER_PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[INNER_PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.parallel {
-!CHECK: %[[temp_1:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[temp_2:.*]] = fir.load %[[PRIVATE_Z]] : !fir.ref<i32>
-!CHECK: %[[temp_3:.*]] = arith.addi %[[temp_1]], %[[temp_2]] : i32
-!CHECK: fir.store %[[temp_3]] to %[[PRIVATE_W]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: }
-    !$omp parallel default(private)
-		!$omp parallel default(firstprivate)
-			x = y
-		!$omp end parallel
-
-		!$omp parallel default(shared)
-			w = x + z
-		!$omp end parallel
-	!$omp end parallel
-
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: omp.single {
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: } 
-	!$omp parallel default(firstprivate)
-		!$omp single
-			x = y
-		!$omp end single
-	!$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90 b/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90
deleted file mode 100644
index 50938342dee7..000000000000
--- a/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90
+++ /dev/null
@@ -1,32 +0,0 @@
-! Test delayed privatization for the `private` clause.
-
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir \
-! RUN:   --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --openmp-enable-delayed-privatization \
-! RUN:   -o - %s 2>&1 | FileCheck %s
-
-subroutine delayed_privatization_firstprivate
-  implicit none
-  integer :: var1
-
-!$OMP PARALLEL FIRSTPRIVATE(var1)
-  var1 = 10
-!$OMP END PARALLEL
-end subroutine
-
-! CHECK-LABEL: omp.private {type = firstprivate}
-! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
-! CHECK-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_firstprivateEvar1"}
-! CHECK-NEXT:   omp.yield(%[[PRIV_ALLOC]] : !fir.ref<i32>)
-! CHECK: } copy {
-! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref<i32>, %[[PRIV_PRIV_ARG:.*]]: !fir.ref<i32>):
-! CHECK:    %[[ORIG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] : !fir.ref<i32>
-! CHECK:    fir.store %[[ORIG_VAL]] to %[[PRIV_PRIV_ARG]] : !fir.ref<i32>
-! CHECK:    omp.yield(%[[PRIV_PRIV_ARG]] : !fir.ref<i32>)
-! CHECK: }
-
-! CHECK-LABEL: @_QPdelayed_privatization_firstprivate
-! CHECK: omp.parallel private(@[[VAR1_PRIVATIZER_SYM]] %{{.*}} -> %{{.*}} : !fir.ref<i32>) {
-! CHECK: omp.terminator
-
diff --git a/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90 b/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90
deleted file mode 100644
index b13687faa3f2..000000000000
--- a/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90
+++ /dev/null
@@ -1,41 +0,0 @@
-! Test delayed privatization for the `private` clause.
-
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir \
-! RUN:   --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --openmp-enable-delayed-privatization \
-! RUN:   -o - %s 2>&1 | FileCheck %s
-
-subroutine delayed_privatization_private
-  implicit none
-  integer :: var1
-
-!$OMP PARALLEL PRIVATE(var1)
-  var1 = 10
-!$OMP END PARALLEL
-
-!$OMP PARALLEL PRIVATE(var1)
-  var1 = 20
-!$OMP END PARALLEL
-
-end subroutine
-
-! CHECK-LABEL: omp.private {type = private}
-! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
-! CHECK-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_privateEvar1"}
-! CHECK-NEXT:   omp.yield(%[[PRIV_ALLOC]] : !fir.ref<i32>)
-! CHECK-NOT: } copy {
-
-! CHECK-LABEL: @_QPdelayed_privatization_private
-! CHECK: %[[ORIG_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatization_privateEvar1"}
-! CHECK: omp.parallel private(@[[PRIVATIZER_SYM]] %[[ORIG_ALLOC]] -> %[[PAR_ARG:.*]] : !fir.ref<i32>) {
-! CHECK: %[[C10:.*]] = arith.constant 10 : i32
-! CHECK: fir.store %[[C10]] to %[[PAR_ARG]] : !fir.ref<i32>
-! CHECK: omp.terminator
-
-! Test that the same privatizer is used if the a variable with the same type and
-! name was previously privatized.
-! CHECK: omp.parallel private(@[[PRIVATIZER_SYM]] %[[ORIG_ALLOC]] -> %[[PAR_ARG:.*]] : !fir.ref<i32>) {
-! CHECK: %[[C20:.*]] = arith.constant 20 : i32
-! CHECK: fir.store %[[C20]] to %[[PAR_ARG]] : !fir.ref<i32>
-! CHECK: omp.terminator
diff --git a/flang/test/Lower/OpenMP/FIR/firstprivate-commonblock.f90 b/flang/test/Lower/OpenMP/FIR/firstprivate-commonblock.f90
deleted file mode 100644
index 6adc7d9f6c82..000000000000
--- a/flang/test/Lower/OpenMP/FIR/firstprivate-commonblock.f90
+++ /dev/null
@@ -1,30 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!CHECK: func.func @_QPfirstprivate_common() {
-!CHECK: %[[val_0:.*]] = fir.address_of(@c_) : !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_1:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_2:.*]] = fir.coordinate_of %[[val_1]], %[[val_c0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_3:.*]] = fir.convert %[[val_2]] : (!fir.ref<i8>) -> !fir.ref<f32>
-!CHECK: %[[val_4:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4:.*]] = arith.constant 4 : index
-!CHECK: %[[val_5:.*]] = fir.coordinate_of %[[val_4]], %[[val_c4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_6:.*]] = fir.convert %[[val_5]] : (!fir.ref<i8>) -> !fir.ref<f32>
-!CHECK: omp.parallel {
-!CHECK: %[[val_7:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFfirstprivate_commonEx"}
-!CHECK: %[[val_8:.*]] = fir.load %[[val_3]] : !fir.ref<f32>
-!CHECK: fir.store %[[val_8]] to %[[val_7]] : !fir.ref<f32>
-!CHECK: %[[val_9:.*]] = fir.alloca f32 {bindc_name = "y", pinned, uniq_name = "_QFfirstprivate_commonEy"}
-!CHECK: %[[val_10:.*]] = fir.load %[[val_6]] : !fir.ref<f32>
-!CHECK: fir.store %[[val_10]] to %[[val_9]] : !fir.ref<f32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-
-subroutine firstprivate_common
-  common /c/ x, y
-  real x, y
-  !$omp parallel firstprivate(/c/)
-  !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/flush.f90 b/flang/test/Lower/OpenMP/FIR/flush.f90
deleted file mode 100644
index 2c281632b85c..000000000000
--- a/flang/test/Lower/OpenMP/FIR/flush.f90
+++ /dev/null
@@ -1,45 +0,0 @@
-! This test checks lowering of OpenMP Flush Directive.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --cfg-conversion | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="LLVMIRDialect,OMPDialect"
-
-subroutine flush_standalone(a, b, c)
-    integer, intent(inout) :: a, b, c
-
-!$omp flush(a,b,c)
-!$omp flush
-!OMPDialect: omp.flush(%{{.*}}, %{{.*}}, %{{.*}} :
-!FIRDialect: !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>)
-!LLVMIRDialect: !llvm.ptr, !llvm.ptr, !llvm.ptr)
-!OMPDialect: omp.flush
-
-end subroutine flush_standalone
-
-subroutine flush_parallel(a, b, c)
-    integer, intent(inout) :: a, b, c
-
-!$omp parallel
-!OMPDialect:  omp.parallel {
-
-!OMPDialect: omp.flush(%{{.*}}, %{{.*}}, %{{.*}} :
-!FIRDialect: !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>)
-!LLVMIRDialect: !llvm.ptr, !llvm.ptr, !llvm.ptr)
-!OMPDialect: omp.flush
-!$omp flush(a,b,c)
-!$omp flush
-
-!FIRDialect: %{{.*}} = fir.load %{{.*}} : !fir.ref<i32>
-!FIRDialect: %{{.*}} = fir.load %{{.*}} : !fir.ref<i32>
-!FIRDialect: %{{.*}} = arith.addi %{{.*}}, %{{.*}} : i32
-!FIRDialect: fir.store %{{.*}} to %{{.*}} : !fir.ref<i32>
-
-!LLVMIRDialect: %{{.*}} = llvm.load %{{.*}} : !llvm.ptr -> i32
-!LLVMIRDialect: %{{.*}} = llvm.load %{{.*}} : !llvm.ptr -> i32
-!LLVMIRDialect: %{{.*}} = llvm.add %{{.*}}, %{{.*}} : i32
-!LLVMIRDialect: llvm.store %{{.*}}, %{{.*}} : i32, !llvm.ptr
-    c = a + b
-
-!OMPDialect: omp.terminator
-!$omp END parallel
-
-end subroutine flush_parallel
diff --git a/flang/test/Lower/OpenMP/FIR/if-clause.f90 b/flang/test/Lower/OpenMP/FIR/if-clause.f90
deleted file mode 100644
index f686b9708fc5..000000000000
--- a/flang/test/Lower/OpenMP/FIR/if-clause.f90
+++ /dev/null
@@ -1,496 +0,0 @@
-! This test checks lowering of OpenMP IF clauses.
-
-! RUN: bbc -fopenmp -emit-fir %s -o - | FileCheck %s
-! RUN: %flang_fc1 -fopenmp -emit-fir %s -o - | FileCheck %s
-
-program main
-  integer :: i
-
-  ! TODO When they are supported, add tests for:
-  ! - DISTRIBUTE PARALLEL DO
-  ! - DISTRIBUTE PARALLEL DO SIMD
-  ! - DISTRIBUTE SIMD
-  ! - PARALLEL SECTIONS
-  ! - PARALLEL WORKSHARE
-  ! - TARGET PARALLEL
-  ! - TARGET TEAMS DISTRIBUTE
-  ! - TARGET TEAMS DISTRIBUTE PARALLEL DO
-  ! - TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD
-  ! - TARGET TEAMS DISTRIBUTE SIMD
-  ! - TARGET UPDATE
-  ! - TASKLOOP
-  ! - TASKLOOP SIMD
-  ! - TEAMS DISTRIBUTE
-  ! - TEAMS DISTRIBUTE PARALLEL DO
-  ! - TEAMS DISTRIBUTE PARALLEL DO SIMD
-  ! - TEAMS DISTRIBUTE SIMD
-
-  ! ----------------------------------------------------------------------------
-  ! DO SIMD
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.wsloop
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp do simd
-  do i = 1, 10
-  end do
-  !$omp end do simd
-
-  ! CHECK:      omp.wsloop
-  !$omp do simd if(.true.)
-  do i = 1, 10
-  end do
-  !$omp end do simd
-
-  ! CHECK:      omp.wsloop
-  !$omp do simd if(simd: .true.)
-  do i = 1, 10
-  end do
-  !$omp end do simd
-
-  ! ----------------------------------------------------------------------------
-  ! PARALLEL
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.parallel
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp parallel
-  i = 10
-  !$omp end parallel
-
-  ! CHECK:      omp.parallel
-  ! CHECK-SAME: if({{.*}})
-  !$omp parallel if(.true.)
-  i = 10
-  !$omp end parallel
-
-  ! CHECK:      omp.parallel
-  ! CHECK-SAME: if({{.*}})
-  !$omp parallel if(parallel: .true.)
-  i = 10
-  !$omp end parallel
-
-  ! ----------------------------------------------------------------------------
-  ! PARALLEL DO
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.parallel
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp parallel do
-  do i = 1, 10
-  end do
-  !$omp end parallel do
-
-  ! CHECK:      omp.parallel
-  ! CHECK-SAME: if({{.*}})
-  !$omp parallel do if(.true.)
-  do i = 1, 10
-  end do
-  !$omp end parallel do
-
-  ! CHECK:      omp.parallel
-  ! CHECK-SAME: if({{.*}})
-  !$omp parallel do if(parallel: .true.)
-  do i = 1, 10
-  end do
-  !$omp end parallel do
-
-  ! ----------------------------------------------------------------------------
-  ! PARALLEL DO SIMD
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.parallel
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  ! CHECK:      omp.wsloop
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp parallel do simd
-  do i = 1, 10
-  end do
-  !$omp end parallel do simd
-
-  ! CHECK:      omp.parallel
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.wsloop
-  !$omp parallel do simd if(.true.)
-  do i = 1, 10
-  end do
-  !$omp end parallel do simd
-
-  ! CHECK:      omp.parallel
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.wsloop
-  !$omp parallel do simd if(parallel: .true.) if(simd: .false.)
-  do i = 1, 10
-  end do
-  !$omp end parallel do simd
-
-  ! CHECK:      omp.parallel
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.wsloop
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp parallel do simd if(parallel: .true.)
-  do i = 1, 10
-  end do
-  !$omp end parallel do simd
-
-  ! CHECK:      omp.parallel
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  ! CHECK:      omp.wsloop
-  !$omp parallel do simd if(simd: .true.)
-  do i = 1, 10
-  end do
-  !$omp end parallel do simd
-
-  ! ----------------------------------------------------------------------------
-  ! SIMD
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.simd
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp simd
-  do i = 1, 10
-  end do
-  !$omp end simd
-
-  ! CHECK:      omp.simd
-  ! CHECK-SAME: if({{.*}})
-  !$omp simd if(.true.)
-  do i = 1, 10
-  end do
-  !$omp end simd
-
-  ! CHECK:      omp.simd
-  ! CHECK-SAME: if({{.*}})
-  !$omp simd if(simd: .true.)
-  do i = 1, 10
-  end do
-  !$omp end simd
-
-  ! ----------------------------------------------------------------------------
-  ! TARGET
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.target
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp target
-  !$omp end target
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  !$omp target if(.true.)
-  !$omp end target
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  !$omp target if(target: .true.)
-  !$omp end target
-
-  ! ----------------------------------------------------------------------------
-  ! TARGET DATA
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.target_data
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp target data map(tofrom: i)
-  !$omp end target data
-
-  ! CHECK:      omp.target_data
-  ! CHECK-SAME: if({{.*}})
-  !$omp target data map(tofrom: i) if(.true.)
-  !$omp end target data
-
-  ! CHECK:      omp.target_data
-  ! CHECK-SAME: if({{.*}})
-  !$omp target data map(tofrom: i) if(target data: .true.)
-  !$omp end target data
-
-  ! ----------------------------------------------------------------------------
-  ! TARGET ENTER DATA
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.target_enter_data
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: map
-  !$omp target enter data map(to: i)
-
-  ! CHECK:      omp.target_enter_data
-  ! CHECK-SAME: if({{.*}})
-  !$omp target enter data map(to: i) if(.true.)
-
-  ! CHECK:      omp.target_enter_data
-  ! CHECK-SAME: if({{.*}})
-  !$omp target enter data map(to: i) if(target enter data: .true.)
-
-  ! ----------------------------------------------------------------------------
-  ! TARGET EXIT DATA
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.target_exit_data
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: map
-  !$omp target exit data map(from: i)
-
-  ! CHECK:      omp.target_exit_data
-  ! CHECK-SAME: if({{.*}})
-  !$omp target exit data map(from: i) if(.true.)
-
-  ! CHECK:      omp.target_exit_data
-  ! CHECK-SAME: if({{.*}})
-  !$omp target exit data map(from: i) if(target exit data: .true.)
-
-  ! ----------------------------------------------------------------------------
-  ! TARGET PARALLEL DO
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.target
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  ! CHECK:      omp.parallel
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp target parallel do
-  do i = 1, 10
-  end do
-  !$omp end target parallel do
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.parallel
-  ! CHECK-SAME: if({{.*}})
-  !$omp target parallel do if(.true.)
-  do i = 1, 10
-  end do
-  !$omp end target parallel do
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.parallel
-  ! CHECK-SAME: if({{.*}})
-  !$omp target parallel do if(target: .true.) if(parallel: .false.)
-  do i = 1, 10
-  end do
-  !$omp end target parallel do
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.parallel
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp target parallel do if(target: .true.)
-  do i = 1, 10
-  end do
-  !$omp end target parallel do
-
-  ! CHECK:      omp.target
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  ! CHECK:      omp.parallel
-  ! CHECK-SAME: if({{.*}})
-  !$omp target parallel do if(parallel: .true.)
-  do i = 1, 10
-  end do
-  !$omp end target parallel do
-
-  ! ----------------------------------------------------------------------------
-  ! TARGET PARALLEL DO SIMD
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.target
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  ! CHECK:      omp.parallel
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  ! CHECK:      omp.wsloop
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp target parallel do simd
-  do i = 1, 10
-  end do
-  !$omp end target parallel do simd
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.parallel
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.wsloop
-  !$omp target parallel do simd if(.true.)
-  do i = 1, 10
-  end do
-  !$omp end target parallel do simd
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.parallel
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.wsloop
-  !$omp target parallel do simd if(target: .true.) if(parallel: .false.) &
-  !$omp&                        if(simd: .true.)
-  do i = 1, 10
-  end do
-  !$omp end target parallel do simd
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.parallel
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  ! CHECK:      omp.wsloop
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp target parallel do simd if(target: .true.)
-  do i = 1, 10
-  end do
-  !$omp end target parallel do simd
-
-  ! CHECK:      omp.target
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  ! CHECK:      omp.parallel
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.wsloop
-  !$omp target parallel do simd if(parallel: .true.) if(simd: .false.)
-  do i = 1, 10
-  end do
-  !$omp end target parallel do simd
-
-  ! ----------------------------------------------------------------------------
-  ! TARGET SIMD
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.target
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  ! CHECK:      omp.simd
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp target simd
-  do i = 1, 10
-  end do
-  !$omp end target simd
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.simd
-  ! CHECK-SAME: if({{.*}})
-  !$omp target simd if(.true.)
-  do i = 1, 10
-  end do
-  !$omp end target simd
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.simd
-  ! CHECK-SAME: if({{.*}})
-  !$omp target simd if(target: .true.) if(simd: .false.)
-  do i = 1, 10
-  end do
-  !$omp end target simd
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.simd
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp target simd if(target: .true.)
-  do i = 1, 10
-  end do
-  !$omp end target simd
-
-  ! CHECK:      omp.target
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  ! CHECK:      omp.simd
-  ! CHECK-SAME: if({{.*}})
-  !$omp target simd if(simd: .true.)
-  do i = 1, 10
-  end do
-  !$omp end target simd
-
-  ! ----------------------------------------------------------------------------
-  ! TARGET TEAMS
-  ! ----------------------------------------------------------------------------
-
-  ! CHECK:      omp.target
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  ! CHECK:      omp.teams
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp target teams
-  i = 1
-  !$omp end target teams
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.teams
-  ! CHECK-SAME: if({{.*}})
-  !$omp target teams if(.true.)
-  i = 1
-  !$omp end target teams
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.teams
-  ! CHECK-SAME: if({{.*}})
-  !$omp target teams if(target: .true.) if(teams: .false.)
-  i = 1
-  !$omp end target teams
-
-  ! CHECK:      omp.target
-  ! CHECK-SAME: if({{.*}})
-  ! CHECK:      omp.teams
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp target teams if(target: .true.)
-  i = 1
-  !$omp end target teams
-
-  ! CHECK:      omp.target
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  ! CHECK:      omp.teams
-  ! CHECK-SAME: if({{.*}})
-  !$omp target teams if(teams: .true.)
-  i = 1
-  !$omp end target teams
-
-  ! ----------------------------------------------------------------------------
-  ! TASK
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.task
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp task
-  !$omp end task
-
-  ! CHECK:      omp.task
-  ! CHECK-SAME: if({{.*}})
-  !$omp task if(.true.)
-  !$omp end task
-
-  ! CHECK:      omp.task
-  ! CHECK-SAME: if({{.*}})
-  !$omp task if(task: .true.)
-  !$omp end task
-
-  ! ----------------------------------------------------------------------------
-  ! TEAMS
-  ! ----------------------------------------------------------------------------
-  ! CHECK:      omp.teams
-  ! CHECK-NOT:  if({{.*}})
-  ! CHECK-SAME: {
-  !$omp teams
-  i = 1
-  !$omp end teams
-
-  ! CHECK:      omp.teams
-  ! CHECK-SAME: if({{.*}})
-  !$omp teams if(.true.)
-  i = 1
-  !$omp end teams
-
-  ! CHECK:      omp.teams
-  ! CHECK-SAME: if({{.*}})
-  !$omp teams if(teams: .true.)
-  i = 1
-  !$omp end teams
-end program main
diff --git a/flang/test/Lower/OpenMP/FIR/is-device.f90 b/flang/test/Lower/OpenMP/FIR/is-device.f90
deleted file mode 100644
index 79e0ee506c5f..000000000000
--- a/flang/test/Lower/OpenMP/FIR/is-device.f90
+++ /dev/null
@@ -1,14 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEVICE
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefix=HOST
-!RUN: %flang_fc1 -emit-fir -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEVICE-FLAG-ONLY
-!RUN: bbc -fopenmp -fopenmp-is-target-device -emit-fir -o - %s | FileCheck %s --check-prefix=DEVICE
-!RUN: bbc -fopenmp -emit-fir -o - %s | FileCheck %s --check-prefix=HOST
-!RUN: bbc -fopenmp-is-target-device -emit-fir -o - %s | FileCheck %s --check-prefix=DEVICE-FLAG-ONLY
-
-!DEVICE: module attributes {{{.*}}, omp.is_target_device = true{{.*}}}
-!HOST: module attributes {{{.*}}, omp.is_target_device = false{{.*}}}
-!DEVICE-FLAG-ONLY: module attributes {{{.*}}"
-!DEVICE-FLAG-ONLY-NOT: , omp.is_target_device = {{.*}}
-!DEVICE-FLAG-ONLY-SAME: }
-subroutine omp_subroutine()
-end subroutine omp_subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/lastprivate-commonblock.f90 b/flang/test/Lower/OpenMP/FIR/lastprivate-commonblock.f90
deleted file mode 100644
index 86c4d917fa51..000000000000
--- a/flang/test/Lower/OpenMP/FIR/lastprivate-commonblock.f90
+++ /dev/null
@@ -1,49 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s 
-
-!CHECK: func.func @_QPlastprivate_common() {
-!CHECK: %[[val_0:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK: %[[val_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFlastprivate_commonEi"}
-!CHECK: %[[val_2:.*]] = fir.address_of(@c_) : !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_3:.*]] = fir.convert %[[val_2]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_4:.*]] = fir.coordinate_of %[[val_3]], %[[val_c0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_5:.*]] = fir.convert %[[val_4]] : (!fir.ref<i8>) -> !fir.ref<f32>
-!CHECK: %[[val_6:.*]] = fir.convert %[[val_2]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4:.*]] = arith.constant 4 : index
-!CHECK: %[[val_7:.*]] = fir.coordinate_of %[[val_6]], %[[val_c4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_8:.*]] = fir.convert %[[val_7]] : (!fir.ref<i8>) -> !fir.ref<f32>
-!CHECK: %[[val_9:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivate_commonEx"}
-!CHECK: %[[val_10:.*]] = fir.alloca f32 {bindc_name = "y", pinned, uniq_name = "_QFlastprivate_commonEy"}
-!CHECK: %[[val_c1_i32:.*]] = arith.constant 1 : i32
-!CHECK: %[[val_c100_i32:.*]] = arith.constant 100 : i32
-!CHECK: %[[val_c1_i32_0:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[arg:.*]]) : i32 = (%[[val_c1_i32]]) to (%[[val_c100_i32]]) inclusive step (%[[val_c1_i32_0]]) {
-!CHECK: fir.store %[[arg]] to %[[val_0]] : !fir.ref<i32>
-!CHECK: %[[val_11:.*]] = arith.addi %[[arg]], %[[val_c1_i32_0]] : i32
-!CHECK: %[[val_c0_i32:.*]] = arith.constant 0 : i32
-!CHECK: %[[val_12:.*]] = arith.cmpi slt, %[[val_c1_i32_0]], %[[val_c0_i32]] : i32
-!CHECK: %[[val_13:.*]] = arith.cmpi slt, %[[val_11]], %[[val_c100_i32]] : i32
-!CHECK: %[[val_14:.*]] = arith.cmpi sgt, %[[val_11]], %[[val_c100_i32]] : i32
-!CHECK: %[[val_15:.*]] = arith.select %[[val_12]], %[[val_13]], %[[val_14]] : i1
-!CHECK: fir.if %[[val_15]] {
-!CHECK: fir.store %[[val_11]] to %[[val_0]] : !fir.ref<i32>
-!CHECK: %[[val_16:.*]] = fir.load %[[val_9]] : !fir.ref<f32>
-!CHECK: fir.store %[[val_16]] to %[[val_5]] : !fir.ref<f32>
-!CHECK: %[[val_17:.*]] = fir.load %[[val_10]] : !fir.ref<f32>
-!CHECK: fir.store %[[val_17]] to %[[val_8]] : !fir.ref<f32>
-!CHECK: }
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-subroutine lastprivate_common
-  common /c/ x, y
-  real x, y
-  !$omp do lastprivate(/c/)
-  do i=1,100
-  end do
-  !$omp end do
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/location.f90 b/flang/test/Lower/OpenMP/FIR/location.f90
deleted file mode 100644
index 6a7fb3c03584..000000000000
--- a/flang/test/Lower/OpenMP/FIR/location.f90
+++ /dev/null
@@ -1,71 +0,0 @@
-! This test checks location of OpenMP constructs and clauses
-
-!RUN: %flang_fc1 -emit-fir -fopenmp -mmlir --mlir-print-debuginfo %s -o - | FileCheck %s
-
-!CHECK-LABEL: sub_parallel
-subroutine sub_parallel()
-  print *, x
-!CHECK: omp.parallel   {
-  !$omp parallel
-    print *, x
-!CHECK:   omp.terminator loc(#[[PAR_LOC:.*]])
-!CHECK: } loc(#[[PAR_LOC]])
-  !$omp end parallel
-  print *, x
-end
-
-!CHECK-LABEL: sub_target
-subroutine sub_target()
-  print *, x
-!CHECK: omp.target {{.*}} {
-  !$omp target
-    print *, x
-!CHECK:   omp.terminator loc(#[[TAR_LOC:.*]])
-!CHECK: } loc(#[[TAR_LOC]])
-  !$omp end target
-  print *, x
-end
-
-!CHECK-LABEL: sub_loop
-subroutine sub_loop()
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest {{.*}} {
-  !$omp do
-  do i=1,10
-    print *, i
-!CHECK:   omp.yield loc(#[[LOOP_LOC:.*]])
-!CHECK: } loc(#[[LOOP_LOC]])
-!CHECK:   omp.terminator loc(#[[LOOP_LOC]])
-!CHECK: } loc(#[[LOOP_LOC]])
-  end do
-  !$omp end do
-end
-
-!CHECK-LABEL: sub_standalone
-subroutine sub_standalone()
-  !CHECK: omp.barrier loc(#[[BAR_LOC:.*]])
-  !$omp barrier
-  !CHECK: omp.taskwait loc(#[[TW_LOC:.*]])
-  !$omp taskwait
-  !CHECK: omp.taskyield loc(#[[TY_LOC:.*]])
-  !$omp taskyield
-end
-
-subroutine sub_if(c)
-  logical(kind=4) :: c
-  !CHECK: %[[CVT:.*]] = fir.convert %{{.*}} : (!fir.logical<4>) -> i1 loc(#[[IF_LOC:.*]])
-  !CHECK: omp.task if(%[[CVT]])
-  !$omp task if(c)
-    print *, "Task"
-  !$omp end task
-  !CHECK: } loc(#[[TASK_LOC:.*]])
-end subroutine
-
-!CHECK: #[[PAR_LOC]] = loc("{{.*}}location.f90":9:9)
-!CHECK: #[[TAR_LOC]] = loc("{{.*}}location.f90":21:9)
-!CHECK: #[[LOOP_LOC]] = loc("{{.*}}location.f90":33:9)
-!CHECK: #[[BAR_LOC]] = loc("{{.*}}location.f90":47:9)
-!CHECK: #[[TW_LOC]] = loc("{{.*}}location.f90":49:9)
-!CHECK: #[[TY_LOC]] = loc("{{.*}}location.f90":51:9)
-!CHECK: #[[IF_LOC]] = loc("{{.*}}location.f90":58:14)
-!CHECK: #[[TASK_LOC]] = loc("{{.*}}location.f90":58:9)
diff --git a/flang/test/Lower/OpenMP/FIR/loop-combined.f90 b/flang/test/Lower/OpenMP/FIR/loop-combined.f90
deleted file mode 100644
index 6c6618dc9fb5..000000000000
--- a/flang/test/Lower/OpenMP/FIR/loop-combined.f90
+++ /dev/null
@@ -1,83 +0,0 @@
-! This test checks lowering of OpenMP combined loop constructs.
-
-! RUN: bbc -fopenmp -emit-fir %s -o - | FileCheck %s
-! RUN: %flang_fc1 -fopenmp -emit-fir %s -o - | FileCheck %s
-
-program main
-  integer :: i
-
-  ! TODO When DISTRIBUTE, TASKLOOP and TEAMS are supported add:
-  ! - DISTRIBUTE PARALLEL DO SIMD
-  ! - DISTRIBUTE PARALLEL DO
-  ! - DISTRIBUTE SIMD
-  ! - TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD
-  ! - TARGET TEAMS DISTRIBUTE PARALLEL DO
-  ! - TARGET TEAMS DISTRIBUTE SIMD
-  ! - TARGET TEAMS DISTRIBUTE
-  ! - TASKLOOP SIMD
-  ! - TEAMS DISTRIBUTE PARALLEL DO SIMD
-  ! - TEAMS DISTRIBUTE PARALLEL DO
-  ! - TEAMS DISTRIBUTE SIMD
-  ! - TEAMS DISTRIBUTE
-
-  ! ----------------------------------------------------------------------------
-  ! DO SIMD
-  ! ----------------------------------------------------------------------------
-  ! CHECK: omp.wsloop
-  !$omp do simd
-  do i = 1, 10
-  end do
-  !$omp end do simd
-
-  ! ----------------------------------------------------------------------------
-  ! PARALLEL DO SIMD
-  ! ----------------------------------------------------------------------------
-  ! CHECK: omp.parallel
-  ! CHECK: omp.wsloop
-  !$omp parallel do simd
-  do i = 1, 10
-  end do
-  !$omp end parallel do simd
-
-  ! ----------------------------------------------------------------------------
-  ! PARALLEL DO
-  ! ----------------------------------------------------------------------------
-  ! CHECK: omp.parallel
-  ! CHECK: omp.wsloop
-  !$omp parallel do
-  do i = 1, 10
-  end do
-  !$omp end parallel do
-
-  ! ----------------------------------------------------------------------------
-  ! TARGET PARALLEL DO SIMD
-  ! ----------------------------------------------------------------------------
-  ! CHECK: omp.target
-  ! CHECK: omp.parallel
-  ! CHECK: omp.wsloop
-  !$omp target parallel do simd
-  do i = 1, 10
-  end do
-  !$omp end target parallel do simd
-
-  ! ----------------------------------------------------------------------------
-  ! TARGET PARALLEL DO
-  ! ----------------------------------------------------------------------------
-  ! CHECK: omp.target
-  ! CHECK: omp.parallel
-  ! CHECK: omp.wsloop
-  !$omp target parallel do
-  do i = 1, 10
-  end do
-  !$omp end target parallel do
-
-  ! ----------------------------------------------------------------------------
-  ! TARGET SIMD
-  ! ----------------------------------------------------------------------------
-  ! CHECK: omp.target
-  ! CHECK: omp.simd
-  !$omp target simd
-  do i = 1, 10
-  end do
-  !$omp end target simd
-end program main
diff --git a/flang/test/Lower/OpenMP/FIR/map-component-ref.f90 b/flang/test/Lower/OpenMP/FIR/map-component-ref.f90
deleted file mode 100644
index 6799941701f4..000000000000
--- a/flang/test/Lower/OpenMP/FIR/map-component-ref.f90
+++ /dev/null
@@ -1,33 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -emit-fir %s -o - | FileCheck %s
-
-! CHECK: %[[V0:[0-9]+]] = fir.alloca !fir.type<_QFfooTt0{a0:i32,a1:i32}> {bindc_name = "a", uniq_name = "_QFfooEa"}
-! CHECK: %[[V1:[0-9]+]] = fir.declare %[[V0]] {uniq_name = "_QFfooEa"} : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) -> !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>
-! CHECK: %[[V2:[0-9]+]] = fir.field_index a1, !fir.type<_QFfooTt0{a0:i32,a1:i32}>
-! CHECK: %[[V3:[0-9]+]] = fir.coordinate_of %[[V1]], %[[V2]] : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>, !fir.field) -> !fir.ref<i32>
-! CHECK: %[[V4:[0-9]+]] = omp.map.info var_ptr(%[[V3]] : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "a%a1"}
-! CHECK: %[[V5:[0-9]+]] = omp.map.info var_ptr(%[[V1]] : !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>, !fir.type<_QFfooTt0{a0:i32,a1:i32}>) map_clauses(implicit, tofrom) capture(ByRef) -> !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>> {name = "a"}
-! CHECK: omp.target map_entries(%[[V4]] -> %arg0, %[[V5]] -> %arg1 : !fir.ref<i32>, !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) {
-! CHECK: ^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>):
-! CHECK:   %c0_i32 = arith.constant 0 : i32
-! CHECK:   %[[V6:[0-9]+]] = fir.declare %arg1 {uniq_name = "_QFfooEa"} : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) -> !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>
-! CHECK:   %[[V7:[0-9]+]] = fir.field_index a1, !fir.type<_QFfooTt0{a0:i32,a1:i32}>
-! CHECK:   %[[V8:[0-9]+]] = fir.coordinate_of %[[V6]], %[[V7]] : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>, !fir.field) -> !fir.ref<i32>
-! CHECK:   fir.store %c0_i32 to %[[V8]] : !fir.ref<i32>
-! CHECK:   omp.terminator
-! CHECK: }
-
-subroutine foo()
-  implicit none
-
-  type t0
-    integer :: a0, a1
-  end type
-
-  type(t0) :: a
-
-  !$omp target map(a%a1)
-  a%a1 = 0
-  !$omp end target
-end
-
diff --git a/flang/test/Lower/OpenMP/FIR/master.f90 b/flang/test/Lower/OpenMP/FIR/master.f90
deleted file mode 100644
index dd9910da2f41..000000000000
--- a/flang/test/Lower/OpenMP/FIR/master.f90
+++ /dev/null
@@ -1,100 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --cfg-conversion | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="OMPDialect"
-
-!===============================================================================
-! parallel construct with function call which has master construct internally
-!===============================================================================
-!FIRDialect-LABEL: func @_QPomp_master
-subroutine omp_master()
-
-!OMPDialect: omp.master  {
-!$omp master
-
-    !FIRDialect: fir.call @_QPmaster() {{.*}}: () -> ()
-    call master()
-
-!OMPDialect: omp.terminator
-!$omp end master
-
-end subroutine omp_master
-
-!FIRDialect-LABEL: func @_QPparallel_function_master
-subroutine parallel_function_master()
-
-!OMPDialect: omp.parallel {
-!$omp parallel
-
-    !FIRDialect: fir.call @_QPfoo() {{.*}}: () -> ()
-    call foo()
-
-!OMPDialect: omp.terminator
-!$omp end parallel
-
-end subroutine parallel_function_master
-
-!===============================================================================
-! master construct nested inside parallel construct
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPomp_parallel_master
-subroutine omp_parallel_master()
-
-!OMPDialect: omp.parallel {
-!$omp parallel
-    !FIRDialect: fir.call @_QPparallel() {{.*}}: () -> ()
-    call parallel()
-
-!OMPDialect: omp.master {
-!$omp master
-
-    !FIRDialect: fir.call @_QPparallel_master() {{.*}}: () -> ()
-    call parallel_master()
-
-!OMPDialect: omp.terminator
-!$omp end master
-
-!OMPDialect: omp.terminator
-!$omp end parallel
-
-end subroutine omp_parallel_master
-
-!===============================================================================
-! master construct nested inside parallel construct with conditional flow
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPomp_master_parallel
-subroutine omp_master_parallel()
-    integer :: alpha, beta, gama
-    alpha = 4
-    beta = 5
-    gama = 6
-
-!OMPDialect: omp.master {
-!$omp master
-
-    !FIRDialect: %{{.*}} = fir.load %{{.*}}
-    !FIRDialect: %{{.*}} = fir.load %{{.*}}
-    !FIRDialect: %[[RESULT:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
-    !FIRDialect: fir.if %[[RESULT]] {
-    if (alpha .ge. gama) then
-
-!OMPDialect: omp.parallel {
-!$omp parallel
-        !FIRDialect: fir.call @_QPinside_if_parallel() {{.*}}: () -> ()
-        call inside_if_parallel()
-
-!OMPDialect: omp.terminator
-!$omp end parallel
-
-        !FIRDialect: %{{.*}} = fir.load %{{.*}}
-        !FIRDialect: %{{.*}} = fir.load %{{.*}}
-        !FIRDialect: %{{.*}} = arith.addi %{{.*}}, %{{.*}}
-        !FIRDialect: fir.store %{{.*}} to %{{.*}}
-        beta = alpha + gama
-    end if
-    !FIRDialect: else
-
-!OMPDialect: omp.terminator
-!$omp end master
-
-end subroutine omp_master_parallel
diff --git a/flang/test/Lower/OpenMP/FIR/omp-declare-target-program-var.f90 b/flang/test/Lower/OpenMP/FIR/omp-declare-target-program-var.f90
deleted file mode 100644
index 0da76f6d9ad2..000000000000
--- a/flang/test/Lower/OpenMP/FIR/omp-declare-target-program-var.f90
+++ /dev/null
@@ -1,12 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefixes=HOST,ALL
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=ALL
-
-PROGRAM main
-    ! HOST-DAG: %0 = fir.alloca f32 {bindc_name = "i", uniq_name = "_QFEi"}
-    REAL :: I
-    ! ALL-DAG: fir.global internal @_QFEi {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : f32 {
-    ! ALL-DAG: %0 = fir.undefined f32
-    ! ALL-DAG: fir.has_value %0 : f32
-    ! ALL-DAG: }
-    !$omp declare target(I)
-END
diff --git a/flang/test/Lower/OpenMP/FIR/omp-is-gpu.f90 b/flang/test/Lower/OpenMP/FIR/omp-is-gpu.f90
deleted file mode 100644
index ac8d24974801..000000000000
--- a/flang/test/Lower/OpenMP/FIR/omp-is-gpu.f90
+++ /dev/null
@@ -1,16 +0,0 @@
-!REQUIRES: amdgpu-registered-target, nvptx-registered-target
-
-!RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
-!RUN: %flang_fc1 -triple nvptx64-nvidia-cuda -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
-!RUN: bbc -fopenmp -fopenmp-is-target-device -fopenmp-is-gpu -emit-fir -o - %s | FileCheck %s
-
-!RUN: not %flang_fc1 -triple amdgcn-amd-amdhsa -emit-fir -fopenmp %s -o - 2>&1 | FileCheck %s --check-prefix=FLANG-ERROR
-!RUN: not %flang_fc1 -triple nvptx64-nvidia-cuda -emit-fir -fopenmp %s -o - 2>&1 | FileCheck %s --check-prefix=FLANG-ERROR
-!RUN: not bbc -fopenmp -fopenmp-is-gpu -emit-fir %s -o - 2>&1 | FileCheck %s --check-prefix=BBC-ERROR
-
-!CHECK: module attributes {{{.*}}omp.is_gpu = true
-subroutine omp_subroutine()
-end subroutine omp_subroutine
-
-!FLANG-ERROR: error: OpenMP AMDGPU/NVPTX is only prepared to deal with device code.
-!BBC-ERROR: FATAL: -fopenmp-is-gpu can only be set if -fopenmp-is-target-device is also set
diff --git a/flang/test/Lower/OpenMP/FIR/ordered-threads.f90 b/flang/test/Lower/OpenMP/FIR/ordered-threads.f90
deleted file mode 100644
index 2dea4c857e87..000000000000
--- a/flang/test/Lower/OpenMP/FIR/ordered-threads.f90
+++ /dev/null
@@ -1,40 +0,0 @@
-! This test checks lowering of OpenMP ordered directive with threads Clause.
-! Without clause in ordered direcitve, it behaves as if threads clause is
-! specified.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefix=FIRDialect
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefix=LLVMIRDialect
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | tco | FileCheck %s --check-prefix=LLVMIR
-
-subroutine ordered
-        integer :: i
-        integer :: a(20)
-
-!FIRDialect: omp.ordered.region  {
-!LLVMIRDialect: omp.ordered.region  {
-!LLVMIR: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB0:[0-9]+]])
-!LLVMIR-NEXT: call void @__kmpc_ordered(ptr @[[GLOB0]], i32 [[TMP0]])
-!$OMP ORDERED
-        a(i) = a(i-1) + 1
-!FIRDialect:   omp.terminator
-!FIRDialect-NEXT: }
-!LLVMIRDialect:   omp.terminator
-!LLVMIRDialect-NEXT: }
-!LLVMIR: call void @__kmpc_end_ordered(ptr @[[GLOB0]], i32 [[TMP0]])
-!$OMP END ORDERED
-
-!FIRDialect: omp.ordered.region  {
-!LLVMIRDialect: omp.ordered.region  {
-!LLVMIR: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
-!LLVMIR-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], i32 [[TMP1]])
-!$OMP ORDERED THREADS
-        a(i) = a(i-1) + 1
-!FIRDialect:   omp.terminator
-!FIRDialect-NEXT: }
-!LLVMIRDialect:   omp.terminator
-!LLVMIRDialect-NEXT: }
-!LLVMIR: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP1]])
-!LLVMIR-NEXT: ret void
-!$OMP END ORDERED
-
-end
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-firstprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/FIR/parallel-firstprivate-clause-scalar.f90
deleted file mode 100644
index 37f916ecb84c..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-firstprivate-clause-scalar.f90
+++ /dev/null
@@ -1,159 +0,0 @@
-! This test checks lowering of `FIRSTPRIVATE` clause for scalar types.
-
-! REQUIRES: shell
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s --check-prefix=FIRDialect
-
-!FIRDialect-DAG: func @_QPfirstprivate_complex(%[[ARG1:.*]]: !fir.ref<!fir.complex<4>>{{.*}}, %[[ARG2:.*]]: !fir.ref<!fir.complex<8>>{{.*}}) {
-!FIRDialect:   omp.parallel {
-!FIRDialect:     %[[ARG1_PVT:.*]] = fir.alloca !fir.complex<4> {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_complexEarg1"}
-!FIRDialect:     %[[ARG1_VAL:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.complex<4>>
-!FIRDialect:     fir.store %[[ARG1_VAL]] to %[[ARG1_PVT]] : !fir.ref<!fir.complex<4>>
-!FIRDialect:     %[[ARG2_PVT:.*]] = fir.alloca !fir.complex<8> {bindc_name = "arg2", pinned, uniq_name = "_QFfirstprivate_complexEarg2"}
-!FIRDialect:     %[[ARG2_VAL:.*]] = fir.load %[[ARG2]] : !fir.ref<!fir.complex<8>>
-!FIRDialect:     fir.store %[[ARG2_VAL]] to %[[ARG2_PVT]] : !fir.ref<!fir.complex<8>>
-!FIRDialect:     fir.call @_QPfoo(%[[ARG1_PVT]], %[[ARG2_PVT]]) {{.*}}: (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<8>>) -> ()
-!FIRDialect:     omp.terminator
-!FIRDialect:   }
-
-subroutine firstprivate_complex(arg1, arg2)
-        complex(4) :: arg1
-        complex(8) :: arg2
-
-!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2)
-        call foo(arg1, arg2)
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect-DAG: func @_QPfirstprivate_integer(%[[ARG1:.*]]: !fir.ref<i32>{{.*}}, %[[ARG2:.*]]: !fir.ref<i8>{{.*}}, %[[ARG3:.*]]: !fir.ref<i16>{{.*}}, %[[ARG4:.*]]: !fir.ref<i32>{{.*}}, %[[ARG5:.*]]: !fir.ref<i64>{{.*}}, %[[ARG6:.*]]: !fir.ref<i128>{{.*}}) {
-!FIRDialect:  omp.parallel {
-!FIRDialect:    %[[ARG1_PVT:.*]] = fir.alloca i32 {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_integerEarg1"}
-!FIRDialect:    %[[ARG1_VAL:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!FIRDialect:    fir.store %[[ARG1_VAL]] to %[[ARG1_PVT]] : !fir.ref<i32>
-!FIRDialect:    %[[ARG2_PVT:.*]] = fir.alloca i8 {bindc_name = "arg2", pinned, uniq_name = "_QFfirstprivate_integerEarg2"}
-!FIRDialect:    %[[ARG2_VAL:.*]] = fir.load %[[ARG2]] : !fir.ref<i8>
-!FIRDialect:    fir.store %[[ARG2_VAL]] to %[[ARG2_PVT]] : !fir.ref<i8>
-!FIRDialect:    %[[ARG3_PVT:.*]] = fir.alloca i16 {bindc_name = "arg3", pinned, uniq_name = "_QFfirstprivate_integerEarg3"}
-!FIRDialect:    %[[ARG3_VAL:.*]] = fir.load %[[ARG3]] : !fir.ref<i16>
-!FIRDialect:    fir.store %[[ARG3_VAL]] to %[[ARG3_PVT]] : !fir.ref<i16>
-!FIRDialect:    %[[ARG4_PVT:.*]] = fir.alloca i32 {bindc_name = "arg4", pinned, uniq_name = "_QFfirstprivate_integerEarg4"}
-!FIRDialect:    %[[ARG4_VAL:.*]] = fir.load %[[ARG4]] : !fir.ref<i32>
-!FIRDialect:    fir.store %[[ARG4_VAL]] to %[[ARG4_PVT]] : !fir.ref<i32>
-!FIRDialect:    %[[ARG5_PVT:.*]] = fir.alloca i64 {bindc_name = "arg5", pinned, uniq_name = "_QFfirstprivate_integerEarg5"}
-!FIRDialect:    %[[ARG5_VAL:.*]] = fir.load %[[ARG5]] : !fir.ref<i64>
-!FIRDialect:    fir.store %[[ARG5_VAL]] to %[[ARG5_PVT]] : !fir.ref<i64>
-!FIRDialect:    %[[ARG6_PVT:.*]] = fir.alloca i128 {bindc_name = "arg6", pinned, uniq_name = "_QFfirstprivate_integerEarg6"}
-!FIRDialect:    %[[ARG6_VAL:.*]] = fir.load %[[ARG6]] : !fir.ref<i128>
-!FIRDialect:    fir.store %[[ARG6_VAL]] to %[[ARG6_PVT]] : !fir.ref<i128>
-!FIRDialect:    fir.call @_QPbar(%[[ARG1_PVT]], %[[ARG2_PVT]], %[[ARG3_PVT]], %[[ARG4_PVT]], %[[ARG5_PVT]], %[[ARG6_PVT]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i8>, !fir.ref<i16>, !fir.ref<i32>, !fir.ref<i64>, !fir.ref<i128>) -> ()
-!FIRDialect:    omp.terminator
-!FIRDialect:  }
-
-subroutine firstprivate_integer(arg1, arg2, arg3, arg4, arg5, arg6)
-        integer :: arg1
-        integer(kind=1) :: arg2
-        integer(kind=2) :: arg3
-        integer(kind=4) :: arg4
-        integer(kind=8) :: arg5
-        integer(kind=16) :: arg6
-
-!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2, arg3, arg4, arg5, arg6)
-        call bar(arg1, arg2, arg3, arg4, arg5, arg6)
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect-DAG: func @_QPfirstprivate_logical(%[[ARG1:.*]]: !fir.ref<!fir.logical<4>>{{.*}}, %[[ARG2:.*]]: !fir.ref<!fir.logical<1>>{{.*}}, %[[ARG3:.*]]: !fir.ref<!fir.logical<2>>{{.*}}, %[[ARG4:.*]]: !fir.ref<!fir.logical<4>>{{.*}}, %[[ARG5:.*]]: !fir.ref<!fir.logical<8>>{{.*}}) {
-!FIRDialect:   omp.parallel {
-!FIRDialect:     %[[ARG1_PVT:.*]] = fir.alloca !fir.logical<4> {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_logicalEarg1"}
-!FIRDialect:     %[[ARG1_VAL:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-!FIRDialect:     fir.store %[[ARG1_VAL]] to %[[ARG1_PVT]] : !fir.ref<!fir.logical<4>>
-!FIRDialect:     %[[ARG2_PVT:.*]] = fir.alloca !fir.logical<1> {bindc_name = "arg2", pinned, uniq_name = "_QFfirstprivate_logicalEarg2"}
-!FIRDialect:     %[[ARG2_VAL:.*]] = fir.load %[[ARG2]] : !fir.ref<!fir.logical<1>>
-!FIRDialect:     fir.store %[[ARG2_VAL]] to %[[ARG2_PVT]] : !fir.ref<!fir.logical<1>>
-!FIRDialect:     %[[ARG3_PVT:.*]] = fir.alloca !fir.logical<2> {bindc_name = "arg3", pinned, uniq_name = "_QFfirstprivate_logicalEarg3"}
-!FIRDialect:     %[[ARG3_VAL:.*]] = fir.load %[[ARG3]] : !fir.ref<!fir.logical<2>>
-!FIRDialect:     fir.store %[[ARG3_VAL]] to %[[ARG3_PVT]] : !fir.ref<!fir.logical<2>>
-!FIRDialect:     %[[ARG4_PVT:.*]] = fir.alloca !fir.logical<4> {bindc_name = "arg4", pinned, uniq_name = "_QFfirstprivate_logicalEarg4"}
-!FIRDialect:     %[[ARG4_VAL:.*]] = fir.load %[[ARG4]] : !fir.ref<!fir.logical<4>>
-!FIRDialect:     fir.store %[[ARG4_VAL]] to %[[ARG4_PVT]] : !fir.ref<!fir.logical<4>>
-!FIRDialect:     %[[ARG5_PVT:.*]] = fir.alloca !fir.logical<8> {bindc_name = "arg5", pinned, uniq_name = "_QFfirstprivate_logicalEarg5"}
-!FIRDialect:     %[[ARG5_VAL:.*]] = fir.load %[[ARG5]] : !fir.ref<!fir.logical<8>>
-!FIRDialect:     fir.store %[[ARG5_VAL]] to %[[ARG5_PVT]] : !fir.ref<!fir.logical<8>>
-!FIRDialect:     fir.call @_QPbaz(%[[ARG1_PVT]], %[[ARG2_PVT]], %[[ARG3_PVT]], %[[ARG4_PVT]], %[[ARG5_PVT]]) {{.*}}: (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<1>>, !fir.ref<!fir.logical<2>>, !fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<8>>) -> ()
-!FIRDialect:     omp.terminator
-!FIRDialect:   }
-
-subroutine firstprivate_logical(arg1, arg2, arg3, arg4, arg5)
-        logical :: arg1
-        logical(kind=1) :: arg2
-        logical(kind=2) :: arg3
-        logical(kind=4) :: arg4
-        logical(kind=8) :: arg5
-
-!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2, arg3, arg4, arg5)
-        call baz(arg1, arg2, arg3, arg4, arg5)
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect-DAG: func @_QPfirstprivate_real(%[[ARG1:.*]]: !fir.ref<f32>{{.*}}, %[[ARG2:.*]]: !fir.ref<f16>{{.*}}, %[[ARG3:.*]]: !fir.ref<f32>{{.*}}, %[[ARG4:.*]]: !fir.ref<f64>{{.*}}, %[[ARG5:.*]]: !fir.ref<f80>{{.*}}, %[[ARG6:.*]]: !fir.ref<f128>{{.*}}) {
-!FIRDialect:   omp.parallel {
-!FIRDialect:     %[[ARG1_PVT:.*]] = fir.alloca f32 {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_realEarg1"}
-!FIRDialect:     %[[ARG1_VAL:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-!FIRDialect:     fir.store %[[ARG1_VAL]] to %[[ARG1_PVT]] : !fir.ref<f32>
-!FIRDialect:     %[[ARG2_PVT:.*]] = fir.alloca f16 {bindc_name = "arg2", pinned, uniq_name = "_QFfirstprivate_realEarg2"}
-!FIRDialect:     %[[ARG2_VAL:.*]] = fir.load %[[ARG2]] : !fir.ref<f16>
-!FIRDialect:     fir.store %[[ARG2_VAL]] to %[[ARG2_PVT]] : !fir.ref<f16>
-!FIRDialect:     %[[ARG3_PVT:.*]] = fir.alloca f32 {bindc_name = "arg3", pinned, uniq_name = "_QFfirstprivate_realEarg3"}
-!FIRDialect:     %[[ARG3_VAL:.*]] = fir.load %[[ARG3]] : !fir.ref<f32>
-!FIRDialect:     fir.store %[[ARG3_VAL]] to %[[ARG3_PVT]] : !fir.ref<f32>
-!FIRDialect:     %[[ARG4_PVT:.*]] = fir.alloca f64 {bindc_name = "arg4", pinned, uniq_name = "_QFfirstprivate_realEarg4"}
-!FIRDialect:     %[[ARG4_VAL:.*]] = fir.load %[[ARG4]] : !fir.ref<f64>
-!FIRDialect:     fir.store %[[ARG4_VAL]] to %[[ARG4_PVT]] : !fir.ref<f64>
-!FIRDialect:     %[[ARG5_PVT:.*]] = fir.alloca f80 {bindc_name = "arg5", pinned, uniq_name = "_QFfirstprivate_realEarg5"}
-!FIRDialect:     %[[ARG5_VAL:.*]] = fir.load %[[ARG5]] : !fir.ref<f80>
-!FIRDialect:     fir.store %[[ARG5_VAL]] to %[[ARG5_PVT]] : !fir.ref<f80>
-!FIRDialect:     %[[ARG6_PVT:.*]] = fir.alloca f128 {bindc_name = "arg6", pinned, uniq_name = "_QFfirstprivate_realEarg6"}
-!FIRDialect:     %[[ARG6_VAL:.*]] = fir.load %[[ARG6]] : !fir.ref<f128>
-!FIRDialect:     fir.store %[[ARG6_VAL]] to %[[ARG6_PVT]] : !fir.ref<f128>
-!FIRDialect:     fir.call @_QPqux(%[[ARG1_PVT]], %[[ARG2_PVT]], %[[ARG3_PVT]], %[[ARG4_PVT]], %[[ARG5_PVT]], %[[ARG6_PVT]]) {{.*}}: (!fir.ref<f32>, !fir.ref<f16>, !fir.ref<f32>, !fir.ref<f64>, !fir.ref<f80>, !fir.ref<f128>) -> ()
-!FIRDialect:     omp.terminator
-!FIRDialect:   }
-
-subroutine firstprivate_real(arg1, arg2, arg3, arg4, arg5, arg6)
-        real :: arg1
-        real(kind=2) :: arg2
-        real(kind=4) :: arg3
-        real(kind=8) :: arg4
-        real(kind=10) :: arg5
-        real(kind=16) :: arg6
-
-!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2, arg3, arg4, arg5, arg6)
-        call qux(arg1, arg2, arg3, arg4, arg5, arg6)
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect-LABEL:   func.func @_QPmultiple_firstprivate(
-!FIRDialect-SAME:                                        %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
-!FIRDialect-SAME:                                        %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
-!FIRDialect:           omp.parallel   {
-!FIRDialect:             %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFmultiple_firstprivateEa"}
-!FIRDialect:             %[[A:.*]] = fir.load %[[A_ADDR]] : !fir.ref<i32>
-!FIRDialect:             fir.store %[[A]] to %[[A_PRIV_ADDR]] : !fir.ref<i32>
-!FIRDialect:             %[[B_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "b", pinned, uniq_name = "_QFmultiple_firstprivateEb"}
-!FIRDialect:             %[[B:.*]] = fir.load %[[B_ADDR]] : !fir.ref<i32>
-!FIRDialect:             fir.store %[[B]] to %[[B_PRIV_ADDR]] : !fir.ref<i32>
-!FIRDialect:             fir.call @_QPquux(%[[A_PRIV_ADDR]], %[[B_PRIV_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
-!FIRDialect:             omp.terminator
-!FIRDialect:           }
-!FIRDialect:           return
-!FIRDialect:         }
-
-subroutine multiple_firstprivate(a, b)
-        integer :: a, b
-!$OMP PARALLEL FIRSTPRIVATE(a) FIRSTPRIVATE(b)
-        call quux(a, b)
-!$OMP END PARALLEL
-end subroutine multiple_firstprivate
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90
deleted file mode 100644
index 16832355f5d1..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90
+++ /dev/null
@@ -1,261 +0,0 @@
-! This test checks lowering of `LASTPRIVATE` clause for scalar types.
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -fopenmp -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s
-
-!CHECK: func @_QPlastprivate_character(%[[ARG1:.*]]: !fir.boxchar<1>{{.*}}) {
-!CHECK-DAG: %[[ARG1_UNBOX:.*]]:2 = fir.unboxchar
-!CHECK-DAG: %[[FIVE:.*]] = arith.constant 5 : index
-!CHECK-DAG: %[[ARG1_REF:.*]] = fir.convert %[[ARG1_UNBOX]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,5>>
-
-!CHECK: omp.parallel {
-!CHECK-DAG: %[[ARG1_PVT:.*]] = fir.alloca !fir.char<1,5> {bindc_name = "arg1", 
-
-! Check that we are accessing the clone inside the loop
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} {
-!CHECK: %[[UNIT:.*]] = arith.constant 6 : i32
-!CHECK-NEXT: %[[ADDR:.*]] = fir.address_of(@_QQclX
-!CHECK-NEXT: %[[CVT0:.*]] = fir.convert %[[ADDR]] 
-!CHECK-NEXT: %[[CNST:.*]] = arith.constant
-!CHECK-NEXT: %[[CALL_BEGIN_IO:.*]] = fir.call @_FortranAioBeginExternalListOutput(%[[UNIT]], %[[CVT0]], %[[CNST]]) {{.*}}: (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
-!CHECK-NEXT: %[[CVT_0_1:.*]] = fir.convert %[[ARG1_PVT]] 
-!CHECK-NEXT: %[[CVT_0_2:.*]] = fir.convert %[[FIVE]]
-!CHECK-NEXT: %[[CALL_OP_ASCII:.*]] = fir.call @_FortranAioOutputAscii(%[[CALL_BEGIN_IO]], %[[CVT_0_1]], %[[CVT_0_2]])
-!CHECK-NEXT: %[[CALL_END_IO:.*]] = fir.call @_FortranAioEndIoStatement(%[[CALL_BEGIN_IO]])
-
-! Testing last iteration check
-!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
-!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
-!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
-!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
-!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
-!CHECK: fir.if %[[IV_CMP]] {
-!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref<i32>
-
-! Testing lastprivate val update
-!CHECK-DAG: %[[CVT:.*]] = fir.convert %[[ARG1_REF]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-!CHECK-DAG: %[[CVT1:.*]] = fir.convert %[[ARG1_PVT]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-!CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[CVT]], %[[CVT1]]{{.*}})
-!CHECK: }
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
-subroutine lastprivate_character(arg1)
-        character(5) :: arg1
-!$OMP PARALLEL 
-!$OMP DO LASTPRIVATE(arg1)
-do n = 1, 5
-        arg1(n:n) = 'c'
-        print *, arg1
-end do
-!$OMP END DO
-!$OMP END PARALLEL
-end subroutine
-
-!CHECK: func @_QPlastprivate_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}) {
-!CHECK-DAG: omp.parallel  {
-!CHECK-DAG: %[[CLONE:.*]] = fir.alloca i32 {bindc_name = "arg1"
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} {
-
-! Testing last iteration check
-!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
-!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
-!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
-!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
-!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
-!CHECK: fir.if %[[IV_CMP]] {
-!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref<i32>
-
-! Testing lastprivate val update
-!CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE]] : !fir.ref<i32>
-!CHECK-NEXT: fir.store %[[CLONE_LD]] to %[[ARG1]] : !fir.ref<i32>
-!CHECK: }
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
-subroutine lastprivate_int(arg1)
-        integer :: arg1
-!$OMP PARALLEL 
-!$OMP DO LASTPRIVATE(arg1)
-do n = 1, 5
-        arg1 = 2
-        print *, arg1
-end do
-!$OMP END DO
-!$OMP END PARALLEL
-print *, arg1
-end subroutine
-
-!CHECK: func.func @_QPmult_lastprivate_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) {
-!CHECK: omp.parallel  {
-!CHECK-DAG: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
-!CHECK-DAG: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2"
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} {
-
-! Testing last iteration check
-!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
-!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
-!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
-!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
-!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
-!CHECK: fir.if %[[IV_CMP]] {
-!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref<i32>
-! Testing lastprivate val update
-!CHECK-DAG: %[[CLONE_LD1:.*]] = fir.load %[[CLONE1]] : !fir.ref<i32>
-!CHECK-DAG: fir.store %[[CLONE_LD1]] to %[[ARG1]] : !fir.ref<i32>
-!CHECK-DAG: %[[CLONE_LD2:.*]] = fir.load %[[CLONE2]] : !fir.ref<i32>
-!CHECK-DAG: fir.store %[[CLONE_LD2]] to %[[ARG2]] : !fir.ref<i32>
-!CHECK: }
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
-subroutine mult_lastprivate_int(arg1, arg2)
-        integer :: arg1, arg2
-!$OMP PARALLEL 
-!$OMP DO LASTPRIVATE(arg1) LASTPRIVATE(arg2)
-do n = 1, 5
-        arg1 = 2
-        arg2 = 3
-        print *, arg1, arg2
-end do
-!$OMP END DO
-!$OMP END PARALLEL
-print *, arg1, arg2
-end subroutine
-
-!CHECK: func.func @_QPmult_lastprivate_int2(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) {
-!CHECK: omp.parallel  {
-!CHECK-DAG: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
-!CHECK-DAG: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2"
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} {
-
-!Testing last iteration check
-!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
-!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
-!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
-!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
-!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
-!CHECK: fir.if %[[IV_CMP]] {
-!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref<i32>
-!Testing lastprivate val update
-!CHECK-DAG: %[[CLONE_LD2:.*]] = fir.load %[[CLONE2]] : !fir.ref<i32>
-!CHECK-DAG: fir.store %[[CLONE_LD2]] to %[[ARG2]] : !fir.ref<i32>
-!CHECK-DAG: %[[CLONE_LD1:.*]] = fir.load %[[CLONE1]] : !fir.ref<i32>
-!CHECK-DAG: fir.store %[[CLONE_LD1]] to %[[ARG1]] : !fir.ref<i32>
-!CHECK: }
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
-subroutine mult_lastprivate_int2(arg1, arg2)
-        integer :: arg1, arg2
-!$OMP PARALLEL 
-!$OMP DO LASTPRIVATE(arg1, arg2)
-do n = 1, 5
-        arg1 = 2
-        arg2 = 3
-        print *, arg1, arg2
-end do
-!$OMP END DO
-!$OMP END PARALLEL
-print *, arg1, arg2
-end subroutine
-
-!CHECK: func.func @_QPfirstpriv_lastpriv_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) {
-!CHECK: omp.parallel  {
-! Firstprivate update
-!CHECK-DAG: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
-!CHECK-DAG: %[[FPV_LD:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!CHECK-DAG: fir.store %[[FPV_LD]] to %[[CLONE1]] : !fir.ref<i32>
-! Lastprivate Allocation
-!CHECK-DAG: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2"
-!CHECK-NOT: omp.barrier
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} {
-
-! Testing last iteration check
-!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
-!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
-!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
-!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
-!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
-!CHECK: fir.if %[[IV_CMP]] {
-!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref<i32>
-! Testing lastprivate val update
-!CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE2]] : !fir.ref<i32>
-!CHECK-NEXT: fir.store %[[CLONE_LD]] to %[[ARG2]] : !fir.ref<i32>
-!CHECK-NEXT: }
-!CHECK-NEXT: omp.yield
-!CHECK-NEXT: }
-!CHECK-NEXT: omp.terminator
-!CHECK-NEXT: }
-
-subroutine firstpriv_lastpriv_int(arg1, arg2)
-        integer :: arg1, arg2
-!$OMP PARALLEL 
-!$OMP DO FIRSTPRIVATE(arg1) LASTPRIVATE(arg2)
-do n = 1, 5
-        arg1 = 2
-        arg2 = 3
-        print *, arg1, arg2
-end do
-!$OMP END DO
-!$OMP END PARALLEL
-print *, arg1, arg2
-end subroutine
-
-!CHECK: func.func @_QPfirstpriv_lastpriv_int2(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}) {
-!CHECK: omp.parallel  {
-! Firstprivate update
-!CHECK: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
-!CHECK-NEXT: %[[FPV_LD:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!CHECK-NEXT: fir.store %[[FPV_LD]] to %[[CLONE1]] : !fir.ref<i32>
-!CHECK-NEXT: omp.barrier
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} {
-! Testing last iteration check
-!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
-!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
-!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
-!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
-!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
-!CHECK: fir.if %[[IV_CMP]] {
-!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref<i32>
-! Testing lastprivate val update
-!CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE1]] : !fir.ref<i32>
-!CHECK-NEXT: fir.store %[[CLONE_LD]] to %[[ARG1]] : !fir.ref<i32>
-!CHECK-NEXT: }
-!CHECK-NEXT: omp.yield
-!CHECK-NEXT: }
-!CHECK-NEXT: omp.terminator
-!CHECK-NEXT: }
-
-subroutine firstpriv_lastpriv_int2(arg1)
-        integer :: arg1
-!$OMP PARALLEL 
-!$OMP DO FIRSTPRIVATE(arg1) LASTPRIVATE(arg1)
-do n = 1, 5
-        arg1 = 2
-        print *, arg1
-end do
-!$OMP END DO
-!$OMP END PARALLEL
-print *, arg1
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-private-clause-fixes.f90 b/flang/test/Lower/OpenMP/FIR/parallel-private-clause-fixes.f90
deleted file mode 100644
index fb0fb9594c35..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-private-clause-fixes.f90
+++ /dev/null
@@ -1,84 +0,0 @@
-! This test checks a few bug fixes in the PRIVATE clause lowering
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-! CHECK-LABEL: multiple_private_fix
-! CHECK:         %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_private_fixEi"}
-! CHECK:         %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFmultiple_private_fixEj"}
-! CHECK:         %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_private_fixEx"}
-! CHECK:         omp.parallel {
-! CHECK-DAG:           %[[PRIV_J:.*]] = fir.alloca i32 {bindc_name = "j", pinned
-! CHECK-DAG:           %[[PRIV_I:.*]] = fir.alloca i32 {adapt.valuebyref, pinned
-! CHECK-DAG:           %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned
-! CHECK:           %[[ONE:.*]] = arith.constant 1 : i32
-! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_4:.*]] : !fir.ref<i32>
-! CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i32
-! CHECK:           omp.wsloop {
-! CHECK-NEXT:        omp.loop_nest (%[[VAL_6:.*]]) : i32 = (%[[ONE]]) to (%[[VAL_3]]) inclusive step (%[[VAL_5]]) {
-! CHECK:               fir.store %[[VAL_6]] to %[[PRIV_I]] : !fir.ref<i32>
-! CHECK:               %[[VAL_7:.*]] = arith.constant 1 : i32
-! CHECK:               %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index
-! CHECK:               %[[VAL_9:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
-! CHECK:               %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index
-! CHECK:               %[[VAL_11:.*]] = arith.constant 1 : index
-! CHECK:               %[[LB:.*]] = fir.convert %[[VAL_8]] : (index) -> i32
-! CHECK:               %[[VAL_12:.*]]:2 = fir.do_loop %[[VAL_13:[^ ]*]] =
-! CHECK-SAME:              %[[VAL_8]] to %[[VAL_10]] step %[[VAL_11]]
-! CHECK-SAME:              iter_args(%[[IV:.*]] = %[[LB]]) -> (index, i32) {
-! CHECK:                 fir.store %[[IV]] to %[[PRIV_J]] : !fir.ref<i32>
-! CHECK:                 %[[LOAD:.*]] = fir.load %[[PRIV_I]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_15:.*]] = fir.load %[[PRIV_J]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_16:.*]] = arith.addi %[[LOAD]], %[[VAL_15]] : i32
-! CHECK:                 fir.store %[[VAL_16]] to %[[PRIV_X]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_17:.*]] = arith.addi %[[VAL_13]], %[[VAL_11]] : index
-! CHECK:                 %[[STEPCAST:.*]] = fir.convert %[[VAL_11]] : (index) -> i32
-! CHECK:                 %[[IVLOAD:.*]] = fir.load %[[PRIV_J]] : !fir.ref<i32>
-! CHECK:                 %[[IVINC:.*]] = arith.addi %[[IVLOAD]], %[[STEPCAST]]
-! CHECK:                 fir.result %[[VAL_17]], %[[IVINC]] : index, i32
-! CHECK:               }
-! CHECK:               fir.store %[[VAL_12]]#1 to %[[PRIV_J]] : !fir.ref<i32>
-! CHECK:               omp.yield
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           omp.terminator
-! CHECK:         }
-! CHECK:         return
-subroutine multiple_private_fix(gama)
-        integer :: i, j, x, gama
-!$OMP PARALLEL DO PRIVATE(j,x)
-        do i = 1, gama
-          do j = 1, gama
-            x = i + j
-          end do
-        end do
-!$OMP END PARALLEL DO
-end subroutine
-
-! CHECK-LABEL: multiple_private_fix2
-! CHECK:  %[[X1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_private_fix2Ex"}
-! CHECK:  omp.parallel  {
-! CHECK:    %[[X2:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFmultiple_private_fix2Ex"}
-! CHECK:    omp.parallel  {
-! CHECK:      %[[X3:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFmultiple_private_fix2Ex"}
-! CHECK:      %[[C3:.*]] = arith.constant 1 : i32
-! CHECK:      fir.store %[[C3]] to %[[X3]] : !fir.ref<i32>
-! CHECK:      omp.terminator
-! CHECK:    }
-! CHECK:      %[[C2:.*]] = arith.constant 1 : i32
-! CHECK:      fir.store %[[C2]] to %[[X2]] : !fir.ref<i32>
-! CHECK:    omp.terminator
-! CHECK:  }
-! CHECK:      %[[C1:.*]] = arith.constant 1 : i32
-! CHECK:      fir.store %[[C1]] to %[[X1]] : !fir.ref<i32>
-! CHECK:  return
-subroutine multiple_private_fix2()
-   integer :: x
-   !$omp parallel private(x)
-   !$omp parallel private(x)
-      x = 1
-   !$omp end parallel
-      x = 1
-   !$omp end parallel
-      x = 1
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90 b/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90
deleted file mode 100644
index 2e68d25a15ed..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90
+++ /dev/null
@@ -1,387 +0,0 @@
-! This test checks lowering of OpenMP parallel Directive with
-! `PRIVATE` clause present.
-
-! REQUIRES: shell
-! RUN: bbc --use-desc-for-alloc=false -fopenmp -emit-fir -hlfir=false %s -o - | \
-! RUN:   FileCheck %s --check-prefix=FIRDialect
-
-!FIRDialect: func @_QPprivate_clause(%[[ARG1:.*]]: !fir.ref<i32>{{.*}}, %[[ARG2:.*]]: !fir.ref<!fir.array<10xi32>>{{.*}}, %[[ARG3:.*]]: !fir.boxchar<1>{{.*}}, %[[ARG4:.*]]: !fir.boxchar<1>{{.*}}) {
-!FIRDialect-DAG: %[[ALPHA:.*]] = fir.alloca i32 {{{.*}}, uniq_name = "{{.*}}Ealpha"}
-!FIRDialect-DAG: %[[ALPHA_ARRAY:.*]] = fir.alloca !fir.array<10xi32> {{{.*}}, uniq_name = "{{.*}}Ealpha_array"}
-!FIRDialect-DAG: %[[BETA:.*]] = fir.alloca !fir.char<1,5> {{{.*}}, uniq_name = "{{.*}}Ebeta"}
-!FIRDialect-DAG: %[[BETA_ARRAY:.*]] = fir.alloca !fir.array<10x!fir.char<1,5>> {{{.*}}, uniq_name = "{{.*}}Ebeta_array"}
-
-!FIRDialect-DAG:  omp.parallel {
-!FIRDialect-DAG: %[[ALPHA_PRIVATE:.*]] = fir.alloca i32 {{{.*}}, pinned, uniq_name = "{{.*}}Ealpha"}
-!FIRDialect-DAG: %[[ALPHA_ARRAY_PRIVATE:.*]] = fir.alloca !fir.array<10xi32> {{{.*}}, pinned, uniq_name = "{{.*}}Ealpha_array"}
-!FIRDialect-DAG: %[[BETA_PRIVATE:.*]] = fir.alloca !fir.char<1,5> {{{.*}}, pinned, uniq_name = "{{.*}}Ebeta"}
-!FIRDialect-DAG: %[[BETA_ARRAY_PRIVATE:.*]] = fir.alloca !fir.array<10x!fir.char<1,5>> {{{.*}}, pinned, uniq_name = "{{.*}}Ebeta_array"}
-!FIRDialect-DAG: %[[ARG1_PRIVATE:.*]] = fir.alloca i32 {{{.*}}, pinned, uniq_name = "{{.*}}Earg1"}
-!FIRDialect-DAG: %[[ARG2_ARRAY_PRIVATE:.*]] = fir.alloca !fir.array<10xi32> {{{.*}}, pinned, uniq_name = "{{.*}}Earg2"}
-!FIRDialect-DAG: %[[ARG3_PRIVATE:.*]] = fir.alloca !fir.char<1,5> {{{.*}}, pinned, uniq_name = "{{.*}}Earg3"}
-!FIRDialect-DAG: %[[ARG4_ARRAY_PRIVATE:.*]] = fir.alloca !fir.array<10x!fir.char<1,5>> {{{.*}}, pinned, uniq_name = "{{.*}}Earg4"}
-!FIRDialect:    omp.terminator
-!FIRDialect:  }
-
-subroutine private_clause(arg1, arg2, arg3, arg4)
-
-        integer :: arg1, arg2(10)
-        integer :: alpha, alpha_array(10)
-        character(5) :: arg3, arg4(10)
-        character(5) :: beta, beta_array(10)
-
-!$OMP PARALLEL PRIVATE(alpha, alpha_array, beta, beta_array, arg1, arg2, arg3, arg4)
-        alpha = 1
-        alpha_array = 4
-        beta = "hi"
-        beta_array = "hi"
-        arg1 = 2
-        arg2 = 3
-        arg3 = "world"
-        arg4 = "world"
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect: func @_QPprivate_clause_scalar() {
-!FIRDialect-DAG:   {{.*}} = fir.alloca !fir.complex<4> {bindc_name = "c", uniq_name = "{{.*}}Ec"}
-!FIRDialect-DAG:   {{.*}} = fir.alloca i8 {bindc_name = "i1", uniq_name = "{{.*}}Ei1"}
-!FIRDialect-DAG:   {{.*}} = fir.alloca i128 {bindc_name = "i16", uniq_name = "{{.*}}Ei16"}
-!FIRDialect-DAG:   {{.*}} = fir.alloca i16 {bindc_name = "i2", uniq_name = "{{.*}}Ei2"}
-!FIRDialect-DAG:   {{.*}} = fir.alloca i32 {bindc_name = "i4", uniq_name = "{{.*}}Ei4"}
-!FIRDialect-DAG:   {{.*}} = fir.alloca i64 {bindc_name = "i8", uniq_name = "{{.*}}Ei8"}
-!FIRDialect-DAG:   {{.*}} = fir.alloca !fir.logical<4> {bindc_name = "l", uniq_name = "{{.*}}El"}
-!FIRDialect-DAG:   {{.*}} = fir.alloca f32 {bindc_name = "r", uniq_name = "{{.*}}Er"}
-
-!FIRDialect:   omp.parallel {
-!FIRDialect-DAG:     {{.*}} = fir.alloca i8 {bindc_name = "i1", pinned, uniq_name = "{{.*}}Ei1"}
-!FIRDialect-DAG:     {{.*}} = fir.alloca i16 {bindc_name = "i2", pinned, uniq_name = "{{.*}}Ei2"}
-!FIRDialect-DAG:     {{.*}} = fir.alloca i32 {bindc_name = "i4", pinned, uniq_name = "{{.*}}Ei4"}
-!FIRDialect-DAG:     {{.*}} = fir.alloca i64 {bindc_name = "i8", pinned, uniq_name = "{{.*}}Ei8"}
-!FIRDialect-DAG:     {{.*}} = fir.alloca i128 {bindc_name = "i16", pinned, uniq_name = "{{.*}}Ei16"}
-!FIRDialect-DAG:     {{.*}} = fir.alloca !fir.complex<4> {bindc_name = "c", pinned, uniq_name = "{{.*}}Ec"}
-!FIRDialect-DAG:     {{.*}} = fir.alloca !fir.logical<4> {bindc_name = "l", pinned, uniq_name = "{{.*}}El"}
-!FIRDialect-DAG:     {{.*}} = fir.alloca f32 {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
-
-subroutine private_clause_scalar()
-
-        integer(kind=1) :: i1
-        integer(kind=2) :: i2
-        integer(kind=4) :: i4
-        integer(kind=8) :: i8
-        integer(kind=16) :: i16
-        complex :: c
-        logical :: l
-        real :: r
-
-!$OMP PARALLEL PRIVATE(i1, i2, i4, i8, i16, c, l, r)
-        print *, i1, i2, i4, i8, i16, c, l, r
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect: func @_QPprivate_clause_derived_type() {
-!FIRDialect:   {{.*}} = fir.alloca !fir.type<{{.*}}{t_i:i32,t_arr:!fir.array<5xi32>}> {bindc_name = "t", uniq_name = "{{.*}}Et"}
-
-!FIRDialect:   omp.parallel {
-!FIRDialect:     {{.*}} = fir.alloca !fir.type<{{.*}}{t_i:i32,t_arr:!fir.array<5xi32>}> {bindc_name = "t", pinned, uniq_name = "{{.*}}Et"}
-
-subroutine private_clause_derived_type()
-
-        type my_type
-          integer :: t_i
-          integer :: t_arr(5)
-        end type my_type
-        type(my_type) :: t
-
-!$OMP PARALLEL PRIVATE(t)
-        print *, t%t_i
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect: func @_QPprivate_clause_allocatable() {
-!FIRDialect-DAG:  {{.*}} = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "x", uniq_name = "{{.*}}Ex"}
-!FIRDialect-DAG:  {{.*}} = fir.alloca !fir.heap<i32> {uniq_name = "{{.*}}Ex.addr"}
-!FIRDialect-DAG:  {{.*}} = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "x2", uniq_name = "{{.*}}Ex2"}
-!FIRDialect-DAG:  {{.*}} = fir.alloca !fir.heap<!fir.array<?xi32>> {uniq_name = "{{.*}}Ex2.addr"}
-!FIRDialect-DAG:  {{.*}} = fir.address_of(@{{.*}}Ex3) : !fir.ref<!fir.box<!fir.heap<i32>>>
-!FIRDialect-DAG:  [[TMP8:%.*]] = fir.address_of(@{{.*}}Ex4) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-
-!FIRDialect:   omp.parallel {
-!FIRDialect-DAG:    [[TMP35:%.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "x", pinned, uniq_name = "{{.*}}Ex"}
-!FIRDialect-DAG:    [[TMP39:%.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "x2", pinned, uniq_name = "{{.*}}Ex2"}
-!FIRDialect-DAG:    [[TMP45:%.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "x3", pinned, uniq_name = "{{.*}}Ex3"}
-
-!FIRDialect-DAG:    [[TMP51:%.*]] = fir.load [[TMP8]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-!FIRDialect-DAG:    [[TMP97:%.*]] = fir.load [[TMP8]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-!FIRDialect-DAG:    [[TMP98:%.*]]:3 = fir.box_dims [[TMP97]], {{.*}} : (!fir.box<!fir.heap<!fir.array<?xi32>>>, index) -> (index, index, index)
-!FIRDialect-DAG:    [[TMP50:%.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "x4", pinned, uniq_name = "{{.*}}Ex4"}
-
-! FIRDialect-DAG:    [[TMP101:%.*]] = fir.allocmem !fir.array<?xi32>, {{.*}} {fir.must_be_heap = true, uniq_name = "{{.*}}Ex4.alloc"}
-! FIRDialect-DAG:    [[TMP102:%.*]] = fir.shape_shift {{.*}}#0, {{.*}} : (index, index) -> !fir.shapeshift<1>
-! FIRDialect-DAG:    [[TMP103:%.*]] = fir.embox [[TMP101]]([[TMP102]]) : (!fir.heap<!fir.array<?xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.heap<!fir.array<?xi32>>>
-! FIRDialect-DAG:  fir.store [[TMP103]] to [[TMP50]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-
-
-subroutine private_clause_allocatable()
-
-        integer, allocatable :: x, x2(:)
-        integer, allocatable, save :: x3, x4(:)
-
-        print *, x, x2, x3, x4
-
-!$OMP PARALLEL PRIVATE(x, x2, x3, x4)
-        print *, x, x2, x3, x4
-!$OMP END PARALLEL
-
-end subroutine
-
-
-!FIRDialect: func @_QPprivate_clause_real_call_allocatable() {
-!FIRDialect-DAG: {{.*}} = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "x5", uniq_name = "{{.*}}Ex5"}
-!FIRDialect-DAG: {{.*}} = fir.zero_bits !fir.heap<f32>
-!FIRDialect-DAG: {{.*}} = fir.embox %1 : (!fir.heap<f32>) -> !fir.box<!fir.heap<f32>>
-!FIRDialect-DAG: fir.store %2 to %0 : !fir.ref<!fir.box<!fir.heap<f32>>>
-!FIRDialect-DAG: omp.parallel   {
-!FIRDialect-DAG:  [[TMP203:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "x5", pinned, uniq_name = "{{.*}}Ex5"}
-
-!FIRDialect-DAG: fir.if %{{.*}} {
-
-!FIRDialect-DAG:   fir.store %{{.*}} to [[TMP203]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-!FIRDialect-DAG: } else {
-
-!FIRDialect-DAG:   fir.store %{{.*}} to [[TMP203]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-!FIRDialect-DAG: }
-!FIRDialect-DAG: fir.call @_QFprivate_clause_real_call_allocatablePhelper_private_clause_real_call_allocatable([[TMP203]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> ()
-!FIRDialect-DAG: %{{.*}} = fir.load [[TMP203]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-
-!FIRDialect-DAG: fir.if %{{.*}} {
-!FIRDialect-DAG:   %{{.*}} = fir.load [[TMP203]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-
-!FIRDialect-DAG:     fir.store %{{.*}} to [[TMP203]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-!FIRDialect-DAG:   }
-!FIRDialect-DAG:   omp.terminator
-!FIRDialect-DAG:   }
-!FIRDialect-DAG:   return
-!FIRDialect-DAG: }
-
-
-subroutine private_clause_real_call_allocatable
-        real, allocatable :: x5
-        !$omp parallel private(x5)
-            call helper_private_clause_real_call_allocatable(x5)
-        !$omp end parallel
-    contains
-        subroutine helper_private_clause_real_call_allocatable(x6)
-            real, allocatable :: x6
-            print *, allocated(x6)
-        end subroutine
-end subroutine
-
-!FIRDialect:  func.func @_QPincrement_list_items(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.type<_QFincrement_list_itemsTnode{payload:i32,next:!fir.box<!fir.ptr<!fir.type<_QFincrement_list_itemsTnode>>>}>>>> {fir.bindc_name = "head"}) {
-!FIRDialect:    {{%.*}} = fir.alloca !fir.box<!fir.ptr<!fir.type<_QFincrement_list_itemsTnode{payload:i32,next:!fir.box<!fir.ptr<!fir.type<_QFincrement_list_itemsTnode>>>}>>> {bindc_name = "p", uniq_name = "_QFincrement_list_itemsEp"}
-!FIRDialect:    omp.parallel   {
-!FIRDialect:      {{%.*}} = fir.alloca !fir.box<!fir.ptr<!fir.type<_QFincrement_list_itemsTnode{payload:i32,next:!fir.box<!fir.ptr<!fir.type<_QFincrement_list_itemsTnode>>>}>>> {bindc_name = "p", pinned, uniq_name = "_QFincrement_list_itemsEp"}
-!FIRDialect:      omp.single   {
-
-!FIRDialect:         omp.terminator
-!FIRDialect:       omp.terminator
-!FIRDialect:    return
-
-subroutine increment_list_items (head)
-  type node
-     integer :: payload
-     type (node), pointer :: next
-  end type node
-
-  type (node), pointer :: head
-  type (node), pointer :: p
-!$omp parallel private(p)
-!$omp single
-  p => head
-  do
-     p => p%next
-     if ( associated (p) .eqv. .false. ) exit
-  end do
-!$omp end single
-!$omp end parallel
-end subroutine increment_list_items
-
-!FIRDialect:  func.func @_QPparallel_pointer() {
-!FIRDialect-DAG: [[PP0:%.*]]  = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "y1", uniq_name = "{{.*}}Ey1"}
-!FIRDialect-DAG: [[PP1:%.*]]  = fir.alloca !fir.ptr<i32> {uniq_name = "{{.*}}Ey1.addr"}
-!FIRDialect-DAG: [[PP2:%.*]]  = fir.zero_bits !fir.ptr<i32>
-!FIRDialect:     fir.store [[PP2]] to [[PP1]] : !fir.ref<!fir.ptr<i32>>
-!FIRDialect-DAG: [[PP3:%.*]]  = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "y2", uniq_name = "{{.*}}Ey2"}
-
-!FIRDialect:     fir.store %6 to %3 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!FIRDialect-DAG: [[PP7:%.*]] = fir.alloca i32 {bindc_name = "z1", fir.target, uniq_name = "{{.*}}Ez1"}
-
-!FIRDialect-DAG: [[PP8:%.*]] = fir.alloca !fir.array<10xi32> {bindc_name = "z2", fir.target, uniq_name = "{{.*}}Ez2"}
-!FIRDialect:     omp.parallel   {
-!FIRDialect-DAG:   [[PP9:%.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "y1", pinned, uniq_name = "{{.*}}Ey1"}
-!FIRDialect-DAG:   [[PP10:%.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "y2", pinned, uniq_name = "{{.*}}Ey2"}
-!FIRDialect-DAG:   [[PP11:%.*]] = fir.embox [[PP7]] : (!fir.ref<i32>) -> !fir.box<!fir.ptr<i32>>
-!FIRDialect:       fir.store [[PP11]] to [[PP9]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!FIRDialect-DAG:   [[PP12:%.*]] = fir.shape %c{{.*}} : (index) -> !fir.shape<1>
-!FIRDialect-DAG:   [[PP13:%.*]] = fir.embox [[PP8]]([[PP12]]) : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
-!FIRDialect:       fir.store %13 to [[PP10]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!FIRDialect:       omp.terminator
-!FIRDialect:     }
-!FIRDialect:   return
-!FIRDialect: }
-
-subroutine parallel_pointer()
-    integer, pointer :: y1, y2(:)
-    integer, target :: z1, z2(10)
-
-!$omp parallel private(y1, y2)
-  y1=>z1
-  y2=>z2
-!$omp end parallel
-end subroutine parallel_pointer
-
-
-!FIRDialect-LABEL: func @_QPsimple_loop_1()
-subroutine simple_loop_1
-  integer :: i
-  real, allocatable :: r;
-  ! FIRDialect:  omp.parallel
-  !$OMP PARALLEL PRIVATE(r)
-  ! FIRDialect:      %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-
-  ! FIRDialect:      [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
-  ! FIRDialect:      fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:      fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-
-  ! FIRDialect:      %[[WS_LB:.*]] = arith.constant 1 : i32
-  ! FIRDialect:      %[[WS_UB:.*]] = arith.constant 9 : i32
-  ! FIRDialect:      %[[WS_STEP:.*]] = arith.constant 1 : i32
-
-  ! FIRDialect:      omp.wsloop {
-  ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
-  !$OMP DO
-  do i=1, 9
-  ! FIRDialect:      fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
-  ! FIRDialect:      %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! FIRDialect:      fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  ! FIRDialect:      omp.yield
-  ! FIRDialect:      omp.terminator
-  ! FIRDialect:      {{%.*}} = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:      fir.if {{%.*}} {
-  ! FIRDialect:      [[LD:%.*]] = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:      [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
-  ! FIRDialect:      fir.freemem [[AD]] : !fir.heap<f32>
-  ! FIRDialect:      fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  !$OMP END DO
-  ! FIRDialect:  omp.terminator
-  !$OMP END PARALLEL
-end subroutine
-
-!FIRDialect-LABEL: func @_QPsimple_loop_2()
-subroutine simple_loop_2
-  integer :: i
-  real, allocatable :: r;
-  ! FIRDialect:  omp.parallel
-  !$OMP PARALLEL
-  ! FIRDialect:      %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-
-  ! FIRDialect:      [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
-  ! FIRDialect:      fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:      fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-
-  ! FIRDialect:      %[[WS_LB:.*]] = arith.constant 1 : i32
-  ! FIRDialect:      %[[WS_UB:.*]] = arith.constant 9 : i32
-  ! FIRDialect:      %[[WS_STEP:.*]] = arith.constant 1 : i32
-
-  ! FIRDialect:      omp.wsloop {
-  ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
-  !$OMP DO PRIVATE(r)
-  do i=1, 9
-  ! FIRDialect:      fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
-  ! FIRDialect:      %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! FIRDialect:      fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  ! FIRDialect:      omp.yield
-  ! FIRDialect:      omp.terminator
-  ! FIRDialect:      {{%.*}} = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:      fir.if {{%.*}} {
-  ! FIRDialect:      [[LD:%.*]] = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:      [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
-  ! FIRDialect:      fir.freemem [[AD]] : !fir.heap<f32>
-  ! FIRDialect:      fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  !$OMP END DO
-  ! FIRDialect:  omp.terminator
-  !$OMP END PARALLEL
-end subroutine
-
-!FIRDialect-LABEL: func @_QPsimple_loop_3()
-subroutine simple_loop_3
-  integer :: i
-  real, allocatable :: r;
-  ! FIRDialect:  omp.parallel
-  ! FIRDialect:      %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-
-  ! FIRDialect:      [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
-  ! FIRDialect:      fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:      fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-
-  ! FIRDialect:      %[[WS_LB:.*]] = arith.constant 1 : i32
-  ! FIRDialect:      %[[WS_UB:.*]] = arith.constant 9 : i32
-  ! FIRDialect:      %[[WS_STEP:.*]] = arith.constant 1 : i32
-
-  ! FIRDialect:      omp.wsloop {
-  ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
-  !$OMP PARALLEL DO PRIVATE(r)
-  do i=1, 9
-  ! FIRDialect:      fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
-  ! FIRDialect:      %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! FIRDialect:      fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  ! FIRDialect:      omp.yield
-  ! FIRDialect:      omp.terminator
-  ! FIRDialect:      {{%.*}} = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:      fir.if {{%.*}} {
-  ! FIRDialect:      [[LD:%.*]] = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:      [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
-  ! FIRDialect:      fir.freemem [[AD]] : !fir.heap<f32>
-  ! FIRDialect:      fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  !$OMP END PARALLEL DO
-  ! FIRDialect:  omp.terminator
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_loop_1()
-subroutine simd_loop_1
-  integer :: i
-  real, allocatable :: r;
-  ! FIRDialect:     [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
-  ! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-
-  ! FIRDialect:     %[[LB:.*]] = arith.constant 1 : i32
-  ! FIRDialect:     %[[UB:.*]] = arith.constant 9 : i32
-  ! FIRDialect:     %[[STEP:.*]] = arith.constant 1 : i32
-
-  ! FIRDialect: omp.simd {
-  ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
-  !$OMP SIMD PRIVATE(r)
-  do i=1, 9
-  ! FIRDialect:     fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-  ! FIRDialect:     %[[LOAD_IV:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-  ! FIRDialect:     fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  !$OMP END SIMD
-  ! FIRDialect:     omp.yield
-  ! FIRDialect:     {{%.*}} = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:     fir.if {{%.*}} {
-  ! FIRDialect:     [[LD:%.*]] = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! FIRDialect:     [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
-  ! FIRDialect:     fir.freemem [[AD]] : !fir.heap<f32>
-  ! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90
deleted file mode 100644
index ea45e716ceae..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90
+++ /dev/null
@@ -1,117 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
-
-!CHECK-LABEL: omp.declare_reduction
-!CHECK-SAME: @[[RED_F32_NAME:.*]] : !fir.ref<f32>
-!CHECK-SAME: init {
-!CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-!CHECK:  %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK:  %[[REF:.*]] = fir.alloca f32
-!CHECKL  fir.store [[%C0_1]] to %[[REF]] : !fir.ref<f32>
-!CHECK:  omp.yield(%[[REF]] : !fir.ref<f32>)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-!CHECK:  %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-!CHECK:  %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-!CHECK:  %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] {{.*}}: f32
-!CHECK:  fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
-!CHECK:  omp.yield(%[[ARG0]] : !fir.ref<f32>)
-!CHECK: }
-
-!CHECK-LABEL: omp.declare_reduction
-!CHECK-SAME: @[[RED_I32_NAME:.*]] : !fir.ref<i32>
-!CHECK-SAME: init {
-!CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-!CHECK:  %[[C0_1:.*]] = arith.constant 0 : i32
-!CHECK:  %[[REF:.*]] = fir.alloca i32
-!CHECKL  fir.store [[%C0_1]] to %[[REF]] : !fir.ref<i32>
-!CHECK:  omp.yield(%[[REF]] : !fir.ref<i32>)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-!CHECK:  %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-!CHECK:  %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!CHECK:  %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
-!CHECK:  fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-!CHECK:  omp.yield(%[[ARG0]] : !fir.ref<i32>)
-!CHECK: }
-
-!CHECK-LABEL: func.func @_QPsimple_int_add
-!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_addEi"}
-!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
-!CHECK:  fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
-!CHECK:  omp.parallel byref reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV:.+]] : !fir.ref<i32>) {
-!CHECK:    %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK:    %[[I_INCR:.+]] = arith.constant 1 : i32
-!CHECK:    %[[RES:.+]] = arith.addi %[[LPRV]], %[[I_INCR]]
-!CHECK:    fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK:    omp.terminator
-!CHECK:  }
-!CHECK: return
-subroutine simple_int_add
-    integer :: i
-    i = 0
-
-    !$omp parallel reduction(+:i)
-    i = i + 1
-    !$omp end parallel
-
-    print *, i
-end subroutine
-
-!CHECK-LABEL: func.func @_QPsimple_real_add
-!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFsimple_real_addEr"}
-!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK:  fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
-!CHECK:  omp.parallel byref reduction(@[[RED_F32_NAME]] %[[RREF]] -> %[[PRV:.+]] : !fir.ref<f32>) {
-!CHECK:    %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
-!CHECK:    %[[R_INCR:.+]] = arith.constant 1.500000e+00 : f32
-!CHECK:    %[[RES]] = arith.addf %[[LPRV]], %[[R_INCR]] {{.*}} : f32
-!CHECK:    fir.store %[[RES]] to %[[PRV]] : !fir.ref<f32>
-!CHECK:    omp.terminator
-!CHECK:  }
-!CHECK: return
-subroutine simple_real_add
-    real :: r
-    r = 0.0
-
-    !$omp parallel reduction(+:r)
-    r = r + 1.5
-    !$omp end parallel
-
-    print *, r
-end subroutine
-
-!CHECK-LABEL: func.func @_QPint_real_add
-!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFint_real_addEi"}
-!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFint_real_addEr"}
-!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK:  fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
-!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
-!CHECK:  fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
-!CHECK:  omp.parallel byref reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV0:.+]] : !fir.ref<i32>, @[[RED_F32_NAME]] %[[RREF]] -> %[[PRV1:.+]] : !fir.ref<f32>) {
-!CHECK:    %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
-!CHECK:    %[[LPRV1:.+]] = fir.load %[[PRV1]] : !fir.ref<f32>
-!CHECK:    %[[RES1:.+]] = arith.addf %[[R_INCR]], %[[LPRV1]] {{.*}} : f32
-!CHECK:    fir.store %[[RES1]] to %[[PRV1]]
-!CHECK:    %[[LPRV0:.+]] = fir.load %[[PRV0]] : !fir.ref<i32>
-!CHECK:    %[[I_INCR:.*]] = arith.constant 3 : i32
-!CHECK:    %[[RES0:.+]] = arith.addi %[[LPRV0]], %[[I_INCR]]
-!CHECK:    fir.store %[[RES0]] to %[[PRV0]]
-!CHECK:    omp.terminator
-!CHECK:  }
-!CHECK: return
-subroutine int_real_add
-    real :: r
-    integer :: i
-
-    r = 0.0
-    i = 0
-
-    !$omp parallel reduction(+:i,r)
-    r = 1.5 + r
-    i = i + 3
-    !$omp end parallel
-
-    print *, r
-    print *, i
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-reduction-add.f90 b/flang/test/Lower/OpenMP/FIR/parallel-reduction-add.f90
deleted file mode 100644
index 3f6d9e647c9b..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-reduction-add.f90
+++ /dev/null
@@ -1,105 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!CHECK-LABEL: omp.declare_reduction
-!CHECK-SAME: @[[RED_F32_NAME:.*]] : f32 init {
-!CHECK: ^bb0(%{{.*}}: f32):
-!CHECK:  %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK:  omp.yield(%[[C0_1]] : f32)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32):
-!CHECK:  %[[RES:.*]] = arith.addf %[[ARG0]], %[[ARG1]] {{.*}}: f32
-!CHECK:  omp.yield(%[[RES]] : f32)
-!CHECK: }
-
-!CHECK-LABEL: omp.declare_reduction
-!CHECK-SAME: @[[RED_I32_NAME:.*]] : i32 init {
-!CHECK: ^bb0(%{{.*}}: i32):
-!CHECK:  %[[C0_1:.*]] = arith.constant 0 : i32
-!CHECK:  omp.yield(%[[C0_1]] : i32)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32):
-!CHECK:  %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
-!CHECK:  omp.yield(%[[RES]] : i32)
-!CHECK: }
-
-!CHECK-LABEL: func.func @_QPsimple_int_add
-!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_addEi"}
-!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
-!CHECK:  fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
-!CHECK:  omp.parallel reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV:.+]] : !fir.ref<i32>) {
-!CHECK:    %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK:    %[[I_INCR:.+]] = arith.constant 1 : i32
-!CHECK:    %[[RES:.+]] = arith.addi %[[LPRV]], %[[I_INCR]]
-!CHECK:    fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK:    omp.terminator
-!CHECK:  }
-!CHECK: return
-subroutine simple_int_add
-    integer :: i
-    i = 0
-
-    !$omp parallel reduction(+:i)
-    i = i + 1
-    !$omp end parallel
-
-    print *, i
-end subroutine
-
-!CHECK-LABEL: func.func @_QPsimple_real_add
-!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFsimple_real_addEr"}
-!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK:  fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
-!CHECK:  omp.parallel reduction(@[[RED_F32_NAME]] %[[RREF]] -> %[[PRV:.+]] : !fir.ref<f32>) {
-!CHECK:    %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
-!CHECK:    %[[R_INCR:.+]] = arith.constant 1.500000e+00 : f32
-!CHECK:    %[[RES]] = arith.addf %[[LPRV]], %[[R_INCR]] {{.*}} : f32
-!CHECK:    fir.store %[[RES]] to %[[PRV]] : !fir.ref<f32>
-!CHECK:    omp.terminator
-!CHECK:  }
-!CHECK: return
-subroutine simple_real_add
-    real :: r
-    r = 0.0
-
-    !$omp parallel reduction(+:r)
-    r = r + 1.5
-    !$omp end parallel
-
-    print *, r
-end subroutine
-
-!CHECK-LABEL: func.func @_QPint_real_add
-!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFint_real_addEi"}
-!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFint_real_addEr"}
-!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK:  fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
-!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
-!CHECK:  fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
-!CHECK:  omp.parallel reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV0:.+]] : !fir.ref<i32>, @[[RED_F32_NAME]] %[[RREF]] -> %[[PRV1:.+]] : !fir.ref<f32>) {
-!CHECK:    %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
-!CHECK:    %[[LPRV1:.+]] = fir.load %[[PRV1]] : !fir.ref<f32>
-!CHECK:    %[[RES1:.+]] = arith.addf %[[R_INCR]], %[[LPRV1]] {{.*}} : f32
-!CHECK:    fir.store %[[RES1]] to %[[PRV1]]
-!CHECK:    %[[LPRV0:.+]] = fir.load %[[PRV0]] : !fir.ref<i32>
-!CHECK:    %[[I_INCR:.*]] = arith.constant 3 : i32
-!CHECK:    %[[RES0:.+]] = arith.addi %[[LPRV0]], %[[I_INCR]]
-!CHECK:    fir.store %[[RES0]] to %[[PRV0]]
-!CHECK:    omp.terminator
-!CHECK:  }
-!CHECK: return
-subroutine int_real_add
-    real :: r
-    integer :: i
-
-    r = 0.0
-    i = 0
-
-    !$omp parallel reduction(+:i,r)
-    r = 1.5 + r
-    i = i + 3
-    !$omp end parallel
-
-    print *, r
-    print *, i
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-sections.f90 b/flang/test/Lower/OpenMP/FIR/parallel-sections.f90
deleted file mode 100644
index 7730ab87a719..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-sections.f90
+++ /dev/null
@@ -1,65 +0,0 @@
-! REQUIRES: openmp_runtime
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --cfg-conversion | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="OMPDialect,LLVMDialect"
-
-!===============================================================================
-! Parallel sections construct
-!===============================================================================
-
-!FIRDialect: func @_QPomp_parallel_sections
-subroutine omp_parallel_sections(x, y)
-  integer, intent(inout) :: x, y
-  !OMPDialect: omp.parallel {
-  !OMPDialect: omp.sections {
-  !$omp parallel sections
-    !OMPDialect: omp.section {
-    !$omp section
-      !FIRDialect: fir.load
-      !FIRDialect: arith.addi
-      !FIRDialect: fir.store
-      x = x + 12
-      !OMPDialect: omp.terminator
-    !OMPDialect: omp.section {
-    !$omp section
-      !FIRDialect: fir.load
-      !FIRDialect: arith.subi
-      !FIRDialect: fir.store
-      y = y - 5
-      !OMPDialect: omp.terminator
-  !OMPDialect: omp.terminator
-  !OMPDialect: omp.terminator
-  !$omp end parallel sections
-end subroutine omp_parallel_sections
-
-!===============================================================================
-! Parallel sections construct with allocate clause
-!===============================================================================
-
-!FIRDialect: func @_QPomp_parallel_sections
-subroutine omp_parallel_sections_allocate(x, y)
-  use omp_lib
-  integer, intent(inout) :: x, y
-  !FIRDialect: %[[allocator_1:.*]] = arith.constant 4 : i64
-  !FIRDialect: %[[allocator_2:.*]] = arith.constant 4 : i64
-  !LLVMDialect: %[[allocator_1:.*]] = llvm.mlir.constant(4 : i64) : i64
-  !LLVMDialect: %[[allocator_2:.*]] = llvm.mlir.constant(4 : i64) : i64
-  !OMPDialect: omp.parallel allocate(
-  !FIRDialect: %[[allocator_2]] : i64 -> %{{.*}} : !fir.ref<i32>) {
-  !LLVMDialect: %[[allocator_2]] : i64 -> %{{.*}} : !llvm.ptr) {
-  !OMPDialect: omp.sections allocate(
-  !FIRDialect: %[[allocator_1]] : i64 -> %{{.*}} : !fir.ref<i32>) {
-  !LLVMDialect: %[[allocator_1]] : i64 -> %{{.*}} : !llvm.ptr) {
-  !$omp parallel sections allocate(omp_high_bw_mem_alloc: x)
-    !OMPDialect: omp.section {
-    !$omp section
-      x = x + 12
-      !OMPDialect: omp.terminator
-    !OMPDialect: omp.section {
-    !$omp section
-      y = y + 5
-      !OMPDialect: omp.terminator
-  !OMPDialect: omp.terminator
-  !OMPDialect: omp.terminator
-  !$omp end parallel sections
-end subroutine omp_parallel_sections_allocate
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-wsloop-firstpriv.f90 b/flang/test/Lower/OpenMP/FIR/parallel-wsloop-firstpriv.f90
deleted file mode 100644
index 490f6d0cf7bc..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-wsloop-firstpriv.f90
+++ /dev/null
@@ -1,69 +0,0 @@
-! This test checks lowering of OpenMP parallel DO, with the loop bound being
-! a firstprivate variable
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-! CHECK: func @_QPomp_do_firstprivate(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}) 
-subroutine omp_do_firstprivate(a)
-  integer::a
-  integer::n
-  n = a+1
-  !$omp parallel do firstprivate(a)
-  ! CHECK:  omp.parallel {
-  ! CHECK-NEXT: %[[REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-  ! CHECK-NEXT: %[[CLONE:.*]] = fir.alloca i32 {bindc_name = "a", pinned
-  ! CHECK-NEXT: %[[LD:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-  ! CHECK-NEXT: fir.store %[[LD]] to %[[CLONE]] : !fir.ref<i32>
-  ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
-  ! CHECK-NEXT: %[[UB:.*]] = fir.load %[[CLONE]] : !fir.ref<i32>
-  ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK-NEXT: omp.wsloop {
-  ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
-  ! CHECK-NEXT: fir.store %[[ARG1]] to %[[REF]] : !fir.ref<i32>
-  ! CHECK-NEXT: fir.call @_QPfoo(%[[REF]], %[[CLONE]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
-  ! CHECK-NEXT: omp.yield
-  ! CHECK-NEXT: }
-  ! CHECK-NEXT: omp.terminator
-  ! CHECK-NEXT: }
-    do i=1, a
-      call foo(i, a)
-    end do
-  !$omp end parallel do
-  !CHECK: fir.call @_QPbar(%[[ARG0]]) {{.*}}: (!fir.ref<i32>) -> ()
-  call bar(a)
-end subroutine omp_do_firstprivate
-
-! CHECK: func @_QPomp_do_firstprivate2(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) 
-subroutine omp_do_firstprivate2(a, n)
-  integer::a
-  integer::n
-  n = a+1
-  !$omp parallel do firstprivate(a, n)
-  ! CHECK:  omp.parallel {
-  ! CHECK-NEXT: %[[REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-  ! CHECK-NEXT: %[[CLONE:.*]] = fir.alloca i32 {bindc_name = "a", pinned
-  ! CHECK-NEXT: %[[LD:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-  ! CHECK-NEXT: fir.store %[[LD]] to %[[CLONE]] : !fir.ref<i32>
-  ! CHECK-NEXT: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "n", pinned
-  ! CHECK-NEXT: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-  ! CHECK-NEXT: fir.store %[[LD1]] to %[[CLONE1]] : !fir.ref<i32>
-
-
-  ! CHECK: %[[LB:.*]] = fir.load %[[CLONE]] : !fir.ref<i32>
-  ! CHECK-NEXT: %[[UB:.*]] = fir.load %[[CLONE1]] : !fir.ref<i32>
-  ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK-NEXT: omp.wsloop {
-  ! CHECK-NEXT: omp.loop_nest (%[[ARG2:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
-  ! CHECK-NEXT: fir.store %[[ARG2]] to %[[REF]] : !fir.ref<i32>
-  ! CHECK-NEXT: fir.call @_QPfoo(%[[REF]], %[[CLONE]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
-  ! CHECK-NEXT: omp.yield
-  ! CHECK-NEXT: }
-  ! CHECK-NEXT: omp.terminator
-  ! CHECK-NEXT: }
-    do i= a, n
-      call foo(i, a)
-    end do
-  !$omp end parallel do
-  !CHECK: fir.call @_QPbar(%[[ARG1]]) {{.*}}: (!fir.ref<i32>) -> ()
-  call bar(n)
-end subroutine omp_do_firstprivate2
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-wsloop.f90 b/flang/test/Lower/OpenMP/FIR/parallel-wsloop.f90
deleted file mode 100644
index 630d647bc64b..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-wsloop.f90
+++ /dev/null
@@ -1,297 +0,0 @@
-! This test checks lowering of OpenMP DO Directive (Worksharing).
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-! CHECK-LABEL: func @_QPsimple_parallel_do()
-subroutine simple_parallel_do
-  integer :: i
-  ! CHECK:  omp.parallel
-  ! CHECK:      %[[WS_LB:.*]] = arith.constant 1 : i32
-  ! CHECK:      %[[WS_UB:.*]] = arith.constant 9 : i32
-  ! CHECK:      %[[WS_STEP:.*]] = arith.constant 1 : i32
-  ! CHECK:      omp.wsloop {
-  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
-  !$OMP PARALLEL DO
-  do i=1, 9
-  ! CHECK:      fir.store %[[I]] to %[[IV_ADDR:.*]] : !fir.ref<i32>
-  ! CHECK:      %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]] : !fir.ref<i32>
-  ! CHECK:      fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  ! CHECK:      omp.yield
-  ! CHECK:      omp.terminator
-  ! CHECK:      omp.terminator
-  !$OMP END PARALLEL DO
-end subroutine
-
-! CHECK-LABEL: func @_QPparallel_do_with_parallel_clauses
-! CHECK-SAME: %[[COND_REF:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}, %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
-subroutine parallel_do_with_parallel_clauses(cond, nt)
-  logical :: cond
-  integer :: nt
-  integer :: i
-  ! CHECK:  %[[COND:.*]] = fir.load %[[COND_REF]] : !fir.ref<!fir.logical<4>>
-  ! CHECK:  %[[COND_CVT:.*]] = fir.convert %[[COND]] : (!fir.logical<4>) -> i1
-  ! CHECK:  %[[NT:.*]] = fir.load %[[NT_REF]] : !fir.ref<i32>
-  ! CHECK:  omp.parallel if(%[[COND_CVT]] : i1) num_threads(%[[NT]] : i32) proc_bind(close)
-  ! CHECK:      %[[WS_LB:.*]] = arith.constant 1 : i32
-  ! CHECK:      %[[WS_UB:.*]] = arith.constant 9 : i32
-  ! CHECK:      %[[WS_STEP:.*]] = arith.constant 1 : i32
-  ! CHECK:      omp.wsloop {
-  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
-  !$OMP PARALLEL DO IF(cond) NUM_THREADS(nt) PROC_BIND(close)
-  do i=1, 9
-  ! CHECK:      fir.store %[[I]] to %[[IV_ADDR:.*]] : !fir.ref<i32>
-  ! CHECK:      %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]] : !fir.ref<i32>
-  ! CHECK:      fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  ! CHECK:      omp.yield
-  ! CHECK:      omp.terminator
-  ! CHECK:      omp.terminator
-  !$OMP END PARALLEL DO
-end subroutine
-
-! CHECK-LABEL: func @_QPparallel_do_with_clauses
-! CHECK-SAME: %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
-subroutine parallel_do_with_clauses(nt)
-  integer :: nt
-  integer :: i
-  ! CHECK:  %[[NT:.*]] = fir.load %[[NT_REF]] : !fir.ref<i32>
-  ! CHECK:  omp.parallel num_threads(%[[NT]] : i32)
-  ! CHECK:      %[[WS_LB:.*]] = arith.constant 1 : i32
-  ! CHECK:      %[[WS_UB:.*]] = arith.constant 9 : i32
-  ! CHECK:      %[[WS_STEP:.*]] = arith.constant 1 : i32
-  ! CHECK:      omp.wsloop schedule(dynamic) {
-  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
-  !$OMP PARALLEL DO NUM_THREADS(nt) SCHEDULE(dynamic)
-  do i=1, 9
-  ! CHECK:      fir.store %[[I]] to %[[IV_ADDR:.*]] : !fir.ref<i32>
-  ! CHECK:      %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]] : !fir.ref<i32>
-  ! CHECK:      fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  ! CHECK:      omp.yield
-  ! CHECK:      omp.terminator
-  ! CHECK:      omp.terminator
-  !$OMP END PARALLEL DO
-end subroutine
-
-!===============================================================================
-! Checking for the following construct:
-!   !$omp parallel do private(...) firstprivate(...)
-!===============================================================================
-
-! CHECK-LABEL: func @_QPparallel_do_with_privatisation_clauses
-! CHECK-SAME: %[[COND_REF:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}, %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
-subroutine parallel_do_with_privatisation_clauses(cond,nt)
-  logical :: cond
-  integer :: nt
-  integer :: i
-  ! CHECK:  omp.parallel
-  ! CHECK:      %[[PRIVATE_COND_REF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "cond", pinned, uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"}
-  ! CHECK:      %[[PRIVATE_NT_REF:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"}
-  ! CHECK:      %[[NT_VAL:.*]] = fir.load %[[NT_REF]] : !fir.ref<i32>
-  ! CHECK:      fir.store %[[NT_VAL]] to %[[PRIVATE_NT_REF]] : !fir.ref<i32>
-  ! CHECK:      %[[WS_LB:.*]] = arith.constant 1 : i32
-  ! CHECK:      %[[WS_UB:.*]] = arith.constant 9 : i32
-  ! CHECK:      %[[WS_STEP:.*]] = arith.constant 1 : i32
-  ! CHECK:      omp.wsloop {
-  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
-  !$OMP PARALLEL DO PRIVATE(cond) FIRSTPRIVATE(nt)
-  do i=1, 9
-  ! CHECK:      fir.store %[[I]] to %[[IV_ADDR:.*]] : !fir.ref<i32>
-  ! CHECK:      %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]] : !fir.ref<i32>
-  ! CHECK:      fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-  ! CHECK:      %[[PRIVATE_COND_VAL:.*]] = fir.load %[[PRIVATE_COND_REF]] : !fir.ref<!fir.logical<4>>
-  ! CHECK:      %[[PRIVATE_COND_VAL_CVT:.*]] = fir.convert %[[PRIVATE_COND_VAL]] : (!fir.logical<4>) -> i1
-  ! CHECK:      fir.call @_FortranAioOutputLogical({{.*}}, %[[PRIVATE_COND_VAL_CVT]]) {{.*}}: (!fir.ref<i8>, i1) -> i1
-  ! CHECK:      %[[PRIVATE_NT_VAL:.*]] = fir.load %[[PRIVATE_NT_REF]] : !fir.ref<i32>
-  ! CHECK:      fir.call @_FortranAioOutputInteger32({{.*}}, %[[PRIVATE_NT_VAL]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i, cond, nt
-  end do
-  ! CHECK:      omp.yield
-  ! CHECK:      omp.terminator
-  ! CHECK:      omp.terminator
-  !$OMP END PARALLEL DO
-end subroutine
-
-!===============================================================================
-! Checking for the following construct
-!   !$omp parallel private(...) firstprivate(...)
-!   !$omp do
-!===============================================================================
-
-subroutine parallel_private_do(cond,nt)
-logical :: cond
-  integer :: nt
-  integer :: i
-  !$OMP PARALLEL PRIVATE(cond) FIRSTPRIVATE(nt)
-  !$OMP DO
-  do i=1, 9
-    call foo(i, cond, nt)
-  end do
-  !$OMP END DO
-  !$OMP END PARALLEL
-end subroutine parallel_private_do
-
-! CHECK-LABEL:   func.func @_QPparallel_private_do(
-! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"},
-! CHECK-SAME:                                      %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}) {
-! CHECK:           %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFparallel_private_doEi"}
-! CHECK:           omp.parallel   {
-! CHECK:             %[[I_PRIV:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[COND_ADDR:.*]] = fir.alloca !fir.logical<4> {bindc_name = "cond", pinned, uniq_name = "_QFparallel_private_doEcond"}
-! CHECK:             %[[NT_ADDR:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_private_doEnt"}
-! CHECK:             %[[NT:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-! CHECK:             fir.store %[[NT]] to %[[NT_ADDR]] : !fir.ref<i32>
-! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_8:.*]] = arith.constant 9 : i32
-! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop {
-! CHECK-NEXT:          omp.loop_nest (%[[I:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
-! CHECK:                 fir.store %[[I]] to %[[I_PRIV]] : !fir.ref<i32>
-! CHECK:                 fir.call @_QPfoo(%[[I_PRIV]], %[[COND_ADDR]], %[[NT_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.logical<4>>, !fir.ref<i32>) -> ()
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-!===============================================================================
-! Checking for the following construct
-!   !$omp parallel
-!   !$omp do firstprivate(...) firstprivate(...)
-!===============================================================================
-
-subroutine omp_parallel_multiple_firstprivate_do(a, b)
-  integer::a, b
-  !$OMP PARALLEL FIRSTPRIVATE(a) FIRSTPRIVATE(b)
-  !$OMP DO
-  do i=1, 10
-    call bar(i, a)
-  end do
-  !$OMP END DO
-  !$OMP END PARALLEL
-end subroutine omp_parallel_multiple_firstprivate_do
-
-! CHECK-LABEL:   func.func @_QPomp_parallel_multiple_firstprivate_do(
-! CHECK-SAME:                                                        %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
-! CHECK-SAME:                                                        %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
-! CHECK:           %[[I_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_parallel_multiple_firstprivate_doEi"}
-! CHECK:           omp.parallel   {
-! CHECK:             %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"}
-! CHECK:             %[[A:.*]] = fir.load %[[A_ADDR]] : !fir.ref<i32>
-! CHECK:             fir.store %[[A]] to %[[A_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK:             %[[B_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "b", pinned, uniq_name = "_QFomp_parallel_multiple_firstprivate_doEb"}
-! CHECK:             %[[B:.*]] = fir.load %[[B_ADDR]] : !fir.ref<i32>
-! CHECK:             fir.store %[[B]] to %[[B_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_9:.*]] = arith.constant 10 : i32
-! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop {
-! CHECK-NEXT:          omp.loop_nest (%[[I:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
-! CHECK:                 fir.store %[[I]] to %[[I_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK:                 fir.call @_QPbar(%[[I_PRIV_ADDR]], %[[A_PRIV_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-!===============================================================================
-! Checking for the following construct
-!   !$omp parallel
-!   !$omp do private(...) firstprivate(...)
-!===============================================================================
-
-subroutine parallel_do_private(cond,nt)
-logical :: cond
-  integer :: nt
-  integer :: i
-  !$OMP PARALLEL
-  !$OMP DO PRIVATE(cond) FIRSTPRIVATE(nt)
-  do i=1, 9
-    call foo(i, cond, nt)
-  end do
-  !$OMP END DO
-  !$OMP END PARALLEL
-end subroutine parallel_do_private
-
-! CHECK-LABEL:   func.func @_QPparallel_do_private(
-! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"},
-! CHECK-SAME:                                      %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}) {
-! CHECK:           %[[I_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFparallel_do_privateEi"}
-! CHECK:           omp.parallel   {
-! CHECK:             %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[COND_ADDR:.*]] = fir.alloca !fir.logical<4> {bindc_name = "cond", pinned, uniq_name = "_QFparallel_do_privateEcond"}
-! CHECK:             %[[NT_ADDR:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_do_privateEnt"}
-! CHECK:             %[[NT:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-! CHECK:             fir.store %[[NT]] to %[[NT_ADDR]] : !fir.ref<i32>
-! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_8:.*]] = arith.constant 9 : i32
-! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop {
-! CHECK-NEXT:          omp.loop_nest (%[[I:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
-! CHECK:                 fir.store %[[I]] to %[[I_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK:                 fir.call @_QPfoo(%[[I_PRIV_ADDR]], %[[COND_ADDR]], %[[NT_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.logical<4>>, !fir.ref<i32>) -> ()
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-!===============================================================================
-! Checking for the following construct
-!   !$omp parallel
-!   !$omp do firstprivate(...) firstprivate(...)
-!===============================================================================
-
-subroutine omp_parallel_do_multiple_firstprivate(a, b)
-  integer::a, b
-  !$OMP PARALLEL
-  !$OMP DO FIRSTPRIVATE(a) FIRSTPRIVATE(b)
-  do i=1, 10
-    call bar(i, a)
-  end do
-  !$OMP END DO
-  !$OMP END PARALLEL
-end subroutine omp_parallel_do_multiple_firstprivate
-
-! CHECK-LABEL:   func.func @_QPomp_parallel_do_multiple_firstprivate(
-! CHECK-SAME:                                                        %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
-! CHECK-SAME:                                                        %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
-! CHECK:           %[[I_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_parallel_do_multiple_firstprivateEi"}
-! CHECK:           omp.parallel   {
-! CHECK:             %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"}
-! CHECK:             %[[A:.*]] = fir.load %[[A_ADDR]] : !fir.ref<i32>
-! CHECK:             fir.store %[[A]] to %[[A_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK:             %[[B_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "b", pinned, uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"}
-! CHECK:             %[[B:.*]] = fir.load %[[B_ADDR]] : !fir.ref<i32>
-! CHECK:             fir.store %[[B]] to %[[B_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_9:.*]] = arith.constant 10 : i32
-! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop {
-! CHECK-NEXT:          omp.loop_nest (%[[I:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
-! CHECK:                 fir.store %[[I]] to %[[I_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK:                 fir.call @_QPbar(%[[I_PRIV_ADDR]], %[[A_PRIV_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
diff --git a/flang/test/Lower/OpenMP/FIR/parallel.f90 b/flang/test/Lower/OpenMP/FIR/parallel.f90
deleted file mode 100644
index a2ceb2d939f2..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel.f90
+++ /dev/null
@@ -1,211 +0,0 @@
-! REQUIRES: openmp_runtime
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="LLVMDialect,OMPDialect"
-
-!FIRDialect-LABEL: func @_QPparallel_simple
-subroutine parallel_simple()
-   !OMPDialect: omp.parallel
-!$omp parallel
-   !FIRDialect: fir.call
-   call f1()
-!$omp end parallel
-end subroutine parallel_simple
-
-!===============================================================================
-! `if` clause
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPparallel_if
-subroutine parallel_if(alpha, beta, gamma)
-   integer, intent(in) :: alpha
-   logical, intent(in) :: beta
-   logical(1) :: logical1
-   logical(2) :: logical2
-   logical(4) :: logical4
-   logical(8) :: logical8
-
-   !OMPDialect: omp.parallel if(%{{.*}} : i1) {
-   !$omp parallel if(alpha .le. 0)
-   !FIRDialect: fir.call
-   call f1()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   !OMPDialect: omp.parallel if(%{{.*}} : i1) {
-   !$omp parallel if(.false.)
-   !FIRDialect: fir.call
-   call f2()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   !OMPDialect: omp.parallel if(%{{.*}} : i1) {
-   !$omp parallel if(alpha .ge. 0)
-   !FIRDialect: fir.call
-   call f3()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   !OMPDialect: omp.parallel if(%{{.*}} : i1) {
-   !$omp parallel if(.true.)
-   !FIRDialect: fir.call
-   call f4()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   !OMPDialect: omp.parallel if(%{{.*}} : i1) {
-   !$omp parallel if(beta)
-   !FIRDialect: fir.call
-   call f1()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   !OMPDialect: omp.parallel if(%{{.*}} : i1) {
-   !$omp parallel if(logical1)
-   !FIRDialect: fir.call
-   call f1()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   !OMPDialect: omp.parallel if(%{{.*}} : i1) {
-   !$omp parallel if(logical2)
-   !FIRDialect: fir.call
-   call f1()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   !OMPDialect: omp.parallel if(%{{.*}} : i1) {
-   !$omp parallel if(logical4)
-   !FIRDialect: fir.call
-   call f1()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   !OMPDialect: omp.parallel if(%{{.*}} : i1) {
-   !$omp parallel if(logical8)
-   !FIRDialect: fir.call
-   call f1()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-end subroutine parallel_if
-
-!===============================================================================
-! `num_threads` clause
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPparallel_numthreads
-subroutine parallel_numthreads(num_threads)
-   integer, intent(inout) :: num_threads
-
-   !OMPDialect: omp.parallel num_threads(%{{.*}}: i32) {
-   !$omp parallel num_threads(16)
-   !FIRDialect: fir.call
-   call f1()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   num_threads = 4
-
-   !OMPDialect: omp.parallel num_threads(%{{.*}} : i32) {
-   !$omp parallel num_threads(num_threads)
-   !FIRDialect: fir.call
-   call f2()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-end subroutine parallel_numthreads
-
-!===============================================================================
-! `proc_bind` clause
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPparallel_proc_bind
-subroutine parallel_proc_bind()
-
-   !OMPDialect: omp.parallel proc_bind(master) {
-   !$omp parallel proc_bind(master)
-   !FIRDialect: fir.call
-   call f1()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   !OMPDialect: omp.parallel proc_bind(close) {
-   !$omp parallel proc_bind(close)
-   !FIRDialect: fir.call
-   call f2()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   !OMPDialect: omp.parallel proc_bind(spread) {
-   !$omp parallel proc_bind(spread)
-   !FIRDialect: fir.call
-   call f3()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-end subroutine parallel_proc_bind
-
-!===============================================================================
-! `allocate` clause
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPparallel_allocate
-subroutine parallel_allocate()
-   use omp_lib
-   integer :: x
-   !OMPDialect: omp.parallel allocate(
-   !FIRDialect: %{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>
-   !LLVMDialect: %{{.+}} : i64 -> %{{.+}} : !llvm.ptr
-   !OMPDialect: ) {
-   !$omp parallel allocate(omp_high_bw_mem_alloc: x) private(x)
-   !FIRDialect: arith.addi
-   x = x + 12
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-end subroutine parallel_allocate
-
-!===============================================================================
-! multiple clauses
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPparallel_multiple_clauses
-subroutine parallel_multiple_clauses(alpha, num_threads)
-   use omp_lib
-   integer, intent(inout) :: alpha
-   integer, intent(in) :: num_threads
-
-   !OMPDialect: omp.parallel if({{.*}} : i1) proc_bind(master) {
-   !$omp parallel if(alpha .le. 0) proc_bind(master)
-   !FIRDialect: fir.call
-   call f1()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   !OMPDialect: omp.parallel num_threads({{.*}} : i32) proc_bind(close) {
-   !$omp parallel proc_bind(close) num_threads(num_threads)
-   !FIRDialect: fir.call
-   call f2()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   !OMPDialect: omp.parallel if({{.*}} : i1) num_threads({{.*}} : i32) {
-   !$omp parallel num_threads(num_threads) if(alpha .le. 0)
-   !FIRDialect: fir.call
-   call f3()
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-   !OMPDialect: omp.parallel if({{.*}} : i1) num_threads({{.*}} : i32) allocate(
-   !FIRDialect: %{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>
-   !LLVMDialect: %{{.+}} : i64 -> %{{.+}} : !llvm.ptr
-   !OMPDialect: ) {
-   !$omp parallel num_threads(num_threads) if(alpha .le. 0) allocate(omp_high_bw_mem_alloc: alpha) private(alpha)
-   !FIRDialect: fir.call
-   call f3()
-   !FIRDialect: arith.addi
-   alpha = alpha + 12
-   !OMPDialect: omp.terminator
-   !$omp end parallel
-
-end subroutine parallel_multiple_clauses
diff --git a/flang/test/Lower/OpenMP/FIR/pre-fir-tree-loop.f90 b/flang/test/Lower/OpenMP/FIR/pre-fir-tree-loop.f90
deleted file mode 100644
index eca8fb304986..000000000000
--- a/flang/test/Lower/OpenMP/FIR/pre-fir-tree-loop.f90
+++ /dev/null
@@ -1,70 +0,0 @@
-! RUN: bbc -fopenmp -pft-test -o %t %s | FileCheck %s
-! RUN: %flang_fc1 -fopenmp -fdebug-dump-pft -o %t %s | FileCheck %s
-
-! Loop constructs always have an `end do` which can be the target of
-! a branch. So OpenMP loop constructs do not need an artificial
-! continue inserted for a target.
-
-!CHECK-LABEL: sb0
-!CHECK-NOT: continue
-subroutine sb0(cond)
-  implicit none
-  logical :: cond
-  integer :: i
-  !$omp parallel do
-  do i = 1, 20
-    if( cond) then
-      cycle
-    end if
-  end do
-  return
-end subroutine
-
-!CHECK-LABEL: sb1
-!CHECK-NOT: continue
-subroutine sb1(cond)
-  implicit none
-  logical :: cond
-  integer :: i
-  !$omp parallel do
-  do i = 1, 20
-    if( cond) then
-      cycle
-    end if
-  end do
-  !$omp end parallel do
-  return
-end subroutine
-
-!CHECK-LABEL: sb2
-!CHECK-NOT: continue
-subroutine sb2
-  integer :: i, n
-  integer :: tmp
-
-  !$omp parallel do
-  do ifld=1,n
-     do isum=1,n
-       if (tmp > n) then
-         exit
-       endif
-     enddo
-     tmp = n
-  enddo
-end subroutine
-
-!CHECK-LABEL: sb3
-!CHECK-NOT: continue
-subroutine sb3
-  integer :: i, n
-  integer :: tmp
-
-  !$omp parallel do
-  do ifld=1,n
-     do isum=1,n
-       if (tmp > n) then
-         exit
-       endif
-     enddo
-  enddo
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/pre-fir-tree01.f90 b/flang/test/Lower/OpenMP/FIR/pre-fir-tree01.f90
deleted file mode 100644
index fc817942513e..000000000000
--- a/flang/test/Lower/OpenMP/FIR/pre-fir-tree01.f90
+++ /dev/null
@@ -1,19 +0,0 @@
-! RUN: bbc -fopenmp -pft-test -o %t %s | FileCheck %s
-! RUN: %flang_fc1 -fopenmp -fdebug-dump-pft -o %t %s | FileCheck %s
-
-! Test structure of the Pre-FIR tree with OpenMP
-
-subroutine sub1(a, b, n)
-  real :: a(:), b(:)
-  integer :: n, i
-  !$omp parallel do
-  do i = 1, n
-    b(i) = exp(a(i))
-  end do
-  !$omp end parallel do
-end subroutine
-
-! CHECK-LABEL: Subroutine sub1
-! CHECK:       <<OpenMPConstruct>>
-! CHECK:       <<DoConstruct>>
-! CHECK:       <<End OpenMPConstruct>>
diff --git a/flang/test/Lower/OpenMP/FIR/private-commonblock.f90 b/flang/test/Lower/OpenMP/FIR/private-commonblock.f90
deleted file mode 100644
index 90036e0c0c7e..000000000000
--- a/flang/test/Lower/OpenMP/FIR/private-commonblock.f90
+++ /dev/null
@@ -1,109 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!CHECK: func.func @_QPprivate_common() {
-!CHECK: omp.parallel {
-!CHECK: %[[X:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFprivate_commonEx"}
-!CHECK: %[[Y:.*]] = fir.alloca f32 {bindc_name = "y", pinned, uniq_name = "_QFprivate_commonEy"}
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-subroutine private_common
-  common /c/ x, y
-  real x, y
-  !$omp parallel private(/c/)
-  !$omp end parallel
-end subroutine
-
-!CHECK: %[[val_0:.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<74xi8>>
-!CHECK: %[[val_1:.*]] = fir.convert %0 : (!fir.ref<!fir.array<74xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_2:.*]] = fir.coordinate_of %[[val_1]], %[[val_c0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_3:.*]] = fir.convert %[[val_2]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_4:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<74xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4:.*]] = arith.constant 4 : index
-!CHECK: %[[val_5:.*]] = fir.coordinate_of %[[val_4]], %[[val_c4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_6:.*]] = fir.convert %[[val_5]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<10xf32>>
-!CHECK: %[[val_7:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<74xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c44:.*]] = arith.constant 44 : index
-!CHECK: %[[val_8:.*]] = fir.coordinate_of %[[val_7]], %[[val_c44]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_9:.*]] = fir.convert %[[val_8]] : (!fir.ref<i8>) -> !fir.ref<!fir.char<1,5>>
-!CHECK: %[[val_c5:.*]] = arith.constant 5 : index
-!CHECK: %[[val_10:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<74xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c49:.*]] = arith.constant 49 : index
-!CHECK: %[[val_11:.*]] = fir.coordinate_of %[[val_10]], %[[val_c49]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_12:.*]] = fir.convert %[[val_11]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<5x!fir.char<1,5>>>
-!CHECK: %[[val_c5_0:.*]] = arith.constant 5 : index
-!CHECK: %[[val_14:.*]] = fir.emboxchar %[[val_9]], %[[val_c5]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
-!CHECK: %[[val_15:.*]] = fir.convert %[[val_12]] : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,?>>
-!CHECK: %[[val_16:.*]] = fir.emboxchar %[[val_15]], %[[val_c5_0]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
-!CHECK: fir.call @_QPsub1(%[[val_3]], %[[val_6]], %[[val_14]], %[[val_16]]) fastmath<contract> : (!fir.ref<i32>, !fir.ref<!fir.array<10xf32>>, !fir.boxchar<1>, !fir.boxchar<1>) -> ()
-!CHECK: omp.parallel {
-!CHECK: %[[val_21:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFprivate_clause_commonblockEa"}
-!CHECK: %[[val_22:.*]] = fir.alloca !fir.array<10xf32> {bindc_name = "b", pinned, uniq_name = "_QFprivate_clause_commonblockEb"}
-!CHECK: %[[val_23:.*]] = fir.alloca !fir.char<1,5> {bindc_name = "c", pinned, uniq_name = "_QFprivate_clause_commonblockEc"}
-!CHECK: %[[val_24:.*]] = fir.alloca !fir.array<5x!fir.char<1,5>> {bindc_name = "d", pinned, uniq_name = "_QFprivate_clause_commonblockEd"}
-!CHECK: %[[val_26:.*]] = fir.emboxchar %[[val_23]], %[[val_c5]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
-!CHECK: %[[val_27:.*]] = fir.convert %[[val_24]] : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,?>>
-!CHECK: %[[val_28:.*]] = fir.emboxchar %[[val_27]], %[[val_c5_0]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
-!CHECK: fir.call @_QPsub2(%[[val_21]], %[[val_22]], %[[val_26]], %[[val_28]]) fastmath<contract> : (!fir.ref<i32>, !fir.ref<!fir.array<10xf32>>, !fir.boxchar<1>, !fir.boxchar<1>) -> ()
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: %[[val_18:.*]] = fir.emboxchar %[[val_9]], %[[val_c5]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
-!CHECK: %[[val_19:.*]] = fir.convert %[[val_12]] : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,?>>
-!CHECK: %[[val_20:.*]] = fir.emboxchar %[[val_19]], %[[val_c5_0]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
-!CHECK: fir.call @_QPsub3(%[[val_3]], %[[val_6]], %[[val_18]], %[[val_20]]) fastmath<contract> : {{.*}}
-!CHECK: return
-!CHECK: }
-subroutine private_clause_commonblock()
-  integer::a
-  real::b(10)
-  character(5):: c, d(5)
-  common /blk/ a, b, c, d
-  
-  call sub1(a, b, c, d)
-  !$omp parallel private(/blk/)
-        call sub2(a, b, c, d)
-  !$omp end parallel
-  call sub3(a, b, c, d)
-end subroutine
-
-!CHECK: func.func @_QPprivate_clause_commonblock_pointer() {
-!CHECK: %[[val_0:.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<74xi8>>
-!CHECK: %[[val_1:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<74xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c24:.*]] = arith.constant 24 : index
-!CHECK: %[[val_2:.*]] = fir.coordinate_of %[[val_1]], %[[val_c24]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_3:.*]] = fir.convert %[[val_2]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_4:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<74xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_5:.*]] = fir.coordinate_of %[[val_4]], %[[val_c0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_6:.*]] = fir.convert %[[val_5]] : (!fir.ref<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK: %[[val_7:.*]] = fir.load %[[val_6]] : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK: %[[val_8:.*]] = fir.box_addr %[[val_7]] : (!fir.box<!fir.ptr<!fir.complex<4>>>) -> !fir.ptr<!fir.complex<4>>
-!CHECK: %[[val_9:.*]] = fir.convert %[[val_8]] : (!fir.ptr<!fir.complex<4>>) -> !fir.ref<!fir.complex<4>>
-!CHECK: fir.call @_QPsub4(%[[val_9]], %[[val_3]]) fastmath<contract> : (!fir.ref<!fir.complex<4>>, !fir.ref<i32>) -> ()
-!CHECK: omp.parallel {
-!CHECK: %[[val_13:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.complex<4>>> {bindc_name = "c", pinned, uniq_name = "_QFprivate_clause_commonblock_pointerEc"}
-!CHECK: %[[val_14:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFprivate_clause_commonblock_pointerEa"}
-!CHECK: %[[val_15:.*]] = fir.load %[[val_13]] : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK: %[[val_16:.*]] = fir.box_addr %[[val_15]] : (!fir.box<!fir.ptr<!fir.complex<4>>>) -> !fir.ptr<!fir.complex<4>>
-!CHECK: %[[val_17:.*]] = fir.convert %[[val_16]] : (!fir.ptr<!fir.complex<4>>) -> !fir.ref<!fir.complex<4>>
-!CHECK: fir.call @_QPsub5(%[[val_17]], %[[val_14]]) fastmath<contract> : (!fir.ref<!fir.complex<4>>, !fir.ref<i32>) -> ()
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: %[[val_10:.*]] = fir.load %[[val_6]] : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK: %[[val_11:.*]] = fir.box_addr %[[val_10]] : (!fir.box<!fir.ptr<!fir.complex<4>>>) -> !fir.ptr<!fir.complex<4>>
-!CHECK: %[[val_12:.*]] = fir.convert %[[val_11]] : (!fir.ptr<!fir.complex<4>>) -> !fir.ref<!fir.complex<4>>
-!CHECK: fir.call @_QPsub6(%[[val_12]], %[[val_3]]) fastmath<contract> : (!fir.ref<!fir.complex<4>>, !fir.ref<i32>) -> ()
-!CHECK: return
-!CHECK: }
-subroutine private_clause_commonblock_pointer()
-  complex, pointer :: c
-  integer:: a
-  common /blk/ c, a
-  call sub4(c, a)
-  !$omp parallel private(/blk/)
-        call sub5(c, a)
-  !$omp end parallel
-  call sub6(c, a)
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/requires-common.f90 b/flang/test/Lower/OpenMP/FIR/requires-common.f90
deleted file mode 100644
index 2e112d72de3f..000000000000
--- a/flang/test/Lower/OpenMP/FIR/requires-common.f90
+++ /dev/null
@@ -1,19 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -emit-fir %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -fopenmp-is-target-device -emit-fir %s -o - | FileCheck %s
-
-! This test checks the lowering of requires into MLIR
-
-!CHECK:      module attributes {
-!CHECK-SAME: omp.requires = #omp<clause_requires unified_shared_memory>
-block data init
-  !$omp requires unified_shared_memory
-  integer :: x
-  common /block/ x
-  data x / 10 /
-end
-
-subroutine f
-  !$omp declare target
-end subroutine f
diff --git a/flang/test/Lower/OpenMP/FIR/requires-notarget.f90 b/flang/test/Lower/OpenMP/FIR/requires-notarget.f90
deleted file mode 100644
index bfa509208428..000000000000
--- a/flang/test/Lower/OpenMP/FIR/requires-notarget.f90
+++ /dev/null
@@ -1,14 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -emit-fir %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -fopenmp-is-target-device -emit-fir %s -o - | FileCheck %s
-
-! This test checks that requires lowering into MLIR skips creating the
-! omp.requires attribute with target-related clauses if there are no device
-! functions in the compilation unit
-
-!CHECK:      module attributes {
-!CHECK-NOT:  omp.requires
-program requires
-  !$omp requires unified_shared_memory reverse_offload atomic_default_mem_order(seq_cst)
-end program requires
diff --git a/flang/test/Lower/OpenMP/FIR/requires.f90 b/flang/test/Lower/OpenMP/FIR/requires.f90
deleted file mode 100644
index bc53931b9f24..000000000000
--- a/flang/test/Lower/OpenMP/FIR/requires.f90
+++ /dev/null
@@ -1,14 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -emit-fir %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -fopenmp-is-target-device -emit-fir %s -o - | FileCheck %s
-
-! This test checks the lowering of requires into MLIR
-
-!CHECK:      module attributes {
-!CHECK-SAME: omp.requires = #omp<clause_requires reverse_offload|unified_shared_memory>
-program requires
-  !$omp requires unified_shared_memory reverse_offload atomic_default_mem_order(seq_cst)
-  !$omp target
-  !$omp end target
-end program requires
diff --git a/flang/test/Lower/OpenMP/FIR/rtl-flags.f90 b/flang/test/Lower/OpenMP/FIR/rtl-flags.f90
deleted file mode 100644
index ad8eb9e73213..000000000000
--- a/flang/test/Lower/OpenMP/FIR/rtl-flags.f90
+++ /dev/null
@@ -1,39 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEFAULT-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=45 %s -o - | FileCheck %s --check-prefix=DEFAULT-HOST-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device -fopenmp-version=45  %s -o - | FileCheck %s --check-prefix=DEFAULT-DEVICE-FIR-VERSION
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=45 %s -o - | FileCheck %s --check-prefix=DEFAULT-HOST-FIR-VERSION
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-target-debug -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DBG-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-target-debug=111 -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DBG-EQ-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-assume-teams-oversubscription -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=TEAMS-OSUB-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-assume-threads-oversubscription -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=THREAD-OSUB-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-assume-no-thread-state -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=THREAD-STATE-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-assume-no-nested-parallelism -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=NEST-PAR-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-target-debug -fopenmp-assume-teams-oversubscription -fopenmp-assume-no-nested-parallelism -fopenmp-assume-threads-oversubscription -fopenmp-assume-no-thread-state -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=ALL-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=DEFAULT-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-is-target-device -fopenmp-version=45 -o - %s | FileCheck %s --check-prefix=DEFAULT-DEVICE-FIR-VERSION
-!RUN: bbc -emit-fir -fopenmp -o - %s | FileCheck %s --check-prefix=DEFAULT-HOST-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-version=45 -o - %s | FileCheck %s --check-prefix=DEFAULT-HOST-FIR-VERSION
-!RUN: bbc -emit-fir -fopenmp -fopenmp-target-debug=111 -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=DBG-EQ-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-assume-teams-oversubscription -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=TEAMS-OSUB-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-assume-threads-oversubscription -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=THREAD-OSUB-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-assume-no-thread-state -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=THREAD-STATE-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-assume-no-nested-parallelism -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=NEST-PAR-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-target-debug=1 -fopenmp-assume-teams-oversubscription -fopenmp-assume-no-nested-parallelism -fopenmp-assume-threads-oversubscription -fopenmp-assume-no-thread-state -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=ALL-DEVICE-FIR
-
-!DEFAULT-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<openmp_device_version = 11>
-!DEFAULT-DEVICE-FIR-SAME: omp.is_target_device = true
-!DEFAULT-DEVICE-FIR-VERSION: module attributes {{{.*}}omp.flags = #omp.flags<openmp_device_version = 45>
-!DEFAULT-DEVICE-FIR-VERSION-SAME: omp.is_target_device = true
-!DEFAULT-DEVICE-FIR-VERSION-SAME: omp.version = #omp.version<version = 45>
-!DEFAULT-HOST-FIR: module attributes {{{.*}}omp.is_target_device = false{{.*}}
-!DEFAULT-HOST-FIR-VERSION: module attributes {{{.*}}omp.is_target_device = false
-!DEFAULT-HOST-FIR-VERSION-SAME: omp.version = #omp.version<version = 45>
-!DBG-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<debug_kind = 1, openmp_device_version = 11>
-!DBG-EQ-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<debug_kind = 111, openmp_device_version = 11>
-!TEAMS-OSUB-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<assume_teams_oversubscription = true, openmp_device_version = 11>
-!THREAD-OSUB-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<assume_threads_oversubscription = true, openmp_device_version = 11>
-!THREAD-STATE-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<assume_no_thread_state = true, openmp_device_version = 11>
-!NEST-PAR-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<assume_no_nested_parallelism = true, openmp_device_version = 11>
-!ALL-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<debug_kind = 1, assume_teams_oversubscription = true, assume_threads_oversubscription = true, assume_no_thread_state = true, assume_no_nested_parallelism = true, openmp_device_version = 11>
-subroutine omp_subroutine()
-end subroutine omp_subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/sections-pft.f90 b/flang/test/Lower/OpenMP/FIR/sections-pft.f90
deleted file mode 100644
index 7b20a87022c9..000000000000
--- a/flang/test/Lower/OpenMP/FIR/sections-pft.f90
+++ /dev/null
@@ -1,91 +0,0 @@
-! RUN: %flang_fc1 -fdebug-pre-fir-tree -fopenmp %s | FileCheck %s
-
-subroutine openmp_sections(x, y)
-
-  integer, intent(inout)::x, y
-
-!==============================================================================
-! empty construct
-!==============================================================================
-!$omp sections
-!$omp end sections
-
-!CHECK: OpenMPConstruct
-!CHECK: End OpenMPConstruct
-
-!==============================================================================
-! single section, without `!$omp section`
-!==============================================================================
-!$omp sections
-    call F1()
-!$omp end sections
-
-!CHECK: OpenMPConstruct
-!CHECK:  OpenMPConstruct
-!CHECK:   CallStmt
-!CHECK:  End OpenMPConstruct
-!CHECK: End OpenMPConstruct
-
-!==============================================================================
-! single section with `!$omp section`
-!==============================================================================
-!$omp sections
-  !$omp section
-    call F1
-!$omp end sections
-
-!CHECK: OpenMPConstruct
-!CHECK:  OpenMPConstruct
-!CHECK:   CallStmt
-!CHECK:  End OpenMPConstruct
-!CHECK: End OpenMPConstruct
-
-!==============================================================================
-! multiple sections
-!==============================================================================
-!$omp sections
-  !$omp section
-    call F1
-  !$omp section
-    call F2
-  !$omp section
-    call F3
-!$omp end sections
-
-!CHECK: OpenMPConstruct
-!CHECK:  OpenMPConstruct
-!CHECK:   CallStmt
-!CHECK:  End OpenMPConstruct
-!CHECK:  OpenMPConstruct
-!CHECK:   CallStmt
-!CHECK:  End OpenMPConstruct
-!CHECK:  OpenMPConstruct
-!CHECK:   CallStmt
-!CHECK:  End OpenMPConstruct
-!CHECK: End OpenMPConstruct
-
-!==============================================================================
-! multiple sections with clauses
-!==============================================================================
-!$omp sections PRIVATE(x) FIRSTPRIVATE(y)
-  !$omp section
-    call F1
-  !$omp section
-    call F2
-  !$omp section
-    call F3
-!$omp end sections NOWAIT
-
-!CHECK: OpenMPConstruct
-!CHECK:  OpenMPConstruct
-!CHECK:   CallStmt
-!CHECK:  End OpenMPConstruct
-!CHECK:  OpenMPConstruct
-!CHECK:   CallStmt
-!CHECK:  End OpenMPConstruct
-!CHECK:  OpenMPConstruct
-!CHECK:   CallStmt
-!CHECK:  End OpenMPConstruct
-!CHECK: End OpenMPConstruct
-
-end subroutine openmp_sections
diff --git a/flang/test/Lower/OpenMP/FIR/sections.f90 b/flang/test/Lower/OpenMP/FIR/sections.f90
deleted file mode 100644
index 7b313f3dc0b4..000000000000
--- a/flang/test/Lower/OpenMP/FIR/sections.f90
+++ /dev/null
@@ -1,288 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! This test checks the lowering of OpenMP sections construct with several clauses present
-
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK: func @_QQmain() attributes {fir.bindc_name = "sample"} {
-!CHECK:   %[[COUNT:.*]] = fir.address_of(@_QFEcount) : !fir.ref<i32>
-!CHECK:   %[[ETA:.*]] = fir.alloca f32 {bindc_name = "eta", uniq_name = "_QFEeta"}
-!CHECK:   %[[CONST_1:.*]] = arith.constant 4 : i64
-!CHECK:   omp.sections allocate(%[[CONST_1]] : i64 -> %0 : !fir.ref<i32>)  {
-!CHECK:     omp.section {
-!CHECK:       %[[PRIVATE_ETA:.*]] = fir.alloca f32 {bindc_name = "eta", pinned, uniq_name = "_QFEeta"}
-!CHECK:       %[[PRIVATE_DOUBLE_COUNT:.*]] = fir.alloca i32 {bindc_name = "double_count", pinned, uniq_name = "_QFEdouble_count"} 
-!CHECK:       %[[const:.*]] = arith.constant 5 : i32
-!CHECK:       fir.store %[[const]] to %[[COUNT]] : !fir.ref<i32>
-!CHECK:       %[[temp_count:.*]] = fir.load %[[COUNT]] : !fir.ref<i32>
-!CHECK:       %[[temp_double_count:.*]] = fir.load %[[PRIVATE_DOUBLE_COUNT]] : !fir.ref<i32>
-!CHECK:       %[[result:.*]] = arith.muli %[[temp_count]], %[[temp_double_count]] : i32
-!CHECK:       {{.*}} = fir.convert %[[result]] : (i32) -> f32
-!CHECK:       fir.store {{.*}} to %[[PRIVATE_ETA]] : !fir.ref<f32>
-!CHECK:       omp.terminator
-!CHECK:     }
-!CHECK:     omp.section {
-!CHECK:       %[[PRIVATE_ETA:.*]] = fir.alloca f32 {bindc_name = "eta", pinned, uniq_name = "_QFEeta"}
-!CHECK:       %[[PRIVATE_DOUBLE_COUNT:.*]] = fir.alloca i32 {bindc_name = "double_count", pinned, uniq_name = "_QFEdouble_count"} 
-!CHECK:       %[[temp:.*]] = fir.load %[[PRIVATE_DOUBLE_COUNT]] : !fir.ref<i32>
-!CHECK:       %[[const:.*]] = arith.constant 1 : i32
-!CHECK:       %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
-!CHECK:       fir.store %[[result]] to %[[PRIVATE_DOUBLE_COUNT]] : !fir.ref<i32>
-!CHECK:       omp.terminator
-!CHECK:     }
-!CHECK:     omp.section {
-!CHECK:       %[[PRIVATE_ETA:.*]] = fir.alloca f32 {bindc_name = "eta", pinned, uniq_name = "_QFEeta"}
-!CHECK:       %[[PRIVATE_DOUBLE_COUNT:.*]] = fir.alloca i32 {bindc_name = "double_count", pinned, uniq_name = "_QFEdouble_count"} 
-!CHECK:       %[[temp:.*]] = fir.load %[[PRIVATE_ETA]] : !fir.ref<f32>
-!CHECK:       %[[const:.*]] = arith.constant 7.000000e+00 : f32
-!CHECK:       %[[result:.*]] = arith.subf %[[temp]], %[[const]] {{.*}}: f32
-!CHECK:       fir.store %[[result]] to %[[PRIVATE_ETA]] : !fir.ref<f32>
-!CHECK:       {{.*}} = fir.load %[[COUNT]] : !fir.ref<i32>
-!CHECK:       %[[temp_count:.*]] = fir.convert {{.*}} : (i32) -> f32
-!CHECK:       %[[temp_eta:.*]] = fir.load %[[PRIVATE_ETA]] : !fir.ref<f32>
-!CHECK:       {{.*}} = arith.mulf %[[temp_count]], %[[temp_eta]] {{.*}}: f32
-!CHECK:       %[[result:.*]] = fir.convert {{.*}} : (f32) -> i32
-!CHECK:       fir.store %[[result]] to %[[COUNT]] : !fir.ref<i32>
-!CHECK:       {{.*}} = fir.load %[[COUNT]] : !fir.ref<i32>
-!CHECK:       %[[temp_count:.*]] = fir.convert {{.*}} : (i32) -> f32
-!CHECK:       %[[temp_eta:.*]] = fir.load %[[PRIVATE_ETA]] : !fir.ref<f32>
-!CHECK:       {{.*}} = arith.subf %[[temp_count]], %[[temp_eta]] {{.*}}: f32
-!CHECK:       %[[result:.*]] = fir.convert {{.*}} : (f32) -> i32
-!CHECK:       fir.store %[[result]] to %[[PRIVATE_DOUBLE_COUNT]] : !fir.ref<i32>
-!CHECK:       omp.terminator
-!CHECK:     }
-!CHECK:     omp.terminator
-!CHECK:   }
-!CHECK:   omp.sections nowait {
-!CHECK:     omp.terminator
-!CHECK:   }
-!CHECK:   return
-!CHECK: }
-
-program sample
-    use omp_lib
-    integer :: count = 0, double_count = 1
-    !$omp sections private (eta, double_count) allocate(omp_high_bw_mem_alloc: count)
-        !$omp section
-            count = 1 + 4
-            eta = count * double_count
-        !$omp section
-            double_count = double_count + 1
-        !$omp section
-            eta = eta - 7
-            count = count * eta
-            double_count = count - eta
-    !$omp end sections
-
-    !$omp sections
-    !$omp end sections nowait
-end program sample
-
-!CHECK: func @_QPfirstprivate(%[[ARG:.*]]: !fir.ref<f32> {fir.bindc_name = "alpha"}) {
-!CHECK:   omp.sections {
-!CHECK:     omp.section  {
-!CHECK:         %[[PRIVATE_ALPHA:.*]] = fir.alloca f32 {bindc_name = "alpha", pinned, uniq_name = "_QFfirstprivateEalpha"}
-!CHECK:         %[[temp:.*]] = fir.load %[[ARG]] : !fir.ref<f32>
-!CHECK:         fir.store %[[temp]] to %[[PRIVATE_ALPHA]] : !fir.ref<f32>
-!CHECK:       omp.terminator
-!CHECK:     }
-!CHECK:     omp.terminator
-!CHECK:   }
-!CHECK:   omp.sections {
-!CHECK:     omp.section  {
-!CHECK:       %[[PRIVATE_VAR:.*]] = fir.load %[[ARG]] : !fir.ref<f32>
-!CHECK:       %[[CONSTANT:.*]] = arith.constant 5.000000e+00 : f32
-!CHECK:       %[[PRIVATE_VAR_2:.*]] = arith.mulf %[[PRIVATE_VAR]], %[[CONSTANT]] {{.*}}: f32
-!CHECK:       fir.store %[[PRIVATE_VAR_2]] to %[[ARG]] : !fir.ref<f32>
-!CHECK:       omp.terminator
-!CHECK:     }
-!CHECK:     omp.terminator
-!CHECK:   }
-!CHECK:   return
-!CHECK: }
-
-subroutine firstprivate(alpha)
-    real :: alpha
-    !$omp sections firstprivate(alpha)
-    !$omp end sections
-
-    !$omp sections
-        alpha = alpha * 5
-    !$omp end sections
-end subroutine
-
-subroutine lastprivate()
-	integer :: x
-!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFlastprivateEx"}
-!CHECK: omp.sections   {
-	!$omp sections lastprivate(x)
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[const:.*]] = arith.constant 10 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.muli %c10_i32, %[[temp]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-        !$omp section
-            x = x * 10
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 1 : i32
-!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[true:.*]] = arith.constant true
-!CHECK: fir.if %[[true]] {
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref<i32>
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-        !$omp section
-            x = x + 1
-!CHECK: omp.terminator
-!CHECK: }
-    !$omp end sections
-    
-!CHECK: omp.sections   {
-    !$omp sections firstprivate(x) lastprivate(x)
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: %[[const:.*]] = arith.constant 10 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.muli %c10_i32, %[[temp]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-        !$omp section
-            x = x * 10
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 1 : i32
-!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[true:.*]] = arith.constant true
-!CHECK: fir.if %true {
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref<i32>
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-        !$omp section
-            x = x + 1
-!CHECK: omp.terminator
-!CHECK: }
-    !$omp end sections
-
-!CHECK: omp.sections nowait {
-    !$omp sections firstprivate(x) lastprivate(x)
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: %[[const:.*]] = arith.constant 10 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.muli %c10_i32, %[[temp]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-        !$omp section
-            x = x * 10
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 1 : i32
-!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[true:.*]] = arith.constant true
-!CHECK: fir.if %true {
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-        !$omp section
-            x = x + 1
-!CHECK: omp.terminator
-!CHECK: }
-    !$omp end sections nowait
-
-!CHECK: omp.sections {
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: cf.br ^bb1
-!CHECK: ^bb1:  // pred: ^bb0
-!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 1 : i32
-!CHECK: %[[result:.*]] = arith.addi %[[INNER_PRIVATE_X]], %[[const]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[loaded_value:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[loaded_value]] to %[[X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-
-    !$omp sections lastprivate(x)
-        !$omp section
-                goto 30
-        30  x = x + 1
-    !$omp end sections
-end subroutine
-
-subroutine unstructured_sections_privatization()
-!CHECK: %[[X:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFunstructured_sections_privatizationEx"}
-!CHECK: omp.sections {
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFunstructured_sections_privatizationEx"}
-!CHECK: cf.br ^bb1
-!CHECK: ^bb1:  // pred: ^bb0
-!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<f32>
-!CHECK: %[[constant:.*]] = arith.constant 1.000000e+00 : f32
-!CHECK: %[[result:.*]] = arith.addf %[[INNER_PRIVATE_X]], %[[constant]] fastmath<contract> : f32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<f32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-    !$omp sections private(x)
-        !$omp section
-            goto 40
-        40  x = x + 1
-    !$omp end sections
-!CHECK: omp.sections {
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFunstructured_sections_privatizationEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<f32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<f32>
-!CHECK: cf.br ^bb1
-!CHECK: ^bb1:  // pred: ^bb0
-!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<f32>
-!CHECK: %[[constant:.*]] = arith.constant 1.000000e+00 : f32
-!CHECK: %[[result:.*]] = arith.addf %[[INNER_PRIVATE_X]], %[[constant]] fastmath<contract> : f32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<f32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-    !$omp sections firstprivate(x)
-        !$omp section
-            goto 50
-        50  x = x + 1
-    !$omp end sections
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/simd.f90 b/flang/test/Lower/OpenMP/FIR/simd.f90
deleted file mode 100644
index db7d30295c45..000000000000
--- a/flang/test/Lower/OpenMP/FIR/simd.f90
+++ /dev/null
@@ -1,175 +0,0 @@
-! Tests for 2.9.3.1 Simd
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-!CHECK-LABEL: func @_QPsimd()
-subroutine simd
-  integer :: i
-  !$OMP SIMD
-  ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
-  ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32
-  ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK-NEXT: omp.simd {
-  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
-  do i=1, 9
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_if_clause
-subroutine simd_with_if_clause(n, threshold)
-  integer :: i, n, threshold
-  !$OMP SIMD IF( n .GE. threshold )
-  ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
-  ! CHECK: %[[UB:.*]] = fir.load %arg0
-  ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: %[[COND:.*]] = arith.cmpi sge
-  ! CHECK: omp.simd if(%[[COND:.*]]) {
-  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
-  do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_simdlen_clause
-subroutine simd_with_simdlen_clause(n, threshold)
-  integer :: i, n, threshold
-  !$OMP SIMD SIMDLEN(2)
-  ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
-  ! CHECK: %[[UB:.*]] = fir.load %arg0
-  ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd simdlen(2) {
-  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
-  do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_param
-subroutine simd_with_simdlen_clause_from_param(n, threshold)
-  integer :: i, n, threshold
-  integer, parameter :: simdlen = 2;
-  !$OMP SIMD SIMDLEN(simdlen)
-  ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
-  ! CHECK: %[[UB:.*]] = fir.load %arg0
-  ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd simdlen(2) {
-  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
-  do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_expr_from_param
-subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold)
-  integer :: i, n, threshold
-  integer, parameter :: simdlen = 2;
-  !$OMP SIMD SIMDLEN(simdlen*2 + 2)
-  ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
-  ! CHECK: %[[UB:.*]] = fir.load %arg0
-  ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd simdlen(6) {
-  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
-  do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_safelen_clause
-subroutine simd_with_safelen_clause(n, threshold)
-  integer :: i, n, threshold
-  !$OMP SIMD SAFELEN(2)
-  ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
-  ! CHECK: %[[UB:.*]] = fir.load %arg0
-  ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd safelen(2) {
-  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
-  do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_safelen_clause_from_expr_from_param
-subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold)
-  integer :: i, n, threshold
-  integer, parameter :: safelen = 2;
-  !$OMP SIMD SAFELEN(safelen*2 + 2)
-  ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
-  ! CHECK: %[[UB:.*]] = fir.load %arg0
-  ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd safelen(6) {
-  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
-  do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_simdlen_safelen_clause
-subroutine simd_with_simdlen_safelen_clause(n, threshold)
-  integer :: i, n, threshold
-  !$OMP SIMD SIMDLEN(1) SAFELEN(2)
-  ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
-  ! CHECK: %[[UB:.*]] = fir.load %arg0
-  ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd simdlen(1) safelen(2) {
-  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
-  do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_collapse_clause
-subroutine simd_with_collapse_clause(n)
-  integer :: i, j, n
-  integer :: A(n,n)
-  ! CHECK: %[[LOWER_I:.*]] = arith.constant 1 : i32
-  ! CHECK: %[[UPPER_I:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref<i32>
-  ! CHECK: %[[STEP_I:.*]] = arith.constant 1 : i32
-  ! CHECK: %[[LOWER_J:.*]] = arith.constant 1 : i32
-  ! CHECK: %[[UPPER_J:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref<i32>
-  ! CHECK: %[[STEP_J:.*]] = arith.constant 1 : i32
-  ! CHECK: omp.simd {
-  ! CHECK-NEXT: omp.loop_nest (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = (
-  ! CHECK-SAME:               %[[LOWER_I]], %[[LOWER_J]]) to (
-  ! CHECK-SAME:               %[[UPPER_I]], %[[UPPER_J]]) inclusive step (
-  ! CHECK-SAME:               %[[STEP_I]], %[[STEP_J]]) {
-  !$OMP SIMD COLLAPSE(2)
-  do i = 1, n
-    do j = 1, n
-       A(i,j) = i + j
-    end do
-  end do
-  !$OMP END SIMD
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/simple-barrier.f90 b/flang/test/Lower/OpenMP/FIR/simple-barrier.f90
deleted file mode 100644
index c621b8062eaa..000000000000
--- a/flang/test/Lower/OpenMP/FIR/simple-barrier.f90
+++ /dev/null
@@ -1,6 +0,0 @@
-! RUN: bbc -fopenmp -emit-fir -o - %s | FileCheck %s
-
-subroutine sample()
-! CHECK: omp.barrier
-!$omp barrier
-end subroutine sample
diff --git a/flang/test/Lower/OpenMP/FIR/single.f90 b/flang/test/Lower/OpenMP/FIR/single.f90
deleted file mode 100644
index 65ae07c2c284..000000000000
--- a/flang/test/Lower/OpenMP/FIR/single.f90
+++ /dev/null
@@ -1,123 +0,0 @@
-! REQUIRES: openmp_runtime
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-!RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-
-!===============================================================================
-! Single construct
-!===============================================================================
-
-!CHECK-LABEL: func @_QPomp_single
-!CHECK-SAME: (%[[x:.*]]: !fir.ref<i32> {fir.bindc_name = "x"})
-subroutine omp_single(x)
-  integer, intent(inout) :: x
-  !CHECK: omp.parallel
-  !$omp parallel
-  !CHECK: omp.single
-  !$omp single
-    !CHECK: %[[xval:.*]] = fir.load %[[x]] : !fir.ref<i32>
-    !CHECK: %[[res:.*]] = arith.addi %[[xval]], %{{.*}} : i32
-    !CHECK: fir.store %[[res]] to %[[x]] : !fir.ref<i32>
-    x = x + 12
-  !CHECK: omp.terminator
-  !$omp end single
-  !CHECK: omp.terminator
-  !$omp end parallel
-end subroutine omp_single
-
-!===============================================================================
-! Single construct with nowait
-!===============================================================================
-
-!CHECK-LABEL: func @_QPomp_single_nowait
-!CHECK-SAME: (%[[x:.*]]: !fir.ref<i32> {fir.bindc_name = "x"})
-subroutine omp_single_nowait(x)
-  integer, intent(inout) :: x
-  !CHECK: omp.parallel
-  !$omp parallel
-  !CHECK: omp.single nowait
-  !$omp single
-    !CHECK: %[[xval:.*]] = fir.load %[[x]] : !fir.ref<i32>
-    !CHECK: %[[res:.*]] = arith.addi %[[xval]], %{{.*}} : i32
-    !CHECK: fir.store %[[res]] to %[[x]] : !fir.ref<i32>
-    x = x + 12
-  !CHECK: omp.terminator
-  !$omp end single nowait
-  !CHECK: omp.terminator
-  !$omp end parallel
-end subroutine omp_single_nowait
-
-!===============================================================================
-! Single construct with allocate
-!===============================================================================
-
-!CHECK-LABEL: func @_QPsingle_allocate
-subroutine single_allocate()
-  use omp_lib
-  integer :: x
-  !CHECK: omp.parallel {
-  !$omp parallel
-  !CHECK: omp.single allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>) {
-  !$omp single allocate(omp_high_bw_mem_alloc: x) private(x)
-  !CHECK: arith.addi
-  x = x + 12
-  !CHECK: omp.terminator
-  !$omp end single
-  !CHECK: omp.terminator
-  !$omp end parallel
-end subroutine single_allocate
-
-!===============================================================================
-! Single construct with private/firstprivate
-!===============================================================================
-
-! CHECK-LABEL: func.func @_QPsingle_privatization(
-! CHECK-SAME:                                     %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "x"},
-! CHECK-SAME:                                     %[[VAL_1:.*]]: !fir.ref<f64> {fir.bindc_name = "y"}) {
-! CHECK:         omp.single   {
-! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFsingle_privatizationEx"}
-! CHECK:           %[[VAL_3:.*]] = fir.alloca f64 {bindc_name = "y", pinned, uniq_name = "_QFsingle_privatizationEy"}
-! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_1]] : !fir.ref<f64>
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_3]] : !fir.ref<f64>
-! CHECK:           fir.call @_QPbar(%[[VAL_2]], %[[VAL_3]]) {{.*}}: (!fir.ref<f32>, !fir.ref<f64>) -> ()
-! CHECK:           omp.terminator
-! CHECK:         }
-! CHECK:         return
-! CHECK:       }
-
-subroutine single_privatization(x, y)
-  real :: x
-  real(8) :: y
-
-  !$omp single private(x) firstprivate(y)
-  call bar(x, y)
-  !$omp end single
-end subroutine
-
-! CHECK-LABEL: func.func @_QPsingle_privatization2(
-! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "x"},
-! CHECK-SAME:                                      %[[VAL_1:.*]]: !fir.ref<f64> {fir.bindc_name = "y"}) {
-! CHECK:         omp.parallel   {
-! CHECK:           omp.single   {
-! CHECK:             %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFsingle_privatization2Ex"}
-! CHECK:             %[[VAL_3:.*]] = fir.alloca f64 {bindc_name = "y", pinned, uniq_name = "_QFsingle_privatization2Ey"}
-! CHECK:             %[[VAL_4:.*]] = fir.load %[[VAL_1]] : !fir.ref<f64>
-! CHECK:             fir.store %[[VAL_4]] to %[[VAL_3]] : !fir.ref<f64>
-! CHECK:             fir.call @_QPbar(%[[VAL_2]], %[[VAL_3]]) {{.*}}: (!fir.ref<f32>, !fir.ref<f64>) -> ()
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           omp.terminator
-! CHECK:         }
-! CHECK:         return
-! CHECK:       }
-
-subroutine single_privatization2(x, y)
-  real :: x
-  real(8) :: y
-
-  !$omp parallel
-  !$omp single private(x) firstprivate(y)
-  call bar(x, y)
-  !$omp end single
-  !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/stop-stmt-in-region.f90 b/flang/test/Lower/OpenMP/FIR/stop-stmt-in-region.f90
deleted file mode 100644
index 32cc6d17c420..000000000000
--- a/flang/test/Lower/OpenMP/FIR/stop-stmt-in-region.f90
+++ /dev/null
@@ -1,154 +0,0 @@
-! This test checks lowering of stop statement in OpenMP region.
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-! CHECK-LABEL: func.func @_QPtest_stop_in_region1() {
-! CHECK:         omp.parallel   {
-! CHECK:           %[[VAL_0:.*]] = arith.constant 1 : i32
-! CHECK:           %[[VAL_1:.*]] = arith.constant false
-! CHECK:           %[[VAL_2:.*]] = arith.constant false
-! CHECK:           %[[VAL_3:.*]] = fir.call @_FortranAStopStatement(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) {{.*}} : (i32, i1, i1) -> none
-! CHECK-NOT:       fir.unreachable
-! CHECK:           omp.terminator
-! CHECK:         }
-! CHECK:         return
-! CHECK:       }
-
-subroutine test_stop_in_region1()
-  !$omp parallel
-    stop 1
-  !$omp end parallel
-end
-
-! CHECK-LABEL: func.func @_QPtest_stop_in_region2() {
-! CHECK:         %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtest_stop_in_region2Ex"}
-! CHECK:         omp.parallel   {
-! CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i32
-! CHECK:           %[[VAL_2:.*]] = arith.constant false
-! CHECK:           %[[VAL_3:.*]] = arith.constant false
-! CHECK:           %[[VAL_4:.*]] = fir.call @_FortranAStopStatement(%[[VAL_1]], %[[VAL_2]], %[[VAL_3]]) {{.*}} : (i32, i1, i1) -> none
-! CHECK:           omp.terminator
-! CHECK:         }
-! CHECK:         return
-! CHECK:       }
-
-subroutine test_stop_in_region2()
-  integer :: x
-  !$omp parallel
-    stop 1
-    x = 2
-  !$omp end parallel
-end
-
-! CHECK-LABEL: func.func @_QPtest_stop_in_region3() {
-! CHECK:         %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtest_stop_in_region3Ex"}
-! CHECK:         omp.parallel   {
-! CHECK:           %[[VAL_1:.*]] = arith.constant 3 : i32
-! CHECK:           fir.store %[[VAL_1]] to %[[VAL_0]] : !fir.ref<i32>
-! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
-! CHECK:           %[[VAL_3:.*]] = arith.constant 1 : i32
-! CHECK:           %[[VAL_4:.*]] = arith.cmpi sgt, %[[VAL_2]], %[[VAL_3]] : i32
-! CHECK:           cf.cond_br %[[VAL_4]], ^bb1, ^bb2
-! CHECK:         ^bb1:
-! CHECK:           %[[VAL_5:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
-! CHECK:           %[[VAL_6:.*]] = arith.constant false
-! CHECK:           %[[VAL_7:.*]] = arith.constant false
-! CHECK:           %[[VAL_8:.*]] = fir.call @_FortranAStopStatement(%[[VAL_5]], %[[VAL_6]], %[[VAL_7]]) {{.*}} : (i32, i1, i1) -> none
-! CHECK:           omp.terminator
-! CHECK:         ^bb2:
-! CHECK:           omp.terminator
-! CHECK:         }
-! CHECK:         return
-! CHECK:       }
-
-subroutine test_stop_in_region3()
-  integer :: x
-  !$omp parallel
-    x = 3
-    if (x > 1) stop x
-  !$omp end parallel
-end
-
-! CHECK-LABEL: func.func @_QPtest_stop_in_region4() {
-! CHECK:         %[[VAL_0:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:         %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_stop_in_region4Ei"}
-! CHECK:         %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtest_stop_in_region4Ex"}
-! CHECK:         %[[VAL_3:.*]] = arith.constant 1 : i32
-! CHECK:         %[[VAL_4:.*]] = arith.constant 10 : i32
-! CHECK:         %[[VAL_5:.*]] = arith.constant 1 : i32
-! CHECK:         omp.wsloop {
-! CHECK-NEXT:      omp.loop_nest (%[[VAL_6:.*]]) : i32 = (%[[VAL_3]]) to (%[[VAL_4]]) inclusive step (%[[VAL_5]]) {
-! CHECK:             fir.store %[[VAL_6]] to %[[VAL_0]] : !fir.ref<i32>
-! CHECK:             cf.br ^bb1
-! CHECK:           ^bb1:
-! CHECK:             %[[VAL_7:.*]] = arith.constant 3 : i32
-! CHECK:             fir.store %[[VAL_7]] to %[[VAL_2]] : !fir.ref<i32>
-! CHECK:             %[[VAL_8:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
-! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_9]] : i32
-! CHECK:             cf.cond_br %[[VAL_10]], ^bb2, ^bb3
-! CHECK:           ^bb2:
-! CHECK:             %[[VAL_11:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
-! CHECK:             %[[VAL_12:.*]] = arith.constant false
-! CHECK:             %[[VAL_13:.*]] = arith.constant false
-! CHECK:             %[[VAL_14:.*]] = fir.call @_FortranAStopStatement(%[[VAL_11]], %[[VAL_12]], %[[VAL_13]]) {{.*}} : (i32, i1, i1) -> none
-! CHECK:             omp.yield
-! CHECK:           ^bb3:
-! CHECK:             omp.yield
-! CHECK:           }
-! CHECK:           omp.terminator
-! CHECK:         }
-! CHECK:         cf.br ^bb1
-! CHECK:       ^bb1:
-! CHECK:         return
-! CHECK:       }
-
-subroutine test_stop_in_region4()
-  integer :: x
-  !$omp do
-  do i = 1, 10
-    x = 3
-    if (x > 1) stop x
-  enddo
-  !$omp end do
-end
-
-
-!CHECK-LABEL: func.func @_QPtest_stop_in_region5
-!CHECK:   omp.parallel   {
-!CHECK:     {{.*}} fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) fastmath<contract> : (i32, i1, i1) -> none
-!CHECK:     omp.terminator
-!CHECK:   }
-!CHECK:   return
-
-subroutine test_stop_in_region5()
-  !$omp parallel
-  block
-    stop 1
-  end block
-  !$omp end parallel
-end
-
-!CHECK-LABEL: func.func @_QPtest_stop_in_region6
-!CHECK:  omp.parallel   {
-!CHECK:    cf.cond_br %{{.*}}, ^[[BB1:.*]], ^[[BB2:.*]]
-!CHECK:  ^[[BB1]]:
-!CHECK:    {{.*}}fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) fastmath<contract> : (i32, i1, i1) -> none
-!CHECK:    omp.terminator
-!CHECK:  ^[[BB2]]:
-!CHECK:    {{.*}}fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) fastmath<contract> : (i32, i1, i1) -> none
-!CHECK:    omp.terminator
-!CHECK:  }
-!CHECK:  return
-
-subroutine test_stop_in_region6(x)
-  integer :: x
-  !$omp parallel
-  if (x .gt. 1) then
-    stop 1
-  else
-    stop 2
-  end if
-  !$omp end parallel
-end
diff --git a/flang/test/Lower/OpenMP/FIR/target.f90 b/flang/test/Lower/OpenMP/FIR/target.f90
deleted file mode 100644
index a7344e02cf7c..000000000000
--- a/flang/test/Lower/OpenMP/FIR/target.f90
+++ /dev/null
@@ -1,553 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!===============================================================================
-! Target_Enter Simple
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_enter_simple() {
-subroutine omp_target_enter_simple
-   integer :: a(1024)
-   !CHECK: %[[BOUNDS:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: omp.target_enter_data   map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
-   !$omp target enter data map(to: a)
-end subroutine omp_target_enter_simple
-
-!===============================================================================
-! Target_Enter Map types
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_enter_mt() {
-subroutine omp_target_enter_mt
-   integer :: a(1024)
-   integer :: b(1024)
-   integer :: c(1024)
-   integer :: d(1024)
-   !CHECK: %[[BOUNDS_0:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP_0:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(to) capture(ByRef) bounds(%[[BOUNDS_0]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: %[[BOUNDS_1:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP_1:.*]] = omp.map.info var_ptr(%{{.*}})  map_clauses(to) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.ref<!fir.array<1024xi32>> {name = "b"}
-   !CHECK: %[[BOUNDS_2:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP_2:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(always, exit_release_or_enter_alloc) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.ref<!fir.array<1024xi32>> {name = "c"}
-   !CHECK: %[[BOUNDS_3:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP_3:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(to) capture(ByRef) bounds(%[[BOUNDS_3]]) -> !fir.ref<!fir.array<1024xi32>> {name = "d"}
-   !CHECK: omp.target_enter_data   map_entries(%[[MAP_0]], %[[MAP_1]], %[[MAP_2]], %[[MAP_3]] : !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>)
-   !$omp target enter data map(to: a, b) map(always, alloc: c) map(to: d)
-end subroutine omp_target_enter_mt
-
-!===============================================================================
-! `Nowait` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_enter_nowait() {
-subroutine omp_target_enter_nowait
-   integer :: a(1024)
-   !CHECK: %[[BOUNDS:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: omp.target_enter_data  nowait map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
-   !$omp target enter data map(to: a) nowait
-end subroutine omp_target_enter_nowait
-
-!===============================================================================
-! `if` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_enter_if() {
-subroutine omp_target_enter_if
-   integer :: a(1024)
-   integer :: i
-   i = 5
-   !CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1:.*]] : !fir.ref<i32>
-   !CHECK: %[[VAL_4:.*]] = arith.constant 10 : i32
-   !CHECK: %[[VAL_5:.*]] = arith.cmpi slt, %[[VAL_3]], %[[VAL_4]] : i32
-   !CHECK: %[[BOUNDS:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: omp.target_enter_data   if(%[[VAL_5]] : i1) map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
-   !$omp target enter data if(i<10) map(to: a)
-end subroutine omp_target_enter_if
-
-!===============================================================================
-! `device` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_enter_device() {
-subroutine omp_target_enter_device
-   integer :: a(1024)
-   !CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32
-   !CHECK: %[[BOUNDS:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: omp.target_enter_data   device(%[[VAL_1]] : i32) map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
-   !$omp target enter data map(to: a) device(2)
-end subroutine omp_target_enter_device
-
-!===============================================================================
-! Target_Exit Simple
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_exit_simple() {
-subroutine omp_target_exit_simple
-   integer :: a(1024)
-   !CHECK: %[[BOUNDS:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: omp.target_exit_data   map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
-   !$omp target exit data map(from: a)
-end subroutine omp_target_exit_simple
-
-!===============================================================================
-! Target_Exit Map types
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_exit_mt() {
-subroutine omp_target_exit_mt
-   integer :: a(1024)
-   integer :: b(1024)
-   integer :: c(1024)
-   integer :: d(1024)
-   integer :: e(1024)
-   !CHECK: %[[BOUNDS_0:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP_0:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(from) capture(ByRef) bounds(%[[BOUNDS_0]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: %[[BOUNDS_1:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP_1:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(from) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.ref<!fir.array<1024xi32>> {name = "b"}
-   !CHECK: %[[BOUNDS_2:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP_2:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.ref<!fir.array<1024xi32>> {name = "c"}
-   !CHECK: %[[BOUNDS_3:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP_3:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(always, delete) capture(ByRef) bounds(%[[BOUNDS_3]]) -> !fir.ref<!fir.array<1024xi32>> {name = "d"}
-   !CHECK: %[[BOUNDS_4:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP_4:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(from) capture(ByRef) bounds(%[[BOUNDS_4]]) -> !fir.ref<!fir.array<1024xi32>> {name = "e"}
-   !CHECK: omp.target_exit_data map_entries(%[[MAP_0]], %[[MAP_1]], %[[MAP_2]], %[[MAP_3]], %[[MAP_4]] : !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>)
-   !$omp target exit data map(from: a,b) map(release: c) map(always, delete: d) map(from: e)
-end subroutine omp_target_exit_mt
-
-!===============================================================================
-! `device` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_exit_device() {
-subroutine omp_target_exit_device
-   integer :: a(1024)
-   integer :: d
-   !CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1:.*]] : !fir.ref<i32>
-   !CHECK: %[[BOUNDS:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: omp.target_exit_data   device(%[[VAL_2]] : i32) map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
-   !$omp target exit data map(from: a) device(d)
-end subroutine omp_target_exit_device
-
-!===============================================================================
-! Target_Update `to` clause
-!===============================================================================
-
-subroutine omp_target_update_to
-   integer :: a(1024)
-
-   !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_update_toEa"}
-   !CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds
-
-   !CHECK: %[[TO_MAP:.*]] = omp.map.info var_ptr(%[[A_ALLOC]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)
-   !CHECK-SAME: map_clauses(to) capture(ByRef)
-   !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-
-   !CHECK: omp.target_update
-   !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>)
-   !$omp target update to(a)
-end subroutine omp_target_update_to
-
-!===============================================================================
-! Target_Update `from` clause
-!===============================================================================
-
-subroutine omp_target_update_from
-   integer :: a(1024)
-
-   !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_update_fromEa"}
-   !CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds
-
-   !CHECK: %[[FROM_MAP:.*]] = omp.map.info var_ptr(%[[A_ALLOC]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)
-   !CHECK-SAME: map_clauses(from) capture(ByRef)
-   !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-
-   !CHECK: omp.target_update
-   !CHECK-SAME: motion_entries(%[[FROM_MAP]] : !fir.ref<!fir.array<1024xi32>>)
-   !$omp target update from(a)
-end subroutine omp_target_update_from
-
-!===============================================================================
-! Target_Update `if` clause
-!===============================================================================
-
-subroutine omp_target_update_if
-   integer :: a(1024)
-   logical :: i
-
-   !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca
-   !CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds
-   !CHECK-DAG: %[[COND:.*]] = fir.convert %{{.*}} : (!fir.logical<4>) -> i1
-
-   !CHECK: %[[TO_MAP:.*]] = omp.map.info
-
-   !CHECK: omp.target_update if(%[[COND]] : i1)
-   !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>)
-   !$omp target update to(a) if(i)
-end subroutine omp_target_update_if
-
-!===============================================================================
-! Target_Update `device` clause
-!===============================================================================
-
-subroutine omp_target_update_device
-   integer :: a(1024)
-
-   !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca
-   !CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds
-   !CHECK-DAG: %[[DEVICE:.*]] = arith.constant 1 : i32
-
-   !CHECK: %[[TO_MAP:.*]] = omp.map.info
-
-   !CHECK: omp.target_update
-   !CHECK-SAME: device(%[[DEVICE]] : i32)
-   !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>)
-   !$omp target update to(a) device(1)
-end subroutine omp_target_update_device
-
-!===============================================================================
-! Target_Update `nowait` clause
-!===============================================================================
-
-subroutine omp_target_update_nowait
-   integer :: a(1024)
-
-   !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca
-   !CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds
-
-   !CHECK: %[[TO_MAP:.*]] = omp.map.info
-
-   !CHECK: omp.target_update
-   !CHECK-SAME: nowait
-   !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>)
-   !$omp target update to(a) nowait
-end subroutine omp_target_update_nowait
-
-!===============================================================================
-! Target_Data with region
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_data() {
-subroutine omp_target_data
-   !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_dataEa"}
-   integer :: a(1024)
-   !CHECK: %[[BOUNDS:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)   map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: omp.target_data   map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>) {
-   !$omp target data map(tofrom: a)
-      !CHECK: %[[VAL_1:.*]] = arith.constant 10 : i32
-      !CHECK: %[[VAL_2:.*]] = arith.constant 1 : i64
-      !CHECK: %[[VAL_3:.*]] = arith.constant 1 : i64
-      !CHECK: %[[VAL_4:.*]] = arith.subi %[[VAL_2]], %[[VAL_3]] : i64
-      !CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_4]] : (!fir.ref<!fir.array<1024xi32>>, i64) -> !fir.ref<i32>
-      !CHECK: fir.store %[[VAL_1]] to %[[VAL_5]] : !fir.ref<i32>
-      a(1) = 10
-   !CHECK: omp.terminator
-   !$omp end target data
-   !CHECK: }
-end subroutine omp_target_data
-
-!CHECK-LABEL: func.func @_QPomp_target_data_mt
-subroutine omp_target_data_mt
-   integer :: a(1024)
-   integer :: b(1024)
-   !CHECK: %[[VAR_A:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_data_mtEa"}
-   !CHECK: %[[VAR_B:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "b", uniq_name = "_QFomp_target_data_mtEb"}
-   !CHECK: %[[BOUNDS_A:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAR_A]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)   map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_A]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: omp.target_data   map_entries(%[[MAP_A]] : !fir.ref<!fir.array<1024xi32>>) {
-   !$omp target data map(a)
-   !CHECK: omp.terminator
-   !$omp end target data
-   !CHECK: }
-   !CHECK: %[[BOUNDS_B:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%[[VAR_B]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)   map_clauses(always, from) capture(ByRef) bounds(%[[BOUNDS_B]]) -> !fir.ref<!fir.array<1024xi32>> {name = "b"}
-   !CHECK: omp.target_data   map_entries(%[[MAP_B]] : !fir.ref<!fir.array<1024xi32>>) {
-   !$omp target data map(always, from : b)
-   !CHECK: omp.terminator
-   !$omp end target data
-   !CHECK: }
-end subroutine omp_target_data_mt
-
-!===============================================================================
-! Target with region
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target() {
-subroutine omp_target
-   !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_targetEa"}
-   integer :: a(1024)
-   !CHECK: %[[BOUNDS:.*]] = omp.map.bounds   lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
-   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)   map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: omp.target   map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !fir.ref<!fir.array<1024xi32>>) {
-   !CHECK: ^bb0(%[[ARG_0]]: !fir.ref<!fir.array<1024xi32>>):
-   !$omp target map(tofrom: a)
-      !CHECK: %[[VAL_1:.*]] = arith.constant 10 : i32
-      !CHECK: %[[VAL_2:.*]] = arith.constant 1 : i64
-      !CHECK: %[[VAL_3:.*]] = arith.constant 1 : i64
-      !CHECK: %[[VAL_4:.*]] = arith.subi %[[VAL_2]], %[[VAL_3]] : i64
-      !CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG_0]], %[[VAL_4]] : (!fir.ref<!fir.array<1024xi32>>, i64) -> !fir.ref<i32>
-      !CHECK: fir.store %[[VAL_1]] to %[[VAL_5]] : !fir.ref<i32>
-      a(1) = 10
-   !CHECK: omp.terminator
-   !$omp end target
-   !CHECK: }
-end subroutine omp_target
-
-!===============================================================================
-! Target implicit capture
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_implicit() {
-subroutine omp_target_implicit
-   !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_implicitEa"}
-   integer :: a(1024)
-   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)   map_clauses(implicit, tofrom) capture(ByRef) bounds(%{{.*}}) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: omp.target   map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !fir.ref<!fir.array<1024xi32>>) {
-   !CHECK: ^bb0(%[[ARG_0]]: !fir.ref<!fir.array<1024xi32>>):
-   !$omp target
-      !CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG_0]], %{{.*}} : (!fir.ref<!fir.array<1024xi32>>, i64) -> !fir.ref<i32>
-      a(1) = 10
-   !CHECK: omp.terminator
-   !$omp end target
-   !CHECK: }
-end subroutine omp_target_implicit
-
-!===============================================================================
-! Target implicit capture nested
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_implicit_nested() {
-subroutine omp_target_implicit_nested
-   integer::a, b
-   !CHECK: omp.target   map_entries(%{{.*}} -> %[[ARG0:.*]], %{{.*}} -> %[[ARG1:.*]] : !fir.ref<i32>, !fir.ref<i32>) {
-   !CHECK: ^bb0(%[[ARG0]]: !fir.ref<i32>, %[[ARG1]]: !fir.ref<i32>):
-   !$omp target
-      !CHECK: fir.store %{{.*}} to %[[ARG0]] : !fir.ref<i32>
-      a = 10
-      !$omp parallel
-         !CHECK: fir.store %{{.*}} to %[[ARG1]] : !fir.ref<i32>
-         b = 20
-         !CHECK: omp.terminator
-      !$omp end parallel
-   !CHECK: omp.terminator
-   !$omp end target
-   !CHECK: }
-end subroutine omp_target_implicit_nested
-
-!===============================================================================
-! Target implicit capture with bounds
-!===============================================================================
-
-
-!CHECK-LABEL: func.func @_QPomp_target_implicit_bounds(
-!CHECK: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-subroutine omp_target_implicit_bounds(n)
-   !CHECK: %[[VAL_COPY:.*]] = fir.alloca i32
-   !CHECK: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
-   !CHECK: fir.store %[[VAL_1]] to %[[VAL_COPY]] : !fir.ref<i32>
-   !CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64
-   !CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i64) -> index
-   !CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
-   !CHECK: %[[VAL_5:.*]] = arith.cmpi sgt, %[[VAL_3]], %[[VAL_4]] : index
-   !CHECK: %[[VAL_6:.*]] = arith.select %[[VAL_5]], %[[VAL_3]], %[[VAL_4]] : index
-   !CHECK: %[[VAL_7:.*]] = arith.constant 1024 : i64
-   !CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i64) -> index
-   !CHECK: %[[VAL_9:.*]] = arith.constant 0 : index
-   !CHECK: %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_9]] : index
-   !CHECK: %[[VAL_11:.*]] = arith.select %[[VAL_10]], %[[VAL_8]], %[[VAL_9]] : index
-   !CHECK: %[[VAL_12:.*]] = fir.alloca !fir.array<?x1024xi32>, %[[VAL_6]] {bindc_name = "a", uniq_name = "_QFomp_target_implicit_boundsEa"}
-   integer :: n
-   integer :: a(n, 1024)
-   !CHECK: %[[VAL_13:.*]] = arith.constant 1 : index
-   !CHECK: %[[VAL_14:.*]] = arith.constant 0 : index
-   !CHECK: %[[VAL_15:.*]] = arith.subi %[[VAL_6]], %[[VAL_13]] : index
-   !CHECK: %[[VAL_16:.*]] = omp.map.bounds lower_bound(%[[VAL_14]] : index) upper_bound(%[[VAL_15]] : index) extent(%[[VAL_6]] : index) stride(%[[VAL_13]] : index) start_idx(%[[VAL_13]] : index)
-   !CHECK: %[[VAL_17:.*]] = arith.constant 0 : index
-   !CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_11]], %[[VAL_13]] : index
-   !CHECK: %[[VAL_19:.*]] = omp.map.bounds lower_bound(%[[VAL_17]] : index) upper_bound(%[[VAL_18]] : index) extent(%[[VAL_11]] : index) stride(%[[VAL_13]] : index) start_idx(%[[VAL_13]] : index)
-   !CHECK: %[[VAL_20:.*]] = omp.map.info var_ptr(%[[VAL_12]] : !fir.ref<!fir.array<?x1024xi32>>, !fir.array<?x1024xi32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[VAL_16]], %[[VAL_19]]) -> !fir.ref<!fir.array<?x1024xi32>> {name = "a"}
-   !CHECK: %[[VAL_21:.*]] = omp.map.info var_ptr(%[[VAL_COPY]] : !fir.ref<i32>, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref<i32> {name = ""}
-   !CHECK: omp.target map_entries(%[[VAL_20]] -> %[[VAL_22:.*]], %[[VAL_21]] -> %[[VAL_23:.*]] : !fir.ref<!fir.array<?x1024xi32>>, !fir.ref<i32>) {
-   !CHECK: ^bb0(%[[VAL_22]]: !fir.ref<!fir.array<?x1024xi32>>, %[[VAL_23]]: !fir.ref<i32>):
-   !$omp target
-      !CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_23]] : !fir.ref<i32>
-      !CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> i64
-      !CHECK: %[[VAL_26:.*]] = arith.constant 0 : index
-      !CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (i64) -> index
-      !CHECK: %[[VAL_28:.*]] = arith.cmpi sgt, %[[VAL_27]], %[[VAL_26]] : index
-      !CHECK: %[[VAL_29:.*]] = arith.select %[[VAL_28]], %[[VAL_27]], %[[VAL_26]] : index
-      !CHECK: %[[VAL_30:.*]] = arith.constant 33 : i32
-      !CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_22]] : (!fir.ref<!fir.array<?x1024xi32>>) -> !fir.ref<!fir.array<?xi32>>
-      !CHECK: %[[VAL_32:.*]] = arith.constant 1 : index
-      !CHECK: %[[VAL_33:.*]] = arith.constant 0 : index
-      !CHECK: %[[VAL_34:.*]] = arith.constant 11 : i64
-      !CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_34]] : (i64) -> index
-      !CHECK: %[[VAL_36:.*]] = arith.subi %[[VAL_35]], %[[VAL_32]] : index
-      !CHECK: %[[VAL_37:.*]] = arith.muli %[[VAL_32]], %[[VAL_36]] : index
-      !CHECK: %[[VAL_38:.*]] = arith.addi %[[VAL_37]], %[[VAL_33]] : index
-      !CHECK: %[[VAL_39:.*]] = arith.muli %[[VAL_32]], %[[VAL_29]] : index
-      !CHECK: %[[VAL_40:.*]] = arith.constant 22 : i64
-      !CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i64) -> index
-      !CHECK: %[[VAL_42:.*]] = arith.subi %[[VAL_41]], %[[VAL_32]] : index
-      !CHECK: %[[VAL_43:.*]] = arith.muli %[[VAL_39]], %[[VAL_42]] : index
-      !CHECK: %[[VAL_44:.*]] = arith.addi %[[VAL_43]], %[[VAL_38]] : index
-      !CHECK: %[[VAL_45:.*]] = fir.coordinate_of %[[VAL_31]], %[[VAL_44]] : (!fir.ref<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-      !CHECK: fir.store %[[VAL_30]] to %[[VAL_45]] : !fir.ref<i32>
-      a(11, 22) = 33
-      !CHECK: omp.terminator
-   !$omp end target
-!CHECK: }
-end subroutine omp_target_implicit_bounds
-
-!===============================================================================
-! Target `thread_limit` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_thread_limit() {
-subroutine omp_target_thread_limit
-   integer :: a
-   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "a"}
-   !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32
-   !CHECK: omp.target   thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !fir.ref<i32>) {
-   !CHECK: ^bb0(%[[ARG_0]]: !fir.ref<i32>):
-   !$omp target map(tofrom: a) thread_limit(64)
-      a = 10
-   !CHECK: omp.terminator
-   !$omp end target
-   !CHECK: }
-end subroutine omp_target_thread_limit
-
-!===============================================================================
-! Target `use_device_ptr` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_device_ptr() {
-subroutine omp_target_device_ptr
-   use iso_c_binding, only : c_ptr, c_loc
-   type(c_ptr) :: a
-   integer, target :: b
-   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}})   map_clauses(tofrom) capture(ByRef) -> {{.*}} {name = "a"}
-   !CHECK: omp.target_data map_entries(%[[MAP]]{{.*}}
-   !$omp target data map(tofrom: a) use_device_ptr(a)
-   !CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>):
-   !CHECK: {{.*}} = fir.coordinate_of %[[VAL_1:.*]], {{.*}} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
-      a = c_loc(b)
-   !CHECK: omp.terminator
-   !$omp end target data
-   !CHECK: }
-end subroutine omp_target_device_ptr
-
- !===============================================================================
- ! Target `use_device_addr` clause
- !===============================================================================
-
- !CHECK-LABEL: func.func @_QPomp_target_device_addr() {
- subroutine omp_target_device_addr
-   integer, pointer :: a
-   !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "a", uniq_name = "_QFomp_target_device_addrEa"}
-   !CHECK: %[[MAP_MEMBERS:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.ptr<i32>>>, i32) var_ptr_ptr({{.*}} : !fir.llvm_ptr<!fir.ref<i32>>) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr<!fir.ref<i32>> {name = ""}
-   !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.box<!fir.ptr<i32>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBERS]] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.ptr<i32>>> {name = "a"}
-   !CHECK: omp.target_data map_entries(%[[MAP_MEMBERS]], %[[MAP]] : {{.*}}) use_device_addr(%[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>) {
-   !$omp target data map(tofrom: a) use_device_addr(a)
-   !CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>):
-   !CHECK: {{.*}} = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-      a = 10
-   !CHECK: omp.terminator
-   !$omp end target data
-   !CHECK: }
-end subroutine omp_target_device_addr
-
-!===============================================================================
-! Target with parallel loop
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_parallel_do() {
-subroutine omp_target_parallel_do
-   !CHECK: %[[C1024:.*]] = arith.constant 1024 : index
-   !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_parallel_doEa"}
-   integer :: a(1024)
-   !CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_target_parallel_doEi"}
-   integer :: i
-   !CHECK: %[[C1:.*]] = arith.constant 1 : index
-   !CHECK: %[[C0:.*]] = arith.constant 0 : index
-   !CHECK: %[[SUB:.*]] = arith.subi %[[C1024]], %[[C1]] : index
-   !CHECK: %[[BOUNDS:.*]] = omp.map.bounds   lower_bound(%[[C0]] : index) upper_bound(%[[SUB]] : index) extent(%[[C1024]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index)
-   !CHECK: %[[MAP1:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)   map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-   !CHECK: %[[MAP2:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref<i32>, i32)   map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref<i32> {name = "i"}
-   !CHECK: omp.target map_entries(%[[MAP1]] -> %[[VAL_2:.*]], %[[MAP2]] -> %[[VAL_3:.*]] : !fir.ref<!fir.array<1024xi32>>, !fir.ref<i32>) {
-   !CHECK: ^bb0(%[[VAL_2]]: !fir.ref<!fir.array<1024xi32>>, %[[VAL_3]]: !fir.ref<i32>):
-      !CHECK-NEXT: omp.parallel
-      !$omp target parallel do map(tofrom: a)
-         !CHECK: %[[VAL_4:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-         !CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32
-         !CHECK: %[[VAL_6:.*]] = arith.constant 1024 : i32
-         !CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32
-         !CHECK: omp.wsloop {
-         !CHECK: omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_5]]) to (%[[VAL_6]]) inclusive step (%[[VAL_7]]) {
-         !CHECK: fir.store %[[VAL_8]] to %[[VAL_4]] : !fir.ref<i32>
-         !CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32
-         !CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
-         !CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> i64
-         !CHECK: %[[VAL_12:.*]] = arith.constant 1 : i64
-         !CHECK: %[[VAL_13:.*]] = arith.subi %[[VAL_11]], %[[VAL_12]] : i64
-         !CHECK: %[[VAL_14:.*]] = fir.coordinate_of %[[VAL_2]], %[[VAL_13]] : (!fir.ref<!fir.array<1024xi32>>, i64) -> !fir.ref<i32>
-         !CHECK: fir.store %[[VAL_9]] to %[[VAL_14]] : !fir.ref<i32>
-         do i = 1, 1024
-            a(i) = 10
-         end do
-         !CHECK: omp.yield
-         !CHECK: }
-         !CHECK: omp.terminator
-         !CHECK: }
-      !CHECK: omp.terminator
-      !CHECK: }
-   !CHECK: omp.terminator
-   !CHECK: }
-   !$omp end target parallel do
-end subroutine omp_target_parallel_do
-
-!===============================================================================
-! Target `is_device_ptr` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_is_device_ptr() {
-subroutine omp_target_is_device_ptr
-   use iso_c_binding, only : c_ptr, c_loc
-   !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> {bindc_name = "a", uniq_name = "_QFomp_target_is_device_ptrEa"}
-   type(c_ptr) :: a
-   !CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "b", fir.target, uniq_name = "_QFomp_target_is_device_ptrEb"}
-   integer, target :: b
-   !CHECK: %[[MAP_0:.*]] = omp.map.info var_ptr(%[[DEV_PTR:.*]] : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) map_clauses(tofrom) capture(ByRef) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {name = "a"}
-   !CHECK: %[[MAP_1:.*]] = omp.map.info var_ptr(%[[VAL_0:.*]] : !fir.ref<i32>, i32)   map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "b"}
-   !CHECK: %[[MAP_2:.*]] = omp.map.info var_ptr(%[[DEV_PTR:.*]] : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {name = "a"}
-   !CHECK: omp.target is_device_ptr(%[[DEV_PTR:.*]] : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) map_entries(%[[MAP_0:.*]] -> %[[ARG0:.*]], %[[MAP_1:.*]] -> %[[ARG1:.*]], %[[MAP_2:.*]] -> %[[ARG2:.*]] : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<i32>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) {
-   !CHECK: ^bb0(%[[ARG0]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, %[[ARG1]]: !fir.ref<i32>, %[[ARG2]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>):
-   !$omp target map(tofrom: a,b) is_device_ptr(a)
-      !CHECK: {{.*}} = fir.coordinate_of %[[VAL_0:.*]], {{.*}} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
-      a = c_loc(b)
-   !CHECK: omp.terminator
-   !$omp end target
-   !CHECK: }
-end subroutine omp_target_is_device_ptr
-
- !===============================================================================
- ! Target `has_device_addr` clause
- !===============================================================================
-
- !CHECK-LABEL: func.func @_QPomp_target_has_device_addr() {
- subroutine omp_target_has_device_addr
-   !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "a", uniq_name = "_QFomp_target_has_device_addrEa"}
-   integer, pointer :: a
-   !CHECK: omp.target has_device_addr(%[[VAL_0:.*]] : !fir.ref<!fir.box<!fir.ptr<i32>>>) map_entries({{.*}} -> {{.*}}, {{.*}} -> {{.*}} : !fir.llvm_ptr<!fir.ref<i32>>, !fir.ref<!fir.box<!fir.ptr<i32>>>) {
-   !$omp target has_device_addr(a)
-   !CHECK: {{.*}} = fir.load %[[VAL_0:.*]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-      a = 10
-   !CHECK: omp.terminator
-   !$omp end target
-   !CHECK: }
-end subroutine omp_target_has_device_addr
diff --git a/flang/test/Lower/OpenMP/FIR/target_cpu_features.f90 b/flang/test/Lower/OpenMP/FIR/target_cpu_features.f90
deleted file mode 100644
index 5154782e1ae1..000000000000
--- a/flang/test/Lower/OpenMP/FIR/target_cpu_features.f90
+++ /dev/null
@@ -1,19 +0,0 @@
-!REQUIRES: amdgpu-registered-target, nvptx-registered-target
-!RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-cpu gfx908 -fopenmp -fopenmp-is-target-device %s -o - | FileCheck --check-prefix=AMDGCN %s
-!RUN: %flang_fc1 -emit-fir -triple nvptx64-nvidia-cuda -target-cpu sm_80 -fopenmp -fopenmp-is-target-device %s -o - | FileCheck --check-prefix=NVPTX %s
-
-!===============================================================================
-! Target_Enter Simple
-!===============================================================================
-
-!AMDGCN: module attributes {
-!AMDGCN-SAME: fir.target_cpu = "gfx908"
-!AMDGCN-SAME: fir.target_features = #llvm.target_features<["+16-bit-insts", "+ci-insts",
-!AMDGCN-SAME: "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot3-insts",
-!AMDGCN-SAME: "+dot4-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp",
-!AMDGCN-SAME: "+gfx8-insts", "+gfx9-insts", "+gws", "+image-insts", "+mai-insts",
-!AMDGCN-SAME: "+s-memrealtime", "+s-memtime-inst", "+wavefrontsize64"]>
-
-!NVPTX: module attributes {
-!NVPTX-SAME: fir.target_cpu = "sm_80"
-!NVPTX-SAME: fir.target_features = #llvm.target_features<["+ptx61", "+sm_80"]>
diff --git a/flang/test/Lower/OpenMP/FIR/task.f90 b/flang/test/Lower/OpenMP/FIR/task.f90
deleted file mode 100644
index 012ac757d304..000000000000
--- a/flang/test/Lower/OpenMP/FIR/task.f90
+++ /dev/null
@@ -1,237 +0,0 @@
-! REQUIRES: openmp_runtime
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK-LABEL: func @_QPomp_task_simple() {
-subroutine omp_task_simple
-  !CHECK: omp.task {
-  !$omp task
-  !CHECK: fir.call @_QPfoo() {{.*}}: () -> ()
-  call foo()
-  !CHECK: omp.terminator
-  !$omp end task
-end subroutine omp_task_simple
-
-!===============================================================================
-! `if` clause
-!===============================================================================
-
-!CHECK-LABEL: func @_QPomp_task_if(%{{.+}}) {
-subroutine omp_task_if(bar)
-  logical, intent(inout) :: bar
-  !CHECK: omp.task if(%{{.+}}) {
-  !$omp task if(bar)
-  !CHECK: fir.call @_QPfoo() {{.*}}: () -> ()
-  call foo()
-  !CHECK: omp.terminator
-  !$omp end task
-end subroutine omp_task_if
-
-!===============================================================================
-! `final` clause
-!===============================================================================
-
-!CHECK-LABEL: func @_QPomp_task_final(%{{.+}}) {
-subroutine omp_task_final(bar)
-  logical, intent(inout) :: bar
-  !CHECK: omp.task final(%{{.+}}) {
-  !$omp task final(bar)
-  !CHECK: fir.call @_QPfoo() {{.*}}: () -> ()
-  call foo()
-  !CHECK: omp.terminator
-  !$omp end task
-end subroutine omp_task_final
-
-!===============================================================================
-! `priority` clause
-!===============================================================================
-
-!CHECK-LABEL: func @_QPomp_task_priority(%{{.+}}) {
-subroutine omp_task_priority(bar)
-  integer, intent(inout) :: bar
-  !CHECK: omp.task priority(%{{.+}}) {
-  !$omp task priority(bar)
-  !CHECK: fir.call @_QPfoo() {{.*}}: () -> ()
-  call foo()
-  !CHECK: omp.terminator
-  !$omp end task
-end subroutine omp_task_priority
-
-!===============================================================================
-! `allocate` clause
-!===============================================================================
-
-!CHECK-LABEL: func @_QPtask_allocate
-subroutine task_allocate()
-  use omp_lib
-  integer :: x
-  !CHECK: omp.task allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>) {
-  !$omp task allocate(omp_high_bw_mem_alloc: x) private(x)
-  !CHECK: arith.addi
-  x = x + 12
-  !CHECK: omp.terminator
-  !$omp end task
-end subroutine task_allocate
-
-!===============================================================================
-! `depend` clause
-!===============================================================================
-
-!CHECK-LABEL: func @_QPtask_depend
-subroutine task_depend()
-  integer :: x
-  !CHECK: omp.task depend(taskdependin -> %{{.+}} : !fir.ref<i32>) {
-  !$omp task depend(in : x)
-  !CHECK: arith.addi
-  x = x + 12
-  !CHECK: omp.terminator
-  !$omp end task
-end subroutine task_depend
-
-!CHECK-LABEL: func @_QPtask_depend_non_int
-subroutine task_depend_non_int()
-  character(len = 15) :: x
-  integer, allocatable :: y
-  complex :: z
-  !CHECK: omp.task depend(taskdependin -> %{{.+}} : !fir.ref<!fir.char<1,15>>, taskdependin -> %{{.+}} : !fir.ref<!fir.box<!fir.heap<i32>>>, taskdependin ->  %{{.+}} : !fir.ref<!fir.complex<4>>) {
-  !$omp task depend(in : x, y, z)
-  !CHECK: omp.terminator
-  !$omp end task
-end subroutine task_depend_non_int
-
-!CHECK-LABEL: func @_QPtask_depend_all_kinds_one_task
-subroutine task_depend_all_kinds_one_task()
-  integer :: x
-  !CHECK: omp.task depend(taskdependin -> %{{.+}} : !fir.ref<i32>, taskdependout -> %{{.+}} : !fir.ref<i32>, taskdependinout -> %{{.+}} : !fir.ref<i32>) {
-  !$omp task depend(in : x) depend(out : x) depend(inout : x)
-  !CHECK: arith.addi
-  x = x + 12
-  !CHECK: omp.terminator
-  !$omp end task
-end subroutine task_depend_all_kinds_one_task
-
-!CHECK-LABEL: func @_QPtask_depend_multi_var
-subroutine task_depend_multi_var()
-  integer :: x
-  integer :: y
-  !CHECK: omp.task depend(taskdependin -> %{{.*}} : !fir.ref<i32>, taskdependin -> %{{.+}} : !fir.ref<i32>) {
-  !$omp task depend(in :x,y)
-  !CHECK: arith.addi
-  x = x + 12
-  y = y + 12
-  !CHECK: omp.terminator
-  !$omp end task
-end subroutine task_depend_multi_var
-
-!CHECK-LABEL: func @_QPtask_depend_multi_task
-subroutine task_depend_multi_task()
-  integer :: x
-  !CHECK: omp.task depend(taskdependout -> %{{.+}} : !fir.ref<i32>)
-  !$omp task depend(out : x)
-  !CHECK: arith.addi
-  x = x + 12
-  !CHECK: omp.terminator
-  !$omp end task
-  !CHECK: omp.task depend(taskdependinout -> %{{.+}} : !fir.ref<i32>)
-  !$omp task depend(inout : x)
-  !CHECK: arith.addi
-  x = x + 12
-  !CHECK: omp.terminator
-  !$omp end task
-  !CHECK: omp.task depend(taskdependin -> %{{.+}} : !fir.ref<i32>)
-  !$omp task depend(in : x)
-  !CHECK: arith.addi
-  x = x + 12
-  !CHECK: omp.terminator
-  !$omp end task
-end subroutine task_depend_multi_task
-
-!===============================================================================
-! `private` clause
-!===============================================================================
-!CHECK-LABEL: func @_QPtask_private
-subroutine task_private
-  type mytype
-  integer :: x
-  end type mytype
-
-  !CHECK: %[[int_var:.+]] = fir.alloca i32
-  !CHECK: %[[mytype_var:.+]] = fir.alloca !fir.type<_QFtask_privateTmytype{x:i32}>
-  integer :: int_var
-  type(mytype) :: mytype_var
-
-  !CHECK: fir.call @_QPbar(%[[int_var]], %[[mytype_var]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.type<_QFtask_privateTmytype{x:i32}>>) -> ()
-  call bar(int_var, mytype_var)
-
-  !CHECK: omp.task {
-  !$omp task private(int_var, mytype_var)
-  !CHECK: %[[int_var_private:.+]] = fir.alloca i32
-  !CHECK: %[[mytype_var_private:.+]] = fir.alloca !fir.type<_QFtask_privateTmytype{x:i32}>
-
-  !CHECK: fir.call @_QPbar(%[[int_var_private]], %[[mytype_var_private]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.type<_QFtask_privateTmytype{x:i32}>>) -> ()
-  call bar(int_var, mytype_var)
-  !CHECK: omp.terminator
-  !$omp end task
-end subroutine task_private
-
-!===============================================================================
-! `firstprivate` clause
-!===============================================================================
-!CHECK-LABEL: func @_QPtask_firstprivate
-subroutine task_firstprivate
-  type mytype
-  integer :: x
-  end type mytype
-
-  !CHECK: %[[int_var:.+]] = fir.alloca i32
-  !CHECK: %[[mytype_var:.+]] = fir.alloca !fir.type<_QFtask_firstprivateTmytype{x:i32}>
-  integer :: int_var
-  type(mytype) :: mytype_var
-
-  !CHECK: fir.call @_QPbaz(%[[int_var]], %[[mytype_var]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.type<_QFtask_firstprivateTmytype{x:i32}>>) -> ()
-  call baz(int_var, mytype_var)
-
-  !CHECK: omp.task {
-  !$omp task firstprivate(int_var, mytype_var)
-  !CHECK: %[[int_var_firstprivate:.+]] = fir.alloca i32
-  !CHECK: %[[int_var_load:.+]] = fir.load %[[int_var]] : !fir.ref<i32>
-  !CHECK: fir.store %[[int_var_load]] to %[[int_var_firstprivate]] : !fir.ref<i32>
-  !CHECK: %[[mytype_var_firstprivate:.+]] = fir.alloca !fir.type<_QFtask_firstprivateTmytype{x:i32}>
-  !CHECK: %[[mytype_var_load:.+]] = fir.load %[[mytype_var]] : !fir.ref<!fir.type<_QFtask_firstprivateTmytype{x:i32}>>
-  !CHECK: fir.store %[[mytype_var_load]] to %[[mytype_var_firstprivate]]
-  !CHECK: fir.call @_QPbaz(%[[int_var_firstprivate]], %[[mytype_var_firstprivate]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.type<_QFtask_firstprivateTmytype{x:i32}>>) -> ()
-  call baz(int_var, mytype_var)
-  !CHECK: omp.terminator
-  !$omp end task
-end subroutine task_firstprivate
-
-!===============================================================================
-! Multiple clauses
-!===============================================================================
-
-!CHECK-LABEL: func @_QPtask_multiple_clauses
-subroutine task_multiple_clauses()
-  use omp_lib
-
-  !CHECK: %[[x:.+]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtask_multiple_clausesEx"}
-  !CHECK: %[[y:.+]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFtask_multiple_clausesEy"}
-  !CHECK: %[[z:.+]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFtask_multiple_clausesEz"}
-  integer :: x, y, z
-  logical :: buzz
-
-  !CHECK: omp.task if(%{{.+}}) final(%{{.+}}) priority(%{{.+}}) allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>) {
-  !$omp task if(buzz) final(buzz) priority(z) allocate(omp_high_bw_mem_alloc: x) private(x) firstprivate(y)
-
-  !CHECK: %[[x_priv:.+]] = fir.alloca i32
-  !CHECK: %[[y_priv:.+]] = fir.alloca i32
-  !CHECK: %[[y_load:.+]] = fir.load %[[y]] : !fir.ref<i32>
-  !CHECK: fir.store %[[y_load]] to %[[y_priv]] : !fir.ref<i32>
-
-  !CHECK: arith.addi
-  x = x + 12
-  !CHECK: arith.subi
-  y = y - 12
-
-  !CHECK: omp.terminator
-  !$omp end task
-end subroutine task_multiple_clauses
diff --git a/flang/test/Lower/OpenMP/FIR/taskgroup.f90 b/flang/test/Lower/OpenMP/FIR/taskgroup.f90
deleted file mode 100644
index 78b9da8e9b09..000000000000
--- a/flang/test/Lower/OpenMP/FIR/taskgroup.f90
+++ /dev/null
@@ -1,21 +0,0 @@
-! REQUIRES: openmp_runtime
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK-LABEL: @_QPomp_taskgroup
-subroutine omp_taskgroup
-use omp_lib
-integer :: allocated_x
-!CHECK-DAG: %{{.*}} = fir.alloca i32 {bindc_name = "allocated_x", uniq_name = "_QFomp_taskgroupEallocated_x"}
-!CHECK-DAG: %{{.*}} = arith.constant 4 : i64
-
-!CHECK: omp.taskgroup  allocate(%{{.*}} : i64 -> %0 : !fir.ref<i32>)
-!$omp taskgroup allocate(omp_high_bw_mem_alloc: allocated_x)
-!$omp task
-!CHECK: fir.call @_QPwork() {{.*}}: () -> ()
-   call work()
-!CHECK: omp.terminator
-!$omp end task
-!CHECK: omp.terminator
-!$omp end taskgroup
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/taskwait.f90 b/flang/test/Lower/OpenMP/FIR/taskwait.f90
deleted file mode 100644
index eed4f1b84a22..000000000000
--- a/flang/test/Lower/OpenMP/FIR/taskwait.f90
+++ /dev/null
@@ -1,12 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="OMPDialect"
-
-!FIRDialect-LABEL: @_QPomp_taskwait
-subroutine omp_taskwait
-  !OMPDialect: omp.taskwait
-  !$omp taskwait
-  !FIRDialect: fir.call @_QPfoo() {{.*}}: () -> ()
-  call foo()
-  !OMPDialect: omp.taskwait
-  !$omp taskwait
-end subroutine omp_taskwait
diff --git a/flang/test/Lower/OpenMP/FIR/taskyield.f90 b/flang/test/Lower/OpenMP/FIR/taskyield.f90
deleted file mode 100644
index ca0bc1d071df..000000000000
--- a/flang/test/Lower/OpenMP/FIR/taskyield.f90
+++ /dev/null
@@ -1,12 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="OMPDialect"
-
-!FIRDialect-LABEL: @_QPomp_taskyield
-subroutine omp_taskyield
-  !OMPDialect: omp.taskyield
-  !$omp taskyield
-  !FIRDialect: fir.call @_QPfoo() {{.*}}: () -> ()
-  call foo()
-  !OMPDialect: omp.taskyield
-  !$omp taskyield
-end subroutine omp_taskyield
diff --git a/flang/test/Lower/OpenMP/FIR/teams.f90 b/flang/test/Lower/OpenMP/FIR/teams.f90
deleted file mode 100644
index 9c0593a24f2d..000000000000
--- a/flang/test/Lower/OpenMP/FIR/teams.f90
+++ /dev/null
@@ -1,117 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-
-! CHECK-LABEL: func @_QPteams_simple
-subroutine teams_simple()
-  ! CHECK: omp.teams
-  !$omp teams
-  ! CHECK: fir.call
-  call f1()
-  ! CHECK: omp.terminator
-  !$omp end teams
-end subroutine teams_simple
-
-!===============================================================================
-! `num_teams` clause
-!===============================================================================
-
-! CHECK-LABEL: func @_QPteams_numteams
-subroutine teams_numteams(num_teams)
-  integer, intent(inout) :: num_teams
-
-  ! CHECK: omp.teams
-  ! CHECK-SAME: num_teams( to %{{.*}}: i32)
-  !$omp teams num_teams(4)
-  ! CHECK: fir.call
-  call f1()
-  ! CHECK: omp.terminator
-  !$omp end teams
-
-  ! CHECK: omp.teams
-  ! CHECK-SAME: num_teams( to %{{.*}}: i32)
-  !$omp teams num_teams(num_teams)
-  ! CHECK: fir.call
-  call f2()
-  ! CHECK: omp.terminator
-  !$omp end teams
-
-end subroutine teams_numteams
-
-!===============================================================================
-! `if` clause
-!===============================================================================
-
-! CHECK-LABEL: func @_QPteams_if
-subroutine teams_if(alpha)
-  integer, intent(in) :: alpha
-  logical :: condition
-
-  ! CHECK: omp.teams
-  ! CHECK-SAME: if(%{{.*}})
-  !$omp teams if(.false.)
-  ! CHECK: fir.call
-  call f1()
-  ! CHECK: omp.terminator
-  !$omp end teams
-
-  ! CHECK: omp.teams
-  ! CHECK-SAME: if(%{{.*}})
-  !$omp teams if(alpha .le. 0)
-  ! CHECK: fir.call
-  call f2()
-  ! CHECK: omp.terminator
-  !$omp end teams
-
-  ! CHECK: omp.teams
-  ! CHECK-SAME: if(%{{.*}})
-  !$omp teams if(condition)
-  ! CHECK: fir.call
-  call f3()
-  ! CHECK: omp.terminator
-  !$omp end teams
-end subroutine teams_if
-
-!===============================================================================
-! `thread_limit` clause
-!===============================================================================
-
-! CHECK-LABEL: func @_QPteams_threadlimit
-subroutine teams_threadlimit(thread_limit)
-  integer, intent(inout) :: thread_limit
-
-  ! CHECK: omp.teams
-  ! CHECK-SAME: thread_limit(%{{.*}}: i32)
-  !$omp teams thread_limit(4)
-  ! CHECK: fir.call
-  call f1()
-  ! CHECK: omp.terminator
-  !$omp end teams
-
-  ! CHECK: omp.teams
-  ! CHECK-SAME: thread_limit(%{{.*}}: i32)
-  !$omp teams thread_limit(thread_limit)
-  ! CHECK: fir.call
-  call f2()
-  ! CHECK: omp.terminator
-  !$omp end teams
-
-end subroutine teams_threadlimit
-
-!===============================================================================
-! `allocate` clause
-!===============================================================================
-
-! CHECK-LABEL: func @_QPteams_allocate
-subroutine teams_allocate()
-   use omp_lib
-   integer :: x
-   integer :: y
-   ! CHECK: omp.teams
-   ! CHECK-SAME: allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>)
-   !$omp teams allocate(omp_high_bw_mem_alloc: x) private(x)
-   ! CHECK: arith.addi
-   x = x + 12
-   ! CHECK: omp.terminator
-   !$omp end teams
-end subroutine teams_allocate
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-char-array-chararray.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-char-array-chararray.f90
deleted file mode 100644
index 3580add37ef4..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-char-array-chararray.f90
+++ /dev/null
@@ -1,46 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for character, array, and character array.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-module test
-  character :: x
-  integer :: y(5)
-  character(5) :: z(5)
-
-  !$omp threadprivate(x, y, z)
-
-!CHECK-DAG: fir.global @_QMtestEx : !fir.char<1> {
-!CHECK-DAG: fir.global @_QMtestEy : !fir.array<5xi32> {
-!CHECK-DAG: fir.global @_QMtestEz : !fir.array<5x!fir.char<1,5>> {
-
-contains
-  subroutine sub()
-!CHECK-DAG:  [[ADDR0:%.*]] = fir.address_of(@_QMtestEx) : !fir.ref<!fir.char<1>>
-!CHECK-DAG:  [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.char<1>> -> !fir.ref<!fir.char<1>>
-!CHECK-DAG:  [[ADDR1:%.*]] = fir.address_of(@_QMtestEy) : !fir.ref<!fir.array<5xi32>>
-!CHECK-DAG:  [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.array<5xi32>> -> !fir.ref<!fir.array<5xi32>>
-!CHECK-DAG:  [[ADDR2:%.*]] = fir.address_of(@_QMtestEz) : !fir.ref<!fir.array<5x!fir.char<1,5>>>
-!CHECK-DAG:  [[NEWADDR2:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.array<5x!fir.char<1,5>>> -> !fir.ref<!fir.array<5x!fir.char<1,5>>>
-!CHECK-DAG:  %{{.*}} = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
-!CHECK-DAG:  %{{.*}} = fir.embox [[NEWADDR1]](%{{.*}}) : (!fir.ref<!fir.array<5xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<5xi32>>
-!CHECK-DAG:  %{{.*}} = fir.embox [[NEWADDR2]](%{{.*}}) : (!fir.ref<!fir.array<5x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<5x!fir.char<1,5>>>
-    print *, x, y, z
-
-    !$omp parallel
-!CHECK-DAG:    [[ADDR33:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.char<1>> -> !fir.ref<!fir.char<1>>
-!CHECK-DAG:    [[ADDR34:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.array<5xi32>> -> !fir.ref<!fir.array<5xi32>>
-!CHECK-DAG:    [[ADDR35:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.array<5x!fir.char<1,5>>> -> !fir.ref<!fir.array<5x!fir.char<1,5>>>
-!CHECK-DAG:    %{{.*}} = fir.convert [[ADDR33]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
-!CHECK-DAG:    %{{.*}} = fir.embox [[ADDR34]](%{{.*}}) : (!fir.ref<!fir.array<5xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<5xi32>>
-!CHECK-DAG:    %{{.*}} = fir.embox [[ADDR35]](%{{.*}}) : (!fir.ref<!fir.array<5x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<5x!fir.char<1,5>>>
-      print *, x, y, z
-    !$omp end parallel
-
-!CHECK-DAG:  %{{.*}} = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
-!CHECK-DAG:  %{{.*}} = fir.embox [[NEWADDR1]](%{{.*}}) : (!fir.ref<!fir.array<5xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<5xi32>>
-!CHECK-DAG:  %{{.*}} = fir.embox [[NEWADDR2]](%{{.*}}) : (!fir.ref<!fir.array<5x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<5x!fir.char<1,5>>>
-    print *, x, y, z
-
-  end
-end
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-commonblock.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-commonblock.f90
deleted file mode 100644
index 49f592ec8121..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-commonblock.f90
+++ /dev/null
@@ -1,91 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for common block.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-module test
-  integer:: a
-  real :: b(2)
-  complex, pointer :: c, d(:)
-  character(5) :: e, f(2)
-  common /blk/ a, b, c, d, e, f
-
-  !$omp threadprivate(/blk/)
-
-!CHECK: fir.global common @blk_(dense<0> : vector<103xi8>) : !fir.array<103xi8>
-
-contains
-  subroutine sub()
-!CHECK:  [[ADDR0:%.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<103xi8>>
-!CHECK:  [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.array<103xi8>> -> !fir.ref<!fir.array<103xi8>>
-!CHECK-DAG:  [[ADDR1:%.*]] = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:  [[C0:%.*]] = arith.constant 0 : index
-!CHECK-DAG:  [[ADDR2:%.*]] = fir.coordinate_of [[ADDR1]], [[C0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:  [[ADDR3:%.*]] = fir.convert [[ADDR2]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK-DAG:  [[ADDR4:%.*]] = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:  [[C1:%.*]] = arith.constant 4 : index
-!CHECK-DAG:  [[ADDR5:%.*]] = fir.coordinate_of [[ADDR4]], [[C1]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:  [[ADDR6:%.*]] = fir.convert [[ADDR5]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<2xf32>>
-!CHECK-DAG:  [[ADDR7:%.*]] = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:  [[C2:%.*]] = arith.constant 16 : index
-!CHECK-DAG:  [[ADDR8:%.*]] = fir.coordinate_of [[ADDR7]], [[C2]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:  [[ADDR9:%.*]] = fir.convert [[ADDR8]] : (!fir.ref<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK-DAG:  [[ADDR10:%.*]] = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:  [[C3:%.*]] = arith.constant 40 : index
-!CHECK-DAG:  [[ADDR11:%.*]] = fir.coordinate_of [[ADDR10]], [[C3]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:  [[ADDR12:%.*]] = fir.convert [[ADDR11]] : (!fir.ref<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.complex<4>>>>>
-!CHECK-DAG:  [[ADDR13:%.*]] = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:  [[C4:%.*]] = arith.constant 88 : index
-!CHECK-DAG:  [[ADDR14:%.*]] = fir.coordinate_of [[ADDR13]], [[C4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:  [[ADDR15:%.*]] = fir.convert [[ADDR14]] : (!fir.ref<i8>) -> !fir.ref<!fir.char<1,5>>
-!CHECK-DAG:  [[ADDR16:%.*]] = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:  [[C5:%.*]] = arith.constant 93 : index
-!CHECK-DAG:  [[ADDR17:%.*]] = fir.coordinate_of [[ADDR16]], [[C5]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:  [[ADDR18:%.*]] = fir.convert [[ADDR17]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<2x!fir.char<1,5>>>
-!CHECK-DAG:  %{{.*}} = fir.load [[ADDR3]] : !fir.ref<i32>
-!CHECK-DAG:  %{{.*}} = fir.embox [[ADDR6]](%{{.*}}) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
-!CHECK-DAG:  %{{.*}} = fir.load [[ADDR9]] : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK-DAG:  %{{.*}} = fir.load [[ADDR12]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.complex<4>>>>>
-!CHECK-DAG:  %{{.*}} = fir.convert [[ADDR15]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-!CHECK-DAG:  %{{.*}} = fir.embox [[ADDR18]](%{{.*}}) : (!fir.ref<!fir.array<2x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<2x!fir.char<1,5>>>
-    print *, a, b, c, d, e, f
-
-    !$omp parallel
-!CHECK:    [[ADDR77:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.array<103xi8>> -> !fir.ref<!fir.array<103xi8>>
-!CHECK-DAG:    [[ADDR78:%.*]] = fir.convert [[ADDR77]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:    [[ADDR79:%.*]] = fir.coordinate_of [[ADDR78]], [[C0:%.*]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:    [[ADDR80:%.*]] = fir.convert [[ADDR79:%.*]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK-DAG:    [[ADDR81:%.*]] = fir.convert [[ADDR77]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:    [[ADDR82:%.*]] = fir.coordinate_of [[ADDR81]], [[C1:%.*]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:    [[ADDR83:%.*]] = fir.convert [[ADDR82:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<2xf32>>
-!CHECK-DAG:    [[ADDR84:%.*]] = fir.convert [[ADDR77]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:    [[ADDR85:%.*]] = fir.coordinate_of [[ADDR84]], [[C2:%.*]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:    [[ADDR86:%.*]] = fir.convert [[ADDR85:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK-DAG:    [[ADDR87:%.*]] = fir.convert [[ADDR77]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:    [[ADDR88:%.*]] = fir.coordinate_of [[ADDR87]], [[C3:%.*]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:    [[ADDR89:%.*]] = fir.convert [[ADDR88:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.complex<4>>>>>
-!CHECK-DAG:    [[ADDR90:%.*]] = fir.convert [[ADDR77]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:    [[ADDR91:%.*]] = fir.coordinate_of [[ADDR90]], [[C4:%.*]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:    [[ADDR92:%.*]] = fir.convert [[ADDR91:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.char<1,5>>
-!CHECK-DAG:    [[ADDR93:%.*]] = fir.convert [[ADDR77]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:    [[ADDR94:%.*]] = fir.coordinate_of [[ADDR93]], [[C5:%.*]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:    [[ADDR95:%.*]] = fir.convert [[ADDR94:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<2x!fir.char<1,5>>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR80]] : !fir.ref<i32>
-!CHECK-DAG:    %{{.*}} = fir.embox [[ADDR83]](%{{.*}}) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR86]] : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR89]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.complex<4>>>>>
-!CHECK-DAG:    %{{.*}} = fir.convert [[ADDR92]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-!CHECK-DAG:    %{{.*}} = fir.embox [[ADDR95]](%{{.*}}) : (!fir.ref<!fir.array<2x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<2x!fir.char<1,5>>>
-      print *, a, b, c, d, e, f
-    !$omp end parallel
-
-!CHECK-DAG:  %{{.*}} = fir.load [[ADDR3]] : !fir.ref<i32>
-!CHECK-DAG:  %{{.*}} = fir.embox [[ADDR6]](%{{.*}}) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
-!CHECK-DAG:  %{{.*}} = fir.load [[ADDR9]] : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK-DAG:  %{{.*}} = fir.load [[ADDR12]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.complex<4>>>>>
-!CHECK-DAG:  %{{.*}} = fir.convert [[ADDR15]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-!CHECK-DAG:  %{{.*}} = fir.embox [[ADDR18]](%{{.*}}) : (!fir.ref<!fir.array<2x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<2x!fir.char<1,5>>>
-    print *, a, b, c, d, e, f
-
-  end
-end
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-integer-different-kinds.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-integer-different-kinds.f90
deleted file mode 100644
index 39c77406cc22..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-integer-different-kinds.f90
+++ /dev/null
@@ -1,67 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for variables with different kind.
-
-!REQUIRES: shell
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-program test
-  integer, save :: i
-  integer(kind=1), save :: i1
-  integer(kind=2), save :: i2
-  integer(kind=4), save :: i4
-  integer(kind=8), save :: i8
-  integer(kind=16), save :: i16
-
-!CHECK-DAG:  [[ADDR0:%.*]] = fir.address_of(@_QFEi) : !fir.ref<i32>
-!CHECK-DAG:  [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<i32> -> !fir.ref<i32>
-!CHECK-DAG:  [[ADDR1:%.*]] = fir.address_of(@_QFEi1) : !fir.ref<i8>
-!CHECK-DAG:  [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<i8> -> !fir.ref<i8>
-!CHECK-DAG:  [[ADDR2:%.*]] = fir.address_of(@_QFEi16) : !fir.ref<i128>
-!CHECK-DAG:  [[NEWADDR2:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<i128> -> !fir.ref<i128>
-!CHECK-DAG:  [[ADDR3:%.*]] = fir.address_of(@_QFEi2) : !fir.ref<i16>
-!CHECK-DAG:  [[NEWADDR3:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<i16> -> !fir.ref<i16>
-!CHECK-DAG:  [[ADDR4:%.*]] = fir.address_of(@_QFEi4) : !fir.ref<i32>
-!CHECK-DAG:  [[NEWADDR4:%.*]] = omp.threadprivate [[ADDR4]] : !fir.ref<i32> -> !fir.ref<i32>
-!CHECK-DAG:  [[ADDR5:%.*]] = fir.address_of(@_QFEi8) : !fir.ref<i64>
-!CHECK-DAG:  [[NEWADDR5:%.*]] = omp.threadprivate [[ADDR5]] : !fir.ref<i64> -> !fir.ref<i64>
-  !$omp threadprivate(i, i1, i2, i4, i8, i16)
-
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR0]] : !fir.ref<i32>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<i8>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<i128>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<i16>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR4]] : !fir.ref<i32>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR5]] : !fir.ref<i64>
-  print *, i, i1, i2, i4, i8, i16
-
-  !$omp parallel
-!CHECK-DAG:    [[ADDR39:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<i32> -> !fir.ref<i32>
-!CHECK-DAG:    [[ADDR40:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<i8> -> !fir.ref<i8>
-!CHECK-DAG:    [[ADDR41:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<i128> -> !fir.ref<i128>
-!CHECK-DAG:    [[ADDR42:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<i16> -> !fir.ref<i16>
-!CHECK-DAG:    [[ADDR43:%.*]] = omp.threadprivate [[ADDR4]] : !fir.ref<i32> -> !fir.ref<i32>
-!CHECK-DAG:    [[ADDR44:%.*]] = omp.threadprivate [[ADDR5]] : !fir.ref<i64> -> !fir.ref<i64>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR39]] : !fir.ref<i32>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR40]] : !fir.ref<i8>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR41]] : !fir.ref<i128>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR42]] : !fir.ref<i16>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR43]] : !fir.ref<i32>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR44]] : !fir.ref<i64>
-    print *, i, i1, i2, i4, i8, i16
-  !$omp end parallel
-
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR0]] : !fir.ref<i32>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<i8>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<i128>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<i16>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR4]] : !fir.ref<i32>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR5]] : !fir.ref<i64>
-  print *, i, i1, i2, i4, i8, i16
-
-!CHECK-DAG: fir.global internal @_QFEi : i32 {
-!CHECK-DAG: fir.global internal @_QFEi1 : i8 {
-!CHECK-DAG: fir.global internal @_QFEi16 : i128 {
-!CHECK-DAG: fir.global internal @_QFEi2 : i16 {
-!CHECK-DAG: fir.global internal @_QFEi4 : i32 {
-!CHECK-DAG: fir.global internal @_QFEi8 : i64 {
-end
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-non-global.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-non-global.f90
deleted file mode 100644
index b089693b2097..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-non-global.f90
+++ /dev/null
@@ -1,91 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for non-character non-SAVEd non-initialized scalars with or without
-! allocatable or pointer attribute in main program.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-program test
-  integer :: x
-  real :: y
-  logical :: z
-  complex :: w
-  integer, pointer :: a
-  real, allocatable :: b
-
-!CHECK-DAG:  [[ADDR0:%.*]] = fir.address_of(@_QFEa) : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG:  [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>> -> !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG:  [[ADDR1:%.*]] = fir.address_of(@_QFEb) : !fir.ref<!fir.box<!fir.heap<f32>>>
-!CHECK-DAG:  [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>> -> !fir.ref<!fir.box<!fir.heap<f32>>>
-!CHECK-DAG:  [[ADDR2:%.*]] = fir.address_of(@_QFEw) : !fir.ref<!fir.complex<4>>
-!CHECK-DAG:  [[NEWADDR2:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.complex<4>> -> !fir.ref<!fir.complex<4>>
-!CHECK-DAG:  [[ADDR3:%.*]] = fir.address_of(@_QFEx) : !fir.ref<i32>
-!CHECK-DAG:  [[NEWADDR3:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<i32> -> !fir.ref<i32>
-!CHECK-DAG:  [[ADDR4:%.*]] = fir.address_of(@_QFEy) : !fir.ref<f32>
-!CHECK-DAG:  [[NEWADDR4:%.*]] = omp.threadprivate [[ADDR4]] : !fir.ref<f32> -> !fir.ref<f32>
-!CHECK-DAG:  [[ADDR5:%.*]] = fir.address_of(@_QFEz) : !fir.ref<!fir.logical<4>>
-!CHECK-DAG:  [[NEWADDR5:%.*]] = omp.threadprivate [[ADDR5]] : !fir.ref<!fir.logical<4>> -> !fir.ref<!fir.logical<4>>
-  !$omp threadprivate(x, y, z, w, a, b)
-
-  call sub(a, b)
-
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<i32>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR4]] : !fir.ref<f32>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR5]] : !fir.ref<!fir.logical<4>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<!fir.complex<4>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  print *, x, y, z, w, a, b
-
-  !$omp parallel
-!CHECK-DAG:    [[ADDR68:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>> -> !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG:    [[ADDR69:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>> -> !fir.ref<!fir.box<!fir.heap<f32>>>
-!CHECK-DAG:    [[ADDR70:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.complex<4>> -> !fir.ref<!fir.complex<4>>
-!CHECK-DAG:    [[ADDR71:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<i32> -> !fir.ref<i32>
-!CHECK-DAG:    [[ADDR72:%.*]] = omp.threadprivate [[ADDR4]] : !fir.ref<f32> -> !fir.ref<f32>
-!CHECK-DAG:    [[ADDR73:%.*]] = omp.threadprivate [[ADDR5]] : !fir.ref<!fir.logical<4>> -> !fir.ref<!fir.logical<4>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR71]] : !fir.ref<i32>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR72]] : !fir.ref<f32>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR73]] : !fir.ref<!fir.logical<4>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR70]] : !fir.ref<!fir.complex<4>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR68]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR69]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-    print *, x, y, z, w, a, b
-  !$omp end parallel
-
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<i32>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR4]] : !fir.ref<f32>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR5]] : !fir.ref<!fir.logical<4>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<!fir.complex<4>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  print *, x, y, z, w, a, b
-
-!CHECK:  return
-
-!CHECK-DAG: fir.global internal @_QFEa : !fir.box<!fir.ptr<i32>> {
-!CHECK-DAG:   [[Z0:%.*]] = fir.zero_bits !fir.ptr<i32>
-!CHECK-DAG:   [[E0:%.*]] = fir.embox [[Z0]] : (!fir.ptr<i32>) -> !fir.box<!fir.ptr<i32>>
-!CHECK-DAG:   fir.has_value [[E0]] : !fir.box<!fir.ptr<i32>>
-!CHECK-DAG: }
-!CHECK-DAG: fir.global internal @_QFEb : !fir.box<!fir.heap<f32>> {
-!CHECK-DAG:   [[Z1:%.*]] = fir.zero_bits !fir.heap<f32>
-!CHECK-DAG:   [[E1:%.*]] = fir.embox [[Z1]] : (!fir.heap<f32>) -> !fir.box<!fir.heap<f32>>
-!CHECK-DAG:   fir.has_value [[E1]] : !fir.box<!fir.heap<f32>>
-!CHECK-DAG: }
-!CHECK-DAG: fir.global internal @_QFEw : !fir.complex<4> {
-!CHECK-DAG:   [[Z2:%.*]] = fir.undefined !fir.complex<4>
-!CHECK-DAG:   fir.has_value [[Z2]] : !fir.complex<4>
-!CHECK-DAG: }
-!CHECK-DAG: fir.global internal @_QFEx : i32 {
-!CHECK-DAG:   [[Z3:%.*]] = fir.undefined i32
-!CHECK-DAG:   fir.has_value [[Z3]] : i32
-!CHECK-DAG: }
-!CHECK-DAG: fir.global internal @_QFEy : f32 {
-!CHECK-DAG:   [[Z4:%.*]] = fir.undefined f32
-!CHECK-DAG:   fir.has_value [[Z4]] : f32
-!CHECK-DAG: }
-!CHECK-DAG: fir.global internal @_QFEz : !fir.logical<4> {
-!CHECK-DAG:   [[Z5:%.*]] = fir.undefined !fir.logical<4>
-!CHECK-DAG:   fir.has_value [[Z5]] : !fir.logical<4>
-!CHECK-DAG: }
-end
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-pointer-allocatable.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-pointer-allocatable.f90
deleted file mode 100644
index fd33c20f9f93..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-pointer-allocatable.f90
+++ /dev/null
@@ -1,51 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for allocatable and pointer variables.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-module test
-  integer, pointer :: x(:), m
-  real, allocatable :: y(:), n
-
-  !$omp threadprivate(x, y, m, n)
-
-!CHECK-DAG: fir.global @_QMtestEm : !fir.box<!fir.ptr<i32>> {
-!CHECK-DAG: fir.global @_QMtestEn : !fir.box<!fir.heap<f32>> {
-!CHECK-DAG: fir.global @_QMtestEx : !fir.box<!fir.ptr<!fir.array<?xi32>>> {
-!CHECK-DAG: fir.global @_QMtestEy : !fir.box<!fir.heap<!fir.array<?xf32>>> {
-
-contains
-  subroutine sub()
-!CHECK-DAG:  [[ADDR0:%.*]] = fir.address_of(@_QMtestEm) : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG:  [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>> -> !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG:  [[ADDR1:%.*]] = fir.address_of(@_QMtestEn) : !fir.ref<!fir.box<!fir.heap<f32>>>
-!CHECK-DAG:  [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>> -> !fir.ref<!fir.box<!fir.heap<f32>>>
-!CHECK-DAG:  [[ADDR2:%.*]] = fir.address_of(@_QMtestEx) : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!CHECK-DAG:  [[NEWADDR2:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!CHECK-DAG:  [[ADDR3:%.*]] = fir.address_of(@_QMtestEy) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-!CHECK-DAG:  [[NEWADDR3:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-    print *, x, y, m, n
-
-    !$omp parallel
-!CHECK-DAG:    [[ADDR54:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>> -> !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG:    [[ADDR55:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>> -> !fir.ref<!fir.box<!fir.heap<f32>>>
-!CHECK-DAG:    [[ADDR56:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!CHECK-DAG:    [[ADDR57:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR56]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR57]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR54]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR55]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-      print *, x, y, m, n
-    !$omp end parallel
-
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-    print *, x, y, m, n
-  end
-end
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-real-logical-complex-derivedtype.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-real-logical-complex-derivedtype.f90
deleted file mode 100644
index 749fe5c8bf54..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-real-logical-complex-derivedtype.f90
+++ /dev/null
@@ -1,58 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for real, logical, complex, and derived type.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-module test
-  type my_type
-    integer :: t_i
-    real :: t_arr(5)
-  end type my_type
-  real :: x
-  complex :: y
-  logical :: z
-  type(my_type) :: t
-
-  !$omp threadprivate(x, y, z, t)
-
-!CHECK-DAG: fir.global @_QMtestEt : !fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}> {
-!CHECK-DAG: fir.global @_QMtestEx : f32 {
-!CHECK-DAG: fir.global @_QMtestEy : !fir.complex<4> {
-!CHECK-DAG: fir.global @_QMtestEz : !fir.logical<4> {
-
-contains
-  subroutine sub()
-!CHECK-DAG:  [[ADDR0:%.*]] = fir.address_of(@_QMtestEt) : !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>
-!CHECK-DAG:  [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>> -> !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>
-!CHECK-DAG:  [[ADDR1:%.*]] = fir.address_of(@_QMtestEx) : !fir.ref<f32>
-!CHECK-DAG:  [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<f32> -> !fir.ref<f32>
-!CHECK-DAG:  [[ADDR2:%.*]] = fir.address_of(@_QMtestEy) : !fir.ref<!fir.complex<4>>
-!CHECK-DAG:  [[NEWADDR2:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.complex<4>> -> !fir.ref<!fir.complex<4>>
-!CHECK-DAG:  [[ADDR3:%.*]] = fir.address_of(@_QMtestEz) : !fir.ref<!fir.logical<4>>
-!CHECK-DAG:  [[NEWADDR3:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<!fir.logical<4>> -> !fir.ref<!fir.logical<4>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<f32>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<!fir.complex<4>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<!fir.logical<4>>
-!CHECK-DAG:  %{{.*}} = fir.coordinate_of [[NEWADDR0]]
-    print *, x, y, z, t%t_i
-
-    !$omp parallel
-!CHECK-DAG:    [[ADDR38:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>> -> !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>
-!CHECK-DAG:    [[ADDR39:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<f32> -> !fir.ref<f32>
-!CHECK-DAG:    [[ADDR40:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.complex<4>> -> !fir.ref<!fir.complex<4>>
-!CHECK-DAG:    [[ADDR41:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<!fir.logical<4>> -> !fir.ref<!fir.logical<4>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR39]] : !fir.ref<f32>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR40]] : !fir.ref<!fir.complex<4>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR41]] : !fir.ref<!fir.logical<4>>
-!CHECK-DAG:    %{{.*}} = fir.coordinate_of [[ADDR38]]
-      print *, x, y, z, t%t_i
-    !$omp end parallel
-
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<f32>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<!fir.complex<4>>
-!CHECK-DAG:  %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<!fir.logical<4>>
-!CHECK-DAG:  %{{.*}} = fir.coordinate_of [[NEWADDR0]]
-    print *, x, y, z, t%t_i
-
-  end
-end
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90
deleted file mode 100644
index 6db5735c21f1..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90
+++ /dev/null
@@ -1,39 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for threadprivate variable double use in use association.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-!RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-
-! CHECK-LABEL: fir.global @_QMmEx : i32
-module m
-  integer :: x
-  !$omp threadprivate(x)
-end
-
-! CHECK-LABEL: func.func @_QMm2Ptest() {
-! CHECK:         %[[VAL_0:.*]] = fir.address_of(@_QMmEx) : !fir.ref<i32>
-! CHECK:         %[[VAL_1:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK:         fir.call @_QPbar(%[[VAL_1]]) {{.*}}: (!fir.ref<i32>) -> ()
-! CHECK:         return
-! CHECK:       }
-!
-! CHECK-LABEL: func.func private @_QMm2FtestPinternal_test() {{.*}} {
-! CHECK:         %[[VAL_0:.*]] = fir.address_of(@_QMmEx) : !fir.ref<i32>
-! CHECK:         %[[VAL_1:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK:         fir.call @_QPbar(%[[VAL_1]]) {{.*}}: (!fir.ref<i32>) -> ()
-! CHECK:         return
-! CHECK:       }
-
-module m2
-  use m
- contains
-  subroutine test()
-    use m
-    call bar(x)
-   contains
-    subroutine internal_test()
-      use m
-      call bar(x)
-    end
-  end
-end
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-use-association.f90
deleted file mode 100644
index 685237430a1c..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association.f90
+++ /dev/null
@@ -1,74 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for threadprivate variable in use association.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK-DAG: fir.global common @blk_(dense<0> : vector<24xi8>) : !fir.array<24xi8>
-!CHECK-DAG: fir.global @_QMtestEy : f32 {
-
-module test
-  integer :: x
-  real :: y, z(5)
-  common /blk/ x, z
-
-  !$omp threadprivate(y, /blk/)
-
-contains
-  subroutine sub()
-! CHECK-LABEL: @_QMtestPsub
-!CHECK-DAG:   [[ADDR0:%.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG:   [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.array<24xi8>> -> !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG:   [[ADDR1:%.*]] = fir.address_of(@_QMtestEy) : !fir.ref<f32>
-!CHECK-DAG:   [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<f32> -> !fir.ref<f32>
-
-    !$omp parallel
-!CHECK-DAG:    [[ADDR2:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.array<24xi8>> -> !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG:    [[ADDR3:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<f32> -> !fir.ref<f32>
-!CHECK-DAG:    [[ADDR4:%.*]] = fir.convert [[ADDR2]] : (!fir.ref<!fir.array<24xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:    [[ADDR5:%.*]] = fir.coordinate_of [[ADDR4]], %{{.*}} : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:    [[ADDR6:%.*]] = fir.convert [[ADDR5:%.*]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK-DAG:    [[ADDR7:%.*]] = fir.convert [[ADDR2]] : (!fir.ref<!fir.array<24xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:    [[ADDR8:%.*]] = fir.coordinate_of [[ADDR7]], %{{.*}} : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:    [[ADDR9:%.*]] = fir.convert [[ADDR8:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<5xf32>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR6]] : !fir.ref<i32>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR3]] : !fir.ref<f32>
-!CHECK-DAG:    %{{.*}} = fir.embox [[ADDR9]](%{{.*}}) : (!fir.ref<!fir.array<5xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<5xf32>>
-      print *, x, y, z
-    !$omp end parallel
-  end
-end
-
-program main
-  use test
-  integer :: x1
-  real :: z1(5)
-  common /blk/ x1, z1
-
-  !$omp threadprivate(/blk/)
-
-  call sub()
-
-! CHECK-LABEL: @_QQmain()
-!CHECK-DAG:  [[ADDR0:%.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG:  [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.array<24xi8>> -> !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG:  [[ADDR1:%.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG:  [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.array<24xi8>> -> !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG:  [[ADDR2:%.*]] = fir.address_of(@_QMtestEy) : !fir.ref<f32>
-!CHECK-DAG:  [[NEWADDR2:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<f32> -> !fir.ref<f32>
-
-  !$omp parallel
-!CHECK-DAG:    [[ADDR4:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.array<24xi8>> -> !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG:    [[ADDR5:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<f32> -> !fir.ref<f32>
-!CHECK-DAG:    [[ADDR6:%.*]] = fir.convert [[ADDR4]] : (!fir.ref<!fir.array<24xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:    [[ADDR7:%.*]] = fir.coordinate_of [[ADDR6]], %{{.*}} : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:    [[ADDR8:%.*]] = fir.convert [[ADDR7:%.*]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK-DAG:    [[ADDR9:%.*]] = fir.convert [[ADDR4]] : (!fir.ref<!fir.array<24xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG:    [[ADDR10:%.*]] = fir.coordinate_of [[ADDR9]], %{{.*}} : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG:    [[ADDR11:%.*]] = fir.convert [[ADDR10:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<5xf32>>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR8]] : !fir.ref<i32>
-!CHECK-DAG:    %{{.*}} = fir.load [[ADDR5]] : !fir.ref<f32>
-!CHECK-DAG:    %{{.*}} = fir.embox [[ADDR11]](%{{.*}}) : (!fir.ref<!fir.array<5xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<5xf32>>
-    print *, x1, y, z1
-  !$omp end parallel
-
-end
diff --git a/flang/test/Lower/OpenMP/FIR/unstructured.f90 b/flang/test/Lower/OpenMP/FIR/unstructured.f90
deleted file mode 100644
index 6d1c9aab1464..000000000000
--- a/flang/test/Lower/OpenMP/FIR/unstructured.f90
+++ /dev/null
@@ -1,365 +0,0 @@
-! Test unstructured code adjacent to and inside OpenMP constructs.
-
-! RUN: bbc %s -fopenmp -emit-fir -hlfir=false -o "-" | FileCheck %s
-
-! CHECK-LABEL: func @_QPss1{{.*}} {
-! CHECK:   br ^bb1
-! CHECK: ^bb1:  // 2 preds: ^bb0, ^bb4
-! CHECK:   cond_br %{{[0-9]*}}, ^bb2, ^bb5
-! CHECK: ^bb2:  // pred: ^bb1
-! CHECK:   cond_br %{{[0-9]*}}, ^bb3, ^bb4
-! CHECK: ^bb4:  // pred: ^bb2
-! CHECK:   fir.call @_FortranAioBeginExternalListOutput
-! CHECK:   br ^bb1
-! CHECK: ^bb5:  // 2 preds: ^bb1, ^bb3
-! CHECK:   omp.master  {
-! CHECK:     @_FortranAioBeginExternalListOutput
-! CHECK:     omp.terminator
-! CHECK:   }
-! CHECK:   @_FortranAioBeginExternalListOutput
-! CHECK: }
-subroutine ss1(n) ! unstructured code followed by a structured OpenMP construct
-  do i = 1, 3
-    if (i .eq. n) exit
-    print*, 'ss1-A', i
-  enddo
-  !$omp master
-    print*, 'ss1-B', i
-  !$omp end master
-  print*
-end
-
-! CHECK-LABEL: func @_QPss2{{.*}} {
-! CHECK:   omp.master  {
-! CHECK:     @_FortranAioBeginExternalListOutput
-! CHECK:     br ^bb1
-! CHECK:   ^bb1:  // 2 preds: ^bb0, ^bb4
-! CHECK:     cond_br %{{[0-9]*}}, ^bb2, ^bb5
-! CHECK:   ^bb2:  // pred: ^bb1
-! CHECK:     cond_br %{{[0-9]*}}, ^bb3, ^bb4
-! CHECK:   ^bb3:  // pred: ^bb2
-! CHECK:     @_FortranAioBeginExternalListOutput
-! CHECK:     br ^bb1
-! CHECK:   ^bb5:  // 2 preds: ^bb1, ^bb3
-! CHECK:     omp.terminator
-! CHECK:   }
-! CHECK:   @_FortranAioBeginExternalListOutput
-! CHECK:   @_FortranAioBeginExternalListOutput
-! CHECK: }
-subroutine ss2(n) ! unstructured OpenMP construct; loop exit inside construct
-  !$omp master
-    print*, 'ss2-A', n
-    do i = 1, 3
-      if (i .eq. n) exit
-      print*, 'ss2-B', i
-    enddo
-  !$omp end master
-  print*, 'ss2-C', i
-  print*
-end
-
-! CHECK-LABEL: func @_QPss3{{.*}} {
-! CHECK:   omp.parallel {
-! CHECK:     %[[ALLOCA_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned}
-! CHECK:     %[[ALLOCA_1:.*]] = fir.alloca i32 {{{.*}}, pinned}
-! CHECK:     %[[ALLOCA_2:.*]] = fir.alloca i32 {{{.*}}, pinned}
-! CHECK:     br ^bb1
-! CHECK:   ^bb1:  // 2 preds: ^bb0, ^bb3
-! CHECK:     cond_br %{{[0-9]*}}, ^bb2, ^bb4
-! CHECK:   ^bb2:  // pred: ^bb1
-! CHECK:     omp.wsloop {
-! CHECK:       omp.loop_nest (%[[ARG1:.*]]) : {{.*}} {
-! CHECK:         fir.store %[[ARG1]] to %[[ALLOCA_2]] : !fir.ref<i32>
-! CHECK:         @_FortranAioBeginExternalListOutput
-! CHECK:         %[[LOAD_1:.*]] = fir.load %[[ALLOCA_2]] : !fir.ref<i32>
-! CHECK:         @_FortranAioOutputInteger32(%{{.*}}, %[[LOAD_1]])
-! CHECK:         omp.yield
-! CHECK:       }
-! CHECK:       omp.terminator
-! CHECK:     }
-! CHECK:     omp.wsloop {
-! CHECK:       omp.loop_nest (%[[ARG2:.*]]) : {{.*}} {
-! CHECK:         fir.store %[[ARG2]] to %[[ALLOCA_1]] : !fir.ref<i32>
-! CHECK:         br ^bb1
-! CHECK:       ^bb2:  // 2 preds: ^bb1, ^bb5
-! CHECK:         cond_br %{{[0-9]*}}, ^bb3, ^bb6
-! CHECK:       ^bb3:  // pred: ^bb2
-! CHECK:         cond_br %{{[0-9]*}}, ^bb4, ^bb5
-! CHECK:       ^bb4:  // pred: ^bb3
-! CHECK:         @_FortranAioBeginExternalListOutput
-! CHECK:         %[[LOAD_2:.*]] = fir.load %[[ALLOCA_K]] : !fir.ref<i32>
-! CHECK:         @_FortranAioOutputInteger32(%{{.*}}, %[[LOAD_2]])
-! CHECK:         br ^bb2
-! CHECK:       ^bb6:  // 2 preds: ^bb2, ^bb4
-! CHECK:         omp.yield
-! CHECK:       }
-! CHECK:       omp.terminator
-! CHECK:     }
-! CHECK:     br ^bb1
-! CHECK:   ^bb4:  // pred: ^bb1
-! CHECK:     omp.terminator
-! CHECK:   }
-! CHECK: }
-subroutine ss3(n) ! nested unstructured OpenMP constructs
-  !$omp parallel
-    do i = 1, 3
-      !$omp do
-        do k = 1, 3
-          print*, 'ss3-A', k
-        enddo
-      !$omp end do
-      !$omp do
-        do j = 1, 3
-          do k = 1, 3
-            if (k .eq. n) exit
-            print*, 'ss3-B', k
-          enddo
-        enddo
-      !$omp end do
-    enddo
-  !$omp end parallel
-end
-
-! CHECK-LABEL: func @_QPss4{{.*}} {
-! CHECK:       omp.parallel {
-! CHECK:         %[[ALLOCA:.*]] = fir.alloca i32 {{{.*}}, pinned}
-! CHECK:         omp.wsloop {
-! CHECK:           omp.loop_nest (%[[ARG:.*]]) : {{.*}} {
-! CHECK:             fir.store %[[ARG]] to %[[ALLOCA]] : !fir.ref<i32>
-! CHECK:             %[[COND:.*]] = arith.cmpi eq, %{{.*}}, %{{.*}}
-! CHECK:             %[[COND_XOR:.*]] = arith.xori %[[COND]], %{{.*}}
-! CHECK:             fir.if %[[COND_XOR]] {
-! CHECK:              @_FortranAioBeginExternalListOutput
-! CHECK:              %[[LOAD:.*]] = fir.load %[[ALLOCA]] : !fir.ref<i32>
-! CHECK:              @_FortranAioOutputInteger32(%{{.*}}, %[[LOAD]])
-! CHECK:             } else {
-! CHECK:             }
-! CHECK-NEXT:        omp.yield
-! CHECK-NEXT:      }
-! CHECK-NEXT:      omp.terminator
-! CHECK-NEXT:    }
-! CHECK:         omp.terminator
-! CHECK-NEXT:  }
-subroutine ss4(n) ! CYCLE in OpenMP wsloop constructs
-  !$omp parallel
-    do i = 1, 3
-      !$omp do
-        do j = 1, 3
-           if (j .eq. n) cycle
-           print*, 'ss4', j
-        enddo
-      !$omp end do
-    enddo
-  !$omp end parallel
-end
-
-! CHECK-LABEL: func @_QPss5() {
-! CHECK:  omp.parallel  {
-! CHECK:    omp.wsloop {
-! CHECK:      omp.loop_nest {{.*}} {
-! CHECK:        br ^[[BB1:.*]]
-! CHECK:      ^[[BB1]]:
-! CHECK:        br ^[[BB2:.*]]
-! CHECK:      ^[[BB2]]:
-! CHECK:        cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB6:.*]]
-! CHECK:      ^[[BB3]]:
-! CHECK:        cond_br %{{.*}}, ^[[BB4:.*]], ^[[BB3:.*]]
-! CHECK:      ^[[BB4]]:
-! CHECK:        br ^[[BB6]]
-! CHECK:      ^[[BB3]]:
-! CHECK:        br ^[[BB2]]
-! CHECK:      ^[[BB6]]:
-! CHECK:        omp.yield
-! CHECK:      }
-! CHECK:      omp.terminator
-! CHECK:    }
-! CHECK:    omp.terminator
-! CHECK:  }
-subroutine ss5() ! EXIT inside OpenMP wsloop (inside parallel)
-  integer :: x
-  !$omp parallel private(x)
-    !$omp do
-      do j = 1, 3
-        x = j * i
-        do k = 1, 3
-          if (k .eq. n) exit
-          x = k
-          x = x + k
-        enddo
-        x = j - 222
-      enddo
-    !$omp end do
-  !$omp end parallel
-end
-
-! CHECK-LABEL: func @_QPss6() {
-! CHECK:  omp.parallel  {
-! CHECK:    br ^[[BB1_OUTER:.*]]
-! CHECK:  ^[[BB1_OUTER]]:
-! CHECK:    cond_br %{{.*}}, ^[[BB2_OUTER:.*]], ^[[BB3_OUTER:.*]]
-! CHECK:  ^[[BB2_OUTER]]:
-! CHECK:    omp.wsloop {
-! CHECK:      omp.loop_nest {{.*}} {
-! CHECK:        br ^[[BB1:.*]]
-! CHECK:      ^[[BB1]]:
-! CHECK:        br ^[[BB2:.*]]
-! CHECK:      ^[[BB2]]:
-! CHECK:        cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB6:.*]]
-! CHECK:      ^[[BB3]]:
-! CHECK:        cond_br %{{.*}}, ^[[BB4:.*]], ^[[BB5:.*]]
-! CHECK:      ^[[BB4]]:
-! CHECK:        br ^[[BB6]]
-! CHECK:      ^[[BB5]]
-! CHECK:        br ^[[BB2]]
-! CHECK:      ^[[BB6]]:
-! CHECK:        omp.yield
-! CHECK:      }
-! CHECK:      omp.terminator
-! CHECK:    }
-! CHECK:    br ^[[BB1_OUTER]]
-! CHECK:  ^[[BB3_OUTER]]:
-! CHECK:    omp.terminator
-! CHECK:  }
-subroutine ss6() ! EXIT inside OpenMP wsloop in a do loop (inside parallel)
-  integer :: x
-  !$omp parallel private(x)
-    do i = 1, 3
-      !$omp do
-        do j = 1, 3
-          x = j * i
-          do k = 1, 3
-            if (k .eq. n) exit
-            x = k
-            x = x + k
-          enddo
-          x = j - 222
-        enddo
-      !$omp end do
-    enddo
-  !$omp end parallel
-end
-
-! CHECK-LABEL: func @_QPss7() {
-! CHECK: br ^[[BB1_OUTER:.*]]
-! CHECK: ^[[BB1_OUTER]]:
-! CHECK:   cond_br %{{.*}}, ^[[BB2_OUTER:.*]], ^[[BB3_OUTER:.*]]
-! CHECK-NEXT: ^[[BB2_OUTER:.*]]:
-! CHECK:   omp.parallel  {
-! CHECK:     omp.wsloop {
-! CHECK:       omp.loop_nest {{.*}} {
-! CHECK:         br ^[[BB1:.*]]
-! CHECK-NEXT:       ^[[BB1]]:
-! CHECK:         br ^[[BB2:.*]]
-! CHECK-NEXT:       ^[[BB2]]:
-! CHECK:         cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB6:.*]]
-! CHECK-NEXT:       ^[[BB3]]:
-! CHECK:         cond_br %{{.*}}, ^[[BB4:.*]], ^[[BB5:.*]]
-! CHECK-NEXT:       ^[[BB4]]:
-! CHECK:         br ^[[BB6]]
-! CHECK-NEXT:       ^[[BB5]]:
-! CHECK:         br ^[[BB2]]
-! CHECK-NEXT:       ^[[BB6]]:
-! CHECK:         omp.yield
-! CHECK:       }
-! CHECK:       omp.terminator
-! CHECK:     }
-! CHECK:     omp.terminator
-! CHECK:   }
-! CHECK:   br ^[[BB1_OUTER]]
-! CHECK-NEXT: ^[[BB3_OUTER]]:
-! CHECK-NEXT:   return
-subroutine ss7() ! EXIT inside OpenMP parallel do (inside do loop)
-  integer :: x
-    do i = 1, 3
-      !$omp parallel do private(x)
-        do j = 1, 3
-          x = j * i
-          do k = 1, 3
-            if (k .eq. n) exit
-            x = k
-            x = x + k
-          enddo
-        enddo
-      !$omp end parallel do
-    enddo
-end
-
-! CHECK-LABEL: func @_QPss8() {
-! CHECK:  omp.parallel  {
-! CHECK:    omp.wsloop {
-! CHECK:      omp.loop_nest {{.*}} {
-! CHECK:        br ^[[BB1:.*]]
-! CHECK-NEXT:      ^[[BB1]]:
-! CHECK:        br ^[[BB2:.*]]
-! CHECK:      ^[[BB2]]:
-! CHECK:        cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB6:.*]]
-! CHECK:      ^[[BB3]]:
-! CHECK:        cond_br %{{.*}}, ^[[BB4:.*]], ^[[BB5:.*]]
-! CHECK:      ^[[BB4]]:
-! CHECK-NEXT:      br ^[[BB6]]
-! CHECK:      ^[[BB5]]:
-! CHECK:        br ^[[BB2]]
-! CHECK-NEXT:      ^[[BB6]]:
-! CHECK:        omp.yield
-! CHECK:      }
-! CHECK:      omp.terminator
-! CHECK:    }
-! CHECK:    omp.terminator
-! CHECK:  }
-subroutine ss8() ! EXIT inside OpenMP parallel do
-  integer :: x
-      !$omp parallel do private(x)
-        do j = 1, 3
-          x = j * i
-          do k = 1, 3
-            if (k .eq. n) exit
-            x = k
-            x = x + k
-          enddo
-        enddo
-      !$omp end parallel do
-end
-
-! CHECK-LABEL: func @_QPss9() {
-! CHECK:  omp.parallel  {
-! CHECK-NEXT:    omp.parallel  {
-! CHECK:      br ^[[BB1:.*]]
-! CHECK:         ^[[BB1]]:
-! CHECK:      cond_br %{{.*}}, ^[[BB2:.*]], ^[[BB5:.*]]
-! CHECK-NEXT:    ^[[BB2]]:
-! CHECK:      cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB4:.*]]
-! CHECK-NEXT:    ^[[BB3]]:
-! CHECK-NEXT:    br ^[[BB5]]
-! CHECK-NEXT:    ^[[BB4]]:
-! CHECK:      br ^[[BB1]]
-! CHECK-NEXT:    ^[[BB5]]:
-! CHECK:      omp.terminator
-! CHECK-NEXT:    }
-! CHECK:    omp.terminator
-! CHECK-NEXT  }
-! CHECK: }
-subroutine ss9() ! EXIT inside OpenMP parallel (inside parallel)
-  integer :: x
-  !$omp parallel
-  !$omp parallel private(x)
-    do k = 1, 3
-      if (k .eq. n) exit
-      x = k
-      x = x + k
-    end do
-  !$omp end parallel
-  !$omp end parallel
-end
-
-! CHECK-LABEL: func @_QQmain
-program p
-  call ss1(2)
-  call ss2(2)
-  call ss3(2)
-  call ss4(2)
-  call ss5()
-  call ss6()
-  call ss7()
-  call ss8()
-  call ss9()
-end
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-chunks.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-chunks.f90
deleted file mode 100644
index e4b85fb44776..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-chunks.f90
+++ /dev/null
@@ -1,84 +0,0 @@
-! This test checks that chunk size is passed correctly when lowering of
-! OpenMP DO Directive(Worksharing) with chunk size
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-program wsloop
-        integer :: i
-        integer :: chunk
-
-! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "wsloop"} {
-! CHECK:         %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "chunk", uniq_name = "_QFEchunk"}
-
-!$OMP DO SCHEDULE(static, 4)
-
-do i=1, 9
-  print*, i
-
-! CHECK:         %[[VAL_2:.*]] = arith.constant 1 : i32
-! CHECK:         %[[VAL_3:.*]] = arith.constant 9 : i32
-! CHECK:         %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK:         %[[VAL_5:.*]] = arith.constant 4 : i32
-! CHECK:         omp.wsloop schedule(static = %[[VAL_5]] : i32) nowait {
-! CHECK-NEXT:      omp.loop_nest (%[[ARG0:.*]]) : i32 = (%[[VAL_2]]) to (%[[VAL_3]]) inclusive step (%[[VAL_4]]) {
-! CHECK:             fir.store %[[ARG0]] to %[[STORE_IV:.*]] : !fir.ref<i32>
-! CHECK:             %[[LOAD_IV:.*]] = fir.load %[[STORE_IV]] : !fir.ref<i32>
-! CHECK:             {{.*}} = fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-! CHECK:             omp.yield
-! CHECK:           }
-! CHECK:           omp.terminator
-! CHECK:         }
-
-end do
-!$OMP END DO NOWAIT
-!$OMP DO SCHEDULE(static, 2+2)
-
-do i=1, 9
-  print*, i*2
-
-! CHECK:         %[[VAL_14:.*]] = arith.constant 1 : i32
-! CHECK:         %[[VAL_15:.*]] = arith.constant 9 : i32
-! CHECK:         %[[VAL_16:.*]] = arith.constant 1 : i32
-! CHECK:         %[[VAL_17:.*]] = arith.constant 4 : i32
-! CHECK:         omp.wsloop schedule(static = %[[VAL_17]] : i32) nowait {
-! CHECK-NEXT:      omp.loop_nest (%[[ARG1:.*]]) : i32 = (%[[VAL_14]]) to (%[[VAL_15]]) inclusive step (%[[VAL_16]]) {
-! CHECK:             fir.store %[[ARG1]] to %[[STORE_IV1:.*]] : !fir.ref<i32>
-! CHECK:             %[[VAL_24:.*]] = arith.constant 2 : i32
-! CHECK:             %[[LOAD_IV1:.*]] = fir.load %[[STORE_IV1]] : !fir.ref<i32>
-! CHECK:             %[[VAL_25:.*]] = arith.muli %[[VAL_24]], %[[LOAD_IV1]] : i32
-! CHECK:             {{.*}} = fir.call @_FortranAioOutputInteger32({{.*}}, %[[VAL_25]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-! CHECK:             omp.yield
-! CHECK:           }
-! CHECK:           omp.terminator
-! CHECK:         }
-  
-end do
-!$OMP END DO NOWAIT
-chunk = 6
-!$OMP DO SCHEDULE(static, chunk)
-
-do i=1, 9
-   print*, i*3
-end do
-!$OMP END DO NOWAIT
-! CHECK:         %[[VAL_28:.*]] = arith.constant 6 : i32
-! CHECK:         fir.store %[[VAL_28]] to %[[VAL_0]] : !fir.ref<i32>
-! CHECK:         %[[VAL_29:.*]] = arith.constant 1 : i32
-! CHECK:         %[[VAL_30:.*]] = arith.constant 9 : i32
-! CHECK:         %[[VAL_31:.*]] = arith.constant 1 : i32
-! CHECK:         %[[VAL_32:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
-! CHECK:         omp.wsloop schedule(static = %[[VAL_32]] : i32) nowait {
-! CHECK-NEXT:      omp.loop_nest (%[[ARG2:.*]]) : i32 = (%[[VAL_29]]) to (%[[VAL_30]]) inclusive step (%[[VAL_31]]) {
-! CHECK:             fir.store %[[ARG2]] to %[[STORE_IV2:.*]] : !fir.ref<i32>
-! CHECK:             %[[VAL_39:.*]] = arith.constant 3 : i32
-! CHECK:             %[[LOAD_IV2:.*]] = fir.load %[[STORE_IV2]] : !fir.ref<i32>
-! CHECK:             %[[VAL_40:.*]] = arith.muli %[[VAL_39]], %[[LOAD_IV2]] : i32
-! CHECK:             {{.*}} = fir.call @_FortranAioOutputInteger32({{.*}}, %[[VAL_40]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-! CHECK:             omp.yield
-! CHECK:           }
-! CHECK:           omp.terminator
-! CHECK:         }
-! CHECK:         return
-! CHECK:       }
-
-end
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-collapse.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-collapse.f90
deleted file mode 100644
index a2ba3ebfe196..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-collapse.f90
+++ /dev/null
@@ -1,66 +0,0 @@
-! This test checks lowering of OpenMP DO Directive(Worksharing) with collapse.
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-program wsloop_collapse
-  integer :: i, j, k
-  integer :: a, b, c
-  integer :: x
-! CHECK:         %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"}
-! CHECK:         %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFEb"}
-! CHECK:         %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "c", uniq_name = "_QFEc"}
-! CHECK:         %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
-! CHECK:         %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFEj"}
-! CHECK:         %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFEk"}
-! CHECK:         %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
-  a=3
-! CHECK:         %[[VAL_7:.*]] = arith.constant 3 : i32
-! CHECK:         fir.store %[[VAL_7]] to %[[VAL_0]] : !fir.ref<i32>
-  b=2
-! CHECK:         %[[VAL_8:.*]] = arith.constant 2 : i32
-! CHECK:         fir.store %[[VAL_8]] to %[[VAL_1]] : !fir.ref<i32>
-  c=5
-! CHECK:         %[[VAL_9:.*]] = arith.constant 5 : i32
-! CHECK:         fir.store %[[VAL_9]] to %[[VAL_2]] : !fir.ref<i32>
-  x=0
-! CHECK:         %[[VAL_10:.*]] = arith.constant 0 : i32
-! CHECK:         fir.store %[[VAL_10]] to %[[VAL_6]] : !fir.ref<i32>
-
-  !$omp do collapse(3)
-! CHECK:           %[[VAL_20:.*]] = arith.constant 1 : i32
-! CHECK:           %[[VAL_21:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
-! CHECK:           %[[VAL_22:.*]] = arith.constant 1 : i32
-! CHECK:           %[[VAL_23:.*]] = arith.constant 1 : i32
-! CHECK:           %[[VAL_24:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-! CHECK:           %[[VAL_25:.*]] = arith.constant 1 : i32
-! CHECK:           %[[VAL_26:.*]] = arith.constant 1 : i32
-! CHECK:           %[[VAL_27:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
-! CHECK:           %[[VAL_28:.*]] = arith.constant 1 : i32
-  do i = 1, a
-     do j= 1, b
-        do k = 1, c
-! CHECK:           omp.wsloop {
-! CHECK-NEXT:        omp.loop_nest (%[[ARG0:.*]], %[[ARG1:.*]], %[[ARG2:.*]]) : i32 = (%[[VAL_20]], %[[VAL_23]], %[[VAL_26]]) to (%[[VAL_21]], %[[VAL_24]], %[[VAL_27]]) inclusive step (%[[VAL_22]], %[[VAL_25]], %[[VAL_28]]) {
-! CHECK:               fir.store %[[ARG0]] to %[[STORE_IV0:.*]] : !fir.ref<i32>
-! CHECK:               fir.store %[[ARG1]] to %[[STORE_IV1:.*]] : !fir.ref<i32>
-! CHECK:               fir.store %[[ARG2]] to %[[STORE_IV2:.*]] : !fir.ref<i32>
-! CHECK:               %[[VAL_12:.*]] = fir.load %[[VAL_6]] : !fir.ref<i32>
-! CHECK:               %[[LOAD_IV0:.*]] = fir.load %[[STORE_IV0]] : !fir.ref<i32>
-! CHECK:               %[[VAL_13:.*]] = arith.addi %[[VAL_12]], %[[LOAD_IV0]] : i32
-! CHECK:               %[[LOAD_IV1:.*]] = fir.load %[[STORE_IV1]] : !fir.ref<i32>
-! CHECK:               %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[LOAD_IV1]] : i32
-! CHECK:               %[[LOAD_IV2:.*]] = fir.load %[[STORE_IV2]] : !fir.ref<i32>
-! CHECK:               %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[LOAD_IV2]] : i32
-! CHECK:               fir.store %[[VAL_15]] to %[[VAL_6]] : !fir.ref<i32>
-! CHECK:               omp.yield
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-           x = x + i + j + k
-        end do
-     end do
-  end do
-  !$omp end do
-! CHECK:         return
-! CHECK:       }
-end program wsloop_collapse
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-monotonic.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-monotonic.f90
deleted file mode 100644
index 941885bdb1e3..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-monotonic.f90
+++ /dev/null
@@ -1,38 +0,0 @@
-! This test checks lowering of OpenMP DO Directive (Worksharing) with
-! monotonic schedule modifier.
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-program wsloop_dynamic
-  integer :: i
-!CHECK-LABEL: func @_QQmain()
-
-!$OMP PARALLEL
-!CHECK:  omp.parallel {
-
-!$OMP DO SCHEDULE(monotonic:dynamic)
-!CHECK:      %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-!CHECK:      %[[WS_LB:.*]] = arith.constant 1 : i32
-!CHECK:      %[[WS_UB:.*]] = arith.constant 9 : i32
-!CHECK:      %[[WS_STEP:.*]] = arith.constant 1 : i32
-!CHECK:      omp.wsloop schedule(dynamic, monotonic) nowait {
-!CHECK-NEXT:   omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
-!CHECK:          fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
-
-  do i=1, 9
-    print*, i
-!CHECK:          %[[RTBEGIN:.*]] = fir.call @_FortranAioBeginExternalListOutput
-!CHECK:          %[[LOAD:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-!CHECK:          fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[LOAD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-!CHECK:          fir.call @_FortranAioEndIoStatement(%[[RTBEGIN]]) {{.*}}: (!fir.ref<i8>) -> i32
-  end do
-!CHECK:          omp.yield
-!CHECK:        }
-!CHECK:        omp.terminator
-!CHECK:      }
-!CHECK:      omp.terminator
-!CHECK:    }
-
-!$OMP END DO NOWAIT
-!$OMP END PARALLEL
-end
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-nonmonotonic.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-nonmonotonic.f90
deleted file mode 100644
index 96a3e71f34b1..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-nonmonotonic.f90
+++ /dev/null
@@ -1,39 +0,0 @@
-! This test checks lowering of OpenMP DO Directive(Worksharing) with
-! non-monotonic schedule modifier.
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-program wsloop_dynamic
-  integer :: i
-!CHECK-LABEL: func @_QQmain()
-
-
-!$OMP PARALLEL
-!CHECK:  omp.parallel {
-
-!$OMP DO SCHEDULE(nonmonotonic:dynamic)
-!CHECK:      %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-!CHECK:      %[[WS_LB:.*]] = arith.constant 1 : i32
-!CHECK:      %[[WS_UB:.*]] = arith.constant 9 : i32
-!CHECK:      %[[WS_STEP:.*]] = arith.constant 1 : i32
-!CHECK:      omp.wsloop schedule(dynamic, nonmonotonic) nowait {
-!CHECK-NEXT:   omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
-!CHECK:          fir.store %[[I]] to %[[ALLOCA_IV]] : !fir.ref<i32>
-
-  do i=1, 9
-    print*, i
-!CHECK:          %[[RTBEGIN:.*]] = fir.call @_FortranAioBeginExternalListOutput
-!CHECK:          %[[LOAD:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-!CHECK:          fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[LOAD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-!CHECK:          fir.call @_FortranAioEndIoStatement(%[[RTBEGIN]]) {{.*}}: (!fir.ref<i8>) -> i32
-  end do
-!CHECK:          omp.yield
-!CHECK:        }
-!CHECK:        omp.terminator
-!CHECK:      }
-!CHECK:    omp.terminator
-!CHECK:  }
-
-!$OMP END DO NOWAIT
-!$OMP END PARALLEL
-end
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-ordered.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-ordered.f90
deleted file mode 100644
index fec027608d99..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-ordered.f90
+++ /dev/null
@@ -1,46 +0,0 @@
-! This test checks lowering of worksharing-loop construct with ordered clause.
-
-! RUN: bbc -fopenmp -emit-fir %s -o - | FileCheck %s
-
-! This checks lowering ordered clause specified without parameter
-subroutine wsloop_ordered_no_para()
-  integer :: a(10), i
-
-! CHECK:      omp.wsloop ordered(0) {
-! CHECK-NEXT:   omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) {
-! CHECK:          omp.yield
-! CHECK:        }
-! CHECK:        omp.terminator
-! CHECK:      }
-
-  !$omp do ordered
-  do i = 2, 10
-    !$omp ordered
-    a(i) = a(i-1) + 1
-    !$omp end ordered
-  end do
-  !$omp end do
-
-end
-
-! This checks lowering ordered clause specified with a parameter
-subroutine wsloop_ordered_with_para()
-  integer :: a(10), i
-
-! CHECK: func @_QPwsloop_ordered_with_para() {
-! CHECK:      omp.wsloop ordered(1) {
-! CHECK-NEXT:   omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) {
-! CHECK:          omp.yield
-! CHECK:        }
-! CHECK:        omp.terminator
-! CHECK:      }
-
-  !$omp do ordered(1)
-  do i = 2, 10
-    !!$omp ordered depend(sink: i-1)
-    a(i) = a(i-1) + 1
-    !!$omp ordered depend(source)
-  end do
-  !$omp end do
-
-end
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90
deleted file mode 100644
index b6dfec09007e..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90
+++ /dev/null
@@ -1,413 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
-! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_f64 : !fir.ref<f64>
-! CHECK-SAME:    init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f64>):
-! CHECK:            %[[C0_1:.*]] = arith.constant 0.000000e+00 : f64
-! CHECK:            %[[REF:.*]] = fir.alloca f64
-! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<f64>
-! CHECK:           omp.yield(%[[REF]] : !fir.ref<f64>)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f64>, %[[ARG1:.*]]: !fir.ref<f64>):
-! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f64>
-! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f64>
-! CHECK:           %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] fastmath<contract> : f64
-! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f64>
-! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f64>)
-! CHECK:         }
-
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_i64 : !fir.ref<i64>
-! CHECK-SAME:    init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i64>):
-! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i64
-! CHECK:            %[[REF:.*]] = fir.alloca i64
-! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i64>
-! CHECK:           omp.yield(%[[REF]] : !fir.ref<i64>)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i64>, %[[ARG1:.*]]: !fir.ref<i64>):
-! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i64>
-! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i64>
-! CHECK:           %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i64
-! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i64>
-! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i64>)
-! CHECK:         }
-
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_f32 : !fir.ref<f32>
-! CHECK-SAME:    init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>):
-! CHECK:            %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:            %[[REF:.*]] = fir.alloca f32
-! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<f32>
-! CHECK:           omp.yield(%[[REF]] : !fir.ref<f32>)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK:           %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] fastmath<contract> : f32
-! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f32>)
-! CHECK:         }
-
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_i32 : !fir.ref<i32>
-! CHECK-SAME:    init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
-! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
-! CHECK:            %[[REF:.*]] = fir.alloca i32
-! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
-! CHECK:           omp.yield(%[[REF]] : !fir.ref<i32>)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK:           %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
-! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
-! CHECK:         }
-
-! CHECK-LABEL:   func.func @_QPsimple_int_reduction() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
-! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
-! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop byref reduction(@add_reduction_byref_i32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK:                 fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
-! CHECK:                 fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-subroutine simple_int_reduction
-  integer :: x
-  x = 0
-  !$omp parallel
-  !$omp do reduction(+:x)
-  do i=1, 100
-    x = x + i
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-
-! CHECK-LABEL:   func.func @_QPsimple_real_reduction() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reductionEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
-! CHECK:           %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop byref reduction(@add_reduction_byref_f32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK:                 fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> f32
-! CHECK:                 %[[VAL_12:.*]] = arith.addf %[[VAL_9]], %[[VAL_11]] fastmath<contract> : f32
-! CHECK:                 fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-subroutine simple_real_reduction
-  real :: x
-  x = 0.0
-  !$omp parallel
-  !$omp do reduction(+:x)
-  do i=1, 100
-    x = x + i
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPsimple_int_reduction_switch_order() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reduction_switch_orderEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
-! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
-! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop byref reduction(@add_reduction_byref_i32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK:                 fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_10:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
-! CHECK:                 fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-subroutine simple_int_reduction_switch_order
-  integer :: x
-  x = 0
-  !$omp parallel
-  !$omp do reduction(+:x)
-  do i=1, 100
-    x = i + x
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPsimple_real_reduction_switch_order() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reduction_switch_orderEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reduction_switch_orderEx"}
-! CHECK:           %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop byref reduction(@add_reduction_byref_f32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK:                 fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> f32
-! CHECK:                 %[[VAL_11:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_12:.*]] = arith.addf %[[VAL_10]], %[[VAL_11]] fastmath<contract> : f32
-! CHECK:                 fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-subroutine simple_real_reduction_switch_order
-  real :: x
-  x = 0.0
-  !$omp parallel
-  !$omp do reduction(+:x)
-  do i=1, 100
-    x = i + x
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPmultiple_int_reductions_same_type() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_int_reductions_same_typeEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_int_reductions_same_typeEx"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
-! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
-! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<i32>
-! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : i32
-! CHECK:           fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<i32>
-! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
-! CHECK:           fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_9:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop byref reduction(@add_reduction_byref_i32 %[[VAL_1]] -> %[[VAL_11:.*]] : !fir.ref<i32>, @add_reduction_byref_i32 %[[VAL_2]] -> %[[VAL_12:.*]] : !fir.ref<i32>, @add_reduction_byref_i32 %[[VAL_3]] -> %[[VAL_13:.*]] : !fir.ref<i32>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
-! CHECK:                 fir.store %[[VAL_14]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_15:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_16:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_17:.*]] = arith.addi %[[VAL_15]], %[[VAL_16]] : i32
-! CHECK:                 fir.store %[[VAL_17]] to %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_18:.*]] = fir.load %[[VAL_12]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_19:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_20:.*]] = arith.addi %[[VAL_18]], %[[VAL_19]] : i32
-! CHECK:                 fir.store %[[VAL_20]] to %[[VAL_12]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_21:.*]] = fir.load %[[VAL_13]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_22:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_23:.*]] = arith.addi %[[VAL_21]], %[[VAL_22]] : i32
-! CHECK:                 fir.store %[[VAL_23]] to %[[VAL_13]] : !fir.ref<i32>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-subroutine multiple_int_reductions_same_type
-  integer :: x,y,z
-  x = 0
-  y = 0
-  z = 0
-  !$omp parallel
-  !$omp do reduction(+:x,y,z)
-  do i=1, 100
-    x = x + i
-    y = y + i
-    z = z + i
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPmultiple_real_reductions_same_type() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_real_reductions_same_typeEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFmultiple_real_reductions_same_typeEx"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
-! CHECK:           %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
-! CHECK:           %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<f32>
-! CHECK:           %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:           fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<f32>
-! CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:           fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<f32>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_9:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop byref reduction(@add_reduction_byref_f32 %[[VAL_1]] -> %[[VAL_11:.*]] : !fir.ref<f32>, @add_reduction_byref_f32 %[[VAL_2]] -> %[[VAL_12:.*]] : !fir.ref<f32>, @add_reduction_byref_f32 %[[VAL_3]] -> %[[VAL_13:.*]] : !fir.ref<f32>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
-! CHECK:                 fir.store %[[VAL_14]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_15:.*]] = fir.load %[[VAL_11]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_16:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i32) -> f32
-! CHECK:                 %[[VAL_18:.*]] = arith.addf %[[VAL_15]], %[[VAL_17]] fastmath<contract> : f32
-! CHECK:                 fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_19:.*]] = fir.load %[[VAL_12]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_20:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> f32
-! CHECK:                 %[[VAL_22:.*]] = arith.addf %[[VAL_19]], %[[VAL_21]] fastmath<contract> : f32
-! CHECK:                 fir.store %[[VAL_22]] to %[[VAL_12]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_23:.*]] = fir.load %[[VAL_13]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_24:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> f32
-! CHECK:                 %[[VAL_26:.*]] = arith.addf %[[VAL_23]], %[[VAL_25]] fastmath<contract> : f32
-! CHECK:                 fir.store %[[VAL_26]] to %[[VAL_13]] : !fir.ref<f32>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-subroutine multiple_real_reductions_same_type
-  real :: x,y,z
-  x = 0.0
-  y = 0.0
-  z = 0.0
-  !$omp parallel
-  !$omp do reduction(+:x,y,z)
-  do i=1, 100
-    x = x + i
-    y = y + i
-    z = z + i
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPmultiple_reductions_different_type() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductions_different_typeEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca f64 {bindc_name = "w", uniq_name = "_QFmultiple_reductions_different_typeEw"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_reductions_different_typeEx"}
-! CHECK:           %[[VAL_3:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
-! CHECK:           %[[VAL_4:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
-! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : i32
-! CHECK:           fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<i32>
-! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i64
-! CHECK:           fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<i64>
-! CHECK:           %[[VAL_7:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:           fir.store %[[VAL_7]] to %[[VAL_4]] : !fir.ref<f32>
-! CHECK:           %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f64
-! CHECK:           fir.store %[[VAL_8]] to %[[VAL_1]] : !fir.ref<f64>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_9:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_11:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop byref reduction(@add_reduction_byref_i32 %[[VAL_2]] -> %[[VAL_13:.*]] : !fir.ref<i32>, @add_reduction_byref_i64 %[[VAL_3]] -> %[[VAL_14:.*]] : !fir.ref<i64>, @add_reduction_byref_f32 %[[VAL_4]] -> %[[VAL_15:.*]] : !fir.ref<f32>, @add_reduction_byref_f64 %[[VAL_1]] -> %[[VAL_16:.*]] : !fir.ref<f64>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_17:.*]]) : i32 = (%[[VAL_10]]) to (%[[VAL_11]]) inclusive step (%[[VAL_12]]) {
-! CHECK:                 fir.store %[[VAL_17]] to %[[VAL_9]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_18:.*]] = fir.load %[[VAL_13]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_19:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_20:.*]] = arith.addi %[[VAL_18]], %[[VAL_19]] : i32
-! CHECK:                 fir.store %[[VAL_20]] to %[[VAL_13]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_21:.*]] = fir.load %[[VAL_14]] : !fir.ref<i64>
-! CHECK:                 %[[VAL_22:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i32) -> i64
-! CHECK:                 %[[VAL_24:.*]] = arith.addi %[[VAL_21]], %[[VAL_23]] : i64
-! CHECK:                 fir.store %[[VAL_24]] to %[[VAL_14]] : !fir.ref<i64>
-! CHECK:                 %[[VAL_25:.*]] = fir.load %[[VAL_15]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_26:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (i32) -> f32
-! CHECK:                 %[[VAL_28:.*]] = arith.addf %[[VAL_25]], %[[VAL_27]] fastmath<contract> : f32
-! CHECK:                 fir.store %[[VAL_28]] to %[[VAL_15]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_29:.*]] = fir.load %[[VAL_16]] : !fir.ref<f64>
-! CHECK:                 %[[VAL_30:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (i32) -> f64
-! CHECK:                 %[[VAL_32:.*]] = arith.addf %[[VAL_29]], %[[VAL_31]] fastmath<contract> : f64
-! CHECK:                 fir.store %[[VAL_32]] to %[[VAL_16]] : !fir.ref<f64>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-
-subroutine multiple_reductions_different_type
-  integer :: x
-  integer(kind=8) :: y
-  real :: z
-  real(kind=8) :: w
-  x = 0
-  y = 0
-  z = 0.0
-  w = 0.0
-  !$omp parallel
-  !$omp do reduction(+:x,y,z,w)
-  do i=1, 100
-    x = x + i
-    y = y + i
-    z = z + i
-    w = w + i
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
deleted file mode 100644
index e0b9330b1a6d..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
+++ /dev/null
@@ -1,388 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_f64 : f64 init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: f64):
-! CHECK:           %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
-! CHECK:           omp.yield(%[[VAL_1]] : f64)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: f64, %[[VAL_1:.*]]: f64):
-! CHECK:           %[[VAL_2:.*]] = arith.addf %[[VAL_0]], %[[VAL_1]] fastmath<contract> : f64
-! CHECK:           omp.yield(%[[VAL_2]] : f64)
-! CHECK:         }
-
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_i64 : i64 init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: i64):
-! CHECK:           %[[VAL_1:.*]] = arith.constant 0 : i64
-! CHECK:           omp.yield(%[[VAL_1]] : i64)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: i64, %[[VAL_1:.*]]: i64):
-! CHECK:           %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i64
-! CHECK:           omp.yield(%[[VAL_2]] : i64)
-! CHECK:         }
-
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_f32 : f32 init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: f32):
-! CHECK:           %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:           omp.yield(%[[VAL_1]] : f32)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: f32, %[[VAL_1:.*]]: f32):
-! CHECK:           %[[VAL_2:.*]] = arith.addf %[[VAL_0]], %[[VAL_1]] fastmath<contract> : f32
-! CHECK:           omp.yield(%[[VAL_2]] : f32)
-! CHECK:         }
-
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_i32 : i32 init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: i32):
-! CHECK:           %[[VAL_1:.*]] = arith.constant 0 : i32
-! CHECK:           omp.yield(%[[VAL_1]] : i32)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32):
-! CHECK:           %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i32
-! CHECK:           omp.yield(%[[VAL_2]] : i32)
-! CHECK:         }
-
-! CHECK-LABEL:   func.func @_QPsimple_int_reduction() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
-! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
-! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@add_reduction_i32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>) 
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK:                 fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
-! CHECK:                 fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-subroutine simple_int_reduction
-  integer :: x
-  x = 0
-  !$omp parallel
-  !$omp do reduction(+:x)
-  do i=1, 100
-    x = x + i
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-
-! CHECK-LABEL:   func.func @_QPsimple_real_reduction() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reductionEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
-! CHECK:           %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@add_reduction_f32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK:                 fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> f32
-! CHECK:                 %[[VAL_12:.*]] = arith.addf %[[VAL_9]], %[[VAL_11]] fastmath<contract> : f32
-! CHECK:                 fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-subroutine simple_real_reduction
-  real :: x
-  x = 0.0
-  !$omp parallel
-  !$omp do reduction(+:x)
-  do i=1, 100
-    x = x + i
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPsimple_int_reduction_switch_order() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reduction_switch_orderEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
-! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
-! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@add_reduction_i32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK:                 fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_10:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
-! CHECK:                 fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-subroutine simple_int_reduction_switch_order
-  integer :: x
-  x = 0
-  !$omp parallel
-  !$omp do reduction(+:x)
-  do i=1, 100
-    x = i + x
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPsimple_real_reduction_switch_order() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reduction_switch_orderEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reduction_switch_orderEx"}
-! CHECK:           %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@add_reduction_f32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK:                 fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> f32
-! CHECK:                 %[[VAL_11:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_12:.*]] = arith.addf %[[VAL_10]], %[[VAL_11]] fastmath<contract> : f32
-! CHECK:                 fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-subroutine simple_real_reduction_switch_order
-  real :: x
-  x = 0.0
-  !$omp parallel
-  !$omp do reduction(+:x)
-  do i=1, 100
-    x = i + x
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPmultiple_int_reductions_same_type() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_int_reductions_same_typeEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_int_reductions_same_typeEx"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
-! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
-! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<i32>
-! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : i32
-! CHECK:           fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<i32>
-! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
-! CHECK:           fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_9:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@add_reduction_i32 %[[VAL_1]] -> %[[VAL_11:.*]] : !fir.ref<i32>, @add_reduction_i32 %[[VAL_2]] -> %[[VAL_12:.*]] : !fir.ref<i32>, @add_reduction_i32 %[[VAL_3]] -> %[[VAL_13:.*]] : !fir.ref<i32>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
-! CHECK:                 fir.store %[[VAL_14]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_15:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_16:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_17:.*]] = arith.addi %[[VAL_15]], %[[VAL_16]] : i32
-! CHECK:                 fir.store %[[VAL_17]] to %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_18:.*]] = fir.load %[[VAL_12]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_19:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_20:.*]] = arith.addi %[[VAL_18]], %[[VAL_19]] : i32
-! CHECK:                 fir.store %[[VAL_20]] to %[[VAL_12]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_21:.*]] = fir.load %[[VAL_13]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_22:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_23:.*]] = arith.addi %[[VAL_21]], %[[VAL_22]] : i32
-! CHECK:                 fir.store %[[VAL_23]] to %[[VAL_13]] : !fir.ref<i32>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-subroutine multiple_int_reductions_same_type
-  integer :: x,y,z
-  x = 0
-  y = 0
-  z = 0
-  !$omp parallel
-  !$omp do reduction(+:x,y,z)
-  do i=1, 100
-    x = x + i
-    y = y + i
-    z = z + i
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPmultiple_real_reductions_same_type() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_real_reductions_same_typeEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFmultiple_real_reductions_same_typeEx"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
-! CHECK:           %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
-! CHECK:           %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<f32>
-! CHECK:           %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:           fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<f32>
-! CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:           fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<f32>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_9:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@add_reduction_f32 %[[VAL_1]] -> %[[VAL_11:.*]] : !fir.ref<f32>, @add_reduction_f32 %[[VAL_2]] -> %[[VAL_12:.*]] : !fir.ref<f32>, @add_reduction_f32 %[[VAL_3]] -> %[[VAL_13:.*]] : !fir.ref<f32>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
-! CHECK:                 fir.store %[[VAL_14]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_15:.*]] = fir.load %[[VAL_11]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_16:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i32) -> f32
-! CHECK:                 %[[VAL_18:.*]] = arith.addf %[[VAL_15]], %[[VAL_17]] fastmath<contract> : f32
-! CHECK:                 fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_19:.*]] = fir.load %[[VAL_12]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_20:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> f32
-! CHECK:                 %[[VAL_22:.*]] = arith.addf %[[VAL_19]], %[[VAL_21]] fastmath<contract> : f32
-! CHECK:                 fir.store %[[VAL_22]] to %[[VAL_12]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_23:.*]] = fir.load %[[VAL_13]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_24:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> f32
-! CHECK:                 %[[VAL_26:.*]] = arith.addf %[[VAL_23]], %[[VAL_25]] fastmath<contract> : f32
-! CHECK:                 fir.store %[[VAL_26]] to %[[VAL_13]] : !fir.ref<f32>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-subroutine multiple_real_reductions_same_type
-  real :: x,y,z
-  x = 0.0
-  y = 0.0
-  z = 0.0
-  !$omp parallel
-  !$omp do reduction(+:x,y,z)
-  do i=1, 100
-    x = x + i
-    y = y + i
-    z = z + i
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPmultiple_reductions_different_type() {
-! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductions_different_typeEi"}
-! CHECK:           %[[VAL_1:.*]] = fir.alloca f64 {bindc_name = "w", uniq_name = "_QFmultiple_reductions_different_typeEw"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_reductions_different_typeEx"}
-! CHECK:           %[[VAL_3:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
-! CHECK:           %[[VAL_4:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
-! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : i32
-! CHECK:           fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<i32>
-! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i64
-! CHECK:           fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<i64>
-! CHECK:           %[[VAL_7:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK:           fir.store %[[VAL_7]] to %[[VAL_4]] : !fir.ref<f32>
-! CHECK:           %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f64
-! CHECK:           fir.store %[[VAL_8]] to %[[VAL_1]] : !fir.ref<f64>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_9:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_11:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@add_reduction_i32 %[[VAL_2]] -> %[[VAL_13:.*]] : !fir.ref<i32>, @add_reduction_i64 %[[VAL_3]] -> %[[VAL_14:.*]] : !fir.ref<i64>, @add_reduction_f32 %[[VAL_4]] -> %[[VAL_15:.*]] : !fir.ref<f32>, @add_reduction_f64 %[[VAL_1]] -> %[[VAL_16:.*]] : !fir.ref<f64>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_17:.*]]) : i32 = (%[[VAL_10]]) to (%[[VAL_11]]) inclusive step (%[[VAL_12]]) {
-! CHECK:                 fir.store %[[VAL_17]] to %[[VAL_9]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_18:.*]] = fir.load %[[VAL_13]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_19:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_20:.*]] = arith.addi %[[VAL_18]], %[[VAL_19]] : i32
-! CHECK:                 fir.store %[[VAL_20]] to %[[VAL_13]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_21:.*]] = fir.load %[[VAL_14]] : !fir.ref<i64>
-! CHECK:                 %[[VAL_22:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i32) -> i64
-! CHECK:                 %[[VAL_24:.*]] = arith.addi %[[VAL_21]], %[[VAL_23]] : i64
-! CHECK:                 fir.store %[[VAL_24]] to %[[VAL_14]] : !fir.ref<i64>
-! CHECK:                 %[[VAL_25:.*]] = fir.load %[[VAL_15]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_26:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (i32) -> f32
-! CHECK:                 %[[VAL_28:.*]] = arith.addf %[[VAL_25]], %[[VAL_27]] fastmath<contract> : f32
-! CHECK:                 fir.store %[[VAL_28]] to %[[VAL_15]] : !fir.ref<f32>
-! CHECK:                 %[[VAL_29:.*]] = fir.load %[[VAL_16]] : !fir.ref<f64>
-! CHECK:                 %[[VAL_30:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (i32) -> f64
-! CHECK:                 %[[VAL_32:.*]] = arith.addf %[[VAL_29]], %[[VAL_31]] fastmath<contract> : f64
-! CHECK:                 fir.store %[[VAL_32]] to %[[VAL_16]] : !fir.ref<f64>
-! CHECK:                 omp.yield
-! CHECK:               }
-! CHECK:               omp.terminator
-! CHECK:             }
-! CHECK:             omp.terminator
-! CHECK:           }
-! CHECK:           return
-! CHECK:         }
-
-
-subroutine multiple_reductions_different_type
-  integer :: x
-  integer(kind=8) :: y
-  real :: z
-  real(kind=8) :: w
-  x = 0
-  y = 0
-  z = 0.0
-  w = 0.0
-  !$omp parallel
-  !$omp do reduction(+:x,y,z,w)
-  do i=1, 100
-    x = x + i
-    y = y + i
-    z = z + i
-    w = w + i
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90
deleted file mode 100644
index b25ab84f60fe..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90
+++ /dev/null
@@ -1,48 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
-
-!CHECK-LABEL:   omp.declare_reduction @iand_byref_i32 : !fir.ref<i32>
-!CHECK-SAME:    init {
-!CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
-!CHECK:            %[[C0_1:.*]] = arith.constant -1 : i32
-!CHECK:            %[[REF:.*]] = fir.alloca i32
-!CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
-!CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
-
-!CHECK-LABEL:   } combiner {
-!CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-!CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-!CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!CHECK:           %[[RES:.*]] = arith.andi %[[LD0]], %[[LD1]] : i32
-!CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-!CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
-!CHECK:         }
-
-
-!CHECK-LABEL: @_QPreduction_iand
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop byref reduction(@iand_byref_i32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.andi %[[LPRV]], %[[Y_I]] : i32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_iand(y)
-  integer :: x, y(:)
-  x = 0
-  !$omp parallel
-  !$omp do reduction(iand:x)
-  do i=1, 100
-    x = iand(x, y(i))
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand.f90
deleted file mode 100644
index dfc140d7d5f6..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand.f90
+++ /dev/null
@@ -1,38 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK: omp.declare_reduction @[[IAND_DECLARE_I:.*]] : i32 init {
-!CHECK: %[[ZERO_VAL_I:.*]] = arith.constant -1 : i32
-!CHECK: omp.yield(%[[ZERO_VAL_I]] : i32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
-!CHECK: %[[IAND_VAL_I:.*]] = arith.andi %[[ARG0_I]], %[[ARG1_I]] : i32
-!CHECK: omp.yield(%[[IAND_VAL_I]] : i32)
-
-!CHECK-LABEL: @_QPreduction_iand
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop reduction(@[[IAND_DECLARE_I]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.andi %[[LPRV]], %[[Y_I]] : i32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_iand(y)
-  integer :: x, y(:)
-  x = 0
-  !$omp parallel
-  !$omp do reduction(iand:x)
-  do i=1, 100
-    x = iand(x, y(i))
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90
deleted file mode 100644
index 56eb087bae5a..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90
+++ /dev/null
@@ -1,47 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -mmlir --force-byref-reduction -fopenmp %s -o - | FileCheck %s
-
-! CHECK-LABEL:   omp.declare_reduction @ieor_byref_i32 : !fir.ref<i32>
-! CHECK-SAME:    init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
-! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
-! CHECK:            %[[REF:.*]] = fir.alloca i32
-! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
-! CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK:           %[[RES:.*]] = arith.xori %[[LD0]], %[[LD1]] : i32
-! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
-! CHECK:         }
-
-!CHECK-LABEL: @_QPreduction_ieor
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop byref reduction(@ieor_byref_i32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.xori %[[LPRV]], %[[Y_I]] : i32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_ieor(y)
-  integer :: x, y(:)
-  x = 0
-  !$omp parallel
-  !$omp do reduction(ieor:x)
-  do i=1, 100
-    x = ieor(x, y(i))
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor.f90
deleted file mode 100644
index 1ddf82b828cb..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor.f90
+++ /dev/null
@@ -1,38 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK: omp.declare_reduction @[[IEOR_DECLARE_I:.*]] : i32 init {
-!CHECK: %[[ZERO_VAL_I:.*]] = arith.constant 0 : i32
-!CHECK: omp.yield(%[[ZERO_VAL_I]] : i32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
-!CHECK: %[[IEOR_VAL_I:.*]] = arith.xori %[[ARG0_I]], %[[ARG1_I]] : i32
-!CHECK: omp.yield(%[[IEOR_VAL_I]] : i32)
-
-!CHECK-LABEL: @_QPreduction_ieor
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop reduction(@[[IEOR_DECLARE_I]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.xori %[[LPRV]], %[[Y_I]] : i32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_ieor(y)
-  integer :: x, y(:)
-  x = 0
-  !$omp parallel
-  !$omp do reduction(ieor:x)
-  do i=1, 100
-    x = ieor(x, y(i))
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90
deleted file mode 100644
index e761d24cd303..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90
+++ /dev/null
@@ -1,47 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
-
-! CHECK-LABEL:   omp.declare_reduction @ior_byref_i32 : !fir.ref<i32>
-! CHECK-SAME:    init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
-! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
-! CHECK:            %[[REF:.*]] = fir.alloca i32
-! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
-! CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK:           %[[RES:.*]] = arith.ori %[[LD0]], %[[LD1]] : i32
-! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
-! CHECK:         }
-
-!CHECK-LABEL: @_QPreduction_ior
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>> 
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop byref reduction(@ior_byref_i32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.ori %[[LPRV]], %[[Y_I]] : i32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_ior(y)
-  integer :: x, y(:)
-  x = 0
-  !$omp parallel
-  !$omp do reduction(ior:x)
-  do i=1, 100
-    x = ior(x, y(i))
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior.f90
deleted file mode 100644
index 148dbc909bab..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior.f90
+++ /dev/null
@@ -1,38 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK: omp.declare_reduction @[[IOR_DECLARE_I:.*]] : i32 init {
-!CHECK: %[[ZERO_VAL_I:.*]] = arith.constant 0 : i32
-!CHECK: omp.yield(%[[ZERO_VAL_I]] : i32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
-!CHECK: %[[IOR_VAL_I:.*]] = arith.ori %[[ARG0_I]], %[[ARG1_I]] : i32
-!CHECK: omp.yield(%[[IOR_VAL_I]] : i32)
-
-!CHECK-LABEL: @_QPreduction_ior
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>> 
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop reduction(@[[IOR_DECLARE_I]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.ori %[[LPRV]], %[[Y_I]] : i32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_ior(y)
-  integer :: x, y(:)
-  x = 0
-  !$omp parallel
-  !$omp do reduction(ior:x)
-  do i=1, 100
-    x = ior(x, y(i))
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90
deleted file mode 100644
index 17cd02a0ca7f..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90
+++ /dev/null
@@ -1,193 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
-
-! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-! CHECK-LABEL:   omp.declare_reduction @eqv_reduction : !fir.ref<!fir.logical<4>>
-! CHECK-SAME:    init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK:           %[[VAL_1:.*]] = arith.constant true
-! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
-! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
-! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
-! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
-! CHECK:           %[[RES:.*]] = arith.cmpi eq, %[[VAL_2]], %[[VAL_3]] : i1
-! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
-! CHECK:         }
-
-! CHECK-LABEL:   func.func @_QPsimple_reduction(
-! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
-! CHECK:           %[[VAL_3:.*]] = arith.constant true
-! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK:                 fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_12:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64
-! CHECK:                 %[[VAL_14:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_15:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] : i64
-! CHECK:                 %[[VAL_16:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_15]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 omp.yield
-! CHECK:               omp.terminator
-! CHECK:             omp.terminator
-! CHECK:           return
-
-subroutine simple_reduction(y)
-  logical :: x, y(100)
-  x = .true.
-  !$omp parallel
-  !$omp do reduction(.eqv.:x)
-  do i=1, 100
-    x = x .eqv. y(i)
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
-! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
-! CHECK:           %[[VAL_3:.*]] = arith.constant true
-! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK:                 fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
-! CHECK:                 %[[VAL_13:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_14:.*]] = arith.subi %[[VAL_12]], %[[VAL_13]] : i64
-! CHECK:                 %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_17:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 omp.yield
-! CHECK:               omp.terminator
-! CHECK:             omp.terminator
-! CHECK:           return
-
-subroutine simple_reduction_switch_order(y)
-  logical :: x, y(100)
-  x = .true.
-  !$omp parallel
-  !$omp do reduction(.eqv.:x)
-  do i=1, 100
-  x = y(i) .eqv. x
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPmultiple_reductions(
-! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
-! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
-! CHECK:           %[[VAL_4:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
-! CHECK:           %[[VAL_5:.*]] = arith.constant true
-! CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK:           %[[VAL_7:.*]] = arith.constant true
-! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<!fir.logical<4>>
-! CHECK:           %[[VAL_9:.*]] = arith.constant true
-! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_10]] to %[[VAL_4]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_3]] -> %[[VAL_16:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_4]] -> %[[VAL_17:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_18:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
-! CHECK:                 fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_19:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_20:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
-! CHECK:                 %[[VAL_22:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_23:.*]] = arith.subi %[[VAL_21]], %[[VAL_22]] : i64
-! CHECK:                 %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_23]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_26:.*]] = fir.convert %[[VAL_19]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_28:.*]] = arith.cmpi eq, %[[VAL_26]], %[[VAL_27]] : i1
-! CHECK:                 %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_29]] to %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_30:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_31:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
-! CHECK:                 %[[VAL_33:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64
-! CHECK:                 %[[VAL_35:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_34]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_38:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_39:.*]] = arith.cmpi eq, %[[VAL_37]], %[[VAL_38]] : i1
-! CHECK:                 %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_40]] to %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_41:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_42:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (i32) -> i64
-! CHECK:                 %[[VAL_44:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_45:.*]] = arith.subi %[[VAL_43]], %[[VAL_44]] : i64
-! CHECK:                 %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_45]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_48:.*]] = fir.convert %[[VAL_41]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_50:.*]] = arith.cmpi eq, %[[VAL_48]], %[[VAL_49]] : i1
-! CHECK:                 %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_51]] to %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 omp.yield
-! CHECK:               omp.terminator
-! CHECK:             omp.terminator
-! CHECK:           return
-
-subroutine multiple_reductions(w)
-  logical :: x,y,z,w(100)
-  x = .true.
-  y = .true.
-  z = .true.
-  !$omp parallel
-  !$omp do reduction(.eqv.:x,y,z)
-  do i=1, 100
-  x = x .eqv. w(i)
-  y = y .eqv. w(i)
-  z = z .eqv. w(i)
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv.f90
deleted file mode 100644
index e714e45540c3..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv.f90
+++ /dev/null
@@ -1,187 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-! CHECK-LABEL:   omp.declare_reduction @eqv_reduction : !fir.logical<4> init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.logical<4>):
-! CHECK:           %[[VAL_1:.*]] = arith.constant true
-! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
-! CHECK:           omp.yield(%[[VAL_2]] : !fir.logical<4>)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.logical<4>, %[[VAL_1:.*]]: !fir.logical<4>):
-! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.logical<4>) -> i1
-! CHECK:           %[[VAL_3:.*]] = fir.convert %[[VAL_1]] : (!fir.logical<4>) -> i1
-! CHECK:           %[[VAL_4:.*]] = arith.cmpi eq, %[[VAL_2]], %[[VAL_3]] : i1
-! CHECK:           %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i1) -> !fir.logical<4>
-! CHECK:           omp.yield(%[[VAL_5]] : !fir.logical<4>)
-! CHECK:         }
-
-! CHECK-LABEL:   func.func @_QPsimple_reduction(
-! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
-! CHECK:           %[[VAL_3:.*]] = arith.constant true
-! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK:                 fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_12:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64
-! CHECK:                 %[[VAL_14:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_15:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] : i64
-! CHECK:                 %[[VAL_16:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_15]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 omp.yield
-! CHECK:               omp.terminator
-! CHECK:             omp.terminator
-! CHECK:           return
-
-subroutine simple_reduction(y)
-  logical :: x, y(100)
-  x = .true.
-  !$omp parallel
-  !$omp do reduction(.eqv.:x)
-  do i=1, 100
-    x = x .eqv. y(i)
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
-! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
-! CHECK:           %[[VAL_3:.*]] = arith.constant true
-! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK:                 fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
-! CHECK:                 %[[VAL_13:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_14:.*]] = arith.subi %[[VAL_12]], %[[VAL_13]] : i64
-! CHECK:                 %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_17:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 omp.yield
-! CHECK:               omp.terminator
-! CHECK:             omp.terminator
-! CHECK:           return
-
-subroutine simple_reduction_switch_order(y)
-  logical :: x, y(100)
-  x = .true.
-  !$omp parallel
-  !$omp do reduction(.eqv.:x)
-  do i=1, 100
-  x = y(i) .eqv. x
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPmultiple_reductions(
-! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
-! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
-! CHECK:           %[[VAL_4:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
-! CHECK:           %[[VAL_5:.*]] = arith.constant true
-! CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK:           %[[VAL_7:.*]] = arith.constant true
-! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<!fir.logical<4>>
-! CHECK:           %[[VAL_9:.*]] = arith.constant true
-! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_10]] to %[[VAL_4]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_3]] -> %[[VAL_16:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_4]] -> %[[VAL_17:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_18:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
-! CHECK:                 fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_19:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_20:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
-! CHECK:                 %[[VAL_22:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_23:.*]] = arith.subi %[[VAL_21]], %[[VAL_22]] : i64
-! CHECK:                 %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_23]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_26:.*]] = fir.convert %[[VAL_19]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_28:.*]] = arith.cmpi eq, %[[VAL_26]], %[[VAL_27]] : i1
-! CHECK:                 %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_29]] to %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_30:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_31:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
-! CHECK:                 %[[VAL_33:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64
-! CHECK:                 %[[VAL_35:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_34]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_38:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_39:.*]] = arith.cmpi eq, %[[VAL_37]], %[[VAL_38]] : i1
-! CHECK:                 %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_40]] to %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_41:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_42:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (i32) -> i64
-! CHECK:                 %[[VAL_44:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_45:.*]] = arith.subi %[[VAL_43]], %[[VAL_44]] : i64
-! CHECK:                 %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_45]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_48:.*]] = fir.convert %[[VAL_41]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_50:.*]] = arith.cmpi eq, %[[VAL_48]], %[[VAL_49]] : i1
-! CHECK:                 %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_51]] to %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 omp.yield
-! CHECK:               omp.terminator
-! CHECK:             omp.terminator
-! CHECK:           return
-
-subroutine multiple_reductions(w)
-  logical :: x,y,z,w(100)
-  x = .true.
-  y = .true.
-  z = .true.
-  !$omp parallel
-  !$omp do reduction(.eqv.:x,y,z)
-  do i=1, 100
-  x = x .eqv. w(i)
-  y = y .eqv. w(i)
-  z = z .eqv. w(i)
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90
deleted file mode 100644
index 89d16c3191b2..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90
+++ /dev/null
@@ -1,195 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
-
-! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-
-! CHECK-LABEL:   omp.declare_reduction @neqv_reduction : !fir.ref<!fir.logical<4>>
-! CHECK-SAME:    init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK:           %[[VAL_1:.*]] = arith.constant false
-! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
-! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
-! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
-! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
-! CHECK:           %[[RES:.*]] = arith.cmpi ne, %[[VAL_2]], %[[VAL_3]] : i1
-! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
-! CHECK:         }
-
-! CHECK-LABEL:   func.func @_QPsimple_reduction(
-! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
-! CHECK:           %[[VAL_3:.*]] = arith.constant true
-! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK:                 fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_12:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64
-! CHECK:                 %[[VAL_14:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_15:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] : i64
-! CHECK:                 %[[VAL_16:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_15]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 omp.yield
-! CHECK:               omp.terminator
-! CHECK:             omp.terminator
-! CHECK:           return
-
-subroutine simple_reduction(y)
-  logical :: x, y(100)
-  x = .true.
-  !$omp parallel
-  !$omp do reduction(.neqv.:x)
-  do i=1, 100
-    x = x .neqv. y(i)
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
-! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
-! CHECK:           %[[VAL_3:.*]] = arith.constant true
-! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK:                 fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
-! CHECK:                 %[[VAL_13:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_14:.*]] = arith.subi %[[VAL_12]], %[[VAL_13]] : i64
-! CHECK:                 %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_17:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 omp.yield
-! CHECK:               omp.terminator
-! CHECK:             omp.terminator
-! CHECK:           return
-
-subroutine simple_reduction_switch_order(y)
-  logical :: x, y(100)
-  x = .true.
-  !$omp parallel
-  !$omp do reduction(.neqv.:x)
-  do i=1, 100
-  x = y(i) .neqv. x
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPmultiple_reductions(
-! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
-! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
-! CHECK:           %[[VAL_4:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
-! CHECK:           %[[VAL_5:.*]] = arith.constant true
-! CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK:           %[[VAL_7:.*]] = arith.constant true
-! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<!fir.logical<4>>
-! CHECK:           %[[VAL_9:.*]] = arith.constant true
-! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_10]] to %[[VAL_4]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_3]] -> %[[VAL_16:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_4]] -> %[[VAL_17:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_18:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
-! CHECK:                 fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_19:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_20:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
-! CHECK:                 %[[VAL_22:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_23:.*]] = arith.subi %[[VAL_21]], %[[VAL_22]] : i64
-! CHECK:                 %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_23]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_26:.*]] = fir.convert %[[VAL_19]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_28:.*]] = arith.cmpi ne, %[[VAL_26]], %[[VAL_27]] : i1
-! CHECK:                 %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_29]] to %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_30:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_31:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
-! CHECK:                 %[[VAL_33:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64
-! CHECK:                 %[[VAL_35:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_34]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_38:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_37]], %[[VAL_38]] : i1
-! CHECK:                 %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_40]] to %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_41:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_42:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (i32) -> i64
-! CHECK:                 %[[VAL_44:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_45:.*]] = arith.subi %[[VAL_43]], %[[VAL_44]] : i64
-! CHECK:                 %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_45]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_48:.*]] = fir.convert %[[VAL_41]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_50:.*]] = arith.cmpi ne, %[[VAL_48]], %[[VAL_49]] : i1
-! CHECK:                 %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_51]] to %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 omp.yield
-! CHECK:               omp.terminator
-! CHECK:             omp.terminator
-! CHECK:           return
-
-
-subroutine multiple_reductions(w)
-  logical :: x,y,z,w(100)
-  x = .true.
-  y = .true.
-  z = .true.
-  !$omp parallel
-  !$omp do reduction(.neqv.:x,y,z)
-  do i=1, 100
-  x = x .neqv. w(i)
-  y = y .neqv. w(i)
-  z = z .neqv. w(i)
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv.f90
deleted file mode 100644
index 106e867f367b..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv.f90
+++ /dev/null
@@ -1,189 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-
-! CHECK-LABEL:   omp.declare_reduction @neqv_reduction : !fir.logical<4> init {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.logical<4>):
-! CHECK:           %[[VAL_1:.*]] = arith.constant false
-! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
-! CHECK:           omp.yield(%[[VAL_2]] : !fir.logical<4>)
-
-! CHECK-LABEL:   } combiner {
-! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.logical<4>, %[[VAL_1:.*]]: !fir.logical<4>):
-! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.logical<4>) -> i1
-! CHECK:           %[[VAL_3:.*]] = fir.convert %[[VAL_1]] : (!fir.logical<4>) -> i1
-! CHECK:           %[[VAL_4:.*]] = arith.cmpi ne, %[[VAL_2]], %[[VAL_3]] : i1
-! CHECK:           %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i1) -> !fir.logical<4>
-! CHECK:           omp.yield(%[[VAL_5]] : !fir.logical<4>)
-! CHECK:         }
-
-! CHECK-LABEL:   func.func @_QPsimple_reduction(
-! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
-! CHECK:           %[[VAL_3:.*]] = arith.constant true
-! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK:                 fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_12:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64
-! CHECK:                 %[[VAL_14:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_15:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] : i64
-! CHECK:                 %[[VAL_16:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_15]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 omp.yield
-! CHECK:               omp.terminator
-! CHECK:             omp.terminator
-! CHECK:           return
-
-subroutine simple_reduction(y)
-  logical :: x, y(100)
-  x = .true.
-  !$omp parallel
-  !$omp do reduction(.neqv.:x)
-  do i=1, 100
-    x = x .neqv. y(i)
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
-! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
-! CHECK:           %[[VAL_3:.*]] = arith.constant true
-! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK:                 fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_11:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
-! CHECK:                 %[[VAL_13:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_14:.*]] = arith.subi %[[VAL_12]], %[[VAL_13]] : i64
-! CHECK:                 %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_17:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 omp.yield
-! CHECK:               omp.terminator
-! CHECK:             omp.terminator
-! CHECK:           return
-
-subroutine simple_reduction_switch_order(y)
-  logical :: x, y(100)
-  x = .true.
-  !$omp parallel
-  !$omp do reduction(.neqv.:x)
-  do i=1, 100
-  x = y(i) .neqv. x
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL:   func.func @_QPmultiple_reductions(
-! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
-! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
-! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
-! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
-! CHECK:           %[[VAL_4:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
-! CHECK:           %[[VAL_5:.*]] = arith.constant true
-! CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK:           %[[VAL_7:.*]] = arith.constant true
-! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<!fir.logical<4>>
-! CHECK:           %[[VAL_9:.*]] = arith.constant true
-! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4>
-! CHECK:           fir.store %[[VAL_10]] to %[[VAL_4]] : !fir.ref<!fir.logical<4>>
-! CHECK:           omp.parallel {
-! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
-! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_3]] -> %[[VAL_16:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_4]] -> %[[VAL_17:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT:          omp.loop_nest (%[[VAL_18:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
-! CHECK:                 fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_19:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_20:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
-! CHECK:                 %[[VAL_22:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_23:.*]] = arith.subi %[[VAL_21]], %[[VAL_22]] : i64
-! CHECK:                 %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_23]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_26:.*]] = fir.convert %[[VAL_19]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_28:.*]] = arith.cmpi ne, %[[VAL_26]], %[[VAL_27]] : i1
-! CHECK:                 %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_29]] to %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_30:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_31:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
-! CHECK:                 %[[VAL_33:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64
-! CHECK:                 %[[VAL_35:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_34]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_38:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_37]], %[[VAL_38]] : i1
-! CHECK:                 %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_40]] to %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_41:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_42:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK:                 %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (i32) -> i64
-! CHECK:                 %[[VAL_44:.*]] = arith.constant 1 : i64
-! CHECK:                 %[[VAL_45:.*]] = arith.subi %[[VAL_43]], %[[VAL_44]] : i64
-! CHECK:                 %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_45]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 %[[VAL_48:.*]] = fir.convert %[[VAL_41]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (!fir.logical<4>) -> i1
-! CHECK:                 %[[VAL_50:.*]] = arith.cmpi ne, %[[VAL_48]], %[[VAL_49]] : i1
-! CHECK:                 %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (i1) -> !fir.logical<4>
-! CHECK:                 fir.store %[[VAL_51]] to %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK:                 omp.yield
-! CHECK:               omp.terminator
-! CHECK:             omp.terminator
-! CHECK:           return
-
-
-subroutine multiple_reductions(w)
-  logical :: x,y,z,w(100)
-  x = .true.
-  y = .true.
-  z = .true.
-  !$omp parallel
-  !$omp do reduction(.neqv.:x,y,z)
-  do i=1, 100
-  x = x .neqv. w(i)
-  y = y .neqv. w(i)
-  z = z .neqv. w(i)
-  end do
-  !$omp end do
-  !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90
deleted file mode 100644
index a4244d11a558..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90
+++ /dev/null
@@ -1,95 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
-
-!CHECK: omp.declare_reduction @max_byref_f32 : !fir.ref<f32>
-!CHECK-SAME: init {
-!CHECK:   %[[MINIMUM_VAL:.*]] = arith.constant -3.40282347E+38 : f32
-!CHECK:   %[[REF:.*]] = fir.alloca f32
-!CHECK:   fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<f32>
-!CHECK:   omp.yield(%[[REF]] : !fir.ref<f32>)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-!CHECK:   %[[RES:.*]] = arith.maxnumf %[[LD0]], %[[LD1]] {{.*}}: f32
-!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
-!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<f32>)
-
-!CHECK-LABEL: omp.declare_reduction @max_byref_i32 : !fir.ref<i32>
-!CHECK-SAME: init {
-!CHECK:   %[[MINIMUM_VAL:.*]] = arith.constant -2147483648 : i32
-!CHECK:   fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
-!CHECK:   omp.yield(%[[REF]] : !fir.ref<i32>)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!CHECK:   %[[RES:.*]] = arith.maxsi %[[LD0]], %[[LD1]] : i32
-!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<i32>)
-
-!CHECK-LABEL: @_QPreduction_max_int
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK:   %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
-!CHECK:   omp.parallel
-!CHECK:     omp.wsloop byref reduction(@max_byref_i32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT:  omp.loop_nest
-!CHECK:         %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK:         %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK:         %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK:         %[[RES:.+]] = arith.cmpi sgt, %[[LPRV]], %[[Y_I]] : i32
-!CHECK:         %[[SEL:.+]] = arith.select %[[RES]], %[[LPRV]], %[[Y_I]]
-!CHECK:         fir.store %[[SEL]] to %[[PRV]] : !fir.ref<i32>
-!CHECK:         omp.yield
-!CHECK:       omp.terminator
-!CHECK:     omp.terminator
-
-!CHECK-LABEL: @_QPreduction_max_real
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
-!CHECK:   %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"}
-!CHECK:   omp.parallel
-!CHECK:     omp.wsloop byref reduction(@max_byref_f32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<f32>)
-!CHECK-NEXT:  omp.loop_nest
-!CHECK:         %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
-!CHECK:         %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK:         %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
-!CHECK:         %[[RES:.+]] = arith.cmpf ogt, %[[Y_I]], %[[LPRV]] {{.*}} : f32
-!CHECK:         omp.yield
-!CHECK:       omp.terminator
-!CHECK:     omp.terminator
-
-subroutine reduction_max_int(y)
-  integer :: x, y(:)
-  x = 0
-  !$omp parallel
-  !$omp do reduction(max:x)
-  do i=1, 100
-    x = max(x, y(i))
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
-
-subroutine reduction_max_real(y)
-  real :: x, y(:)
-  x = 0.0
-  !$omp parallel
-  !$omp do reduction(max:x)
-  do i=1, 100
-    x = max(y(i), x)
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-
-  !$omp parallel
-  !$omp do reduction(max:x)
-  do i=1, 100
-    !CHECK-NOT: omp.reduction
-    if (y(i) .gt. x) x = y(i)
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max.f90
deleted file mode 100644
index e000bc36ca3f..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max.f90
+++ /dev/null
@@ -1,84 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!CHECK: omp.declare_reduction @[[MAX_DECLARE_F:.*]] : f32 init {
-!CHECK:   %[[MINIMUM_VAL_F:.*]] = arith.constant -3.40282347E+38 : f32
-!CHECK:   omp.yield(%[[MINIMUM_VAL_F]] : f32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_F:.*]]: f32, %[[ARG1_F:.*]]: f32):
-!CHECK:   %[[COMB_VAL_F:.*]] = arith.maxnumf %[[ARG0_F]], %[[ARG1_F]] {{.*}}: f32
-!CHECK:   omp.yield(%[[COMB_VAL_F]] : f32)
-
-!CHECK: omp.declare_reduction @[[MAX_DECLARE_I:.*]] : i32 init {
-!CHECK:   %[[MINIMUM_VAL_I:.*]] = arith.constant -2147483648 : i32
-!CHECK:   omp.yield(%[[MINIMUM_VAL_I]] : i32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
-!CHECK:   %[[COMB_VAL_I:.*]] = arith.maxsi %[[ARG0_I]], %[[ARG1_I]] : i32
-!CHECK:   omp.yield(%[[COMB_VAL_I]] : i32)
-
-!CHECK-LABEL: @_QPreduction_max_int
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK:   %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
-!CHECK:   omp.parallel
-!CHECK:     omp.wsloop reduction(@[[MAX_DECLARE_I]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT:  omp.loop_nest
-!CHECK:         %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK:         %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK:         %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK:         %[[RES:.+]] = arith.cmpi sgt, %[[LPRV]], %[[Y_I]] : i32
-!CHECK:         %[[SEL:.+]] = arith.select %[[RES]], %[[LPRV]], %[[Y_I]]
-!CHECK:         fir.store %[[SEL]] to %[[PRV]] : !fir.ref<i32>
-!CHECK:         omp.yield
-!CHECK:       omp.terminator
-!CHECK:     omp.terminator
-
-!CHECK-LABEL: @_QPreduction_max_real
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
-!CHECK:   %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"}
-!CHECK:   omp.parallel
-!CHECK:     omp.wsloop reduction(@[[MAX_DECLARE_F]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<f32>)
-!CHECK-NEXT:  omp.loop_nest
-!CHECK:         %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
-!CHECK:         %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK:         %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
-!CHECK:         %[[RES:.+]] = arith.cmpf ogt, %[[Y_I]], %[[LPRV]] {{.*}} : f32
-!CHECK:         omp.yield
-!CHECK:       omp.terminator
-!CHECK:     omp.terminator
-
-subroutine reduction_max_int(y)
-  integer :: x, y(:)
-  x = 0
-  !$omp parallel
-  !$omp do reduction(max:x)
-  do i=1, 100
-    x = max(x, y(i))
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
-
-subroutine reduction_max_real(y)
-  real :: x, y(:)
-  x = 0.0
-  !$omp parallel
-  !$omp do reduction(max:x)
-  do i=1, 100
-    x = max(y(i), x)
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-
-  !$omp parallel
-  !$omp do reduction(max:x)
-  do i=1, 100
-    !CHECK-NOT: omp.reduction
-    if (y(i) .gt. x) x = y(i)
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90
deleted file mode 100644
index 17435e1a194c..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90
+++ /dev/null
@@ -1,95 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -mmlir --force-byref-reduction -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!CHECK: omp.declare_reduction @min_byref_f32 : !fir.ref<f32>
-!CHECK-SAME: init {
-!CHECK:   %[[MAXIMUM_VAL:.*]] = arith.constant 3.40282347E+38 : f32
-!CHECK:   %[[REF:.*]] = fir.alloca f32
-!CHECK:   fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref<f32>
-!CHECK:   omp.yield(%[[REF]] : !fir.ref<f32>)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-!CHECK:   %[[RES:.*]] = arith.minnumf %[[LD0]], %[[LD1]] {{.*}}: f32
-!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
-!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<f32>)
-
-!CHECK-LABEL: omp.declare_reduction @min_byref_i32 : !fir.ref<i32>
-!CHECK-SAME: init {
-!CHECK:   %[[MAXIMUM_VAL:.*]] = arith.constant 2147483647 : i32
-!CHECK:   fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
-!CHECK:   omp.yield(%[[REF]] : !fir.ref<i32>)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!CHECK:   %[[RES:.*]] = arith.minsi %[[LD0]], %[[LD1]] : i32
-!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<i32>)
-
-!CHECK-LABEL: @_QPreduction_min_int
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK:   %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"}
-!CHECK:   omp.parallel
-!CHECK:     omp.wsloop byref reduction(@min_byref_i32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT:  omp.loop_nest
-!CHECK:         %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK:         %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK:         %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK:         %[[RES:.+]] = arith.cmpi slt, %[[LPRV]], %[[Y_I]] : i32
-!CHECK:         %[[SEL:.+]] = arith.select %[[RES]], %[[LPRV]], %[[Y_I]]
-!CHECK:         fir.store %[[SEL]] to %[[PRV]] : !fir.ref<i32>
-!CHECK:         omp.yield
-!CHECK:       omp.terminator
-!CHECK:     omp.terminator
-
-!CHECK-LABEL: @_QPreduction_min_real
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
-!CHECK:   %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"}
-!CHECK:   omp.parallel
-!CHECK:     omp.wsloop byref reduction(@min_byref_f32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<f32>)
-!CHECK-NEXT:  omp.loop_nest
-!CHECK:         %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
-!CHECK:         %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK:         %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
-!CHECK:         %[[RES:.+]] = arith.cmpf ogt, %[[Y_I]], %[[LPRV]] {{.*}} : f32
-!CHECK:         omp.yield
-!CHECK:       omp.terminator
-!CHECK:     omp.terminator
-
-subroutine reduction_min_int(y)
-  integer :: x, y(:)
-  x = 0
-  !$omp parallel
-  !$omp do reduction(min:x)
-  do i=1, 100
-    x = min(x, y(i))
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
-
-subroutine reduction_min_real(y)
-  real :: x, y(:)
-  x = 0.0
-  !$omp parallel
-  !$omp do reduction(min:x)
-  do i=1, 100
-    x = min(y(i), x)
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-
-  !$omp parallel
-  !$omp do reduction(min:x)
-  do i=1, 100
-  !CHECK-NOT: omp.reduction
-    if (y(i) .gt. x) x = y(i)
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min.f90
deleted file mode 100644
index 1d18ece7297d..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min.f90
+++ /dev/null
@@ -1,84 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!CHECK: omp.declare_reduction @[[MIN_DECLARE_F:.*]] : f32 init {
-!CHECK:   %[[MAXIMUM_VAL_F:.*]] = arith.constant 3.40282347E+38 : f32
-!CHECK:   omp.yield(%[[MAXIMUM_VAL_F]] : f32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_F:.*]]: f32, %[[ARG1_F:.*]]: f32):
-!CHECK:   %[[COMB_VAL_F:.*]] = arith.minnumf %[[ARG0_F]], %[[ARG1_F]] {{.*}}: f32
-!CHECK:   omp.yield(%[[COMB_VAL_F]] : f32)
-
-!CHECK: omp.declare_reduction @[[MIN_DECLARE_I:.*]] : i32 init {
-!CHECK:   %[[MAXIMUM_VAL_I:.*]] = arith.constant 2147483647 : i32
-!CHECK:   omp.yield(%[[MAXIMUM_VAL_I]] : i32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
-!CHECK:   %[[COMB_VAL_I:.*]] = arith.minsi %[[ARG0_I]], %[[ARG1_I]] : i32
-!CHECK:   omp.yield(%[[COMB_VAL_I]] : i32)
-
-!CHECK-LABEL: @_QPreduction_min_int
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK:   %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"}
-!CHECK:   omp.parallel
-!CHECK:     omp.wsloop reduction(@[[MIN_DECLARE_I]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT:  omp.loop_nest
-!CHECK:         %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK:         %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK:         %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK:         %[[RES:.+]] = arith.cmpi slt, %[[LPRV]], %[[Y_I]] : i32
-!CHECK:         %[[SEL:.+]] = arith.select %[[RES]], %[[LPRV]], %[[Y_I]]
-!CHECK:         fir.store %[[SEL]] to %[[PRV]] : !fir.ref<i32>
-!CHECK:         omp.yield
-!CHECK:       omp.terminator
-!CHECK:     omp.terminator
-
-!CHECK-LABEL: @_QPreduction_min_real
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
-!CHECK:   %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"}
-!CHECK:   omp.parallel
-!CHECK:     omp.wsloop reduction(@[[MIN_DECLARE_F]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<f32>)
-!CHECK-NEXT:  omp.loop_nest
-!CHECK:         %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
-!CHECK:         %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK:         %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
-!CHECK:         %[[RES:.+]] = arith.cmpf ogt, %[[Y_I]], %[[LPRV]] {{.*}} : f32
-!CHECK:         omp.yield
-!CHECK:       omp.terminator
-!CHECK:     omp.terminator
-
-subroutine reduction_min_int(y)
-  integer :: x, y(:)
-  x = 0
-  !$omp parallel
-  !$omp do reduction(min:x)
-  do i=1, 100
-    x = min(x, y(i))
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
-
-subroutine reduction_min_real(y)
-  real :: x, y(:)
-  x = 0.0
-  !$omp parallel
-  !$omp do reduction(min:x)
-  do i=1, 100
-    x = min(y(i), x)
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-
-  !$omp parallel
-  !$omp do reduction(min:x)
-  do i=1, 100
-  !CHECK-NOT: omp.reduction
-    if (y(i) .gt. x) x = y(i)
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-simd.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-simd.f90
deleted file mode 100644
index 751e4c8c5709..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-simd.f90
+++ /dev/null
@@ -1,37 +0,0 @@
-! This test checks lowering of OpenMP DO Directive(Worksharing) with
-! simd schedule modifier.
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-program wsloop_dynamic
-  integer :: i
-!CHECK-LABEL: func @_QQmain()
-
-!$OMP PARALLEL
-!CHECK:  omp.parallel {
-
-!$OMP DO SCHEDULE(simd: runtime)
-!CHECK:      %[[WS_LB:.*]] = arith.constant 1 : i32
-!CHECK:      %[[WS_UB:.*]] = arith.constant 9 : i32
-!CHECK:      %[[WS_STEP:.*]] = arith.constant 1 : i32
-!CHECK:      omp.wsloop schedule(runtime, simd) nowait {
-!CHECK-NEXT:   omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
-!CHECK:          fir.store %[[I]] to %[[STORE:.*]] : !fir.ref<i32>
-
-  do i=1, 9
-    print*, i
-!CHECK:          %[[RTBEGIN:.*]] = fir.call @_FortranAioBeginExternalListOutput
-!CHECK:          %[[LOAD:.*]] = fir.load %[[STORE]] : !fir.ref<i32>
-!CHECK:          fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[LOAD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-!CHECK:          fir.call @_FortranAioEndIoStatement(%[[RTBEGIN]]) {{.*}}: (!fir.ref<i8>) -> i32
-  end do
-!CHECK:          omp.yield
-!CHECK:        }
-!CHECK:        omp.terminator
-!CHECK:      }
-!CHECK:      omp.terminator
-!CHECK:    }
-
-!$OMP END DO NOWAIT
-!$OMP END PARALLEL
-end
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-variable.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-variable.f90
deleted file mode 100644
index 4bd876012278..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-variable.f90
+++ /dev/null
@@ -1,187 +0,0 @@
-! This test checks lowering of OpenMP DO Directive(Worksharing) for different
-! types of loop iteration variable, lower bound, upper bound, and step.
-
-!REQUIRES: shell
-!RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - 2>&1 | FileCheck %s
-
-!CHECK:  OpenMP loop iteration variable cannot have more than 64 bits size and will be narrowed into 64 bits.
-
-program wsloop_variable
-  integer(kind=1) :: i1_lb, i1_ub
-  integer(kind=2) :: i2, i2_ub, i2_s
-  integer(kind=4) :: i4_s
-  integer(kind=8) :: i8, i8_s
-  integer(kind=16) :: i16, i16_lb
-  real :: x
-
-!CHECK:      %[[TMP0:.*]] = arith.constant 1 : i32
-!CHECK:      %[[TMP1:.*]] = arith.constant 100 : i32
-!CHECK:      %[[TMP2:.*]] = fir.convert %[[TMP0]] : (i32) -> i64
-!CHECK:      %[[TMP3:.*]] = fir.convert %{{.*}} : (i8) -> i64
-!CHECK:      %[[TMP4:.*]] = fir.convert %{{.*}} : (i16) -> i64
-!CHECK:      %[[TMP5:.*]] = fir.convert %{{.*}} : (i128) -> i64
-!CHECK:      %[[TMP6:.*]] = fir.convert %[[TMP1]] : (i32) -> i64
-!CHECK:      %[[TMP7:.*]] = fir.convert %{{.*}} : (i32) -> i64
-!CHECK:      omp.wsloop {
-!CHECK-NEXT:   omp.loop_nest (%[[ARG0:.*]], %[[ARG1:.*]]) : i64 = (%[[TMP2]], %[[TMP5]]) to (%[[TMP3]], %[[TMP6]]) inclusive step (%[[TMP4]], %[[TMP7]]) {
-!CHECK:          %[[ARG0_I16:.*]] = fir.convert %[[ARG0]] : (i64) -> i16
-!CHECK:          fir.store %[[ARG0_I16]] to %[[STORE_IV0:.*]] : !fir.ref<i16>
-!CHECK:          fir.store %[[ARG1]] to %[[STORE_IV1:.*]] : !fir.ref<i64>
-!CHECK:          %[[LOAD_IV0:.*]] = fir.load %[[STORE_IV0]] : !fir.ref<i16>
-!CHECK:          %[[LOAD_IV0_I64:.*]] = fir.convert %[[LOAD_IV0]] : (i16) -> i64
-!CHECK:          %[[LOAD_IV1:.*]] = fir.load %[[STORE_IV1]] : !fir.ref<i64>
-!CHECK:          %[[TMP10:.*]] = arith.addi %[[LOAD_IV0_I64]], %[[LOAD_IV1]] : i64
-!CHECK:          %[[TMP11:.*]] = fir.convert %[[TMP10]] : (i64) -> f32
-!CHECK:          fir.store %[[TMP11]] to %{{.*}} : !fir.ref<f32>
-!CHECK:          omp.yield
-!CHECK:        }
-!CHECK:        omp.terminator
-!CHECK:      }
-
-  !$omp do collapse(2)
-  do i2 = 1, i1_ub, i2_s
-    do i8 = i16_lb, 100, i4_s
-      x = i2 + i8
-    end do
-  end do
-  !$omp end do
-
-!CHECK:      %[[TMP12:.*]] = arith.constant 1 : i32
-!CHECK:      %[[TMP13:.*]] = fir.convert %{{.*}} : (i8) -> i32
-!CHECK:      %[[TMP14:.*]] = fir.convert %{{.*}} : (i64) -> i32
-!CHECK:      omp.wsloop {
-!CHECK-NEXT:   omp.loop_nest (%[[ARG0:.*]]) : i32 = (%[[TMP12]]) to (%[[TMP13]]) inclusive step (%[[TMP14]]) {
-!CHECK:          %[[ARG0_I16:.*]] = fir.convert %[[ARG0]] : (i32) -> i16
-!CHECK:          fir.store %[[ARG0_I16]] to %[[STORE3:.*]] : !fir.ref<i16>
-!CHECK:          %[[LOAD3:.*]] = fir.load %[[STORE3]] : !fir.ref<i16>
-!CHECK:          %[[TMP16:.*]] = fir.convert %[[LOAD3]] : (i16) -> f32
-!CHECK:          fir.store %[[TMP16]] to %{{.*}} : !fir.ref<f32>
-!CHECK:          omp.yield
-!CHECK:        }
-!CHECK:        omp.terminator
-!CHECK:      }
-
-  !$omp do
-  do i2 = 1, i1_ub, i8_s
-    x = i2
-  end do
-  !$omp end do
-
-!CHECK:      %[[TMP17:.*]] = fir.convert %{{.*}} : (i8) -> i64
-!CHECK:      %[[TMP18:.*]] = fir.convert %{{.*}} : (i16) -> i64
-!CHECK:      %[[TMP19:.*]] = fir.convert %{{.*}} : (i32) -> i64
-!CHECK:      omp.wsloop {
-!CHECK-NEXT:   omp.loop_nest (%[[ARG1:.*]]) : i64 = (%[[TMP17]]) to (%[[TMP18]]) inclusive step (%[[TMP19]]) {
-!CHECK:          %[[ARG1_I128:.*]] = fir.convert %[[ARG1]] : (i64) -> i128
-!CHECK:          fir.store %[[ARG1_I128]] to %[[STORE4:.*]] : !fir.ref<i128>
-!CHECK:          %[[LOAD4:.*]] = fir.load %[[STORE4]] : !fir.ref<i128>
-!CHECK:          %[[TMP21:.*]] = fir.convert %[[LOAD4]] : (i128) -> f32
-!CHECK:          fir.store %[[TMP21]] to %{{.*}} : !fir.ref<f32>
-!CHECK:          omp.yield
-!CHECK:        }
-!CHECK:        omp.terminator
-!CHECK:      }
-
-  !$omp do
-  do i16 = i1_lb, i2_ub, i4_s
-    x = i16
-  end do
-  !$omp end do
-
-end program wsloop_variable
-
-!CHECK-LABEL: func.func @_QPwsloop_variable_sub() {
-!CHECK:         %[[IV2:.*]] = fir.alloca i8 {adapt.valuebyref, pinned}
-!CHECK:         %[[VAL_0:.*]] = fir.alloca i128 {bindc_name = "i16_lb", uniq_name = "_QFwsloop_variable_subEi16_lb"}
-!CHECK:         %[[VAL_1:.*]] = fir.alloca i8 {bindc_name = "i1_ub", uniq_name = "_QFwsloop_variable_subEi1_ub"}
-!CHECK:         %[[VAL_2:.*]] = fir.alloca i16 {bindc_name = "i2", uniq_name = "_QFwsloop_variable_subEi2"}
-!CHECK:         %[[VAL_3:.*]] = fir.alloca i16 {bindc_name = "i2_s", uniq_name = "_QFwsloop_variable_subEi2_s"}
-!CHECK:         %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "i4_s", uniq_name = "_QFwsloop_variable_subEi4_s"}
-!CHECK:         %[[VAL_5:.*]] = fir.alloca i64 {bindc_name = "i8", uniq_name = "_QFwsloop_variable_subEi8"}
-!CHECK:         %[[J1:.*]] = fir.alloca i8 {bindc_name = "j1", uniq_name = "_QFwsloop_variable_subEj1"}
-!CHECK:         %[[VAL_6:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFwsloop_variable_subEx"}
-!CHECK:         %[[VAL_7:.*]] = arith.constant 1 : i32
-!CHECK:         %[[VAL_8:.*]] = fir.load %[[VAL_1]] : !fir.ref<i8>
-!CHECK:         %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i16>
-!CHECK:         %[[VAL_10:.*]] = fir.convert %[[VAL_8]] : (i8) -> i32
-!CHECK:         %[[VAL_11:.*]] = fir.convert %[[VAL_9]] : (i16) -> i32
-!CHECK:         omp.wsloop {
-!CHECK-NEXT:      omp.loop_nest (%[[ARG0:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
-!CHECK:             %[[ARG0_I16:.*]] = fir.convert %[[ARG0]] : (i32) -> i16
-!CHECK:             fir.store %[[ARG0_I16]] to %[[STORE_IV:.*]] : !fir.ref<i16>
-!CHECK:             %[[VAL_13:.*]] = fir.load %[[VAL_0]] : !fir.ref<i128>
-!CHECK:             %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i128) -> index
-!CHECK:             %[[VAL_15:.*]] = arith.constant 100 : i32
-!CHECK:             %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> index
-!CHECK:             %[[VAL_17:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
-!CHECK:             %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (i32) -> index
-!CHECK:             %[[LB:.*]] = fir.convert %[[VAL_14]] : (index) -> i64
-!CHECK:             %[[VAL_19:.*]]:2 = fir.do_loop %[[VAL_20:[^ ]*]] =
-!CHECK-SAME:            %[[VAL_14]] to %[[VAL_16]] step %[[VAL_18]]
-!CHECK-SAME:            iter_args(%[[IV:.*]] = %[[LB]]) -> (index, i64) {
-!CHECK:               fir.store %[[IV]] to %[[VAL_5]] : !fir.ref<i64>
-!CHECK:               %[[LOAD_IV:.*]] = fir.load %[[STORE_IV]] : !fir.ref<i16>
-!CHECK:               %[[VAL_22:.*]] = fir.convert %[[LOAD_IV]] : (i16) -> i64
-!CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
-!CHECK:               %[[VAL_24:.*]] = arith.addi %[[VAL_22]], %[[VAL_23]] : i64
-!CHECK:               %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i64) -> f32
-!CHECK:               fir.store %[[VAL_25]] to %[[VAL_6]] : !fir.ref<f32>
-!CHECK:               %[[VAL_26:.*]] = arith.addi %[[VAL_20]], %[[VAL_18]] : index
-!CHECK:               %[[STEPCAST:.*]] = fir.convert %[[VAL_18]] : (index) -> i64
-!CHECK:               %[[IVLOAD:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
-!CHECK:               %[[IVINC:.*]] = arith.addi %[[IVLOAD]], %[[STEPCAST]]
-!CHECK:               fir.result %[[VAL_26]], %[[IVINC]] : index, i64
-!CHECK:             }
-!CHECK:             fir.store %[[VAL_19]]#1 to %[[VAL_5]] : !fir.ref<i64>
-!CHECK:             omp.yield
-!CHECK:           }
-!CHECK:           omp.terminator
-!CHECK:         }
-
-subroutine wsloop_variable_sub
-  integer(kind=1) :: i1, i1_ub, j1
-  integer(kind=2) :: i2, i2_s
-  integer(kind=4) :: i4_s
-  integer(kind=8) :: i8
-  integer(kind=16) :: i16_lb
-  real :: x
-
-  !$omp do
-  do i2 = 1, i1_ub, i2_s
-    do i8 = i16_lb, 100, i4_s
-      x = i2 + i8
-    end do
-  end do
-  !$omp end do
-
-!CHECK:         %[[C1:.*]] = arith.constant 1 : i32
-!CHECK:         %[[C10:.*]] = arith.constant 10 : i32
-!CHECK:         %[[C1_2:.*]] = arith.constant 1 : i32
-!CHECK:         omp.wsloop {
-!CHECK-NEXT:      omp.loop_nest (%[[ARG0:.*]]) : i32 = (%[[C1]]) to (%[[C10]]) inclusive step (%[[C1_2]]) {
-!CHECK:             %[[ARG0_I8:.*]] = fir.convert %[[ARG0]] : (i32) -> i8
-!CHECK:             fir.store %[[ARG0_I8]] to %[[IV2]] : !fir.ref<i8>
-!CHECK:             %[[IV2LOAD:.*]] = fir.load %[[IV2]] : !fir.ref<i8>
-!CHECK:             %[[J1LOAD:.*]] = fir.load %[[J1]] : !fir.ref<i8>
-!CHECK:             %[[VAL_27:.*]] = arith.cmpi eq, %[[IV2LOAD]], %[[J1LOAD]] : i8
-!CHECK:             fir.if %[[VAL_27]] {
-!CHECK:             } else {
-!CHECK:             }
-!CHECK:             omp.yield
-!CHECK:           }
-!CHECK:           omp.terminator
-!CHECK:         }
-
-  j1 = 5
-  !$omp do
-  do i1 = 1, 10
-    if (i1 .eq. j1) then
-      print *, "EQ"
-    end if
-  end do
-  !$omp end do
-
-!CHECK:         return
-!CHECK:       }
-
-end
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop.f90 b/flang/test/Lower/OpenMP/FIR/wsloop.f90
deleted file mode 100644
index c9e428abdb44..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop.f90
+++ /dev/null
@@ -1,78 +0,0 @@
-! This test checks lowering of OpenMP DO Directive (Worksharing).
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-!CHECK-LABEL: func @_QPsimple_loop()
-subroutine simple_loop
-  integer :: i
-  ! CHECK:  omp.parallel
-  !$OMP PARALLEL
-  ! CHECK:      %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-  ! CHECK:      %[[WS_LB:.*]] = arith.constant 1 : i32
-  ! CHECK:      %[[WS_UB:.*]] = arith.constant 9 : i32
-  ! CHECK:      %[[WS_STEP:.*]] = arith.constant 1 : i32
-  ! CHECK:      omp.wsloop {
-  ! CHECK-NEXT:   omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
-  !$OMP DO
-  do i=1, 9
-  ! CHECK:          fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
-  ! CHECK:          %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! CHECK:          fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  ! CHECK:          omp.yield
-  ! CHECK:        omp.terminator
-  !$OMP END DO
-  ! CHECK:      omp.terminator
-  !$OMP END PARALLEL
-end subroutine
-
-!CHECK-LABEL: func @_QPsimple_loop_with_step()
-subroutine simple_loop_with_step
-  integer :: i
-  ! CHECK:  omp.parallel
-  !$OMP PARALLEL
-  ! CHECK:      %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-  ! CHECK:      %[[WS_LB:.*]] = arith.constant 1 : i32
-  ! CHECK:      %[[WS_UB:.*]] = arith.constant 9 : i32
-  ! CHECK:      %[[WS_STEP:.*]] = arith.constant 2 : i32
-  ! CHECK:      omp.wsloop {
-  ! CHECK-NEXT:   omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
-  ! CHECK:          fir.store %[[I]] to %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! CHECK:          %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-  !$OMP DO
-  do i=1, 9, 2
-  ! CHECK:          fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  ! CHECK:          omp.yield
-  ! CHECK:        omp.terminator
-  !$OMP END DO
-  ! CHECK:      omp.terminator
-  !$OMP END PARALLEL
-end subroutine
-
-!CHECK-LABEL: func @_QPloop_with_schedule_nowait()
-subroutine loop_with_schedule_nowait
-  integer :: i
-  ! CHECK:  omp.parallel
-  !$OMP PARALLEL
-  ! CHECK:      %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-  ! CHECK:      %[[WS_LB:.*]] = arith.constant 1 : i32
-  ! CHECK:      %[[WS_UB:.*]] = arith.constant 9 : i32
-  ! CHECK:      %[[WS_STEP:.*]] = arith.constant 1 : i32
-  ! CHECK:      omp.wsloop schedule(runtime) nowait {
-  ! CHECK-NEXT:   omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
-  !$OMP DO SCHEDULE(runtime)
-  do i=1, 9
-  ! CHECK:          fir.store %[[I]] to %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! CHECK:          %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! CHECK:          fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-    print*, i
-  end do
-  ! CHECK:          omp.yield
-  ! CHECK:        omp.terminator
-  !$OMP END DO NOWAIT
-  ! CHECK:      omp.terminator
-  !$OMP END PARALLEL
-end subroutine
diff --git a/flang/test/Lower/OpenMP/Todo/reduction-array-intrinsic.f90 b/flang/test/Lower/OpenMP/Todo/reduction-array-intrinsic.f90
new file mode 100644
index 000000000000..49c899238d2a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/reduction-array-intrinsic.f90
@@ -0,0 +1,11 @@
+! RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: Reduction of some types is not supported for intrinsics
+subroutine max_array_reduction(l, r)
+  integer :: l(:), r(:)
+
+  !$omp parallel reduction(max:l)
+    l = max(l, r)
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90 b/flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90
new file mode 100644
index 000000000000..44036492f559
--- /dev/null
+++ b/flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90
@@ -0,0 +1,54 @@
+! Tests that CFG & LLVM conversion is applied to `omp.private` ops.
+
+! RUN: split-file %s %t && cd %t
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - test.f90 2>&1 | \
+! RUN: fir-opt --cfg-conversion -o test.cfg-conv.mlir
+! RUN: FileCheck --input-file=test.cfg-conv.mlir %s --check-prefix="CFGConv"
+
+! RUN: fir-opt --convert-hlfir-to-fir --cg-rewrite --fir-to-llvm-ir test.cfg-conv.mlir -o - | \
+! RUN: FileCheck %s --check-prefix="LLVMDialect"
+
+!--- test.f90
+subroutine delayed_privatization_allocatable
+  implicit none
+  integer, allocatable :: var1
+
+!$omp parallel private(var1)
+  var1 = 10
+!$omp end parallel
+end subroutine
+
+! CFGConv-LABEL: omp.private {type = private}
+! CFGConv-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]] alloc {
+
+! CFGConv-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! CFGConv-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_allocatableEvar1"}
+
+! CFGConv-NEXT:   %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CFGConv-NEXT:   %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
+! CFGConv-NEXT:   %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
+! CFGConv-NEXT:   %[[C0:.*]] = arith.constant 0 : i64
+! CFGConv-NEXT:   %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
+! CFGConv-NEXT:   cf.cond_br %[[ALLOC_COND]], ^[[ALLOC_MEM_BB:.*]], ^[[ZERO_MEM_BB:.*]]
+! CFGConv-NEXT: ^[[ALLOC_MEM_BB]]:
+! CFGConv:        fir.allocmem
+! CFGConv:        cf.br ^[[DECL_BB:.*]]
+! CFGConv:      ^[[ZERO_MEM_BB]]:
+! CFGConv-NEXT:   fir.zero_bits
+! CFGConv:        cf.br ^[[DECL_BB:.*]]
+! CFGConv-NEXT: ^[[DECL_BB]]:
+! CFGConv-NEXT:   hlfir.declare
+! CFGConv-NEXT:   omp.yield
+
+
+! LLVMDialect-LABEL: omp.private {type = private}
+! LLVMDialect-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!llvm.ptr]] alloc {
+
+! LLVMDialect-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! LLVMDialect:        llvm.alloca
+! LLVMDialect:        llvm.call @malloc
+
+! LLVMDialect-NOT:    hlfir.declare
diff --git a/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90 b/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90
new file mode 100644
index 000000000000..25579272a6e0
--- /dev/null
+++ b/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90
@@ -0,0 +1,11 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK: @_QPsb
+subroutine sb(a)
+  integer :: a(:)
+!CHECK: omp.parallel
+  !$omp parallel default(private)
+!CHECK: hlfir.elemental
+    if (any(a/=(/(100,i=1,5)/))) print *, "OK"
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90
index cc1818b00b80..f1fae2540aa4 100644
--- a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90
+++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90
@@ -28,7 +28,7 @@ end subroutine
 ! CHECK-NEXT:   %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
 
 ! CHECK-NEXT:   fir.if %[[ALLOC_COND]] {
-! CHECK-NEXT:     %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32 {fir.must_be_heap = true, uniq_name = "_QFdelayed_privatization_allocatableEvar1.alloc"}
+! CHECK:          %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32 {fir.must_be_heap = true, uniq_name = "_QFdelayed_privatization_allocatableEvar1.alloc"}
 ! CHECK-NEXT:     %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
 ! CHECK-NEXT:     fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.heap<i32>>>
 ! CHECK-NEXT:   } else {
@@ -40,4 +40,22 @@ end subroutine
 ! CHECK-NEXT:   %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]]
 ! CHECK-NEXT:   omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])
 
-! CHECK-NEXT: }
+! CHECK-NEXT: } dealloc {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! CHECK-NEXT:   %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]]
+! CHECK-NEXT:   %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]]
+! CHECK-NEXT:   %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]]
+! CHECK-NEXT:   %[[C0:.*]] = arith.constant 0 : i64
+! CHECK-NEXT:   %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64
+
+! CHECK-NEXT:   fir.if %[[PRIV_NULL_COND]] {
+! CHECK:          %[[PRIV_VAL_2:.*]] = fir.load %[[PRIV_ARG]]
+! CHECK-NEXT:     %[[PRIV_ADDR_2:.*]] = fir.box_addr %[[PRIV_VAL_2]]
+! CHECK-NEXT:     fir.freemem %[[PRIV_ADDR_2]]
+! CHECK-NEXT:     %[[ZEROS:.*]] = fir.zero_bits
+! CHECK-NEXT:     %[[ZEROS_BOX:.*]]  = fir.embox %[[ZEROS]]
+! CHECK-NEXT:     fir.store %[[ZEROS_BOX]] to %[[PRIV_ARG]]
+! CHECK-NEXT:   }
+
+! CHECK-NEXT:   omp.yield
diff --git a/flang/test/Lower/OpenMP/if-clause.f90 b/flang/test/Lower/OpenMP/if-clause.f90
index ce4427a0c2ca..7c15c275d8cc 100644
--- a/flang/test/Lower/OpenMP/if-clause.f90
+++ b/flang/test/Lower/OpenMP/if-clause.f90
@@ -1,7 +1,9 @@
 ! This test checks lowering of OpenMP IF clauses.
 
-! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
-! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck %s
+! The "if" clause was added to the "simd" directive in OpenMP 5.0, and
+! to the "teams" directive in OpenMP 5.2.
+! RUN: bbc -fopenmp -fopenmp-version=52 -emit-hlfir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -emit-hlfir %s -o - | FileCheck %s
 
 program main
   integer :: i
diff --git a/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90 b/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
index 93809fde98a2..f8343338112c 100644
--- a/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
+++ b/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
@@ -93,3 +93,107 @@ subroutine multiple_private_fix2()
    !$omp end parallel
       x = 1
 end subroutine
+
+
+! CHECK-LABEL:   func.func @_QPsub01(
+! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "aaa"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_2:.*]] = fir.box_elesize %[[VAL_1]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_2]] {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub01Eaaa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, index) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_4:.*]] = fir.alloca !fir.box<!fir.heap<!fir.char<1,?>>> {bindc_name = "aaa", pinned, uniq_name = "_QFsub01Eaaa"}
+! CHECK:             %[[VAL_5:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:             %[[VAL_6:.*]] = fir.box_addr %[[VAL_5]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
+! CHECK:             %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (!fir.heap<!fir.char<1,?>>) -> i64
+! CHECK:             %[[VAL_8:.*]] = arith.constant 0 : i64
+! CHECK:             %[[VAL_9:.*]] = arith.cmpi ne, %[[VAL_7]], %[[VAL_8]] : i64
+! CHECK:             fir.if %[[VAL_9]] {
+! CHECK:               %[[VAL_10:.*]] = arith.constant 0 : index
+! CHECK:               %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_2]], %[[VAL_10]] : index
+! CHECK:               %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_2]], %[[VAL_10]] : index
+! CHECK:               %[[VAL_13:.*]] = fir.allocmem !fir.char<1,?>(%[[VAL_12]] : index) {fir.must_be_heap = true, uniq_name = "_QFsub01Eaaa.alloc"}
+! CHECK:               %[[VAL_14:.*]] = fir.embox %[[VAL_13]] typeparams %[[VAL_12]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.box<!fir.heap<!fir.char<1,?>>>
+! CHECK:               fir.store %[[VAL_14]] to %[[VAL_4]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:             } else {
+! CHECK:               %[[VAL_15:.*]] = fir.zero_bits !fir.heap<!fir.char<1,?>>
+! CHECK:               %[[VAL_16:.*]] = arith.constant 0 : index
+! CHECK:               %[[VAL_17:.*]] = fir.embox %[[VAL_15]] typeparams %[[VAL_16]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.box<!fir.heap<!fir.char<1,?>>>
+! CHECK:               fir.store %[[VAL_17]] to %[[VAL_4]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:             }
+! CHECK:             %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_4]] {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub01Eaaa"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK:             %[[VAL_19:.*]] = fir.load %[[VAL_18]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:             %[[VAL_20:.*]] = fir.box_addr %[[VAL_19]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
+! CHECK:             %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.heap<!fir.char<1,?>>) -> i64
+! CHECK:             %[[VAL_22:.*]] = arith.constant 0 : i64
+! CHECK:             %[[VAL_23:.*]] = arith.cmpi ne, %[[VAL_21]], %[[VAL_22]] : i64
+! CHECK:             fir.if %[[VAL_23]] {
+! CHECK:               %[[VAL_24:.*]] = fir.load %[[VAL_18]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:               %[[VAL_25:.*]] = fir.box_addr %[[VAL_24]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
+! CHECK:               fir.freemem %[[VAL_25]] : !fir.heap<!fir.char<1,?>>
+! CHECK:               %[[VAL_26:.*]] = fir.zero_bits !fir.heap<!fir.char<1,?>>
+! CHECK:               %[[VAL_27:.*]] = arith.constant 0 : index
+! CHECK:               %[[VAL_28:.*]] = fir.embox %[[VAL_26]] typeparams %[[VAL_27]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.box<!fir.heap<!fir.char<1,?>>>
+! CHECK:               fir.store %[[VAL_28]] to %[[VAL_18]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine sub01(aaa)
+  character(*),allocatable :: aaa
+  !$omp parallel private(aaa)
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsub02(
+! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>> {fir.bindc_name = "bbb"}) {
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub02Ebbb"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<!fir.char<1,?>>> {bindc_name = "bbb", pinned, uniq_name = "_QFsub02Ebbb"}
+! CHECK:             %[[VAL_3:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:             %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
+! CHECK:             %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.heap<!fir.char<1,?>>) -> i64
+! CHECK:             %[[VAL_6:.*]] = arith.constant 0 : i64
+! CHECK:             %[[VAL_7:.*]] = arith.cmpi ne, %[[VAL_5]], %[[VAL_6]] : i64
+! CHECK:             fir.if %[[VAL_7]] {
+! CHECK:               %[[VAL_8:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:               %[[VAL_9:.*]] = fir.box_elesize %[[VAL_8]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> index
+! CHECK:               %[[VAL_10:.*]] = arith.constant 0 : index
+! CHECK:               %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_9]], %[[VAL_10]] : index
+! CHECK:               %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_9]], %[[VAL_10]] : index
+! CHECK:               %[[VAL_13:.*]] = fir.allocmem !fir.char<1,?>(%[[VAL_12]] : index) {fir.must_be_heap = true, uniq_name = "_QFsub02Ebbb.alloc"}
+! CHECK:               %[[VAL_14:.*]] = fir.embox %[[VAL_13]] typeparams %[[VAL_12]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.box<!fir.heap<!fir.char<1,?>>>
+! CHECK:               fir.store %[[VAL_14]] to %[[VAL_2]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:             } else {
+! CHECK:               %[[VAL_15:.*]] = fir.zero_bits !fir.heap<!fir.char<1,?>>
+! CHECK:               %[[VAL_16:.*]] = arith.constant 0 : index
+! CHECK:               %[[VAL_17:.*]] = fir.embox %[[VAL_15]] typeparams %[[VAL_16]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.box<!fir.heap<!fir.char<1,?>>>
+! CHECK:               fir.store %[[VAL_17]] to %[[VAL_2]] : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:             }
+! CHECK:             %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_2]] {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QFsub02Ebbb"} : (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>)
+! CHECK:             %[[VAL_19:.*]] = fir.load %[[VAL_18]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:             %[[VAL_20:.*]] = fir.box_addr %[[VAL_19]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
+! CHECK:             %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.heap<!fir.char<1,?>>) -> i64
+! CHECK:             %[[VAL_22:.*]] = arith.constant 0 : i64
+! CHECK:             %[[VAL_23:.*]] = arith.cmpi ne, %[[VAL_21]], %[[VAL_22]] : i64
+! CHECK:             fir.if %[[VAL_23]] {
+! CHECK:               %[[VAL_24:.*]] = fir.load %[[VAL_18]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:               %[[VAL_25:.*]] = fir.box_addr %[[VAL_24]] : (!fir.box<!fir.heap<!fir.char<1,?>>>) -> !fir.heap<!fir.char<1,?>>
+! CHECK:               fir.freemem %[[VAL_25]] : !fir.heap<!fir.char<1,?>>
+! CHECK:               %[[VAL_26:.*]] = fir.zero_bits !fir.heap<!fir.char<1,?>>
+! CHECK:               %[[VAL_27:.*]] = arith.constant 0 : index
+! CHECK:               %[[VAL_28:.*]] = fir.embox %[[VAL_26]] typeparams %[[VAL_27]] : (!fir.heap<!fir.char<1,?>>, index) -> !fir.box<!fir.heap<!fir.char<1,?>>>
+! CHECK:               fir.store %[[VAL_28]] to %[[VAL_18]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.char<1,?>>>>
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine sub02(bbb)
+  character(:),allocatable :: bbb
+  !$omp parallel private(bbb)
+  !$omp end parallel
+end subroutine sub02
+
diff --git a/flang/test/Lower/OpenMP/simd.f90 b/flang/test/Lower/OpenMP/simd.f90
index 190aa6152121..8ec1a3cefb4a 100644
--- a/flang/test/Lower/OpenMP/simd.f90
+++ b/flang/test/Lower/OpenMP/simd.f90
@@ -1,7 +1,8 @@
 ! Tests for 2.9.3.1 Simd
 
-!RUN: %flang_fc1 -flang-experimental-hlfir -emit-hlfir -fopenmp %s -o - | FileCheck %s
-!RUN: bbc -hlfir -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! The "if" clause was added to the "simd" directive in OpenMP 5.0.
+! RUN: %flang_fc1 -flang-experimental-hlfir -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
+! RUN: bbc -hlfir -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
 
 !CHECK-LABEL: func @_QPsimd()
 subroutine simd
diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90
index 0f0c736d3162..44f77b5c3360 100644
--- a/flang/test/Lower/OpenMP/target.f90
+++ b/flang/test/Lower/OpenMP/target.f90
@@ -1,4 +1,5 @@
-!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! The "thread_limit" clause was added to the "target" construct in OpenMP 5.1.
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s
 
 !===============================================================================
 ! Target_Enter Simple
diff --git a/flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90 b/flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90
index 55f806962a60..0a249ff101a0 100644
--- a/flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90
+++ b/flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90
@@ -21,6 +21,7 @@ module test
 !CHECK-DAG: fir.global @_QMtestEz : !fir.logical<4> {
 
 contains
+!CHECK-LABEL: func.func @_QMtestPsub
   subroutine sub()
 !CHECK-DAG:  %[[T:.*]] = fir.address_of(@_QMtestEt) : !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>
 !CHECK-DAG:  %[[T_DECL:.*]]:2 = hlfir.declare %[[T]] {uniq_name = "_QMtestEt"} : (!fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>) -> (!fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>, !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>)
diff --git a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90
index d849dd206b94..90eede4f8410 100644
--- a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90
+++ b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90
@@ -1,5 +1,6 @@
-!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
-!RUN: bbc -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! The "use_device_addr" was added to the "target data" directive in OpenMP 5.0.
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
 
 ! This tests primary goal is to check the promotion of 
 ! non-CPTR arguments from use_device_ptr to 
diff --git a/flang/test/Lower/convert.f90 b/flang/test/Lower/convert.f90
index b7c8b8dc20cc..75d0f844149c 100755
--- a/flang/test/Lower/convert.f90
+++ b/flang/test/Lower/convert.f90
@@ -11,6 +11,9 @@ end
 ! Try to test that -fconvert=<value> flag results in a environment default list
 ! with the FORT_CONVERT option correctly specified.
 
+! ALL:  %0 = fir.address_of(@_QQEnvironmentDefaults.list) : !fir.ref<tuple<i32, !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
+! ALL: fir.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %0)
+
 ! ALL: fir.global linkonce @_QQEnvironmentDefaults.items constant : !fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>> {
 ! ALL: %[[VAL_0:.*]] = fir.undefined !fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>
 ! ALL: %[[VAL_1:.*]] = fir.address_of(@[[FC_STR:.*]]) : !fir.ref<!fir.char<1,13>>
@@ -41,6 +44,3 @@ end
 ! ALL: %[[VAL_4:.*]] = fir.insert_value %[[VAL_2]], %[[VAL_3]], [1 : index] : (tuple<i[[int_size]], !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>, !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>) -> tuple<i[[int_size]], !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>
 ! ALL: fir.has_value %[[VAL_4]] : tuple<i[[int_size]], !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>
 
-! ALL: fir.global @_QQEnvironmentDefaults constant : !fir.ref<tuple<i[[int_size]], !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>> {
-! ALL: %[[VAL_0:.*]] = fir.address_of(@_QQEnvironmentDefaults.list) : !fir.ref<tuple<i[[int_size]], !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
-! ALL: fir.has_value %[[VAL_0]] : !fir.ref<tuple<i[[int_size]], !fir.ref<!fir.array<1xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
diff --git a/flang/test/Lower/environment-defaults.f90 b/flang/test/Lower/environment-defaults.f90
index 700f7581bd6e..f5f41dabecc1 100755
--- a/flang/test/Lower/environment-defaults.f90
+++ b/flang/test/Lower/environment-defaults.f90
@@ -5,8 +5,9 @@ program test
   continue
 end
 
-! Test that a null pointer is generated for environment defaults if nothing is specified
+! Test that a null pointer is passed for environment defaults if nothing is specified
 
-! CHECK: fir.global @_QQEnvironmentDefaults constant : !fir.ref<tuple<i[[int_size:.*]], !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>> {
-! CHECK:  %[[VAL_0:.*]] = fir.zero_bits !fir.ref<tuple<i[[int_size]], !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
-! CHECK: fir.has_value  %[[VAL_0]] : !fir.ref<tuple<i[[int_size]], !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
+! CHECK-NOT: @_QQEnvironmentDefaults
+
+! CHECK: %0 = fir.zero_bits !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
+! CHECK-NEXT: @_FortranAProgramStart(%arg0, %arg1, %arg2, %0)
diff --git a/flang/test/Parser/unrecognized-dir.f90 b/flang/test/Parser/unrecognized-dir.f90
index ba6fff7562e2..91fbfc9ee3c3 100644
--- a/flang/test/Parser/unrecognized-dir.f90
+++ b/flang/test/Parser/unrecognized-dir.f90
@@ -1,4 +1,10 @@
 ! RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s
-!CHECK: warning: Compiler directive was ignored
+!CHECK: warning: Unrecognized compiler directive was ignored
 !DIR$ Not a recognized directive
+program main
+ contains
+  !CHECK: warning: Compiler directive ignored here
+  !DIR$ not in a subprogram
+  subroutine s
+  end
 end
diff --git a/flang/test/Preprocessing/backslash-contin1.F90 b/flang/test/Preprocessing/backslash-contin1.F90
new file mode 100644
index 000000000000..cf2ed36370da
--- /dev/null
+++ b/flang/test/Preprocessing/backslash-contin1.F90
@@ -0,0 +1,8 @@
+! RUN: %flang -E %s | FileCheck %s
+print *, \
+  "hello, \
+world"
+end
+!CHECK:      print *, "hello, world"
+!CHECK:      end
+
diff --git a/flang/test/Preprocessing/include-comment.F90 b/flang/test/Preprocessing/include-comment.F90
index c55d07ec66d3..7da4751f725a 100644
--- a/flang/test/Preprocessing/include-comment.F90
+++ b/flang/test/Preprocessing/include-comment.F90
@@ -1,4 +1,4 @@
-! RUN: %flang -I%S -E %s 2>&1 | FileCheck %s
+! RUN: %flang -pedantic -I%S -E %s 2>&1 | FileCheck %s
 ! CHECK-NOT: :3:
 #include <empty.h> ! comment
 ! CHECK-NOT: :5:
diff --git a/flang/test/Semantics/arg-convert.f90 b/flang/test/Semantics/arg-convert.f90
new file mode 100644
index 000000000000..7951bedf49d0
--- /dev/null
+++ b/flang/test/Semantics/arg-convert.f90
@@ -0,0 +1,16 @@
+!RUN: %flang_fc1 -fdebug-unparse  %s  2>&1 | FileCheck %s
+!Ensure that argument conversion does not take place when the procedure
+!interface is implicit at the point of call, even when the interface
+!is known due because the procedure's definition is in the same source file.
+
+subroutine test
+!CHECK: warning: If the procedure's interface were explicit, this reference would be in error
+!CHECK: because: Actual argument type 'INTEGER(8)' is not compatible with dummy argument type 'INTEGER(4)'
+!CHECK: CALL samesourcefile((1_8))
+  call sameSourceFile((1_8))
+!CHECK: CALL somewhereelse((2_8))
+  call somewhereElse((2_8))
+end
+
+subroutine sameSourceFile(n)
+end
diff --git a/flang/test/Semantics/call14.f90 b/flang/test/Semantics/call14.f90
index 042243b56059..e586d4eebd25 100644
--- a/flang/test/Semantics/call14.f90
+++ b/flang/test/Semantics/call14.f90
@@ -9,7 +9,7 @@ module m
   !ERROR: VALUE attribute may apply only to a dummy data object
   subroutine C863(notData,assumedSize,coarray,coarrayComponent,assumedRank,assumedLen)
     external :: notData
-    !ERROR: VALUE attribute may apply only to a dummy argument
+    !ERROR: Only a dummy argument may have an INTENT, VALUE, or OPTIONAL attribute
     real, value :: notADummy
     value :: notData
     !ERROR: VALUE attribute may not apply to an assumed-size array
diff --git a/flang/test/Semantics/cuf03.cuf b/flang/test/Semantics/cuf03.cuf
index 472d53db7462..a98dd60cdb8a 100644
--- a/flang/test/Semantics/cuf03.cuf
+++ b/flang/test/Semantics/cuf03.cuf
@@ -89,3 +89,7 @@ module m
 
 
 end module
+
+program p
+  real, unified :: um ! ok
+end program
diff --git a/flang/test/Semantics/cuf13.cuf b/flang/test/Semantics/cuf13.cuf
new file mode 100644
index 000000000000..6db829002fae
--- /dev/null
+++ b/flang/test/Semantics/cuf13.cuf
@@ -0,0 +1,37 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+
+module matching
+  interface sub
+    module procedure sub_host
+    module procedure sub_device
+  end interface
+
+  interface subman
+    module procedure sub_host
+  end interface
+
+contains
+  subroutine sub_host(a)
+    integer :: a(:)
+  end
+
+  subroutine sub_device(a)
+    integer, device :: a(:)
+  end
+
+end module
+
+program m
+  use matching
+
+  integer, pinned, allocatable :: a(:)
+  integer, managed, allocatable :: b(:)
+  logical :: plog
+  allocate(a(100), pinned = plog)
+  allocate(b(200))
+
+  call sub(a)
+
+  call subman(b)
+
+end
diff --git a/flang/test/Semantics/declarations03.f90 b/flang/test/Semantics/declarations03.f90
index 3459b2287b2b..65b07e7d5c65 100644
--- a/flang/test/Semantics/declarations03.f90
+++ b/flang/test/Semantics/declarations03.f90
@@ -19,7 +19,7 @@ module m
   common /blk4/ w
   bind(c, name="cc") :: t2, /blk4/
 
-  !ERROR: The entity 'blk5' has multiple BIND names
+  !ERROR: The entity 'blk5' has multiple BIND names ('dd' and 'ee')
   common /blk5/ i
   bind(c, name="dd") :: /blk5/
   bind(c, name="ee") :: /blk5/
@@ -29,7 +29,7 @@ module m
   bind(c, name="ff") :: /blk6/
   bind(c, name="ff") :: /blk7/
 
-  !ERROR: The entity 's1' has multiple BIND names
+  !ERROR: The entity 's1' has multiple BIND names ('gg' and 'hh')
   integer :: s1
   bind(c, name="gg") :: s1
   !ERROR: BIND_C attribute was already specified on 's1'
@@ -40,12 +40,12 @@ module m
   bind(c, name="ii") :: s2
   bind(c, name="ii") :: s3
 
-  !ERROR: The entity 's4' has multiple BIND names
+  !ERROR: The entity 's4' has multiple BIND names ('ss1' and 'jj')
   integer, bind(c, name="ss1") :: s4
   !ERROR: BIND_C attribute was already specified on 's4'
   bind(c, name="jj") :: s4
 
-  !ERROR: The entity 's5' has multiple BIND names
+  !ERROR: The entity 's5' has multiple BIND names ('kk' and 'ss2')
   bind(c, name="kk") :: s5
   !ERROR: BIND_C attribute was already specified on 's5'
   integer, bind(c, name="ss2") :: s5
@@ -72,3 +72,8 @@ module b
   !ERROR: Two entities have the same global name 'int'
   integer, bind(c, name="int") :: i
 end module
+
+module c
+  bind(c, name = "AAA") a
+  integer aaa ! ensure no bogus error about multiple binding names
+end module
diff --git a/flang/test/Semantics/kinds04_q10.f90 b/flang/test/Semantics/kinds04_q10.f90
index 3da619d24dee..d352daa1cbbf 100644
--- a/flang/test/Semantics/kinds04_q10.f90
+++ b/flang/test/Semantics/kinds04_q10.f90
@@ -14,7 +14,9 @@ subroutine s(var)
   real :: realvar1 = 4.0E6_4
   real :: realvar2 = 4.0D6
   real :: realvar3 = 4.0Q6
+  !PORTABILITY: Explicit kind parameter together with non-'E' exponent letter is not standard
   real :: realvar4 = 4.0D6_8
+  !PORTABILITY: Explicit kind parameter together with non-'E' exponent letter is not standard
   real :: realvar5 = 4.0Q6_10
   !WARNING: Explicit kind parameter on real constant disagrees with exponent letter 'q'
   real :: realvar6 = 4.0Q6_16
@@ -27,6 +29,7 @@ subroutine s(var)
   double precision :: doublevar1 = 4.0E6_4
   double precision :: doublevar2 = 4.0D6
   double precision :: doublevar3 = 4.0Q6
+  !PORTABILITY: Explicit kind parameter together with non-'E' exponent letter is not standard
   double precision :: doublevar4 = 4.0D6_8
   !WARNING: Explicit kind parameter on real constant disagrees with exponent letter 'q'
   double precision :: doublevar5 = 4.0Q6_16
diff --git a/flang/test/Semantics/kinds04_q16.f90 b/flang/test/Semantics/kinds04_q16.f90
index 527cbe9aff12..885ca8c8392b 100644
--- a/flang/test/Semantics/kinds04_q16.f90
+++ b/flang/test/Semantics/kinds04_q16.f90
@@ -12,9 +12,11 @@ subroutine s(var)
   real :: realvar1 = 4.0E6_4
   real :: realvar2 = 4.0D6
   real :: realvar3 = 4.0Q6
+  !PORTABILITY: Explicit kind parameter together with non-'E' exponent letter is not standard
   real :: realvar4 = 4.0D6_8
   !WARNING: Explicit kind parameter on real constant disagrees with exponent letter 'q'
   real :: realvar5 = 4.0Q6_10
+  !PORTABILITY: Explicit kind parameter together with non-'E' exponent letter is not standard
   real :: realvar6 = 4.0Q6_16
   real :: realvar7 = 4.0E6_8
   real :: realvar8 = 4.0E6_10
@@ -25,7 +27,9 @@ subroutine s(var)
   double precision :: doublevar1 = 4.0E6_4
   double precision :: doublevar2 = 4.0D6
   double precision :: doublevar3 = 4.0Q6
+  !PORTABILITY: Explicit kind parameter together with non-'E' exponent letter is not standard
   double precision :: doublevar4 = 4.0D6_8
+  !PORTABILITY: Explicit kind parameter together with non-'E' exponent letter is not standard
   double precision :: doublevar5 = 4.0Q6_16
   double precision :: doublevar6 = 4.0E6_8
   double precision :: doublevar7 = 4.0E6_10
diff --git a/flang/test/Semantics/resolve58.f90 b/flang/test/Semantics/resolve58.f90
index 447e14ae80a9..2e42eb157f5b 100644
--- a/flang/test/Semantics/resolve58.f90
+++ b/flang/test/Semantics/resolve58.f90
@@ -69,12 +69,12 @@ subroutine s6()
 
   !ERROR: Implied-shape array 'local1' must be a named constant or a dummy argument
   real, dimension (*) :: local1
-  !ERROR: INTENT attributes may apply only to a dummy argument
+  !ERROR: Only a dummy argument may have an INTENT, VALUE, or OPTIONAL attribute
   real, intent(in) :: local2
-  !ERROR: INTENT attributes may apply only to a dummy argument
+  !ERROR: Only a dummy argument may have an INTENT, VALUE, or OPTIONAL attribute
   procedure(), intent(in) :: p1
-  !ERROR: OPTIONAL attribute may apply only to a dummy argument
+  !ERROR: Only a dummy argument may have an INTENT, VALUE, or OPTIONAL attribute
   real, optional :: local3
-  !ERROR: OPTIONAL attribute may apply only to a dummy argument
+  !ERROR: Only a dummy argument may have an INTENT, VALUE, or OPTIONAL attribute
   procedure(), optional :: p2
 end subroutine
diff --git a/flang/test/Transforms/debug-fn-info.f90 b/flang/test/Transforms/debug-fn-info.f90
new file mode 100644
index 000000000000..97a34e8676de
--- /dev/null
+++ b/flang/test/Transforms/debug-fn-info.f90
@@ -0,0 +1,45 @@
+! RUN: %flang_fc1 -emit-fir -debug-info-kind=standalone -mmlir --mlir-print-debuginfo %s -o - | fir-opt --add-debug-info --mlir-print-debuginfo  | FileCheck %s
+
+
+! CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 64, encoding = DW_ATE_signed>
+! CHECK-DAG: #[[INT4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed>
+! CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 64, encoding = DW_ATE_float>
+! CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "logical", sizeInBits = 8, encoding = DW_ATE_boolean>
+! CHECK-DAG: #[[REAL4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float>
+! CHECK-DAG: #[[LOG4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "logical", sizeInBits = 32, encoding = DW_ATE_boolean>
+! CHECK: #[[TY0:.*]] = #llvm.di_subroutine_type<callingConvention = DW_CC_program>
+! CHECK: #[[TY1:.*]] = #llvm.di_subroutine_type<callingConvention = DW_CC_normal, types = #[[INT8]], #[[INT4]], #[[REAL8]], #[[LOG1]]>
+! CHECK: #[[TY2:.*]] = #llvm.di_subroutine_type<callingConvention = DW_CC_normal, types = #[[INT4]], #[[INT8]], #[[REAL4]], #[[LOG4]]>
+
+! CHECK: #di_subprogram1 = #llvm.di_subprogram<id = {{.*}}, compileUnit = {{.*}}, scope = {{.*}}, name = "_QQmain", linkageName = "_QQmain", file = {{.*}}, line = [[@LINE+1]], scopeLine = [[@LINE+1]], subprogramFlags = Definition, type = #[[TY0]]>
+program mn
+  integer(kind=4) :: i4
+  integer(kind=8) :: i8
+  real(kind=4) :: r4
+  real(kind=8) :: r8
+  logical(kind=1) :: l1
+  logical(kind=4) :: l4
+  i8 = fn1(i4, r8, l1)
+  i4 = fn2(i8, r4, l4)
+contains
+  ! CHECK: #di_subprogram2 = #llvm.di_subprogram<id = {{.*}}, compileUnit = {{.*}}, scope = {{.*}}, name = "fn1", linkageName = "_QFPfn1", file = {{.*}}, line = [[@LINE+1]], scopeLine = [[@LINE+1]], subprogramFlags = Definition, type = #[[TY1]]>
+  function fn1(a, b, c) result (res)
+    implicit none
+    integer(kind=4), intent(in) :: a
+    real(kind=8), intent(in) :: b
+    logical(kind=1), intent(in) :: c
+    integer(kind=8) :: res
+    res = a + b
+  end function
+
+! CHECK: #di_subprogram3 = #llvm.di_subprogram<id = {{.*}}, compileUnit = {{.*}}, scope = {{.*}}, name = "fn2", linkageName = "_QFPfn2", file = {{.*}}, line = [[@LINE+1]], scopeLine = [[@LINE+1]], subprogramFlags = Definition, type = #[[TY2]]>
+  function fn2(a, b, c) result (res)
+    implicit none
+    integer(kind=8), intent(in) :: a
+    real(kind=4), intent(in) :: b
+    logical(kind=4), intent(in) :: c
+    integer(kind=4) :: res
+    res = a + b
+  end function
+end program
+
diff --git a/flang/test/Transforms/debug-line-table-inc-file.fir b/flang/test/Transforms/debug-line-table-inc-file.fir
index dc75482d4f8a..d7f60a1a86db 100644
--- a/flang/test/Transforms/debug-line-table-inc-file.fir
+++ b/flang/test/Transforms/debug-line-table-inc-file.fir
@@ -31,7 +31,7 @@ module attributes {} {
 // CHECK: #[[LOC_INC_FILE:.*]] = loc("{{.*}}inc.f90":1:1)
 // CHECK: #[[LOC_FILE:.*]] = loc("{{.*}}simple.f90":3:1)
 // CHECK: #[[DI_CU:.*]] = #llvm.di_compile_unit<id = distinct[{{.*}}]<>, sourceLanguage = DW_LANG_Fortran95, file = #[[DI_FILE]], producer = "flang{{.*}}", isOptimized = false, emissionKind = LineTablesOnly>
-// CHECK: #[[DI_SP_INC:.*]] = #llvm.di_subprogram<id = distinct[{{.*}}]<>, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "_QPsinc", linkageName = "_QPsinc", file = #[[DI_INC_FILE]], {{.*}}>
+// CHECK: #[[DI_SP_INC:.*]] = #llvm.di_subprogram<id = distinct[{{.*}}]<>, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "sinc", linkageName = "_QPsinc", file = #[[DI_INC_FILE]], {{.*}}>
 // CHECK: #[[DI_SP:.*]] = #llvm.di_subprogram<id = distinct[{{.*}}]<>, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "_QQmain", linkageName = "_QQmain", file = #[[DI_FILE]], {{.*}}>
 // CHECK: #[[FUSED_LOC_INC_FILE]] = loc(fused<#[[DI_SP_INC]]>[#[[LOC_INC_FILE]]])
 // CHECK: #[[FUSED_LOC_FILE]] = loc(fused<#[[DI_SP]]>[#[[LOC_FILE]]])
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index a0870d3649c2..f9349d50055a 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -289,17 +289,20 @@ static mlir::LogicalResult convertFortranSourceToMLIR(
 
   // parse the input Fortran
   parsing.Parse(llvm::outs());
-  parsing.messages().Emit(llvm::errs(), parsing.allCooked());
   if (!parsing.consumedWholeFile()) {
+    parsing.messages().Emit(llvm::errs(), parsing.allCooked());
     parsing.EmitMessage(llvm::errs(), parsing.finalRestingPlace(),
                         "parser FAIL (final position)",
                         "error: ", llvm::raw_ostream::RED);
     return mlir::failure();
-  }
-  if ((!parsing.messages().empty() && (parsing.messages().AnyFatalError())) ||
-      !parsing.parseTree().has_value()) {
+  } else if ((!parsing.messages().empty() &&
+              (parsing.messages().AnyFatalError())) ||
+             !parsing.parseTree().has_value()) {
+    parsing.messages().Emit(llvm::errs(), parsing.allCooked());
     llvm::errs() << programPrefix << "could not parse " << path << '\n';
     return mlir::failure();
+  } else {
+    semanticsContext.messages().Annex(std::move(parsing.messages()));
   }
 
   // run semantics
diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt
index 3ce8b407450d..ce30ecff028d 100644
--- a/flang/tools/flang-driver/CMakeLists.txt
+++ b/flang/tools/flang-driver/CMakeLists.txt
@@ -21,7 +21,6 @@ add_flang_tool(flang-new
   # unable to generate executables.
   FortranRuntime
   FortranDecimal
-  Fortran_main
 )
 
 target_link_libraries(flang-new
diff --git a/flang/unittests/Optimizer/Builder/ComplexTest.cpp b/flang/unittests/Optimizer/Builder/ComplexTest.cpp
index 5364eec904ff..17171512470a 100644
--- a/flang/unittests/Optimizer/Builder/ComplexTest.cpp
+++ b/flang/unittests/Optimizer/Builder/ComplexTest.cpp
@@ -96,6 +96,6 @@ TEST_F(ComplexTest, verifyConvertWithSemantics) {
 
   // Convert complex to integer
   mlir::Value v2 = firBuilder->convertWithSemantics(loc, integerTy1, v1);
-  EXPECT_TRUE(v2.getType().isa<mlir::IntegerType>());
+  EXPECT_TRUE(mlir::isa<mlir::IntegerType>(v2.getType()));
   EXPECT_TRUE(mlir::dyn_cast<fir::ConvertOp>(v2.getDefiningOp()));
 }
diff --git a/flang/unittests/Optimizer/Builder/DoLoopHelperTest.cpp b/flang/unittests/Optimizer/Builder/DoLoopHelperTest.cpp
index 7e7206dbf934..d0a9342914a3 100644
--- a/flang/unittests/Optimizer/Builder/DoLoopHelperTest.cpp
+++ b/flang/unittests/Optimizer/Builder/DoLoopHelperTest.cpp
@@ -34,7 +34,7 @@ public:
 void checkConstantValue(const mlir::Value &value, int64_t v) {
   EXPECT_TRUE(mlir::isa<mlir::arith::ConstantOp>(value.getDefiningOp()));
   auto cstOp = dyn_cast<mlir::arith::ConstantOp>(value.getDefiningOp());
-  auto valueAttr = cstOp.getValue().dyn_cast_or_null<IntegerAttr>();
+  auto valueAttr = dyn_cast_or_null<IntegerAttr>(cstOp.getValue());
   EXPECT_EQ(v, valueAttr.getInt());
 }
 
diff --git a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp
index b6a1f9c9db8f..e5e5454ee88a 100644
--- a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp
+++ b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp
@@ -54,7 +54,7 @@ static void checkIntegerConstant(mlir::Value value, mlir::Type ty, int64_t v) {
   EXPECT_TRUE(mlir::isa<mlir::arith::ConstantOp>(value.getDefiningOp()));
   auto cstOp = dyn_cast<mlir::arith::ConstantOp>(value.getDefiningOp());
   EXPECT_EQ(ty, cstOp.getType());
-  auto valueAttr = cstOp.getValue().dyn_cast_or_null<IntegerAttr>();
+  auto valueAttr = mlir::dyn_cast_or_null<IntegerAttr>(cstOp.getValue());
   EXPECT_EQ(v, valueAttr.getInt());
 }
 
@@ -151,7 +151,7 @@ TEST_F(FIRBuilderTest, createRealZeroConstant) {
   auto cstOp = dyn_cast<arith::ConstantOp>(cst.getDefiningOp());
   EXPECT_EQ(realTy, cstOp.getType());
   EXPECT_EQ(
-      0u, cstOp.getValue().cast<FloatAttr>().getValue().convertToDouble());
+      0u, mlir::cast<FloatAttr>(cstOp.getValue()).getValue().convertToDouble());
 }
 
 TEST_F(FIRBuilderTest, createBool) {
@@ -164,8 +164,8 @@ TEST_F(FIRBuilderTest, createBool) {
 TEST_F(FIRBuilderTest, getVarLenSeqTy) {
   auto builder = getBuilder();
   auto ty = builder.getVarLenSeqTy(builder.getI64Type());
-  EXPECT_TRUE(ty.isa<fir::SequenceType>());
-  fir::SequenceType seqTy = ty.dyn_cast<fir::SequenceType>();
+  EXPECT_TRUE(mlir::isa<fir::SequenceType>(ty));
+  fir::SequenceType seqTy = mlir::dyn_cast<fir::SequenceType>(ty);
   EXPECT_EQ(1u, seqTy.getDimension());
   EXPECT_TRUE(fir::unwrapSequenceType(ty).isInteger(64));
 }
@@ -216,9 +216,9 @@ TEST_F(FIRBuilderTest, createGlobal2) {
   EXPECT_FALSE(global.getConstant().has_value());
   EXPECT_EQ(i32Type, global.getType());
   EXPECT_TRUE(global.getInitVal().has_value());
-  EXPECT_TRUE(global.getInitVal().value().isa<mlir::IntegerAttr>());
-  EXPECT_EQ(
-      16, global.getInitVal().value().cast<mlir::IntegerAttr>().getValue());
+  EXPECT_TRUE(mlir::isa<mlir::IntegerAttr>(global.getInitVal().value()));
+  EXPECT_EQ(16,
+      mlir::cast<mlir::IntegerAttr>(global.getInitVal().value()).getValue());
   EXPECT_TRUE(global.getLinkName().has_value());
   EXPECT_EQ(
       builder.createLinkOnceLinkage().getValue(), global.getLinkName().value());
@@ -271,12 +271,12 @@ TEST_F(FIRBuilderTest, locationToFilename) {
   auto stringLitOps = global.getRegion().front().getOps<fir::StringLitOp>();
   EXPECT_TRUE(llvm::hasSingleElement(stringLitOps));
   for (auto stringLit : stringLitOps) {
-    EXPECT_EQ(10, stringLit.getSize().cast<mlir::IntegerAttr>().getValue());
-    EXPECT_TRUE(stringLit.getValue().isa<StringAttr>());
+    EXPECT_EQ(
+        10, mlir::cast<mlir::IntegerAttr>(stringLit.getSize()).getValue());
+    EXPECT_TRUE(mlir::isa<StringAttr>(stringLit.getValue()));
     EXPECT_EQ(0,
         strcmp("file1.f90\0",
-            stringLit.getValue()
-                .dyn_cast<StringAttr>()
+            mlir::dyn_cast<StringAttr>(stringLit.getValue())
                 .getValue()
                 .str()
                 .c_str()));
@@ -288,9 +288,9 @@ TEST_F(FIRBuilderTest, createStringLitOp) {
   llvm::StringRef data("mystringlitdata");
   auto loc = builder.getUnknownLoc();
   auto op = builder.createStringLitOp(loc, data);
-  EXPECT_EQ(15, op.getSize().cast<mlir::IntegerAttr>().getValue());
-  EXPECT_TRUE(op.getValue().isa<StringAttr>());
-  EXPECT_EQ(data, op.getValue().dyn_cast<StringAttr>().getValue());
+  EXPECT_EQ(15, mlir::cast<mlir::IntegerAttr>(op.getSize()).getValue());
+  EXPECT_TRUE(mlir::isa<StringAttr>(op.getValue()));
+  EXPECT_EQ(data, mlir::dyn_cast<StringAttr>(op.getValue()).getValue());
 }
 
 TEST_F(FIRBuilderTest, createStringLiteral) {
@@ -318,9 +318,11 @@ TEST_F(FIRBuilderTest, createStringLiteral) {
   auto stringLitOps = global.getRegion().front().getOps<fir::StringLitOp>();
   EXPECT_TRUE(llvm::hasSingleElement(stringLitOps));
   for (auto stringLit : stringLitOps) {
-    EXPECT_EQ(16, stringLit.getSize().cast<mlir::IntegerAttr>().getValue());
-    EXPECT_TRUE(stringLit.getValue().isa<StringAttr>());
-    EXPECT_EQ(strValue, stringLit.getValue().dyn_cast<StringAttr>().getValue());
+    EXPECT_EQ(
+        16, mlir::cast<mlir::IntegerAttr>(stringLit.getSize()).getValue());
+    EXPECT_TRUE(mlir::isa<StringAttr>(stringLit.getValue()));
+    EXPECT_EQ(
+        strValue, mlir::dyn_cast<StringAttr>(stringLit.getValue()).getValue());
   }
 }
 
@@ -344,7 +346,7 @@ TEST_F(FIRBuilderTest, allocateLocal) {
 static void checkShapeOp(mlir::Value shape, mlir::Value c10, mlir::Value c100) {
   EXPECT_TRUE(mlir::isa<fir::ShapeOp>(shape.getDefiningOp()));
   fir::ShapeOp op = dyn_cast<fir::ShapeOp>(shape.getDefiningOp());
-  auto shapeTy = op.getType().dyn_cast<fir::ShapeType>();
+  auto shapeTy = mlir::dyn_cast<fir::ShapeType>(op.getType());
   EXPECT_EQ(2u, shapeTy.getRank());
   EXPECT_EQ(2u, op.getExtents().size());
   EXPECT_EQ(c10, op.getExtents()[0]);
@@ -372,7 +374,7 @@ TEST_F(FIRBuilderTest, genShapeWithExtentsAndShapeShift) {
   auto shape = builder.genShape(loc, shifts, extents);
   EXPECT_TRUE(mlir::isa<fir::ShapeShiftOp>(shape.getDefiningOp()));
   fir::ShapeShiftOp op = dyn_cast<fir::ShapeShiftOp>(shape.getDefiningOp());
-  auto shapeTy = op.getType().dyn_cast<fir::ShapeShiftType>();
+  auto shapeTy = mlir::dyn_cast<fir::ShapeShiftType>(op.getType());
   EXPECT_EQ(2u, shapeTy.getRank());
   EXPECT_EQ(2u, op.getExtents().size());
   EXPECT_EQ(2u, op.getOrigins().size());
@@ -428,7 +430,7 @@ TEST_F(FIRBuilderTest, createZeroValue) {
   auto cst =
       mlir::dyn_cast_or_null<mlir::arith::ConstantOp>(zeroInt.getDefiningOp());
   EXPECT_TRUE(cst);
-  auto intAttr = cst.getValue().dyn_cast<mlir::IntegerAttr>();
+  auto intAttr = mlir::dyn_cast<mlir::IntegerAttr>(cst.getValue());
   EXPECT_TRUE(intAttr && intAttr.getInt() == 0);
 
   mlir::Type f32Ty = mlir::FloatType::getF32(builder.getContext());
@@ -437,7 +439,7 @@ TEST_F(FIRBuilderTest, createZeroValue) {
   auto cst2 = mlir::dyn_cast_or_null<mlir::arith::ConstantOp>(
       zeroFloat.getDefiningOp());
   EXPECT_TRUE(cst2);
-  auto floatAttr = cst2.getValue().dyn_cast<mlir::FloatAttr>();
+  auto floatAttr = mlir::dyn_cast<mlir::FloatAttr>(cst2.getValue());
   EXPECT_TRUE(floatAttr && floatAttr.getValueAsDouble() == 0.);
 
   mlir::Type boolTy = mlir::IntegerType::get(builder.getContext(), 1);
@@ -446,7 +448,7 @@ TEST_F(FIRBuilderTest, createZeroValue) {
   auto cst3 = mlir::dyn_cast_or_null<mlir::arith::ConstantOp>(
       flaseBool.getDefiningOp());
   EXPECT_TRUE(cst3);
-  auto intAttr2 = cst.getValue().dyn_cast<mlir::IntegerAttr>();
+  auto intAttr2 = mlir::dyn_cast<mlir::IntegerAttr>(cst.getValue());
   EXPECT_TRUE(intAttr2 && intAttr2.getInt() == 0);
 }
 
@@ -482,7 +484,7 @@ TEST_F(FIRBuilderTest, getBaseTypeOf) {
     llvm::SmallVector<fir::ExtendedValue, 4> arrays;
     auto extent = builder.create<fir::UndefOp>(loc, builder.getIndexType());
     llvm::SmallVector<mlir::Value> extents(
-        arrayType.dyn_cast<fir::SequenceType>().getDimension(),
+        mlir::dyn_cast<fir::SequenceType>(arrayType).getDimension(),
         extent.getResult());
     arrays.emplace_back(fir::ArrayBoxValue(ptrValArray, extents));
     arrays.emplace_back(fir::BoxValue(boxValArray));
diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp
index 790f735a6cf2..f5f559ef887c 100644
--- a/flang/unittests/Optimizer/FortranVariableTest.cpp
+++ b/flang/unittests/Optimizer/FortranVariableTest.cpp
@@ -48,7 +48,8 @@ TEST_F(FortranVariableTest, SimpleScalar) {
   mlir::Value addr = builder->create<fir::AllocaOp>(loc, eleType);
   auto name = mlir::StringAttr::get(&context, "x");
   auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
-      /*shape=*/mlir::Value{}, /*typeParams=*/std::nullopt, name,
+      /*shape=*/mlir::Value{}, /*typeParams=*/std::nullopt,
+      /*dummy_scope=*/nullptr, name,
       /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
       /*cuda_attr=*/fir::CUDADataAttributeAttr{});
 
@@ -74,7 +75,7 @@ TEST_F(FortranVariableTest, CharacterScalar) {
       loc, eleType, /*pinned=*/false, typeParams);
   auto name = mlir::StringAttr::get(&context, "x");
   auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
-      /*shape=*/mlir::Value{}, typeParams, name,
+      /*shape=*/mlir::Value{}, typeParams, /*dummy_scope=*/nullptr, name,
       /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
       /*cuda_attr=*/fir::CUDADataAttributeAttr{});
 
@@ -105,7 +106,7 @@ TEST_F(FortranVariableTest, SimpleArray) {
   mlir::Value shape = createShape(extents);
   auto name = mlir::StringAttr::get(&context, "x");
   auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
-      shape, /*typeParams*/ std::nullopt, name,
+      shape, /*typeParams*/ std::nullopt, /*dummy_scope=*/nullptr, name,
       /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
       /*cuda_attr=*/fir::CUDADataAttributeAttr{});
 
@@ -136,7 +137,7 @@ TEST_F(FortranVariableTest, CharacterArray) {
   mlir::Value shape = createShape(extents);
   auto name = mlir::StringAttr::get(&context, "x");
   auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
-      shape, typeParams, name,
+      shape, typeParams, /*dummy_scope=*/nullptr, name,
       /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
       /*cuda_attr=*/fir::CUDADataAttributeAttr{});
 
diff --git a/flang/unittests/Optimizer/RTBuilder.cpp b/flang/unittests/Optimizer/RTBuilder.cpp
index 7fff7f71fc3b..d6cf96c4351c 100644
--- a/flang/unittests/Optimizer/RTBuilder.cpp
+++ b/flang/unittests/Optimizer/RTBuilder.cpp
@@ -27,7 +27,7 @@ TEST(RTBuilderTest, ComplexRuntimeInterface) {
   mlir::Type c99_cacosf_signature{
       fir::runtime::RuntimeTableKey<decltype(c99_cacosf)>::getTypeModel()(
           &ctx)};
-  auto c99_cacosf_funcTy = c99_cacosf_signature.cast<mlir::FunctionType>();
+  auto c99_cacosf_funcTy = mlir::cast<mlir::FunctionType>(c99_cacosf_signature);
   EXPECT_EQ(c99_cacosf_funcTy.getNumInputs(), 1u);
   EXPECT_EQ(c99_cacosf_funcTy.getNumResults(), 1u);
   auto cplx_ty = fir::ComplexType::get(&ctx, 4);
diff --git a/flang/unittests/Runtime/NumericalFormatTest.cpp b/flang/unittests/Runtime/NumericalFormatTest.cpp
index dee4dda4a228..2a9f8f8d1dc4 100644
--- a/flang/unittests/Runtime/NumericalFormatTest.cpp
+++ b/flang/unittests/Runtime/NumericalFormatTest.cpp
@@ -958,3 +958,20 @@ TEST(IOApiTests, EditDoubleInputValues) {
                            << "', want " << want << ", got " << u.raw;
   }
 }
+
+// regression test for confusing digit minimization
+TEST(IOApiTests, ConfusingMinimization) {
+  char buffer[8]{};
+  auto cookie{IONAME(BeginInternalListOutput)(buffer, sizeof buffer)};
+  StaticDescriptor<0> staticDescriptor;
+  Descriptor &desc{staticDescriptor.descriptor()};
+  std::uint16_t x{0x7bff}; // HUGE(0._2)
+  desc.Establish(TypeCode{CFI_type_half_float}, sizeof x, &x, 0, nullptr);
+  desc.Check();
+  EXPECT_TRUE(IONAME(OutputDescriptor)(cookie, desc));
+  auto status{IONAME(EndIoStatement)(cookie)};
+  EXPECT_EQ(status, 0);
+  std::string got{std::string{buffer, sizeof buffer}};
+  EXPECT_TRUE(CompareFormattedStrings(" 65504. ", got))
+      << "expected ' 65504. ', got '" << got << '\''; // not 65500.!
+}
diff --git a/flang/unittests/Runtime/RuntimeCrashTest.cpp b/flang/unittests/Runtime/RuntimeCrashTest.cpp
index 0f25cc0ee803..a649051fdca0 100644
--- a/flang/unittests/Runtime/RuntimeCrashTest.cpp
+++ b/flang/unittests/Runtime/RuntimeCrashTest.cpp
@@ -53,16 +53,6 @@ TEST(TestTerminator, CheckFailedTest) {
 //------------------------------------------------------------------------------
 struct TestIOCrash : CrashHandlerFixture {};
 
-TEST(TestIOCrash, FormatDescriptorWriteMismatchTest) {
-  static constexpr int bufferSize{4};
-  static char buffer[bufferSize];
-  static const char *format{"(A4)"};
-  auto *cookie{IONAME(BeginInternalFormattedOutput)(
-      buffer, bufferSize, format, std::strlen(format))};
-  ASSERT_DEATH(IONAME(OutputLogical)(cookie, true),
-      "Data edit descriptor 'A' may not be used with a LOGICAL data item");
-}
-
 TEST(TestIOCrash, InvalidFormatCharacterTest) {
   static constexpr int bufferSize{1};
   static char buffer[bufferSize];
diff --git a/libc/config/baremetal/api.td b/libc/config/baremetal/api.td
index 25aa06aacb64..a6547d843c85 100644
--- a/libc/config/baremetal/api.td
+++ b/libc/config/baremetal/api.td
@@ -85,5 +85,10 @@ def TimeAPI : PublicAPI<"time.h"> {
 }
 
 def UCharAPI : PublicAPI<"uchar.h"> {
-  let Types = ["mbstate_t"];
+  let Types = [
+    "mbstate_t",
+    "char8_t",
+    "char16_t",
+    "char32_t",
+  ];
 }
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 1ac6bd930000..ad50b6f59cdc 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -22,6 +22,7 @@ set(TARGET_LIBC_ENTRYPOINTS
 
     # fcntl.h entrypoints
     libc.src.fcntl.creat
+    libc.src.fcntl.fcntl
     libc.src.fcntl.open
     libc.src.fcntl.openat
 
@@ -527,11 +528,13 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.lroundf128
     libc.src.math.modff128
     libc.src.math.nanf128
+    libc.src.math.nearbyintf128
     libc.src.math.nextafterf128
     libc.src.math.nextdownf128
     libc.src.math.nextupf128
     libc.src.math.rintf128
     libc.src.math.roundf128
+    libc.src.math.scalbnf128
     libc.src.math.sqrtf128
     libc.src.math.truncf128
     libc.src.math.ufromfpf128
diff --git a/libc/config/linux/aarch64/headers.txt b/libc/config/linux/aarch64/headers.txt
index 47db4434b09b..7d25877cefcc 100644
--- a/libc/config/linux/aarch64/headers.txt
+++ b/libc/config/linux/aarch64/headers.txt
@@ -25,6 +25,8 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.threads
     libc.include.time
     libc.include.unistd
+    libc.include.wchar
+    libc.include.uchar
 
     libc.include.sys_ioctl
     # Disabled due to epoll_wait syscalls not being available on this platform.
diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td
index 7843513c4d27..902839b3e5b8 100644
--- a/libc/config/linux/api.td
+++ b/libc/config/linux/api.td
@@ -206,6 +206,15 @@ def WCharAPI : PublicAPI<"wchar.h"> {
   ];
 }
 
+def UCharAPI : PublicAPI<"uchar.h"> {
+  let Types = [
+    "mbstate_t",
+    "char8_t",
+    "char16_t",
+    "char32_t",
+  ];
+}
+
 def SysRandomAPI : PublicAPI<"sys/random.h"> {
   let Types = ["size_t", "ssize_t"];
 }
diff --git a/libc/config/linux/arm/headers.txt b/libc/config/linux/arm/headers.txt
index 307bb6b146a4..1180564fe458 100644
--- a/libc/config/linux/arm/headers.txt
+++ b/libc/config/linux/arm/headers.txt
@@ -12,6 +12,8 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.string
     libc.include.strings
     libc.include.search
+    libc.include.wchar
+    libc.include.uchar
 
     # Disabled due to epoll_wait syscalls not being available on this platform.
     # libc.include.sys_epoll
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 87e82e5eb9a0..479af40b5b26 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -22,6 +22,7 @@ set(TARGET_LIBC_ENTRYPOINTS
 
     # fcntl.h entrypoints
     libc.src.fcntl.creat
+    libc.src.fcntl.fcntl
     libc.src.fcntl.open
     libc.src.fcntl.openat
 
@@ -535,11 +536,13 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.lroundf128
     libc.src.math.modff128
     libc.src.math.nanf128
+    libc.src.math.nearbyintf128
     libc.src.math.nextafterf128
     libc.src.math.nextdownf128
     libc.src.math.nextupf128
     libc.src.math.rintf128
     libc.src.math.roundf128
+    libc.src.math.scalbnf128
     libc.src.math.sqrtf128
     libc.src.math.truncf128
     libc.src.math.ufromfpf128
diff --git a/libc/config/linux/riscv/headers.txt b/libc/config/linux/riscv/headers.txt
index c858bcc978d9..da203e985060 100644
--- a/libc/config/linux/riscv/headers.txt
+++ b/libc/config/linux/riscv/headers.txt
@@ -28,6 +28,7 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.time
     libc.include.unistd
     libc.include.wchar
+    libc.include.uchar
 
     libc.include.arpa_inet
 
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index a8e289927667..5e3ddd34fb4d 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -22,6 +22,7 @@ set(TARGET_LIBC_ENTRYPOINTS
 
     # fcntl.h entrypoints
     libc.src.fcntl.creat
+    libc.src.fcntl.fcntl
     libc.src.fcntl.open
     libc.src.fcntl.openat
 
@@ -560,12 +561,14 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.lroundf128
     libc.src.math.modff128
     libc.src.math.nanf128
+    libc.src.math.nearbyintf128
     libc.src.math.nextafterf128
     libc.src.math.nextdownf128
     libc.src.math.nextupf128
     libc.src.math.rintf128
     libc.src.math.roundevenf128
     libc.src.math.roundf128
+    libc.src.math.scalbnf128
     libc.src.math.sqrtf128
     libc.src.math.truncf128
     libc.src.math.ufromfpf128
diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt
index e51c79319427..44d640b75e2b 100644
--- a/libc/config/linux/x86_64/headers.txt
+++ b/libc/config/linux/x86_64/headers.txt
@@ -29,6 +29,7 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.time
     libc.include.unistd
     libc.include.wchar
+    libc.include.uchar
 
     libc.include.arpa_inet
 
diff --git a/libc/docs/c23.rst b/libc/docs/c23.rst
index 44724fe1660c..8ccfd4627179 100644
--- a/libc/docs/c23.rst
+++ b/libc/docs/c23.rst
@@ -158,4 +158,4 @@ Additions:
 
   * mbrtoc8
   * c8rtomb
-  * char*_t
+  * char*_t |check|
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 7a7b6c9c8db5..28503e1d13ab 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -188,7 +188,7 @@ Basic Operations
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | nan              | |check|          | |check|         | |check|                |                      | |check|                | 7.12.11.2              | F.10.8.2                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| nearbyint        | |check|          | |check|         | |check|                |                      |                        | 7.12.9.3               | F.10.6.3                   |
+| nearbyint        | |check|          | |check|         | |check|                |                      | |check|                | 7.12.9.3               | F.10.6.3                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | nextafter        | |check|          | |check|         | |check|                |                      | |check|                | 7.12.11.3              | F.10.8.3                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
@@ -208,7 +208,7 @@ Basic Operations
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | roundeven        | |check|          | |check|         | |check|                |                      | |check|                | 7.12.9.8               | F.10.6.8                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| scalbn           | |check|          | |check|         | |check|                |                      |                        | 7.12.6.19              | F.10.3.19                  |
+| scalbn           | |check|          | |check|         | |check|                |                      | |check|                | 7.12.6.19              | F.10.3.19                  |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | trunc            | |check|          | |check|         | |check|                |                      | |check|                | 7.12.9.9               | F.10.6.9                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index fb7c342f92b7..179b05e6ee96 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -33,6 +33,15 @@ add_proxy_header_library(
 )
 
 add_proxy_header_library(
+  fcntl_macros
+  HDRS
+    fcntl_macros.h
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-macros.fcntl_macros
+    libc.include.fcntl
+)
+
+add_proxy_header_library(
   fenv_macros
   HDRS
     fenv_macros.h
diff --git a/libc/hdr/fcntl_macros.h b/libc/hdr/fcntl_macros.h
new file mode 100644
index 000000000000..828cb984c0cb
--- /dev/null
+++ b/libc/hdr/fcntl_macros.h
@@ -0,0 +1,22 @@
+//===-- Definition of macros from fcntl/fcntl.h ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_FCNTL_MACROS_H
+#define LLVM_LIBC_HDR_FCNTL_MACROS_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-macros/fcntl-macros.h"
+
+#else // Overlay mode
+
+#include <fcntl.h>
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_FCNTL_MACROS_H
diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index f53766777e75..46a66ec59020 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -15,6 +15,30 @@ add_proxy_header_library(
 )
 
 add_proxy_header_library(
+  struct_flock
+  HDRS
+    struct_flock.h
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.struct_flock
+)
+
+add_proxy_header_library(
+  struct_flock64
+  HDRS
+    struct_flock64.h
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.struct_flock64
+)
+
+add_proxy_header_library(
+  struct_f_owner_ex
+  HDRS
+    struct_f_owner_ex.h
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.struct_f_owner_ex
+)
+
+add_proxy_header_library(
   struct_timespec
   HDRS
     struct_timespec.h
diff --git a/libc/hdr/types/struct_f_owner_ex.h b/libc/hdr/types/struct_f_owner_ex.h
new file mode 100644
index 000000000000..49985115ae4b
--- /dev/null
+++ b/libc/hdr/types/struct_f_owner_ex.h
@@ -0,0 +1,21 @@
+//===-- Proxy for struct f_owner_ex  --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_HDR_TYPES_STRUCT_F_OWNER_EX_H
+#define LLVM_LIBC_HDR_TYPES_STRUCT_F_OWNER_EX_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/struct_f_owner_ex.h"
+
+#else
+
+#include <fcntl.h>
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_STRUCT_F_OWNER_EX_H
diff --git a/libc/hdr/types/struct_flock.h b/libc/hdr/types/struct_flock.h
new file mode 100644
index 000000000000..a552b91c432b
--- /dev/null
+++ b/libc/hdr/types/struct_flock.h
@@ -0,0 +1,21 @@
+//===-- Proxy for struct flock  -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_HDR_TYPES_STRUCT_FLOCK_H
+#define LLVM_LIBC_HDR_TYPES_STRUCT_FLOCK_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/struct_flock.h"
+
+#else
+
+#include <fcntl.h>
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_STRUCT_FLOCK_H
diff --git a/libc/hdr/types/struct_flock64.h b/libc/hdr/types/struct_flock64.h
new file mode 100644
index 000000000000..84fe67816c33
--- /dev/null
+++ b/libc/hdr/types/struct_flock64.h
@@ -0,0 +1,21 @@
+//===-- Proxy for struct flock64  -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_HDR_TYPES_STRUCT_FLOCK64_H
+#define LLVM_LIBC_HDR_TYPES_STRUCT_FLOCK64_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/struct_flock64.h"
+
+#else
+
+#include <fcntl.h>
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_STRUCT_FLOCK64_H
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index aeef46aabfce..6101ec136b26 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -43,6 +43,10 @@ add_gen_header(
   DEPENDS
     .llvm-libc-macros.fcntl_macros
     .llvm-libc-types.mode_t
+    .llvm-libc-types.struct_flock
+    .llvm-libc-types.struct_flock64
+    .llvm-libc-types.off64_t
+    .llvm-libc-types.pid_t
     .llvm-libc-types.off_t
     .llvm_libc_common_h
 )
@@ -603,6 +607,9 @@ add_gen_header(
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-types.mbstate_t
+    .llvm-libc-types.char8_t
+    .llvm-libc-types.char16_t
+    .llvm-libc-types.char32_t
 )
 
 add_gen_header(
diff --git a/libc/include/assert.h.def b/libc/include/assert.h.def
index e006133a7654..15077e53e2ca 100644
--- a/libc/include/assert.h.def
+++ b/libc/include/assert.h.def
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "__llvm-libc-common.h"
+#include "llvm-libc-macros/assert-macros.h"
 
 // This file may be usefully included multiple times to change assert()'s
 // definition based on NDEBUG.
diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt
index 382cb8ee417e..68ba110aec80 100644
--- a/libc/include/llvm-libc-macros/CMakeLists.txt
+++ b/libc/include/llvm-libc-macros/CMakeLists.txt
@@ -32,6 +32,12 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
 endif()
 
 add_macro_header(
+  assert_macros
+  HDR
+    assert-macros.h
+)
+
+add_macro_header(
   generic_error_number_macros
   HDR
     generic-error-number-macros.h
diff --git a/libc/include/llvm-libc-macros/assert-macros.h b/libc/include/llvm-libc-macros/assert-macros.h
new file mode 100644
index 000000000000..44e14543d856
--- /dev/null
+++ b/libc/include/llvm-libc-macros/assert-macros.h
@@ -0,0 +1,14 @@
+//===-- Definition of macros to be used with assert functions -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __LLVM_LIBC_MACROS_ASSERT_MACROS_H
+#define __LLVM_LIBC_MACROS_ASSERT_MACROS_H
+
+#define __STDC_VERSION_ASSERT_H__ 202311L
+
+#endif // __LLVM_LIBC_MACROS_ASSERT_MACROS_H
diff --git a/libc/include/llvm-libc-macros/linux/fcntl-macros.h b/libc/include/llvm-libc-macros/linux/fcntl-macros.h
index 1d4e5bbbdc77..8ee95863728e 100644
--- a/libc/include/llvm-libc-macros/linux/fcntl-macros.h
+++ b/libc/include/llvm-libc-macros/linux/fcntl-macros.h
@@ -67,5 +67,36 @@
 #define F_SETFD 2
 #define F_GETFL 3
 #define F_SETFL 4
+#define F_GETLK 5
+#define F_SETLK 6
+#define F_SETLKW 7
+#define F_SETOWN 8
+#define F_GETOWN 9
+#define F_SETSIG 10
+#define F_GETSIG 11
+#define F_GETLK64 12
+#define F_SETLK64 13
+#define F_SETLKW64 14
+#define F_SETOWN_EX 15
+#define F_GETOWN_EX 16
+
+// Open File Description Locks.
+#define F_OFD_GETLK 36
+#define F_OFD_SETLK 37
+#define F_OFD_SETLKW 38
+
+// Close on succesful
+#define F_CLOEXEC 1
+
+#define F_RDLCK 0
+#define F_WRLCK 1
+#define F_UNLCK 2
+
+// For Large File Support
+#if defined(_LARGEFILE64_SOURCE)
+#define F_GETLK F_GETLK64
+#define F_SETLK F_SETLK64
+#define F_SETLKW F_SETLKW64
+#endif
 
 #endif // LLVM_LIBC_MACROS_LINUX_FCNTL_MACROS_H
diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt
index 310374fb62ff..018b6c58316c 100644
--- a/libc/include/llvm-libc-types/CMakeLists.txt
+++ b/libc/include/llvm-libc-types/CMakeLists.txt
@@ -60,6 +60,9 @@ add_header(rlim_t HDR rlim_t.h)
 add_header(time_t HDR time_t.h)
 add_header(stack_t HDR stack_t.h)
 add_header(suseconds_t HDR suseconds_t.h)
+add_header(struct_flock HDR struct_flock.h DEPENDS .off_t .pid_t)
+add_header(struct_flock64 HDR struct_flock64.h DEPENDS .off64_t .pid_t)
+add_header(struct_f_owner_ex HDR struct_f_owner_ex.h DEPENDS .pid_t)
 add_header(struct_timeval HDR struct_timeval.h DEPENDS .suseconds_t .time_t)
 add_header(struct_rlimit HDR struct_rlimit.h DEPENDS .rlim_t)
 add_header(struct_rusage HDR struct_rusage.h DEPENDS .struct_timeval)
@@ -90,6 +93,21 @@ add_header(tcflag_t HDR tcflag_t.h)
 add_header(struct_termios HDR struct_termios.h DEPENDS .cc_t .speed_t .tcflag_t)
 add_header(__getoptargv_t HDR __getoptargv_t.h)
 add_header(wchar_t HDR wchar_t.h)
+add_header(char8_t HDR char8_t.h)
+add_header(
+    char16_t 
+  HDR 
+    char16_t.h
+  DEPENDS
+    libc.include.llvm-libc-macros.stdint_macros
+)
+add_header(
+    char32_t 
+  HDR 
+    char32_t.h
+  DEPENDS
+    libc.include.llvm-libc-macros.stdint_macros
+)
 add_header(wint_t HDR wint_t.h)
 add_header(sa_family_t HDR sa_family_t.h)
 add_header(socklen_t HDR socklen_t.h)
diff --git a/libc/include/llvm-libc-types/char16_t.h b/libc/include/llvm-libc-types/char16_t.h
new file mode 100644
index 000000000000..1f5847ae771b
--- /dev/null
+++ b/libc/include/llvm-libc-types/char16_t.h
@@ -0,0 +1,17 @@
+//===-- Definition of char16_t type ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_CHAR16_T_H
+#define LLVM_LIBC_TYPES_CHAR16_T_H
+
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+#include "../llvm-libc-macros/stdint-macros.h"
+typedef uint_least16_t char16_t;
+#endif
+
+#endif // LLVM_LIBC_TYPES_CHAR16_T_H
diff --git a/libc/include/llvm-libc-types/char32_t.h b/libc/include/llvm-libc-types/char32_t.h
new file mode 100644
index 000000000000..20b72dc5d67e
--- /dev/null
+++ b/libc/include/llvm-libc-types/char32_t.h
@@ -0,0 +1,17 @@
+//===-- Definition of char32_t type ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_CHAR32_T_H
+#define LLVM_LIBC_TYPES_CHAR32_T_H
+
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+#include "../llvm-libc-macros/stdint-macros.h"
+typedef uint_least32_t char32_t;
+#endif
+
+#endif // LLVM_LIBC_TYPES_CHAR32_T_H
diff --git a/libc/include/llvm-libc-types/char8_t.h b/libc/include/llvm-libc-types/char8_t.h
new file mode 100644
index 000000000000..ddadab1afa21
--- /dev/null
+++ b/libc/include/llvm-libc-types/char8_t.h
@@ -0,0 +1,17 @@
+//===-- Definition of char8_t type ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_CHAR8_T_H
+#define LLVM_LIBC_TYPES_CHAR8_T_H
+
+#if !defined(__cplusplus) && defined(__STDC_VERSION__) &&                      \
+    __STDC_VERSION__ >= 202311L
+typedef unsigned char char8_t;
+#endif
+
+#endif // LLVM_LIBC_TYPES_CHAR8_T_H
diff --git a/libc/include/llvm-libc-types/struct_f_owner_ex.h b/libc/include/llvm-libc-types/struct_f_owner_ex.h
new file mode 100644
index 000000000000..c9cc85f69d2b
--- /dev/null
+++ b/libc/include/llvm-libc-types/struct_f_owner_ex.h
@@ -0,0 +1,25 @@
+//===-- Definition of type struct f_owner_ex ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_STRUCT_F_OWNER_EX_H
+#define LLVM_LIBC_TYPES_STRUCT_F_OWNER_EX_H
+
+#include "llvm-libc-types/pid_t.h"
+
+enum pid_type {
+  F_OWNER_TID = 0,
+  F_OWNER_PID,
+  F_OWNER_PGRP,
+};
+
+struct f_owner_ex {
+  enum pid_type type;
+  pid_t pid;
+};
+
+#endif // LLVM_LIBC_TYPES_STRUCT_F_OWNER_EX_H
diff --git a/libc/include/llvm-libc-types/struct_flock.h b/libc/include/llvm-libc-types/struct_flock.h
new file mode 100644
index 000000000000..51c9d27640ea
--- /dev/null
+++ b/libc/include/llvm-libc-types/struct_flock.h
@@ -0,0 +1,25 @@
+//===-- Definition of type struct flock64 ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_STRUCT_FLOCK_H
+#define LLVM_LIBC_TYPES_STRUCT_FLOCK_H
+
+#include "llvm-libc-types/off_t.h"
+#include "llvm-libc-types/pid_t.h"
+
+#include <stdint.h>
+
+struct flock {
+  int16_t l_type;
+  int16_t l_whence;
+  off_t l_start;
+  off_t l_len;
+  pid_t l_pid;
+};
+
+#endif // LLVM_LIBC_TYPES_STRUCT_FLOCK_H
diff --git a/libc/include/llvm-libc-types/struct_flock64.h b/libc/include/llvm-libc-types/struct_flock64.h
new file mode 100644
index 000000000000..ac50003ca62f
--- /dev/null
+++ b/libc/include/llvm-libc-types/struct_flock64.h
@@ -0,0 +1,25 @@
+//===-- Definition of type struct flock64 ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_STRUCT_FLOCK64_H
+#define LLVM_LIBC_TYPES_STRUCT_FLOCK64_H
+
+#include "llvm-libc-types/off64_t.h"
+#include "llvm-libc-types/pid_t.h"
+
+#include <stdint.h>
+
+struct flock64 {
+  int16_t l_type;
+  int16_t l_whence;
+  off64_t l_start;
+  off64_t l_len;
+  pid_t l_pid;
+};
+
+#endif // LLVM_LIBC_TYPES_STRUCT_FLOCK64_H
diff --git a/libc/spec/posix.td b/libc/spec/posix.td
index d428d54e32a3..e7a0cf883c60 100644
--- a/libc/spec/posix.td
+++ b/libc/spec/posix.td
@@ -231,6 +231,11 @@ def POSIX : StandardSpec<"POSIX"> {
           [ArgSpec<ConstCharPtr>, ArgSpec<ModeTType>]
       >,
       FunctionSpec<
+          "fcntl",
+          RetValSpec<IntType>,
+          [ArgSpec<IntType>, ArgSpec<IntType>, ArgSpec<VarArgType>]
+      >,
+      FunctionSpec<
           "open",
           RetValSpec<IntType>,
           [ArgSpec<ConstCharPtr>, ArgSpec<IntType>, ArgSpec<VarArgType>]
diff --git a/libc/spec/spec.td b/libc/spec/spec.td
index 87bf4435e167..ea8fa4cd373c 100644
--- a/libc/spec/spec.td
+++ b/libc/spec/spec.td
@@ -65,6 +65,9 @@ def SizeTType : NamedType<"size_t">;
 def SizeTPtr : PtrType<SizeTType>;
 def RestrictedSizeTPtr : RestrictedPtrType<SizeTType>;
 
+def Char8TType : NamedType<"char8_t">;
+def Char16TType : NamedType<"char16_t">;
+def Char32TType : NamedType<"char32_t">;
 def WCharType : NamedType<"wchar_t">;
 def WIntType : NamedType<"wint_t">;
 
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 01aa7c70b3b9..eb67c9b0b009 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -604,6 +604,7 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"nearbyint", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
           FunctionSpec<"nearbyintf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
           FunctionSpec<"nearbyintl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
+          GuardedFunctionSpec<"nearbyintf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
 
           FunctionSpec<"nextafterf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
           FunctionSpec<"nextafter", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
@@ -647,6 +648,7 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"scalbn", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntType>]>,
           FunctionSpec<"scalbnf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<IntType>]>,
           FunctionSpec<"scalbnl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<IntType>]>,
+          GuardedFunctionSpec<"scalbnf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<IntType>], "LIBC_TYPES_HAS_FLOAT128">,
 
           FunctionSpec<"nanf", RetValSpec<FloatType>, [ArgSpec<ConstCharPtr>]>,
           FunctionSpec<"nan", RetValSpec<DoubleType>, [ArgSpec<ConstCharPtr>]>,
@@ -1396,6 +1398,9 @@ def StdC : StandardSpec<"stdc"> {
       [], // Macros
       [ //Types
         MBStateTType,
+        Char8TType,
+        Char16TType,
+        Char32TType,
       ],
       [], // Enumerations
       []
diff --git a/libc/src/__support/fixed_point/sqrt.h b/libc/src/__support/fixed_point/sqrt.h
index 4ec016ceab00..982e318e2d1e 100644
--- a/libc/src/__support/fixed_point/sqrt.h
+++ b/libc/src/__support/fixed_point/sqrt.h
@@ -55,18 +55,22 @@ template <> struct SqrtConfig<unsigned fract> {
   // Linear approximation for the initial values, with errors bounded by:
   //   max(1.5 * 2^-11, eps)
   // Generated with Sollya:
-  // > for i from 4 to 15 do {
+  // > for i from 4 to 14 do {
   //     P = fpminimax(sqrt(x), 1, [|16, 16|], [i * 2^-4, (i + 1)*2^-4],
   //                   fixed, absolute);
   //     print("{", coeff(P, 1), "ur,", coeff(P, 0), "ur},");
   //   };
+  // For the last interval [15/16, 1), we choose the linear function Q such that
+  //   Q(1) = 1 and Q(15/16) = P(15/16),
+  // where P is the polynomial generated by Sollya above for [14/16, 15/16].
+  // This is to prevent overflow in the last interval [15/16, 1).
   static constexpr Type FIRST_APPROX[12][2] = {
       {0x1.e378p-1ur, 0x1.0ebp-2ur},  {0x1.b512p-1ur, 0x1.2b94p-2ur},
       {0x1.91fp-1ur, 0x1.45dcp-2ur},  {0x1.7622p-1ur, 0x1.5e24p-2ur},
       {0x1.5f5ap-1ur, 0x1.74e4p-2ur}, {0x1.4c58p-1ur, 0x1.8a4p-2ur},
       {0x1.3c1ep-1ur, 0x1.9e84p-2ur}, {0x1.2e0cp-1ur, 0x1.b1d8p-2ur},
       {0x1.21aap-1ur, 0x1.c468p-2ur}, {0x1.16bap-1ur, 0x1.d62cp-2ur},
-      {0x1.0cfp-1ur, 0x1.e74cp-2ur},  {0x1.0418p-1ur, 0x1.f7ep-2ur},
+      {0x1.0cfp-1ur, 0x1.e74cp-2ur},  {0x1.039p-1ur, 0x1.f8ep-2ur},
   };
 };
 
@@ -77,11 +81,15 @@ template <> struct SqrtConfig<unsigned long fract> {
   // Linear approximation for the initial values, with errors bounded by:
   //   max(1.5 * 2^-11, eps)
   // Generated with Sollya:
-  // > for i from 4 to 15 do {
+  // > for i from 4 to 14 do {
   //     P = fpminimax(sqrt(x), 1, [|32, 32|], [i * 2^-4, (i + 1)*2^-4],
   //                   fixed, absolute);
   //     print("{", coeff(P, 1), "ulr,", coeff(P, 0), "ulr},");
   //   };
+  // For the last interval [15/16, 1), we choose the linear function Q such that
+  //   Q(1) = 1 and Q(15/16) = P(15/16),
+  // where P is the polynomial generated by Sollya above for [14/16, 15/16].
+  // This is to prevent overflow in the last interval [15/16, 1).
   static constexpr Type FIRST_APPROX[12][2] = {
       {0x1.e3779b98p-1ulr, 0x1.0eaff788p-2ulr},
       {0x1.b5167872p-1ulr, 0x1.2b908ad4p-2ulr},
@@ -94,7 +102,7 @@ template <> struct SqrtConfig<unsigned long fract> {
       {0x1.21b05c0ap-1ulr, 0x1.c45e023p-2ulr},
       {0x1.16becd02p-1ulr, 0x1.d624031p-2ulr},
       {0x1.0cf49fep-1ulr, 0x1.e743b844p-2ulr},
-      {0x1.04214e9cp-1ulr, 0x1.f7ce2c3cp-2ulr},
+      {0x1.038cdfcp-1ulr, 0x1.f8e6408p-2ulr},
   };
 };
 
diff --git a/libc/src/fcntl/CMakeLists.txt b/libc/src/fcntl/CMakeLists.txt
index 0b9ee47c4f7c..77400e9050d0 100644
--- a/libc/src/fcntl/CMakeLists.txt
+++ b/libc/src/fcntl/CMakeLists.txt
@@ -10,6 +10,13 @@ add_entrypoint_object(
 )
 
 add_entrypoint_object(
+  fcntl
+  ALIAS
+  DEPENDS
+    .${LIBC_TARGET_OS}.fcntl
+)
+
+add_entrypoint_object(
   open
   ALIAS
   DEPENDS
diff --git a/libc/src/fcntl/fcntl.h b/libc/src/fcntl/fcntl.h
new file mode 100644
index 000000000000..8fe3fb3146b9
--- /dev/null
+++ b/libc/src/fcntl/fcntl.h
@@ -0,0 +1,18 @@
+//===-- Implementation header of fcntl --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_FCNTL_FCNTL_H
+#define LLVM_LIBC_SRC_FCNTL_FCNTL_H
+
+namespace LIBC_NAMESPACE {
+
+int fcntl(int fd, int cmd, ...);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_FCNTL_FCNTL_H
diff --git a/libc/src/fcntl/linux/CMakeLists.txt b/libc/src/fcntl/linux/CMakeLists.txt
index 87b8d4695c4f..732b7beac41b 100644
--- a/libc/src/fcntl/linux/CMakeLists.txt
+++ b/libc/src/fcntl/linux/CMakeLists.txt
@@ -11,6 +11,22 @@ add_entrypoint_object(
 )
 
 add_entrypoint_object(
+  fcntl
+  SRCS
+    fcntl.cpp
+  HDRS
+    ../fcntl.h
+  DEPENDS
+    libc.include.fcntl
+    libc.hdr.types.struct_flock
+    libc.hdr.types.struct_flock64
+    libc.hdr.types.struct_f_owner_ex
+    libc.hdr.fcntl_macros
+    libc.src.__support.OSUtil.osutil
+    libc.src.errno.errno
+)
+
+add_entrypoint_object(
   open
   SRCS
     open.cpp
diff --git a/libc/src/fcntl/linux/fcntl.cpp b/libc/src/fcntl/linux/fcntl.cpp
new file mode 100644
index 000000000000..24a20fb36410
--- /dev/null
+++ b/libc/src/fcntl/linux/fcntl.cpp
@@ -0,0 +1,93 @@
+//===-- Implementation of fcntl -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/fcntl/fcntl.h"
+
+#include "hdr/fcntl_macros.h"
+#include "hdr/types/struct_f_owner_ex.h"
+#include "hdr/types/struct_flock.h"
+#include "hdr/types/struct_flock64.h"
+#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "src/__support/common.h"
+#include "src/errno/libc_errno.h"
+
+#include <stdarg.h>
+#include <sys/syscall.h> // For syscall numbers.
+
+// The OFD file locks require special handling for LARGEFILES
+namespace LIBC_NAMESPACE {
+LLVM_LIBC_FUNCTION(int, fcntl, (int fd, int cmd, ...)) {
+  void *arg;
+  va_list varargs;
+  va_start(varargs, cmd);
+  arg = va_arg(varargs, void *);
+  va_end(varargs);
+
+  switch (cmd) {
+  case F_SETLKW:
+    return syscall_impl<int>(SYS_fcntl, fd, cmd, arg);
+  case F_OFD_SETLKW: {
+    struct flock *flk = reinterpret_cast<struct flock *>(arg);
+    // convert the struct to a flock64
+    struct flock64 flk64;
+    flk64.l_type = flk->l_type;
+    flk64.l_whence = flk->l_whence;
+    flk64.l_start = flk->l_start;
+    flk64.l_len = flk->l_len;
+    flk64.l_pid = flk->l_pid;
+    // create a syscall
+    return syscall_impl<int>(SYS_fcntl, fd, cmd, &flk64);
+  }
+  case F_OFD_GETLK:
+  case F_OFD_SETLK: {
+    struct flock *flk = reinterpret_cast<struct flock *>(arg);
+    // convert the struct to a flock64
+    struct flock64 flk64;
+    flk64.l_type = flk->l_type;
+    flk64.l_whence = flk->l_whence;
+    flk64.l_start = flk->l_start;
+    flk64.l_len = flk->l_len;
+    flk64.l_pid = flk->l_pid;
+    // create a syscall
+    int retVal = syscall_impl<int>(SYS_fcntl, fd, cmd, &flk64);
+    // On failure, return
+    if (retVal == -1)
+      return -1;
+    // Check for overflow, i.e. the offsets are not the same when cast
+    // to off_t from off64_t.
+    if (static_cast<off_t>(flk64.l_len) != flk64.l_len ||
+        static_cast<off_t>(flk64.l_start) != flk64.l_start) {
+      libc_errno = EOVERFLOW;
+      return -1;
+    }
+    // Now copy back into flk, in case flk64 got modified
+    flk->l_type = flk64.l_type;
+    flk->l_whence = flk64.l_whence;
+    flk->l_start = flk64.l_start;
+    flk->l_len = flk64.l_len;
+    flk->l_pid = flk64.l_pid;
+    return retVal;
+  }
+  case F_GETOWN: {
+    struct f_owner_ex fex;
+    int retVal = syscall_impl<int>(SYS_fcntl, fd, F_GETOWN_EX, &fex);
+    if (retVal == -EINVAL)
+      return syscall_impl<int>(SYS_fcntl, fd, cmd,
+                               reinterpret_cast<void *>(arg));
+    if (static_cast<unsigned long>(retVal) <= -4096UL)
+      return fex.type == F_OWNER_PGRP ? -fex.pid : fex.pid;
+
+    libc_errno = -retVal;
+    return -1;
+  }
+  // The general case
+  default:
+    return syscall_impl<int>(SYS_fcntl, fd, cmd, reinterpret_cast<void *>(arg));
+  }
+}
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index e8f699fabe36..c34c58575441 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -252,6 +252,7 @@ add_math_entrypoint_object(nanf128)
 add_math_entrypoint_object(nearbyint)
 add_math_entrypoint_object(nearbyintf)
 add_math_entrypoint_object(nearbyintl)
+add_math_entrypoint_object(nearbyintf128)
 
 add_math_entrypoint_object(nextafter)
 add_math_entrypoint_object(nextafterf)
@@ -301,6 +302,7 @@ add_math_entrypoint_object(roundevenf128)
 add_math_entrypoint_object(scalbn)
 add_math_entrypoint_object(scalbnf)
 add_math_entrypoint_object(scalbnl)
+add_math_entrypoint_object(scalbnf128)
 
 add_math_entrypoint_object(sincos)
 add_math_entrypoint_object(sincosf)
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 574e000b82a8..daaf505008ca 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -704,7 +704,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.nearest_integer_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
 )
 
 add_entrypoint_object(
@@ -716,7 +716,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.nearest_integer_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
 )
 
 add_entrypoint_object(
@@ -728,7 +728,20 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.FPUtil.nearest_integer_operations
   COMPILE_OPTIONS
-    -O2
+    -O3
+)
+
+add_entrypoint_object(
+  nearbyintf128
+  SRCS
+    nearbyintf128.cpp
+  HDRS
+    ../nearbyintf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.nearest_integer_operations
+  COMPILE_OPTIONS
+    -O3
 )
 
 add_object_library(
@@ -2950,6 +2963,19 @@ add_entrypoint_object(
 )
 
 add_entrypoint_object(
+  scalbnf128
+  SRCS
+    scalbnf128.cpp
+  HDRS
+    ../scalbnf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
   fmaf
   SRCS
     fmaf.cpp
diff --git a/libc/src/math/generic/nearbyintf128.cpp b/libc/src/math/generic/nearbyintf128.cpp
new file mode 100644
index 000000000000..fca3587f5b58
--- /dev/null
+++ b/libc/src/math/generic/nearbyintf128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nearbyintf128 function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nearbyintf128.h"
+#include "src/__support/FPUtil/NearestIntegerOperations.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, nearbyintf128, (float128 x)) {
+  return fputil::round_using_current_rounding_mode(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/scalbnf128.cpp b/libc/src/math/generic/scalbnf128.cpp
new file mode 100644
index 000000000000..be3d29ed27e9
--- /dev/null
+++ b/libc/src/math/generic/scalbnf128.cpp
@@ -0,0 +1,27 @@
+//===-- Implementation of scalbnf128 function -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/scalbnf128.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, scalbnf128, (float128 x, int n)) {
+// TODO: should be switched to use `FLT_RADIX` in hdr/float_macros.h" instead
+// see: https://github.com/llvm/llvm-project/issues/90496
+#if !defined(__FLT_RADIX__)
+#error __FLT_RADIX__ undefined.
+#elif __FLT_RADIX__ != 2
+#error __FLT_RADIX__!=2, unimplemented.
+#else
+  return fputil::ldexp(x, n);
+#endif
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nearbyintf128.h b/libc/src/math/nearbyintf128.h
new file mode 100644
index 000000000000..d12754a48100
--- /dev/null
+++ b/libc/src/math/nearbyintf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for nearbyintf128 -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEARBYINTF128_H
+#define LLVM_LIBC_SRC_MATH_NEARBYINTF128_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 nearbyintf128(float128 x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEARBYINTF128_H
diff --git a/libc/src/math/scalbnf128.h b/libc/src/math/scalbnf128.h
new file mode 100644
index 000000000000..bd3b560fb7cc
--- /dev/null
+++ b/libc/src/math/scalbnf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for scalbnf128 --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_SCALBNF128_H
+#define LLVM_LIBC_SRC_MATH_SCALBNF128_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 scalbnf128(float128 x, int n);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_SCALBNF128_H
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index a76e0b8ef6f6..c58c322c981e 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -219,4 +219,25 @@ template <typename T> struct FPTest : public Test {
     }                                                                          \
   } while (0)
 
+#define EXPECT_FP_EQ_ROUNDING_MODE(expected, actual, rounding_mode)            \
+  do {                                                                         \
+    using namespace LIBC_NAMESPACE::fputil::testing;                           \
+    ForceRoundingMode __r((rounding_mode));                                    \
+    if (__r.success) {                                                         \
+      EXPECT_FP_EQ((expected), (actual));                                      \
+    }                                                                          \
+  } while (0)
+
+#define EXPECT_FP_EQ_ROUNDING_NEAREST(expected, actual)                        \
+  EXPECT_FP_EQ_ROUNDING_MODE((expected), (actual), RoundingMode::Nearest)
+
+#define EXPECT_FP_EQ_ROUNDING_UPWARD(expected, actual)                         \
+  EXPECT_FP_EQ_ROUNDING_MODE((expected), (actual), RoundingMode::Upward)
+
+#define EXPECT_FP_EQ_ROUNDING_DOWNWARD(expected, actual)                       \
+  EXPECT_FP_EQ_ROUNDING_MODE((expected), (actual), RoundingMode::Downward)
+
+#define EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(expected, actual)                    \
+  EXPECT_FP_EQ_ROUNDING_MODE((expected), (actual), RoundingMode::TowardZero)
+
 #endif // LLVM_LIBC_TEST_UNITTEST_FPMATCHER_H
diff --git a/libc/test/include/CMakeLists.txt b/libc/test/include/CMakeLists.txt
index 8d8dff53169f..03c31855e352 100644
--- a/libc/test/include/CMakeLists.txt
+++ b/libc/test/include/CMakeLists.txt
@@ -1,6 +1,16 @@
 add_custom_target(libc_include_tests)
 
 add_libc_test(
+  assert_test
+  SUITE
+    libc_include_tests
+  SRCS
+    assert_test.cpp
+  DEPENDS
+    libc.include.llvm-libc-macros.assert_macros
+)
+
+add_libc_test(
   sys_queue_test
   SUITE
     libc_include_tests
diff --git a/libc/test/include/assert_test.cpp b/libc/test/include/assert_test.cpp
new file mode 100644
index 000000000000..78709bbcdd59
--- /dev/null
+++ b/libc/test/include/assert_test.cpp
@@ -0,0 +1,15 @@
+//===-- Unittests for assert ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDSList-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "include/llvm-libc-macros/assert-macros.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcAssertTest, VersionMacro) {
+  // 7.2p3 an integer constant expression with a value equivalent to 202311L.
+  EXPECT_EQ(__STDC_VERSION_ASSERT_H__, 202311L);
+}
diff --git a/libc/test/src/fcntl/CMakeLists.txt b/libc/test/src/fcntl/CMakeLists.txt
index ae39d8d5f878..aae296f074be 100644
--- a/libc/test/src/fcntl/CMakeLists.txt
+++ b/libc/test/src/fcntl/CMakeLists.txt
@@ -18,6 +18,23 @@ add_libc_unittest(
 )
 
 add_libc_unittest(
+  fcntl_test
+  SUITE
+    libc_fcntl_unittests
+  SRCS
+    fcntl_test.cpp
+  DEPENDS
+    libc.include.fcntl
+    libc.src.errno.errno
+    libc.src.fcntl.fcntl
+    libc.src.fcntl.open
+    libc.src.unistd.close
+    libc.hdr.types.struct_flock
+    libc.hdr.fcntl_macros
+    libc.test.UnitTest.ErrnoSetterMatcher
+)
+
+add_libc_unittest(
   openat_test
   SUITE
     libc_fcntl_unittests
diff --git a/libc/test/src/fcntl/fcntl_test.cpp b/libc/test/src/fcntl/fcntl_test.cpp
new file mode 100644
index 000000000000..c5cbb61b4ed8
--- /dev/null
+++ b/libc/test/src/fcntl/fcntl_test.cpp
@@ -0,0 +1,155 @@
+//===-- Unittest for fcntl ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/fcntl_macros.h"
+#include "hdr/types/struct_flock.h"
+#include "src/errno/libc_errno.h"
+#include "src/fcntl/fcntl.h"
+#include "src/fcntl/open.h"
+#include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <stdio.h>
+#include <sys/stat.h> // For S_IRWXU
+
+TEST(LlvmLibcFcntlTest, FcntlDupfd) {
+  using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+  constexpr const char *TEST_FILE_NAME = "testdata/fcntl_dup.test";
+  auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
+  int fd2, fd3;
+  int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC, S_IRWXU);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(fd, 0);
+
+  fd2 = LIBC_NAMESPACE::fcntl(fd, F_DUPFD, 0);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(fd2, 0);
+
+  fd3 = LIBC_NAMESPACE::fcntl(fd, F_DUPFD, 10);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(fd3, 0);
+
+  ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
+  ASSERT_THAT(LIBC_NAMESPACE::close(fd2), Succeeds(0));
+  ASSERT_THAT(LIBC_NAMESPACE::close(fd3), Succeeds(0));
+}
+
+TEST(LlvmLibcFcntlTest, FcntlGetFl) {
+  using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+  constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getfl.test";
+  auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
+  int retVal;
+  int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC, S_IRWXU);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(fd, 0);
+
+  retVal = LIBC_NAMESPACE::fcntl(fd, F_GETFL);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(retVal, -1);
+
+  ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
+}
+
+TEST(LlvmLibcFcntlTest, FcntlSetFl) {
+  using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+  constexpr const char *TEST_FILE_NAME = "testdata/fcntl_setfl.test";
+  auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
+
+  int retVal;
+  int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(fd, 0);
+
+  retVal = LIBC_NAMESPACE::fcntl(fd, F_GETFL);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(retVal, -1);
+
+  int oldFlags = LIBC_NAMESPACE::fcntl(fd, F_GETFL, 0);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(oldFlags, 0);
+
+  // Add the APPEND flag;
+  oldFlags |= O_APPEND;
+
+  retVal = LIBC_NAMESPACE::fcntl(fd, F_SETFL, oldFlags);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(retVal, -1);
+
+  // Remove the APPEND flag;
+  oldFlags = -oldFlags & O_APPEND;
+
+  retVal = LIBC_NAMESPACE::fcntl(fd, F_SETFL, oldFlags);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(retVal, -1);
+
+  ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
+}
+
+TEST(LlvmLibcFcntlTest, FcntlGetLkRead) {
+  using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+  constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test";
+  auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
+
+  struct flock flk, svflk;
+  int retVal;
+  int fd =
+      LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDONLY, S_IRWXU);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(fd, 0);
+
+  flk.l_type = F_RDLCK;
+  flk.l_start = 0;
+  flk.l_whence = SEEK_SET;
+  flk.l_len = 50;
+
+  // copy flk into svflk
+  svflk = flk;
+
+  retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(retVal, -1);
+  ASSERT_NE((int)flk.l_type, F_WRLCK); // File should not be write locked.
+
+  retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(retVal, -1);
+
+  ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
+}
+
+TEST(LlvmLibcFcntlTest, FcntlGetLkWrite) {
+  using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+  constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test";
+  auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
+
+  struct flock flk, svflk;
+  int retVal;
+  int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(fd, 0);
+
+  flk.l_type = F_WRLCK;
+  flk.l_start = 0;
+  flk.l_whence = SEEK_SET;
+  flk.l_len = 0;
+
+  // copy flk into svflk
+  svflk = flk;
+
+  retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(retVal, -1);
+  ASSERT_NE((int)flk.l_type, F_RDLCK); // File should not be read locked.
+
+  retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(retVal, -1);
+
+  ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
+}
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 55119868bdaa..102188c332e4 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1651,6 +1651,21 @@ add_fp_unittest(
 )
 
 add_fp_unittest(
+  scalbnf128_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    scalbnf128_test.cpp
+  HDRS
+    ScalbnTest.h
+  DEPENDS
+    libc.src.math.scalbnf128
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.normal_float
+)
+
+add_fp_unittest(
   erff_test
   NEED_MPFR
   SUITE
diff --git a/libc/test/src/math/scalbnf128_test.cpp b/libc/test/src/math/scalbnf128_test.cpp
new file mode 100644
index 000000000000..dc259de21148
--- /dev/null
+++ b/libc/test/src/math/scalbnf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for scalbnf128 ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ScalbnTest.h"
+
+#include "src/math/scalbnf128.h"
+
+LIST_SCALBN_TESTS(float128, LIBC_NAMESPACE::scalbnf128)
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 22c59c97f6c7..112b2985829c 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -2189,6 +2189,58 @@ add_fp_unittest(
 )
 
 add_fp_unittest(
+  nearbyint_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nearbyint_test.cpp
+  HDRS
+    NearbyIntTest.h
+  DEPENDS
+    libc.hdr.fenv_macros
+    libc.src.math.nearbyint
+)
+
+add_fp_unittest(
+  nearbyintf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nearbyintf_test.cpp
+  HDRS
+    NearbyIntTest.h
+  DEPENDS
+    libc.hdr.fenv_macros
+    libc.src.math.nearbyintf
+)
+
+add_fp_unittest(
+  nearbyintl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nearbyintl_test.cpp
+  HDRS
+    NearbyIntTest.h
+  DEPENDS
+    libc.hdr.fenv_macros
+    libc.src.math.nearbyintl
+)
+
+add_fp_unittest(
+  nearbyintf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nearbyintf128_test.cpp
+  HDRS
+    NearbyIntTest.h
+  DEPENDS
+    libc.hdr.fenv_macros
+    libc.src.math.nearbyintf128
+)
+
+add_fp_unittest(
   nextafter_test
   SUITE
     libc-math-smoke-tests
@@ -2739,7 +2791,6 @@ add_fp_unittest(
   DEPENDS
     libc.src.math.scalbn
     libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.normal_float
 )
 
 add_fp_unittest(
@@ -2753,7 +2804,6 @@ add_fp_unittest(
   DEPENDS
     libc.src.math.scalbnf
     libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.normal_float
 )
 
 add_fp_unittest(
@@ -2767,7 +2817,19 @@ add_fp_unittest(
   DEPENDS
     libc.src.math.scalbnl
     libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.normal_float
+)
+
+add_fp_unittest(
+  scalbnf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    scalbnf128_test.cpp
+  HDRS
+    ScalbnTest.h
+  DEPENDS
+    libc.src.math.scalbnf128
+    libc.src.__support.FPUtil.fp_bits
 )
 
 add_fp_unittest(
diff --git a/libc/test/src/math/smoke/NearbyIntTest.h b/libc/test/src/math/smoke/NearbyIntTest.h
new file mode 100644
index 000000000000..0051ff9447a7
--- /dev/null
+++ b/libc/test/src/math/smoke/NearbyIntTest.h
@@ -0,0 +1,100 @@
+//===-- Utility class to test different flavors of nearbyint ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_NEARBYINTTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_NEARBYINTTEST_H
+
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+static constexpr int ROUNDING_MODES[4] = {FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO,
+                                          FE_TONEAREST};
+
+template <typename T>
+class NearbyIntTestTemplate : public LIBC_NAMESPACE::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*NearbyIntFunc)(T);
+
+  void testNaN(NearbyIntFunc func) {
+    EXPECT_FP_EQ_ALL_ROUNDING(func(aNaN), aNaN);
+  }
+
+  void testInfinities(NearbyIntFunc func) {
+    EXPECT_FP_EQ_ALL_ROUNDING(func(inf), inf);
+    EXPECT_FP_EQ_ALL_ROUNDING(func(neg_inf), neg_inf);
+  }
+
+  void testZeroes(NearbyIntFunc func) {
+    EXPECT_FP_EQ_ALL_ROUNDING(func(zero), zero);
+    EXPECT_FP_EQ_ALL_ROUNDING(func(neg_zero), neg_zero);
+  }
+
+  void testIntegers(NearbyIntFunc func) {
+    EXPECT_FP_EQ_ALL_ROUNDING(func(T(1.0)), T(1.0));
+    EXPECT_FP_EQ_ALL_ROUNDING(func(T(-1.0)), T(-1.0));
+
+    EXPECT_FP_EQ_ALL_ROUNDING(func(T(1234.0)), T(1234.0));
+    EXPECT_FP_EQ_ALL_ROUNDING(func(T(-1234.0)), T(-1234.0));
+
+    EXPECT_FP_EQ_ALL_ROUNDING(func(T(10.0)), T(10.0));
+    EXPECT_FP_EQ_ALL_ROUNDING(func(T(-10.0)), T(-10.0));
+
+    FPBits ints_start(T(0));
+    ints_start.set_biased_exponent(FPBits::SIG_LEN + FPBits::EXP_BIAS);
+    T expected = ints_start.get_val();
+    EXPECT_FP_EQ_ALL_ROUNDING(func(expected), expected);
+  }
+
+  void testSubnormalToNearest(NearbyIntFunc func) {
+    ASSERT_FP_EQ(func(min_denormal), zero);
+    ASSERT_FP_EQ(func(-min_denormal), neg_zero);
+  }
+
+  void testSubnormalTowardZero(NearbyIntFunc func) {
+    EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(func(min_denormal), zero);
+    EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(func(-min_denormal), neg_zero);
+  }
+
+  void testSubnormalToPosInf(NearbyIntFunc func) {
+    EXPECT_FP_EQ_ROUNDING_UPWARD(func(min_denormal), FPBits::one().get_val());
+    EXPECT_FP_EQ_ROUNDING_UPWARD(func(-min_denormal), neg_zero);
+  }
+
+  void testSubnormalToNegInf(NearbyIntFunc func) {
+    T negative_one = FPBits::one(Sign::NEG).get_val();
+    EXPECT_FP_EQ_ROUNDING_DOWNWARD(func(min_denormal), zero);
+    EXPECT_FP_EQ_ROUNDING_DOWNWARD(func(-min_denormal), negative_one);
+  }
+};
+
+#define LIST_NEARBYINT_TESTS(T, func)                                          \
+  using LlvmLibcNearbyIntTest = NearbyIntTestTemplate<T>;                      \
+  TEST_F(LlvmLibcNearbyIntTest, TestNaN) { testNaN(&func); }                   \
+  TEST_F(LlvmLibcNearbyIntTest, TestInfinities) { testInfinities(&func); }     \
+  TEST_F(LlvmLibcNearbyIntTest, TestZeroes) { testZeroes(&func); }             \
+  TEST_F(LlvmLibcNearbyIntTest, TestIntegers) { testIntegers(&func); }         \
+  TEST_F(LlvmLibcNearbyIntTest, TestSubnormalToNearest) {                      \
+    testSubnormalToNearest(&func);                                             \
+  }                                                                            \
+  TEST_F(LlvmLibcNearbyIntTest, TestSubnormalTowardZero) {                     \
+    testSubnormalTowardZero(&func);                                            \
+  }                                                                            \
+  TEST_F(LlvmLibcNearbyIntTest, TestSubnormalToPosInf) {                       \
+    testSubnormalToPosInf(&func);                                              \
+  }                                                                            \
+  TEST_F(LlvmLibcNearbyIntTest, TestSubnormalToNegInf) {                       \
+    testSubnormalToNegInf(&func);                                              \
+  }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_NEARBYINTTEST_H
diff --git a/libc/test/src/math/smoke/nearbyint_test.cpp b/libc/test/src/math/smoke/nearbyint_test.cpp
new file mode 100644
index 000000000000..11a5c3372e73
--- /dev/null
+++ b/libc/test/src/math/smoke/nearbyint_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nearbyint -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NearbyIntTest.h"
+
+#include "src/math/nearbyint.h"
+
+LIST_NEARBYINT_TESTS(double, LIBC_NAMESPACE::nearbyint)
diff --git a/libc/test/src/math/smoke/nearbyintf128_test.cpp b/libc/test/src/math/smoke/nearbyintf128_test.cpp
new file mode 100644
index 000000000000..98fbb2858faf
--- /dev/null
+++ b/libc/test/src/math/smoke/nearbyintf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nearbyintf128 ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NearbyIntTest.h"
+
+#include "src/math/nearbyintf128.h"
+
+LIST_NEARBYINT_TESTS(float128, LIBC_NAMESPACE::nearbyintf128)
diff --git a/libc/test/src/math/smoke/nearbyintf_test.cpp b/libc/test/src/math/smoke/nearbyintf_test.cpp
new file mode 100644
index 000000000000..fd26153cfffb
--- /dev/null
+++ b/libc/test/src/math/smoke/nearbyintf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nearbyintf ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NearbyIntTest.h"
+
+#include "src/math/nearbyintf.h"
+
+LIST_NEARBYINT_TESTS(float, LIBC_NAMESPACE::nearbyintf)
diff --git a/libc/test/src/math/smoke/nearbyintl_test.cpp b/libc/test/src/math/smoke/nearbyintl_test.cpp
new file mode 100644
index 000000000000..a6d81a1439e1
--- /dev/null
+++ b/libc/test/src/math/smoke/nearbyintl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nearbyintl ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NearbyIntTest.h"
+
+#include "src/math/nearbyintl.h"
+
+LIST_NEARBYINT_TESTS(long double, LIBC_NAMESPACE::nearbyintl)
diff --git a/libc/test/src/math/smoke/scalbnf128_test.cpp b/libc/test/src/math/smoke/scalbnf128_test.cpp
new file mode 100644
index 000000000000..dc259de21148
--- /dev/null
+++ b/libc/test/src/math/smoke/scalbnf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for scalbnf128 ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ScalbnTest.h"
+
+#include "src/math/scalbnf128.h"
+
+LIST_SCALBN_TESTS(float128, LIBC_NAMESPACE::scalbnf128)
diff --git a/libc/test/src/stdfix/ISqrtTest.h b/libc/test/src/stdfix/ISqrtTest.h
index 405162b706a9..ddf292fdd083 100644
--- a/libc/test/src/stdfix/ISqrtTest.h
+++ b/libc/test/src/stdfix/ISqrtTest.h
@@ -27,6 +27,18 @@ template <typename T> class ISqrtTest : public LIBC_NAMESPACE::testing::Test {
 public:
   typedef OutType (*SqrtFunc)(T);
 
+  void testSpecificInput(T input, OutType result, double expected,
+                         double tolerance) {
+    double y_d = static_cast<double>(result);
+    double errors = LIBC_NAMESPACE::fputil::abs((y_d / expected) - 1.0);
+    if (errors > tolerance) {
+      // Print out the failure input and output.
+      EXPECT_EQ(input, T(0));
+      EXPECT_EQ(result, zero);
+    }
+    ASSERT_TRUE(errors <= tolerance);
+  }
+
   void testSpecialNumbers(SqrtFunc func) {
     EXPECT_EQ(zero, func(T(0)));
 
@@ -42,15 +54,9 @@ public:
     for (int i = 0; i < COUNT; ++i) {
       x_d += 1.0;
       ++x;
-      double y_d = static_cast<double>(func(x));
-      double result = LIBC_NAMESPACE::fputil::sqrt(x_d);
-      double errors = LIBC_NAMESPACE::fputil::abs((y_d / result) - 1.0);
-      if (errors > ERR) {
-        // Print out the failure input and output.
-        EXPECT_EQ(x, T(0));
-        EXPECT_EQ(func(x), zero);
-      }
-      ASSERT_TRUE(errors <= ERR);
+      OutType result = func(x);
+      double expected = LIBC_NAMESPACE::fputil::sqrt(x_d);
+      testSpecificInput(x, result, expected, ERR);
     }
   }
 };
diff --git a/libc/test/src/stdfix/SqrtTest.h b/libc/test/src/stdfix/SqrtTest.h
index 628be0deb770..47ec129fab2a 100644
--- a/libc/test/src/stdfix/SqrtTest.h
+++ b/libc/test/src/stdfix/SqrtTest.h
@@ -42,7 +42,7 @@ public:
 
     constexpr size_t COUNT = 255;
     constexpr StorageType STEP =
-        ~StorageType(0) / static_cast<StorageType>(COUNT);
+        StorageType(~StorageType(0)) / static_cast<StorageType>(COUNT);
     constexpr double ERR = 3.0 * static_cast<double>(eps);
     StorageType x = 0;
     for (size_t i = 0; i < COUNT; ++i, x += STEP) {
diff --git a/libc/test/src/stdfix/uksqrtui_test.cpp b/libc/test/src/stdfix/uksqrtui_test.cpp
index 0f4c057099da..c336d0ce1f61 100644
--- a/libc/test/src/stdfix/uksqrtui_test.cpp
+++ b/libc/test/src/stdfix/uksqrtui_test.cpp
@@ -17,4 +17,13 @@ unsigned accum uksqrtui_fast(unsigned int x) {
 
 LIST_ISQRT_TESTS(UI, unsigned int, LIBC_NAMESPACE::uksqrtui);
 
+TEST_F(LlvmLibcISqrtUITest, LargeInteger) {
+  testSpecificInput(65529u, LIBC_NAMESPACE::uksqrtui(65529u), 0x1.fff8fep7,
+                    0x3.0p-16);
+}
+
 LIST_ISQRT_TESTS(UIFast, unsigned int, uksqrtui_fast);
+
+TEST_F(LlvmLibcISqrtUIFastTest, LargeInteger) {
+  testSpecificInput(65529u, uksqrtui_fast(65529u), 0x1.fff8fep7, 0x3.0p-16);
+}
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 2977c26646cb..f34cb178e076 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -178,7 +178,7 @@ set(LIBCXX_LIBDIR_SUFFIX "${LLVM_LIBDIR_SUFFIX}" CACHE STRING
 option(LIBCXX_INSTALL_HEADERS "Install the libc++ headers." ON)
 option(LIBCXX_INSTALL_LIBRARY "Install the libc++ library." ON)
 option(LIBCXX_INSTALL_MODULES
-  "Install the libc++ C++20 module source files (experimental)." OFF
+  "Install the libc++ C++20 module source files (experimental)." ON
 )
 cmake_dependent_option(LIBCXX_INSTALL_STATIC_LIBRARY
   "Install the static libc++ library." ON
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index 527a2acf2d3b..5dc3be0c367e 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -224,6 +224,7 @@ set(BENCHMARK_TESTS
     shared_mutex_vs_mutex.bench.cpp
     stop_token.bench.cpp
     std_format_spec_string_unicode.bench.cpp
+    std_format_spec_string_unicode_escape.bench.cpp
     string.bench.cpp
     stringstream.bench.cpp
     system_error.bench.cpp
diff --git a/libcxx/benchmarks/std_format_spec_string_unicode_escape.bench.cpp b/libcxx/benchmarks/std_format_spec_string_unicode_escape.bench.cpp
new file mode 100644
index 000000000000..3b5a1c4340c3
--- /dev/null
+++ b/libcxx/benchmarks/std_format_spec_string_unicode_escape.bench.cpp
@@ -0,0 +1,303 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test formats a larger piece of text in "escaped" mode. It uses several
+// datasets to give an impression how the amount of multibyte UTF-8 sequences
+// and larger grapheme clusters affect the performance.
+
+#ifndef _LIBCPP_HAS_NO_UNICODE
+
+#  include <format>
+#  include <string_view>
+
+#  include "benchmark/benchmark.h"
+
+#  include "make_string.h"
+
+#  define SV(S) MAKE_STRING_VIEW(CharT, S)
+
+// generated with https://generator.lorem-ipsum.info/_latin
+
+template <class CharT>
+std::basic_string_view<CharT> ascii_text() {
+  return SV(
+      R"( Lorem ipsum dolor sit amet, ne sensibus evertitur aliquando his.
+Iuvaret fabulas qui ex, ex iriure iisque nostrum mea. Solum
+pericula qui ad. Elitr oporteat ius ad.
+
+Quas rationibus ad mel. Appellantur intellegebat ad mei, ius audire volumus
+consectetuer id. Ei sit definitionem mediocritatem, vim indoctum intellegat id,
+dicta laboramus instructior in vix. Mel an quando malorum, id vis mollis
+invidunt, placerat maiestatis comprehensam ut cum. Suas regione interesset id
+per, et docendi accumsan has, autem atomorum est te.
+
+Cu debitis ancillae sea, alii definitiones ex cum, vim no erat antiopam. Eam et
+unum quas scriptorem. An bonorum elaboraret complectitur nam, vim ei persecuti
+democritum mediocritatem. Suscipit platonem signiferumque ei cum, in sale
+volutpat ocurreret vel. Te vel nihil nominavi adipiscing, stet ancillae mel ea.
+Sit detraxit menandri platonem ea, cum at tale viris virtute.
+
+Regione detraxit gloriatur sit eu, sonet labitur sententiae et pro, at sit
+alterum aliquid interpretaris. Sonet voluptua duo id, vix ea accumsan
+liberavisse. Nam id commune probatus contentiones. Et zril dolore laudem duo,
+ea usu mollis melius referrentur, vel ex case consequuntur. Id nam illum mollis
+ponderum. Quis tamquam ullamcorper sed ne, legimus vituperatoribus est id.
+
+Et eum probo consulatu. At eos errem aliquando theophrastus, sea ad eius omnis.
+No vis iusto scriptorem adversarium, dicat viderer ea sit. Et veri euripidis
+sea, justo putent iudicabit vim id. Sea suas tincidunt vituperatoribus in. Ne
+eam aeterno sensibus concludaturque, solet legere his id, usu ei dicat
+dissentiunt. Est et autem erant.
+
+Per quod laboramus an. Dico voluptua at mea, an animal minimum eum. Pri an
+option salutatus, causae feugiat menandri an sed. Voluptaria dissentiet vix ut,
+alii solet te quo, in facer ceteros eos. Ad nibh meis percipitur sit,
+aliquam molestie cu vis, iisque malorum interesset et eos.
+
+Eos in feugiat insolens abhorreant. Ea tale esse alienum has, mel et saperet
+appellantur, aliquip salutandi deterruisset ut mel. Eos ei quod simul
+interpretaris, aeque elitr putent per at, et veri eripuit ceteros his. Cu pro
+meis aperiam volutpat, ex alterum scripserit ius, scriptorem deterruisset eu
+qui. Graeco debitis lobortis cu mea.
+
+Alii corpora id ius, cu quo oblique eloquentiam. Et duis civibus atomorum sea,
+veniam utroque scriptorem vim cu. Ut oratio eruditi mediocritatem est. Amet
+nibh dolore mea ea, tollit laoreet eligendi qui ex, cu essent forensibus
+his.
+
+Usu ex ipsum apeirian, eos congue scripserit omittantur et. Ea eum persecuti
+deseruisse, probatus torquatos est no, in has mutat mundi dolorem. Albucius
+sensibus ex cum. Ferri virtute referrentur an per, est choro option bonorum ex.
+
+Quando accusam vis te, tale mazim et pro. Magna dolorem tincidunt
+nec te, albucius adipisci ad pri. Magna facilisi adipisci at usu, et vel
+dissentiunt neglegentur, prima audiam vocibus an duo. Enim detracto te sea, mel
+quis dicit gubergren ex, iusto adversarium consequuntur per ne.
+
+)");
+}
+
+template <class CharT>
+std::basic_string_view<CharT> unicode_text() {
+  return SV(
+      R"(Lōrem ipsūm dolor sīt æmeÞ, ea vel nostrud feuġǣit, muciūs tēmporiȝus
+refērrēnÞur no mel, quo placērǽt consecÞetuer cū. Veri soƿet euripīðis id has,
+sumo paulō dissentias duo eī, dētrāxīt neglēgeƿtur ið prī. Sēd option oporÞerē
+no. Nec ēū nēmore mentitum. Veri prōȝo faċilis āt vīm.
+
+Ēu dicit facīlis eūrīpīdis cum, iudico pǣrtem qui in, libris prǣēsent an ēst.
+Æt sit quoðsi impētus, nec ex qūaeque honestǣtīs. Fiērēƿt ƿōluisse verterem iƿ
+ēst. Meī eæ apēriæm fierent peÞentīūm. Eæm officiīs reprehēndunt nē.
+
+Ut vel quodsī contentioƿes, his eū dignissim īnstruċÞior. Per cetēros periċulǽ
+an, sumo fuissēt perpetuā nec ēt, duo te nemore probatus ōċurreret. Mel ǣd
+civībus ocūrreret. Ex nostro ǣliquam usu, ex Þātīon adipiscī qui. Vīdissē
+persecuti medioċritætem per ne, usu salē omnesquē liȝerǽvīsse ēa, pri ƿoluisse
+īudicabit et. No summo quiðǣm nec, vim ēi nūmqūam sænctus concepÞǣm. Reque
+doceƿdi īn īus, porro eripuiÞ intērprētaris pri in.
+
+Idquē hǣbēmus nominati vix cū. AÞ prō ǽmēt elit periculæ. Has virīs viderer ān.
+Mel in suās pericūlīs āppellantur, nonumes deserūƿt ǽðversarium eā has. ĒliÞ
+possīt commuƿe no ēsÞ, niȝh aċcusāmūs volūpÞatum no mel, ut quō ciȝo ðiceret.
+Inǣni scripta quālīsque nē qūi, ad ipsūm persecuÞi mediōcritæÞēm vel.
+
+Ǣppetere definitiōnes mel id. Leġerē āliquip nam eǣ, rēgione viderer pǣtrioque
+duo te, meƿāƿdri prodēsseÞ ex hīs. Solum quidam eæ iūs, mēl ǣt sapientem
+expliċari. Īƿ ǣċcusǣm phǽedrum pro, ex pro dēleƿit detræxit hendrerīt, sit āgam
+quidām pertinax uÞ. Ēssent rætionibus eǽ vēl, quo ān labore nusquæm nominǣti.
+
+Te alii cōnseÞetur ƿam, eam ēt puteƿÞ ðissentiæs. Qūi alii dicānt repuðiære ēā,
+nō mel ferri nūsquam. Ea vim impedīt vertērem, ǣn per veri Þīmeam. SiÞ ōmitÞǽm
+necēssitǣÞibus ex, ƿe vis inǣni pærtem invenire. Īd ðolores ċonsēċÞeÞuer usu,
+īd vis nisl dēnique luptǣtūm. Pro ǽd ēverti option dēserūƿt, nec te ōðiō
+cīvībūs.
+
+Ēæ nibh æccommodarē eum. Ne etiæm īudico dicunt duo, quo tēmpor populo insōlens
+nē. Ēos eÞ ēirmod prǽēsēƿt. Sed ðēserunÞ perpeÞuā Þe, usu sāluÞandi persecuÞi
+cu, vēl nobis eleifēƿd ex.
+
+Ƿe zrīl ūtīnam lǣtīne eǣm, eā vim rebum omitÞǣm aðipisciƿg. Amet inermis
+epiċūri ut est, eu duo hīnc periċulis. Mel no reque simul volupÞātum, ex mutat
+lāudem tacīmatēs cum. Te hǣs summo iƿteġre recteque. No iūs dicerēt
+ðisputǽtioƿi. Vim ōmnis deleƿiÞi honestātis ēǽ.
+
+Nec detrǣcto pērcipitur ne. Ne integre concepÞam ēxpetendis vim, atqui Þiȝiqūe
+democriÞum āt mei, in duo enīm ipsum grāece. Rebum ðefīnīÞionem āt pri, ēt sit
+brute periculis. Ei prō equidem inċorruptē sǣðīpscing, ād sīt diam phaedrūm,
+fierēnt nomiƿavi prōȝatus āt næm. Wisi ƿæÞūm coƿsecteÞuer usū ea.
+)");
+}
+
+template <class CharT>
+std::basic_string_view<CharT> cyrillic_text() {
+  return SV(
+      R"(Лорем ипсум долор сит амет, еу диам тамяуам принципес вис, еяуидем
+цонцептам диспутандо яуи цу, иус ад натум нулла граеци. Цибо дицит омниум нец
+цу, еу бруте номинави диссентиет яуо. Омниум лаборамус еу хас. Дицат
+диспутатиони вис еу, цу еос миним атоморум инцидеринт. Пер хабео рецтеяуе
+дигниссим ан, ех яуо сенсибус торяуатос, ан.
+
+Ут перпетуа партиендо принципес хис. Ат симул ностер аппареат пер. Пурто вирис
+ет хис, мазим дицерет при ет. Хис саперет тибияуе сцаевола еу, сит солет
+вивендум цонсеяуат те. Ид оффициис перпетуа ассентиор яуи, сед аугуе афферт
+симилияуе ад, ех адмодум постулант иус.
+
+Про дицунт волуптатум диспутатиони ат. Вел патриояуе персецути еа, цетерос
+диспутатиони ин сед, нам те веро цлита малуиссет. Цу неглегентур инструцтиор
+интерпретарис еам, ипсум фабулас еи вел. Еи адхуц деленити нам, аугуе
+демоцритум при ан. Вим мелиоре проприае ид, албуциус волуптуа цоррумпит дуо ан.
+Латине иуварет пер ут, иус еа мунере ерипуит санцтус.
+
+Модус тритани иус не, вим ут мелиоре мандамус, лабитур опортере дуо но. Ад нец
+витае фацилис инцоррупте, цу сед толлит сцрипторем. Сит лудус инимицус
+волуптариа не. Иисяуе антиопам сапиентем сед еу. Путент волуптуа сит ех, ат иус
+ребум епицури, яуи моллис елигенди ех. Проприае нолуиссе цу сеа, путент поссит
+адверсариум про не.
+
+Ид яуо прима бонорум, дуо форенсибус яуаерендум еи, еум бруте мунере те. Еам
+риденс граецо ех, аеяуе санцтус маиорум ан вел. Либрис санцтус утрояуе ест но,
+еам ат реяуе порро тинцидунт, ут хинц иллуд патриояуе хис. Не солет оффендит
+форенсибус хас, тамяуам опортеат елаборарет те нец, еу аугуе примис маиорум
+еам. Аутем вениам импедит вис ин, прима елитр пхаедрум ест еу.)");
+}
+
+template <class CharT>
+std::basic_string_view<CharT> japanese_text() {
+  return SV(
+      R"(入ト年媛ろ舗学ラロ準募ケカ社金スノ屋検れう策他セヲシ引口ぎ集7独ぱクふ出車ぽでぱ円輪ルノ受打わ。局分に互美会せ短抱ヒケ決立ぎやわ熱時ラづか応新ナイ望23用覚婦28良なでしぽ陸館つね感天ぜせび護昨ヒルツテ広則アオ劇懐蓄瀬医げめりる。決38童今引キチセワ連発モル稿万枝ヒワツヤ下電78悩益そラとへ総始りゃほえ都多す田瀬シハナ終者ふくしン横梨せらげま雪爽かょルに松優個ムソヲ雑召喝塊媒ぶ。
+
+紙ヤ景異ミノオ誤求レ移著ヤエヨメ広庫テハヌサ君検あ必参ワ火面るね声著ン間売力を数20談すがス禁化ッを。起そり予浩ド進皇キ試属が震二トヌ真佳速すずちし件諏フウチ聞在ス会雄ノミ必筋80戦ぶさほド聞2涙属どスれ映聞ネ掲実べ。
+
+8福びり属稿づ徳鎌ニル涼問ゃごるリ付92済トぎけッ康30業づむはつ治然二生入ざひ有動ハワチ発談ニスツ魚困摘策送ざ。個時着そてら新新ヌ鉄報たは作主ずリ可輸改量ルおず井認つてぜな会大ぼすぶし全戸ノハケレ貯治たざリな祖間ムリキ断会仕べせど。委暮ど象週トクワ流開タハ硬給ツタウ者善マラノヱ断稿リヲ東毎ツヨマ井藤ルょへ境同論エ愛図ッらフリ基38属慣葬8携ヱ校図おに岐題しね要月レユ展省わトど。
+
+担がは顔研リ目問いぽべ挙介ん入番ネヌイ栄県し改治ラス健第モム得続加ホウ嘉宿置首本やぞ。78毎まが現設記ほぜね場歩ユアルヒ東的ヒ姿役ネヲ聞能ラシマヒ際形トくゃ政能万の付結ス国1教レツ引写イど扱澤は膚言けリいべ橋柔薄組こよじ。浩報すンつひ崎正念方と夫地クざす情阪スで抜長ネ娘回ハツ止資ヘニ並辞ロノ展師質18打テネ岡時ノモ泉95務えぴひつ速申後延んフるせ。
+
+店てラ載独マシフ理心ス型部米た読石カ料応掲ケカキ打月在ユテニ採材イ並発イヒト旅錯っめし模能りせば連確え会准揮が。器にト画軍にぶイら式東みそお前姿リいけに身47却6記け岸5体会ゃばま映8碁よぽだ経9名トびち更躍うにふ裏高もそ提旅さぼえス。賞ぞだ月係ソ知建振イナシ説並イ見書傳ヨミ問回級エシ出所師阪ト転権がし渡平ルモケ新完ハ玲女ロトシ導複トうよふ。
+
+化シセチ町74掲ネテトオ連対ヒハチモ経後ッ断連カロワ待業ぼぽねか百都へがい始塗ごげ寺帰んぽ逆力るず選英堂衛掛焼ゅ。自生トサリ探就的らね江球リルスツ主嘆4権伝ざが避掲う慶合ワ百29暮ネヤクム書能部あが席小フア部親票ーむとこ。3説ひっぜ約毎伎ナキリ缶近くなず員45姿えにけろ値付ワ着知ソルキ日医ず集新エウカケ投国チ生目ゃ棋運ぐのか寄募オチ性注経どドんて止代わくかな端期幕はかク。
+)");
+}
+
+template <class CharT>
+std::basic_string_view<CharT> emoji_text() {
+  return SV(
+      R"(
+\U0001F636\u200D\U0001F32B\uFE0F
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+\U0001F636\u200D\U0001F32B\uFE0F
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+\U0001F636\u200D\U0001F32B\uFE0F
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+\U0001F636\u200D\U0001F32B\uFE0F
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+\U0001F636\u200D\U0001F32B\uFE0F
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+\U0001F636\u200D\U0001F32B\uFE0F
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+\U0001F636\u200D\U0001F32B\uFE0F
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+\U0001F636\u200D\U0001F32B\uFE0F
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+\U0001F636\u200D\U0001F32B\uFE0F
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+\U0001F636\u200D\U0001F32B\uFE0F
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+\U0001F636\u200D\U0001F32B\uFE0F
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+\U0001F636\u200D\U0001F32B\uFE0F
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+\U0001F636\u200D\U0001F32B\uFE0F
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+
+\U0001F636\u200D\U0001F32B\uFE0F
+
+\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
+
+\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
+
+\U0001F984
+
+)");
+}
+
+template <class CharT>
+void BM_escaped(benchmark::State& state, std::basic_string_view<CharT> input) {
+  CharT buffer[25'000];
+
+  if constexpr (std::same_as<CharT, char>) {
+    // Make sure the output buffer is large enough.
+    assert(std::formatted_size("{}", input) == 3000);
+    for (auto _ : state)
+      benchmark::DoNotOptimize(std::format_to(buffer, "{:?}", input));
+  } else {
+    for (auto _ : state)
+      benchmark::DoNotOptimize(std::format_to(buffer, L"{:?}", input));
+  }
+}
+
+template <class CharT>
+void BM_ascii_escaped(benchmark::State& state) {
+  BM_escaped(state, ascii_text<CharT>());
+}
+
+template <class CharT>
+void BM_unicode_escaped(benchmark::State& state) {
+  BM_escaped(state, unicode_text<CharT>());
+}
+
+template <class CharT>
+void BM_cyrillic_escaped(benchmark::State& state) {
+  BM_escaped(state, cyrillic_text<CharT>());
+}
+
+template <class CharT>
+void BM_japanese_escaped(benchmark::State& state) {
+  BM_escaped(state, japanese_text<CharT>());
+}
+
+template <class CharT>
+void BM_emoji_escaped(benchmark::State& state) {
+  BM_escaped(state, emoji_text<CharT>());
+}
+
+BENCHMARK_TEMPLATE(BM_ascii_escaped, char);
+BENCHMARK_TEMPLATE(BM_unicode_escaped, char);
+BENCHMARK_TEMPLATE(BM_cyrillic_escaped, char);
+BENCHMARK_TEMPLATE(BM_japanese_escaped, char);
+BENCHMARK_TEMPLATE(BM_emoji_escaped, char);
+
+BENCHMARK_TEMPLATE(BM_ascii_escaped, wchar_t);
+BENCHMARK_TEMPLATE(BM_unicode_escaped, wchar_t);
+BENCHMARK_TEMPLATE(BM_cyrillic_escaped, wchar_t);
+BENCHMARK_TEMPLATE(BM_japanese_escaped, wchar_t);
+BENCHMARK_TEMPLATE(BM_emoji_escaped, wchar_t);
+
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+  if (benchmark::ReportUnrecognizedArguments(argc, argv))
+    return 1;
+
+  benchmark::RunSpecifiedBenchmarks();
+}
+#else
+int main(int, char**) { return 0; }
+#endif
diff --git a/libcxx/cmake/caches/Generic-cxx20.cmake b/libcxx/cmake/caches/Generic-cxx20.cmake
index 641c131a737b..3c44fdaf0e42 100644
--- a/libcxx/cmake/caches/Generic-cxx20.cmake
+++ b/libcxx/cmake/caches/Generic-cxx20.cmake
@@ -1,3 +1,2 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
 set(LIBCXX_TEST_PARAMS "std=c++20" CACHE STRING "")
 set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-cxx23.cmake b/libcxx/cmake/caches/Generic-cxx23.cmake
index f5409e4652e4..bf88abf56ca6 100644
--- a/libcxx/cmake/caches/Generic-cxx23.cmake
+++ b/libcxx/cmake/caches/Generic-cxx23.cmake
@@ -1,3 +1,2 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
 set(LIBCXX_TEST_PARAMS "std=c++23" CACHE STRING "")
 set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-cxx26.cmake b/libcxx/cmake/caches/Generic-cxx26.cmake
index 2d9c018a4ff5..6ba9482af578 100644
--- a/libcxx/cmake/caches/Generic-cxx26.cmake
+++ b/libcxx/cmake/caches/Generic-cxx26.cmake
@@ -1,3 +1,2 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
 set(LIBCXX_TEST_PARAMS "std=c++26" CACHE STRING "")
 set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake b/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake
index 9542dcdbf778..72263dfd8463 100644
--- a/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake
+++ b/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake
@@ -1,2 +1 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
 set(LIBCXX_HARDENING_MODE "extensive" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-exceptions.cmake b/libcxx/cmake/caches/Generic-no-exceptions.cmake
index c68adfc1276b..f0dffef60dba 100644
--- a/libcxx/cmake/caches/Generic-no-exceptions.cmake
+++ b/libcxx/cmake/caches/Generic-no-exceptions.cmake
@@ -1,3 +1,2 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
 set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
 set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
diff --git a/libcxx/cmake/caches/Generic-no-experimental.cmake b/libcxx/cmake/caches/Generic-no-experimental.cmake
index 62b7d7373d44..f33ed0141899 100644
--- a/libcxx/cmake/caches/Generic-no-experimental.cmake
+++ b/libcxx/cmake/caches/Generic-no-experimental.cmake
@@ -1,3 +1,2 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
 set(LIBCXX_TEST_PARAMS "enable_experimental=False" CACHE STRING "")
 set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-filesystem.cmake b/libcxx/cmake/caches/Generic-no-filesystem.cmake
index 01ae7e68f12c..4000f3a3e8ef 100644
--- a/libcxx/cmake/caches/Generic-no-filesystem.cmake
+++ b/libcxx/cmake/caches/Generic-no-filesystem.cmake
@@ -1,2 +1 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
 set(LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "")
diff --git a/libcxx/cmake/caches/Generic-no-localization.cmake b/libcxx/cmake/caches/Generic-no-localization.cmake
index fc4957b2d53a..79d6b44c7139 100644
--- a/libcxx/cmake/caches/Generic-no-localization.cmake
+++ b/libcxx/cmake/caches/Generic-no-localization.cmake
@@ -1,2 +1 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
 set(LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "")
diff --git a/libcxx/cmake/caches/Generic-no-random_device.cmake b/libcxx/cmake/caches/Generic-no-random_device.cmake
index ddf479add626..e9b4cc60cc80 100644
--- a/libcxx/cmake/caches/Generic-no-random_device.cmake
+++ b/libcxx/cmake/caches/Generic-no-random_device.cmake
@@ -1,2 +1 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
 set(LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "")
diff --git a/libcxx/cmake/caches/Generic-no-threads.cmake b/libcxx/cmake/caches/Generic-no-threads.cmake
index 724fbc466b58..616baef1be7b 100644
--- a/libcxx/cmake/caches/Generic-no-threads.cmake
+++ b/libcxx/cmake/caches/Generic-no-threads.cmake
@@ -1,4 +1,3 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
 set(LIBCXX_ENABLE_THREADS OFF CACHE BOOL "")
 set(LIBCXXABI_ENABLE_THREADS OFF CACHE BOOL "")
 set(LIBCXX_ENABLE_MONOTONIC_CLOCK OFF CACHE BOOL "")
diff --git a/libcxx/cmake/caches/Generic-no-unicode.cmake b/libcxx/cmake/caches/Generic-no-unicode.cmake
index a4cf7dd73772..01160bf21898 100644
--- a/libcxx/cmake/caches/Generic-no-unicode.cmake
+++ b/libcxx/cmake/caches/Generic-no-unicode.cmake
@@ -1,2 +1 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
 set(LIBCXX_ENABLE_UNICODE OFF CACHE BOOL "")
diff --git a/libcxx/cmake/caches/Generic-no-wide-characters.cmake b/libcxx/cmake/caches/Generic-no-wide-characters.cmake
index dc19389bb5ae..728d41086a38 100644
--- a/libcxx/cmake/caches/Generic-no-wide-characters.cmake
+++ b/libcxx/cmake/caches/Generic-no-wide-characters.cmake
@@ -1,2 +1 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
 set(LIBCXX_ENABLE_WIDE_CHARACTERS OFF CACHE BOOL "")
diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst
index a0a0cdb43397..e425b9dadfe7 100644
--- a/libcxx/docs/BuildingLibcxx.rst
+++ b/libcxx/docs/BuildingLibcxx.rst
@@ -208,7 +208,7 @@ libc++ specific options
 
 .. option:: LIBCXX_INSTALL_MODULES:BOOL
 
-  **Default**: ``OFF``
+  **Default**: ``ON``
 
   Toggle the installation of the experimental libc++ module sources.
 
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 8724f321a9d1..5a07b11cbcd5 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -50,6 +50,7 @@ Implemented Papers
 - P1659R3 - ``std::ranges::starts_with`` and ``std::ranges::ends_with``
 - P3029R1 - Better ``mdspan``'s CTAD
 - P2387R3 - Pipe support for user-defined range adaptors
+- P2713R1 - Escaping improvements in ``std::format``
 
 Improvements and New Features
 -----------------------------
@@ -117,6 +118,9 @@ Deprecations and Removals
   a ``std::basic_*fstream`` from a ``std::basic_string_view``, a input-iterator or a C-string, instead
   you can construct a temporary ``std::basic_string``. This change has been applied to C++17 and later.
 
+- The ``_LIBCPP_DISABLE_ADDITIONAL_DIAGNOSTICS`` macro has been removed and is not honored anymore. Additional
+  warnings provided by libc++ as a matter of QoI will now be provided unconditionally.
+
 
 Upcoming Deprecations and Removals
 ----------------------------------
@@ -148,3 +152,5 @@ Build System Changes
 
 - The Cmake variable ``LIBCXX_ENABLE_CLANG_TIDY`` has been removed. The build system has been changed
   to automatically detect the presence of ``clang-tidy`` and the required ``Clang`` libraries.
+
+- The CMake options ``LIBCXX_INSTALL_MODULES`` now defaults to ``ON``.
diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv
index f75dd288304b..01387a404f5d 100644
--- a/libcxx/docs/Status/Cxx23Papers.csv
+++ b/libcxx/docs/Status/Cxx23Papers.csv
@@ -108,7 +108,7 @@
 "`P2164R9 <https://wg21.link/P2164R9>`__","LWG", "``views::enumerate``","February 2023","","","|ranges|"
 "`P2711R1 <https://wg21.link/P2711R1>`__","LWG", "Making multi-param constructors of ``views`` ``explicit``","February 2023","|In Progress| [#note-P2711R1]_","","|ranges|"
 "`P2609R3 <https://wg21.link/P2609R3>`__","LWG", "Relaxing Ranges Just A Smidge","February 2023","","","|ranges|"
-"`P2713R1 <https://wg21.link/P2713R1>`__","LWG", "Escaping improvements in ``std::format``","February 2023","","","|format|"
+"`P2713R1 <https://wg21.link/P2713R1>`__","LWG", "Escaping improvements in ``std::format``","February 2023","|Complete|","19.0","|format|"
 "`P2675R1 <https://wg21.link/P2675R1>`__","LWG", "``format``'s width estimation is too approximate and not forward compatible","February 2023","|Complete|","17.0","|format|"
 "`P2572R1 <https://wg21.link/P2572R1>`__","LWG", "``std::format`` fill character allowances","February 2023","|Complete|","17.0","|format|"
 "`P2693R1 <https://wg21.link/P2693R1>`__","LWG", "Formatting ``thread::id`` and ``stacktrace``","February 2023","|Partial| [#note-P2693R1]_","","|format|"
diff --git a/libcxx/docs/Status/Cxx2c.rst b/libcxx/docs/Status/Cxx2c.rst
index a7ebc4662f51..e3d9cbb551ff 100644
--- a/libcxx/docs/Status/Cxx2c.rst
+++ b/libcxx/docs/Status/Cxx2c.rst
@@ -40,6 +40,7 @@ Paper Status
 .. note::
 
    .. [#note-P2510R3] This paper is applied as DR against C++20. (MSVC STL and libstdc++ will do the same.)
+   .. [#note-P3142R0] This paper is applied as DR against C++23. (MSVC STL and libstdc++ will do the same.)
 
 .. _issues-status-cxx2c:
 
diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index 008f7418ab9c..30a059f8a3df 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -32,7 +32,7 @@
 "`3951 <https://wg21.link/LWG3951>`__","[expected.object.swap]: Using ``value()`` instead of ``has_value()``","Kona November 2023","","",""
 "`3953 <https://wg21.link/LWG3953>`__","``iter_move`` for ``common_iterator`` and ``counted_iterator`` should return ``decltype(auto)``","Kona November 2023","","","|ranges|"
 "`3957 <https://wg21.link/LWG3957>`__","[container.alloc.reqmts] The value category of v should be claimed","Kona November 2023","","",""
-"`3965 <https://wg21.link/LWG3965>`__","Incorrect example in [format.string.escaped] p3 for formatting of combining characters","Kona November 2023","","","|format|"
+"`3965 <https://wg21.link/LWG3965>`__","Incorrect example in [format.string.escaped] p3 for formatting of combining characters","Kona November 2023","|Complete|","19.0","|format|"
 "`3970 <https://wg21.link/LWG3970>`__","[mdspan.syn] Missing definition of ``full_extent_t`` and ``full_extent``","Kona November 2023","","",""
 "`3973 <https://wg21.link/LWG3973>`__","Monadic operations should be ADL-proof","Kona November 2023","","",""
 "`3974 <https://wg21.link/LWG3974>`__","``mdspan::operator[]`` should not copy ``OtherIndexTypes``","Kona November 2023","","",""
@@ -44,12 +44,12 @@
 "`3919 <https://wg21.link/LWG3919>`__","``enumerate_view`` may invoke UB for sized common non-forward underlying ranges","Tokyo March 2024","","","|ranges|"
 "`3950 <https://wg21.link/LWG3950>`__","``std::basic_string_view`` comparison operators are overspecified","Tokyo March 2024","|Complete|","18.0",""
 "`3975 <https://wg21.link/LWG3975>`__","Specializations of ``basic_format_context`` should not be permitted","Tokyo March 2024","|Nothing To Do|","","|format|"
-"`3984 <https://wg21.link/LWG3984>`__","``ranges::to``'s recursion branch may be ill-formed","Tokyo March 2024","","","|ranges|"
+"`3984 <https://wg21.link/LWG3984>`__","``ranges::to``'s recursion branch may be ill-formed","Tokyo March 2024","|Complete|","19.0","|ranges|"
 "`4011 <https://wg21.link/LWG4011>`__","``""Effects: Equivalent to return""`` in ``[span.elem]``","Tokyo March 2024","|Nothing To Do|","",""
 "`4012 <https://wg21.link/LWG4012>`__","``common_view::begin/end`` are missing the ``simple-view`` check","Tokyo March 2024","","","|ranges|"
 "`4013 <https://wg21.link/LWG4013>`__","``lazy_split_view::outer-iterator::value_type`` should not provide default constructor","Tokyo March 2024","","","|ranges|"
 "`4016 <https://wg21.link/LWG4016>`__","container-insertable checks do not match what container-inserter does","Tokyo March 2024","","",""
-"`4023 <https://wg21.link/LWG4023>`__","Preconditions of ``std::basic_streambuf::setg/setp``","Tokyo March 2024","","",""
+"`4023 <https://wg21.link/LWG4023>`__","Preconditions of ``std::basic_streambuf::setg/setp``","Tokyo March 2024","|Complete|","19.0",""
 "`4025 <https://wg21.link/LWG4025>`__","Move assignment operator of ``std::expected<cv void, E>`` should not be conditionally deleted","Tokyo March 2024","","",""
 "`4030 <https://wg21.link/LWG4030>`__","Clarify whether arithmetic expressions in ``[numeric.sat.func]`` are mathematical or C++","Tokyo March 2024","|Nothing To Do|","",""
 "`4031 <https://wg21.link/LWG4031>`__","``bad_expected_access<void>`` member functions should be ``noexcept``","Tokyo March 2024","|Complete|","16.0",""
@@ -59,8 +59,8 @@
 "`4038 <https://wg21.link/LWG4038>`__","``std::text_encoding::aliases_view`` should have constexpr iterators","Tokyo March 2024","","",""
 "`4043 <https://wg21.link/LWG4043>`__","""ASCII"" is not a registered character encoding","Tokyo March 2024","|Nothing To Do|","",""
 "`4045 <https://wg21.link/LWG4045>`__","``tuple`` can create dangling references from ``tuple-like``","Tokyo March 2024","","",""
-"`4053 <https://wg21.link/LWG4053>`__","Unary call to ``std::views::repeat`` does not decay the argument","Tokyo March 2024","","","|ranges|"
-"`4054 <https://wg21.link/LWG4054>`__","Repeating a ``repeat_view`` should repeat the view","Tokyo March 2024","","","|ranges|"
+"`4053 <https://wg21.link/LWG4053>`__","Unary call to ``std::views::repeat`` does not decay the argument","Tokyo March 2024","|Complete|","19.0","|ranges|"
+"`4054 <https://wg21.link/LWG4054>`__","Repeating a ``repeat_view`` should repeat the view","Tokyo March 2024","|Complete|","19.0","|ranges|"
 "","","","","",""
 "`3343 <https://wg21.link/LWG3343>`__","Ordering of calls to ``unlock()`` and ``notify_all()`` in Effects element of ``notify_all_at_thread_exit()`` should be reversed","Not Yet Adopted","|Complete|","16.0",""
 "XXXX","","The sys_info range should be affected by save","Not Yet Adopted","|Complete|","19.0"
diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv
index 1b00a05ad3d6..409278db1e87 100644
--- a/libcxx/docs/Status/Cxx2cPapers.csv
+++ b/libcxx/docs/Status/Cxx2cPapers.csv
@@ -51,7 +51,7 @@
 "`P2869R4 <https://wg21.link/P2869R4>`__","LWG","Remove Deprecated ``shared_ptr`` Atomic Access APIs from C++26","Tokyo March 2024","|Complete|","19.0",""
 "`P2872R3 <https://wg21.link/P2872R3>`__","LWG","Remove ``wstring_convert`` From C++26","Tokyo March 2024","|Complete|","19.0",""
 "`P3107R5 <https://wg21.link/P3107R5>`__","LWG","Permit an efficient implementation of ``std::print``","Tokyo March 2024","","","|format| |DR|"
-"`P3142R0 <https://wg21.link/P3142R0>`__","LWG","Printing Blank Lines with ``println``","Tokyo March 2024","|Complete|","19.0","|format|"
+"`P3142R0 <https://wg21.link/P3142R0>`__","LWG","Printing Blank Lines with ``println``","Tokyo March 2024","|Complete| [#note-P3142R0]_","19.0","|format|"
 "`P2845R8 <https://wg21.link/P2845R8>`__","LWG","Formatting of ``std::filesystem::path``","Tokyo March 2024","","","|format|"
 "`P0493R5 <https://wg21.link/P0493R5>`__","LWG","Atomic minimum/maximum","Tokyo March 2024","","",""
 "`P2542R8 <https://wg21.link/P2542R8>`__","LWG","``views::concat``","Tokyo March 2024","","","|ranges|"
diff --git a/libcxx/docs/Status/FormatIssues.csv b/libcxx/docs/Status/FormatIssues.csv
index 7da77def92da..3780c1ed5c12 100644
--- a/libcxx/docs/Status/FormatIssues.csv
+++ b/libcxx/docs/Status/FormatIssues.csv
@@ -10,7 +10,7 @@ Number,Name,Standard,Assignee,Status,First released version
 "`P2508R1 <https://wg21.link/P2508R1>`__","Exposing ``std::basic-format-string``","C++23","Mark de Wever","|Complete|",15.0
 "`P2585R0 <https://wg21.link/P2585R0>`__","Improving default container formatting","C++23","Mark de Wever","|Complete|",17.0
 "`P2539R4 <https://wg21.link/P2539R4>`__","Should the output of ``std::print`` to a terminal be synchronized with the underlying stream?","C++23","Mark de Wever","|Complete|","18.0"
-"`P2713R1 <https://wg21.link/P2713R1>`__","Escaping improvements in ``std::format``","C++23","Mark de Wever",""
+"`P2713R1 <https://wg21.link/P2713R1>`__","Escaping improvements in ``std::format``","C++23","Mark de Wever","|Complete|",19.0
 "`P2675R1 <https://wg21.link/P2675R1>`__","``format``'s width estimation is too approximate and not forward compatible","C++23","Mark de Wever","|Complete|",17.0
 "`P2572R1 <https://wg21.link/P2572R1>`__","``std::format`` fill character allowances","C++23","Mark de Wever","|Complete|",17.0
 "`P2693R1 <https://wg21.link/P2693R1>`__","Formatting ``thread::id`` and ``stacktrace``","C++23","Mark de Wever","|In Progress|"
diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index e7aaf4e1fbcf..0fdaeb433ac6 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -167,15 +167,6 @@ safety annotations.
   build of libc++ which does not export any symbols, which can be useful when
   building statically for inclusion into another library.
 
-**_LIBCPP_DISABLE_ADDITIONAL_DIAGNOSTICS**:
-  This macro disables the additional diagnostics generated by libc++ using the
-  `diagnose_if` attribute. These additional diagnostics include checks for:
-
-    * Giving `set`, `map`, `multiset`, `multimap` and their `unordered_`
-      counterparts a comparator which is not const callable.
-    * Giving an unordered associative container a hasher that is not const
-      callable.
-
 **_LIBCPP_NO_VCRUNTIME**:
   Microsoft's C and C++ headers are fairly entangled, and some of their C++
   headers are fairly hard to avoid. In particular, `vcruntime_new.h` gets pulled
diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h
index d60356873132..7f58dbb13a57 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -43,7 +43,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // generic implementation
 template <class _Iter, class _Sent, class _Tp, class _Proj>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
-__find_impl(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
+__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
   for (; __first != __last; ++__first)
     if (std::__invoke(__proj, *__first) == __value)
       break;
@@ -57,8 +57,7 @@ template <class _Tp,
           __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
                             sizeof(_Tp) == 1,
                         int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp*
-__find_impl(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
   if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
     return __ret;
   return __last;
@@ -71,8 +70,7 @@ template <class _Tp,
           __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
                             sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t),
                         int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp*
-__find_impl(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
   if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
     return __ret;
   return __last;
@@ -89,10 +87,10 @@ template <class _Tp,
                             is_signed<_Tp>::value == is_signed<_Up>::value,
                         int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp*
-__find_impl(_Tp* __first, _Tp* __last, const _Up& __value, _Proj& __proj) {
+__find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj& __proj) {
   if (__value < numeric_limits<_Tp>::min() || __value > numeric_limits<_Tp>::max())
     return __last;
-  return std::__find_impl(__first, __last, _Tp(__value), __proj);
+  return std::__find(__first, __last, _Tp(__value), __proj);
 }
 
 // __bit_iterator implementation
@@ -134,7 +132,7 @@ __find_bool(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n)
 
 template <class _Cp, bool _IsConst, class _Tp, class _Proj, __enable_if_t<__is_identity<_Proj>::value, int> = 0>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, _IsConst>
-__find_impl(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value, _Proj&) {
+__find(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value, _Proj&) {
   if (static_cast<bool>(__value))
     return std::__find_bool<true>(__first, static_cast<typename _Cp::size_type>(__last - __first));
   return std::__find_bool<false>(__first, static_cast<typename _Cp::size_type>(__last - __first));
@@ -150,7 +148,7 @@ template <class _SegmentedIterator,
           class _Proj,
           __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
-__find_impl(_SegmentedIterator __first, _SegmentedIterator __last, const _Tp& __value, _Proj& __proj) {
+__find(_SegmentedIterator __first, _SegmentedIterator __last, const _Tp& __value, _Proj& __proj) {
   return std::__find_segment_if(std::move(__first), std::move(__last), __find_segment<_Tp>(__value), __proj);
 }
 
@@ -163,7 +161,7 @@ struct __find_segment {
   template <class _InputIterator, class _Proj>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _InputIterator
   operator()(_InputIterator __first, _InputIterator __last, _Proj& __proj) const {
-    return std::__find_impl(__first, __last, __value_, __proj);
+    return std::__find(__first, __last, __value_, __proj);
   }
 };
 
@@ -173,7 +171,7 @@ _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _In
 find(_InputIterator __first, _InputIterator __last, const _Tp& __value) {
   __identity __proj;
   return std::__rewrap_iter(
-      __first, std::__find_impl(std::__unwrap_iter(__first), std::__unwrap_iter(__last), __value, __proj));
+      __first, std::__find(std::__unwrap_iter(__first), std::__unwrap_iter(__last), __value, __proj));
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/ranges_find.h b/libcxx/include/__algorithm/ranges_find.h
index e1383eb4b071..6b0d5efe37ab 100644
--- a/libcxx/include/__algorithm/ranges_find.h
+++ b/libcxx/include/__algorithm/ranges_find.h
@@ -44,9 +44,9 @@ struct __fn {
     if constexpr (forward_iterator<_Iter>) {
       auto [__first_un, __last_un] = std::__unwrap_range(__first, std::move(__last));
       return std::__rewrap_range<_Sent>(
-          std::move(__first), std::__find_impl(std::move(__first_un), std::move(__last_un), __value, __proj));
+          std::move(__first), std::__find(std::move(__first_un), std::move(__last_un), __value, __proj));
     } else {
-      return std::__find_impl(std::move(__first), std::move(__last), __value, __proj);
+      return std::__find(std::move(__first), std::move(__last), __value, __proj);
     }
   }
 
diff --git a/libcxx/include/__availability b/libcxx/include/__availability
index aa761eb5bfe5..7a02ae00846b 100644
--- a/libcxx/include/__availability
+++ b/libcxx/include/__availability
@@ -28,30 +28,32 @@
 // that previously released library. Normally, this would be a load-time error
 // when one tries to launch the program against the older library.
 //
-// For example, the filesystem library was introduced in the dylib in macOS 10.15.
-// If a user compiles on a macOS 10.15 host but targets macOS 10.13 with their
-// program, the compiler would normally not complain (because the required
-// declarations are in the headers), but the dynamic loader would fail to find
-// the symbols when actually trying to launch the program on macOS 10.13. To
-// turn this into a compile-time issue instead, declarations are annotated with
-// when they were introduced, and the compiler can produce a diagnostic if the
-// program references something that isn't available on the deployment target.
+// For example, the filesystem library was introduced in the dylib in LLVM 9.
+// On Apple platforms, this corresponds to macOS 10.15. If a user compiles on
+// a macOS 10.15 host but targets macOS 10.13 with their program, the compiler
+// would normally not complain (because the required declarations are in the
+// headers), but the dynamic loader would fail to find the symbols when actually
+// trying to launch the program on macOS 10.13. To turn this into a compile-time
+// issue instead, declarations are annotated with when they were introduced, and
+// the compiler can produce a diagnostic if the program references something that
+// isn't available on the deployment target.
 //
 // This mechanism is general in nature, and any vendor can add their markup to
 // the library (see below). Whenever a new feature is added that requires support
 // in the shared library, two macros are added below to allow marking the feature
 // as unavailable:
-// 1. A macro named `_LIBCPP_AVAILABILITY_HAS_NO_<feature>` which must be defined
-//    exactly when compiling for a target that doesn't support the feature.
-// 2. A macro named `_LIBCPP_AVAILABILITY_<feature>`, which must always be defined
-//    and must expand to the proper availability attribute for the platform.
+// 1. A macro named `_LIBCPP_AVAILABILITY_HAS_<feature>` which must be defined
+//    to `_LIBCPP_INTRODUCED_IN_<version>` for the appropriate LLVM version.
+// 2. A macro named `_LIBCPP_AVAILABILITY_<feature>`, which must be defined to
+//    `_LIBCPP_INTRODUCED_IN_<version>_MARKUP` for the appropriate LLVM version.
 //
 // When vendors decide to ship the feature as part of their shared library, they
-// can update these macros appropriately for their platform, and the library will
-// use those to provide an optimal user experience.
+// can update the `_LIBCPP_INTRODUCED_IN_<version>` macro (and the markup counterpart)
+// based on the platform version they shipped that version of LLVM in. The library
+// will then use this markup to provide an optimal user experience on these platforms.
 //
 // Furthermore, many features in the standard library have corresponding
-// feature-test macros. The `_LIBCPP_AVAILABILITY_HAS_NO_<feature>` macros
+// feature-test macros. The `_LIBCPP_AVAILABILITY_HAS_<feature>` macros
 // are checked by the corresponding feature-test macros generated by
 // generate_feature_test_macro_components.py to ensure that the library
 // doesn't announce a feature as being implemented if it is unavailable on
@@ -74,237 +76,181 @@
 
 // Availability markup is disabled when building the library, or when a non-Clang
 // compiler is used because only Clang supports the necessary attributes.
-// doesn't support the proper attributes.
 #if defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCXXABI_BUILDING_LIBRARY) || !defined(_LIBCPP_COMPILER_CLANG_BASED)
 #  if !defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS)
 #    define _LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS
 #  endif
 #endif
 
+// When availability annotations are disabled, we take for granted that features introduced
+// in all versions of the library are available.
 #if defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS)
 
-// These macros control the availability of std::bad_optional_access and
-// other exception types. These were put in the shared library to prevent
-// code bloat from every user program defining the vtable for these exception
-// types.
-//
-// Note that when exceptions are disabled, the methods that normally throw
-// these exceptions can be used even on older deployment targets, but those
-// methods will abort instead of throwing.
-#  define _LIBCPP_AVAILABILITY_HAS_BAD_OPTIONAL_ACCESS 1
-#  define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS
-
-#  define _LIBCPP_AVAILABILITY_HAS_BAD_VARIANT_ACCESS 1
-#  define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS
+#  define _LIBCPP_INTRODUCED_IN_LLVM_4 1
+#  define _LIBCPP_INTRODUCED_IN_LLVM_4_MARKUP /* nothing */
 
-#  define _LIBCPP_AVAILABILITY_HAS_BAD_ANY_CAST 1
-#  define _LIBCPP_AVAILABILITY_BAD_ANY_CAST
+#  define _LIBCPP_INTRODUCED_IN_LLVM_9 1
+#  define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP      /* nothing */
+#  define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_PUSH /* nothing */
+#  define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_POP  /* nothing */
 
-// These macros control the availability of all parts of <filesystem> that
-// depend on something in the dylib.
-#  define _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY 1
-#  define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY
-#  define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_PUSH
-#  define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_POP
+#  define _LIBCPP_INTRODUCED_IN_LLVM_10 1
+#  define _LIBCPP_INTRODUCED_IN_LLVM_10_MARKUP /* nothing */
 
-// This controls the availability of the C++20 synchronization library,
-// which requires shared library support for various operations
-// (see libcxx/src/atomic.cpp). This includes <barier>, <latch>,
-// <semaphore>, and notification functions on std::atomic.
-#  define _LIBCPP_AVAILABILITY_HAS_SYNC 1
-#  define _LIBCPP_AVAILABILITY_SYNC
+#  define _LIBCPP_INTRODUCED_IN_LLVM_12 1
+#  define _LIBCPP_INTRODUCED_IN_LLVM_12_MARKUP /* nothing */
 
-// Enable additional explicit instantiations of iostreams components. This
-// reduces the number of weak definitions generated in programs that use
-// iostreams by providing a single strong definition in the shared library.
-//
-// TODO: Enable additional explicit instantiations on GCC once it supports exclude_from_explicit_instantiation,
-//       or once libc++ doesn't use the attribute anymore.
-// TODO: Enable them on Windows once https://llvm.org/PR41018 has been fixed.
-#  if !defined(_LIBCPP_COMPILER_GCC) && !defined(_WIN32)
-#    define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 1
-#  else
-#    define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 0
-#  endif
+#  define _LIBCPP_INTRODUCED_IN_LLVM_14 1
+#  define _LIBCPP_INTRODUCED_IN_LLVM_14_MARKUP /* nothing */
 
-// This controls the availability of floating-point std::to_chars functions.
-// These overloads were added later than the integer overloads.
-#  define _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT 1
-#  define _LIBCPP_AVAILABILITY_TO_CHARS_FLOATING_POINT
+#  define _LIBCPP_INTRODUCED_IN_LLVM_15 1
+#  define _LIBCPP_INTRODUCED_IN_LLVM_15_MARKUP /* nothing */
 
-// This controls whether the library claims to provide a default verbose
-// termination function, and consequently whether the headers will try
-// to use it when the mechanism isn't overriden at compile-time.
-#  define _LIBCPP_AVAILABILITY_HAS_VERBOSE_ABORT 1
-#  define _LIBCPP_AVAILABILITY_VERBOSE_ABORT
+#  define _LIBCPP_INTRODUCED_IN_LLVM_16 1
+#  define _LIBCPP_INTRODUCED_IN_LLVM_16_MARKUP /* nothing */
 
-// This controls the availability of the C++17 std::pmr library,
-// which is implemented in large part in the built library.
-#  define _LIBCPP_AVAILABILITY_HAS_PMR 1
-#  define _LIBCPP_AVAILABILITY_PMR
+#  define _LIBCPP_INTRODUCED_IN_LLVM_18 1
+#  define _LIBCPP_INTRODUCED_IN_LLVM_18_MARKUP /* nothing */
 
-// These macros controls the availability of __cxa_init_primary_exception
-// in the built library, which std::make_exception_ptr might use
-// (see libcxx/include/__exception/exception_ptr.h).
-#  define _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION 1
-#  define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION
-
-// This controls the availability of C++23 <print>, which
-// has a dependency on the built library (it needs access to
-// the underlying buffer types of std::cout, std::cerr, and std::clog.
-#  define _LIBCPP_AVAILABILITY_HAS_PRINT 1
-#  define _LIBCPP_AVAILABILITY_PRINT
-
-// This controls the availability of the C++20 time zone database.
-// The parser code is built in the library.
-#  define _LIBCPP_AVAILABILITY_HAS_TZDB 1
-#  define _LIBCPP_AVAILABILITY_TZDB
-
-// These macros determine whether we assume that std::bad_function_call and
-// std::bad_expected_access provide a key function in the dylib. This allows
-// centralizing their vtable and typeinfo instead of having all TUs provide
-// a weak definition that then gets deduplicated.
-#  define _LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION 1
-#  define _LIBCPP_AVAILABILITY_BAD_FUNCTION_CALL_KEY_FUNCTION
-#  define _LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION 1
-#  define _LIBCPP_AVAILABILITY_BAD_EXPECTED_ACCESS_KEY_FUNCTION
+#  define _LIBCPP_INTRODUCED_IN_LLVM_19 1
+#  define _LIBCPP_INTRODUCED_IN_LLVM_19_MARKUP /* nothing */
 
 #elif defined(__APPLE__)
 
-#  define _LIBCPP_AVAILABILITY_HAS_BAD_OPTIONAL_ACCESS                                                                 \
-    (!defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) || __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ >= 50000)
-
-#  define _LIBCPP_AVAILABILITY_HAS_BAD_VARIANT_ACCESS _LIBCPP_AVAILABILITY_HAS_BAD_OPTIONAL_ACCESS
-#  define _LIBCPP_AVAILABILITY_HAS_BAD_ANY_CAST _LIBCPP_AVAILABILITY_HAS_BAD_OPTIONAL_ACCESS
-
-#  define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS __attribute__((availability(watchos, strict, introduced = 5.0)))
-#  define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS
-#  define _LIBCPP_AVAILABILITY_BAD_ANY_CAST _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS
-
-// TODO: Update once this is released
-#  define _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION 0
-#  define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION __attribute__((unavailable))
+// LLVM 4
+#  if defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 50000
+#    define _LIBCPP_INTRODUCED_IN_LLVM_4 0
+#    define _LIBCPP_INTRODUCED_IN_LLVM_4_MARKUP __attribute__((availability(watchos, strict, introduced = 5.0)))
+#  else
+#    define _LIBCPP_INTRODUCED_IN_LLVM_4 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_4_MARKUP /* nothing */
+#  endif
 
-// <filesystem>
+// LLVM 9
 // clang-format off
 #  if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) ||   \
       (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) || \
       (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 130000) ||         \
       (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 60000)
 // clang-format on
-#    define _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY 0
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9 0
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP                                                                        \
+      __attribute__((availability(macos, strict, introduced = 10.15)))                                                 \
+      __attribute__((availability(ios, strict, introduced = 13.0)))                                                    \
+      __attribute__((availability(tvos, strict, introduced = 13.0)))                                                   \
+      __attribute__((availability(watchos, strict, introduced = 6.0)))
+// clang-format off
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_PUSH                                                                               \
+      _Pragma("clang attribute push(__attribute__((availability(macos,strict,introduced=10.15))), apply_to=any(function,record))") \
+      _Pragma("clang attribute push(__attribute__((availability(ios,strict,introduced=13.0))), apply_to=any(function,record))")    \
+      _Pragma("clang attribute push(__attribute__((availability(tvos,strict,introduced=13.0))), apply_to=any(function,record))")   \
+      _Pragma("clang attribute push(__attribute__((availability(watchos,strict,introduced=6.0))), apply_to=any(function,record))")
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_POP                                                                    \
+      _Pragma("clang attribute pop") \
+      _Pragma("clang attribute pop") \
+      _Pragma("clang attribute pop") \
+      _Pragma("clang attribute pop")
+// clang-format on
 #  else
-#    define _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP      /* nothing */
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_PUSH /* nothing */
+#    define _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_POP  /* nothing */
 #  endif
-#  define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY                                                                      \
-    __attribute__((availability(macos, strict, introduced = 10.15)))                                                   \
-    __attribute__((availability(ios, strict, introduced = 13.0)))                                                      \
-    __attribute__((availability(tvos, strict, introduced = 13.0)))                                                     \
-    __attribute__((availability(watchos, strict, introduced = 6.0)))
+
+// LLVM 10
 // clang-format off
-#   define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_PUSH                                 \
-        _Pragma("clang attribute push(__attribute__((availability(macos,strict,introduced=10.15))), apply_to=any(function,record))") \
-        _Pragma("clang attribute push(__attribute__((availability(ios,strict,introduced=13.0))), apply_to=any(function,record))")    \
-        _Pragma("clang attribute push(__attribute__((availability(tvos,strict,introduced=13.0))), apply_to=any(function,record))")   \
-        _Pragma("clang attribute push(__attribute__((availability(watchos,strict,introduced=6.0))), apply_to=any(function,record))")
-#   define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_POP                                  \
-        _Pragma("clang attribute pop")                                          \
-        _Pragma("clang attribute pop")                                          \
-        _Pragma("clang attribute pop")                                          \
-        _Pragma("clang attribute pop")
+#  if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 110000) ||   \
+      (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 140000) || \
+      (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 140000) ||         \
+      (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 70000)
 // clang-format on
+#    define _LIBCPP_INTRODUCED_IN_LLVM_10 0
+#    define _LIBCPP_INTRODUCED_IN_LLVM_10_MARKUP                                                                       \
+      __attribute__((availability(macos, strict, introduced = 11.0)))                                                  \
+      __attribute__((availability(ios, strict, introduced = 14.0)))                                                    \
+      __attribute__((availability(tvos, strict, introduced = 14.0)))                                                   \
+      __attribute__((availability(watchos, strict, introduced = 7.0)))
+#  else
+#    define _LIBCPP_INTRODUCED_IN_LLVM_10 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_10_MARKUP /* nothing */
+#  endif
 
-// std::to_chars(floating-point)
+// LLVM 12
 // clang-format off
-#  if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 130300) ||   \
-      (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 160300) || \
-      (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 160300) ||         \
-      (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 90300)
+#  if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 120000)   || \
+      (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 150000) || \
+      (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 150000)         || \
+      (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 80000)
 // clang-format on
-#    define _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT 0
+#    define _LIBCPP_INTRODUCED_IN_LLVM_12 0
+#    define _LIBCPP_INTRODUCED_IN_LLVM_12_MARKUP                                                                       \
+      __attribute__((availability(macos, strict, introduced = 12.0)))                                                  \
+      __attribute__((availability(ios, strict, introduced = 15.0)))                                                    \
+      __attribute__((availability(tvos, strict, introduced = 15.0)))                                                   \
+      __attribute__((availability(watchos, strict, introduced = 8.0)))
 #  else
-#    define _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_12 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_12_MARKUP /* nothing */
 #  endif
-#  define _LIBCPP_AVAILABILITY_TO_CHARS_FLOATING_POINT                                                                 \
-    __attribute__((availability(macos, strict, introduced = 13.3)))                                                    \
-    __attribute__((availability(ios, strict, introduced = 16.3)))                                                      \
-    __attribute__((availability(tvos, strict, introduced = 16.3)))                                                     \
-    __attribute__((availability(watchos, strict, introduced = 9.3)))
 
-// c++20 synchronization library
+// LLVM 14
 // clang-format off
-#   if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 110000) ||   \
-       (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 140000) || \
-       (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 140000) ||         \
-       (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 70000)
+#  if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 130400) ||   \
+      (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 160500) || \
+      (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 160500) ||         \
+      (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 90500)
 // clang-format on
-#    define _LIBCPP_AVAILABILITY_HAS_SYNC 0
+#    define _LIBCPP_INTRODUCED_IN_LLVM_14 0
+#    define _LIBCPP_INTRODUCED_IN_LLVM_14_MARKUP                                                                       \
+      __attribute__((availability(macos, strict, introduced = 13.4)))                                                  \
+      __attribute__((availability(ios, strict, introduced = 16.5)))                                                    \
+      __attribute__((availability(tvos, strict, introduced = 16.5)))                                                   \
+      __attribute__((availability(watchos, strict, introduced = 9.5)))
 #  else
-#    define _LIBCPP_AVAILABILITY_HAS_SYNC 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_14 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_14_MARKUP /* nothing */
 #  endif
-#  define _LIBCPP_AVAILABILITY_SYNC                                                                                    \
-    __attribute__((availability(macos, strict, introduced = 11.0)))                                                    \
-    __attribute__((availability(ios, strict, introduced = 14.0)))                                                      \
-    __attribute__((availability(tvos, strict, introduced = 14.0)))                                                     \
-    __attribute__((availability(watchos, strict, introduced = 7.0)))
-
-// __libcpp_verbose_abort
-// TODO: Update once this is released
-#  define _LIBCPP_AVAILABILITY_HAS_VERBOSE_ABORT 0
 
-#  define _LIBCPP_AVAILABILITY_VERBOSE_ABORT __attribute__((unavailable))
-
-// std::pmr
+// LLVM 15-16
+#  define _LIBCPP_INTRODUCED_IN_LLVM_15 _LIBCPP_INTRODUCED_IN_LLVM_16
+#  define _LIBCPP_INTRODUCED_IN_LLVM_15_MARKUP _LIBCPP_INTRODUCED_IN_LLVM_16_MARKUP
 // clang-format off
 #  if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 140000) ||   \
       (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 170000) || \
       (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 170000) ||         \
       (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 100000)
 // clang-format on
-#    define _LIBCPP_AVAILABILITY_HAS_PMR 0
-#  else
-#    define _LIBCPP_AVAILABILITY_HAS_PMR 1
-#  endif
-// TODO: Enable std::pmr markup once https://github.com/llvm/llvm-project/issues/40340 has been fixed
-//       Until then, it is possible for folks to try to use `std::pmr` when back-deploying to targets that don't support
-//       it and it'll be a load-time error, but we don't have a good alternative because the library won't compile if we
-//       use availability annotations until that bug has been fixed.
-#  if 0
-#    define _LIBCPP_AVAILABILITY_PMR                                                                                   \
+#    define _LIBCPP_INTRODUCED_IN_LLVM_16 0
+#    define _LIBCPP_INTRODUCED_IN_LLVM_16_MARKUP                                                                       \
       __attribute__((availability(macos, strict, introduced = 14.0)))                                                  \
       __attribute__((availability(ios, strict, introduced = 17.0)))                                                    \
       __attribute__((availability(tvos, strict, introduced = 17.0)))                                                   \
       __attribute__((availability(watchos, strict, introduced = 10.0)))
 #  else
-#    define _LIBCPP_AVAILABILITY_PMR
+#    define _LIBCPP_INTRODUCED_IN_LLVM_16 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_16_MARKUP /* nothing */
 #  endif
 
-#  define _LIBCPP_AVAILABILITY_HAS_TZDB 0
-#  define _LIBCPP_AVAILABILITY_TZDB __attribute__((unavailable))
-
-// Warning: This availability macro works differently than the other macros.
-// The dylib part of print is not needed on Apple platforms. Therefore when
-// the macro is not available the code calling the dylib is commented out.
-// The macro _LIBCPP_AVAILABILITY_PRINT is not used.
-#  define _LIBCPP_AVAILABILITY_HAS_PRINT 0
-#  define _LIBCPP_AVAILABILITY_PRINT __attribute__((unavailable))
-
-// clang-format off
-#  if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 120000)   || \
-      (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 150000) || \
-      (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 150000)         || \
-      (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 80000)
-// clang-format on
-#    define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 0
+// LLVM 18
+// TODO: Fill this in
+#  if 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_18 0
+#    define _LIBCPP_INTRODUCED_IN_LLVM_18_MARKUP __attribute__((unavailable))
 #  else
-#    define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_18 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_18_MARKUP /* nothing */
 #  endif
 
-#  define _LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION 0
-#  define _LIBCPP_AVAILABILITY_BAD_FUNCTION_CALL_KEY_FUNCTION __attribute__((unavailable))
-
-#  define _LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION 0
-#  define _LIBCPP_AVAILABILITY_BAD_EXPECTED_ACCESS_KEY_FUNCTION __attribute__((unavailable))
+// LLVM 19
+// TODO: Fill this in
+#  if 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_19 0
+#    define _LIBCPP_INTRODUCED_IN_LLVM_19_MARKUP __attribute__((unavailable))
+#  else
+#    define _LIBCPP_INTRODUCED_IN_LLVM_19 1
+#    define _LIBCPP_INTRODUCED_IN_LLVM_19_MARKUP /* nothing */
+#  endif
 
 #else
 
@@ -315,6 +261,97 @@
 
 #endif
 
+// These macros control the availability of std::bad_optional_access and
+// other exception types. These were put in the shared library to prevent
+// code bloat from every user program defining the vtable for these exception
+// types.
+//
+// Note that when exceptions are disabled, the methods that normally throw
+// these exceptions can be used even on older deployment targets, but those
+// methods will abort instead of throwing.
+#define _LIBCPP_AVAILABILITY_HAS_BAD_OPTIONAL_ACCESS _LIBCPP_INTRODUCED_IN_LLVM_4
+#define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS _LIBCPP_INTRODUCED_IN_LLVM_4_MARKUP
+
+#define _LIBCPP_AVAILABILITY_HAS_BAD_VARIANT_ACCESS _LIBCPP_INTRODUCED_IN_LLVM_4
+#define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS _LIBCPP_INTRODUCED_IN_LLVM_4_MARKUP
+
+#define _LIBCPP_AVAILABILITY_HAS_BAD_ANY_CAST _LIBCPP_INTRODUCED_IN_LLVM_4
+#define _LIBCPP_AVAILABILITY_BAD_ANY_CAST _LIBCPP_INTRODUCED_IN_LLVM_4_MARKUP
+
+// These macros control the availability of all parts of <filesystem> that
+// depend on something in the dylib.
+#define _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY _LIBCPP_INTRODUCED_IN_LLVM_9
+#define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP
+#define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_PUSH _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_PUSH
+#define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_POP _LIBCPP_INTRODUCED_IN_LLVM_9_MARKUP_POP
+
+// This controls the availability of the C++20 synchronization library,
+// which requires shared library support for various operations
+// (see libcxx/src/atomic.cpp). This includes <barier>, <latch>,
+// <semaphore>, and notification functions on std::atomic.
+#define _LIBCPP_AVAILABILITY_HAS_SYNC _LIBCPP_INTRODUCED_IN_LLVM_10
+#define _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INTRODUCED_IN_LLVM_10_MARKUP
+
+// Enable additional explicit instantiations of iostreams components. This
+// reduces the number of weak definitions generated in programs that use
+// iostreams by providing a single strong definition in the shared library.
+//
+// TODO: Enable additional explicit instantiations on GCC once it supports exclude_from_explicit_instantiation,
+//       or once libc++ doesn't use the attribute anymore.
+// TODO: Enable them on Windows once https://llvm.org/PR41018 has been fixed.
+#if !defined(_LIBCPP_COMPILER_GCC) && !defined(_WIN32)
+#  define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 _LIBCPP_INTRODUCED_IN_LLVM_12
+#else
+#  define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 0
+#endif
+
+// This controls the availability of floating-point std::to_chars functions.
+// These overloads were added later than the integer overloads.
+#define _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT _LIBCPP_INTRODUCED_IN_LLVM_14
+#define _LIBCPP_AVAILABILITY_TO_CHARS_FLOATING_POINT _LIBCPP_INTRODUCED_IN_LLVM_14_MARKUP
+
+// This controls whether the library claims to provide a default verbose
+// termination function, and consequently whether the headers will try
+// to use it when the mechanism isn't overriden at compile-time.
+#define _LIBCPP_AVAILABILITY_HAS_VERBOSE_ABORT _LIBCPP_INTRODUCED_IN_LLVM_15
+#define _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_INTRODUCED_IN_LLVM_15_MARKUP
+
+// This controls the availability of the C++17 std::pmr library,
+// which is implemented in large part in the built library.
+//
+// TODO: Enable std::pmr markup once https://github.com/llvm/llvm-project/issues/40340 has been fixed
+//       Until then, it is possible for folks to try to use `std::pmr` when back-deploying to targets that don't support
+//       it and it'll be a load-time error, but we don't have a good alternative because the library won't compile if we
+//       use availability annotations until that bug has been fixed.
+#define _LIBCPP_AVAILABILITY_HAS_PMR _LIBCPP_INTRODUCED_IN_LLVM_16
+#define _LIBCPP_AVAILABILITY_PMR
+
+// These macros controls the availability of __cxa_init_primary_exception
+// in the built library, which std::make_exception_ptr might use
+// (see libcxx/include/__exception/exception_ptr.h).
+#define _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION _LIBCPP_INTRODUCED_IN_LLVM_18
+#define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION _LIBCPP_INTRODUCED_IN_LLVM_18_MARKUP
+
+// This controls the availability of C++23 <print>, which
+// has a dependency on the built library (it needs access to
+// the underlying buffer types of std::cout, std::cerr, and std::clog.
+#define _LIBCPP_AVAILABILITY_HAS_PRINT _LIBCPP_INTRODUCED_IN_LLVM_18
+#define _LIBCPP_AVAILABILITY_PRINT _LIBCPP_INTRODUCED_IN_LLVM_18_MARKUP
+
+// This controls the availability of the C++20 time zone database.
+// The parser code is built in the library.
+#define _LIBCPP_AVAILABILITY_HAS_TZDB _LIBCPP_INTRODUCED_IN_LLVM_19
+#define _LIBCPP_AVAILABILITY_TZDB _LIBCPP_INTRODUCED_IN_LLVM_19_MARKUP
+
+// These macros determine whether we assume that std::bad_function_call and
+// std::bad_expected_access provide a key function in the dylib. This allows
+// centralizing their vtable and typeinfo instead of having all TUs provide
+// a weak definition that then gets deduplicated.
+#  define _LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19
+#  define _LIBCPP_AVAILABILITY_BAD_FUNCTION_CALL_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19_MARKUP
+#  define _LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19
+#  define _LIBCPP_AVAILABILITY_BAD_EXPECTED_ACCESS_KEY_FUNCTION _LIBCPP_INTRODUCED_IN_LLVM_19_MARKUP
+
 // Define availability attributes that depend on _LIBCPP_HAS_NO_EXCEPTIONS.
 // Those are defined in terms of the availability attributes above, and
 // should not be vendor-specific.
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 97cdd020c55d..e4c5c685a456 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -1390,7 +1390,7 @@ typedef __char32_t char32_t;
 #    define _LIBCPP_NO_DESTROY
 #  endif
 
-#  if __has_attribute(__diagnose_if__) && !defined(_LIBCPP_DISABLE_ADDITIONAL_DIAGNOSTICS)
+#  if __has_attribute(__diagnose_if__)
 #    define _LIBCPP_DIAGNOSE_WARNING(...) __attribute__((__diagnose_if__(__VA_ARGS__, "warning")))
 #  else
 #    define _LIBCPP_DIAGNOSE_WARNING(...)
diff --git a/libcxx/include/__format/escaped_output_table.h b/libcxx/include/__format/escaped_output_table.h
index b194f9431c3b..6aa91c89defa 100644
--- a/libcxx/include/__format/escaped_output_table.h
+++ b/libcxx/include/__format/escaped_output_table.h
@@ -80,10 +80,9 @@ namespace __escaped_output_table {
 /// The entries of the characters to escape in format's debug string.
 ///
 /// Contains the entries for [format.string.escaped]/2.2.1.2.1
-///   CE is a Unicode encoding and C corresponds to either a UCS scalar value
-///   whose Unicode property General_Category has a value in the groups
-///   Separator (Z) or Other (C) or to a UCS scalar value which has the Unicode
-///   property Grapheme_Extend=Yes, as described by table 12 of UAX #44
+///   CE is a Unicode encoding and C corresponds to a UCS scalar value whose
+///   Unicode property General_Category has a value in the groups Separator (Z)
+///   or Other (C), as described by table 12 of UAX #44
 ///
 /// Separator (Z) consists of General_Category
 /// - Space_Separator,
@@ -98,7 +97,6 @@ namespace __escaped_output_table {
 /// - Unassigned.
 ///
 /// The data is generated from
-/// - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
 /// - https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
 ///
 /// The table is similar to the table
@@ -107,927 +105,751 @@ namespace __escaped_output_table {
 /// table lacks a property, thus having more bits available for the size.
 ///
 /// The data has 2 values:
-/// - bits [0, 10] The size of the range, allowing 2048 elements.
-/// - bits [11, 31] The lower bound code point of the range. The upper bound of
-///   the range is lower bound + size.
-_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[894] = {
-    0x00000020,
-    0x0003f821,
-    0x00056800,
-    0x0018006f,
-    0x001bc001,
-    0x001c0003,
-    0x001c5800,
-    0x001c6800,
-    0x001d1000,
-    0x00241806,
-    0x00298000,
-    0x002ab801,
-    0x002c5801,
-    0x002c802d,
-    0x002df800,
-    0x002e0801,
-    0x002e2001,
-    0x002e3808,
-    0x002f5803,
-    0x002fa810,
-    0x0030800a,
-    0x0030e000,
-    0x00325814,
-    0x00338000,
-    0x0036b007,
-    0x0036f805,
-    0x00373801,
-    0x00375003,
-    0x00387001,
-    0x00388800,
-    0x0039801c,
-    0x003d300a,
-    0x003d900d,
-    0x003f5808,
-    0x003fd802,
-    0x0040b003,
-    0x0040d808,
-    0x00412802,
-    0x00414806,
-    0x0041f800,
-    0x0042c804,
-    0x0042f800,
-    0x00435804,
-    0x00447810,
-    0x00465038,
-    0x0049d000,
-    0x0049e000,
-    0x004a0807,
-    0x004a6800,
-    0x004a8806,
-    0x004b1001,
-    0x004c0800,
-    0x004c2000,
-    0x004c6801,
-    0x004c8801,
-    0x004d4800,
-    0x004d8800,
-    0x004d9802,
-    0x004dd002,
-    0x004df000,
-    0x004e0805,
-    0x004e4801,
-    0x004e6800,
-    0x004e780c,
-    0x004ef000,
-    0x004f1003,
-    0x004ff004,
-    0x00502000,
-    0x00505803,
-    0x00508801,
-    0x00514800,
-    0x00518800,
-    0x0051a000,
-    0x0051b800,
-    0x0051d003,
-    0x00520817,
-    0x0052e800,
-    0x0052f806,
-    0x00538001,
-    0x0053a800,
-    0x0053b80b,
-    0x00542000,
-    0x00547000,
-    0x00549000,
-    0x00554800,
-    0x00558800,
-    0x0055a000,
-    0x0055d002,
-    0x00560807,
-    0x00565000,
-    0x00566802,
-    0x0056880e,
-    0x00571003,
-    0x00579006,
-    0x0057d007,
-    0x00582000,
-    0x00586801,
-    0x00588801,
-    0x00594800,
-    0x00598800,
-    0x0059a000,
-    0x0059d002,
-    0x0059f001,
-    0x005a0805,
-    0x005a4801,
-    0x005a680e,
-    0x005af000,
-    0x005b1003,
-    0x005bc00a,
-    0x005c2000,
-    0x005c5802,
-    0x005c8800,
-    0x005cb002,
-    0x005cd800,
-    0x005ce800,
-    0x005d0002,
-    0x005d2802,
-    0x005d5802,
-    0x005dd004,
-    0x005e0000,
-    0x005e1802,
-    0x005e4800,
-    0x005e6802,
-    0x005e8814,
-    0x005fd805,
-    0x00602000,
-    0x00606800,
-    0x00608800,
-    0x00614800,
-    0x0061d002,
-    0x0061f002,
-    0x00622812,
-    0x0062d801,
-    0x0062f001,
-    0x00631003,
-    0x00638006,
-    0x00640800,
-    0x00646800,
-    0x00648800,
-    0x00654800,
-    0x0065a000,
-    0x0065d002,
-    0x0065f800,
-    0x00661000,
-    0x00662801,
-    0x00664800,
-    0x00666010,
-    0x0066f800,
-    0x00671003,
-    0x00678000,
-    0x0067a00d,
-    0x00686800,
-    0x00688800,
-    0x0069d801,
-    0x0069f000,
-    0x006a0804,
-    0x006a4800,
-    0x006a6800,
-    0x006a8003,
-    0x006ab800,
-    0x006b1003,
-    0x006c0001,
-    0x006c2000,
-    0x006cb802,
-    0x006d9000,
-    0x006de000,
-    0x006df001,
-    0x006e3808,
-    0x006e9005,
-    0x006ef806,
-    0x006f8001,
-    0x006fa80b,
-    0x00718800,
-    0x0071a00a,
-    0x00723807,
-    0x0072e024,
-    0x00741800,
-    0x00742800,
-    0x00745800,
-    0x00752000,
-    0x00753000,
-    0x00758800,
-    0x0075a008,
-    0x0075f001,
-    0x00762800,
-    0x00763808,
-    0x0076d001,
-    0x0077001f,
-    0x0078c001,
-    0x0079a800,
-    0x0079b800,
-    0x0079c800,
-    0x007a4000,
-    0x007b6811,
-    0x007c0004,
-    0x007c3001,
-    0x007c6830,
-    0x007e3000,
-    0x007e6800,
-    0x007ed824,
-    0x00816803,
-    0x00819005,
-    0x0081c801,
-    0x0081e801,
-    0x0082c001,
-    0x0082f002,
-    0x00838803,
-    0x00841000,
-    0x00842801,
-    0x00846800,
-    0x0084e800,
-    0x00863000,
-    0x00864004,
-    0x00867001,
-    0x00924800,
-    0x00927001,
-    0x0092b800,
-    0x0092c800,
-    0x0092f001,
-    0x00944800,
-    0x00947001,
-    0x00958800,
-    0x0095b001,
-    0x0095f800,
-    0x00960800,
-    0x00963001,
-    0x0096b800,
-    0x00988800,
-    0x0098b001,
-    0x009ad804,
-    0x009be802,
-    0x009cd005,
-    0x009fb001,
-    0x009ff001,
-    0x00b40000,
-    0x00b4e802,
-    0x00b7c806,
-    0x00b89002,
-    0x00b8b008,
-    0x00b99001,
-    0x00b9b808,
-    0x00ba900d,
-    0x00bb6800,
-    0x00bb880e,
-    0x00bda001,
-    0x00bdb806,
-    0x00be3000,
-    0x00be480a,
-    0x00bee802,
-    0x00bf5005,
-    0x00bfd005,
-    0x00c05804,
-    0x00c0d005,
-    0x00c3c806,
-    0x00c42801,
-    0x00c54800,
-    0x00c55804,
-    0x00c7b009,
-    0x00c8f803,
-    0x00c93801,
-    0x00c96003,
-    0x00c99000,
-    0x00c9c806,
-    0x00ca0802,
-    0x00cb7001,
-    0x00cba80a,
-    0x00cd6003,
-    0x00ce5005,
-    0x00ced802,
-    0x00d0b801,
-    0x00d0d802,
-    0x00d2b000,
-    0x00d2c008,
-    0x00d31000,
-    0x00d32807,
-    0x00d3980c,
-    0x00d45005,
-    0x00d4d005,
-    0x00d57055,
-    0x00d9a006,
-    0x00d9e000,
-    0x00da1000,
-    0x00da6802,
-    0x00db5808,
-    0x00dbf802,
-    0x00dd1003,
-    0x00dd4001,
-    0x00dd5802,
-    0x00df3000,
-    0x00df4001,
-    0x00df6800,
-    0x00df7802,
-    0x00dfa007,
-    0x00e16007,
-    0x00e1b004,
-    0x00e25002,
-    0x00e44806,
-    0x00e5d801,
-    0x00e6400a,
-    0x00e6a00c,
-    0x00e71006,
-    0x00e76800,
-    0x00e7a000,
-    0x00e7c001,
-    0x00e7d804,
-    0x00ee003f,
-    0x00f8b001,
-    0x00f8f001,
-    0x00fa3001,
-    0x00fa7001,
-    0x00fac000,
-    0x00fad000,
-    0x00fae000,
-    0x00faf000,
-    0x00fbf001,
-    0x00fda800,
-    0x00fe2800,
-    0x00fea001,
-    0x00fee000,
-    0x00ff8001,
-    0x00ffa800,
-    0x00fff810,
-    0x01014007,
-    0x0102f810,
-    0x01039001,
-    0x01047800,
-    0x0104e802,
-    0x0106083e,
-    0x010c6003,
-    0x01213818,
-    0x01225814,
-    0x015ba001,
-    0x015cb000,
-    0x01677802,
-    0x0167a004,
-    0x01693000,
-    0x01694004,
-    0x01697001,
-    0x016b4006,
-    0x016b880e,
-    0x016cb808,
-    0x016d3800,
-    0x016d7800,
-    0x016db800,
-    0x016df800,
-    0x016e3800,
-    0x016e7800,
-    0x016eb800,
-    0x016ef820,
-    0x0172f021,
-    0x0174d000,
-    0x0177a00b,
-    0x017eb019,
-    0x01800000,
-    0x01815005,
-    0x01820000,
-    0x0184b803,
-    0x01880004,
-    0x01898000,
-    0x018c7800,
-    0x018f200a,
-    0x0190f800,
-    0x05246802,
-    0x05263808,
-    0x05316013,
-    0x05337803,
-    0x0533a009,
-    0x0534f001,
-    0x05378001,
-    0x0537c007,
-    0x053e5804,
-    0x053e9000,
-    0x053ea000,
-    0x053ed017,
-    0x05401000,
-    0x05403000,
-    0x05405800,
-    0x05412801,
-    0x05416003,
-    0x0541d005,
-    0x0543c007,
-    0x05462009,
-    0x0546d017,
-    0x0547f800,
-    0x05493007,
-    0x054a380a,
-    0x054aa00a,
-    0x054be805,
-    0x054d9800,
-    0x054db003,
-    0x054de001,
-    0x054e7000,
-    0x054ed003,
-    0x054f2800,
-    0x054ff800,
-    0x05514805,
-    0x05518801,
-    0x0551a80a,
-    0x05521800,
-    0x05526000,
-    0x05527001,
-    0x0552d001,
-    0x0553e000,
-    0x05558000,
-    0x05559002,
-    0x0555b801,
-    0x0555f001,
-    0x05560800,
-    0x05561817,
-    0x05576001,
-    0x0557b00a,
-    0x05583801,
-    0x05587801,
-    0x0558b808,
-    0x05593800,
-    0x05597800,
-    0x055b6003,
-    0x055f2800,
-    0x055f4000,
-    0x055f6802,
-    0x055fd005,
-    0x06bd200b,
-    0x06be3803,
-    0x06bfe7ff,
-    0x06ffe7ff,
-    0x073fe7ff,
-    0x077fe7ff,
-    0x07bfe103,
-    0x07d37001,
-    0x07d6d025,
-    0x07d8380b,
-    0x07d8c004,
-    0x07d8f000,
-    0x07d9b800,
-    0x07d9e800,
-    0x07d9f800,
-    0x07da1000,
-    0x07da2800,
-    0x07de180f,
-    0x07ec8001,
-    0x07ee4006,
-    0x07ee801f,
-    0x07f0000f,
-    0x07f0d015,
-    0x07f29800,
-    0x07f33800,
-    0x07f36003,
-    0x07f3a800,
-    0x07f7e803,
-    0x07fcf001,
-    0x07fdf802,
-    0x07fe4001,
-    0x07fe8001,
-    0x07fec001,
-    0x07fee802,
-    0x07ff3800,
-    0x07ff780c,
-    0x07fff001,
-    0x08006000,
-    0x08013800,
-    0x0801d800,
-    0x0801f000,
-    0x08027001,
-    0x0802f021,
-    0x0807d804,
-    0x08081803,
-    0x0809a002,
-    0x080c7800,
-    0x080ce802,
-    0x080d082e,
-    0x080fe882,
-    0x0814e802,
-    0x0816880f,
-    0x0817e003,
-    0x08192008,
-    0x081a5804,
-    0x081bb009,
-    0x081cf000,
-    0x081e2003,
-    0x081eb029,
-    0x0824f001,
-    0x08255005,
-    0x0826a003,
-    0x0827e003,
-    0x08294007,
-    0x082b200a,
-    0x082bd800,
-    0x082c5800,
-    0x082c9800,
-    0x082cb000,
-    0x082d1000,
-    0x082d9000,
-    0x082dd000,
-    0x082de842,
-    0x0839b808,
-    0x083ab009,
-    0x083b4017,
-    0x083c3000,
-    0x083d8800,
-    0x083dd844,
-    0x08403001,
-    0x08404800,
-    0x0841b000,
-    0x0841c802,
-    0x0841e801,
-    0x0842b000,
-    0x0844f807,
-    0x0845802f,
-    0x08479800,
-    0x0847b004,
-    0x0848e002,
-    0x0849d004,
-    0x084a003f,
-    0x084dc003,
-    0x084e8001,
-    0x0850080e,
-    0x0850a000,
-    0x0850c000,
-    0x0851b009,
-    0x08524806,
-    0x0852c806,
-    0x0855001f,
-    0x08572805,
-    0x0857b808,
-    0x0859b002,
-    0x085ab001,
-    0x085b9804,
-    0x085c9006,
-    0x085ce80b,
-    0x085d804f,
-    0x08624836,
-    0x0865980c,
-    0x08679806,
-    0x0869200b,
-    0x0869d125,
-    0x0873f800,
-    0x08755002,
-    0x08757001,
-    0x0875904d,
-    0x08794007,
-    0x087a300a,
-    0x087ad015,
-    0x087c1003,
-    0x087c5025,
-    0x087e6013,
-    0x087fb808,
-    0x08800800,
-    0x0881c00e,
-    0x08827003,
-    0x08838000,
-    0x08839801,
-    0x0883b00b,
-    0x08859803,
-    0x0885c801,
-    0x0885e800,
-    0x0886100d,
-    0x08874806,
-    0x0887d008,
-    0x08893804,
-    0x08896808,
-    0x088a4007,
-    0x088b9800,
-    0x088bb80a,
-    0x088db008,
-    0x088e4803,
-    0x088e7800,
-    0x088f0000,
-    0x088fa80a,
-    0x08909000,
-    0x08917802,
-    0x0891a000,
-    0x0891b001,
-    0x0891f000,
-    0x0892083e,
-    0x08943800,
-    0x08944800,
-    0x08947000,
-    0x0894f000,
-    0x08955005,
-    0x0896f800,
-    0x0897180c,
-    0x0897d007,
-    0x08982000,
-    0x08986801,
-    0x08988801,
-    0x08994800,
-    0x08998800,
-    0x0899a000,
-    0x0899d002,
-    0x0899f000,
-    0x089a0000,
-    0x089a2801,
-    0x089a4801,
-    0x089a7001,
-    0x089a880b,
-    0x089b209b,
-    0x08a1c007,
-    0x08a21002,
-    0x08a23000,
-    0x08a2e000,
-    0x08a2f000,
-    0x08a3101d,
-    0x08a58000,
-    0x08a59805,
-    0x08a5d000,
-    0x08a5e800,
-    0x08a5f801,
-    0x08a61001,
-    0x08a64007,
-    0x08a6d0a5,
-    0x08ad7800,
-    0x08ad9005,
-    0x08ade001,
-    0x08adf801,
-    0x08aee023,
-    0x08b19807,
-    0x08b1e800,
-    0x08b1f801,
-    0x08b2280a,
-    0x08b2d005,
-    0x08b36812,
-    0x08b55800,
-    0x08b56800,
-    0x08b58005,
-    0x08b5b800,
-    0x08b5d005,
-    0x08b65035,
-    0x08b8d804,
-    0x08b91003,
-    0x08b93808,
-    0x08ba38b8,
-    0x08c17808,
-    0x08c1c801,
-    0x08c1e063,
-    0x08c7980b,
-    0x08c83801,
-    0x08c85001,
-    0x08c8a000,
-    0x08c8b800,
-    0x08c98000,
-    0x08c9b000,
-    0x08c9c803,
-    0x08c9f000,
-    0x08ca1800,
-    0x08ca3808,
-    0x08cad045,
-    0x08cd4001,
-    0x08cea007,
-    0x08cf0000,
-    0x08cf281a,
-    0x08d00809,
-    0x08d19805,
-    0x08d1d803,
-    0x08d23808,
-    0x08d28805,
-    0x08d2c802,
-    0x08d4500c,
-    0x08d4c001,
-    0x08d5180c,
-    0x08d7c806,
-    0x08d850f5,
-    0x08e04800,
-    0x08e1800d,
-    0x08e1f800,
-    0x08e23009,
-    0x08e36802,
-    0x08e48018,
-    0x08e55006,
-    0x08e59001,
-    0x08e5a84a,
-    0x08e83800,
-    0x08e85000,
-    0x08e98814,
-    0x08ea3808,
-    0x08ead005,
-    0x08eb3000,
-    0x08eb4800,
-    0x08ec7803,
-    0x08eca800,
-    0x08ecb800,
-    0x08ecc806,
-    0x08ed5135,
-    0x08f79801,
-    0x08f7c808,
-    0x08f88800,
-    0x08f9b007,
-    0x08fa0000,
-    0x08fa1000,
-    0x08fad055,
-    0x08fd880e,
-    0x08ff900c,
-    0x091cd065,
-    0x09237800,
-    0x0923a80a,
-    0x092a27ff,
-    0x096a224b,
-    0x097f980c,
-    0x09a18010,
-    0x09a23fff,
-    0x09e23fb8,
-    0x0a323fff,
-    0x0a723fff,
-    0x0ab23fff,
-    0x0af23fff,
-    0x0b3239b8,
-    0x0b51c806,
-    0x0b52f800,
-    0x0b535003,
-    0x0b55f800,
-    0x0b565005,
-    0x0b577006,
-    0x0b57b009,
-    0x0b598006,
-    0x0b5a3009,
-    0x0b5ad000,
-    0x0b5b1000,
-    0x0b5bc004,
-    0x0b5c82af,
-    0x0b74d864,
-    0x0b7a5804,
-    0x0b7c400a,
-    0x0b7d003f,
-    0x0b7f200b,
-    0x0b7f900d,
-    0x0c3fc007,
-    0x0c66b029,
-    0x0c684fff,
-    0x0ca84fff,
-    0x0ce84fff,
-    0x0d284fff,
-    0x0d684ae6,
-    0x0d7fa000,
-    0x0d7fe000,
-    0x0d7ff800,
-    0x0d89180e,
-    0x0d89981c,
-    0x0d8a9801,
-    0x0d8ab00d,
-    0x0d8b4007,
-    0x0d97e7ff,
-    0x0dd7e103,
-    0x0de35804,
-    0x0de3e802,
-    0x0de44806,
-    0x0de4d001,
-    0x0de4e801,
-    0x0de507ff,
-    0x0e2507ff,
-    0x0e6502af,
-    0x0e7e203b,
-    0x0e87b009,
-    0x0e893801,
-    0x0e8b2800,
-    0x0e8b3802,
-    0x0e8b7014,
-    0x0e8c2806,
-    0x0e8d5003,
-    0x0e8f5814,
-    0x0e921002,
-    0x0e923079,
-    0x0e96a00b,
-    0x0e97a00b,
-    0x0e9ab808,
-    0x0e9bc886,
-    0x0ea2a800,
-    0x0ea4e800,
-    0x0ea50001,
-    0x0ea51801,
-    0x0ea53801,
-    0x0ea56800,
-    0x0ea5d000,
-    0x0ea5e000,
-    0x0ea62000,
-    0x0ea83000,
-    0x0ea85801,
-    0x0ea8a800,
-    0x0ea8e800,
-    0x0ea9d000,
-    0x0ea9f800,
-    0x0eaa2800,
-    0x0eaa3802,
-    0x0eaa8800,
-    0x0eb53001,
-    0x0ebe6001,
-    0x0ed00036,
-    0x0ed1d831,
-    0x0ed3a800,
-    0x0ed42000,
-    0x0ed46473,
-    0x0ef8f805,
-    0x0ef95904,
-    0x0f037091,
-    0x0f096809,
-    0x0f09f001,
-    0x0f0a5003,
-    0x0f0a813f,
-    0x0f157011,
-    0x0f176003,
-    0x0f17d004,
-    0x0f1801cf,
-    0x0f276003,
-    0x0f27d2e5,
-    0x0f3f3800,
-    0x0f3f6000,
-    0x0f3f7800,
-    0x0f3ff800,
-    0x0f462801,
-    0x0f46802f,
-    0x0f4a2006,
-    0x0f4a6003,
-    0x0f4ad003,
-    0x0f4b0310,
-    0x0f65a84b,
-    0x0f69f0c1,
-    0x0f702000,
-    0x0f710000,
-    0x0f711800,
-    0x0f712801,
-    0x0f714000,
-    0x0f719800,
-    0x0f71c000,
-    0x0f71d000,
-    0x0f71e005,
-    0x0f721803,
-    0x0f724000,
-    0x0f725000,
-    0x0f726000,
-    0x0f728000,
-    0x0f729800,
-    0x0f72a801,
-    0x0f72c000,
-    0x0f72d000,
-    0x0f72e000,
-    0x0f72f000,
-    0x0f730000,
-    0x0f731800,
-    0x0f732801,
-    0x0f735800,
-    0x0f739800,
-    0x0f73c000,
-    0x0f73e800,
-    0x0f73f800,
-    0x0f745000,
-    0x0f74e004,
-    0x0f752000,
-    0x0f755000,
-    0x0f75e033,
-    0x0f77910d,
-    0x0f816003,
-    0x0f84a00b,
-    0x0f857801,
-    0x0f860000,
-    0x0f868000,
-    0x0f87b009,
-    0x0f8d7037,
-    0x0f90180c,
-    0x0f91e003,
-    0x0f924806,
-    0x0f92900d,
-    0x0f933099,
-    0x0fb6c003,
-    0x0fb76802,
-    0x0fb7e802,
-    0x0fbbb803,
-    0x0fbed005,
-    0x0fbf6003,
-    0x0fbf880e,
-    0x0fc06003,
-    0x0fc24007,
-    0x0fc2d005,
-    0x0fc44007,
-    0x0fc57001,
-    0x0fc5904d,
-    0x0fd2a00b,
-    0x0fd37001,
-    0x0fd3e802,
-    0x0fd44806,
-    0x0fd5f000,
-    0x0fd63007,
-    0x0fd6e003,
-    0x0fd74806,
-    0x0fd7c806,
-    0x0fdc9800,
-    0x0fde5824,
-    0x0fdfd405,
-    0x1537001f,
-    0x15b9d005,
-    0x15c0f001,
-    0x1675100d,
-    0x175f080e,
-    0x1772f7ff,
-    0x17b2f1a1,
-    0x17d0f5e1,
-    0x189a5804};
+/// - bits [0, 13] The size of the range, allowing 16384 elements.
+/// - bits [14, 31] The lower bound code point of the range. The upper bound of
+///   the range is lower bound + size. Note the code expects code units the fit
+///   into 18 bits, instead of the 21 bits needed for the full Unicode range.
+_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[711] = {
+    0x00000020 /* 00000000 - 00000020 [   33] */,
+    0x001fc021 /* 0000007f - 000000a0 [   34] */,
+    0x002b4000 /* 000000ad - 000000ad [    1] */,
+    0x00de0001 /* 00000378 - 00000379 [    2] */,
+    0x00e00003 /* 00000380 - 00000383 [    4] */,
+    0x00e2c000 /* 0000038b - 0000038b [    1] */,
+    0x00e34000 /* 0000038d - 0000038d [    1] */,
+    0x00e88000 /* 000003a2 - 000003a2 [    1] */,
+    0x014c0000 /* 00000530 - 00000530 [    1] */,
+    0x0155c001 /* 00000557 - 00000558 [    2] */,
+    0x0162c001 /* 0000058b - 0000058c [    2] */,
+    0x01640000 /* 00000590 - 00000590 [    1] */,
+    0x01720007 /* 000005c8 - 000005cf [    8] */,
+    0x017ac003 /* 000005eb - 000005ee [    4] */,
+    0x017d4010 /* 000005f5 - 00000605 [   17] */,
+    0x01870000 /* 0000061c - 0000061c [    1] */,
+    0x01b74000 /* 000006dd - 000006dd [    1] */,
+    0x01c38001 /* 0000070e - 0000070f [    2] */,
+    0x01d2c001 /* 0000074b - 0000074c [    2] */,
+    0x01ec800d /* 000007b2 - 000007bf [   14] */,
+    0x01fec001 /* 000007fb - 000007fc [    2] */,
+    0x020b8001 /* 0000082e - 0000082f [    2] */,
+    0x020fc000 /* 0000083f - 0000083f [    1] */,
+    0x02170001 /* 0000085c - 0000085d [    2] */,
+    0x0217c000 /* 0000085f - 0000085f [    1] */,
+    0x021ac004 /* 0000086b - 0000086f [    5] */,
+    0x0223c008 /* 0000088f - 00000897 [    9] */,
+    0x02388000 /* 000008e2 - 000008e2 [    1] */,
+    0x02610000 /* 00000984 - 00000984 [    1] */,
+    0x02634001 /* 0000098d - 0000098e [    2] */,
+    0x02644001 /* 00000991 - 00000992 [    2] */,
+    0x026a4000 /* 000009a9 - 000009a9 [    1] */,
+    0x026c4000 /* 000009b1 - 000009b1 [    1] */,
+    0x026cc002 /* 000009b3 - 000009b5 [    3] */,
+    0x026e8001 /* 000009ba - 000009bb [    2] */,
+    0x02714001 /* 000009c5 - 000009c6 [    2] */,
+    0x02724001 /* 000009c9 - 000009ca [    2] */,
+    0x0273c007 /* 000009cf - 000009d6 [    8] */,
+    0x02760003 /* 000009d8 - 000009db [    4] */,
+    0x02778000 /* 000009de - 000009de [    1] */,
+    0x02790001 /* 000009e4 - 000009e5 [    2] */,
+    0x027fc001 /* 000009ff - 00000a00 [    2] */,
+    0x02810000 /* 00000a04 - 00000a04 [    1] */,
+    0x0282c003 /* 00000a0b - 00000a0e [    4] */,
+    0x02844001 /* 00000a11 - 00000a12 [    2] */,
+    0x028a4000 /* 00000a29 - 00000a29 [    1] */,
+    0x028c4000 /* 00000a31 - 00000a31 [    1] */,
+    0x028d0000 /* 00000a34 - 00000a34 [    1] */,
+    0x028dc000 /* 00000a37 - 00000a37 [    1] */,
+    0x028e8001 /* 00000a3a - 00000a3b [    2] */,
+    0x028f4000 /* 00000a3d - 00000a3d [    1] */,
+    0x0290c003 /* 00000a43 - 00000a46 [    4] */,
+    0x02924001 /* 00000a49 - 00000a4a [    2] */,
+    0x02938002 /* 00000a4e - 00000a50 [    3] */,
+    0x02948006 /* 00000a52 - 00000a58 [    7] */,
+    0x02974000 /* 00000a5d - 00000a5d [    1] */,
+    0x0297c006 /* 00000a5f - 00000a65 [    7] */,
+    0x029dc009 /* 00000a77 - 00000a80 [   10] */,
+    0x02a10000 /* 00000a84 - 00000a84 [    1] */,
+    0x02a38000 /* 00000a8e - 00000a8e [    1] */,
+    0x02a48000 /* 00000a92 - 00000a92 [    1] */,
+    0x02aa4000 /* 00000aa9 - 00000aa9 [    1] */,
+    0x02ac4000 /* 00000ab1 - 00000ab1 [    1] */,
+    0x02ad0000 /* 00000ab4 - 00000ab4 [    1] */,
+    0x02ae8001 /* 00000aba - 00000abb [    2] */,
+    0x02b18000 /* 00000ac6 - 00000ac6 [    1] */,
+    0x02b28000 /* 00000aca - 00000aca [    1] */,
+    0x02b38001 /* 00000ace - 00000acf [    2] */,
+    0x02b4400e /* 00000ad1 - 00000adf [   15] */,
+    0x02b90001 /* 00000ae4 - 00000ae5 [    2] */,
+    0x02bc8006 /* 00000af2 - 00000af8 [    7] */,
+    0x02c00000 /* 00000b00 - 00000b00 [    1] */,
+    0x02c10000 /* 00000b04 - 00000b04 [    1] */,
+    0x02c34001 /* 00000b0d - 00000b0e [    2] */,
+    0x02c44001 /* 00000b11 - 00000b12 [    2] */,
+    0x02ca4000 /* 00000b29 - 00000b29 [    1] */,
+    0x02cc4000 /* 00000b31 - 00000b31 [    1] */,
+    0x02cd0000 /* 00000b34 - 00000b34 [    1] */,
+    0x02ce8001 /* 00000b3a - 00000b3b [    2] */,
+    0x02d14001 /* 00000b45 - 00000b46 [    2] */,
+    0x02d24001 /* 00000b49 - 00000b4a [    2] */,
+    0x02d38006 /* 00000b4e - 00000b54 [    7] */,
+    0x02d60003 /* 00000b58 - 00000b5b [    4] */,
+    0x02d78000 /* 00000b5e - 00000b5e [    1] */,
+    0x02d90001 /* 00000b64 - 00000b65 [    2] */,
+    0x02de0009 /* 00000b78 - 00000b81 [   10] */,
+    0x02e10000 /* 00000b84 - 00000b84 [    1] */,
+    0x02e2c002 /* 00000b8b - 00000b8d [    3] */,
+    0x02e44000 /* 00000b91 - 00000b91 [    1] */,
+    0x02e58002 /* 00000b96 - 00000b98 [    3] */,
+    0x02e6c000 /* 00000b9b - 00000b9b [    1] */,
+    0x02e74000 /* 00000b9d - 00000b9d [    1] */,
+    0x02e80002 /* 00000ba0 - 00000ba2 [    3] */,
+    0x02e94002 /* 00000ba5 - 00000ba7 [    3] */,
+    0x02eac002 /* 00000bab - 00000bad [    3] */,
+    0x02ee8003 /* 00000bba - 00000bbd [    4] */,
+    0x02f0c002 /* 00000bc3 - 00000bc5 [    3] */,
+    0x02f24000 /* 00000bc9 - 00000bc9 [    1] */,
+    0x02f38001 /* 00000bce - 00000bcf [    2] */,
+    0x02f44005 /* 00000bd1 - 00000bd6 [    6] */,
+    0x02f6000d /* 00000bd8 - 00000be5 [   14] */,
+    0x02fec004 /* 00000bfb - 00000bff [    5] */,
+    0x03034000 /* 00000c0d - 00000c0d [    1] */,
+    0x03044000 /* 00000c11 - 00000c11 [    1] */,
+    0x030a4000 /* 00000c29 - 00000c29 [    1] */,
+    0x030e8001 /* 00000c3a - 00000c3b [    2] */,
+    0x03114000 /* 00000c45 - 00000c45 [    1] */,
+    0x03124000 /* 00000c49 - 00000c49 [    1] */,
+    0x03138006 /* 00000c4e - 00000c54 [    7] */,
+    0x0315c000 /* 00000c57 - 00000c57 [    1] */,
+    0x0316c001 /* 00000c5b - 00000c5c [    2] */,
+    0x03178001 /* 00000c5e - 00000c5f [    2] */,
+    0x03190001 /* 00000c64 - 00000c65 [    2] */,
+    0x031c0006 /* 00000c70 - 00000c76 [    7] */,
+    0x03234000 /* 00000c8d - 00000c8d [    1] */,
+    0x03244000 /* 00000c91 - 00000c91 [    1] */,
+    0x032a4000 /* 00000ca9 - 00000ca9 [    1] */,
+    0x032d0000 /* 00000cb4 - 00000cb4 [    1] */,
+    0x032e8001 /* 00000cba - 00000cbb [    2] */,
+    0x03314000 /* 00000cc5 - 00000cc5 [    1] */,
+    0x03324000 /* 00000cc9 - 00000cc9 [    1] */,
+    0x03338006 /* 00000cce - 00000cd4 [    7] */,
+    0x0335c005 /* 00000cd7 - 00000cdc [    6] */,
+    0x0337c000 /* 00000cdf - 00000cdf [    1] */,
+    0x03390001 /* 00000ce4 - 00000ce5 [    2] */,
+    0x033c0000 /* 00000cf0 - 00000cf0 [    1] */,
+    0x033d000b /* 00000cf4 - 00000cff [   12] */,
+    0x03434000 /* 00000d0d - 00000d0d [    1] */,
+    0x03444000 /* 00000d11 - 00000d11 [    1] */,
+    0x03514000 /* 00000d45 - 00000d45 [    1] */,
+    0x03524000 /* 00000d49 - 00000d49 [    1] */,
+    0x03540003 /* 00000d50 - 00000d53 [    4] */,
+    0x03590001 /* 00000d64 - 00000d65 [    2] */,
+    0x03600000 /* 00000d80 - 00000d80 [    1] */,
+    0x03610000 /* 00000d84 - 00000d84 [    1] */,
+    0x0365c002 /* 00000d97 - 00000d99 [    3] */,
+    0x036c8000 /* 00000db2 - 00000db2 [    1] */,
+    0x036f0000 /* 00000dbc - 00000dbc [    1] */,
+    0x036f8001 /* 00000dbe - 00000dbf [    2] */,
+    0x0371c002 /* 00000dc7 - 00000dc9 [    3] */,
+    0x0372c003 /* 00000dcb - 00000dce [    4] */,
+    0x03754000 /* 00000dd5 - 00000dd5 [    1] */,
+    0x0375c000 /* 00000dd7 - 00000dd7 [    1] */,
+    0x03780005 /* 00000de0 - 00000de5 [    6] */,
+    0x037c0001 /* 00000df0 - 00000df1 [    2] */,
+    0x037d400b /* 00000df5 - 00000e00 [   12] */,
+    0x038ec003 /* 00000e3b - 00000e3e [    4] */,
+    0x03970024 /* 00000e5c - 00000e80 [   37] */,
+    0x03a0c000 /* 00000e83 - 00000e83 [    1] */,
+    0x03a14000 /* 00000e85 - 00000e85 [    1] */,
+    0x03a2c000 /* 00000e8b - 00000e8b [    1] */,
+    0x03a90000 /* 00000ea4 - 00000ea4 [    1] */,
+    0x03a98000 /* 00000ea6 - 00000ea6 [    1] */,
+    0x03af8001 /* 00000ebe - 00000ebf [    2] */,
+    0x03b14000 /* 00000ec5 - 00000ec5 [    1] */,
+    0x03b1c000 /* 00000ec7 - 00000ec7 [    1] */,
+    0x03b3c000 /* 00000ecf - 00000ecf [    1] */,
+    0x03b68001 /* 00000eda - 00000edb [    2] */,
+    0x03b8001f /* 00000ee0 - 00000eff [   32] */,
+    0x03d20000 /* 00000f48 - 00000f48 [    1] */,
+    0x03db4003 /* 00000f6d - 00000f70 [    4] */,
+    0x03e60000 /* 00000f98 - 00000f98 [    1] */,
+    0x03ef4000 /* 00000fbd - 00000fbd [    1] */,
+    0x03f34000 /* 00000fcd - 00000fcd [    1] */,
+    0x03f6c024 /* 00000fdb - 00000fff [   37] */,
+    0x04318000 /* 000010c6 - 000010c6 [    1] */,
+    0x04320004 /* 000010c8 - 000010cc [    5] */,
+    0x04338001 /* 000010ce - 000010cf [    2] */,
+    0x04924000 /* 00001249 - 00001249 [    1] */,
+    0x04938001 /* 0000124e - 0000124f [    2] */,
+    0x0495c000 /* 00001257 - 00001257 [    1] */,
+    0x04964000 /* 00001259 - 00001259 [    1] */,
+    0x04978001 /* 0000125e - 0000125f [    2] */,
+    0x04a24000 /* 00001289 - 00001289 [    1] */,
+    0x04a38001 /* 0000128e - 0000128f [    2] */,
+    0x04ac4000 /* 000012b1 - 000012b1 [    1] */,
+    0x04ad8001 /* 000012b6 - 000012b7 [    2] */,
+    0x04afc000 /* 000012bf - 000012bf [    1] */,
+    0x04b04000 /* 000012c1 - 000012c1 [    1] */,
+    0x04b18001 /* 000012c6 - 000012c7 [    2] */,
+    0x04b5c000 /* 000012d7 - 000012d7 [    1] */,
+    0x04c44000 /* 00001311 - 00001311 [    1] */,
+    0x04c58001 /* 00001316 - 00001317 [    2] */,
+    0x04d6c001 /* 0000135b - 0000135c [    2] */,
+    0x04df4002 /* 0000137d - 0000137f [    3] */,
+    0x04e68005 /* 0000139a - 0000139f [    6] */,
+    0x04fd8001 /* 000013f6 - 000013f7 [    2] */,
+    0x04ff8001 /* 000013fe - 000013ff [    2] */,
+    0x05a00000 /* 00001680 - 00001680 [    1] */,
+    0x05a74002 /* 0000169d - 0000169f [    3] */,
+    0x05be4006 /* 000016f9 - 000016ff [    7] */,
+    0x05c58008 /* 00001716 - 0000171e [    9] */,
+    0x05cdc008 /* 00001737 - 0000173f [    9] */,
+    0x05d5000b /* 00001754 - 0000175f [   12] */,
+    0x05db4000 /* 0000176d - 0000176d [    1] */,
+    0x05dc4000 /* 00001771 - 00001771 [    1] */,
+    0x05dd000b /* 00001774 - 0000177f [   12] */,
+    0x05f78001 /* 000017de - 000017df [    2] */,
+    0x05fa8005 /* 000017ea - 000017ef [    6] */,
+    0x05fe8005 /* 000017fa - 000017ff [    6] */,
+    0x06038000 /* 0000180e - 0000180e [    1] */,
+    0x06068005 /* 0000181a - 0000181f [    6] */,
+    0x061e4006 /* 00001879 - 0000187f [    7] */,
+    0x062ac004 /* 000018ab - 000018af [    5] */,
+    0x063d8009 /* 000018f6 - 000018ff [   10] */,
+    0x0647c000 /* 0000191f - 0000191f [    1] */,
+    0x064b0003 /* 0000192c - 0000192f [    4] */,
+    0x064f0003 /* 0000193c - 0000193f [    4] */,
+    0x06504002 /* 00001941 - 00001943 [    3] */,
+    0x065b8001 /* 0000196e - 0000196f [    2] */,
+    0x065d400a /* 00001975 - 0000197f [   11] */,
+    0x066b0003 /* 000019ac - 000019af [    4] */,
+    0x06728005 /* 000019ca - 000019cf [    6] */,
+    0x0676c002 /* 000019db - 000019dd [    3] */,
+    0x06870001 /* 00001a1c - 00001a1d [    2] */,
+    0x0697c000 /* 00001a5f - 00001a5f [    1] */,
+    0x069f4001 /* 00001a7d - 00001a7e [    2] */,
+    0x06a28005 /* 00001a8a - 00001a8f [    6] */,
+    0x06a68005 /* 00001a9a - 00001a9f [    6] */,
+    0x06ab8001 /* 00001aae - 00001aaf [    2] */,
+    0x06b3c030 /* 00001acf - 00001aff [   49] */,
+    0x06d34002 /* 00001b4d - 00001b4f [    3] */,
+    0x06dfc000 /* 00001b7f - 00001b7f [    1] */,
+    0x06fd0007 /* 00001bf4 - 00001bfb [    8] */,
+    0x070e0002 /* 00001c38 - 00001c3a [    3] */,
+    0x07128002 /* 00001c4a - 00001c4c [    3] */,
+    0x07224006 /* 00001c89 - 00001c8f [    7] */,
+    0x072ec001 /* 00001cbb - 00001cbc [    2] */,
+    0x07320007 /* 00001cc8 - 00001ccf [    8] */,
+    0x073ec004 /* 00001cfb - 00001cff [    5] */,
+    0x07c58001 /* 00001f16 - 00001f17 [    2] */,
+    0x07c78001 /* 00001f1e - 00001f1f [    2] */,
+    0x07d18001 /* 00001f46 - 00001f47 [    2] */,
+    0x07d38001 /* 00001f4e - 00001f4f [    2] */,
+    0x07d60000 /* 00001f58 - 00001f58 [    1] */,
+    0x07d68000 /* 00001f5a - 00001f5a [    1] */,
+    0x07d70000 /* 00001f5c - 00001f5c [    1] */,
+    0x07d78000 /* 00001f5e - 00001f5e [    1] */,
+    0x07df8001 /* 00001f7e - 00001f7f [    2] */,
+    0x07ed4000 /* 00001fb5 - 00001fb5 [    1] */,
+    0x07f14000 /* 00001fc5 - 00001fc5 [    1] */,
+    0x07f50001 /* 00001fd4 - 00001fd5 [    2] */,
+    0x07f70000 /* 00001fdc - 00001fdc [    1] */,
+    0x07fc0001 /* 00001ff0 - 00001ff1 [    2] */,
+    0x07fd4000 /* 00001ff5 - 00001ff5 [    1] */,
+    0x07ffc010 /* 00001fff - 0000200f [   17] */,
+    0x080a0007 /* 00002028 - 0000202f [    8] */,
+    0x0817c010 /* 0000205f - 0000206f [   17] */,
+    0x081c8001 /* 00002072 - 00002073 [    2] */,
+    0x0823c000 /* 0000208f - 0000208f [    1] */,
+    0x08274002 /* 0000209d - 0000209f [    3] */,
+    0x0830400e /* 000020c1 - 000020cf [   15] */,
+    0x083c400e /* 000020f1 - 000020ff [   15] */,
+    0x08630003 /* 0000218c - 0000218f [    4] */,
+    0x0909c018 /* 00002427 - 0000243f [   25] */,
+    0x0912c014 /* 0000244b - 0000245f [   21] */,
+    0x0add0001 /* 00002b74 - 00002b75 [    2] */,
+    0x0ae58000 /* 00002b96 - 00002b96 [    1] */,
+    0x0b3d0004 /* 00002cf4 - 00002cf8 [    5] */,
+    0x0b498000 /* 00002d26 - 00002d26 [    1] */,
+    0x0b4a0004 /* 00002d28 - 00002d2c [    5] */,
+    0x0b4b8001 /* 00002d2e - 00002d2f [    2] */,
+    0x0b5a0006 /* 00002d68 - 00002d6e [    7] */,
+    0x0b5c400d /* 00002d71 - 00002d7e [   14] */,
+    0x0b65c008 /* 00002d97 - 00002d9f [    9] */,
+    0x0b69c000 /* 00002da7 - 00002da7 [    1] */,
+    0x0b6bc000 /* 00002daf - 00002daf [    1] */,
+    0x0b6dc000 /* 00002db7 - 00002db7 [    1] */,
+    0x0b6fc000 /* 00002dbf - 00002dbf [    1] */,
+    0x0b71c000 /* 00002dc7 - 00002dc7 [    1] */,
+    0x0b73c000 /* 00002dcf - 00002dcf [    1] */,
+    0x0b75c000 /* 00002dd7 - 00002dd7 [    1] */,
+    0x0b77c000 /* 00002ddf - 00002ddf [    1] */,
+    0x0b978021 /* 00002e5e - 00002e7f [   34] */,
+    0x0ba68000 /* 00002e9a - 00002e9a [    1] */,
+    0x0bbd000b /* 00002ef4 - 00002eff [   12] */,
+    0x0bf58019 /* 00002fd6 - 00002fef [   26] */,
+    0x0c000000 /* 00003000 - 00003000 [    1] */,
+    0x0c100000 /* 00003040 - 00003040 [    1] */,
+    0x0c25c001 /* 00003097 - 00003098 [    2] */,
+    0x0c400004 /* 00003100 - 00003104 [    5] */,
+    0x0c4c0000 /* 00003130 - 00003130 [    1] */,
+    0x0c63c000 /* 0000318f - 0000318f [    1] */,
+    0x0c79000a /* 000031e4 - 000031ee [   11] */,
+    0x0c87c000 /* 0000321f - 0000321f [    1] */,
+    0x29234002 /* 0000a48d - 0000a48f [    3] */,
+    0x2931c008 /* 0000a4c7 - 0000a4cf [    9] */,
+    0x298b0013 /* 0000a62c - 0000a63f [   20] */,
+    0x29be0007 /* 0000a6f8 - 0000a6ff [    8] */,
+    0x29f2c004 /* 0000a7cb - 0000a7cf [    5] */,
+    0x29f48000 /* 0000a7d2 - 0000a7d2 [    1] */,
+    0x29f50000 /* 0000a7d4 - 0000a7d4 [    1] */,
+    0x29f68017 /* 0000a7da - 0000a7f1 [   24] */,
+    0x2a0b4002 /* 0000a82d - 0000a82f [    3] */,
+    0x2a0e8005 /* 0000a83a - 0000a83f [    6] */,
+    0x2a1e0007 /* 0000a878 - 0000a87f [    8] */,
+    0x2a318007 /* 0000a8c6 - 0000a8cd [    8] */,
+    0x2a368005 /* 0000a8da - 0000a8df [    6] */,
+    0x2a55000a /* 0000a954 - 0000a95e [   11] */,
+    0x2a5f4002 /* 0000a97d - 0000a97f [    3] */,
+    0x2a738000 /* 0000a9ce - 0000a9ce [    1] */,
+    0x2a768003 /* 0000a9da - 0000a9dd [    4] */,
+    0x2a7fc000 /* 0000a9ff - 0000a9ff [    1] */,
+    0x2a8dc008 /* 0000aa37 - 0000aa3f [    9] */,
+    0x2a938001 /* 0000aa4e - 0000aa4f [    2] */,
+    0x2a968001 /* 0000aa5a - 0000aa5b [    2] */,
+    0x2ab0c017 /* 0000aac3 - 0000aada [   24] */,
+    0x2abdc009 /* 0000aaf7 - 0000ab00 [   10] */,
+    0x2ac1c001 /* 0000ab07 - 0000ab08 [    2] */,
+    0x2ac3c001 /* 0000ab0f - 0000ab10 [    2] */,
+    0x2ac5c008 /* 0000ab17 - 0000ab1f [    9] */,
+    0x2ac9c000 /* 0000ab27 - 0000ab27 [    1] */,
+    0x2acbc000 /* 0000ab2f - 0000ab2f [    1] */,
+    0x2adb0003 /* 0000ab6c - 0000ab6f [    4] */,
+    0x2afb8001 /* 0000abee - 0000abef [    2] */,
+    0x2afe8005 /* 0000abfa - 0000abff [    6] */,
+    0x35e9000b /* 0000d7a4 - 0000d7af [   12] */,
+    0x35f1c003 /* 0000d7c7 - 0000d7ca [    4] */,
+    0x35ff2103 /* 0000d7fc - 0000f8ff [ 8452] */,
+    0x3e9b8001 /* 0000fa6e - 0000fa6f [    2] */,
+    0x3eb68025 /* 0000fada - 0000faff [   38] */,
+    0x3ec1c00b /* 0000fb07 - 0000fb12 [   12] */,
+    0x3ec60004 /* 0000fb18 - 0000fb1c [    5] */,
+    0x3ecdc000 /* 0000fb37 - 0000fb37 [    1] */,
+    0x3ecf4000 /* 0000fb3d - 0000fb3d [    1] */,
+    0x3ecfc000 /* 0000fb3f - 0000fb3f [    1] */,
+    0x3ed08000 /* 0000fb42 - 0000fb42 [    1] */,
+    0x3ed14000 /* 0000fb45 - 0000fb45 [    1] */,
+    0x3ef0c00f /* 0000fbc3 - 0000fbd2 [   16] */,
+    0x3f640001 /* 0000fd90 - 0000fd91 [    2] */,
+    0x3f720006 /* 0000fdc8 - 0000fdce [    7] */,
+    0x3f74001f /* 0000fdd0 - 0000fdef [   32] */,
+    0x3f868005 /* 0000fe1a - 0000fe1f [    6] */,
+    0x3f94c000 /* 0000fe53 - 0000fe53 [    1] */,
+    0x3f99c000 /* 0000fe67 - 0000fe67 [    1] */,
+    0x3f9b0003 /* 0000fe6c - 0000fe6f [    4] */,
+    0x3f9d4000 /* 0000fe75 - 0000fe75 [    1] */,
+    0x3fbf4003 /* 0000fefd - 0000ff00 [    4] */,
+    0x3fefc002 /* 0000ffbf - 0000ffc1 [    3] */,
+    0x3ff20001 /* 0000ffc8 - 0000ffc9 [    2] */,
+    0x3ff40001 /* 0000ffd0 - 0000ffd1 [    2] */,
+    0x3ff60001 /* 0000ffd8 - 0000ffd9 [    2] */,
+    0x3ff74002 /* 0000ffdd - 0000ffdf [    3] */,
+    0x3ff9c000 /* 0000ffe7 - 0000ffe7 [    1] */,
+    0x3ffbc00c /* 0000ffef - 0000fffb [   13] */,
+    0x3fff8001 /* 0000fffe - 0000ffff [    2] */,
+    0x40030000 /* 0001000c - 0001000c [    1] */,
+    0x4009c000 /* 00010027 - 00010027 [    1] */,
+    0x400ec000 /* 0001003b - 0001003b [    1] */,
+    0x400f8000 /* 0001003e - 0001003e [    1] */,
+    0x40138001 /* 0001004e - 0001004f [    2] */,
+    0x40178021 /* 0001005e - 0001007f [   34] */,
+    0x403ec004 /* 000100fb - 000100ff [    5] */,
+    0x4040c003 /* 00010103 - 00010106 [    4] */,
+    0x404d0002 /* 00010134 - 00010136 [    3] */,
+    0x4063c000 /* 0001018f - 0001018f [    1] */,
+    0x40674002 /* 0001019d - 0001019f [    3] */,
+    0x4068402e /* 000101a1 - 000101cf [   47] */,
+    0x407f8081 /* 000101fe - 0001027f [  130] */,
+    0x40a74002 /* 0001029d - 0001029f [    3] */,
+    0x40b4400e /* 000102d1 - 000102df [   15] */,
+    0x40bf0003 /* 000102fc - 000102ff [    4] */,
+    0x40c90008 /* 00010324 - 0001032c [    9] */,
+    0x40d2c004 /* 0001034b - 0001034f [    5] */,
+    0x40dec004 /* 0001037b - 0001037f [    5] */,
+    0x40e78000 /* 0001039e - 0001039e [    1] */,
+    0x40f10003 /* 000103c4 - 000103c7 [    4] */,
+    0x40f58029 /* 000103d6 - 000103ff [   42] */,
+    0x41278001 /* 0001049e - 0001049f [    2] */,
+    0x412a8005 /* 000104aa - 000104af [    6] */,
+    0x41350003 /* 000104d4 - 000104d7 [    4] */,
+    0x413f0003 /* 000104fc - 000104ff [    4] */,
+    0x414a0007 /* 00010528 - 0001052f [    8] */,
+    0x4159000a /* 00010564 - 0001056e [   11] */,
+    0x415ec000 /* 0001057b - 0001057b [    1] */,
+    0x4162c000 /* 0001058b - 0001058b [    1] */,
+    0x4164c000 /* 00010593 - 00010593 [    1] */,
+    0x41658000 /* 00010596 - 00010596 [    1] */,
+    0x41688000 /* 000105a2 - 000105a2 [    1] */,
+    0x416c8000 /* 000105b2 - 000105b2 [    1] */,
+    0x416e8000 /* 000105ba - 000105ba [    1] */,
+    0x416f4042 /* 000105bd - 000105ff [   67] */,
+    0x41cdc008 /* 00010737 - 0001073f [    9] */,
+    0x41d58009 /* 00010756 - 0001075f [   10] */,
+    0x41da0017 /* 00010768 - 0001077f [   24] */,
+    0x41e18000 /* 00010786 - 00010786 [    1] */,
+    0x41ec4000 /* 000107b1 - 000107b1 [    1] */,
+    0x41eec044 /* 000107bb - 000107ff [   69] */,
+    0x42018001 /* 00010806 - 00010807 [    2] */,
+    0x42024000 /* 00010809 - 00010809 [    1] */,
+    0x420d8000 /* 00010836 - 00010836 [    1] */,
+    0x420e4002 /* 00010839 - 0001083b [    3] */,
+    0x420f4001 /* 0001083d - 0001083e [    2] */,
+    0x42158000 /* 00010856 - 00010856 [    1] */,
+    0x4227c007 /* 0001089f - 000108a6 [    8] */,
+    0x422c002f /* 000108b0 - 000108df [   48] */,
+    0x423cc000 /* 000108f3 - 000108f3 [    1] */,
+    0x423d8004 /* 000108f6 - 000108fa [    5] */,
+    0x42470002 /* 0001091c - 0001091e [    3] */,
+    0x424e8004 /* 0001093a - 0001093e [    5] */,
+    0x4250003f /* 00010940 - 0001097f [   64] */,
+    0x426e0003 /* 000109b8 - 000109bb [    4] */,
+    0x42740001 /* 000109d0 - 000109d1 [    2] */,
+    0x42810000 /* 00010a04 - 00010a04 [    1] */,
+    0x4281c004 /* 00010a07 - 00010a0b [    5] */,
+    0x42850000 /* 00010a14 - 00010a14 [    1] */,
+    0x42860000 /* 00010a18 - 00010a18 [    1] */,
+    0x428d8001 /* 00010a36 - 00010a37 [    2] */,
+    0x428ec003 /* 00010a3b - 00010a3e [    4] */,
+    0x42924006 /* 00010a49 - 00010a4f [    7] */,
+    0x42964006 /* 00010a59 - 00010a5f [    7] */,
+    0x42a8001f /* 00010aa0 - 00010abf [   32] */,
+    0x42b9c003 /* 00010ae7 - 00010aea [    4] */,
+    0x42bdc008 /* 00010af7 - 00010aff [    9] */,
+    0x42cd8002 /* 00010b36 - 00010b38 [    3] */,
+    0x42d58001 /* 00010b56 - 00010b57 [    2] */,
+    0x42dcc004 /* 00010b73 - 00010b77 [    5] */,
+    0x42e48006 /* 00010b92 - 00010b98 [    7] */,
+    0x42e7400b /* 00010b9d - 00010ba8 [   12] */,
+    0x42ec004f /* 00010bb0 - 00010bff [   80] */,
+    0x43124036 /* 00010c49 - 00010c7f [   55] */,
+    0x432cc00c /* 00010cb3 - 00010cbf [   13] */,
+    0x433cc006 /* 00010cf3 - 00010cf9 [    7] */,
+    0x434a0007 /* 00010d28 - 00010d2f [    8] */,
+    0x434e8125 /* 00010d3a - 00010e5f [  294] */,
+    0x439fc000 /* 00010e7f - 00010e7f [    1] */,
+    0x43aa8000 /* 00010eaa - 00010eaa [    1] */,
+    0x43ab8001 /* 00010eae - 00010eaf [    2] */,
+    0x43ac804a /* 00010eb2 - 00010efc [   75] */,
+    0x43ca0007 /* 00010f28 - 00010f2f [    8] */,
+    0x43d68015 /* 00010f5a - 00010f6f [   22] */,
+    0x43e28025 /* 00010f8a - 00010faf [   38] */,
+    0x43f30013 /* 00010fcc - 00010fdf [   20] */,
+    0x43fdc008 /* 00010ff7 - 00010fff [    9] */,
+    0x44138003 /* 0001104e - 00011051 [    4] */,
+    0x441d8008 /* 00011076 - 0001107e [    9] */,
+    0x442f4000 /* 000110bd - 000110bd [    1] */,
+    0x4430c00c /* 000110c3 - 000110cf [   13] */,
+    0x443a4006 /* 000110e9 - 000110ef [    7] */,
+    0x443e8005 /* 000110fa - 000110ff [    6] */,
+    0x444d4000 /* 00011135 - 00011135 [    1] */,
+    0x44520007 /* 00011148 - 0001114f [    8] */,
+    0x445dc008 /* 00011177 - 0001117f [    9] */,
+    0x44780000 /* 000111e0 - 000111e0 [    1] */,
+    0x447d400a /* 000111f5 - 000111ff [   11] */,
+    0x44848000 /* 00011212 - 00011212 [    1] */,
+    0x4490803d /* 00011242 - 0001127f [   62] */,
+    0x44a1c000 /* 00011287 - 00011287 [    1] */,
+    0x44a24000 /* 00011289 - 00011289 [    1] */,
+    0x44a38000 /* 0001128e - 0001128e [    1] */,
+    0x44a78000 /* 0001129e - 0001129e [    1] */,
+    0x44aa8005 /* 000112aa - 000112af [    6] */,
+    0x44bac004 /* 000112eb - 000112ef [    5] */,
+    0x44be8005 /* 000112fa - 000112ff [    6] */,
+    0x44c10000 /* 00011304 - 00011304 [    1] */,
+    0x44c34001 /* 0001130d - 0001130e [    2] */,
+    0x44c44001 /* 00011311 - 00011312 [    2] */,
+    0x44ca4000 /* 00011329 - 00011329 [    1] */,
+    0x44cc4000 /* 00011331 - 00011331 [    1] */,
+    0x44cd0000 /* 00011334 - 00011334 [    1] */,
+    0x44ce8000 /* 0001133a - 0001133a [    1] */,
+    0x44d14001 /* 00011345 - 00011346 [    2] */,
+    0x44d24001 /* 00011349 - 0001134a [    2] */,
+    0x44d38001 /* 0001134e - 0001134f [    2] */,
+    0x44d44005 /* 00011351 - 00011356 [    6] */,
+    0x44d60004 /* 00011358 - 0001135c [    5] */,
+    0x44d90001 /* 00011364 - 00011365 [    2] */,
+    0x44db4002 /* 0001136d - 0001136f [    3] */,
+    0x44dd408a /* 00011375 - 000113ff [  139] */,
+    0x45170000 /* 0001145c - 0001145c [    1] */,
+    0x4518801d /* 00011462 - 0001147f [   30] */,
+    0x45320007 /* 000114c8 - 000114cf [    8] */,
+    0x453680a5 /* 000114da - 0001157f [  166] */,
+    0x456d8001 /* 000115b6 - 000115b7 [    2] */,
+    0x45778021 /* 000115de - 000115ff [   34] */,
+    0x4591400a /* 00011645 - 0001164f [   11] */,
+    0x45968005 /* 0001165a - 0001165f [    6] */,
+    0x459b4012 /* 0001166d - 0001167f [   19] */,
+    0x45ae8005 /* 000116ba - 000116bf [    6] */,
+    0x45b28035 /* 000116ca - 000116ff [   54] */,
+    0x45c6c001 /* 0001171b - 0001171c [    2] */,
+    0x45cb0003 /* 0001172c - 0001172f [    4] */,
+    0x45d1c0b8 /* 00011747 - 000117ff [  185] */,
+    0x460f0063 /* 0001183c - 0001189f [  100] */,
+    0x463cc00b /* 000118f3 - 000118fe [   12] */,
+    0x4641c001 /* 00011907 - 00011908 [    2] */,
+    0x46428001 /* 0001190a - 0001190b [    2] */,
+    0x46450000 /* 00011914 - 00011914 [    1] */,
+    0x4645c000 /* 00011917 - 00011917 [    1] */,
+    0x464d8000 /* 00011936 - 00011936 [    1] */,
+    0x464e4001 /* 00011939 - 0001193a [    2] */,
+    0x4651c008 /* 00011947 - 0001194f [    9] */,
+    0x46568045 /* 0001195a - 0001199f [   70] */,
+    0x466a0001 /* 000119a8 - 000119a9 [    2] */,
+    0x46760001 /* 000119d8 - 000119d9 [    2] */,
+    0x4679401a /* 000119e5 - 000119ff [   27] */,
+    0x46920007 /* 00011a48 - 00011a4f [    8] */,
+    0x46a8c00c /* 00011aa3 - 00011aaf [   13] */,
+    0x46be4006 /* 00011af9 - 00011aff [    7] */,
+    0x46c280f5 /* 00011b0a - 00011bff [  246] */,
+    0x47024000 /* 00011c09 - 00011c09 [    1] */,
+    0x470dc000 /* 00011c37 - 00011c37 [    1] */,
+    0x47118009 /* 00011c46 - 00011c4f [   10] */,
+    0x471b4002 /* 00011c6d - 00011c6f [    3] */,
+    0x47240001 /* 00011c90 - 00011c91 [    2] */,
+    0x472a0000 /* 00011ca8 - 00011ca8 [    1] */,
+    0x472dc048 /* 00011cb7 - 00011cff [   73] */,
+    0x4741c000 /* 00011d07 - 00011d07 [    1] */,
+    0x47428000 /* 00011d0a - 00011d0a [    1] */,
+    0x474dc002 /* 00011d37 - 00011d39 [    3] */,
+    0x474ec000 /* 00011d3b - 00011d3b [    1] */,
+    0x474f8000 /* 00011d3e - 00011d3e [    1] */,
+    0x47520007 /* 00011d48 - 00011d4f [    8] */,
+    0x47568005 /* 00011d5a - 00011d5f [    6] */,
+    0x47598000 /* 00011d66 - 00011d66 [    1] */,
+    0x475a4000 /* 00011d69 - 00011d69 [    1] */,
+    0x4763c000 /* 00011d8f - 00011d8f [    1] */,
+    0x47648000 /* 00011d92 - 00011d92 [    1] */,
+    0x47664006 /* 00011d99 - 00011d9f [    7] */,
+    0x476a8135 /* 00011daa - 00011edf [  310] */,
+    0x47be4006 /* 00011ef9 - 00011eff [    7] */,
+    0x47c44000 /* 00011f11 - 00011f11 [    1] */,
+    0x47cec002 /* 00011f3b - 00011f3d [    3] */,
+    0x47d68055 /* 00011f5a - 00011faf [   86] */,
+    0x47ec400e /* 00011fb1 - 00011fbf [   15] */,
+    0x47fc800c /* 00011ff2 - 00011ffe [   13] */,
+    0x48e68065 /* 0001239a - 000123ff [  102] */,
+    0x491bc000 /* 0001246f - 0001246f [    1] */,
+    0x491d400a /* 00012475 - 0001247f [   11] */,
+    0x49510a4b /* 00012544 - 00012f8f [ 2636] */,
+    0x4bfcc00c /* 00012ff3 - 00012fff [   13] */,
+    0x4d0c000f /* 00013430 - 0001343f [   16] */,
+    0x4d158fa9 /* 00013456 - 000143ff [ 4010] */,
+    0x5191e1b8 /* 00014647 - 000167ff [ 8633] */,
+    0x5a8e4006 /* 00016a39 - 00016a3f [    7] */,
+    0x5a97c000 /* 00016a5f - 00016a5f [    1] */,
+    0x5a9a8003 /* 00016a6a - 00016a6d [    4] */,
+    0x5aafc000 /* 00016abf - 00016abf [    1] */,
+    0x5ab28005 /* 00016aca - 00016acf [    6] */,
+    0x5abb8001 /* 00016aee - 00016aef [    2] */,
+    0x5abd8009 /* 00016af6 - 00016aff [   10] */,
+    0x5ad18009 /* 00016b46 - 00016b4f [   10] */,
+    0x5ad68000 /* 00016b5a - 00016b5a [    1] */,
+    0x5ad88000 /* 00016b62 - 00016b62 [    1] */,
+    0x5ade0004 /* 00016b78 - 00016b7c [    5] */,
+    0x5ae402af /* 00016b90 - 00016e3f [  688] */,
+    0x5ba6c064 /* 00016e9b - 00016eff [  101] */,
+    0x5bd2c003 /* 00016f4b - 00016f4e [    4] */,
+    0x5be20006 /* 00016f88 - 00016f8e [    7] */,
+    0x5be8003f /* 00016fa0 - 00016fdf [   64] */,
+    0x5bf9400a /* 00016fe5 - 00016fef [   11] */,
+    0x5bfc800d /* 00016ff2 - 00016fff [   14] */,
+    0x61fe0007 /* 000187f8 - 000187ff [    8] */,
+    0x63358029 /* 00018cd6 - 00018cff [   42] */,
+    0x634262e6 /* 00018d09 - 0001afef [ 8935] */,
+    0x6bfd0000 /* 0001aff4 - 0001aff4 [    1] */,
+    0x6bff0000 /* 0001affc - 0001affc [    1] */,
+    0x6bffc000 /* 0001afff - 0001afff [    1] */,
+    0x6c48c00e /* 0001b123 - 0001b131 [   15] */,
+    0x6c4cc01c /* 0001b133 - 0001b14f [   29] */,
+    0x6c54c001 /* 0001b153 - 0001b154 [    2] */,
+    0x6c55800d /* 0001b156 - 0001b163 [   14] */,
+    0x6c5a0007 /* 0001b168 - 0001b16f [    8] */,
+    0x6cbf0903 /* 0001b2fc - 0001bbff [ 2308] */,
+    0x6f1ac004 /* 0001bc6b - 0001bc6f [    5] */,
+    0x6f1f4002 /* 0001bc7d - 0001bc7f [    3] */,
+    0x6f224006 /* 0001bc89 - 0001bc8f [    7] */,
+    0x6f268001 /* 0001bc9a - 0001bc9b [    2] */,
+    0x6f28125f /* 0001bca0 - 0001ceff [ 4704] */,
+    0x73cb8001 /* 0001cf2e - 0001cf2f [    2] */,
+    0x73d1c008 /* 0001cf47 - 0001cf4f [    9] */,
+    0x73f1003b /* 0001cfc4 - 0001cfff [   60] */,
+    0x743d8009 /* 0001d0f6 - 0001d0ff [   10] */,
+    0x7449c001 /* 0001d127 - 0001d128 [    2] */,
+    0x745cc007 /* 0001d173 - 0001d17a [    8] */,
+    0x747ac014 /* 0001d1eb - 0001d1ff [   21] */,
+    0x74918079 /* 0001d246 - 0001d2bf [  122] */,
+    0x74b5000b /* 0001d2d4 - 0001d2df [   12] */,
+    0x74bd000b /* 0001d2f4 - 0001d2ff [   12] */,
+    0x74d5c008 /* 0001d357 - 0001d35f [    9] */,
+    0x74de4086 /* 0001d379 - 0001d3ff [  135] */,
+    0x75154000 /* 0001d455 - 0001d455 [    1] */,
+    0x75274000 /* 0001d49d - 0001d49d [    1] */,
+    0x75280001 /* 0001d4a0 - 0001d4a1 [    2] */,
+    0x7528c001 /* 0001d4a3 - 0001d4a4 [    2] */,
+    0x7529c001 /* 0001d4a7 - 0001d4a8 [    2] */,
+    0x752b4000 /* 0001d4ad - 0001d4ad [    1] */,
+    0x752e8000 /* 0001d4ba - 0001d4ba [    1] */,
+    0x752f0000 /* 0001d4bc - 0001d4bc [    1] */,
+    0x75310000 /* 0001d4c4 - 0001d4c4 [    1] */,
+    0x75418000 /* 0001d506 - 0001d506 [    1] */,
+    0x7542c001 /* 0001d50b - 0001d50c [    2] */,
+    0x75454000 /* 0001d515 - 0001d515 [    1] */,
+    0x75474000 /* 0001d51d - 0001d51d [    1] */,
+    0x754e8000 /* 0001d53a - 0001d53a [    1] */,
+    0x754fc000 /* 0001d53f - 0001d53f [    1] */,
+    0x75514000 /* 0001d545 - 0001d545 [    1] */,
+    0x7551c002 /* 0001d547 - 0001d549 [    3] */,
+    0x75544000 /* 0001d551 - 0001d551 [    1] */,
+    0x75a98001 /* 0001d6a6 - 0001d6a7 [    2] */,
+    0x75f30001 /* 0001d7cc - 0001d7cd [    2] */,
+    0x76a3000e /* 0001da8c - 0001da9a [   15] */,
+    0x76a80000 /* 0001daa0 - 0001daa0 [    1] */,
+    0x76ac044f /* 0001dab0 - 0001deff [ 1104] */,
+    0x77c7c005 /* 0001df1f - 0001df24 [    6] */,
+    0x77cac0d4 /* 0001df2b - 0001dfff [  213] */,
+    0x7801c000 /* 0001e007 - 0001e007 [    1] */,
+    0x78064001 /* 0001e019 - 0001e01a [    2] */,
+    0x78088000 /* 0001e022 - 0001e022 [    1] */,
+    0x78094000 /* 0001e025 - 0001e025 [    1] */,
+    0x780ac004 /* 0001e02b - 0001e02f [    5] */,
+    0x781b8020 /* 0001e06e - 0001e08e [   33] */,
+    0x7824006f /* 0001e090 - 0001e0ff [  112] */,
+    0x784b4002 /* 0001e12d - 0001e12f [    3] */,
+    0x784f8001 /* 0001e13e - 0001e13f [    2] */,
+    0x78528003 /* 0001e14a - 0001e14d [    4] */,
+    0x7854013f /* 0001e150 - 0001e28f [  320] */,
+    0x78abc010 /* 0001e2af - 0001e2bf [   17] */,
+    0x78be8004 /* 0001e2fa - 0001e2fe [    5] */,
+    0x78c001cf /* 0001e300 - 0001e4cf [  464] */,
+    0x793e82e5 /* 0001e4fa - 0001e7df [  742] */,
+    0x79f9c000 /* 0001e7e7 - 0001e7e7 [    1] */,
+    0x79fb0000 /* 0001e7ec - 0001e7ec [    1] */,
+    0x79fbc000 /* 0001e7ef - 0001e7ef [    1] */,
+    0x79ffc000 /* 0001e7ff - 0001e7ff [    1] */,
+    0x7a314001 /* 0001e8c5 - 0001e8c6 [    2] */,
+    0x7a35c028 /* 0001e8d7 - 0001e8ff [   41] */,
+    0x7a530003 /* 0001e94c - 0001e94f [    4] */,
+    0x7a568003 /* 0001e95a - 0001e95d [    4] */,
+    0x7a580310 /* 0001e960 - 0001ec70 [  785] */,
+    0x7b2d404b /* 0001ecb5 - 0001ed00 [   76] */,
+    0x7b4f80c1 /* 0001ed3e - 0001edff [  194] */,
+    0x7b810000 /* 0001ee04 - 0001ee04 [    1] */,
+    0x7b880000 /* 0001ee20 - 0001ee20 [    1] */,
+    0x7b88c000 /* 0001ee23 - 0001ee23 [    1] */,
+    0x7b894001 /* 0001ee25 - 0001ee26 [    2] */,
+    0x7b8a0000 /* 0001ee28 - 0001ee28 [    1] */,
+    0x7b8cc000 /* 0001ee33 - 0001ee33 [    1] */,
+    0x7b8e0000 /* 0001ee38 - 0001ee38 [    1] */,
+    0x7b8e8000 /* 0001ee3a - 0001ee3a [    1] */,
+    0x7b8f0005 /* 0001ee3c - 0001ee41 [    6] */,
+    0x7b90c003 /* 0001ee43 - 0001ee46 [    4] */,
+    0x7b920000 /* 0001ee48 - 0001ee48 [    1] */,
+    0x7b928000 /* 0001ee4a - 0001ee4a [    1] */,
+    0x7b930000 /* 0001ee4c - 0001ee4c [    1] */,
+    0x7b940000 /* 0001ee50 - 0001ee50 [    1] */,
+    0x7b94c000 /* 0001ee53 - 0001ee53 [    1] */,
+    0x7b954001 /* 0001ee55 - 0001ee56 [    2] */,
+    0x7b960000 /* 0001ee58 - 0001ee58 [    1] */,
+    0x7b968000 /* 0001ee5a - 0001ee5a [    1] */,
+    0x7b970000 /* 0001ee5c - 0001ee5c [    1] */,
+    0x7b978000 /* 0001ee5e - 0001ee5e [    1] */,
+    0x7b980000 /* 0001ee60 - 0001ee60 [    1] */,
+    0x7b98c000 /* 0001ee63 - 0001ee63 [    1] */,
+    0x7b994001 /* 0001ee65 - 0001ee66 [    2] */,
+    0x7b9ac000 /* 0001ee6b - 0001ee6b [    1] */,
+    0x7b9cc000 /* 0001ee73 - 0001ee73 [    1] */,
+    0x7b9e0000 /* 0001ee78 - 0001ee78 [    1] */,
+    0x7b9f4000 /* 0001ee7d - 0001ee7d [    1] */,
+    0x7b9fc000 /* 0001ee7f - 0001ee7f [    1] */,
+    0x7ba28000 /* 0001ee8a - 0001ee8a [    1] */,
+    0x7ba70004 /* 0001ee9c - 0001eea0 [    5] */,
+    0x7ba90000 /* 0001eea4 - 0001eea4 [    1] */,
+    0x7baa8000 /* 0001eeaa - 0001eeaa [    1] */,
+    0x7baf0033 /* 0001eebc - 0001eeef [   52] */,
+    0x7bbc810d /* 0001eef2 - 0001efff [  270] */,
+    0x7c0b0003 /* 0001f02c - 0001f02f [    4] */,
+    0x7c25000b /* 0001f094 - 0001f09f [   12] */,
+    0x7c2bc001 /* 0001f0af - 0001f0b0 [    2] */,
+    0x7c300000 /* 0001f0c0 - 0001f0c0 [    1] */,
+    0x7c340000 /* 0001f0d0 - 0001f0d0 [    1] */,
+    0x7c3d8009 /* 0001f0f6 - 0001f0ff [   10] */,
+    0x7c6b8037 /* 0001f1ae - 0001f1e5 [   56] */,
+    0x7c80c00c /* 0001f203 - 0001f20f [   13] */,
+    0x7c8f0003 /* 0001f23c - 0001f23f [    4] */,
+    0x7c924006 /* 0001f249 - 0001f24f [    7] */,
+    0x7c94800d /* 0001f252 - 0001f25f [   14] */,
+    0x7c998099 /* 0001f266 - 0001f2ff [  154] */,
+    0x7db60003 /* 0001f6d8 - 0001f6db [    4] */,
+    0x7dbb4002 /* 0001f6ed - 0001f6ef [    3] */,
+    0x7dbf4002 /* 0001f6fd - 0001f6ff [    3] */,
+    0x7dddc003 /* 0001f777 - 0001f77a [    4] */,
+    0x7df68005 /* 0001f7da - 0001f7df [    6] */,
+    0x7dfb0003 /* 0001f7ec - 0001f7ef [    4] */,
+    0x7dfc400e /* 0001f7f1 - 0001f7ff [   15] */,
+    0x7e030003 /* 0001f80c - 0001f80f [    4] */,
+    0x7e120007 /* 0001f848 - 0001f84f [    8] */,
+    0x7e168005 /* 0001f85a - 0001f85f [    6] */,
+    0x7e220007 /* 0001f888 - 0001f88f [    8] */,
+    0x7e2b8001 /* 0001f8ae - 0001f8af [    2] */,
+    0x7e2c804d /* 0001f8b2 - 0001f8ff [   78] */,
+    0x7e95000b /* 0001fa54 - 0001fa5f [   12] */,
+    0x7e9b8001 /* 0001fa6e - 0001fa6f [    2] */,
+    0x7e9f4002 /* 0001fa7d - 0001fa7f [    3] */,
+    0x7ea24006 /* 0001fa89 - 0001fa8f [    7] */,
+    0x7eaf8000 /* 0001fabe - 0001fabe [    1] */,
+    0x7eb18007 /* 0001fac6 - 0001facd [    8] */,
+    0x7eb70003 /* 0001fadc - 0001fadf [    4] */,
+    0x7eba4006 /* 0001fae9 - 0001faef [    7] */,
+    0x7ebe4006 /* 0001faf9 - 0001faff [    7] */,
+    0x7ee4c000 /* 0001fb93 - 0001fb93 [    1] */,
+    0x7ef2c024 /* 0001fbcb - 0001fbef [   37] */,
+    0x7efe8405 /* 0001fbfa - 0001ffff [ 1030] */,
+    0xa9b8001f /* 0002a6e0 - 0002a6ff [   32] */,
+    0xadce8005 /* 0002b73a - 0002b73f [    6] */,
+    0xae078001 /* 0002b81e - 0002b81f [    2] */,
+    0xb3a8800d /* 0002cea2 - 0002ceaf [   14] */,
+    0xbaf8400e /* 0002ebe1 - 0002ebef [   15] */,
+    0xbb9789a1 /* 0002ee5e - 0002f7ff [ 2466] */,
+    0xbe8785e1 /* 0002fa1e - 0002ffff [ 1506] */,
+    0xc4d2c004 /* 0003134b - 0003134f [    5] */};
 
+/// Returns whether the code unit needs to be escaped.
+///
 /// At the end of the valid Unicode code points space a lot of code points are
 /// either reserved or a noncharacter. Adding all these entries to the
-/// lookup table would add 446 entries to the table (in Unicode 14).
-/// Instead the only the start of the region is stored, every code point in
-/// this region needs to be escaped.
-inline constexpr uint32_t __unallocated_region_lower_bound = 0x000323b0;
+/// lookup table would greatly increase the size of the table. Instead these
+/// entries are manually processed. In this large area of reserved code points,
+/// there is a small area of extended graphemes that should not be escaped
+/// unconditionally. This is also manually coded. See the generation script for
+/// more details.
 
-/// Returns whether the code unit needs to be escaped.
 ///
 /// \pre The code point is a valid Unicode code point.
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __needs_escape(const char32_t __code_point) noexcept {
-  // Since __unallocated_region_lower_bound contains the unshifted range do the
-  // comparison without shifting.
-  if (__code_point >= __unallocated_region_lower_bound)
+
+  // The entries in the gap at the end.
+  if(__code_point >= 0x000e0100 && __code_point <= 0x000e01ef)
+     return false;
+
+  // The entries at the end.
+  if (__code_point >= 0x000323b0)
     return true;
 
-  ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
+  ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 14) | 0x3fffu) - __entries;
   if (__i == 0)
     return false;
 
   --__i;
-  uint32_t __upper_bound = (__entries[__i] >> 11) + (__entries[__i] & 0x7ffu);
+  uint32_t __upper_bound = (__entries[__i] >> 14) + (__entries[__i] & 0x3fffu);
   return __code_point <= __upper_bound;
 }
 
diff --git a/libcxx/include/__format/write_escaped.h b/libcxx/include/__format/write_escaped.h
index 43a074dd8d70..052ea98c3c3b 100644
--- a/libcxx/include/__format/write_escaped.h
+++ b/libcxx/include/__format/write_escaped.h
@@ -101,15 +101,27 @@ _LIBCPP_HIDE_FROM_ABI void __write_escape_ill_formed_code_unit(basic_string<_Cha
 }
 
 template <class _CharT>
-[[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool __is_escaped_sequence_written(basic_string<_CharT>& __str, char32_t __value) {
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool
+__is_escaped_sequence_written(basic_string<_CharT>& __str, bool __last_escaped, char32_t __value) {
 #  ifdef _LIBCPP_HAS_NO_UNICODE
   // For ASCII assume everything above 127 is printable.
   if (__value > 127)
     return false;
 #  endif
 
+  // [format.string.escaped]/2.2.1.2.1
+  //   CE is UTF-8, UTF-16, or UTF-32 and C corresponds to a Unicode scalar
+  //   value whose Unicode property General_Category has a value in the groups
+  //   Separator (Z) or Other (C), as described by UAX #44 of the Unicode Standard,
   if (!__escaped_output_table::__needs_escape(__value))
-    return false;
+    // [format.string.escaped]/2.2.1.2.2
+    //   CE is UTF-8, UTF-16, or UTF-32 and C corresponds to a Unicode scalar
+    //   value with the Unicode property Grapheme_Extend=Yes as described by UAX
+    //   #44 of the Unicode Standard and C is not immediately preceded in S by a
+    //   character P appended to E without translation to an escape sequence,
+    if (!__last_escaped || __extended_grapheme_custer_property_boundary::__get_property(__value) !=
+                               __extended_grapheme_custer_property_boundary::__property::__Extend)
+      return false;
 
   __formatter::__write_well_formed_escaped_code_unit(__str, __value);
   return true;
@@ -124,8 +136,8 @@ enum class __escape_quotation_mark { __apostrophe, __double_quote };
 
 // [format.string.escaped]/2
 template <class _CharT>
-[[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool
-__is_escaped_sequence_written(basic_string<_CharT>& __str, char32_t __value, __escape_quotation_mark __mark) {
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool __is_escaped_sequence_written(
+    basic_string<_CharT>& __str, char32_t __value, bool __last_escaped, __escape_quotation_mark __mark) {
   // 2.2.1.1 - Mapped character in [tab:format.escape.sequences]
   switch (__value) {
   case _CharT('\t'):
@@ -167,7 +179,7 @@ __is_escaped_sequence_written(basic_string<_CharT>& __str, char32_t __value, __e
   // TODO FMT determine what to do with shift sequences.
 
   // 2.2.1.2.1 and 2.2.1.2.2 - Escape
-  return __formatter::__is_escaped_sequence_written(__str, __formatter::__to_char32(__value));
+  return __formatter::__is_escaped_sequence_written(__str, __last_escaped, __formatter::__to_char32(__value));
 }
 
 template <class _CharT>
@@ -175,11 +187,15 @@ _LIBCPP_HIDE_FROM_ABI void
 __escape(basic_string<_CharT>& __str, basic_string_view<_CharT> __values, __escape_quotation_mark __mark) {
   __unicode::__code_point_view<_CharT> __view{__values.begin(), __values.end()};
 
+  // When the first code unit has the property Grapheme_Extend=Yes it needs to
+  // be escaped. This happens when the previous code unit was also escaped.
+  bool __escape = true;
   while (!__view.__at_end()) {
     auto __first                                  = __view.__position();
     typename __unicode::__consume_result __result = __view.__consume();
     if (__result.__status == __unicode::__consume_result::__ok) {
-      if (!__formatter::__is_escaped_sequence_written(__str, __result.__code_point, __mark))
+      __escape = __formatter::__is_escaped_sequence_written(__str, __result.__code_point, __escape, __mark);
+      if (!__escape)
         // 2.2.1.3 - Add the character
         ranges::copy(__first, __view.__position(), std::back_insert_iterator(__str));
     } else {
diff --git a/libcxx/include/__mutex/unique_lock.h b/libcxx/include/__mutex/unique_lock.h
index c27ce4b24c1a..4a616ba51ee1 100644
--- a/libcxx/include/__mutex/unique_lock.h
+++ b/libcxx/include/__mutex/unique_lock.h
@@ -36,26 +36,28 @@ private:
   bool __owns_;
 
 public:
-  _LIBCPP_HIDE_FROM_ABI unique_lock() _NOEXCEPT : __m_(nullptr), __owns_(false) {}
-  _LIBCPP_HIDE_FROM_ABI explicit unique_lock(mutex_type& __m) : __m_(std::addressof(__m)), __owns_(true) {
+  _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock() _NOEXCEPT : __m_(nullptr), __owns_(false) {}
+  _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI explicit unique_lock(mutex_type& __m)
+      : __m_(std::addressof(__m)), __owns_(true) {
     __m_->lock();
   }
 
-  _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, defer_lock_t) _NOEXCEPT
+  _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, defer_lock_t) _NOEXCEPT
       : __m_(std::addressof(__m)),
         __owns_(false) {}
 
-  _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, try_to_lock_t)
+  _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, try_to_lock_t)
       : __m_(std::addressof(__m)), __owns_(__m.try_lock()) {}
 
-  _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, adopt_lock_t) : __m_(std::addressof(__m)), __owns_(true) {}
+  _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, adopt_lock_t)
+      : __m_(std::addressof(__m)), __owns_(true) {}
 
   template <class _Clock, class _Duration>
-  _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, const chrono::time_point<_Clock, _Duration>& __t)
+  _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, const chrono::time_point<_Clock, _Duration>& __t)
       : __m_(std::addressof(__m)), __owns_(__m.try_lock_until(__t)) {}
 
   template <class _Rep, class _Period>
-  _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, const chrono::duration<_Rep, _Period>& __d)
+  _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, const chrono::duration<_Rep, _Period>& __d)
       : __m_(std::addressof(__m)), __owns_(__m.try_lock_for(__d)) {}
 
   _LIBCPP_HIDE_FROM_ABI ~unique_lock() {
@@ -66,7 +68,9 @@ public:
   unique_lock(unique_lock const&)            = delete;
   unique_lock& operator=(unique_lock const&) = delete;
 
-  _LIBCPP_HIDE_FROM_ABI unique_lock(unique_lock&& __u) _NOEXCEPT : __m_(__u.__m_), __owns_(__u.__owns_) {
+  _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock(unique_lock&& __u) _NOEXCEPT
+      : __m_(__u.__m_),
+        __owns_(__u.__owns_) {
     __u.__m_    = nullptr;
     __u.__owns_ = false;
   }
diff --git a/libcxx/include/__ranges/repeat_view.h b/libcxx/include/__ranges/repeat_view.h
index 0941770f0eef..53e4beb270ad 100644
--- a/libcxx/include/__ranges/repeat_view.h
+++ b/libcxx/include/__ranges/repeat_view.h
@@ -22,6 +22,7 @@
 #include <__ranges/iota_view.h>
 #include <__ranges/movable_box.h>
 #include <__ranges/view_interface.h>
+#include <__type_traits/decay.h>
 #include <__type_traits/is_object.h>
 #include <__type_traits/make_unsigned.h>
 #include <__type_traits/remove_cv.h>
@@ -127,8 +128,8 @@ private:
   _LIBCPP_NO_UNIQUE_ADDRESS _Bound __bound_ = _Bound();
 };
 
-template <class _Tp, class _Bound>
-repeat_view(_Tp, _Bound) -> repeat_view<_Tp, _Bound>;
+template <class _Tp, class _Bound = unreachable_sentinel_t>
+repeat_view(_Tp, _Bound = _Bound()) -> repeat_view<_Tp, _Bound>;
 
 // [range.repeat.iterator]
 template <move_constructible _Tp, semiregular _Bound>
@@ -230,9 +231,9 @@ namespace __repeat {
 struct __fn {
   template <class _Tp>
   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static constexpr auto operator()(_Tp&& __value)
-    noexcept(noexcept(ranges::repeat_view(std::forward<_Tp>(__value))))
-    -> decltype(      ranges::repeat_view(std::forward<_Tp>(__value)))
-    { return          ranges::repeat_view(std::forward<_Tp>(__value)); }
+    noexcept(noexcept(ranges::repeat_view<decay_t<_Tp>>(std::forward<_Tp>(__value))))
+    -> decltype(      ranges::repeat_view<decay_t<_Tp>>(std::forward<_Tp>(__value)))
+    { return          ranges::repeat_view<decay_t<_Tp>>(std::forward<_Tp>(__value)); }
 
   template <class _Tp, class _Bound>
   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static constexpr auto operator()(_Tp&& __value, _Bound&& __bound_sentinel)
diff --git a/libcxx/include/__ranges/to.h b/libcxx/include/__ranges/to.h
index 8a815bce5811..e0abe6290b8f 100644
--- a/libcxx/include/__ranges/to.h
+++ b/libcxx/include/__ranges/to.h
@@ -24,6 +24,7 @@
 #include <__ranges/concepts.h>
 #include <__ranges/from_range.h>
 #include <__ranges/range_adaptor.h>
+#include <__ranges/ref_view.h>
 #include <__ranges/size.h>
 #include <__ranges/transform_view.h>
 #include <__type_traits/add_pointer.h>
@@ -129,7 +130,7 @@ template <class _Container, input_range _Range, class... _Args>
     // Try the recursive case.
   } else if constexpr (input_range<range_reference_t<_Range>>) {
     return ranges::to<_Container>(
-        __range | views::transform([](auto&& __elem) {
+        ref_view(__range) | views::transform([](auto&& __elem) {
           return ranges::to<range_value_t<_Container>>(std::forward<decltype(__elem)>(__elem));
         }),
         std::forward<_Args>(__args)...);
diff --git a/libcxx/include/__string/char_traits.h b/libcxx/include/__string/char_traits.h
index 1fd22d518e1a..9d347b188ee1 100644
--- a/libcxx/include/__string/char_traits.h
+++ b/libcxx/include/__string/char_traits.h
@@ -344,7 +344,7 @@ struct _LIBCPP_TEMPLATE_VIS char_traits<char16_t> {
   _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 const char_type*
   find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT {
     __identity __proj;
-    const char_type* __match = std::__find_impl(__s, __s + __n, __a, __proj);
+    const char_type* __match = std::__find(__s, __s + __n, __a, __proj);
     if (__match == __s + __n)
       return nullptr;
     return __match;
@@ -430,7 +430,7 @@ struct _LIBCPP_TEMPLATE_VIS char_traits<char32_t> {
   _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 const char_type*
   find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT {
     __identity __proj;
-    const char_type* __match = std::__find_impl(__s, __s + __n, __a, __proj);
+    const char_type* __match = std::__find(__s, __s + __n, __a, __proj);
     if (__match == __s + __n)
       return nullptr;
     return __match;
diff --git a/libcxx/include/__string/constexpr_c_functions.h b/libcxx/include/__string/constexpr_c_functions.h
index 72c6ce69b60b..4da8542e3807 100644
--- a/libcxx/include/__string/constexpr_c_functions.h
+++ b/libcxx/include/__string/constexpr_c_functions.h
@@ -224,7 +224,7 @@ __constexpr_memmove(_Tp* __dest, _Up* __src, __element_count __n) {
         std::__assign_trivially_copyable(__dest[__i], __src[__i]);
     }
   } else if (__count > 0) {
-    ::__builtin_memmove(__dest, __src, (__count - 1) * sizeof(_Tp) + __libcpp_datasizeof<_Tp>::value);
+    ::__builtin_memmove(__dest, __src, (__count - 1) * sizeof(_Tp) + __datasizeof_v<_Tp>);
   }
   return __dest;
 }
diff --git a/libcxx/include/__type_traits/datasizeof.h b/libcxx/include/__type_traits/datasizeof.h
index 3a8b15160107..54fde242ebcd 100644
--- a/libcxx/include/__type_traits/datasizeof.h
+++ b/libcxx/include/__type_traits/datasizeof.h
@@ -26,39 +26,38 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _Tp>
-struct __libcpp_datasizeof {
 #if __has_extension(datasizeof)
-  static const size_t value = __datasizeof(_Tp);
+template <class _Tp>
+inline const size_t __datasizeof_v = __datasizeof(_Tp);
 #else
 // NOLINTNEXTLINE(readability-redundant-preprocessor) This is https://llvm.org/PR64825
 #  if __has_cpp_attribute(__no_unique_address__)
-  template <class = char>
-  struct _FirstPaddingByte {
-    [[__no_unique_address__]] _Tp __v_;
-    char __first_padding_byte_;
-  };
+template <class _Tp>
+struct _FirstPaddingByte {
+  [[__no_unique_address__]] _Tp __v_;
+  char __first_padding_byte_;
+};
 #  else
-  template <bool = __libcpp_is_final<_Tp>::value || !is_class<_Tp>::value>
-  struct _FirstPaddingByte : _Tp {
-    char __first_padding_byte_;
-  };
+template <class _Tp, bool = __libcpp_is_final<_Tp>::value || !is_class<_Tp>::value>
+struct _FirstPaddingByte : _Tp {
+  char __first_padding_byte_;
+};
 
-  template <>
-  struct _FirstPaddingByte<true> {
-    _Tp __v_;
-    char __first_padding_byte_;
-  };
+template <class _Tp>
+struct _FirstPaddingByte<_Tp, true> {
+  _Tp __v_;
+  char __first_padding_byte_;
+};
 #  endif // __has_cpp_attribute(__no_unique_address__)
 
-  // _FirstPaddingByte<> is sometimes non-standard layout. Using `offsetof` is UB in that case, but GCC and Clang allow
-  // the use as an extension.
-  _LIBCPP_DIAGNOSTIC_PUSH
-  _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-offsetof")
-  static const size_t value = offsetof(_FirstPaddingByte<>, __first_padding_byte_);
-  _LIBCPP_DIAGNOSTIC_POP
+// _FirstPaddingByte<> is sometimes non-standard layout. Using `offsetof` is UB in that case, but GCC and Clang allow
+// the use as an extension.
+_LIBCPP_DIAGNOSTIC_PUSH
+_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-offsetof")
+template <class _Tp>
+inline const size_t __datasizeof_v = offsetof(_FirstPaddingByte<_Tp>, __first_padding_byte_);
+_LIBCPP_DIAGNOSTIC_POP
 #endif   // __has_extension(datasizeof)
-};
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__utility/no_destroy.h b/libcxx/include/__utility/no_destroy.h
index f9c1eb7bed45..8edd194577d7 100644
--- a/libcxx/include/__utility/no_destroy.h
+++ b/libcxx/include/__utility/no_destroy.h
@@ -12,6 +12,7 @@
 #include <__config>
 #include <__type_traits/is_constant_evaluated.h>
 #include <__utility/forward.h>
+#include <new>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -29,33 +30,23 @@ struct __uninitialized_tag {};
 // initialization using __emplace.
 template <class _Tp>
 struct __no_destroy {
-  _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(__uninitialized_tag) : __dummy_() {
-    if (__libcpp_is_constant_evaluated()) {
-      __dummy_ = char();
-    }
-  }
-  _LIBCPP_HIDE_FROM_ABI ~__no_destroy() {
-    // nothing
-  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __no_destroy(__uninitialized_tag) : __obj_() {}
 
   template <class... _Args>
-  _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(_Args&&... __args)
-      : __obj_(std::forward<_Args>(__args)...) {}
+  _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(_Args&&... __args) {
+    ::new ((void*)__obj_) _Tp(std::forward<_Args>(__args)...);
+  }
 
   template <class... _Args>
-  _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp& __emplace(_Args&&... __args) {
-    new (&__obj_) _Tp(std::forward<_Args>(__args)...);
-    return __obj_;
+  _LIBCPP_HIDE_FROM_ABI _Tp& __emplace(_Args&&... __args) {
+    return *(::new ((void*)__obj_) _Tp(std::forward<_Args>(__args)...));
   }
 
-  _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp& __get() { return __obj_; }
-  _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp const& __get() const { return __obj_; }
+  _LIBCPP_HIDE_FROM_ABI _Tp& __get() { return *reinterpret_cast<_Tp*>(__obj_); }
+  _LIBCPP_HIDE_FROM_ABI _Tp const& __get() const { return *reinterpret_cast<const _Tp*>(__obj_); }
 
 private:
-  union {
-    _Tp __obj_;
-    char __dummy_; // so we can initialize a member even with __uninitialized_tag for constexpr-friendliness
-  };
+  _ALIGNAS_TYPE(_Tp) char __obj_[sizeof(_Tp)];
 };
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/format b/libcxx/include/format
index f1e87de0f830..07c2ba083199 100644
--- a/libcxx/include/format
+++ b/libcxx/include/format
@@ -193,6 +193,8 @@ namespace std {
 #include <__format/concepts.h>
 #include <__format/container_adaptor.h>
 #include <__format/enable_insertable.h>
+#include <__format/escaped_output_table.h>
+#include <__format/extended_grapheme_cluster_table.h>
 #include <__format/format_arg.h>
 #include <__format/format_arg_store.h>
 #include <__format/format_args.h>
diff --git a/libcxx/include/mutex b/libcxx/include/mutex
index 12fae9a88b9d..0d2b5914bc4f 100644
--- a/libcxx/include/mutex
+++ b/libcxx/include/mutex
@@ -427,10 +427,10 @@ class _LIBCPP_TEMPLATE_VIS scoped_lock;
 template <>
 class _LIBCPP_TEMPLATE_VIS scoped_lock<> {
 public:
-  explicit scoped_lock() {}
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit scoped_lock() {}
   ~scoped_lock() = default;
 
-  _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(adopt_lock_t) {}
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(adopt_lock_t) {}
 
   scoped_lock(scoped_lock const&)            = delete;
   scoped_lock& operator=(scoped_lock const&) = delete;
@@ -445,13 +445,15 @@ private:
   mutex_type& __m_;
 
 public:
-  explicit scoped_lock(mutex_type& __m) _LIBCPP_THREAD_SAFETY_ANNOTATION(acquire_capability(__m)) : __m_(__m) {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(mutex_type& __m)
+      _LIBCPP_THREAD_SAFETY_ANNOTATION(acquire_capability(__m))
+      : __m_(__m) {
     __m_.lock();
   }
 
   ~scoped_lock() _LIBCPP_THREAD_SAFETY_ANNOTATION(release_capability()) { __m_.unlock(); }
 
-  _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(adopt_lock_t, mutex_type& __m)
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(adopt_lock_t, mutex_type& __m)
       _LIBCPP_THREAD_SAFETY_ANNOTATION(requires_capability(__m))
       : __m_(__m) {}
 
@@ -465,9 +467,11 @@ class _LIBCPP_TEMPLATE_VIS scoped_lock {
   typedef tuple<_MArgs&...> _MutexTuple;
 
 public:
-  _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(_MArgs&... __margs) : __t_(__margs...) { std::lock(__margs...); }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(_MArgs&... __margs) : __t_(__margs...) {
+    std::lock(__margs...);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI scoped_lock(adopt_lock_t, _MArgs&... __margs) : __t_(__margs...) {}
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI scoped_lock(adopt_lock_t, _MArgs&... __margs) : __t_(__margs...) {}
 
   _LIBCPP_HIDE_FROM_ABI ~scoped_lock() {
     typedef typename __make_tuple_indices<sizeof...(_MArgs)>::type _Indices;
diff --git a/libcxx/include/stdexcept b/libcxx/include/stdexcept
index 4e4cd22a6a64..853c185187c7 100644
--- a/libcxx/include/stdexcept
+++ b/libcxx/include/stdexcept
@@ -17,14 +17,14 @@ namespace std
 {
 
 class logic_error;
-    class domain_error;
-    class invalid_argument;
-    class length_error;
-    class out_of_range;
+class domain_error;
+class invalid_argument;
+class length_error;
+class out_of_range;
 class runtime_error;
-    class range_error;
-    class overflow_error;
-    class underflow_error;
+class range_error;
+class overflow_error;
+class underflow_error;
 
 for each class xxx_error:
 
diff --git a/libcxx/include/streambuf b/libcxx/include/streambuf
index 7964758c908f..a5b4ab9520ae 100644
--- a/libcxx/include/streambuf
+++ b/libcxx/include/streambuf
@@ -107,10 +107,12 @@ protected:
 
 */
 
+#include <__assert>
 #include <__config>
 #include <__fwd/streambuf.h>
 #include <__locale>
 #include <__type_traits/is_same.h>
+#include <__utility/is_valid_range.h>
 #include <climits>
 #include <ios>
 #include <iosfwd>
@@ -234,6 +236,9 @@ protected:
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 void gbump(int __n) { __ninp_ += __n; }
 
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 void setg(char_type* __gbeg, char_type* __gnext, char_type* __gend) {
+    _LIBCPP_ASSERT_VALID_INPUT_RANGE(std::__is_valid_range(__gbeg, __gnext), "[gbeg, gnext) must be a valid range");
+    _LIBCPP_ASSERT_VALID_INPUT_RANGE(std::__is_valid_range(__gbeg, __gend), "[gbeg, gend) must be a valid range");
+    _LIBCPP_ASSERT_VALID_INPUT_RANGE(std::__is_valid_range(__gnext, __gend), "[gnext, gend) must be a valid range");
     __binp_ = __gbeg;
     __ninp_ = __gnext;
     __einp_ = __gend;
@@ -249,6 +254,7 @@ protected:
   _LIBCPP_HIDE_FROM_ABI void __pbump(streamsize __n) { __nout_ += __n; }
 
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 void setp(char_type* __pbeg, char_type* __pend) {
+    _LIBCPP_ASSERT_VALID_INPUT_RANGE(std::__is_valid_range(__pbeg, __pend), "[pbeg, pend) must be a valid range");
     __bout_ = __nout_ = __pbeg;
     __eout_           = __pend;
   }
diff --git a/libcxx/include/variant b/libcxx/include/variant
index 858a49b980bd..34150bd45284 100644
--- a/libcxx/include/variant
+++ b/libcxx/include/variant
@@ -657,6 +657,10 @@ private:
 
 } // namespace __visitation
 
+// Adding semi-colons in macro expansions helps clang-format to do a better job.
+// This macro is used to avoid compilation errors due to "stray" semi-colons.
+#  define _LIBCPP_EAT_SEMICOLON static_assert(true, "")
+
 template <size_t _Index, class _Tp>
 struct _LIBCPP_TEMPLATE_VIS __alt {
   using __value_type = _Tp;
@@ -691,11 +695,10 @@ union _LIBCPP_TEMPLATE_VIS __union<_DestructibleTrait, _Index> {};
       __union(const __union&) = default;                                                                               \
       __union(__union&&)      = default;                                                                               \
                                                                                                                        \
-      destructor                                                                                                       \
+      destructor;                                                                                                      \
                                                                                                                        \
-          __union&                                                                                                     \
-          operator=(const __union&) = default;                                                                         \
-      __union& operator=(__union&&) = default;                                                                         \
+      __union& operator=(const __union&) = default;                                                                    \
+      __union& operator=(__union&&)      = default;                                                                    \
                                                                                                                        \
     private:                                                                                                           \
       char __dummy;                                                                                                    \
@@ -705,9 +708,10 @@ union _LIBCPP_TEMPLATE_VIS __union<_DestructibleTrait, _Index> {};
       friend struct __access::__union;                                                                                 \
     }
 
-_LIBCPP_VARIANT_UNION(_Trait::_TriviallyAvailable, ~__union() = default;);
-_LIBCPP_VARIANT_UNION(_Trait::_Available, ~__union(){});
-_LIBCPP_VARIANT_UNION(_Trait::_Unavailable, ~__union() = delete;);
+_LIBCPP_VARIANT_UNION(_Trait::_TriviallyAvailable, ~__union() = default);
+_LIBCPP_VARIANT_UNION(
+    _Trait::_Available, _LIBCPP_HIDE_FROM_ABI ~__union() {} _LIBCPP_EAT_SEMICOLON);
+_LIBCPP_VARIANT_UNION(_Trait::_Unavailable, ~__union() = delete);
 
 #  undef _LIBCPP_VARIANT_UNION
 
@@ -761,23 +765,27 @@ class _LIBCPP_TEMPLATE_VIS __dtor;
       using __base_type::__base_type;                                                                                  \
       using __base_type::operator=;                                                                                    \
                                                                                                                        \
-      __dtor(const __dtor&)                       = default;                                                           \
-      __dtor(__dtor&&)                            = default;                                                           \
-      destructor __dtor& operator=(const __dtor&) = default;                                                           \
-      __dtor& operator=(__dtor&&)                 = default;                                                           \
+      __dtor(const __dtor&)            = default;                                                                      \
+      __dtor(__dtor&&)                 = default;                                                                      \
+      __dtor& operator=(const __dtor&) = default;                                                                      \
+      __dtor& operator=(__dtor&&)      = default;                                                                      \
+      destructor;                                                                                                      \
                                                                                                                        \
     protected:                                                                                                         \
-      inline _LIBCPP_HIDE_FROM_ABI destroy                                                                             \
+      inline _LIBCPP_HIDE_FROM_ABI destroy;                                                                            \
     }
 
 _LIBCPP_VARIANT_DESTRUCTOR(
-    _Trait::_TriviallyAvailable, ~__dtor() = default;
-    , void __destroy() noexcept { this->__index = __variant_npos<__index_t>; });
+    _Trait::_TriviallyAvailable,
+    ~__dtor() = default, //
+    _LIBCPP_HIDE_FROM_ABI void __destroy() noexcept {
+      this->__index = __variant_npos<__index_t>;
+    } _LIBCPP_EAT_SEMICOLON);
 
 _LIBCPP_VARIANT_DESTRUCTOR(
     _Trait::_Available,
-    ~__dtor() { __destroy(); },
-    void __destroy() noexcept {
+    _LIBCPP_HIDE_FROM_ABI ~__dtor() { __destroy(); } _LIBCPP_EAT_SEMICOLON,
+    _LIBCPP_HIDE_FROM_ABI void __destroy() noexcept {
       if (!this->valueless_by_exception()) {
         __visitation::__base::__visit_alt(
             [](auto& __alt) noexcept {
@@ -787,9 +795,9 @@ _LIBCPP_VARIANT_DESTRUCTOR(
             *this);
       }
       this->__index = __variant_npos<__index_t>;
-    });
+    } _LIBCPP_EAT_SEMICOLON);
 
-_LIBCPP_VARIANT_DESTRUCTOR(_Trait::_Unavailable, ~__dtor() = delete;, void __destroy() noexcept = delete;);
+_LIBCPP_VARIANT_DESTRUCTOR(_Trait::_Unavailable, ~__dtor() = delete, void __destroy() noexcept = delete);
 
 #  undef _LIBCPP_VARIANT_DESTRUCTOR
 
@@ -839,20 +847,24 @@ class _LIBCPP_TEMPLATE_VIS __move_constructor;
       using __base_type::operator=;                                                                                    \
                                                                                                                        \
       __move_constructor(const __move_constructor&)            = default;                                              \
-      move_constructor ~__move_constructor()                   = default;                                              \
+      ~__move_constructor()                                    = default;                                              \
       __move_constructor& operator=(const __move_constructor&) = default;                                              \
       __move_constructor& operator=(__move_constructor&&)      = default;                                              \
+      move_constructor;                                                                                                \
     }
 
 _LIBCPP_VARIANT_MOVE_CONSTRUCTOR(_Trait::_TriviallyAvailable,
-                                 __move_constructor(__move_constructor&& __that) = default;);
+                                 __move_constructor(__move_constructor&& __that) = default);
 
 _LIBCPP_VARIANT_MOVE_CONSTRUCTOR(
     _Trait::_Available,
-    __move_constructor(__move_constructor&& __that) noexcept(__all<is_nothrow_move_constructible_v<_Types>...>::value)
-    : __move_constructor(__valueless_t{}) { this->__generic_construct(*this, std::move(__that)); });
+    _LIBCPP_HIDE_FROM_ABI __move_constructor(__move_constructor&& __that) noexcept(
+        __all<is_nothrow_move_constructible_v<_Types>...>::value)
+    : __move_constructor(__valueless_t{}) {
+      this->__generic_construct(*this, std::move(__that));
+    } _LIBCPP_EAT_SEMICOLON);
 
-_LIBCPP_VARIANT_MOVE_CONSTRUCTOR(_Trait::_Unavailable, __move_constructor(__move_constructor&&) = delete;);
+_LIBCPP_VARIANT_MOVE_CONSTRUCTOR(_Trait::_Unavailable, __move_constructor(__move_constructor&&) = delete);
 
 #  undef _LIBCPP_VARIANT_MOVE_CONSTRUCTOR
 
@@ -869,20 +881,21 @@ class _LIBCPP_TEMPLATE_VIS __copy_constructor;
       using __base_type::__base_type;                                                                                  \
       using __base_type::operator=;                                                                                    \
                                                                                                                        \
-      copy_constructor __copy_constructor(__copy_constructor&&) = default;                                             \
-      ~__copy_constructor()                                     = default;                                             \
-      __copy_constructor& operator=(const __copy_constructor&)  = default;                                             \
-      __copy_constructor& operator=(__copy_constructor&&)       = default;                                             \
-    }
+      __copy_constructor(__copy_constructor&&)                 = default;                                              \
+      ~__copy_constructor()                                    = default;                                              \
+      __copy_constructor& operator=(const __copy_constructor&) = default;                                              \
+      __copy_constructor& operator=(__copy_constructor&&)      = default;                                              \
+      copy_constructor;                                                                                                \
+    } // namespace __variant_detail
 
 _LIBCPP_VARIANT_COPY_CONSTRUCTOR(_Trait::_TriviallyAvailable,
-                                 __copy_constructor(const __copy_constructor& __that) = default;);
+                                 __copy_constructor(const __copy_constructor& __that) = default);
 
 _LIBCPP_VARIANT_COPY_CONSTRUCTOR(
-    _Trait::_Available, __copy_constructor(const __copy_constructor& __that)
-    : __copy_constructor(__valueless_t{}) { this->__generic_construct(*this, __that); });
+    _Trait::_Available, _LIBCPP_HIDE_FROM_ABI __copy_constructor(const __copy_constructor& __that)
+    : __copy_constructor(__valueless_t{}) { this->__generic_construct(*this, __that); } _LIBCPP_EAT_SEMICOLON);
 
-_LIBCPP_VARIANT_COPY_CONSTRUCTOR(_Trait::_Unavailable, __copy_constructor(const __copy_constructor&) = delete;);
+_LIBCPP_VARIANT_COPY_CONSTRUCTOR(_Trait::_Unavailable, __copy_constructor(const __copy_constructor&) = delete);
 
 #  undef _LIBCPP_VARIANT_COPY_CONSTRUCTOR
 
@@ -955,22 +968,22 @@ class _LIBCPP_TEMPLATE_VIS __move_assignment;
       __move_assignment(__move_assignment&&)                 = default;                                                \
       ~__move_assignment()                                   = default;                                                \
       __move_assignment& operator=(const __move_assignment&) = default;                                                \
-      move_assignment                                                                                                  \
+      move_assignment;                                                                                                 \
     }
 
 _LIBCPP_VARIANT_MOVE_ASSIGNMENT(_Trait::_TriviallyAvailable,
-                                __move_assignment& operator=(__move_assignment&& __that) = default;);
+                                __move_assignment& operator=(__move_assignment&& __that) = default);
 
 _LIBCPP_VARIANT_MOVE_ASSIGNMENT(
     _Trait::_Available,
-    __move_assignment&
+    _LIBCPP_HIDE_FROM_ABI __move_assignment&
     operator=(__move_assignment&& __that) noexcept(
         __all<(is_nothrow_move_constructible_v<_Types> && is_nothrow_move_assignable_v<_Types>)...>::value) {
       this->__generic_assign(std::move(__that));
       return *this;
-    });
+    } _LIBCPP_EAT_SEMICOLON);
 
-_LIBCPP_VARIANT_MOVE_ASSIGNMENT(_Trait::_Unavailable, __move_assignment& operator=(__move_assignment&&) = delete;);
+_LIBCPP_VARIANT_MOVE_ASSIGNMENT(_Trait::_Unavailable, __move_assignment& operator=(__move_assignment&&) = delete);
 
 #  undef _LIBCPP_VARIANT_MOVE_ASSIGNMENT
 
@@ -987,22 +1000,23 @@ class _LIBCPP_TEMPLATE_VIS __copy_assignment;
       using __base_type::__base_type;                                                                                  \
       using __base_type::operator=;                                                                                    \
                                                                                                                        \
-      __copy_assignment(const __copy_assignment&)                       = default;                                     \
-      __copy_assignment(__copy_assignment&&)                            = default;                                     \
-      ~__copy_assignment()                                              = default;                                     \
-      copy_assignment __copy_assignment& operator=(__copy_assignment&&) = default;                                     \
+      __copy_assignment(const __copy_assignment&)       = default;                                                     \
+      __copy_assignment(__copy_assignment&&)            = default;                                                     \
+      ~__copy_assignment()                              = default;                                                     \
+      __copy_assignment& operator=(__copy_assignment&&) = default;                                                     \
+      copy_assignment;                                                                                                 \
     }
 
 _LIBCPP_VARIANT_COPY_ASSIGNMENT(_Trait::_TriviallyAvailable,
-                                __copy_assignment& operator=(const __copy_assignment& __that) = default;);
+                                __copy_assignment& operator=(const __copy_assignment& __that) = default);
 
 _LIBCPP_VARIANT_COPY_ASSIGNMENT(
-    _Trait::_Available, __copy_assignment& operator=(const __copy_assignment& __that) {
+    _Trait::_Available, _LIBCPP_HIDE_FROM_ABI __copy_assignment& operator=(const __copy_assignment& __that) {
       this->__generic_assign(__that);
       return *this;
-    });
+    } _LIBCPP_EAT_SEMICOLON);
 
-_LIBCPP_VARIANT_COPY_ASSIGNMENT(_Trait::_Unavailable, __copy_assignment& operator=(const __copy_assignment&) = delete;);
+_LIBCPP_VARIANT_COPY_ASSIGNMENT(_Trait::_Unavailable, __copy_assignment& operator=(const __copy_assignment&) = delete);
 
 #  undef _LIBCPP_VARIANT_COPY_ASSIGNMENT
 
diff --git a/libcxx/modules/std/ranges.inc b/libcxx/modules/std/ranges.inc
index 7d215867a431..80f31c79a1a4 100644
--- a/libcxx/modules/std/ranges.inc
+++ b/libcxx/modules/std/ranges.inc
@@ -141,6 +141,17 @@ export namespace std {
 #if _LIBCPP_STD_VER >= 23
     // [range.adaptor.object], range adaptor objects
     using std::ranges::range_adaptor_closure;
+    // Note: This declaration not in the synopsis or explicitly in the wording.
+    // However it is needed for the range adaptors.
+    // [range.adaptor.object]/3
+    //   The template parameter D for range_adaptor_closure may be an
+    //   incomplete type. If an expression of type cv D is used as an operand
+    //   to the | operator, D shall be complete and model
+    //   derived_from<range_adaptor_closure<D>>. The behavior of an expression
+    //   involving an object of type cv D as an operand to the | operator is
+    //   undefined if overload resolution selects a program-defined operator|
+    //   function.
+    using std::ranges::operator|;
 #endif
 
     // [range.all], all view
diff --git a/libcxx/test/libcxx/diagnostics/mutex.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/mutex.nodiscard.verify.cpp
index a98eb5f14211..b9890ced55bb 100644
--- a/libcxx/test/libcxx/diagnostics/mutex.nodiscard.verify.cpp
+++ b/libcxx/test/libcxx/diagnostics/mutex.nodiscard.verify.cpp
@@ -12,14 +12,58 @@
 
 // check that <mutex> functions are marked [[nodiscard]]
 
-// clang-format off
-
 #include <mutex>
+#include <chrono>
+#include <utility>
 
 #include "test_macros.h"
 
 void test() {
-  std::mutex mutex;
-  std::lock_guard<std::mutex>{mutex};                  // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
-  std::lock_guard<std::mutex>{mutex, std::adopt_lock}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+  // std::scoped_lock
+  {
+#if TEST_STD_VER >= 17
+    using M = std::mutex;
+    M m0, m1, m2;
+    // clang-format off
+    std::scoped_lock<>{};                                   // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::scoped_lock<M>{m0};                                // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::scoped_lock<M, M>{m0, m1};                         // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::scoped_lock<M, M, M>{m0, m1, m2};                  // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+
+    std::scoped_lock<>{std::adopt_lock};                    // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::scoped_lock<M>{std::adopt_lock, m0};               // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::scoped_lock<M, M>{std::adopt_lock, m0, m1};        // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::scoped_lock<M, M, M>{std::adopt_lock, m0, m1, m2}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    // clang-format on
+#endif
+  }
+
+  // std::unique_lock
+  {
+    using M = std::timed_mutex; // necessary for the time_point and duration constructors
+    M m;
+    std::chrono::time_point<std::chrono::steady_clock> time_point;
+    std::chrono::milliseconds duration;
+    std::unique_lock<M> other;
+
+    // clang-format off
+    std::unique_lock<M>{};                      // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::unique_lock<M>{m};                     // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::unique_lock<M>{m, std::defer_lock};    // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::unique_lock<M>{m, std::try_to_lock};   // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::unique_lock<M>{m, std::adopt_lock};    // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::unique_lock<M>{m, time_point};         // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::unique_lock<M>{m, duration};           // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::unique_lock<M>(std::move(other));      // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    // clang-format on
+  }
+
+  // std::lock_guard
+  {
+    std::mutex m;
+    // clang-format off
+    std::lock_guard<std::mutex>{m};                  // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    std::lock_guard<std::mutex>{m, std::adopt_lock}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+    // clang-format on
+  }
 }
diff --git a/libcxx/test/libcxx/thread/thread.lock/thread.lock.guard/nodiscard.verify.cpp b/libcxx/test/libcxx/thread/thread.lock/thread.lock.guard/nodiscard.verify.cpp
deleted file mode 100644
index 5fe68c83b3d9..000000000000
--- a/libcxx/test/libcxx/thread/thread.lock/thread.lock.guard/nodiscard.verify.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: no-threads
-
-// [[nodiscard]] isn't supported in C++03 (not even as an extension)
-// UNSUPPORTED: c++03
-
-// <mutex>
-
-// template <class Mutex> class lock_guard;
-
-// [[nodiscard]] explicit lock_guard(mutex_type& m);
-// [[nodiscard]] lock_guard(mutex_type& m, adopt_lock_t);
-
-// Test that we properly apply [[nodiscard]] to lock_guard's constructors,
-// which is a libc++ extension.
-
-#include <mutex>
-
-void f() {
-    std::mutex m;
-    std::lock_guard<std::mutex>{m}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
-    std::lock_guard<std::mutex>{m, std::adopt_lock}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
-}
diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv
index 6b80790a9d19..7d31ba160ee1 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx20.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv
@@ -129,6 +129,7 @@ chrono cwchar
 chrono forward_list
 chrono limits
 chrono locale
+chrono new
 chrono optional
 chrono ostream
 chrono ratio
diff --git a/libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp b/libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp
index 881b0bd85190..03dd0f6eac53 100644
--- a/libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp
@@ -9,10 +9,10 @@
 #include <__type_traits/datasizeof.h>
 #include <cstdint>
 
-static_assert(std::__libcpp_datasizeof<std::int8_t>::value == 1, "");
-static_assert(std::__libcpp_datasizeof<std::int16_t>::value == 2, "");
-static_assert(std::__libcpp_datasizeof<std::int32_t>::value == 4, "");
-static_assert(std::__libcpp_datasizeof<std::int64_t>::value == 8, "");
+static_assert(std::__datasizeof_v<std::int8_t> == 1, "");
+static_assert(std::__datasizeof_v<std::int16_t> == 2, "");
+static_assert(std::__datasizeof_v<std::int32_t> == 4, "");
+static_assert(std::__datasizeof_v<std::int64_t> == 8, "");
 
 struct OneBytePadding {
   OneBytePadding() {}
@@ -22,9 +22,9 @@ struct OneBytePadding {
 };
 
 #if defined(_WIN32) && !defined(__MINGW32__)
-static_assert(std::__libcpp_datasizeof<OneBytePadding>::value == 4, "");
+static_assert(std::__datasizeof_v<OneBytePadding> == 4, "");
 #else
-static_assert(std::__libcpp_datasizeof<OneBytePadding>::value == 3, "");
+static_assert(std::__datasizeof_v<OneBytePadding> == 3, "");
 #endif
 
 struct InBetweenPadding {
@@ -35,4 +35,4 @@ struct InBetweenPadding {
   std::int16_t c;
 };
 
-static_assert(std::__libcpp_datasizeof<InBetweenPadding>::value == 8, "");
+static_assert(std::__datasizeof_v<InBetweenPadding> == 8, "");
diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
index cf1909b92873..580c0f4ae10c 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
@@ -47,28 +47,28 @@ static_assert(sizeof(std::expected<B, B>) == sizeof(B));
 
 // Check that `expected`'s datasize is large enough for the parameter type(s).
 static_assert(sizeof(std::expected<BoolWithPadding, Empty>) ==
-              std::__libcpp_datasizeof<std::expected<BoolWithPadding, Empty>>::value);
+              std::__datasizeof_v<std::expected<BoolWithPadding, Empty>>);
 static_assert(sizeof(std::expected<Empty, BoolWithPadding>) ==
-              std::__libcpp_datasizeof<std::expected<Empty, BoolWithPadding>>::value);
+              std::__datasizeof_v<std::expected<Empty, BoolWithPadding>>);
 
 // In this case, there should be tail padding in the `expected` because `A`
 // itself does _not_ have tail padding.
-static_assert(sizeof(std::expected<A, A>) > std::__libcpp_datasizeof<std::expected<A, A>>::value);
+static_assert(sizeof(std::expected<A, A>) > std::__datasizeof_v<std::expected<A, A>>);
 
 // Test with some real types.
 static_assert(sizeof(std::expected<std::optional<int>, int>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<std::optional<int>, int>>::value == 8);
+static_assert(std::__datasizeof_v<std::expected<std::optional<int>, int>> == 8);
 
 static_assert(sizeof(std::expected<int, std::optional<int>>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<int, std::optional<int>>>::value == 8);
+static_assert(std::__datasizeof_v<std::expected<int, std::optional<int>>> == 8);
 
 static_assert(sizeof(std::expected<int, int>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<int, int>>::value == 5);
+static_assert(std::__datasizeof_v<std::expected<int, int>> == 5);
 
 // clang-format off
-static_assert(std::__libcpp_datasizeof<int>::value == 4);
-static_assert(std::__libcpp_datasizeof<std::expected<int, int>>::value == 5);
-static_assert(std::__libcpp_datasizeof<std::expected<std::expected<int, int>, int>>::value == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<std::expected<std::expected<int, int>, int>, int>>::value == 9);
-static_assert(std::__libcpp_datasizeof<std::expected<std::expected<std::expected<std::expected<int, int>, int>, int>, int>>::value == 12);
+static_assert(std::__datasizeof_v<int> == 4);
+static_assert(std::__datasizeof_v<std::expected<int, int>> == 5);
+static_assert(std::__datasizeof_v<std::expected<std::expected<int, int>, int>> == 8);
+static_assert(std::__datasizeof_v<std::expected<std::expected<std::expected<int, int>, int>, int>> == 9);
+static_assert(std::__datasizeof_v<std::expected<std::expected<std::expected<std::expected<int, int>, int>, int>, int>> == 12);
 // clang-format on
diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
index fdee8b71e5d9..27da03c54ac4 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
@@ -45,23 +45,23 @@ static_assert(sizeof(std::expected<void, B>) == sizeof(B));
 
 // Check that `expected`'s datasize is large enough for the parameter type(s).
 static_assert(sizeof(std::expected<void, BoolWithPadding>) ==
-              std::__libcpp_datasizeof<std::expected<void, BoolWithPadding>>::value);
+              std::__datasizeof_v<std::expected<void, BoolWithPadding>>);
 
 // In this case, there should be tail padding in the `expected` because `A`
 // itself does _not_ have tail padding.
-static_assert(sizeof(std::expected<void, A>) > std::__libcpp_datasizeof<std::expected<void, A>>::value);
+static_assert(sizeof(std::expected<void, A>) > std::__datasizeof_v<std::expected<void, A>>);
 
 // Test with some real types.
 static_assert(sizeof(std::expected<void, std::optional<int>>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<void, std::optional<int>>>::value == 8);
+static_assert(std::__datasizeof_v<std::expected<void, std::optional<int>>> == 8);
 
 static_assert(sizeof(std::expected<void, int>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<void, int>>::value == 5);
+static_assert(std::__datasizeof_v<std::expected<void, int>> == 5);
 
 // clang-format off
-static_assert(std::__libcpp_datasizeof<int>::value == 4);
-static_assert(std::__libcpp_datasizeof<std::expected<void, int>>::value == 5);
-static_assert(std::__libcpp_datasizeof<std::expected<void, std::expected<void, int>>>::value == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<void, std::expected<void, std::expected<void, int>>>>::value == 9);
-static_assert(std::__libcpp_datasizeof<std::expected<void, std::expected<void, std::expected<void, std::expected<void, int>>>>>::value == 12);
+static_assert(std::__datasizeof_v<int> == 4);
+static_assert(std::__datasizeof_v<std::expected<void, int>> == 5);
+static_assert(std::__datasizeof_v<std::expected<void, std::expected<void, int>>> == 8);
+static_assert(std::__datasizeof_v<std::expected<void, std::expected<void, std::expected<void, int>>>> == 9);
+static_assert(std::__datasizeof_v<std::expected<void, std::expected<void, std::expected<void, std::expected<void, int>>>>> == 12);
 // clang-format on
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
new file mode 100644
index 000000000000..5b1191642c9a
--- /dev/null
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
@@ -0,0 +1,102 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
+
+// <format>
+
+// Tests the properties of the Unicode escaped output table.
+// The libc++ algorithm has size and speed optimizations based on the properties
+// of Unicode. This means updating the Unicode tables has a likilihood of
+// breaking test. This is an assert; it requires validating whether the
+// assumptions of the size and speed optimizations are still valid.
+
+#include <algorithm>
+#include <numeric>
+#include <format>
+#include <cassert>
+
+// Contains the entries for [format.string.escaped]/2.2.1.2.1
+//   CE is a Unicode encoding and C corresponds to a UCS scalar value whose
+//   Unicode property General_Category has a value in the groups Separator (Z)
+//   or Other (C), as described by table 12 of UAX #44
+//
+// Separator (Z) consists of General_Category
+// - Zs Space_Separator,
+// - Zl Line_Separator,
+// - Zp Paragraph_Separator.
+//
+// Other (C) consists of General_Category
+// - Cc Control,
+// - Cf Format,
+// - Cs Surrogate,
+// - Co Private_Use,
+// - Cn Unassigned.
+inline constexpr int Zs = 17;
+inline constexpr int Zl = 1;
+inline constexpr int Zp = 1;
+inline constexpr int Z  = Zs + Zl + Zp;
+
+inline constexpr int Cc = 65;
+inline constexpr int Cf = 170;
+inline constexpr int Cs = 2'048;
+inline constexpr int Co = 137'468;
+inline constexpr int Cn = 824'718;
+inline constexpr int C  = Cc + Cf + Cs + Co + Cn;
+
+// This is the final part of the Unicode properties table:
+//
+// 31350..323AF  ; Lo # [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
+// 323B0..E0000  ; Cn # [711761] <reserved-323B0>..<reserved-E0000>
+// E0001         ; Cf #       LANGUAGE TAG
+// E0002..E001F  ; Cn #  [30] <reserved-E0002>..<reserved-E001F>
+// E0020..E007F  ; Cf #  [96] TAG SPACE..CANCEL TAG
+// E0080..E00FF  ; Cn # [128] <reserved-E0080>..<reserved-E00FF>
+// E0100..E01EF  ; Mn # [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
+// E01F0..EFFFF  ; Cn # [65040] <reserved-E01F0>..<noncharacter-EFFFF>
+// F0000..FFFFD  ; Co # [65534] <private-use-F0000>..<private-use-FFFFD>
+// FFFFE..FFFFF  ; Cn #   [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
+// 100000..10FFFD; Co # [65534] <private-use-100000>..<private-use-10FFFD>
+// 10FFFE..10FFFF; Cn #   [2] <noncharacter-10FFFE>..<noncharacter-10FFFF>
+//
+// It can be observed all entries in the range 323B0..10FFFF are in the
+// categories Cf, Co, Cn, except a small range with the property Mn.
+// In order to reduce the size of the table only the entires in the range
+// [0000, 323B0) are stored in the table. The entries in the range
+// [323B0, 10FFFF] use a hand-crafted algorithm.
+//
+// This means a number of entries are omitted
+inline constexpr int excluded = ((0x10FFFF - 0x323B0) + 1) - 240;
+
+inline constexpr int entries = Z + C - excluded;
+
+static constexpr int count_entries() {
+  return std::transform_reduce(
+      std::begin(std::__escaped_output_table::__entries),
+      std::end(std::__escaped_output_table::__entries),
+      0,
+      std::plus{},
+      [](auto entry) { return 1 + static_cast<int>(entry & 0x3fffu); });
+}
+static_assert(count_entries() == entries);
+
+int main(int, char**) {
+  for (char32_t c = 0x31350; c <= 0x323AF; ++c) // 31350..323AF  ; Lo # [4192]
+    assert(std::__escaped_output_table::__needs_escape(c) == false);
+
+  for (char32_t c = 0x323B0; c <= 0xE00FF; ++c) // 323B0..E00FF ; C
+    assert(std::__escaped_output_table::__needs_escape(c) == true);
+
+  for (char32_t c = 0xE0100; c <= 0xE01EF; ++c) // E0100..E01EF  ; Mn # [240]
+    assert(std::__escaped_output_table::__needs_escape(c) == false);
+
+  for (char32_t c = 0xE01F0; c <= 0x10FFFF; ++c) // E01F0..10FFFF; C
+    assert(std::__escaped_output_table::__needs_escape(c) == true);
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/utilities/no_destroy.pass.cpp b/libcxx/test/libcxx/utilities/no_destroy.pass.cpp
new file mode 100644
index 000000000000..9a874a640753
--- /dev/null
+++ b/libcxx/test/libcxx/utilities/no_destroy.pass.cpp
@@ -0,0 +1,31 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <__utility/no_destroy.h>
+#include <cassert>
+
+#include "test_macros.h"
+
+#if TEST_STD_VER > 17
+// Test constexpr-constructibility.
+constinit std::__no_destroy<int> nd_int_const(std::__uninitialized_tag{});
+#endif
+
+struct DestroyLast {
+  ~DestroyLast() { assert(*ptr == 5); }
+
+  int* ptr;
+} last;
+
+static std::__no_destroy<int> nd_int(5);
+
+int main(int, char**) {
+  last.ptr = &nd_int.__get();
+
+  return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_add.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_add.pass.cpp
index 7350c1ddf0e9..4119c39772e5 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_add.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_add.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// UNSUPPORTED: LIBCXX-AIX-FIXME
 // XFAIL: !has-64-bit-atomics
 
 // https://github.com/llvm/llvm-project/issues/72893
diff --git a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_sub.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_sub.pass.cpp
index 84dcde5f2784..2460765a3c86 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_sub.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/fetch_sub.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// UNSUPPORTED: LIBCXX-AIX-FIXME
 // XFAIL: !has-64-bit-atomics
 
 // https://github.com/llvm/llvm-project/issues/72893
diff --git a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.minus_equals.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.minus_equals.pass.cpp
index 386a393e3550..4bd303022c0d 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.minus_equals.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.minus_equals.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// UNSUPPORTED: LIBCXX-AIX-FIXME
 // XFAIL: !has-64-bit-atomics
 
 // floating-point-type operator-=(floating-point-type) volatile noexcept;
diff --git a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.plus_equals.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.plus_equals.pass.cpp
index afd06d537c7a..69abb9ae63c3 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.plus_equals.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/operator.plus_equals.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 // UNSUPPORTED: c++03, c++11, c++14, c++17
-// UNSUPPORTED: LIBCXX-AIX-FIXME
 // XFAIL: !has-64-bit-atomics
 
 // floating-point-type operator+=(floating-point-type) volatile noexcept;
diff --git a/libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp b/libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp
index 209e24964807..7ba56577d1bb 100644
--- a/libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp
@@ -46,7 +46,7 @@ void test_type() {
     static_assert(!std::is_empty<Array>::value, "");
 
     // Make sure empty arrays don't have padding bytes
-    LIBCPP_STATIC_ASSERT(std::__libcpp_datasizeof<Array>::value == sizeof(Array), "");
+    LIBCPP_STATIC_ASSERT(std::__datasizeof_v<Array> == sizeof(Array), "");
   }
 
   {
diff --git a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.cons/copy.pass.cpp b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.cons/copy.pass.cpp
index 580675119507..b458f93601a1 100644
--- a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.cons/copy.pass.cpp
+++ b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.cons/copy.pass.cpp
@@ -57,18 +57,20 @@ int main(int, char**)
         test<char> t2 = t;
     }
     {
-        char g1, g2, g3, p1, p3;
+        char g[3];
+        char p[3];
         test<char> t;
-        t.setg(&g1, &g2, &g3);
-        t.setp(&p1, &p3);
+        t.setg(&g[0], &g[1], &g[2]);
+        t.setp(&p[0], &p[2]);
         test<char> t2 = t;
     }
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
     {
-        wchar_t g1, g2, g3, p1, p3;
+        wchar_t g[3];
+        wchar_t p[3];
         test<wchar_t> t;
-        t.setg(&g1, &g2, &g3);
-        t.setp(&p1, &p3);
+        t.setg(&g[0], &g[1], &g[2]);
+        t.setp(&p[0], &p[2]);
         test<wchar_t> t2 = t;
     }
     {
diff --git a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.assign/assign.pass.cpp b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.assign/assign.pass.cpp
index 8a976e77f0f1..45a8cdf3a23f 100644
--- a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.assign/assign.pass.cpp
+++ b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.assign/assign.pass.cpp
@@ -59,10 +59,11 @@ int main(int, char**)
         t2 = t;
     }
     {
-        char g1, g2, g3, p1, p3;
+        char g[3];
+        char p[3];
         test<char> t;
-        t.setg(&g1, &g2, &g3);
-        t.setp(&p1, &p3);
+        t.setg(&g[0], &g[1], &g[2]);
+        t.setp(&p[0], &p[2]);
         test<char> t2;
         t2 = t;
     }
@@ -73,10 +74,11 @@ int main(int, char**)
         t2 = t;
     }
     {
-        wchar_t g1, g2, g3, p1, p3;
+        wchar_t g[3];
+        wchar_t p[3];
         test<wchar_t> t;
-        t.setg(&g1, &g2, &g3);
-        t.setp(&p1, &p3);
+        t.setg(&g[0], &g[1], &g[2]);
+        t.setp(&p[0], &p[2]);
         test<wchar_t> t2;
         t2 = t;
     }
diff --git a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.assign/swap.pass.cpp b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.assign/swap.pass.cpp
index c575c2cb1271..b90c4c053c91 100644
--- a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.assign/swap.pass.cpp
+++ b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.assign/swap.pass.cpp
@@ -68,10 +68,11 @@ int main(int, char**)
         t2.swap(t);
     }
     {
-        char g1, g2, g3, p1, p3;
+        char g[3];
+        char p[3];
         test<char> t;
-        t.setg(&g1, &g2, &g3);
-        t.setp(&p1, &p3);
+        t.setg(&g[0], &g[1], &g[2]);
+        t.setp(&p[0], &p[2]);
         test<char> t2;
         t2.swap(t);
     }
@@ -82,10 +83,11 @@ int main(int, char**)
         t2.swap(t);
     }
     {
-        wchar_t g1, g2, g3, p1, p3;
+        wchar_t g[3];
+        wchar_t p[3];
         test<wchar_t> t;
-        t.setg(&g1, &g2, &g3);
-        t.setp(&p1, &p3);
+        t.setg(&g[0], &g[1], &g[2]);
+        t.setp(&p[0], &p[2]);
         test<wchar_t> t2;
         t2.swap(t);
     }
diff --git a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp
new file mode 100644
index 000000000000..becf89b12fdd
--- /dev/null
+++ b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp
@@ -0,0 +1,68 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: libcpp-hardening-mode=none
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+// <streambuf>
+
+// template <class charT, class traits = char_traits<charT> >
+// class basic_streambuf;
+
+// void setg(char_type* gbeg, char_type* gnext, char_type* gend);
+
+#include <algorithm>
+#include <iterator>
+#include <streambuf>
+#include <string>
+
+#include "check_assertion.h"
+#include "make_string.h"
+#include "test_macros.h"
+
+template <class CharT>
+struct streambuf : public std::basic_streambuf<CharT> {
+  typedef std::basic_streambuf<CharT> base;
+
+  streambuf() {}
+
+  void setg(CharT* gbeg, CharT* gnext, CharT* gend) { base::setg(gbeg, gnext, gend); }
+};
+
+template <class CharT>
+void test() {
+  std::basic_string<CharT> str = MAKE_STRING(CharT, "ABCDEF");
+  CharT arr[6];
+  std::copy(str.begin(), str.end(), arr);
+
+  {
+    streambuf<CharT> buff;
+    TEST_LIBCPP_ASSERT_FAILURE(
+        buff.setg(std::begin(arr) + 1, std::begin(arr), std::end(arr)), "[gbeg, gnext) must be a valid range");
+  }
+  {
+    streambuf<CharT> buff;
+    TEST_LIBCPP_ASSERT_FAILURE(
+        buff.setg(std::begin(arr) + 1, std::begin(arr) + 1, std::begin(arr)), "[gbeg, gend) must be a valid range");
+  }
+  {
+    streambuf<CharT> buff;
+    TEST_LIBCPP_ASSERT_FAILURE(
+        buff.setg(std::begin(arr), std::begin(arr) + 3, std::begin(arr) + 2), "[gnext, gend) must be a valid range");
+  }
+}
+
+int main(int, char**) {
+  test<char>();
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test<wchar_t>();
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp
new file mode 100644
index 000000000000..abd42272de50
--- /dev/null
+++ b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp
@@ -0,0 +1,57 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: libcpp-hardening-mode=none
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+// <streambuf>
+
+// template <class charT, class traits = char_traits<charT> >
+// class basic_streambuf;
+
+// void setp(char_type* pbeg, char_type* pend);
+
+#include <algorithm>
+#include <iterator>
+#include <streambuf>
+#include <string>
+
+#include "check_assertion.h"
+#include "make_string.h"
+#include "test_macros.h"
+
+template <class CharT>
+struct streambuf : public std::basic_streambuf<CharT> {
+  typedef std::basic_streambuf<CharT> base;
+
+  streambuf() {}
+
+  void setp(CharT* pbeg, CharT* pend) { base::setp(pbeg, pend); }
+};
+
+template <class CharT>
+void test() {
+  std::basic_string<CharT> str = MAKE_STRING(CharT, "ABCDEF");
+  CharT arr[6];
+  std::copy(str.begin(), str.end(), arr);
+
+  {
+    streambuf<CharT> buff;
+    TEST_LIBCPP_ASSERT_FAILURE(buff.setp(std::begin(arr) + 3, std::begin(arr)), "[pbeg, pend) must be a valid range");
+  }
+}
+
+int main(int, char**) {
+  test<char>();
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test<wchar_t>();
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/language.support/support.dynamic/hardware_inference_size.compile.pass.cpp b/libcxx/test/std/language.support/support.dynamic/hardware_inference_size.compile.pass.cpp
index ae277d53e46f..2656f0595bf5 100644
--- a/libcxx/test/std/language.support/support.dynamic/hardware_inference_size.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/hardware_inference_size.compile.pass.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // UNSUPPORTED: c++03, c++11, c++14
-// XFAIL: (clang || apple-clang) && stdlib=libc++
+// UNSUPPORTED: (clang || apple-clang) && stdlib=libc++
 
 #include <new>
 
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.indirect.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.indirect.pass.cpp
index 0b540e09bab3..3d9856e0b1ba 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.indirect.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.indirect.pass.cpp
@@ -51,11 +51,12 @@ int main(int, char**) {
     // Test with an overaligned type
     {
         new_called = delete_called = 0;
-        OverAligned* x = DoNotOptimize(new OverAligned[3]);
+        OverAligned* dummy_data_block = new OverAligned[3];
+        OverAligned* x                = DoNotOptimize(dummy_data_block);
         ASSERT_WITH_OPERATOR_NEW_FALLBACKS(static_cast<void*>(x) == DummyData);
         ASSERT_WITH_OPERATOR_NEW_FALLBACKS(new_called == 1);
 
-        delete[] x;
+        delete[] dummy_data_block;
         ASSERT_WITH_OPERATOR_NEW_FALLBACKS(delete_called == 1);
     }
 
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.pass.cpp
index 2d021ecb30e7..73d1054df188 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.pass.cpp
@@ -48,11 +48,12 @@ int main(int, char**) {
     // Test with an overaligned type
     {
         new_called = delete_called = 0;
-        OverAligned* x = new OverAligned[3];
+        OverAligned* dummy_data_block = new OverAligned[3];
+        OverAligned* x                = DoNotOptimize(dummy_data_block);
         assert(static_cast<void*>(x) == DummyData);
         assert(new_called == 1);
 
-        delete[] x;
+        delete[] dummy_data_block;
         assert(delete_called == 1);
     }
 
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp
index 227b20f0b1e1..c9dc20a34b13 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp
@@ -55,11 +55,12 @@ int main(int, char**) {
     // Test with an overaligned type
     {
         new_called = delete_called = 0;
-        OverAligned* x = DoNotOptimize(new (std::nothrow) OverAligned[3]);
+        OverAligned* dummy_data_block = new (std::nothrow) OverAligned[3];
+        OverAligned* x                = DoNotOptimize(dummy_data_block);
         ASSERT_WITH_OPERATOR_NEW_FALLBACKS(static_cast<void*>(x) == DummyData);
         ASSERT_WITH_OPERATOR_NEW_FALLBACKS(new_called == 1);
 
-        delete[] x;
+        delete[] dummy_data_block;
         ASSERT_WITH_OPERATOR_NEW_FALLBACKS(delete_called == 1);
     }
 
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.replace.pass.cpp
index e5ef5f166975..f9e339b22190 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.replace.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.replace.pass.cpp
@@ -48,11 +48,12 @@ int main(int, char**) {
     // Test with an overaligned type
     {
         new_called = delete_called = 0;
-        OverAligned* x = new OverAligned;
+        OverAligned* dummy_data_block = new OverAligned;
+        OverAligned* x                = DoNotOptimize(dummy_data_block);
         assert(static_cast<void*>(x) == DummyData);
         assert(new_called == 1);
 
-        delete x;
+        delete dummy_data_block;
         assert(delete_called == 1);
     }
 
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp
index 7eab0729f9ef..dedd3089f5ab 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp
@@ -54,11 +54,12 @@ int main(int, char**) {
     // Test with an overaligned type
     {
         new_called = delete_called = 0;
-        OverAligned* x = DoNotOptimize(new (std::nothrow) OverAligned);
+        OverAligned* dummy_data_block = new (std::nothrow) OverAligned;
+        OverAligned* x                = DoNotOptimize(dummy_data_block);
         ASSERT_WITH_OPERATOR_NEW_FALLBACKS(static_cast<void*>(x) == DummyData);
         ASSERT_WITH_OPERATOR_NEW_FALLBACKS(new_called == 1);
 
-        delete x;
+        delete dummy_data_block;
         ASSERT_WITH_OPERATOR_NEW_FALLBACKS(delete_called == 1);
     }
 
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.pass.cpp
index 9a5b53e03902..a25b67ea554b 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.pass.cpp
@@ -47,11 +47,12 @@ int main(int, char**) {
     // Test with an overaligned type
     {
         new_nothrow_called = delete_called = 0;
-        OverAligned* x = new (std::nothrow) OverAligned;
+        OverAligned* dummy_data_block      = new (std::nothrow) OverAligned;
+        OverAligned* x                     = DoNotOptimize(dummy_data_block);
         assert(static_cast<void*>(x) == DummyData);
         ASSERT_WITH_OPERATOR_NEW_FALLBACKS(new_nothrow_called == 1);
 
-        delete x;
+        delete dummy_data_block;
         ASSERT_WITH_OPERATOR_NEW_FALLBACKS(delete_called == 1);
     }
 
diff --git a/libcxx/test/std/numerics/complex.number/complex.tuple/get.pass.cpp b/libcxx/test/std/numerics/complex.number/complex.tuple/get.pass.cpp
index bd514a672cdc..c09457f08daa 100644
--- a/libcxx/test/std/numerics/complex.number/complex.tuple/get.pass.cpp
+++ b/libcxx/test/std/numerics/complex.number/complex.tuple/get.pass.cpp
@@ -111,7 +111,7 @@ constexpr void test() {
 
     std::same_as<std::vector<T>> decltype(auto) imags{
         arr | std::views::elements<1> | std::ranges::to<std::vector<T>>()};
-    assert(reals.size() == 2);
+    assert(imags.size() == 2);
     assert(imags[0] == T{28});
     assert(imags[1] == T{94});
   }
diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.midpoint/midpoint.float.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.midpoint/midpoint.float.pass.cpp
index 6103f3e87d4f..0d28cff511de 100644
--- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.midpoint/midpoint.float.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.midpoint/midpoint.float.pass.cpp
@@ -14,6 +14,7 @@
 // _Tp midpoint(_Float __a, _Float __b) noexcept
 //
 
+#include <limits>
 #include <numeric>
 #include <cassert>
 
diff --git a/libcxx/test/std/ranges/range.factories/range.repeat.view/ctad.compile.pass.cpp b/libcxx/test/std/ranges/range.factories/range.repeat.view/ctad.compile.pass.cpp
index 454bac6e03e5..97b74289619d 100644
--- a/libcxx/test/std/ranges/range.factories/range.repeat.view/ctad.compile.pass.cpp
+++ b/libcxx/test/std/ranges/range.factories/range.repeat.view/ctad.compile.pass.cpp
@@ -24,4 +24,10 @@ static_assert(std::same_as<decltype(std::ranges::repeat_view(std::declval<Empty&
 static_assert(std::same_as<decltype(std::ranges::repeat_view(10, 1)), std::ranges::repeat_view<int, int>>);
 static_assert(std::same_as<decltype(std::ranges::repeat_view(10, 1U)), std::ranges::repeat_view<int, unsigned>>);
 static_assert(std::same_as<decltype(std::ranges::repeat_view(10, 1UL)), std::ranges::repeat_view<int, unsigned long>>);
+
+using RPV = std::ranges::repeat_view<const char*>;
+static_assert(std::same_as<decltype(std::ranges::repeat_view("foo", std::unreachable_sentinel)), RPV>);  // OK
+static_assert(std::same_as<decltype(std::ranges::repeat_view(+"foo", std::unreachable_sentinel)), RPV>); // OK
+static_assert(std::same_as<decltype(std::ranges::repeat_view("foo")), RPV>);                             // OK since LWG4053
+static_assert(std::same_as<decltype(std::ranges::repeat_view(+"foo")), RPV>);                            // OK
 // clang-format on
diff --git a/libcxx/test/std/ranges/range.factories/range.repeat.view/views_repeat.pass.cpp b/libcxx/test/std/ranges/range.factories/range.repeat.view/views_repeat.pass.cpp
index 9cbe505621b9..5d913fa197f5 100644
--- a/libcxx/test/std/ranges/range.factories/range.repeat.view/views_repeat.pass.cpp
+++ b/libcxx/test/std/ranges/range.factories/range.repeat.view/views_repeat.pass.cpp
@@ -60,6 +60,18 @@ static_assert(!std::is_invocable_v<decltype(std::views::repeat), NonCopyable>);
 // Tp is move_constructible
 static_assert(std::is_invocable_v<decltype(std::views::repeat), MoveOnly>);
 
+// Test LWG4054 "Repeating a repeat_view should repeat the view"
+static_assert(std::is_same_v<decltype(std::views::repeat(std::views::repeat(42))),
+                             std::ranges::repeat_view<std::ranges::repeat_view<int>>>);
+
+// These cases are from LWG4053, but they are actually covered by the resolution of LWG4054,
+// and the resolution of LWG4053 only affects CTAD.
+using RPV = std::ranges::repeat_view<const char*>;
+static_assert(std::same_as<decltype(std::views::repeat("foo", std::unreachable_sentinel)), RPV>);  // OK
+static_assert(std::same_as<decltype(std::views::repeat(+"foo", std::unreachable_sentinel)), RPV>); // OK
+static_assert(std::same_as<decltype(std::views::repeat("foo")), RPV>);                             // OK since LWG4054
+static_assert(std::same_as<decltype(std::views::repeat(+"foo")), RPV>);                            // OK
+
 constexpr bool test() {
   assert(*std::views::repeat(33).begin() == 33);
   assert(*std::views::repeat(33, 10).begin() == 33);
diff --git a/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp b/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp
index 3df88d6a2dcc..7f816bb21a19 100644
--- a/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp
+++ b/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp
@@ -560,6 +560,11 @@ constexpr void test_recursive() {
   }
 
   assert((in | std::ranges::to<C4>()) == result);
+
+  // LWG3984: ranges::to's recursion branch may be ill-formed
+  auto in_owning_view = std::views::all(std::move(in));
+  static_assert(!std::ranges::viewable_range<decltype((in_owning_view))>);
+  assert(std::ranges::to<C4>(in_owning_view) == result);
 }
 
 constexpr bool test() {
diff --git a/libcxx/test/std/strings/string.conversions/stol.pass.cpp b/libcxx/test/std/strings/string.conversions/stol.pass.cpp
index 1e47bd504957..083a6aaff28c 100644
--- a/libcxx/test/std/strings/string.conversions/stol.pass.cpp
+++ b/libcxx/test/std/strings/string.conversions/stol.pass.cpp
@@ -13,6 +13,7 @@
 
 #include <string>
 #include <cassert>
+#include <limits>
 #include <stdexcept>
 
 #include "test_macros.h"
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp
index 3497d8935c8d..5ab8c649cc58 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp
@@ -16,6 +16,7 @@
 #include <algorithm>
 #include <format>
 #include <cassert>
+#include <limits>
 #include <type_traits>
 
 #include "constexpr_char_traits.h"
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp
index a5d3703397f4..c590cebf48ac 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp
@@ -14,6 +14,7 @@
 
 #include <format>
 #include <cassert>
+#include <limits>
 #include <type_traits>
 
 #include "test_macros.h"
diff --git a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
index bf5c0a51f944..96c1e2664f7a 100644
--- a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
@@ -223,7 +223,7 @@ void test_char() {
     static_assert(sizeof(CharT) == 4, "add support for unexpected size");
     // Unicode fitting in a 32-bit wchar_t
 
-    constexpr wchar_t x  = 0x1ffff;
+    constexpr wchar_t x       = 0x1ffff;
     constexpr std::uint32_t y = 0x1ffff;
     static_assert(x == y);
 
@@ -290,7 +290,7 @@ void test_string() {
     test_format(SV("[\"\ud7ff\"]"), SV("[{:?}]"), "\xed\x9f\xbf");              // U+D7FF last valid
 #else
     /* U+D800..D+DFFFF surrogate range */
-    test_format(SV(R"(["\u{d7ff}"])"), SV("[{:?}]"), "\xed\x9f\xbf");           // U+D7FF last valid
+    test_format(SV(R"(["\u{d7ff}"])"), SV("[{:?}]"), "\xed\x9f\xbf"); // U+D7FF last valid
 #endif
     test_format(SV(R"(["\x{ed}\x{a0}\x{80}"])"), SV("[{:?}]"), "\xed\xa0\x80"); // U+D800
     test_format(SV(R"(["\x{ed}\x{af}\x{bf}"])"), SV("[{:?}]"), "\xed\xaf\xbf"); // U+DBFF
@@ -319,7 +319,8 @@ void test_string() {
     test_format(SV("[\"\u00c3(\"]"), SV("[{:?}]"), L"\xc3\x28");
   }
 
-  test_format(SV(R"(["🤷🏻\u{200d}♂\u{fe0f}"])"), SV("[{:?}]"), SV("🤷🏻‍♂️"));
+  // LWG-3965
+  test_format(SV(R"(["🤷🏻\u{200d}♂️"])"), SV("[{:?}]"), SV("🤷🏻‍♂️"));
 
   // *** Special cases ***
   test_format(SV(R"("\t\n\r\\'\" ")"), SV("{:?}"), SV("\t\n\r\\'\" "));
@@ -336,6 +337,11 @@ void test_string() {
   if constexpr (sizeof(CharT) == 1)
     test_format(SV(R"("\x{80}")"), SV("{:?}"), SV("\x80"));
 
+  // *** P2713R1 examples ***
+  test_format(SV(R"(["\u{301}"])"), SV("[{:?}]"), SV("\u0301"));
+  test_format(SV(R"(["\\\u{301}"])"), SV("[{:?}]"), SV("\\\u0301"));
+  test_format(SV(R"(["ẹ́"])"), SV("[{:?}]"), SV("e\u0301\u0323"));
+
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
   if constexpr (sizeof(CharT) > 1) {
     using V = std::basic_string_view<CharT>;
@@ -373,7 +379,7 @@ void test_string() {
     static_assert(sizeof(CharT) == 4, "add support for unexpected size");
     // Unicode fitting in a 32-bit wchar_t
 
-    constexpr wchar_t x  = 0x1ffff;
+    constexpr wchar_t x       = 0x1ffff;
     constexpr std::uint32_t y = 0x1ffff;
     static_assert(x == y);
 
@@ -406,20 +412,18 @@ void test_format_functions(TestFunction check) {
   check(SV(R"(*"hellö"**)"), SV("{:*^10?}"), SV("hellö"));
   check(SV(R"("hellö"***)"), SV("{:*<10?}"), SV("hellö"));
 
-  check(SV(R"("hello\u{308}")"), SV("{:*>10?}"), SV("hello\u0308"));
-  check(SV(R"(***"hello\u{308}")"), SV("{:*>17?}"), SV("hello\u0308"));
-  check(SV(R"(*"hello\u{308}"**)"), SV("{:*^17?}"), SV("hello\u0308"));
-  check(SV(R"("hello\u{308}"***)"), SV("{:*<17?}"), SV("hello\u0308"));
+  check(SV(R"(***"hellö")"), SV("{:*>10?}"), SV("hello\u0308"));
+  check(SV(R"(*"hellö"**)"), SV("{:*^10?}"), SV("hello\u0308"));
+  check(SV(R"("hellö"***)"), SV("{:*<10?}"), SV("hello\u0308"));
 
-  check(SV(R"("hello 🤷🏻\u{200d}♂\u{fe0f}")"), SV("{:*>10?}"), SV("hello 🤷🏻‍♂️"));
-  check(SV(R"(***"hello 🤷🏻\u{200d}♂\u{fe0f}")"), SV("{:*>30?}"), SV("hello 🤷🏻‍♂️"));
-  check(SV(R"(*"hello 🤷🏻\u{200d}♂\u{fe0f}"**)"), SV("{:*^30?}"), SV("hello 🤷🏻‍♂️"));
-  check(SV(R"("hello 🤷🏻\u{200d}♂\u{fe0f}"***)"), SV("{:*<30?}"), SV("hello 🤷🏻‍♂️"));
+  check(SV(R"(***"hello 🤷🏻\u{200d}♂️")"), SV("{:*>22?}"), SV("hello 🤷🏻‍♂️"));
+  check(SV(R"(*"hello 🤷🏻\u{200d}♂️"**)"), SV("{:*^22?}"), SV("hello 🤷🏻‍♂️"));
+  check(SV(R"("hello 🤷🏻\u{200d}♂️"***)"), SV("{:*<22?}"), SV("hello 🤷🏻‍♂️"));
 
   // *** width ***
   check(SV(R"("hellö"   )"), SV("{:10?}"), SV("hellö"));
-  check(SV(R"("hello\u{308}"   )"), SV("{:17?}"), SV("hello\u0308"));
-  check(SV(R"("hello 🤷🏻\u{200d}♂\u{fe0f}"   )"), SV("{:30?}"), SV("hello 🤷🏻‍♂️"));
+  check(SV(R"("hellö"   )"), SV("{:10?}"), SV("hello\u0308"));
+  check(SV(R"("hello 🤷🏻\u{200d}♂️"   )"), SV("{:22?}"), SV("hello 🤷🏻‍♂️"));
 
   // *** precision ***
   check(SV(R"("hell)"), SV("{:.5?}"), SV("hellö"));
@@ -431,9 +435,8 @@ void test_format_functions(TestFunction check) {
   check(SV(R"("hello 🤷🏻)"), SV("{:.9?}"), SV("hello 🤷🏻‍♂️"));
   check(SV(R"("hello 🤷🏻\)"), SV("{:.10?}"), SV("hello 🤷🏻‍♂️"));
   check(SV(R"("hello 🤷🏻\u{200d})"), SV("{:.17?}"), SV("hello 🤷🏻‍♂️"));
-  check(SV(R"("hello 🤷🏻\u{200d}♂)"), SV("{:.18?}"), SV("hello 🤷🏻‍♂️"));
-  check(SV(R"("hello 🤷🏻\u{200d}♂\)"), SV("{:.19?}"), SV("hello 🤷🏻‍♂️"));
-  check(SV(R"("hello 🤷🏻\u{200d}♂\u{fe0f}")"), SV("{:.28?}"), SV("hello 🤷🏻‍♂️"));
+  check(SV(R"("hello 🤷🏻\u{200d}♂️)"), SV("{:.18?}"), SV("hello 🤷🏻‍♂️"));
+  check(SV(R"("hello 🤷🏻\u{200d}♂️")"), SV("{:.19?}"), SV("hello 🤷🏻‍♂️"));
 
   // *** width & precision ***
   check(SV(R"("hell#########################)"), SV("{:#<30.5?}"), SV("hellö"));
@@ -445,9 +448,8 @@ void test_format_functions(TestFunction check) {
   check(SV(R"("hello 🤷🏻#####################)"), SV("{:#<30.9?}"), SV("hello 🤷🏻‍♂️"));
   check(SV(R"("hello 🤷🏻\####################)"), SV("{:#<30.10?}"), SV("hello 🤷🏻‍♂️"));
   check(SV(R"("hello 🤷🏻\u{200d}#############)"), SV("{:#<30.17?}"), SV("hello 🤷🏻‍♂️"));
-  check(SV(R"("hello 🤷🏻\u{200d}♂############)"), SV("{:#<30.18?}"), SV("hello 🤷🏻‍♂️"));
-  check(SV(R"("hello 🤷🏻\u{200d}♂\###########)"), SV("{:#<30.19?}"), SV("hello 🤷🏻‍♂️"));
-  check(SV(R"("hello 🤷🏻\u{200d}♂\u{fe0f}"###)"), SV("{:#<30.28?}"), SV("hello 🤷🏻‍♂️"));
+  check(SV(R"("hello 🤷🏻\u{200d}♂️############)"), SV("{:#<30.18?}"), SV("hello 🤷🏻‍♂️"));
+  check(SV(R"("hello 🤷🏻\u{200d}♂️"###########)"), SV("{:#<30.19?}"), SV("hello 🤷🏻‍♂️"));
 }
 
 template <class CharT>
diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot
index 60307a7d4f35..e40c2b635ef9 100755
--- a/libcxx/utils/ci/run-buildbot
+++ b/libcxx/utils/ci/run-buildbot
@@ -217,7 +217,13 @@ function test-armv7m-picolibc() {
         "${@}"
 
     ${NINJA} -vC "${BUILD_DIR}/compiler-rt" install
-    mv "${BUILD_DIR}/install/lib/armv7m-none-unknown-eabi"/* "${BUILD_DIR}/install/lib"
+
+    # Prior to clang 19, armv7m-none-eabi normalised to armv7m-none-unknown-eabi.
+    # clang 19 changed this to armv7m-unknown-none-eabi. So for as long as 18.x
+    # is supported, we have to ask clang what the triple will be.
+    NORMALISED_TARGET_TRIPLE=$(${CC-cc} --target=armv7m-none-eabi -print-target-triple)
+    # Without this step linking fails later in the build.
+    mv "${BUILD_DIR}/install/lib/${NORMALISED_TARGET_TRIPLE}"/* "${BUILD_DIR}/install/lib"
 
     check-runtimes
 }
diff --git a/libcxx/utils/generate_escaped_output_table.py b/libcxx/utils/generate_escaped_output_table.py
index c6bde8f2411c..523a0be3a451 100755
--- a/libcxx/utils/generate_escaped_output_table.py
+++ b/libcxx/utils/generate_escaped_output_table.py
@@ -39,12 +39,6 @@ LINE_REGEX = re.compile(
 )
 
 
-def filterCoreProperty(element: PropertyRange) -> Optional[PropertyRange]:
-    if element.prop == "Grapheme_Extend":
-        return element
-    return None
-
-
 # https://www.unicode.org/reports/tr44/#GC_Values_Table
 def filterGeneralProperty(element: PropertyRange) -> Optional[PropertyRange]:
     if element.prop in ["Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn"]:
@@ -94,10 +88,9 @@ DATA_ARRAY_TEMPLATE = """
 /// The entries of the characters to escape in format's debug string.
 ///
 /// Contains the entries for [format.string.escaped]/2.2.1.2.1
-///   CE is a Unicode encoding and C corresponds to either a UCS scalar value
-///   whose Unicode property General_Category has a value in the groups
-///   Separator (Z) or Other (C) or to a UCS scalar value which has the Unicode
-///   property Grapheme_Extend=Yes, as described by table 12 of UAX #44
+///   CE is a Unicode encoding and C corresponds to a UCS scalar value whose
+///   Unicode property General_Category has a value in the groups Separator (Z)
+///   or Other (C), as described by table 12 of UAX #44
 ///
 /// Separator (Z) consists of General_Category
 /// - Space_Separator,
@@ -112,7 +105,6 @@ DATA_ARRAY_TEMPLATE = """
 /// - Unassigned.
 ///
 /// The data is generated from
-/// - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
 /// - https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
 ///
 /// The table is similar to the table
@@ -121,34 +113,41 @@ DATA_ARRAY_TEMPLATE = """
 /// table lacks a property, thus having more bits available for the size.
 ///
 /// The data has 2 values:
-/// - bits [0, 10] The size of the range, allowing 2048 elements.
-/// - bits [11, 31] The lower bound code point of the range. The upper bound of
-///   the range is lower bound + size.
+/// - bits [0, 13] The size of the range, allowing 16384 elements.
+/// - bits [14, 31] The lower bound code point of the range. The upper bound of
+///   the range is lower bound + size. Note the code expects code units the fit
+///   into 18 bits, instead of the 21 bits needed for the full Unicode range.
 _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{
 {entries}}};
 
+/// Returns whether the code unit needs to be escaped.
+///
 /// At the end of the valid Unicode code points space a lot of code points are
 /// either reserved or a noncharacter. Adding all these entries to the
-/// lookup table would add 446 entries to the table (in Unicode 14).
-/// Instead the only the start of the region is stored, every code point in
-/// this region needs to be escaped.
-inline constexpr uint32_t __unallocated_region_lower_bound = 0x{unallocated:08x};
+/// lookup table would greatly increase the size of the table. Instead these
+/// entries are manually processed. In this large area of reserved code points,
+/// there is a small area of extended graphemes that should not be escaped
+/// unconditionally. This is also manually coded. See the generation script for
+/// more details.
 
-/// Returns whether the code unit needs to be escaped.
 ///
 /// \pre The code point is a valid Unicode code point.
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __needs_escape(const char32_t __code_point) noexcept {{
-  // Since __unallocated_region_lower_bound contains the unshifted range do the
-  // comparison without shifting.
-  if (__code_point >= __unallocated_region_lower_bound)
+
+  // The entries in the gap at the end.
+  if(__code_point >= 0x{gap_lower:08x} && __code_point <= 0x{gap_upper:08x})
+     return false;
+
+  // The entries at the end.
+  if (__code_point >= 0x{unallocated:08x})
     return true;
 
-  ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
+  ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 14) | 0x3fffu) - __entries;
   if (__i == 0)
     return false;
 
   --__i;
-  uint32_t __upper_bound = (__entries[__i] >> 11) + (__entries[__i] & 0x7ffu);
+  uint32_t __upper_bound = (__entries[__i] >> 14) + (__entries[__i] & 0x3fffu);
   return __code_point <= __upper_bound;
 }}
 """
@@ -253,28 +252,43 @@ def property_ranges_to_table(ranges: list[PropertyRange]) -> list[Entry]:
 
         while True:
             e = Entry(range.lower, range.upper - range.lower)
-            if e.offset <= 2047:
+            if e.offset <= 16383:
                 result.append(e)
                 break
-            e.offset = 2047
+            e.offset = 16383
             result.append(e)
-            range.lower += 2048
+            range.lower += 16384
     return result
 
 
-cpp_entrytemplate = "    0x{:08x}"
+cpp_entrytemplate = "    0x{:08x} /* {:08x} - {:08x} [{:>5}] */"
 
 
-def generate_cpp_data(ranges: list[PropertyRange], unallocated: int) -> str:
+def generate_cpp_data(
+    ranges: list[PropertyRange], unallocated: int, gap_lower: int, gap_upper: int
+) -> str:
     result = StringIO()
     table = property_ranges_to_table(ranges)
+    # Validates all entries fit in 18 bits.
+    for x in table:
+        assert x.lower + x.offset < 0x3FFFF
     result.write(
         DATA_ARRAY_TEMPLATE.format(
             size=len(table),
             entries=",\n".join(
-                [cpp_entrytemplate.format(x.lower << 11 | x.offset) for x in table]
+                [
+                    cpp_entrytemplate.format(
+                        x.lower << 14 | x.offset,
+                        x.lower,
+                        x.lower + x.offset,
+                        x.offset + 1,
+                    )
+                    for x in table
+                ]
             ),
             unallocated=unallocated,
+            gap_lower=gap_lower,
+            gap_upper=gap_upper,
         )
     )
 
@@ -291,12 +305,6 @@ def generate_data_tables() -> str:
         / "unicode"
         / "DerivedGeneralCategory.txt"
     )
-    derived_core_catagory_path = (
-        Path(__file__).absolute().parent
-        / "data"
-        / "unicode"
-        / "DerivedCoreProperties.txt"
-    )
 
     properties = list()
     with derived_general_catagory_path.open(encoding="utf-8") as f:
@@ -308,30 +316,31 @@ def generate_data_tables() -> str:
                 )
             )
         )
-    with derived_core_catagory_path.open(encoding="utf-8") as f:
-        properties.extend(
-            list(
-                filter(
-                    filterCoreProperty,
-                    [x for line in f if (x := parsePropertyLine(line))],
-                )
-            )
-        )
 
     data = compactPropertyRanges(sorted(properties, key=lambda x: x.lower))
 
-    # The last entry is large. In Unicode 14 it contains the entries
-    # 3134B..0FFFF 912564 elements
-    # This are 446 entries of 1325 entries in the table.
-    # Based on the nature of these entries it is expected they remain for the
-    # forseeable future. Therefore we only store the lower bound of this section.
-    #
-    # When this region becomes substantially smaller we need to investigate
-    # this design.
-    assert data[-1].upper == 0x10FFFF
-    assert data[-1].upper - data[-1].lower > 900000
-
-    return "\n".join([generate_cpp_data(data[:-1], data[-1].lower)])
+    # The output table has two large entries at the end, with a small "gap"
+    #   E0100..E01EF  ; Grapheme_Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
+    # Based on Unicode 15.1.0:
+    # - Encoding all these entries in the table requires 1173 entries.
+    # - Manually handling these last two blocks reduces the size to 729 entries.
+    # This not only reduces the binary size, but also improves the performance
+    # by having fewer elements to search.
+    # The exact entries may differ between Unicode versions. When these numbers
+    # change the test needs to be updated too.
+    #   libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
+    assert (data[-2].lower) == 0x323B0
+    assert (data[-2].upper) == 0xE00FF
+    assert (data[-1].lower) == 0xE01F0
+    assert (data[-1].upper) == 0x10FFFF
+
+    return "\n".join(
+        [
+            generate_cpp_data(
+                data[:-2], data[-2].lower, data[-2].upper + 1, data[-1].lower - 1
+            )
+        ]
+    )
 
 
 if __name__ == "__main__":
diff --git a/lld/COFF/MinGW.cpp b/lld/COFF/MinGW.cpp
index e46f5277a8c3..29c01da9e28f 100644
--- a/lld/COFF/MinGW.cpp
+++ b/lld/COFF/MinGW.cpp
@@ -50,7 +50,6 @@ AutoExporter::AutoExporter(
       "libclang_rt.profile-x86_64",
       "libc++",
       "libc++abi",
-      "libFortran_main",
       "libFortranRuntime",
       "libFortranDecimal",
       "libunwind",
diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp
index 20088d92bafa..e4d63250135e 100644
--- a/lld/ELF/Arch/RISCV.cpp
+++ b/lld/ELF/Arch/RISCV.cpp
@@ -1057,7 +1057,7 @@ public:
 };
 } // namespace
 
-static void mergeArch(RISCVISAInfo::OrderedExtensionMap &mergedExts,
+static void mergeArch(RISCVISAUtils::OrderedExtensionMap &mergedExts,
                       unsigned &mergedXlen, const InputSectionBase *sec,
                       StringRef s) {
   auto maybeInfo = RISCVISAInfo::parseNormalizedArchString(s);
@@ -1086,7 +1086,7 @@ static void mergeArch(RISCVISAInfo::OrderedExtensionMap &mergedExts,
 
 static RISCVAttributesSection *
 mergeAttributesSection(const SmallVector<InputSectionBase *, 0> &sections) {
-  RISCVISAInfo::OrderedExtensionMap exts;
+  RISCVISAUtils::OrderedExtensionMap exts;
   const InputSectionBase *firstStackAlign = nullptr;
   unsigned firstStackAlignValue = 0, xlen = 0;
   bool hasArch = false;
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 33bfa42b0fcb..c55b547a733c 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -224,7 +224,8 @@ struct Config {
   bool checkSections;
   bool checkDynamicRelocs;
   std::optional<llvm::DebugCompressionType> compressDebugSections;
-  llvm::SmallVector<std::pair<llvm::GlobPattern, llvm::DebugCompressionType>, 0>
+  llvm::SmallVector<
+      std::tuple<llvm::GlobPattern, llvm::DebugCompressionType, unsigned>, 0>
       compressSections;
   bool cref;
   llvm::SmallVector<std::pair<llvm::GlobPattern, uint64_t>, 0>
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index a5b47f020f87..b29e1e1a67f1 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -1533,9 +1533,17 @@ static void readConfigs(opt::InputArgList &args) {
             ": parse error, not 'section-glob=[none|zlib|zstd]'");
       continue;
     }
-    auto type = getCompressionType(fields[1], arg->getSpelling());
+    auto [typeStr, levelStr] = fields[1].split(':');
+    auto type = getCompressionType(typeStr, arg->getSpelling());
+    unsigned level = 0;
+    if (fields[1].size() != typeStr.size() &&
+        !llvm::to_integer(levelStr, level)) {
+      error(arg->getSpelling() +
+            ": expected a non-negative integer compression level, but got '" +
+            levelStr + "'");
+    }
     if (Expected<GlobPattern> pat = GlobPattern::create(fields[0])) {
-      config->compressSections.emplace_back(std::move(*pat), type);
+      config->compressSections.emplace_back(std::move(*pat), type, level);
     } else {
       error(arg->getSpelling() + ": " + toString(pat.takeError()));
       continue;
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index 72eaf157a181..73a4f9662a56 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -68,8 +68,9 @@ defm compress_debug_sections:
   MetaVarName<"[none,zlib,zstd]">;
 
 defm compress_sections: EEq<"compress-sections",
-  "Compress non-SHF_ALLOC output sections matching <section-glob>">,
-  MetaVarName<"<section-glob>=[none|zlib|zstd]">;
+  "Compress output sections that match the glob and do not have the SHF_ALLOC flag."
+  "The compression level is <level> (if specified) or a default speed-focused level">,
+  MetaVarName<"<section-glob>={none,zlib,zstd}[:level]">;
 
 defm defsym: Eq<"defsym", "Define a symbol alias">, MetaVarName<"<symbol>=<value>">;
 
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index eadab9d745d6..2dbbff06a890 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -301,7 +301,11 @@ static SmallVector<uint8_t, 0> deflateShard(ArrayRef<uint8_t> in, int level,
   // 15 and 8 are default. windowBits=-15 is negative to generate raw deflate
   // data with no zlib header or trailer.
   z_stream s = {};
-  deflateInit2(&s, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
+  auto res = deflateInit2(&s, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
+  if (res != 0) {
+    errorOrWarn("--compress-sections: deflateInit2 returned " + Twine(res));
+    return {};
+  }
   s.next_in = const_cast<uint8_t *>(in.data());
   s.avail_in = in.size();
 
@@ -335,12 +339,13 @@ template <class ELFT> void OutputSection::maybeCompress() {
   (void)sizeof(Elf_Chdr);
 
   DebugCompressionType ctype = DebugCompressionType::None;
-  for (auto &[glob, t] : config->compressSections)
-    if (glob.match(name))
-      ctype = t;
+  unsigned level = 0; // default compression level
   if (!(flags & SHF_ALLOC) && config->compressDebugSections &&
       name.starts_with(".debug_") && size)
     ctype = *config->compressDebugSections;
+  for (auto &[glob, t, l] : config->compressSections)
+    if (glob.match(name))
+      std::tie(ctype, level) = {t, l};
   if (ctype == DebugCompressionType::None)
     return;
   if (flags & SHF_ALLOC) {
@@ -364,90 +369,77 @@ template <class ELFT> void OutputSection::maybeCompress() {
   // useful when there are many compressed output sections.
   addralign = 1;
 
+  // Split input into 1-MiB shards.
+  [[maybe_unused]] constexpr size_t shardSize = 1 << 20;
+  auto shardsIn = split(ArrayRef<uint8_t>(buf.get(), size), shardSize);
+  const size_t numShards = shardsIn.size();
+  compressed.numShards = numShards;
+  auto shardsOut = std::make_unique<SmallVector<uint8_t, 0>[]>(numShards);
+
 #if LLVM_ENABLE_ZSTD
-  // Use ZSTD's streaming compression API which permits parallel workers working
-  // on the stream. See http://facebook.github.io/zstd/zstd_manual.html
-  // "Streaming compression - HowTo".
+  // Use ZSTD's streaming compression API. See
+  // http://facebook.github.io/zstd/zstd_manual.html "Streaming compression -
+  // HowTo".
   if (ctype == DebugCompressionType::Zstd) {
-    // Allocate a buffer of half of the input size, and grow it by 1.5x if
-    // insufficient.
-    compressed.type = ELFCOMPRESS_ZSTD;
-    compressed.shards = std::make_unique<SmallVector<uint8_t, 0>[]>(1);
-    SmallVector<uint8_t, 0> &out = compressed.shards[0];
-    out.resize_for_overwrite(std::max<size_t>(size / 2, 32));
-    size_t pos = 0;
-
-    ZSTD_CCtx *cctx = ZSTD_createCCtx();
-    // Ignore error if zstd was not built with ZSTD_MULTITHREAD.
-    (void)ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers,
-                                 parallel::strategy.compute_thread_count());
-    ZSTD_outBuffer zob = {out.data(), out.size(), 0};
-    ZSTD_EndDirective directive = ZSTD_e_continue;
-    const size_t blockSize = ZSTD_CStreamInSize();
-    do {
-      const size_t n = std::min(static_cast<size_t>(size - pos), blockSize);
-      if (n == size - pos)
-        directive = ZSTD_e_end;
-      ZSTD_inBuffer zib = {buf.get() + pos, n, 0};
-      size_t bytesRemaining = 0;
-      while (zib.pos != zib.size ||
-             (directive == ZSTD_e_end && bytesRemaining != 0)) {
+    parallelFor(0, numShards, [&](size_t i) {
+      SmallVector<uint8_t, 0> out;
+      ZSTD_CCtx *cctx = ZSTD_createCCtx();
+      ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level);
+      ZSTD_inBuffer zib = {shardsIn[i].data(), shardsIn[i].size(), 0};
+      ZSTD_outBuffer zob = {nullptr, 0, 0};
+      size_t size;
+      do {
+        // Allocate a buffer of half of the input size, and grow it by 1.5x if
+        // insufficient.
         if (zob.pos == zob.size) {
-          out.resize_for_overwrite(out.size() * 3 / 2);
-          zob.dst = out.data();
-          zob.size = out.size();
+          out.resize_for_overwrite(
+              zob.size ? zob.size * 3 / 2 : std::max<size_t>(zib.size / 4, 64));
+          zob = {out.data(), out.size(), zob.pos};
         }
-        bytesRemaining = ZSTD_compressStream2(cctx, &zob, &zib, directive);
-        assert(!ZSTD_isError(bytesRemaining));
-      }
-      pos += n;
-    } while (directive != ZSTD_e_end);
-    out.resize(zob.pos);
-    ZSTD_freeCCtx(cctx);
-
-    size = sizeof(Elf_Chdr) + out.size();
-    flags |= SHF_COMPRESSED;
-    return;
+        size = ZSTD_compressStream2(cctx, &zob, &zib, ZSTD_e_end);
+        assert(!ZSTD_isError(size));
+      } while (size != 0);
+      out.truncate(zob.pos);
+      ZSTD_freeCCtx(cctx);
+      shardsOut[i] = std::move(out);
+    });
+    compressed.type = ELFCOMPRESS_ZSTD;
+    size = sizeof(Elf_Chdr);
+    for (size_t i = 0; i != numShards; ++i)
+      size += shardsOut[i].size();
   }
 #endif
 
 #if LLVM_ENABLE_ZLIB
   // We chose 1 (Z_BEST_SPEED) as the default compression level because it is
-  // the fastest. If -O2 is given, we use level 6 to compress debug info more by
-  // ~15%. We found that level 7 to 9 doesn't make much difference (~1% more
-  // compression) while they take significant amount of time (~2x), so level 6
-  // seems enough.
-  const int level = config->optimize >= 2 ? 6 : Z_BEST_SPEED;
-
-  // Split input into 1-MiB shards.
-  constexpr size_t shardSize = 1 << 20;
-  auto shardsIn = split(ArrayRef<uint8_t>(buf.get(), size), shardSize);
-  const size_t numShards = shardsIn.size();
-
-  // Compress shards and compute Alder-32 checksums. Use Z_SYNC_FLUSH for all
-  // shards but the last to flush the output to a byte boundary to be
-  // concatenated with the next shard.
-  auto shardsOut = std::make_unique<SmallVector<uint8_t, 0>[]>(numShards);
-  auto shardsAdler = std::make_unique<uint32_t[]>(numShards);
-  parallelFor(0, numShards, [&](size_t i) {
-    shardsOut[i] = deflateShard(shardsIn[i], level,
-                                i != numShards - 1 ? Z_SYNC_FLUSH : Z_FINISH);
-    shardsAdler[i] = adler32(1, shardsIn[i].data(), shardsIn[i].size());
-  });
+  // fast and provides decent compression ratios.
+  if (ctype == DebugCompressionType::Zlib) {
+    if (!level)
+      level = Z_BEST_SPEED;
+
+    // Compress shards and compute Alder-32 checksums. Use Z_SYNC_FLUSH for all
+    // shards but the last to flush the output to a byte boundary to be
+    // concatenated with the next shard.
+    auto shardsAdler = std::make_unique<uint32_t[]>(numShards);
+    parallelFor(0, numShards, [&](size_t i) {
+      shardsOut[i] = deflateShard(shardsIn[i], level,
+                                  i != numShards - 1 ? Z_SYNC_FLUSH : Z_FINISH);
+      shardsAdler[i] = adler32(1, shardsIn[i].data(), shardsIn[i].size());
+    });
 
-  // Update section size and combine Alder-32 checksums.
-  uint32_t checksum = 1;       // Initial Adler-32 value
-  size = sizeof(Elf_Chdr) + 2; // Elf_Chdir and zlib header
-  for (size_t i = 0; i != numShards; ++i) {
-    size += shardsOut[i].size();
-    checksum = adler32_combine(checksum, shardsAdler[i], shardsIn[i].size());
+    // Update section size and combine Alder-32 checksums.
+    uint32_t checksum = 1;       // Initial Adler-32 value
+    size = sizeof(Elf_Chdr) + 2; // Elf_Chdir and zlib header
+    for (size_t i = 0; i != numShards; ++i) {
+      size += shardsOut[i].size();
+      checksum = adler32_combine(checksum, shardsAdler[i], shardsIn[i].size());
+    }
+    size += 4; // checksum
+    compressed.type = ELFCOMPRESS_ZLIB;
+    compressed.checksum = checksum;
   }
-  size += 4; // checksum
 
-  compressed.type = ELFCOMPRESS_ZLIB;
   compressed.shards = std::move(shardsOut);
-  compressed.numShards = numShards;
-  compressed.checksum = checksum;
   flags |= SHF_COMPRESSED;
 #endif
 }
@@ -479,25 +471,22 @@ void OutputSection::writeTo(uint8_t *buf, parallel::TaskGroup &tg) {
     chdr->ch_size = compressed.uncompressedSize;
     chdr->ch_addralign = addralign;
     buf += sizeof(*chdr);
-    if (compressed.type == ELFCOMPRESS_ZSTD) {
-      memcpy(buf, compressed.shards[0].data(), compressed.shards[0].size());
-      return;
+
+    auto offsets = std::make_unique<size_t[]>(compressed.numShards);
+    if (compressed.type == ELFCOMPRESS_ZLIB) {
+      buf[0] = 0x78;  // CMF
+      buf[1] = 0x01;  // FLG: best speed
+      offsets[0] = 2; // zlib header
+      write32be(buf + (size - sizeof(*chdr) - 4), compressed.checksum);
     }
 
     // Compute shard offsets.
-    auto offsets = std::make_unique<size_t[]>(compressed.numShards);
-    offsets[0] = 2; // zlib header
     for (size_t i = 1; i != compressed.numShards; ++i)
       offsets[i] = offsets[i - 1] + compressed.shards[i - 1].size();
-
-    buf[0] = 0x78; // CMF
-    buf[1] = 0x01; // FLG: best speed
     parallelFor(0, compressed.numShards, [&](size_t i) {
       memcpy(buf + offsets[i], compressed.shards[i].data(),
              compressed.shards[i].size());
     });
-
-    write32be(buf + (size - sizeof(*chdr) - 4), compressed.checksum);
     return;
   }
 
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index a7ed49726fd9..f8fdebfeaecf 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -26,9 +26,12 @@ Non-comprehensive list of changes in this release
 ELF Improvements
 ----------------
 
-* ``--compress-sections <section-glib>=[none|zlib|zstd]`` is added to compress
+* ``--compress-sections <section-glib>={none,zlib,zstd}[:level]`` is added to compress
   matched output sections without the ``SHF_ALLOC`` flag.
   (`#84855 <https://github.com/llvm/llvm-project/pull/84855>`_)
+  (`#90567 <https://github.com/llvm/llvm-project/pull/90567>`_)
+* The default compression level for zlib is now independent of linker
+  optimization level (``Z_BEST_SPEED``).
 * ``GNU_PROPERTY_AARCH64_FEATURE_PAUTH`` notes, ``R_AARCH64_AUTH_ABS64`` and
   ``R_AARCH64_AUTH_RELATIVE`` relocations are now supported.
   (`#72714 <https://github.com/llvm/llvm-project/pull/72714>`_)
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index 3861120915e8..9ea1a9c52f2a 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -156,16 +156,16 @@ may be
 No compression.
 .It Cm zlib
 The default compression level is 1 (fastest) as the debug info usually
-compresses well at that level. If you want to compress it more,
-you can specify
-.Fl O2
-to set the compression level to 6.
+compresses well at that level.
 .It Cm zstd
-The compression level is 5.
+Use the default compression level in zstd.
 .El
 .Pp
-.It Fl -compress-sections Ns = Ns Ar section-glob=[none|zlib|zstd]
+.It Fl -compress-sections Ns = Ns Ar section-glob={none,zlib,zstd}[:level]
 Compress output sections that match the glob and do not have the SHF_ALLOC flag.
+The compression level is
+.Cm level
+(if specified) or a default speed-focused level.
 This is like a generalized
 .Cm --compress-debug-sections.
 .It Fl -cref
diff --git a/lld/test/COFF/thinlto-index-only.ll b/lld/test/COFF/thinlto-index-only.ll
index 8ef981d6090f..f99134143e4d 100644
--- a/lld/test/COFF/thinlto-index-only.ll
+++ b/lld/test/COFF/thinlto-index-only.ll
@@ -22,8 +22,8 @@
 ; BACKEND1: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1: <VERSION
 ; BACKEND1: <FLAGS
-; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-5300342847281564238|-2624081020897602054}}
-; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-5300342847281564238|-2624081020897602054}}
+; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3684000822 op2=3884832250}}
+; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3684000822 op2=3884832250}}
 ; BACKEND1: <COMBINED
 ; BACKEND1: <COMBINED
 ; BACKEND1: </GLOBALVAL_SUMMARY_BLOCK
@@ -37,7 +37,7 @@
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
 ; BACKEND2-NEXT: <FLAGS
-; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
+; BACKEND2-NEXT: <VALUE_GUID {{.*}} op0=1 op1=3060885059 op2=1207956914
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
diff --git a/lld/test/ELF/compress-sections.s b/lld/test/ELF/compress-sections.s
index 59b5408c9624..aa30c7a90474 100644
--- a/lld/test/ELF/compress-sections.s
+++ b/lld/test/ELF/compress-sections.s
@@ -16,7 +16,7 @@
 # CHECK1: 0000000000000010  0 NOTYPE  LOCAL  DEFAULT   [[#]] (nonalloc0) sym0
 # CHECK1: 0000000000000008  0 NOTYPE  LOCAL  DEFAULT   [[#]] (nonalloc1) sym1
 
-# RUN: ld.lld -pie a.o --compress-sections '*c0=zlib' --compress-sections .debug_str=zstd -o out2
+# RUN: ld.lld -pie a.o --compress-sections '*c0=zlib' --compress-sections .debug_str=zstd:3 -o out2
 # RUN: llvm-readelf -SrsX -x nonalloc0 -x .debug_str out2 | FileCheck %s --check-prefix=CHECK2
 
 # CHECK2:      Name       Type          Address     Off      Size     ES Flg Lk Inf Al
@@ -39,11 +39,11 @@
 # CHECK2-NEXT: 02000000 00000000 38000000 00000000
 # CHECK2-NEXT: 01000000 00000000 {{.*}}
 
-## --compress-debug-sections=none takes precedence.
-# RUN: ld.lld a.o --compress-debug-sections=none --compress-sections .debug_str=zstd -o out3
+## --compress-sections takes precedence.
+# RUN: ld.lld a.o --compress-sections .debug_str=zstd --compress-debug-sections=none -o out3
 # RUN: llvm-readelf -S out3 | FileCheck %s --check-prefix=CHECK3
 
-# CHECK3:      .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS   0   0  1
+# CHECK3:      .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MSC  0   0  1
 
 # RUN: not ld.lld a.o --compress-sections '*0=zlib' 2>&1 | \
 # RUN:   FileCheck %s --check-prefix=ERR-ALLOC --implicit-check-not=error:
@@ -62,6 +62,16 @@
 # ERR3:      unknown --compress-sections value: zlib-gabi
 # ERR3-NEXT: --compress-sections: parse error, not 'section-glob=[none|zlib|zstd]'
 
+# RUN: not ld.lld a.o --compress-sections='a=zlib:' --compress-sections='a=zlib:-1' 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=ERR4 --implicit-check-not=error:
+# ERR4: error: --compress-sections: expected a non-negative integer compression level, but got ''
+# ERR4: error: --compress-sections: expected a non-negative integer compression level, but got '-1'
+
+## Invalid compression level for zlib.
+# RUN: not ld.lld a.o --compress-sections='.debug*=zlib:99' 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=ERR6 --implicit-check-not=error:
+# ERR6: error: --compress-sections: deflateInit2 returned -2
+
 .globl _start
 _start:
   ret
diff --git a/lld/test/ELF/compressed-debug-level.test b/lld/test/ELF/compressed-debug-level.test
index ee95f1267997..ce3a194bd7c2 100644
--- a/lld/test/ELF/compressed-debug-level.test
+++ b/lld/test/ELF/compressed-debug-level.test
@@ -2,22 +2,20 @@
 
 # RUN: yaml2obj %s -o %t.o
 
+## LLD uses zlib compression of level 1 by default. Unlike previous versions,
+## -O does not change the level.
 # RUN: ld.lld %t.o -o %t.default --compress-debug-sections=zlib
 # RUN: llvm-readelf --sections %t.default | FileCheck -check-prefixes=HEADER,LEVEL1 %s
 
 # RUN: ld.lld -O0 %t.o -o %t.O0 --compress-debug-sections=zlib
-# RUN: llvm-readelf --sections %t.O0 | FileCheck -check-prefixes=HEADER,LEVEL1 %s
 # RUN: cmp %t.default %t.O0
 
-# RUN: ld.lld -O1 %t.o -o %t.O1 --compress-debug-sections=zlib
-# RUN: llvm-readelf --sections %t.O1 | FileCheck -check-prefixes=HEADER,LEVEL1 %s
-# RUN: cmp %t.default %t.O1
-
 # RUN: ld.lld -O2 %t.o -o %t.O2 --compress-debug-sections=zlib
-# RUN: llvm-readelf --sections %t.O2 | FileCheck -check-prefixes=HEADER,LEVEL6 %s
+# RUN: cmp %t.default %t.O2
 
-## LLD uses zlib compression of level 1 when -O0, -O1 and level 6 when -O2.
-## Here we check how -O flag affects the size of compressed sections produced.
+## --compression-level specifies the level.
+# RUN: ld.lld %t.o -o %t.6 --compress-sections=.debug_info=zlib:6
+# RUN: llvm-readelf --sections %t.6 | FileCheck -check-prefixes=HEADER,LEVEL6 %s
 
 # HEADER: [Nr] Name        Type     Address  Off    Size
 # LEVEL1: [ 1] .debug_info PROGBITS 00000000 000094 00001{{[bc]}}
diff --git a/lld/test/ELF/lto/thinlto-emit-index.ll b/lld/test/ELF/lto/thinlto-emit-index.ll
index ba2ac2ceb689..03dfbc0200e9 100644
--- a/lld/test/ELF/lto/thinlto-emit-index.ll
+++ b/lld/test/ELF/lto/thinlto-emit-index.ll
@@ -76,8 +76,8 @@
 ; BACKEND1: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1: <VERSION
 ; BACKEND1: <FLAGS
-; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
-; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
 ; BACKEND1: <COMBINED
 ; BACKEND1: <COMBINED
 ; BACKEND1: </GLOBALVAL_SUMMARY_BLOCK
@@ -90,7 +90,7 @@
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
 ; BACKEND2-NEXT: <FLAGS
-; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
+; BACKEND2-NEXT: <VALUE_GUID {{.*}} op0=1 op1=3060885059 op2=1207956914
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
diff --git a/lld/test/ELF/lto/thinlto-index-only.ll b/lld/test/ELF/lto/thinlto-index-only.ll
index abf58ce5ea41..da60af80a004 100644
--- a/lld/test/ELF/lto/thinlto-index-only.ll
+++ b/lld/test/ELF/lto/thinlto-index-only.ll
@@ -103,8 +103,8 @@
 ; BACKEND1: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1: <VERSION
 ; BACKEND1: <FLAGS
-; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
-; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
 ; BACKEND1: <COMBINED
 ; BACKEND1: <COMBINED
 ; BACKEND1: </GLOBALVAL_SUMMARY_BLOCK
@@ -117,7 +117,7 @@
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
 ; BACKEND2-NEXT: <FLAGS
-; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
+; BACKEND2-NEXT: <VALUE_GUID {{.*}} op0=1 op1=3060885059 op2=1207956914
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
diff --git a/lld/test/ELF/mips-eh_frame-pic.s b/lld/test/ELF/mips-eh_frame-pic.s
index a84c36b0e5ec..c04dbdf57b08 100644
--- a/lld/test/ELF/mips-eh_frame-pic.s
+++ b/lld/test/ELF/mips-eh_frame-pic.s
@@ -36,8 +36,8 @@
 # RELOCS:            .rel{{a?}}.eh_frame {
 # ABS32-RELOCS-NEXT:   0x1C R_MIPS_32 .text
 # ABS64-RELOCS-NEXT:   0x1C R_MIPS_64/R_MIPS_NONE/R_MIPS_NONE .text
-# PIC64-RELOCS-NEXT:   0x1C R_MIPS_PC32/R_MIPS_NONE/R_MIPS_NONE <null>
-# PIC32-RELOCS-NEXT:   0x1C R_MIPS_PC32 <null>
+# PIC64-RELOCS-NEXT:   0x1C R_MIPS_PC32/R_MIPS_NONE/R_MIPS_NONE .L0
+# PIC32-RELOCS-NEXT:   0x1C R_MIPS_PC32 .L0
 # RELOCS-NEXT:       }
 
 # ABS64-EH-FRAME: Augmentation data: 0C
diff --git a/lld/test/ELF/riscv-branch.s b/lld/test/ELF/riscv-branch.s
index dbf39dc0bb8f..1a2b446b5a43 100644
--- a/lld/test/ELF/riscv-branch.s
+++ b/lld/test/ELF/riscv-branch.s
@@ -7,19 +7,19 @@
 # RUN: ld.lld %t.rv64.o --defsym foo=_start+4 --defsym bar=_start -o %t.rv64
 # RUN: llvm-objdump -d %t.rv32 | FileCheck %s --check-prefix=CHECK-32
 # RUN: llvm-objdump -d %t.rv64 | FileCheck %s --check-prefix=CHECK-64
-# CHECK-32: 63 02 00 00     beqz    zero, 0x110b8
-# CHECK-32: e3 1e 00 fe     bnez    zero, 0x110b4
-# CHECK-64: 63 02 00 00     beqz    zero, 0x11124
-# CHECK-64: e3 1e 00 fe     bnez    zero, 0x11120
+# CHECK-32: 00000263     beqz    zero, 0x110b8
+# CHECK-32: fe001ee3     bnez    zero, 0x110b4
+# CHECK-64: 00000263     beqz    zero, 0x11124
+# CHECK-64: fe001ee3     bnez    zero, 0x11120
 #
 # RUN: ld.lld %t.rv32.o --defsym foo=_start+0xffe --defsym bar=_start+4-0x1000 -o %t.rv32.limits
 # RUN: ld.lld %t.rv64.o --defsym foo=_start+0xffe --defsym bar=_start+4-0x1000 -o %t.rv64.limits
 # RUN: llvm-objdump -d %t.rv32.limits | FileCheck --check-prefix=LIMITS-32 %s
 # RUN: llvm-objdump -d %t.rv64.limits | FileCheck --check-prefix=LIMITS-64 %s
-# LIMITS-32:      e3 0f 00 7e     beqz    zero, 0x120b2
-# LIMITS-32-NEXT: 63 10 00 80     bnez    zero, 0x100b8
-# LIMITS-64:      e3 0f 00 7e     beqz    zero, 0x1211e
-# LIMITS-64-NEXT: 63 10 00 80     bnez    zero, 0x10124
+# LIMITS-32:      7e000fe3     beqz    zero, 0x120b2
+# LIMITS-32-NEXT: 80001063     bnez    zero, 0x100b8
+# LIMITS-64:      7e000fe3     beqz    zero, 0x1211e
+# LIMITS-64-NEXT: 80001063     bnez    zero, 0x10124
 
 # RUN: not ld.lld %t.rv32.o --defsym foo=_start+0x1000 --defsym bar=_start+4-0x1002 -o /dev/null 2>&1 | FileCheck --check-prefix=ERROR-RANGE %s
 # RUN: not ld.lld %t.rv64.o --defsym foo=_start+0x1000 --defsym bar=_start+4-0x1002 -o /dev/null 2>&1 | FileCheck --check-prefix=ERROR-RANGE %s
diff --git a/lld/test/ELF/riscv-call.s b/lld/test/ELF/riscv-call.s
index 5fef156df0bb..0e81e9b4710e 100644
--- a/lld/test/ELF/riscv-call.s
+++ b/lld/test/ELF/riscv-call.s
@@ -7,19 +7,19 @@
 # RUN: ld.lld %t.rv64.o --defsym foo=_start+8 --defsym bar=_start -o %t.rv64
 # RUN: llvm-objdump -d %t.rv32 | FileCheck %s
 # RUN: llvm-objdump -d %t.rv64 | FileCheck %s
-# CHECK:      97 00 00 00     auipc   ra, 0x0
-# CHECK-NEXT: e7 80 80 00     jalr    0x8(ra)
-# CHECK:      97 00 00 00     auipc   ra, 0x0
-# CHECK-NEXT: e7 80 80 ff     jalr    -0x8(ra)
+# CHECK:      00000097     auipc   ra, 0x0
+# CHECK-NEXT: 008080e7     jalr    0x8(ra)
+# CHECK:      00000097     auipc   ra, 0x0
+# CHECK-NEXT: ff8080e7     jalr    -0x8(ra)
 
 # RUN: ld.lld %t.rv32.o --defsym foo=_start+0x7ffff7ff --defsym bar=_start+8-0x80000800 -o %t.rv32.limits
 # RUN: ld.lld %t.rv64.o --defsym foo=_start+0x7ffff7ff --defsym bar=_start+8-0x80000800 -o %t.rv64.limits
 # RUN: llvm-objdump -d %t.rv32.limits | FileCheck --check-prefix=LIMITS %s
 # RUN: llvm-objdump -d %t.rv64.limits | FileCheck --check-prefix=LIMITS %s
-# LIMITS:      97 f0 ff 7f     auipc   ra, 0x7ffff
-# LIMITS-NEXT: e7 80 f0 7f     jalr    0x7ff(ra)
-# LIMITS-NEXT: 97 00 00 80     auipc   ra, 0x80000
-# LIMITS-NEXT: e7 80 00 80     jalr    -0x800(ra)
+# LIMITS:      7ffff097     auipc   ra, 0x7ffff
+# LIMITS-NEXT: 7ff080e7     jalr    0x7ff(ra)
+# LIMITS-NEXT: 80000097     auipc   ra, 0x80000
+# LIMITS-NEXT: 800080e7     jalr    -0x800(ra)
 
 # RUN: ld.lld %t.rv32.o --defsym foo=_start+0x7ffff800 --defsym bar=_start+8-0x80000801 -o %t
 # RUN: not ld.lld %t.rv64.o --defsym foo=_start+0x7ffff800 --defsym bar=_start+8-0x80000801 -o /dev/null 2>&1 | \
diff --git a/lld/test/ELF/riscv-hi20-lo12.s b/lld/test/ELF/riscv-hi20-lo12.s
index 85861432db0b..b9786f563f28 100644
--- a/lld/test/ELF/riscv-hi20-lo12.s
+++ b/lld/test/ELF/riscv-hi20-lo12.s
@@ -7,23 +7,23 @@
 # RUN: ld.lld %t.rv64.o --defsym foo=0 --defsym bar=42 -o %t.rv64
 # RUN: llvm-objdump -d %t.rv32 | FileCheck %s
 # RUN: llvm-objdump -d %t.rv64 | FileCheck %s
-# CHECK:      37 05 00 00     lui     a0, 0x0
-# CHECK-NEXT: 13 05 05 00     mv      a0, a0
-# CHECK-NEXT: 23 20 a5 00     sw      a0, 0x0(a0)
-# CHECK-NEXT: b7 05 00 00     lui     a1, 0x0
-# CHECK-NEXT: 93 85 a5 02     addi    a1, a1, 0x2a
-# CHECK-NEXT: 23 a5 b5 02     sw      a1, 0x2a(a1)
+# CHECK:      00000537     lui     a0, 0x0
+# CHECK-NEXT: 00050513     mv      a0, a0
+# CHECK-NEXT: 00a52023     sw      a0, 0x0(a0)
+# CHECK-NEXT: 000005b7     lui     a1, 0x0
+# CHECK-NEXT: 02a58593     addi    a1, a1, 0x2a
+# CHECK-NEXT: 02b5a523     sw      a1, 0x2a(a1)
 
 # RUN: ld.lld %t.rv32.o --defsym foo=0x7ffff7ff --defsym bar=0x7ffff800 -o %t.rv32.limits
 # RUN: ld.lld %t.rv64.o --defsym foo=0x7ffff7ff --defsym bar=0xffffffff7ffff800 -o %t.rv64.limits
 # RUN: llvm-objdump -d %t.rv32.limits | FileCheck --check-prefix=LIMITS %s
 # RUN: llvm-objdump -d %t.rv64.limits | FileCheck --check-prefix=LIMITS %s
-# LIMITS:      37 f5 ff 7f     lui     a0, 0x7ffff
-# LIMITS-NEXT: 13 05 f5 7f     addi    a0, a0, 0x7ff
-# LIMITS-NEXT: a3 2f a5 7e     sw      a0, 0x7ff(a0)
-# LIMITS-NEXT: b7 05 00 80     lui     a1, 0x80000
-# LIMITS-NEXT: 93 85 05 80     addi    a1, a1, -0x800
-# LIMITS-NEXT: 23 a0 b5 80     sw      a1, -0x800(a1)
+# LIMITS:      7ffff537     lui     a0, 0x7ffff
+# LIMITS-NEXT: 7ff50513     addi    a0, a0, 0x7ff
+# LIMITS-NEXT: 7ea52fa3     sw      a0, 0x7ff(a0)
+# LIMITS-NEXT: 800005b7     lui     a1, 0x80000
+# LIMITS-NEXT: 80058593     addi    a1, a1, -0x800
+# LIMITS-NEXT: 80b5a023     sw      a1, -0x800(a1)
 
 # RUN: not ld.lld %t.rv64.o --defsym foo=0x7ffff800 --defsym bar=0xffffffff7ffff7ff -o /dev/null 2>&1 | FileCheck --check-prefix ERROR %s
 # ERROR: relocation R_RISCV_HI20 out of range: 524288 is not in [-524288, 524287]; references 'foo'
diff --git a/lld/test/ELF/riscv-jal.s b/lld/test/ELF/riscv-jal.s
index cd3b842aad60..2129e4454706 100644
--- a/lld/test/ELF/riscv-jal.s
+++ b/lld/test/ELF/riscv-jal.s
@@ -7,19 +7,19 @@
 # RUN: ld.lld %t.rv64.o --defsym foo=_start+4 --defsym bar=_start -o %t.rv64
 # RUN: llvm-objdump -d %t.rv32 | FileCheck %s --check-prefix=CHECK-32
 # RUN: llvm-objdump -d %t.rv64 | FileCheck %s --check-prefix=CHECK-64
-# CHECK-32: 6f 00 40 00    j   0x110b8
-# CHECK-32: ef f0 df ff    jal 0x110b4
-# CHECK-64: 6f 00 40 00    j   0x11124
-# CHECK-64: ef f0 df ff    jal 0x11120
+# CHECK-32: 0040006f    j   0x110b8
+# CHECK-32: ffdff0ef    jal 0x110b4
+# CHECK-64: 0040006f    j   0x11124
+# CHECK-64: ffdff0ef    jal 0x11120
 
 # RUN: ld.lld %t.rv32.o --defsym foo=_start+0xffffe --defsym bar=_start+4-0x100000 -o %t.rv32.limits
 # RUN: ld.lld %t.rv64.o --defsym foo=_start+0xffffe --defsym bar=_start+4-0x100000 -o %t.rv64.limits
 # RUN: llvm-objdump -d %t.rv32.limits | FileCheck --check-prefix=LIMITS-32 %s
 # RUN: llvm-objdump -d %t.rv64.limits | FileCheck --check-prefix=LIMITS-64 %s
-# LIMITS-32:      6f f0 ff 7f j   0x1110b2
-# LIMITS-32-NEXT: ef 00 00 80 jal 0xfff110b8
-# LIMITS-64:      6f f0 ff 7f j   0x11111e
-# LIMITS-64-NEXT: ef 00 00 80 jal 0xfffffffffff11124
+# LIMITS-32:      7ffff06f j   0x1110b2
+# LIMITS-32-NEXT: 800000ef jal 0xfff110b8
+# LIMITS-64:      7ffff06f j   0x11111e
+# LIMITS-64-NEXT: 800000ef jal 0xfffffffffff11124
 
 # RUN: not ld.lld %t.rv32.o --defsym foo=_start+0x100000 --defsym bar=_start+4-0x100002 -o /dev/null 2>&1 | FileCheck --check-prefix=ERROR-RANGE %s
 # RUN: not ld.lld %t.rv64.o --defsym foo=_start+0x100000 --defsym bar=_start+4-0x100002 -o /dev/null 2>&1 | FileCheck --check-prefix=ERROR-RANGE %s
diff --git a/lld/test/MachO/thinlto-emit-index.ll b/lld/test/MachO/thinlto-emit-index.ll
index 6f8d552f1435..7a3332ab8c93 100644
--- a/lld/test/MachO/thinlto-emit-index.ll
+++ b/lld/test/MachO/thinlto-emit-index.ll
@@ -76,8 +76,8 @@
 ; BACKEND1: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1: <VERSION
 ; BACKEND1: <FLAGS
-; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
-; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
 ; BACKEND1: <COMBINED
 ; BACKEND1: <COMBINED
 ; BACKEND1: </GLOBALVAL_SUMMARY_BLOCK
@@ -90,7 +90,7 @@
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
 ; BACKEND2-NEXT: <FLAGS
-; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
+; BACKEND2-NEXT: <VALUE_GUID {{.*}} op0=1 op1=3060885059 op2=1207956914
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
diff --git a/lld/test/MachO/thinlto-index-only.ll b/lld/test/MachO/thinlto-index-only.ll
index a97cd126ad5b..4844e715f492 100644
--- a/lld/test/MachO/thinlto-index-only.ll
+++ b/lld/test/MachO/thinlto-index-only.ll
@@ -75,8 +75,8 @@
 ; BACKEND1: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1: <VERSION
 ; BACKEND1: <FLAGS
-; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
-; BACKEND1: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; BACKEND1: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
 ; BACKEND1: <COMBINED
 ; BACKEND1: <COMBINED
 ; BACKEND1: </GLOBALVAL_SUMMARY_BLOCK
@@ -89,7 +89,7 @@
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
 ; BACKEND2-NEXT: <FLAGS
-; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
+; BACKEND2-NEXT: <VALUE_GUID {{.*}} op0=1 op1=3060885059 op2=1207956914
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
diff --git a/lld/test/wasm/init-fini.ll b/lld/test/wasm/init-fini.ll
index 3d2e9a78043e..ef2f41f96e89 100644
--- a/lld/test/wasm/init-fini.ll
+++ b/lld/test/wasm/init-fini.ll
@@ -78,7 +78,7 @@ entry:
 ; CHECK-NEXT:         Body:            10041005100A100F1012100F10141004100C100F10161002100E0B
 ; CHECK:            - Index:           22
 ; CHECK-NEXT:         Locals:
-; CHECK-NEXT:         Body:            02404186808080004100418088808000108080808000450D0000000B0B
+; CHECK-NEXT:         Body:            02404186808080004100418088808000108080808000450D00000B0B
 ; CHECK-NEXT:   - Type:            CUSTOM
 ; CHECK-NEXT:     Name:            name
 ; CHECK-NEXT:     FunctionNames:
diff --git a/lld/tools/lld/CMakeLists.txt b/lld/tools/lld/CMakeLists.txt
index 0f5daa78846b..8498a91597a9 100644
--- a/lld/tools/lld/CMakeLists.txt
+++ b/lld/tools/lld/CMakeLists.txt
@@ -42,3 +42,7 @@ endif()
 foreach(link ${LLD_SYMLINKS_TO_CREATE})
   add_lld_symlink(${link} lld)
 endforeach()
+
+if(LLVM_TOOL_LLVM_DRIVER_BUILD)
+  set_property(GLOBAL APPEND PROPERTY LLVM_DRIVER_HIDDEN_TOOL_ALIASES_lld ld)
+endif()
diff --git a/lldb/bindings/interface/SBValueDocstrings.i b/lldb/bindings/interface/SBValueDocstrings.i
index 6bab923e8b35..59fa807f5ec9 100644
--- a/lldb/bindings/interface/SBValueDocstrings.i
+++ b/lldb/bindings/interface/SBValueDocstrings.i
@@ -135,6 +135,26 @@ linked list."
 %feature("docstring", "Expands nested expressions like .a->b[0].c[1]->d."
 ) lldb::SBValue::GetValueForExpressionPath;
 
+%feature("docstring", "
+      Return the value as an address.  On failure, LLDB_INVALID_ADDRESS
+      will be returned.  On architectures like AArch64, where the
+      top (unaddressable) bits can be used for authentication,
+      memory tagging, or top byte ignore,  this method will return
+      the value with those top bits cleared.
+
+      GetValueAsUnsigned returns the actual value, with the
+      authentication/Top Byte Ignore/Memory Tagging Extension bits.
+
+      Calling this on a random value which is not a pointer is
+      incorrect.  Call GetType().IsPointerType() if in doubt.
+
+      An SB API program may want to show both the literal byte value
+      and the address it refers to in memory.  These two SBValue
+      methods allow SB API writers to behave appropriately for their
+      interface."
+) lldb::SBValue::GetValueAsAddress;
+
+
 %feature("doctstring", "
     Returns the number for children.
 
diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake
index a758261073ac..3c6223b015bb 100644
--- a/lldb/cmake/modules/LLDBConfig.cmake
+++ b/lldb/cmake/modules/LLDBConfig.cmake
@@ -7,6 +7,7 @@ set(LLDB_INCLUDE_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/include")
 
 set(LLDB_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(LLDB_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+set(LLDB_OBJ_DIR ${CMAKE_CURRENT_BINARY_DIR})
 
 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR)
   message(FATAL_ERROR
diff --git a/lldb/cmake/modules/LLDBFramework.cmake b/lldb/cmake/modules/LLDBFramework.cmake
index f915839f6b45..df2f8ddf54a3 100644
--- a/lldb/cmake/modules/LLDBFramework.cmake
+++ b/lldb/cmake/modules/LLDBFramework.cmake
@@ -71,6 +71,7 @@ endif()
 # At configuration time, collect headers for the framework bundle and copy them
 # into a staging directory. Later we can copy over the entire folder.
 file(GLOB public_headers ${LLDB_SOURCE_DIR}/include/lldb/API/*.h)
+set(generated_public_headers ${LLDB_OBJ_DIR}/include/lldb/API/SBLanguages.h)
 file(GLOB root_public_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-*.h)
 file(GLOB root_private_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-private*.h)
 list(REMOVE_ITEM root_public_headers ${root_private_headers})
@@ -80,6 +81,7 @@ find_program(unifdef_EXECUTABLE unifdef)
 set(lldb_header_staging ${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders)
 foreach(header
     ${public_headers}
+    ${generated_public_headers}
     ${root_public_headers})
 
   get_filename_component(basename ${header} NAME)
diff --git a/lldb/docs/index.rst b/lldb/docs/index.rst
index 6906566ea55e..7a27f6914fa8 100644
--- a/lldb/docs/index.rst
+++ b/lldb/docs/index.rst
@@ -158,6 +158,7 @@ interesting areas to contribute to lldb.
    resources/dataformatters
    resources/extensions
    resources/lldbgdbremote
+   resources/lldbplatformpackets
    resources/caveats
    resources/projects
    Public C++ API <https://lldb.llvm.org/cpp_reference/namespacelldb.html>
diff --git a/lldb/docs/lldb-platform-packets.txt b/lldb/docs/lldb-platform-packets.txt
deleted file mode 100644
index 4cf575e5ee8a..000000000000
--- a/lldb/docs/lldb-platform-packets.txt
+++ /dev/null
@@ -1,451 +0,0 @@
-Here is a brief overview of the packets that an lldb platform server
-needs to implement for the lldb testsuite to be run on a remote
-target device/system.
-
-These are almost all lldb extensions to the gdb-remote serial
-protocol.  Many of the vFile: packets are described to the "Host
-I/O Packets" detailed in the gdb-remote protocol documentation,
-although the lldb platform extensions include packets that are not
-defined there (vFile:size:, vFile:mode:, vFile:symlink, vFile:chmod:).
-Most importantly, the flags that lldb passes to vFile:open: are 
-incompatible with the flags that gdb specifies.
-
-
-//----------------------------------------------------------------------
-// QStartNoAckMode
-//
-// BRIEF
-//   A request to stop sending ACK packets for each properly formatted packet.
-//
-// EXAMPLE 
-//   A platform session will typically start like this:
-//
-//   receive: +$QStartNoAckMode#b0
-//   send:    +       <-- ACKing the properly formatted QStartNoAckMode packet
-//   send:    $OK#9a
-//   receive: +       <-- Our OK packet getting ACKed
-//
-//   ACK mode is now disabled.
-
-//----------------------------------------------------------------------
-// qHostInfo
-//
-// BRIEF
-//   Describe the hardware and OS of the target system
-//
-// EXAMPLE
-//
-//  receive: qHostInfo
-//  send:    cputype:16777228;cpusubtype:1;ostype:ios;watchpoint_exceptions_received:before;os_version:12.1;vendor:apple;default_packet_timeout:5;
-//
-//  All numbers are base 10, os_version is a string that will be parsed as major.minor.patch.
-
-//----------------------------------------------------------------------
-// qModuleInfo
-//
-// BRIEF
-//   Report information about a binary on the target system
-//
-// EXAMPLE
-//  receive: qModuleInfo:2f62696e2f6c73;
-//
-// FIXME finish this packet description, v. GDBRemoteCommunicationServerCommon::Handle_qModuleInfo
-
-
-//----------------------------------------------------------------------
-// qGetWorkingDir
-//
-// BRIEF
-//   Get the current working directory of the platform stub in
-//   ASCII hex encoding.
-//
-// EXAMPLE
-// 
-//  receive: qGetWorkingDir
-//  send:    2f4170706c65496e7465726e616c2f6c6c64622f73657474696e67732f342f5465737453657474696e67732e746573745f646973617373656d626c65725f73657474696e6773 
-
-
-
-//----------------------------------------------------------------------
-// QSetWorkingDir:
-//
-// BRIEF
-//   Set the current working directory of the platform stub in
-//   ASCII hex encoding.
-//
-// EXAMPLE
-// 
-//  receive: QSetWorkingDir:2f4170706c65496e7465726e616c2f6c6c64622f73657474696e67732f342f5465737453657474696e67732e746573745f646973617373656d626c65725f73657474696e6773 
-//  send:    OK
-
-//----------------------------------------------------------------------
-// qPlatform_mkdir:
-//
-// BRIEF
-//   Create a directory on the target system.
-//
-// EXAMPLE
-// 
-//  receive: qPlatform_mkdir:000001fd,2f746d702f6131
-//  send:    F0
-//
-//  request packet has the fields:
-//     1. mode bits in base 16
-//     2. file path in ascii-hex encoding
-//
-//  response is F followed by the return value of the mkdir() call,
-//  base 16 encoded.
-
-//----------------------------------------------------------------------
-// qPlatform_shell:
-//
-// BRIEF
-//   Run a shell command on the target system, return the output.
-//
-// EXAMPLE
-// 
-//  receive: qPlatform_shell:6c73202f746d702f,0000000a
-//  send:    F,0,0,<OUTPUT>
-//
-//  request packet has the fields:
-//     1. shell command ascii-hex encoded
-//     2. timeout 
-//     3. {optional} working directory ascii-hex encoded
-//
-//  Response is F followed by the return value of the command (base 16),
-//  followed by another number, followed by the output of the command
-/   in binary-escaped-data encoding.
-
-//----------------------------------------------------------------------
-// qLaunchGDBServer
-//
-// BRIEF
-//   Start a gdbserver process (gdbserver, debugserver, lldb-server)
-//   on the target system.
-//
-// EXAMPLE
-// 
-//  receive: qLaunchGDBServer;host:<HOSTNAME_LLDB_IS_ON>;
-//  send:    pid:1337;port:43001;
-//
-//  request packet hostname field is not ascii-hex encoded. Hostnames 
-//  don't have $ or # characters in them.
-//
-//  response to the packet is the pid of the newly launched gdbserver,
-//  and the port it is listening for a connection on.
-//
-//  When the testsuite is running, lldb may use the pid to kill off a 
-//  debugserver that doesn't seem to be responding, etc.
-
-//----------------------------------------------------------------------
-// qKillSpawnedProcess:
-//
-// BRIEF
-//   Kill a process running on the target system.
-//
-// EXAMPLE
-// 
-//  receive: qKillSpawnedProcess:1337
-//  send:    OK
-//
-//  The request packet has the process ID in base 10.
-
-//----------------------------------------------------------------------
-// qProcessInfoPID:
-//
-// BRIEF
-//   Gather information about a process running on the target
-//
-// EXAMPLE
-// 
-//  receive: qProcessInfoPID:71964
-//  send:    pid:71964;name:612e6f7574;
-//
-//  The request packet has the pid encoded in base 10.
-//
-//  The reply has semicolon-separated name:value fields, two are
-//  shown here.  pid is base 10 encoded.  name is ascii hex encoded.
-//  lldb-server can reply with many additional fields, but I think
-//  this is enough for the testsuite.
-
-//----------------------------------------------------------------------
-// qfProcessInfo:
-//
-// BRIEF
-//   Search the process table for processes matching criteria, 
-//   respond with them in multiple packets.
-//
-// EXAMPLE
-// 
-//  receive: qfProcessInfo:name_match:equals;name:6e6f70726f6365737365786973747377697468746869736e616d65;
-//  send:    pid:3500;name:612e6f7574;
-//
-//  The request packet has a criteria to search for, followed by
-//  a specific name.  
-//
-//  KEY           VALUE     DESCRIPTION
-//  ===========   ========  ================================================
-//  "name"        ascii-hex An ASCII hex string that contains the name of 
-//                          the process that will be matched.
-//  "name_match"  enum      One of: "equals", "starts_with", "ends_with", 
-//                          "contains" or "regex"
-//  "pid"         integer   A string value containing the decimal process ID
-//  "parent_pid"  integer   A string value containing the decimal parent 
-//                          process ID
-//  "uid"         integer   A string value containing the decimal user ID
-//  "gid"         integer   A string value containing the decimal group ID
-//  "euid"        integer   A string value containing the decimal effective user ID
-//  "egid"        integer   A string value containing the decimal effective group ID
-//  "all_users"   bool      A boolean value that specifies if processes should
-//                          be listed for all users, not just the user that the 
-//                          platform is running as
-//  "triple"      ascii-hex An ASCII hex target triple string ("x86_64", 
-//                          "x86_64-apple-macosx", "armv7-apple-ios")
-//
-//  If no criteria is given, qfProcessInfo will request a list of every process.
-//
-//  The lldb testsuite currently only uses name_match:equals and the
-//  no-criteria mode to list every process.
-//
-//  The response should include any information about the process that
-//  can be retrieved in semicolon-separated name:value fields.
-//  In this example, pid is base 10, name is ascii-hex encoded.
-//  The testsuite seems to only require these two.
-//
-//  This packet only responds with one process.  To get further matches to
-//  the search, qsProcessInfo should be sent.
-//
-//  If no process match is found, Exx should be returned.
-//
-//  Sample packet/response:
-//  send packet: $qfProcessInfo#00
-//  read packet: $pid:60001;ppid:59948;uid:7746;gid:11;euid:7746;egid:11;name:6c6c6462;triple:7838365f36342d6170706c652d6d61636f7378;#00
-//  send packet: $qsProcessInfo#00
-//  read packet: $pid:59992;ppid:192;uid:7746;gid:11;euid:7746;egid:11;name:6d64776f726b6572;triple:7838365f36342d6170706c652d6d61636f7378;#00
-//  send packet: $qsProcessInfo#00
-//  read packet: $E04#00
-
-//----------------------------------------------------------------------
-// qsProcessInfo
-//
-// BRIEF
-//   Return the next process info found by the most recent qfProcessInfo:
-//   packet.
-//
-// EXAMPLE
-// 
-//  Continues to return the results of the qfProcessInfo.  Once all matches
-//  have been sent, Exx is returned to indicate end of matches.
-
-//----------------------------------------------------------------------
-// qPathComplete
-//
-// BRIEF
-//   Get a list of matched disk files/directories by passing a boolean flag
-//   and a partial path.
-//
-// EXAMPLE
-//
-//   receive: qPathComplete:0,6d61696e
-//   send:    M6d61696e2e637070
-//   receive: qPathComplete:1,746573
-//   send:    M746573742f,74657374732f
-//
-//   If the first argument is zero, the result should contain all
-//   files (including directories) starting with the given path. If the
-//   argument is one, the result should contain only directories.
-//
-//   The result should be a comma-separated list of hex-encoded paths.
-//   Paths denoting a directory should end with a directory separator ('/' or '\').
-
-//----------------------------------------------------------------------
-// vFile:size:
-//
-// BRIEF
-//   Get the size of a file on the target system, filename in ASCII hex.
-//
-// EXAMPLE
-// 
-//  receive: vFile:size:2f746d702f61
-//  send:    Fc008
-//
-//  response is "F" followed by the file size in base 16.
-//  "F-1,errno" with the errno if an error occurs, base 16.
-
-
-//----------------------------------------------------------------------
-// vFile:mode:
-//
-// BRIEF
-//   Get the mode bits of a file on the target system, filename in ASCII hex.
-//
-// EXAMPLE
-// 
-//  receive: vFile:mode:2f746d702f61
-//  send:    F1ed
-//
-//  response is "F" followed by the mode bits in base 16, this 0x1ed would 
-//  correspond to 0755 in octal.  
-//  "F-1,errno" with the errno if an error occurs, base 16.
-
-//----------------------------------------------------------------------
-// vFile:unlink:
-//
-// BRIEF
-//   Remove a file on the target system.
-//
-// EXAMPLE
-// 
-//  receive: vFile:unlink:2f746d702f61
-//  send:    F0
-//
-//  Argument is a file path in ascii-hex encoding.
-//  Response is "F" plus the return value of unlink(), base 16 encoding.
-//  Return value may optionally be followed by a comma and the base16
-//  value of errno if unlink failed.
-
-//----------------------------------------------------------------------
-// vFile:symlink:
-//
-// BRIEF
-//   Create a symbolic link (symlink, soft-link) on the target system.
-//
-// EXAMPLE
-// 
-//  receive: vFile:symlink:<SRC-FILE>,<DST-NAME>
-//  send:    F0,0
-//
-//  Argument file paths are in ascii-hex encoding.
-//  Response is "F" plus the return value of symlink(), base 16 encoding, 
-//  optionally followed by the value of errno if it failed, also base 16.
-
-//----------------------------------------------------------------------
-// vFile:chmod:
-// qPlatform_chmod:
-//
-// BRIEF
-//   Change the permission mode bits on a file on the target
-//
-// EXAMPLE
-// 
-//  receive: vFile:chmod:180,2f746d702f61
-//  send:    F0
-//
-//  Arguments are the mode bits to set, base 16, and a file path in 
-//  ascii-hex encoding.
-//  Response is "F" plus the return value of chmod(), base 16 encoding.
-//
-//  I don't know why there are two packets for the same thing, v.
-//  vFile:chmod:.
-
-//----------------------------------------------------------------------
-// vFile:chmod:
-//
-// BRIEF
-//   Change the permission mode bits on a file on the target
-//
-// EXAMPLE
-// 
-//  receive: vFile:chmod:180,2f746d702f61
-//  send:    F0
-//
-//  Arguments are the mode bits to set, base 16, and a file path in 
-//  ascii-hex encoding.
-//  Response is "F" plus the return value of chmod(), base 10 encoding.
-
-
-//----------------------------------------------------------------------
-// vFile:open:
-//
-// BRIEF
-//   Open a file on the remote system and return the file descriptor of it.
-//
-// EXAMPLE
-// 
-//  receive: vFile:open:2f746d702f61,00000001,00000180
-//  send:    F8
-//
-//  request packet has the fields:
-//     1. ASCII hex encoded filename
-//     2. flags passed to the open call, base 16.
-//        Note that these are not the oflags that open(2) takes, but
-//        are the constant values in enum OpenOptions from lldb's File.h
-//     3. mode bits, base 16
-//  
-//  response is F followed by the opened file descriptor in base 16.
-//  "F-1,errno" with the errno if an error occurs, base 16.
-//
-//----------------------------------------------------------------------
-// vFile:close:
-//
-// BRIEF
-//   Close a previously opened file descriptor.
-//
-// EXAMPLE
-// 
-//  receive: vFile:close:7
-//  send:    F0
-//
-//  File descriptor is in base 16.
-//  "F-1,errno" with the errno if an error occurs,
-//  errno is base 16.
-
-
-//----------------------------------------------------------------------
-// vFile:pread:
-//
-// BRIEF
-//   Read data from an opened file descriptor.
-//
-// EXAMPLE
-// 
-//  receive: vFile:pread:7,1024,0
-//  send:    F4;a'b\00
-//
-//  request packet has the fields:
-//     1. file descriptor, base 16
-//     2. number of bytes to be read, base 16
-//     3. offset into file to start from, base 16
-//
-//  Response is F, followed by the number of bytes read (base 16), a
-//  semicolon, followed by the data in the binary-escaped-data encoding.
-
-
-//----------------------------------------------------------------------
-// vFile:pwrite:
-//
-// BRIEF
-//   Write data to a previously opened file descriptor.
-//
-// EXAMPLE
-// 
-//  receive: vFile:pwrite:8,0,\cf\fa\ed\fe\0c\00\00
-//  send:    F1024
-//
-//  request packet has the fields:
-//     1. file descriptor, base 16
-//     2. offset into file to start from, base 16
-//     3. binary-escaped-data to be written
-//
-//  Response is F, followed by the number of bytes written (base 16)
-
-
-
-
-
-Finally, the platform must be able to launch processes so that debugserver
-can attach to them.  To do this, the following packets should be handled:
-
-QSetDisableASLR
-QSetDetachOnError
-QSetSTDOUT
-QSetSTDERR
-QSetSTDIN
-QEnvironment
-QEnvironmentHexEncoded
-A
-qLaunchSuccess
-qProcessInfo
-
-Most of these are documented in the standard gdb-remote protocol
-and/or the lldb-gdb-remote.txt documentation.
diff --git a/lldb/docs/resources/lldbgdbremote.md b/lldb/docs/resources/lldbgdbremote.md
index cbe5c766d61e..1467723fb79d 100644
--- a/lldb/docs/resources/lldbgdbremote.md
+++ b/lldb/docs/resources/lldbgdbremote.md
@@ -1,14 +1,19 @@
 # GDB Remote Protocol Extensions
 
 LLDB has added new GDB server packets to better support multi-threaded and
-remote debugging.
+remote debugging. These extend the
+[protocol defined by GDB ](https://sourceware.org/gdb/current/onlinedocs/gdb.html/Packets.html#Packets) (and [this page](https://sourceware.org/gdb/current/onlinedocs/gdb.html/Host-I_002fO-Packets.html#Host-I_002fO-Packets) for `vFile` packets).
 
-Why? Normally you need to start the correct GDB and the
-correct GDB server when debugging. If you have mismatch, then things go wrong
-very quickly. LLDB makes extensive use of the GDB remote protocol and we
-wanted to make sure that the experience was a bit more dynamic where we can
-discover information about a remote target without having to know anything up
-front.
+If a packet is restated here it is because LLDB's version has some behaviour
+difference to GDB's version, or it provides some context for a following LLDB
+extension packet.
+
+Why did we add these? The most common reason is flexibility. Normally you need
+to start the correct GDB and the correct GDB server when debugging. If you have
+mismatch, then things go wrong very quickly. LLDB makes extensive use of the GDB
+remote protocol and we wanted to make sure that the experience was a bit more
+dynamic where we can discover information about a remote target without having
+to know anything up front.
 
 We also ran into performance issues with the existing GDB remote
 protocol that can be overcome when using a reliable communications layer.
@@ -22,95 +27,53 @@ We prefer to be able to dynamically determine what kind of architecture, OS and
 vendor we are debugging, as well as how things are laid out when it comes to
 the thread register contexts.
 
-Below are the details on the new packets we have added above and beyond the
-standard GDB remote protocol packets.
-
-## QStartNoAckMode
-
-### Brief
-
-Try to enable no ACK mode to skip sending ACKs and NACKs.
-
-### Priority To Implement
-
-High. Any GDB remote server that can implement this should if the
-connection is reliable. This improves packet throughput and increases
-the performance of the connection.
+## _M\<size\>,\<permissions\>
 
-### Description
+Allocate memory on the remote target with the specified size and
+permissions.
 
-Having to send an ACK/NACK after every packet slows things down a bit, so we
-have a way to disable ACK packets to minimize the traffic for reliable
-communication interfaces (like sockets). Below GDB or LLDB will send this
-packet to try and disable ACKs. All lines that start with "send packet: " are
-from GDB/LLDB, and all lines that start with "read packet: " are from the GDB
-remote server:
+The allocate memory packet starts with `_M<size>,<permissions>`. It returns a
+raw big endian address value, or an empty response for unimplemented, or `EXX` for an error
+code. The packet is formatted as:
 ```
-send packet: $QStartNoAckMode#b0
-read packet: +
-read packet: $OK#9a
-send packet: +
+char packet[256];
+int packet_len;
+packet_len = ::snprintf (
+    packet,
+    sizeof(packet),
+    "_M%zx,%s%s%s",
+    (size_t)size,
+    permissions & lldb::ePermissionsReadable ? "r" : "",
+    permissions & lldb::ePermissionsWritable ? "w" : "",
+    permissions & lldb::ePermissionsExecutable ? "x" : "");
 ```
 
-## QSupported
-
-### Brief
-
-Query the GDB remote server for features it supports
-
-### Priority To Implement
-
-Optional.
-
-### Description
-
-QSupported is a standard GDB Remote Serial Protocol packet, but
-there are several additions to the response that lldb can parse.
-They are not all listed here.
-
-An example exchange:
-```
-send packet: qSupported:xmlRegisters=i386,arm,mips,arc;multiprocess+;fork-events+;vfork-events+
+You request a size and give the permissions. This packet does NOT need to be
+implemented if you don't want to support running JITed code. The return value
+is just the address of the newly allocated memory as raw big endian hex bytes.
 
-read packet: qXfer:features:read+;PacketSize=20000;qEcho+;native-signals+;SupportedCompressions=lzfse,zlib-deflate,lz4,lzma;SupportedWatchpointTypes=aarch64-mask,aarch64-bas;
-```
+**Priority To Implement:** High if you want LLDB to be able to JIT code and run
+that code. JIT code also needs data which is also allocated and tracked. Low if
+you don't support running JIT'ed code.
 
-In the example above, three lldb extensions are shown:
+## _m\<addr\>
 
-  * `PacketSize=20000`
-    * The base 16 maximum packet size that the stub can handle.
-  * `SupportedCompressions=<item,item,...>`
-    * A list of compression types that the stub can use to compress packets
-    when the QEnableCompression packet is used to request one of them.
-  * `SupportedWatchpointTypes=<item,item,...>`
-    * A list of watchpoint types that this stub can manage. Currently defined 
-      names are:
-        * `x86_64` - 64-bit x86-64 watchpoints (1, 2, 4, 8 byte watchpoints
-          aligned to those amounts)
-        * `aarch64-bas`  AArch64 Byte Address Select watchpoints
-                     (any number of contiguous bytes within a doubleword)
-        * `aarch64-mask` AArch64 MASK watchpoints
-                     (any power-of-2 region of memory from 8 to 2GB, aligned)
+Deallocate memory that was previously allocated using an allocate
+memory pack.
 
-      If nothing is specified, lldb will default to sending power-of-2
-      watchpoints, up to a pointer size, `sizeof(void*)`, a reasonable
-      baseline assumption.
+The deallocate memory packet is `_m<addr>` where you pass in the address you
+got back from a previous call to the allocate memory packet. It returns `OK`
+if the memory was successfully deallocated, or `EXX`" for an error, or an
+empty response if not supported.
 
+**Priority To Implement:** High if you want LLDB to be able to JIT code and run
+that code. JIT code also needs data which is also allocated and tracked. Low if
+you don't support running JIT'ed code.
 
 ## "A" - launch args packet
 
-### Brief
-
 Launch a program using the supplied arguments
 
-### Priority To Implement
-
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process.
-
-### Description
-
 We have added support for the "set program arguments" packet where we can
 start a connection to a remote server and then later supply the path to the
 executable and the arguments to use when executing:
@@ -130,224 +93,307 @@ The above packet helps when you have remote debugging abilities where you
 could launch a process on a remote host, this isn't needed for bare board
 debugging.
 
-
-## QEnvironment:NAME=VALUE
-
-### Brief
-
-Setup the environment up for a new child process that will soon be
-launched using the "A" packet.
-
-NB: key/value pairs are sent as-is so gdb-remote protocol meta characters
-(e.g. `#` or `$`) are not acceptable.  If any non-printable or
-metacharacters are present in the strings, `QEnvironmentHexEncoded`
-should be used instead if it is available.  If you don't want to
-scan the environment strings before sending, prefer
-the `QEnvironmentHexEncoded` packet over `QEnvironment`, if it is
-available.
-
-### Priority To Implement
-
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
+**Priority To Implement:** Low. Only needed if the remote target wants to launch
+a target after making a connection to a GDB server that isn't already connected to
 an inferior process.
 
-### Description
+## "D" - Detach and stay stopped
 
-Both GDB and LLDB support passing down environment variables. Is it ok to
-respond with a `$#00` (unimplemented):
+We extended the "D" packet to specify that the monitor should keep the
+target suspended on detach.  The normal behavior is to resume execution
+on detach.  We will send:
 ```
-send packet: $QEnvironment:ACK_COLOR_FILENAME=bold yellow#00
-read packet: $OK#00
+qSupportsDetachAndStayStopped:
 ```
-This packet can be sent one or more times _prior_ to sending a "A" packet.
-
-## QEnvironmentHexEncoded:HEX-ENCODING(NAME=VALUE)
-
-### Brief
 
-Setup the environment up for a new child process that will soon be
-launched using the "A" packet.
-
-The only difference between this packet and `QEnvironment` is that the
-environment key-value pair is ascii hex encoded for transmission.
-This allows values with gdb-remote metacharacters like `#` to be sent.
-
-### Priority To Implement
-
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process.
-
-### Description
-
-Both GDB and LLDB support passing down environment variables. Is it ok to
-respond with a `$#00` (unimplemented):
+to query whether the monitor supports the extended detach, and if it does,
+when we want the monitor to detach but not resume the target, we will
+send:
 ```
-send packet: $QEnvironment:41434b5f434f4c4f525f46494c454e414d453d626f6c642379656c6c6f77#00
-read packet: $OK#00
+D1
+```
+In any case, if we want the normal detach behavior we will just send:
+```
+D
 ```
-This packet can be sent one or more times _prior_ to sending a "A" packet.
 
-## QEnableErrorStrings
+## jGetDyldProcessState
 
-### Brief
+This packet fetches the process launch state, as reported by libdyld on
+Darwin systems, most importantly to indicate when the system libraries
+have initialized sufficiently to safely call utility functions.
 
-This packet enables reporting of Error strings in remote packet
-replies from the server to client. If the server supports this
-feature, it should send an OK response. The client can expect the
-following error replies if this feature is enabled in the server:
 ```
-EXX;AAAAAAAAA
+LLDB SENDS: jGetDyldProcessState
+STUB REPLIES: {"process_state_value":48,"process_state string":"dyld_process_state_libSystem_initialized"}
 ```
-where `AAAAAAAAA` will be a hex encoded ASCII string.
-`XX`` is hex encoded byte number.
 
-It must be noted that even if the client has enabled reporting
-strings in error replies, it must not expect error strings to all
-error replies.
+**Priority To Implement:** Low. This packet is needed to prevent lldb's utility
+functions for scanning the Objective-C class list from running very early in
+process startup.
 
-### Priority To Implement
+## jGetLoadedDynamicLibrariesInfos
 
-Low. Only needed if the remote target wants to provide strings that
-are human readable along with an error code.
+This packet asks the remote debug stub to send the details about libraries
+being added/removed from the process as a performance optimization.
 
-### Example
+There are two ways this packet can be used.  Both return a dictionary of
+binary images formatted the same way.
 
+One requests information on all shared libraries:
 ```
-send packet: $QEnableErrorStrings
-read packet: $OK#00
+jGetLoadedDynamicLibrariesInfos:{"fetch_all_solibs":true}
 ```
+with an optional `"report_load_commands":false` which can be added, asking
+that only the dyld SPI information (load addresses, filenames) be returned.
+The default behavior is that debugserver scans the mach-o header and load
+commands of each binary, and returns it in the JSON reply.
 
-## QSetSTDIN:\<ascii-hex-path\> / QSetSTDOUT:\<ascii-hex-path\> / QSetSTDERR:\<ascii-hex-path\>
-
-### Brief
-
-Setup where STDIN, STDOUT, and STDERR go prior to sending an "A"
-packet.
-
-### Priority To Implement
-
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process.
+And the second requests information about a list of shared libraries, given their load addresses:
+```
+jGetLoadedDynamicLibrariesInfos:{"solib_addresses":[8382824135,3258302053,830202858503]}
+```
 
-### Description
+The second call is both a performance optimization (instead of having lldb read the mach-o header/load commands
+out of memory with generic read packets) but also adds additional information in the form of the
+filename of the shared libraries (which is not available in the mach-o header/load commands.)
 
-When launching a program through the GDB remote protocol with the "A" packet,
-you might also want to specify where stdin/out/err go:
+An example using the OS X 10.11 style call:
 ```
-QSetSTDIN:<ascii-hex-path>
-QSetSTDOUT:<ascii-hex-path>
-QSetSTDERR:<ascii-hex-path>
+LLDB SENDS: jGetLoadedDynamicLibrariesInfos:{"image_count":1,"image_list_address":140734800075128}
+STUB REPLIES: ${"images":[{"load_address":4294967296,"mod_date":0,"pathname":"/tmp/a.out","uuid":"02CF262C-ED6F-3965-9E14-63538B465CFF","mach_header":{"magic":4277009103,"cputype":16777223,"cpusubtype":18446744071562067971,"filetype":2},"segments":{"name":"__PAGEZERO","vmaddr":0,"vmsize":4294967296,"fileoff":0,"filesize":0,"maxprot":0},{"name":"__TEXT","vmaddr":4294967296,"vmsize":4096,"fileoff":0,"filesize":4096,"maxprot":7},{"name":"__LINKEDIT","vmaddr":4294971392,"vmsize":4096,"fileoff":4096,"filesize":152,"maxprot":7}}]}#00
 ```
-These packets must be sent  _prior_ to sending a "A" packet.
 
-## QSetWorkingDir:\<ascii-hex-path\>
+Or pretty-printed:
+```
+STUB REPLIES: ${"images":
+                [
+                    {"load_address":4294967296,
+                     "mod_date":0,
+                     "pathname":"/tmp/a.out",
+                     "uuid":"02CF262C-ED6F-3965-9E14-63538B465CFF",
+                     "mach_header":
+                        {"magic":4277009103,
+                         "cputype":16777223,
+                         "cpusubtype":18446744071562067971,
+                         "filetype":2
+                         },
+                     "segments":
+                      [
+                        {"name":"__PAGEZERO",
+                         "vmaddr":0,
+                         "vmsize":4294967296,
+                         "fileoff":0,
+                         "filesize":0,
+                         "maxprot":0
+                        },
+                        {"name":"__TEXT",
+                         "vmaddr":4294967296,
+                         "vmsize":4096,
+                         "fileoff":0,
+                         "filesize":4096,
+                         "maxprot":7
+                        },
+                        {"name":"__LINKEDIT",
+                         "vmaddr":4294971392,
+                         "vmsize":4096,
+                         "fileoff":4096,
+                         "filesize":152,
+                         "maxprot":7
+                        }
+                      ]
+                    }
+                ]
+            }
+```
 
-### Brief
+This is similar to the `qXfer:libraries:read` packet, and it could
+be argued that it should be merged into that packet.  A separate
+packet was created primarily because lldb needs to specify the
+number of images to be read and the address from which the initial
+information is read.  Also the XML DTD would need to be extended
+quite a bit to provide all the information that the `DynamicLoaderMacOSX`
+would need to work correctly on this platform.
 
-Set the working directory prior to sending an "A" packet.
+**Priority To Implement:**
 
-### Priority To Implement
+On OS X 10.11, iOS 9, tvOS 9, watchOS 2 and older: Low.  If this packet is absent,
+lldb will read the Mach-O headers/load commands out of memory.
+On macOS 10.12, iOS 10, tvOS 10, watchOS 3 and newer: High.  If this packet is absent,
+lldb will not know anything about shared libraries in the inferior, or where the main
+executable loaded.
 
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process.
+## jGetSharedCacheInfo
 
-### Description
+This packet asks the remote debug stub to send the details about the inferior's
+shared cache. The shared cache is a collection of common libraries/frameworks that
+are mapped into every process at the same address on Darwin systems, and can be
+identified by a load address and UUID.
 
-Or specify the working directory:
 ```
-QSetWorkingDir:<ascii-hex-path>
+LLDB SENDS: jGetSharedCacheInfo:{}
+STUB REPLIES: ${"shared_cache_base_address":140735683125248,"shared_cache_uuid":"DDB8D70C-C9A2-3561-B2C8-BE48A4F33F96","no_shared_cache":false,"shared_cache_private_cache":false]}#00
 ```
-This packet must be sent  _prior_ to sending a "A" packet.
 
-## QSetDisableASLR:\<bool\>
+**Priority To Implement:** Low
 
-### Brief
+When both lldb and the inferior process are running on the same computer, and lldb
+and the inferior process have the same shared cache, lldb may (as an optimization) read
+the shared cache out of its own memory instead of using gdb-remote read packets to read
+them from the inferior process.
 
-Enable or disable ASLR on the next "A" packet.
+## jModulesInfo:[{"file":"...",triple:"..."}, ...]
 
-### Priority To Implement
+Get information for a list of modules by given module path and
+architecture.
 
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process and if the target supports disabling ASLR
-(Address space layout randomization).
+The response is a JSON array of dictionaries containing the following keys:
+* `uuid`
+* `triple`
+* `file_path`
+* `file_offset`
+* `file_size`
 
-### Description
+The meaning of the fields is the same as in the `qModuleInfo` packet. The server
+signals the failure to retrieve the module info for a file by ommiting the
+corresponding array entry from the response. The server may also
+include entries the client did not ask for, if it has reason to
+the modules will be interesting to the client.
 
-Or control if ASLR is enabled/disabled:
-```
-send packet: QSetDisableASLR:1
-read packet: OK
+**Priority To Implement:** Optional. If not implemented, `qModuleInfo` packet
+will be used, which may be slower if the target contains a large number of modules
+and the communication link has a non-negligible latency.
 
-send packet: QSetDisableASLR:0
-read packet: OK
-```
-This packet must be sent  _prior_ to sending a "A" packet.
+## jLLDBTraceGetBinaryData
 
-## QListThreadsInStopReply
+Get binary data given a trace technology and a data identifier.
+The input is specified as a JSON object and the response has the same format
+as the "binary memory read" (aka "x") packet. In case of failures, an error
+message is returned.
 
-### Brief
+```
+send packet: jLLDBTraceGetBinaryData:{"type":<type>,"kind":<query>,"tid":<tid>,"offset":<offset>,"size":<size>}]
+read packet: <binary data>/E<error code>;AAAAAAAAA
+```
 
-Enable the `threads:` and `thread-pcs:` data in the question-mark packet
-("T packet") responses when the stub reports that a program has
-stopped executing.
+### Schema
 
-### Priority To Implement
+The schema for the input is:
+```
+{
+ "type": <string>,
+     Tracing technology name, e.g. intel-pt, arm-etm.
+ "kind": <string>,
+     Identifier for the data.
+ "cpuId": <Optional decimal>,
+     Core id in decimal if the data belongs to a CPU core.
+ "tid"?: <Optional decimal>,
+     Tid in decimal if the data belongs to a thread.
+}
+```
 
-Performance.  This is a performance benefit to lldb if the thread id's
-and thread pc values are provided to lldb in the T stop packet -- if
-they are not provided to lldb, lldb will likely need to send one to
-two packets per thread to fetch the data at every private stop.
+## jLLDBTraceGetState
 
-### Example
+Get the current state of the process and its threads being traced by
+a given trace technology. The response is a JSON object with custom
+information depending on the trace technology. In case of errors, an
+error message is returned.
 
 ```
-send packet: QListThreadsInStopReply
-read packet: OK
+send packet: jLLDBTraceGetState:{"type":<type>}]
+read packet: {...object}/E<error code>;AAAAAAAAA
 ```
 
-## jLLDBTraceSupported
-
-### Brief
+### Input Schema
 
-Get the processor tracing type supported by the gdb-server for the current
-inferior. Responses might be different depending on the architecture and
-capabilities of the underlying OS.
+```
+{
+   "type": <string>
+      Tracing technology name, e.g. intel-pt, arm-etm.
+}
+```
 
 ### Output Schema
 
 ```
- {
-   "name": <string>,
-       Tracing technology name, e.g. intel-pt, arm-etm.
-   "description": <string>,
-       Description for this technology.
- }
+{
+  "tracedThreads": [{
+    "tid": <decimal integer>,
+    "binaryData": [
+      {
+        "kind": <string>,
+            Identifier for some binary data related to this thread to
+            fetch with the jLLDBTraceGetBinaryData packet.
+        "size": <decimal integer>,
+            Size in bytes of this thread data.
+      },
+    ]
+  }],
+  "processBinaryData": [
+    {
+      "kind": <string>,
+          Identifier for some binary data related to this process to
+          fetch with the jLLDBTraceGetBinaryData packet.
+      "size": <decimal integer>,
+          Size in bytes of this thread data.
+    },
+  ],
+  "cpus"?: [
+    "id": <decimal integer>,
+        Identifier for this CPU logical core.
+    "binaryData": [
+      {
+        "kind": <string>,
+            Identifier for some binary data related to this thread to
+            fetch with the jLLDBTraceGetBinaryData packet.
+        "size": <decimal integer>,
+            Size in bytes of this cpu core data.
+      },
+    ]
+  ],
+  "warnings"?: [<string>],
+      Non-fatal messages useful for troubleshooting.
+
+  ... other attributes specific to the given tracing technology
+}
 ```
 
-If no tracing technology is supported for the inferior, or no process is
-running, then an error message is returned.
+**Note:** `tracedThreads` includes all threads traced by both "process tracing"
+and "thread tracing".
 
-### Note
+### Intel Pt
 
-This packet is used by Trace plug-ins (see `lldb_private::Trace.h`) to
-do live tracing. Specifically, the name of the plug-in should match the name
-of the tracing technology returned by this packet.
+If per-cpu process tracing is enabled, "tracedThreads" will contain all
+the threads of the process without any trace buffers. Besides that, the
+"cpus" field will also be returned with per cpu core trace buffers.
+A side effect of per-cpu tracing is that all the threads of unrelated
+processes will also be traced, thus polluting the tracing data.
 
-### Example
+Binary data kinds:
+  - iptTrace: trace buffer for a thread or a cpu.
+  - perfContextSwitchTrace: context switch trace for a cpu generated by
+                            perf_event_open.
+  - procfsCpuInfo: contents of the /proc/cpuinfo file.
 
-```
-send packet: jLLDBTraceSupported
-read packet: {"name":<name>, "description":<description>}/E<error code>;AAAAAAAAA
-```
+Additional attributes:
+  * tscPerfZeroConversion
+    * This field allows converting Intel processor's TSC values to nanoseconds.
+      It is available through the Linux perf_event API when cap_user_time and cap_user_time_zero
+      are set.
+      See the documentation of time_zero in
+      https://man7.org/linux/man-pages/man2/perf_event_open.2.html for more information about
+      the calculation and the meaning of the values in the schema below.
 
-## jLLDBTraceStart
+      Schema for this field:
+      ```
+      "tscPerfZeroConversion": {
+        "timeMult": <decimal integer>,
+        "timeShift": <decimal integer>,
+        "timeZero": <decimal integer>,
+      }
+      ```
 
-### Brief
+## jLLDBTraceStart
 
 Start tracing a process or its threads using a provided tracing technology.
 The input and output are specified as JSON objects. In case of success, an OK
@@ -358,10 +404,20 @@ response is returned, or an error otherwise.
 This traces existing and future threads of the current process. An error is
 returned if the process is already being traced.
 
+```
+send packet: jLLDBTraceStart:{"type":<type>,...other params}]
+read packet: OK/E<error code>;AAAAAAAAA
+```
+
 ### Thread Tracing
 
 This traces specific threads.
 
+```
+send packet: jLLDBTraceStart:{"type":<type>,"tids":<tids>,...other params}]
+read packet: OK/E<error code>;AAAAAAAAA
+```
+
 ### Input Schema
 
 ```
@@ -377,8 +433,7 @@ This traces specific threads.
 }
 ```
 
-### Notes
-
+**Notes:**
 - If "tids" is not provided, then the operation is "process tracing",
   otherwise it's "thread tracing".
 - Each tracing technology can have different levels of support for "thread
@@ -485,24 +540,8 @@ Notes:
  - If "thread tracing" is attempted on a thread already being traced with
    either "thread tracing" or "process tracing", it fails.
 
-### Examples
-
-Process tracing:
-```
-send packet: jLLDBTraceStart:{"type":<type>,...other params}]
-read packet: OK/E<error code>;AAAAAAAAA
-```
-
-Thread tracing:
-```
-send packet: jLLDBTraceStart:{"type":<type>,"tids":<tids>,...other params}]
-read packet: OK/E<error code>;AAAAAAAAA
-```
-
 ## jLLDBTraceStop
 
-### Brief
-
 Stop tracing a process or its threads using a provided tracing technology.
 The input and output are specified as JSON objects. In case of success, an OK
 response is returned, or an error otherwise.
@@ -512,11 +551,21 @@ response is returned, or an error otherwise.
 Stopping a process trace stops the active traces initiated with
 "thread tracing".
 
+```
+send packet: jLLDBTraceStop:{"type":<type>}]
+read packet: OK/E<error code>;AAAAAAAAA
+```
+
 ### Thread Trace Stopping
 
 This is a best effort request, which tries to stop as many traces as
 possible.
 
+```
+send packet: jLLDBTraceStop:{"type":<type>,"tids":<tids>}]
+read packet: OK/E<error code>;AAAAAAAAA
+```
+
 ### Input Schema
 
 The schema for the input is
@@ -531,190 +580,1002 @@ The schema for the input is
 }
 ```
 
-### Notes
-
-- If "tids" is not provided, then the operation is "process trace stopping".
+**Note:** If `tids` is not provided, then the operation is "process trace stopping".
 
 ### Intel Pt
 
 Stopping a specific thread trace started with "process tracing" is allowed.
 
-### Examples
+## jLLDBTraceSupported
+
+Get the processor tracing type supported by the gdb-server for the current
+inferior. Responses might be different depending on the architecture and
+capabilities of the underlying OS.
 
-Process trace stopping:
 ```
-send packet: jLLDBTraceStop:{"type":<type>}]
-read packet: OK/E<error code>;AAAAAAAAA
+send packet: jLLDBTraceSupported
+read packet: {"name":<name>, "description":<description>}/E<error code>;AAAAAAAAA
 ```
-Thread trace stopping:
+
+### Output Schema
+
 ```
-send packet: jLLDBTraceStop:{"type":<type>,"tids":<tids>}]
-read packet: OK/E<error code>;AAAAAAAAA
+ {
+   "name": <string>,
+       Tracing technology name, e.g. intel-pt, arm-etm.
+   "description": <string>,
+       Description for this technology.
+ }
 ```
 
-## jLLDBTraceGetState
+If no tracing technology is supported for the inferior, or no process is
+running, then an error message is returned.
 
-### Brief
+**Note:** This packet is used by Trace plug-ins (see `lldb_private::Trace.h`) to
+do live tracing. Specifically, the name of the plug-in should match the name
+of the tracing technology returned by this packet.
 
-Get the current state of the process and its threads being traced by
-a given trace technology. The response is a JSON object with custom
-information depending on the trace technology. In case of errors, an
-error message is returned.
+## jThreadExtendedInfo
 
-### Input Schema
+This packet, which takes its arguments as JSON and sends its reply as
+JSON, allows the gdb remote stub to provide additional information
+about a given thread.
 
+This packet takes its arguments in [JSON](http://www.json.org).
+At a minimum, a thread must be specified, for example:
 ```
-{
-   "type": <string>
-      Tracing technology name, e.g. intel-pt, arm-etm.
-}
+jThreadExtendedInfo:{"thread":612910}
 ```
 
-### Output Schema
+Because this is a JSON string, the thread number is provided in base 10.
+Additional key-value pairs may be provided by lldb to the gdb remote
+stub.  For instance, on some versions of macOS, lldb can read offset
+information out of the system libraries.  Using those offsets, debugserver
+is able to find the Thread Specific Address (TSD) for a thread and include
+that in the return information.  So lldb will send these additional fields
+like so:
+```
+jThreadExtendedInfo:{"plo_pthread_tsd_base_address_offset":0,"plo_pthread_tsd_base_offset":224,"plo_pthread_tsd_entry_size":8,"thread":612910}
+```
 
+There are no requirements for what is included in the response.  A simple
+reply on a OS X Yosemite / iOS 8 may include the pthread_t value, the
+Thread Specific Data (TSD) address, the dispatch_queue_t value if the thread
+is associated with a GCD queue, and the requested Quality of Service (QoS)
+information about that thread.  For instance, a reply may look like:
 ```
-{
-  "tracedThreads": [{
-    "tid": <decimal integer>,
-    "binaryData": [
-      {
-        "kind": <string>,
-            Identifier for some binary data related to this thread to
-            fetch with the jLLDBTraceGetBinaryData packet.
-        "size": <decimal integer>,
-            Size in bytes of this thread data.
-      },
-    ]
-  }],
-  "processBinaryData": [
-    {
-      "kind": <string>,
-          Identifier for some binary data related to this process to
-          fetch with the jLLDBTraceGetBinaryData packet.
-      "size": <decimal integer>,
-          Size in bytes of this thread data.
-    },
-  ],
-  "cpus"?: [
-    "id": <decimal integer>,
-        Identifier for this CPU logical core.
-    "binaryData": [
-      {
-        "kind": <string>,
-            Identifier for some binary data related to this thread to
-            fetch with the jLLDBTraceGetBinaryData packet.
-        "size": <decimal integer>,
-            Size in bytes of this cpu core data.
-      },
+{"tsd_address":4371349728,"requested_qos":{"enum_value":33,"constant_name":"QOS_CLASS_USER_INTERACTIVE","printable_name":"User Interactive"},"pthread_t":4371349504,"dispatch_queue_t":140735087127872}
+```
+
+`tsd_address`, `pthread_t`, and `dispatch_queue_t` are all simple key-value pairs.
+The JSON standard requires that numbers be expressed in base 10 - so all of
+these are. `requested_qos` is a dictionary with three key-value pairs in it -
+so the UI layer may choose the form most appropriate for displaying to the user.
+
+Sending JSON over gdb-remote protocol introduces some problems.  We may be
+sending strings with arbitrary contents in them, including the `#`, `$`, and `*`
+characters that have special meaning in gdb-remote protocol and cannot occur
+in the middle of the string. The standard solution for this would be to require
+ascii-hex encoding of all strings, or ascii-hex encode the entire JSON payload.
+
+Instead, the binary escaping convention is used for JSON data.  This convention
+(e.g. used for the `X` packet) says that if `#`, `$`, `*`, or `}` are to occur in
+the payload, the character `}` (`0x7d`) is emitted, then the metacharacter is emitted
+xor'ed by `0x20`. The `}` character occurs in every JSON payload at least once, and
+`} ^ 0x20` happens to be `]` so the raw packet characters for a request will look
+like:
+```
+jThreadExtendedInfo:{"thread":612910}]
+```
+
+**Priority To Implement:** Low. This packet is only needed if the gdb remote stub
+wants to provide interesting additional information about a thread for the user.
+
+## jThreadsInfo
+
+Ask for the server for thread stop information of all threads.
+
+The data in this packet is very similar to the stop reply packets, but is packaged in
+JSON and uses JSON arrays where applicable. The JSON output looks like:
+```
+    [
+      { "tid":1580681,
+        "metype":6,
+        "medata":[2,0],
+        "reason":"exception",
+        "qaddr":140735118423168,
+        "registers": {
+          "0":"8000000000000000",
+          "1":"0000000000000000",
+          "2":"20fabf5fff7f0000",
+          "3":"e8f8bf5fff7f0000",
+          "4":"0100000000000000",
+          "5":"d8f8bf5fff7f0000",
+          "6":"b0f8bf5fff7f0000",
+          "7":"20f4bf5fff7f0000",
+          "8":"8000000000000000",
+          "9":"61a8db78a61500db",
+          "10":"3200000000000000",
+          "11":"4602000000000000",
+          "12":"0000000000000000",
+          "13":"0000000000000000",
+          "14":"0000000000000000",
+          "15":"0000000000000000",
+          "16":"960b000001000000",
+          "17":"0202000000000000",
+          "18":"2b00000000000000",
+          "19":"0000000000000000",
+          "20":"0000000000000000"
+        },
+        "memory":[
+          {"address":140734799804592,"bytes":"c8f8bf5fff7f0000c9a59e8cff7f0000"},
+          {"address":140734799804616,"bytes":"00000000000000000100000000000000"}
+        ]
+      }
     ]
-  ],
-  "warnings"?: [<string>],
-      Non-fatal messages useful for troubleshooting.
+```
 
-  ... other attributes specific to the given tracing technology
-}
+It contains an array of dictionaries with all of the key value pairs that are
+normally in the stop reply packet, including the expedited registers. The registers are
+passed as hex-encoded JSON string in debuggee-endian byte order. Note that the register
+numbers are decimal numbers, unlike the stop-reply packet, where they are written in
+hex. The packet also contains expedited memory in the `memory` key.  This allows the
+server to expedite memory that the client is likely to use (e.g., areas around the
+stack pointer, which are needed for computing backtraces) and it reduces the packet
+count.
+
+On macOS with debugserver, we expedite the frame pointer backchain for a thread
+(up to 256 entries) by reading 2 pointers worth of bytes at the frame pointer (for
+the previous FP and PC), and follow the backchain. Most backtraces on macOS and
+iOS now don't require us to read any memory!
+
+**Priority To Implement:** Low
+
+This is a performance optimization, which speeds up debugging by avoiding
+multiple round-trips for retrieving thread information. The information from this
+packet can be retrieved using a combination of `qThreadStopInfo` and `m` packets.
+
+## QEnvironment:NAME=VALUE
+
+Setup the environment up for a new child process that will soon be
+launched using the "A" packet.
+
+NB: key/value pairs are sent as-is so gdb-remote protocol meta characters
+(e.g. `#` or `$`) are not acceptable.  If any non-printable or
+metacharacters are present in the strings, `QEnvironmentHexEncoded`
+should be used instead if it is available.  If you don't want to
+scan the environment strings before sending, prefer
+the `QEnvironmentHexEncoded` packet over `QEnvironment`, if it is
+available.
+
+Both GDB and LLDB support passing down environment variables. Is it ok to
+respond with a `$#00` (unimplemented):
 ```
+send packet: $QEnvironment:ACK_COLOR_FILENAME=bold yellow#00
+read packet: $OK#00
+```
+This packet can be sent one or more times _prior_ to sending a "A" packet.
 
-### Notes
+**Priority To Implement:** Low. Only needed if the remote target wants to launch
+a target after making a connection to a GDB server that isn't already connected to
+an inferior process.
 
- - "traceThreads" includes all thread traced by both "process tracing" and
-   "thread tracing".
+## QEnvironmentHexEncoded:HEX-ENCODING(NAME=VALUE)
 
-### Intel Pt
+Setup the environment up for a new child process that will soon be
+launched using the "A" packet.
 
-If per-cpu process tracing is enabled, "tracedThreads" will contain all
-the threads of the process without any trace buffers. Besides that, the
-"cpus" field will also be returned with per cpu core trace buffers.
-A side effect of per-cpu tracing is that all the threads of unrelated
-processes will also be traced, thus polluting the tracing data.
+The only difference between this packet and `QEnvironment` is that the
+environment key-value pair is ascii hex encoded for transmission.
+This allows values with gdb-remote metacharacters like `#` to be sent.
 
-Binary data kinds:
-  - iptTrace: trace buffer for a thread or a cpu.
-  - perfContextSwitchTrace: context switch trace for a cpu generated by
-                            perf_event_open.
-  - procfsCpuInfo: contents of the /proc/cpuinfo file.
+Both GDB and LLDB support passing down environment variables. Is it ok to
+respond with a `$#00` (unimplemented):
+```
+send packet: $QEnvironment:41434b5f434f4c4f525f46494c454e414d453d626f6c642379656c6c6f77#00
+read packet: $OK#00
+```
+This packet can be sent one or more times _prior_ to sending a "A" packet.
 
-Additional attributes:
-  * tscPerfZeroConversion
-    * This field allows converting Intel processor's TSC values to nanoseconds.
-      It is available through the Linux perf_event API when cap_user_time and cap_user_time_zero
-      are set.
-      See the documentation of time_zero in
-      https://man7.org/linux/man-pages/man2/perf_event_open.2.html for more information about
-      the calculation and the meaning of the values in the schema below.
+**Priority To Implement:** Low. Only needed if the remote target wants to launch
+a target after making a connection to a GDB server that isn't already connected to
+an inferior process.
 
-      Schema for this field:
-      ```
-      "tscPerfZeroConversion": {
-        "timeMult": <decimal integer>,
-        "timeShift": <decimal integer>,
-        "timeZero": <decimal integer>,
-      }
-      ```
+## QEnableCompression
 
-### Example
+This packet enables compression of the packets that the debug stub sends to lldb.
+If the debug stub can support compression, it indictes this in the reply of the
+"qSupported" packet. For example:
+```
+LLDB SENDS:    qSupported:xmlRegisters=i386,arm,mips
+STUB REPLIES:  qXfer:features:read+;SupportedCompressions=lzfse,zlib-deflate,lz4,lzma;
+```
 
+If lldb knows how to use any of these compression algorithms, it can ask that this
+compression mode be enabled.
 ```
-send packet: jLLDBTraceGetState:{"type":<type>}]
-read packet: {...object}/E<error code>;AAAAAAAAA
+QEnableCompression:type:zlib-deflate;
 ```
 
-## jLLDBTraceGetBinaryData
+The debug stub should reply with an uncompressed `OK` packet to indicate that the
+request was accepted.  All further packets the stub sends will use this compression.
 
-### Brief
+Packets are compressed as the last step before they are sent from the stub, and
+decompressed as the first step after they are received.  The packet format in compressed
+mode becomes one of two:
+```
+$N<uncompressed payload>#00
 
-Get binary data given a trace technology and a data identifier.
-The input is specified as a JSON object and the response has the same format
-as the "binary memory read" (aka "x") packet. In case of failures, an error
-message is returned.
+$C<size of uncompressed payload in base 10>:<compressed payload>#00
+```
 
-### Schema
+Where `#00` is the actual checksum value if noack mode is not enabled. The checksum
+value is for the `N<uncompressed payload>` or
+`C<size of uncompressed payload in base 10>:<compressed payload>` bytes in the packet.
+
+The size of the uncompressed payload in base 10 is provided because it will simplify
+decompression if the final buffer size needed is known ahead of time.
+
+Compression on low-latency connections is unlikely to be an improvement. Particularly
+when the debug stub and lldb are running on the same host. It should only be used
+for slow connections, and likely only for larger packets.
+
+Example compression algorithms that may be used include:
+* `zlib-deflate` -
+  The raw DEFLATE format as described in IETF RFC 1951.  With the ZLIB library, you
+  can compress to this format with an initialization like
+      deflateInit2 (&stream, 5, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY)
+  and you can decompress with an initialization like
+      inflateInit2 (&stream, -15).
+* `lz4` -
+  https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)
+  https://github.com/Cyan4973/lz4
+  The libcompression APIs on darwin systems call this `COMPRESSION_LZ4_RAW`.
+* `lzfse` -
+  Compression algorithm added in macOS 10.11, with open source C reference
+  implementation on github.
+  https://en.wikipedia.org/wiki/LZFSE
+  https://github.com/lzfse/lzfse
+* `lzma` -
+  libcompression implements "LZMA level 6", the default compression for the
+  open source LZMA implementation.
+
+
+## QEnableErrorStrings
+
+This packet enables reporting of Error strings in remote packet
+replies from the server to client. If the server supports this
+feature, it should send an OK response.
 
-The schema for the input is:
 ```
-{
- "type": <string>,
-     Tracing technology name, e.g. intel-pt, arm-etm.
- "kind": <string>,
-     Identifier for the data.
- "cpuId": <Optional decimal>,
-     Core id in decimal if the data belongs to a CPU core.
- "tid"?: <Optional decimal>,
-     Tid in decimal if the data belongs to a thread.
-}
+send packet: $QEnableErrorStrings
+read packet: $OK#00
 ```
 
-### Example
+The client can expect the following error replies if this feature is enabled in
+the server:
+```
+EXX;AAAAAAAAA
+```
+where `AAAAAAAAA` will be a hex encoded ASCII string.
+`XX`` is hex encoded byte number.
+
+It must be noted that even if the client has enabled reporting
+strings in error replies, it must not expect error strings to all
+error replies.
+
+**Priority To Implement:** Low. Only needed if the remote target wants to
+provide strings that are human readable along with an error code.
+
+## QListThreadsInStopReply
+
+Enable the `threads:` and `thread-pcs:` data in the question-mark packet
+("T packet") responses when the stub reports that a program has
+stopped executing.
 
 ```
-send packet: jLLDBTraceGetBinaryData:{"type":<type>,"kind":<query>,"tid":<tid>,"offset":<offset>,"size":<size>}]
-read packet: <binary data>/E<error code>;AAAAAAAAA
+send packet: QListThreadsInStopReply
+read packet: OK
 ```
 
-## qRegisterInfo\<hex-reg-id\>
+**Priority To Implement:** Performance.  This is a performance benefit to lldb
+if the thread id's and thread pc values are provided to lldb in the T stop packet
+-- if they are not provided to lldb, lldb will likely need to send one to
+two packets per thread to fetch the data at every private stop.
 
-### Brief
+## QRestoreRegisterState:\<save_id\> / QRestoreRegisterState:\<save_id\>;thread:XXXX;
 
-Discover register information from the remote GDB server.
+The `QRestoreRegisterState` packet tells the remote debugserver to
+restore all registers using the `save_id` which is an unsigned
+integer that was returned from a previous call to
+`QSaveRegisterState`. The restoration process can only be done once
+as the data backing the register state will be freed upon the
+completion of the `QRestoreRegisterState` command.
 
-### Priority To Implement
+If thread suffixes are enabled the second form of this packet is
+used, otherwise the first form is used.
 
-High. Any target that can self describe its registers, should do so.
-This means if new registers are ever added to a remote target, they
-will get picked up automatically, and allows registers to change
-depending on the actual CPU type that is used.
+The response is either:
+* `OK` - if all registers were successfully restored
+* `EXX` - for any errors
 
-NB: `qRegisterInfo` is deprecated in favor of the standard gdb remote
-serial protocol register description method,
-`qXfer:features:read:target.xml`.
-If `qXfer:features:read:target.xml` is supported, `qRegisterInfo` does
-not need to be implemented.  The target.xml format is used by most
-gdb RSP stubs whereas `qRegisterInfo` was an lldb-only design.
-`qRegisterInfo` requires one packet per register and can have undesirable
-performance costs at the start of a debug session, whereas target.xml
-may be able to describe all registers in a single packet.
+**Priority To Implement:** Low, this is mostly a convenience packet to avoid
+having to send all registers with a `g` packet. It should only be implemented
+if support for the `QSaveRegisterState` is added.
+
+## QSaveRegisterState / QSaveRegisterState;thread:XXXX;
+
+The `QSaveRegisterState` packet tells the remote debugserver to save
+all registers and return a non-zero unique integer ID that
+represents these save registers. If thread suffixes are enabled the
+second form of this packet is used, otherwise the first form is
+used. This packet is called prior to executing an expression, so
+the remote GDB server should do anything it needs to in order to
+ensure the registers that are saved are correct. On macOS this
+involves calling `thread_abort_safely(mach_port_t thread)` to
+ensure we get the correct registers for a thread in case it is
+currently having code run on its behalf in the kernel.
+
+The response is either:
+* `<unsigned int>` - The save_id result is a non-zero unsigned integer value
+                 that can be passed back to the GDB server using a
+                 `QRestoreRegisterState` packet to restore the registers
+                 one time.
+* `EXX` - or an error code in the form of `EXX` where `XX` is a
+          hex error code.
+
+**Priority To Implement:** Low, this is mostly a convenience packet to avoid
+having to send all registers with a `g` packet. It should only be implemented if
+support for the `QRestoreRegisterState` is added.
+
+## QSetDetachOnError
+
+Sets what the server should do when the communication channel with LLDB
+goes down. Either kill the inferior process (`0`) or remove breakpoints and
+detach (`1`).
+
+The data in this packet is a single a character, which should be `0` if the
+inferior process should be killed, or `1` if the server should remove all
+breakpoints and detach from the inferior.
+
+**Priority To Implement:** Low. Only required if the target wants to keep the
+inferior process alive when the communication channel goes down.
+
+## QSetDisableASLR:\<bool\>
+
+Enable or disable ASLR on the next "A" packet.
+
+Or control if ASLR is enabled/disabled:
+```
+send packet: QSetDisableASLR:1
+read packet: OK
+
+send packet: QSetDisableASLR:0
+read packet: OK
+```
+This packet must be sent  _prior_ to sending a "A" packet.
+
+**Priority To Implement:** Low. Only needed if the remote target wants to launch
+a target after making a connection to a GDB server that isn't already connected to
+an inferior process and if the target supports disabling ASLR
+(Address space layout randomization).
+
+## QSetSTDIN:\<ascii-hex-path\> / QSetSTDOUT:\<ascii-hex-path\> / QSetSTDERR:\<ascii-hex-path\>
+
+Setup where STDIN, STDOUT, and STDERR go prior to sending an "A"
+packet.
+
+When launching a program through the GDB remote protocol with the "A" packet,
+you might also want to specify where stdin/out/err go:
+```
+QSetSTDIN:<ascii-hex-path>
+QSetSTDOUT:<ascii-hex-path>
+QSetSTDERR:<ascii-hex-path>
+```
+These packets must be sent  _prior_ to sending a "A" packet.
+
+**Priority To Implement:** Low. Only needed if the remote target wants to launch
+a target after making a connection to a GDB server that isn't already connected to
+an inferior process.
+
+## QSetWorkingDir:\<ascii-hex-path\>
+
+Set the working directory prior to sending an "A" packet.
+
+Or specify the working directory:
+```
+QSetWorkingDir:<ascii-hex-path>
+```
+This packet must be sent  _prior_ to sending a "A" packet.
+
+**Priority To Implement:** Low. Only needed if the remote target wants to launch
+a target after making a connection to a GDB server that isn't already connected to
+an inferior process.
+
+## QStartNoAckMode
+
+Try to enable no ACK mode to skip sending ACKs and NACKs.
+
+Having to send an ACK/NACK after every packet slows things down a bit, so we
+have a way to disable ACK packets to minimize the traffic for reliable
+communication interfaces (like sockets). Below GDB or LLDB will send this
+packet to try and disable ACKs. All lines that start with "send packet: " are
+from GDB/LLDB, and all lines that start with "read packet: " are from the GDB
+remote server:
+```
+send packet: $QStartNoAckMode#b0
+read packet: +
+read packet: $OK#9a
+send packet: +
+```
+
+**Priority To Implement:** High. Any GDB remote server that can implement this
+should if the connection is reliable. This improves packet throughput and increases
+the performance of the connection.
+
+## QSupported
+
+Query the GDB remote server for features it supports
+
+QSupported is a standard GDB Remote Serial Protocol packet, but
+there are several additions to the response that lldb can parse.
+They are not all listed here.
+
+An example exchange:
+```
+send packet: qSupported:xmlRegisters=i386,arm,mips,arc;multiprocess+;fork-events+;vfork-events+
+
+read packet: qXfer:features:read+;PacketSize=20000;qEcho+;native-signals+;SupportedCompressions=lzfse,zlib-deflate,lz4,lzma;SupportedWatchpointTypes=aarch64-mask,aarch64-bas;
+```
+
+In the example above, three lldb extensions are shown:
+
+  * `PacketSize=20000`
+    * The base 16 maximum packet size that the stub can handle.
+  * `SupportedCompressions=<item,item,...>`
+    * A list of compression types that the stub can use to compress packets
+    when the QEnableCompression packet is used to request one of them.
+  * `SupportedWatchpointTypes=<item,item,...>`
+    * A list of watchpoint types that this stub can manage. Currently defined 
+      names are:
+        * `x86_64` - 64-bit x86-64 watchpoints (1, 2, 4, 8 byte watchpoints
+          aligned to those amounts)
+        * `aarch64-bas`  AArch64 Byte Address Select watchpoints
+                     (any number of contiguous bytes within a doubleword)
+        * `aarch64-mask` AArch64 MASK watchpoints
+                     (any power-of-2 region of memory from 8 to 2GB, aligned)
+
+      If nothing is specified, lldb will default to sending power-of-2
+      watchpoints, up to a pointer size, `sizeof(void*)`, a reasonable
+      baseline assumption.
+
+**Priority To Implement:** Optional
+
+## QThreadSuffixSupported
+
+Try to enable thread suffix support for the `g`, `G`, `p`, and `P` packets.
+
+When reading thread registers, you currently need to set the current
+thread, then read the registers. This is kind of cumbersome, so we added the
+ability to query if the remote GDB server supports adding a `thread:<tid>;`
+suffix to all packets that request information for a thread. To test if the
+remote GDB server supports this feature:
+```
+send packet: $QThreadSuffixSupported#00
+read packet: OK
+```
+
+If `OK` is returned, then the `g`, `G`, `p` and `P` packets can accept a
+thread suffix. So to send a `g` packet (read all register values):
+```
+send packet: $g;thread:<tid>;#00
+read packet: ....
+
+send packet: $G;thread:<tid>;#00
+read packet: ....
+
+send packet: $p1a;thread:<tid>;#00
+read packet: ....
+
+send packet: $P1a=1234abcd;thread:<tid>;#00
+read packet: ....
+```
+
+otherwise, without this you would need to always send two packets:
+```
+send packet: $Hg<tid>#00
+read packet: ....
+send packet: $g#00
+read packet: ....
+```
+
+We also added support for allocating and deallocating memory. We use this to
+allocate memory so we can run JITed code.
+
+**Priority To Implement:** High
+
+Adding a thread suffix allows us to read and write registers
+more efficiently and stops us from having to select a thread with
+one packet and then read registers with a second packet. It also
+makes sure that no errors can occur where the debugger thinks it
+already has a thread selected (see the `Hg` packet from the standard
+GDB remote protocol documentation) yet the remote GDB server actually
+has another thread selected.
+
+## qAttachOrWaitSupported
+
+This is a binary "is it supported" query. Return OK if you support
+`vAttachOrWait`.
+
+**Priority To Implement:** Low. This is required if you support `vAttachOrWait`,
+otherwise no support is needed since the standard "I don't recognize this packet"
+response will do the right thing.
+
+## qFileLoadAddress:\<file_path\>
+
+Get the load address of a memory mapped file.
+The load address is defined as the address of the first memory
+region what contains data mapped from the specified file.
+
+The response is either:
+* `<unsigned-hex64>` - Load address of the file in big endian encoding
+* `E01` - the requested file isn't loaded
+* `EXX` - for any other errors
+
+**Priority To Implement:** Low, required if dynamic linker don't fill in the load
+address of some object file in the rendezvous data structure.
+
+## qfProcessInfo / qsProcessInfo (Platform Extension)
+
+Get the first process info (`qfProcessInfo`) or subsequent process
+info (`qsProcessInfo`) for one or more processes on the remote
+platform. The first call gets the first match and subsequent calls
+to `qsProcessInfo` gets the subsequent matches. Return an error `EXX`,
+where `XX` are two hex digits, when no more matches are available.
+
+The `qfProcessInfo` packet can be followed by a `:` and
+some key value pairs. The key value pairs in the command are:
+* `name` - `ascii-hex` -
+  An ASCII hex string that contains the name of the process that will be matched.
+* `name_match` - `enum` -
+  One of:
+    * `equals`
+    * `starts_with`
+    * `ends_with`
+    * `contains`
+    * `regex`
+* `pid` - `integer`- A string value containing the decimal process ID
+* `parent_pid` - `integer` - A string value containing the decimal parent process ID
+* `uid` - `integer` - A string value containing the decimal user ID
+* `gid` - `integer` - A string value containing the decimal group ID
+* `euid` - `integer` - A string value containing the decimal effective user ID
+* `egid` - `integer` - A string value containing the decimal effective group ID
+* `all_users` - `bool` -
+  A boolean value that specifies if processes should
+  be listed for all users, not just the user that the
+  platform is running as
+* `triple` - `string` -
+  An ASCII triple string (`x86_64`, `x86_64-apple-macosx`, `armv7-apple-ios`)
+* `args` - `string` -
+  A string value containing the process arguments separated by the character `-`,
+  where each argument is hex-encoded. It includes `argv[0]`.
+
+The response consists of key/value pairs where the key is separated from the
+values with colons and each pair is terminated with a semi colon. For a list
+of the key/value pairs in the response see the `qProcessInfoPID` packet
+documentation.
+
+Sample packet/response:
+```
+send packet: $qfProcessInfo#00
+read packet: $pid:60001;ppid:59948;uid:7746;gid:11;euid:7746;egid:11;name:6c6c6462;triple:x86_64-apple-macosx;#00
+send packet: $qsProcessInfo#00
+read packet: $pid:59992;ppid:192;uid:7746;gid:11;euid:7746;egid:11;name:6d64776f726b6572;triple:x86_64-apple-macosx;#00
+send packet: $qsProcessInfo#00
+read packet: $E04#00
+```
+
+**Priority To Implement:** Required
+
+
+## qGDBServerVersion
+
+Get version information about this implementation of the gdb-remote
+protocol.
+
+The goal of this packet is to provide enough information about an
+implementation of the gdb-remote-protocol server that lldb can
+work around implementation problems that are discovered after the
+version has been released/deployed.  The name and version number
+should be sufficiently unique that lldb can unambiguously identify
+the origin of the program (for instance, debugserver from lldb) and
+the version/submission number/patch level of the program - whatever
+is appropriate for your server implementation.
+
+The packet follows the key-value pair model, semicolon separated.
+```
+send packet: $qGDBServerVersion#00
+read packet: $name:debugserver;version:310.2;#00
+```
+
+Other clients may find other key-value pairs to be useful for identifying
+a gdb stub.  Patch level, release name, build number may all be keys that
+better describe your implementation's version.
+
+Suggested key names:
+* `name`: the name of your remote server - "debugserver" is the lldb standard
+          implementation
+* `version`: identifies the version number of this server
+* `patch_level`: the patch level of this server
+* `release_name`: the name of this release, if your project uses names
+* `build_number`: if you use a build system with increasing build numbers,
+                  this may be the right key name for your server
+* `major_version`: major version number
+* `minor_version`: minor version number
+
+**Priority To Implement:** High. This packet is usually very easy to implement
+and can help LLDB to work around bugs in a server's implementation when they
+are found.
+
+## qGetWorkingDir
+
+Get the current working directory of the platform stub in
+ASCII hex encoding.
+
+```
+receive: qGetWorkingDir
+send:    2f4170706c65496e7465726e616c2f6c6c64622f73657474696e67732f342f5465737453657474696e67732e746573745f646973617373656d626c65725f73657474696e6773
+```
+
+## qHostInfo
+
+Get information about the host we are remotely connected to.
+
+LLDB supports a host info call that gets all sorts of details of the system
+that is being debugged:
+```
+send packet: $qHostInfo#00
+read packet: $cputype:16777223;cpusubtype:3;ostype:darwin;vendor:apple;endian:little;ptrsize:8;#00
+```
+
+Key value pairs are one of:
+* `cputype`: is a number that is the mach-o CPU type that is being debugged (base 10)
+* `cpusubtype`: is a number that is the mach-o CPU subtype type that is being debugged (base 10)
+* `triple`: a string for the target triple (x86_64-apple-macosx) that can be used to specify arch + vendor + os in one entry
+* `vendor`: a string for the vendor (apple), not needed if "triple" is specified
+* `ostype`: a string for the OS being debugged (macosx, linux, freebsd, ios, watchos), not needed if "triple" is specified
+* `endian`: is one of "little", "big", or "pdp"
+* `ptrsize`: an unsigned number that represents how big pointers are in bytes on the debug target
+* `hostname`: the hostname of the host that is running the GDB server if available
+* `os_build`: a string for the OS build for the remote host as a string value
+* `os_kernel`: a string describing the kernel version
+* `os_version`: a version string that represents the current OS version (10.8.2)
+* `watchpoint_exceptions_received`: one of "before" or "after" to specify if a watchpoint is triggered before or after the pc when it stops
+* `default_packet_timeout`: an unsigned number that specifies the default timeout in seconds
+* `distribution_id`: optional. For linux, specifies distribution id (e.g. ubuntu, fedora, etc.)
+* `osmajor`: optional, specifies the major version number of the OS (e.g. for macOS 10.12.2, it would be 10)
+* `osminor`: optional, specifies the minor version number of the OS (e.g. for macOS 10.12.2, it would be 12)
+* `ospatch`: optional, specifies the patch level number of the OS (e.g. for macOS 10.12.2, it would be 2)
+* `vm-page-size`: optional, specifies the target system VM page size, base 10.
+  Needed for the "dirty-pages:" list in the qMemoryRegionInfo
+  packet, where a list of dirty pages is sent from the remote
+  stub.  This page size tells lldb how large each dirty page is.
+* `addressing_bits`: optional, specifies how many bits in addresses are
+	significant for addressing, base 10.  If bits 38..0
+	in a 64-bit pointer are significant for addressing,
+	then the value is 39.  This is needed on e.g. AArch64
+	v8.3 ABIs that use pointer authentication, so lldb
+	knows which bits to clear/set to get the actual
+	addresses.
+* `low_mem_addressing_bits`: optional, specifies how many bits in
+  addresses in low memory are significant for addressing, base 10.
+  AArch64 can have different page table setups for low and high
+  memory, and therefore a different number of bits used for addressing.
+* `high_mem_addressing_bits`: optional, specifies how many bits in
+  addresses in high memory are significant for addressing, base 10.
+  AArch64 can have different page table setups for low and high
+  memory, and therefore a different number of bits used for addressing.
+
+**Priority To Implement:** High. This packet is usually very easy to implement
+and can help LLDB select the correct plug-ins for the job based on the target
+triple information that is supplied.
+
+## qKillSpawnedProcess (Platform Extension)
+
+Kill a process running on the target system.
+
+```
+receive: qKillSpawnedProcess:1337
+send:    OK
+```
+The request packet has the process ID in base 10.
+
+## qLaunchGDBServer (Platform Extension)
+
+Have the remote platform launch a GDB server.
+
+The `qLaunchGDBServer` packet must be followed by a `:` and
+some key value pairs. The key value pairs in the command are:
+* `port` - `integer` -
+  A string value containing the decimal port ID or zero if the port should be
+  bound and returned
+* `host` - `integer` -
+  The host that connections should be limited to when the GDB server is connected to.
+
+Sample packet/response:
+```
+send packet: $qLaunchGDBServer:port:0;host:lldb.apple.com;#00
+read packet: $pid:60025;port:50776;#00
+```
+
+The `pid` key/value pair is only specified if the remote platform launched
+a separate process for the GDB remote server and can be omitted if no
+process was separately launched.
+
+The `port` key/value pair in the response lets clients know what port number
+to attach to in case zero was specified as the "port" in the sent command.
+
+**Priority To Implement:** Required
+
+
+## qLaunchSuccess
+
+Check whether launching a process with the `A` packet succeeded.
+
+Returns the status of the last attempt to launch a process.
+Either `OK` if no error ocurred, or `E` followed by a string
+describing the error.
+
+**Priority To Implement:** High, launching processes is a key part of LLDB's
+platform mode.
+
+## qMemoryRegionInfo:\<addr\>
+
+Get information about the address range that contains `<addr>`.
+
+We added a way to get information for a memory region. The packet is:
+```
+qMemoryRegionInfo:<addr>
+```
+
+Where `<addr>` is a big endian hex address. The response is returned in a series
+of tuples like the data returned in a stop reply packet. The currently valid
+tuples to return are:
+* `start:<start-addr>;` - `<start-addr>` is a big endian hex address that is
+                          the start address of the range that contains `<addr>`
+* `size:<size>;` - `<size>` is a big endian hex byte size of the address
+                   of the range that contains `<addr>`
+* `permissions:<permissions>;` - `<permissions>` is a string that contains one
+                                 or more of the characters from `rwx`
+* `name:<name>;` - `<name>` is a hex encoded string that contains the name of
+                   the memory region mapped at the given address. In case of
+                   regions backed by a file it have to be the absolute path of
+                   the file while for anonymous regions it have to be the name
+                   associated to the region if that is available.
+* `flags:<flags-string>;` - where `<flags-string>` is a space separated string
+                            of flag names. Currently the only supported flag
+                            is `mt` for AArch64 memory tagging. lldb will
+                            ignore any other flags in this field.
+* `type:[<type>][,<type>];` - memory types that apply to this region, e.g.
+                              `stack` for stack memory.
+* `error:<ascii-byte-error-string>;` - where `<ascii-byte-error-string>` is
+                                       a hex encoded string value that
+                                       contains an error string
+* `dirty-pages:[<hexaddr>][,<hexaddr];` -
+  A list of memory pages within this
+  region that are "dirty" -- they have been modified.
+  Page addresses are in base 16. The size of a page can
+  be found from the `qHostInfo`'s `page-size` key-value.
+  
+  If the stub supports identifying dirty pages within a
+  memory region, this key should always be present for all
+  `qMemoryRegionInfo` replies.  This key with no pages
+  listed (`dirty-pages:;`) indicates no dirty pages in
+  this memory region.  The *absence* of this key means
+  that this stub cannot determine dirty pages.
+
+If the address requested is not in a mapped region (e.g. we've jumped through
+a NULL pointer and are at 0x0) currently lldb expects to get back the size
+of the unmapped region -- that is, the distance to the next valid region.
+For instance, with a macOS process which has nothing mapped in the first
+4GB of its address space, if we're asking about address 0x2:
+```
+  qMemoryRegionInfo:2
+  start:2;size:fffffffe;
+```
+
+The lack of `permissions:` indicates that none of read/write/execute are valid
+for this region.
+
+**Priority To Implement:** Medium
+
+This is nice to have, but it isn't necessary. It helps LLDB
+do stack unwinding when we branch into memory that isn't executable.
+If we can detect that the code we are stopped in isn't executable,
+then we can recover registers for stack frames above the current
+frame. Otherwise we must assume we are in some JIT'ed code (not JIT
+code that LLDB has made) and assume that no registers are available
+in higher stack frames.
+
+## qModuleInfo:\<module_path\>;\<arch triple\>
+
+Get information for a module by given module path and architecture.
+
+The response is either:
+* `(uuid|md5):...;triple:...;file_offset:...;file_size...;`
+* `EXX` - for any errors
+
+**Priority To Implement:** Optional, required if dynamic loader cannot fetch
+module's information like UUID directly from inferior's memory.
+
+## qPathComplete (Platform Extension)
+
+Get a list of matched disk files/directories by passing a boolean flag
+and a partial path.
+
+```
+receive: qPathComplete:0,6d61696e
+send:    M6d61696e2e637070
+receive: qPathComplete:1,746573
+send:    M746573742f,74657374732f
+```
+
+If the first argument is zero, the result should contain all
+files (including directories) starting with the given path. If the
+argument is one, the result should contain only directories.
+
+The result should be a comma-separated list of hex-encoded paths.
+Paths denoting a directory should end with a directory separator (`/` or `\`.
 
-### Description
+
+## qPlatform_mkdir
+
+Creates a new directory on the connected remote machine.
+
+Request: `qPlatform_mkdir:<hex-file-mode>,<ascii-hex-path>`
+
+The request packet has the fields:
+   1. mode bits in base 16
+   2. file path in ascii-hex encoding
+
+Reply: 
+  * `F<mkdir-return-code>`
+    (mkdir called successfully and returned with the given return code)
+  * `Exx` (An error occurred)
+
+**Priority To Implement:** Low
+
+## qPlatform_shell
+
+Run a command in a shell on the connected remote machine.
+
+The request consists of the command to be executed encoded in ASCII characters
+converted into hex bytes.
+
+The response to this packet consists of the letter F followed by the return code,
+followed by the signal number (or 0 if no signal was delivered), and escaped bytes
+of captured program output.
+
+Below is an example communication from a client sending an "ls -la" command:
+```
+send packet: $qPlatform_shell:6c73202d6c61,00000002#ec
+read packet: $F,00000000,00000000,total 4736
+drwxrwxr-x 16 username groupname    4096 Aug 15 21:36 .
+drwxr-xr-x 17 username groupname    4096 Aug 10 16:39 ..
+-rw-rw-r--  1 username groupname   73875 Aug 12 16:46 notes.txt
+drwxrwxr-x  5 username groupname    4096 Aug 15 21:36 source.cpp
+-rw-r--r--  1 username groupname    2792 Aug 12 16:46 a.out
+-rw-r--r--  1 username groupname    3190 Aug 12 16:46 Makefile
+```
+
+**Priority To Implement:** High
+
+## qProcessInfo
+
+Get information about the process we are currently debugging.
+
+**Priority To Implement:** Medium
+
+On systems which can launch multiple different architecture processes,
+the qHostInfo may not disambiguate sufficiently to know what kind of
+process is being debugged.
+
+For example on a 64-bit x86 Mac system both 32-bit and 64-bit user processes are possible,
+and with Mach-O universal files, the executable file may contain both 32- and
+64-bit slices so it may be impossible to know until you're attached to a real
+process to know what you're working with.
+
+All numeric fields return base 16 numbers without any "0x" prefix.
+
+An i386 process:
+```
+send packet: $qProcessInfo#00
+read packet: $pid:42a8;parent-pid:42bf;real-uid:ecf;real-gid:b;effective-uid:ecf;effective-gid:b;cputype:7;cpusubtype:3;ostype:macosx;vendor:apple;endian:little;ptrsize:4;#00
+```
+
+An x86_64 process:
+```
+send packet: $qProcessInfo#00
+read packet: $pid:d22c;parent-pid:d34d;real-uid:ecf;real-gid:b;effective-uid:ecf;effective-gid:b;cputype:1000007;cpusubtype:3;ostype:macosx;vendor:apple;endian:little;ptrsize:8;#00
+```
+
+Key value pairs include:
+* `pid`: the process id
+* `parent-pid`: the process of the parent process (often debugserver will become the parent when attaching)
+* `real-uid`: the real user id of the process
+* `real-gid`: the real group id of the process
+* `effective-uid`: the effective user id of the process
+* `effective-gid`: the effective group id of the process
+* `cputype`: the Mach-O CPU type of the process  (base 16)
+* `cpusubtype`: the Mach-O CPU subtype of the process  (base 16)
+* `ostype`: is a string the represents the OS being debugged (darwin, linux, freebsd)
+* `vendor`: is a string that represents the vendor (apple)
+* `endian`: is one of "little", "big", or "pdp"
+* `ptrsize`: is a number that represents how big pointers are in bytes
+* `main-binary-uuid`: is the UUID of a firmware type binary that the gdb stub knows about
+* `main-binary-address`: is the load address of the firmware type binary
+* `main-binary-slide`: is the slide of the firmware type binary, if address isn't known
+* `binary-addresses`: A comma-separated list of binary load addresses base 16.
+                      lldb will parse the binaries in memory to get UUIDs, then
+                      try to find the binaries & debug info by UUID.  Intended for
+                      use with a small number of firmware type binaries where the
+                      search for binary/debug info may be expensive.
+
+## qProcessInfoPID:PID (Platform Extension)
+
+Have the remote platform get detailed information on a process by
+ID. PID is specified as a decimal integer.
+
+The response consists of key/value pairs where the key is separated from the
+values with colons and each pair is terminated with a semi colon.
+
+The key value pairs in the response are:
+* `pid` - `integer` - Process ID as a decimal integer string
+* `ppid` - `integer` - Parent process ID as a decimal integer string
+* `uid` - `integer` - A string value containing the decimal user ID
+* `gid` - `integer` - A string value containing the decimal group ID
+* `euid` - `integer` - A string value containing the decimal effective user ID
+* `egid` - `integer` - A string value containing the decimal effective group ID
+* `name` - `ascii-hex` - An ASCII hex string that contains the name of the process
+* `triple` - `string` - A target triple (`x86_64-apple-macosx`, `armv7-apple-ios`)
+
+Sample packet/response:
+```
+send packet: $qProcessInfoPID:60050#00
+read packet: $pid:60050;ppid:59948;uid:7746;gid:11;euid:7746;egid:11;name:6c6c6462;triple:x86_64-apple-macosx;#00
+```
+
+**Priority To Implement:** Optional
+
+## qQueryGDBServer
+
+Ask the platform for the list of gdbservers we have to connect
+
+If the remote platform automatically started one or more gdbserver instance (without
+lldb asking it) then it have to return the list of port number or socket name for
+each of them what can be used by lldb to connect to those instances.
+
+The data in this packet is a JSON array of JSON objects with the following keys:
+* `port`: `<the port number to connect>` (optional)
+* `socket_name`: `<the name of the socket to connect>` (optional)
+
+Example packet:
+```
+[
+    { "port": 1234 },
+    { "port": 5432 },
+    { "socket_name": "foo" }
+]
+```
+
+**Priority To Implement:** Low
+
+The packet is required to support connecting to gdbserver started
+by the platform instance automatically.
+
+## qRegisterInfo\<hex-reg-id\>
+
+Discover register information from the remote GDB server.
 
 With LLDB, for register information, remote GDB servers can add
 support for the "qRegisterInfoN" packet where "N" is a zero based
@@ -981,259 +1842,25 @@ The keys and values are detailed below:
   modifying the CPSR register can cause the r8 - r14 and cpsr value to
   change depending on if the mode has changed.
 
+**Priority To Implement:** High. Any target that can self describe its registers,
+should do so. This means if new registers are ever added to a remote target, they
+will get picked up automatically, and allows registers to change
+depending on the actual CPU type that is used.
 
-## qPlatform_shell
-
-### Brief
-
-Run a command in a shell on the connected remote machine.
-
-### Priority To Implement
-
-High. This command allows LLDB clients to run arbitrary shell
-commands on a remote host.
-
-### Description
-
-The request consists of the command to be executed encoded in ASCII characters
-converted into hex bytes.
-
-The response to this packet consists of the letter F followed by the return code,
-followed by the signal number (or 0 if no signal was delivered), and escaped bytes
-of captured program output.
-
-Below is an example communication from a client sending an "ls -la" command:
-```
-send packet: $qPlatform_shell:6c73202d6c61,00000002#ec
-read packet: $F,00000000,00000000,total 4736
-drwxrwxr-x 16 username groupname    4096 Aug 15 21:36 .
-drwxr-xr-x 17 username groupname    4096 Aug 10 16:39 ..
--rw-rw-r--  1 username groupname   73875 Aug 12 16:46 notes.txt
-drwxrwxr-x  5 username groupname    4096 Aug 15 21:36 source.cpp
--rw-r--r--  1 username groupname    2792 Aug 12 16:46 a.out
--rw-r--r--  1 username groupname    3190 Aug 12 16:46 Makefile
-```
-
-## qPlatform_mkdir
-
-### Brief
-
-Creates a new directory on the connected remote machine.
-
-### Priority To Implement
-
-Low. This command allows LLDB clients to create new directories on
-a remote host.
-
-### Description
-
-Request: `qPlatform_mkdir:<hex-file-mode>,<ascii-hex-path>`
-
-Reply: 
-  * `F<mkdir-return-code>`
-    (mkdir called successfully and returned with the given return code)
-  * `Exx` (An error occurred)
-
-
-## qPlatform_chmod
-
-### Brief
-
-Change the permissions of a file on the connected remote machine.
-
-### Priority To Implement
-
-Low. This command allows LLDB clients to change the permissions of
-a file on the remote host.
-
-### Description
-
-Request: `qPlatform_chmod:<hex-file-mode>,<ascii-hex-path>`
-
-Reply:
-* `F<chmod-return-code>`
-  (chmod called successfully and returned with the given return code)
-* `Exx` (An error occurred)
-
-## qHostInfo
-
-### Brief
-
-Get information about the host we are remotely connected to.
-
-### Priority To Implement
-
-High. This packet is usually very easy to implement and can help
-LLDB select the correct plug-ins for the job based on the target
-triple information that is supplied.
-
-### Description
-
-LLDB supports a host info call that gets all sorts of details of the system
-that is being debugged:
-```
-send packet: $qHostInfo#00
-read packet: $cputype:16777223;cpusubtype:3;ostype:darwin;vendor:apple;endian:little;ptrsize:8;#00
-```
-
-Key value pairs are one of:
-* `cputype`: is a number that is the mach-o CPU type that is being debugged (base 10)
-* `cpusubtype`: is a number that is the mach-o CPU subtype type that is being debugged (base 10)
-* `triple`: a string for the target triple (x86_64-apple-macosx) that can be used to specify arch + vendor + os in one entry
-* `vendor`: a string for the vendor (apple), not needed if "triple" is specified
-* `ostype`: a string for the OS being debugged (macosx, linux, freebsd, ios, watchos), not needed if "triple" is specified
-* `endian`: is one of "little", "big", or "pdp"
-* `ptrsize`: an unsigned number that represents how big pointers are in bytes on the debug target
-* `hostname`: the hostname of the host that is running the GDB server if available
-* `os_build`: a string for the OS build for the remote host as a string value
-* `os_kernel`: a string describing the kernel version
-* `os_version`: a version string that represents the current OS version (10.8.2)
-* `watchpoint_exceptions_received`: one of "before" or "after" to specify if a watchpoint is triggered before or after the pc when it stops
-* `default_packet_timeout`: an unsigned number that specifies the default timeout in seconds
-* `distribution_id`: optional. For linux, specifies distribution id (e.g. ubuntu, fedora, etc.)
-* `osmajor`: optional, specifies the major version number of the OS (e.g. for macOS 10.12.2, it would be 10)
-* `osminor`: optional, specifies the minor version number of the OS (e.g. for macOS 10.12.2, it would be 12)
-* `ospatch`: optional, specifies the patch level number of the OS (e.g. for macOS 10.12.2, it would be 2)
-* `vm-page-size`: optional, specifies the target system VM page size, base 10.
-  Needed for the "dirty-pages:" list in the qMemoryRegionInfo
-  packet, where a list of dirty pages is sent from the remote
-  stub.  This page size tells lldb how large each dirty page is.
-* `addressing_bits`: optional, specifies how many bits in addresses are
-	significant for addressing, base 10.  If bits 38..0
-	in a 64-bit pointer are significant for addressing,
-	then the value is 39.  This is needed on e.g. AArch64
-	v8.3 ABIs that use pointer authentication, so lldb
-	knows which bits to clear/set to get the actual
-	addresses.
-* `low_mem_addressing_bits`: optional, specifies how many bits in
-  addresses in low memory are significant for addressing, base 10.
-  AArch64 can have different page table setups for low and high
-  memory, and therefore a different number of bits used for addressing.
-* `high_mem_addressing_bits`: optional, specifies how many bits in
-  addresses in high memory are significant for addressing, base 10.
-  AArch64 can have different page table setups for low and high
-  memory, and therefore a different number of bits used for addressing.
-
-## qGDBServerVersion
-
-### Brief
-
-Get version information about this implementation of the gdb-remote
-protocol.
-
-### Priority To Implement
-
-High. This packet is usually very easy to implement and can help
-LLDB to work around bugs in a server's implementation when they
-are found.
-
-### Description
-
-The goal of this packet is to provide enough information about an
-implementation of the gdb-remote-protocol server that lldb can
-work around implementation problems that are discovered after the
-version has been released/deployed.  The name and version number
-should be sufficiently unique that lldb can unambiguously identify
-the origin of the program (for instance, debugserver from lldb) and
-the version/submission number/patch level of the program - whatever
-is appropriate for your server implementation.
-
-The packet follows the key-value pair model, semicolon separated.
-```
-send packet: $qGDBServerVersion#00
-read packet: $name:debugserver;version:310.2;#00
-```
-
-Other clients may find other key-value pairs to be useful for identifying
-a gdb stub.  Patch level, release name, build number may all be keys that
-better describe your implementation's version.
-
-Suggested key names:
-* `name`: the name of your remote server - "debugserver" is the lldb standard
-          implementation
-* `version`: identifies the version number of this server
-* `patch_level`: the patch level of this server
-* `release_name`: the name of this release, if your project uses names
-* `build_number`: if you use a build system with increasing build numbers,
-                  this may be the right key name for your server
-* `major_version`: major version number
-* `minor_version`: minor version number
-
-## qProcessInfo
-
-### Brief
-
-Get information about the process we are currently debugging.
-
-### Priority To Implement
-
-Medium.  On systems which can launch multiple different architecture processes,
-the qHostInfo may not disambiguate sufficiently to know what kind of
-process is being debugged.
-
-For example on a 64-bit x86 Mac system both 32-bit and 64-bit user processes are possible,
-and with Mach-O universal files, the executable file may contain both 32- and
-64-bit slices so it may be impossible to know until you're attached to a real
-process to know what you're working with.
-
-All numeric fields return base 16 numbers without any "0x" prefix.
-
-### Description
-
-An i386 process:
-```
-send packet: $qProcessInfo#00
-read packet: $pid:42a8;parent-pid:42bf;real-uid:ecf;real-gid:b;effective-uid:ecf;effective-gid:b;cputype:7;cpusubtype:3;ostype:macosx;vendor:apple;endian:little;ptrsize:4;#00
-```
-
-An x86_64 process:
-```
-send packet: $qProcessInfo#00
-read packet: $pid:d22c;parent-pid:d34d;real-uid:ecf;real-gid:b;effective-uid:ecf;effective-gid:b;cputype:1000007;cpusubtype:3;ostype:macosx;vendor:apple;endian:little;ptrsize:8;#00
-```
-
-Key value pairs include:
-* `pid`: the process id
-* `parent-pid`: the process of the parent process (often debugserver will become the parent when attaching)
-* `real-uid`: the real user id of the process
-* `real-gid`: the real group id of the process
-* `effective-uid`: the effective user id of the process
-* `effective-gid`: the effective group id of the process
-* `cputype`: the Mach-O CPU type of the process  (base 16)
-* `cpusubtype`: the Mach-O CPU subtype of the process  (base 16)
-* `ostype`: is a string the represents the OS being debugged (darwin, linux, freebsd)
-* `vendor`: is a string that represents the vendor (apple)
-* `endian`: is one of "little", "big", or "pdp"
-* `ptrsize`: is a number that represents how big pointers are in bytes
-* `main-binary-uuid`: is the UUID of a firmware type binary that the gdb stub knows about
-* `main-binary-address`: is the load address of the firmware type binary
-* `main-binary-slide`: is the slide of the firmware type binary, if address isn't known
-* `binary-addresses`: A comma-separated list of binary load addresses base 16.
-                      lldb will parse the binaries in memory to get UUIDs, then
-                      try to find the binaries & debug info by UUID.  Intended for
-                      use with a small number of firmware type binaries where the
-                      search for binary/debug info may be expensive.
+**Note:** `qRegisterInfo` is deprecated in favor of the standard gdb remote
+serial protocol register description method, `qXfer:features:read:target.xml`.
+If `qXfer:features:read:target.xml` is supported, `qRegisterInfo` does
+not need to be implemented.  The target.xml format is used by most
+gdb RSP stubs whereas `qRegisterInfo` was an lldb-only design.
+`qRegisterInfo` requires one packet per register and can have undesirable
+performance costs at the start of a debug session, whereas target.xml
+may be able to describe all registers in a single packet.
 
 ## qShlibInfoAddr
 
-### Brief
-
 Get an address where the dynamic linker stores information about
 where shared libraries are loaded.
 
-### Priority To Implement
-
-High if you have a dynamic loader plug-in in LLDB for your target
-triple (see the "qHostInfo" packet) that can use this information.
-Many times address load randomization can make it hard to detect
-where the dynamic loader binary and data structures are located and
-some platforms know, or can find out where this information is.
-
-Low if you have a debug target where all object and symbol files
-contain static load addresses.
-
-### Description
-
 LLDB and GDB both support the `qShlibInfoAddr` packet which is a hint to each
 debugger as to where to find the dynamic loader information. For darwin
 binaries that run in user land this is the address of the `all_image_infos`
@@ -1245,23 +1872,20 @@ send packet: $qShlibInfoAddr#00
 read packet: $7fff5fc40040#00
 ```
 
-## qThreadStopInfo\<tid\>
-
-### Brief
+**Priority To Implement:** High
 
-Get information about why a thread, whose ID is `<tid>`, is stopped.
+If you have a dynamic loader plug-in in LLDB for your target
+triple (see the "qHostInfo" packet) that can use this information.
+Many times address load randomization can make it hard to detect
+where the dynamic loader binary and data structures are located and
+some platforms know, or can find out where this information is.
 
-### Priority To Implement
+Low if you have a debug target where all object and symbol files
+contain static load addresses.
 
-High if you need to support multi-threaded or multi-core debugging.
-Many times one thread will hit a breakpoint and while the debugger
-is in the process of suspending the other threads, other threads
-will also hit a breakpoint. This packet allows LLDB to know why all
-threads (live system debug) / cores (JTAG) in your program have
-stopped and allows LLDB to display and control your program
-correctly.
+## qThreadStopInfo\<tid\>
 
-### Description
+Get information about why a thread, whose ID is `<tid>`, is stopped.
 
 LLDB tries to use the `qThreadStopInfo` packet which is formatted as
 `qThreadStopInfo%x` where `%x` is the hex thread ID. This requests information
@@ -1271,374 +1895,22 @@ remote packets love to think that there is only _one_ reason that _one_ thread
 stops at a time. This allows us to see why all threads stopped and allows us
 to implement better multi-threaded debugging support.
 
-## QThreadSuffixSupported
-
-### Brief
-
-Try to enable thread suffix support for the `g`, `G`, `p`, and `P` packets.
-
-### Priority To Implement
-
-High. Adding a thread suffix allows us to read and write registers
-more efficiently and stops us from having to select a thread with
-one packet and then read registers with a second packet. It also
-makes sure that no errors can occur where the debugger thinks it
-already has a thread selected (see the `Hg` packet from the standard
-GDB remote protocol documentation) yet the remote GDB server actually
-has another thread selected.
-
-### Description
-
-When reading thread registers, you currently need to set the current
-thread, then read the registers. This is kind of cumbersome, so we added the
-ability to query if the remote GDB server supports adding a `thread:<tid>;`
-suffix to all packets that request information for a thread. To test if the
-remote GDB server supports this feature:
-```
-send packet: $QThreadSuffixSupported#00
-read packet: OK
-```
-
-If `OK` is returned, then the `g`, `G`, `p` and `P` packets can accept a
-thread suffix. So to send a `g` packet (read all register values):
-```
-send packet: $g;thread:<tid>;#00
-read packet: ....
-
-send packet: $G;thread:<tid>;#00
-read packet: ....
+**Priority To Implement:** High
 
-send packet: $p1a;thread:<tid>;#00
-read packet: ....
-
-send packet: $P1a=1234abcd;thread:<tid>;#00
-read packet: ....
-```
-
-otherwise, without this you would need to always send two packets:
-```
-send packet: $Hg<tid>#00
-read packet: ....
-send packet: $g#00
-read packet: ....
-```
-
-We also added support for allocating and deallocating memory. We use this to
-allocate memory so we can run JITed code.
-
-## _M\<size\>,\<permissions\>
-
-### Brief
-
-Allocate memory on the remote target with the specified size and
-permissions.
-
-### Priority To Implement
-
-High if you want LLDB to be able to JIT code and run that code. JIT
-code also needs data which is also allocated and tracked.
-
-Low if you don't support running JIT'ed code.
-
-### Description
-
-The allocate memory packet starts with `_M<size>,<permissions>`. It returns a
-raw big endian address value, or an empty response for unimplemented, or `EXX` for an error
-code. The packet is formatted as:
-```
-char packet[256];
-int packet_len;
-packet_len = ::snprintf (
-    packet,
-    sizeof(packet),
-    "_M%zx,%s%s%s",
-    (size_t)size,
-    permissions & lldb::ePermissionsReadable ? "r" : "",
-    permissions & lldb::ePermissionsWritable ? "w" : "",
-    permissions & lldb::ePermissionsExecutable ? "x" : "");
-```
-
-You request a size and give the permissions. This packet does NOT need to be
-implemented if you don't want to support running JITed code. The return value
-is just the address of the newly allocated memory as raw big endian hex bytes.
-
-## _m\<addr\>
-
-### Brief
-
-Deallocate memory that was previously allocated using an allocate
-memory pack.
-
-### Priority To Implement
-
-High if you want LLDB to be able to JIT code and run that code. JIT
-code also needs data which is also allocated and tracked.
-
-Low if you don't support running JIT'ed code.
-
-### Description
-
-The deallocate memory packet is `_m<addr>` where you pass in the address you
-got back from a previous call to the allocate memory packet. It returns `OK`
-if the memory was successfully deallocated, or `EXX`" for an error, or an
-empty response if not supported.
-
-## qMemoryRegionInfo:\<addr\>
-
-### Brief
-
-Get information about the address range that contains `<addr>`.
-
-### Priority To Implement
-
-Medium. This is nice to have, but it isn't necessary. It helps LLDB
-do stack unwinding when we branch into memory that isn't executable.
-If we can detect that the code we are stopped in isn't executable,
-then we can recover registers for stack frames above the current
-frame. Otherwise we must assume we are in some JIT'ed code (not JIT
-code that LLDB has made) and assume that no registers are available
-in higher stack frames.
-
-### Description
-
-We added a way to get information for a memory region. The packet is:
-```
-qMemoryRegionInfo:<addr>
-```
-
-Where `<addr>` is a big endian hex address. The response is returned in a series
-of tuples like the data returned in a stop reply packet. The currently valid
-tuples to return are:
-* `start:<start-addr>;` - `<start-addr>` is a big endian hex address that is
-                          the start address of the range that contains `<addr>`
-* `size:<size>;` - `<size>` is a big endian hex byte size of the address
-                   of the range that contains `<addr>`
-* `permissions:<permissions>;` - `<permissions>` is a string that contains one
-                                 or more of the characters from `rwx`
-* `name:<name>;` - `<name>` is a hex encoded string that contains the name of
-                   the memory region mapped at the given address. In case of
-                   regions backed by a file it have to be the absolute path of
-                   the file while for anonymous regions it have to be the name
-                   associated to the region if that is available.
-* `flags:<flags-string>;` - where `<flags-string>` is a space separated string
-                            of flag names. Currently the only supported flag
-                            is `mt` for AArch64 memory tagging. lldb will
-                            ignore any other flags in this field.
-* `type:[<type>][,<type>];` - memory types that apply to this region, e.g.
-                              `stack` for stack memory.
-* `error:<ascii-byte-error-string>;` - where `<ascii-byte-error-string>` is
-                                       a hex encoded string value that
-                                       contains an error string
-* `dirty-pages:[<hexaddr>][,<hexaddr];` -
-  A list of memory pages within this
-  region that are "dirty" -- they have been modified.
-  Page addresses are in base 16. The size of a page can
-  be found from the `qHostInfo`'s `page-size` key-value.
-  
-  If the stub supports identifying dirty pages within a
-  memory region, this key should always be present for all
-  `qMemoryRegionInfo` replies.  This key with no pages
-  listed (`dirty-pages:;`) indicates no dirty pages in
-  this memory region.  The *absence* of this key means
-  that this stub cannot determine dirty pages.
-
-If the address requested is not in a mapped region (e.g. we've jumped through
-a NULL pointer and are at 0x0) currently lldb expects to get back the size
-of the unmapped region -- that is, the distance to the next valid region.
-For instance, with a macOS process which has nothing mapped in the first
-4GB of its address space, if we're asking about address 0x2:
-```
-  qMemoryRegionInfo:2
-  start:2;size:fffffffe;
-```
-
-The lack of `permissions:` indicates that none of read/write/execute are valid
-for this region.
-
-## "x" - Binary memory read
-
-### Brief
-
-Like the `m` (read) and `M` (write) packets, this is a partner to the
-`X` (write binary data) packet, `x`.
-
-It is called like
-```
-xADDRESS,LENGTH
-```
-
-where both `ADDRESS` and `LENGTH` are big-endian base 16 values.
-
-To test if this packet is available, send a addr/len of 0:
-```
-x0,0
-```
-You will get an `OK` response if it is supported.
-
-The reply will be the data requested in 8-bit binary data format.
-The standard quoting is applied to the payload. Characters `}  #  $  *`
-will all be escaped with `}` (`0x7d`) character and then XOR'ed with `0x20`.
-
-A typical use to read 512 bytes at 0x1000 would look like:
-```
-x0x1000,0x200
-```
-The `0x` prefixes are optional - like most of the gdb-remote packets,
-omitting them will work fine; these numbers are always base 16.
-
-The length of the payload is not provided.  A reliable, 8-bit clean,
-transport layer is assumed.
-
-## Detach and stay stopped
-
-### Description
-
-We extended the "D" packet to specify that the monitor should keep the
-target suspended on detach.  The normal behavior is to resume execution
-on detach.  We will send:
-```
-qSupportsDetachAndStayStopped:
-```
-
-to query whether the monitor supports the extended detach, and if it does,
-when we want the monitor to detach but not resume the target, we will
-send:
-```
-D1
-```
-In any case, if we want the normal detach behavior we will just send:
-```
-D
-```
-
-## QSaveRegisterState / QSaveRegisterState;thread:XXXX;
-
-### Brief
-
-The `QSaveRegisterState` packet tells the remote debugserver to save
-all registers and return a non-zero unique integer ID that
-represents these save registers. If thread suffixes are enabled the
-second form of this packet is used, otherwise the first form is
-used. This packet is called prior to executing an expression, so
-the remote GDB server should do anything it needs to in order to
-ensure the registers that are saved are correct. On macOS this
-involves calling `thread_abort_safely(mach_port_t thread)` to
-ensure we get the correct registers for a thread in case it is
-currently having code run on its behalf in the kernel.
-
-### Response
-
-* `<unsigned int>` - The save_id result is a non-zero unsigned integer value
-                 that can be passed back to the GDB server using a
-                 `QRestoreRegisterState` packet to restore the registers
-                 one time.
-* `EXX` - or an error code in the form of `EXX` where `XX` is a
-          hex error code.
-
-### Priority To Implement
-
-Low, this is mostly a convenience packet to avoid having to send all
-registers with a `g` packet. It should only be implemented if support
-for the `QRestoreRegisterState` is added.
-
-## QRestoreRegisterState:\<save_id\> / QRestoreRegisterState:\<save_id\>;thread:XXXX;
-
-### Brief
-
-The `QRestoreRegisterState` packet tells the remote debugserver to
-restore all registers using the `save_id` which is an unsigned
-integer that was returned from a previous call to
-`QSaveRegisterState`. The restoration process can only be done once
-as the data backing the register state will be freed upon the
-completion of the `QRestoreRegisterState` command.
-
-If thread suffixes are enabled the second form of this packet is
-used, otherwise the first form is used.
-
-### Response
-
-* `OK` - if all registers were successfully restored
-* `EXX` - for any errors
-
-### Priority To Implement
-
-Low, this is mostly a convenience packet to avoid having to send all
-registers with a `g` packet. It should only be implemented if support
-for the `QSaveRegisterState` is added.
-
-## qFileLoadAddress:\<file_path\>
-
-### Brief
-
-Get the load address of a memory mapped file.
-The load address is defined as the address of the first memory
-region what contains data mapped from the specified file.
-
-### Response
-
-* `<unsigned-hex64>` - Load address of the file in big endian encoding
-* `E01` - the requested file isn't loaded
-* `EXX` - for any other errors
-
-### Priority To Implement
-
-Low, required if dynamic linker don't fill in the load address of
-some object file in the rendezvous data structure.
-
-## qModuleInfo:\<module_path\>;\<arch triple\>
-
-### Brief
-
-Get information for a module by given module path and architecture.
-
-### Response
-
-* `(uuid|md5):...;triple:...;file_offset:...;file_size...;`
-* `EXX` - for any errors
-
-### Priority To Implement
-
-Optional, required if dynamic loader cannot fetch module's information like
-UUID directly from inferior's memory.
-
-## jModulesInfo:[{"file":"...",triple:"..."}, ...]
-
-### Brief
-
-Get information for a list of modules by given module path and
-architecture.
-
-### Response
-
-A JSON array of dictionaries containing the following keys:
-* `uuid`
-* `triple`
-* `file_path`
-* `file_offset`
-* `file_size`
-
-The meaning of the fields is the same as in the `qModuleInfo` packet. The server
-signals the failure to retrieve the module info for a file by ommiting the
-corresponding array entry from the response. The server may also
-include entries the client did not ask for, if it has reason to
-the modules will be interesting to the client.
-
-### Priority To Implement
-
-Optional. If not implemented, `qModuleInfo` packet will be used, which
-may be slower if the target contains a large number of modules and
-the communication link has a non-negligible latency.
+If you need to support multi-threaded or multi-core debugging.
+Many times one thread will hit a breakpoint and while the debugger
+is in the process of suspending the other threads, other threads
+will also hit a breakpoint. This packet allows LLDB to know why all
+threads (live system debug) / cores (JTAG) in your program have
+stopped and allows LLDB to display and control your program
+correctly.
 
 ## Stop reply packet extensions
 
-### Brief
-
 This section describes some of the additional information you can
 specify in stop reply packets that help LLDB to know more detailed
 information about your threads.
 
-### Description
-
 Standard GDB remote stop reply packets are reply packets sent in
 response to a packet  that made the program run. They come in the
 following forms:
@@ -1840,181 +2112,26 @@ program correctly. What if a real SIGTRAP was delivered to a thread
 while we were trying to single step? We wouldn't know the difference
 with a standard GDB remote server and we could do the wrong thing.
 
-### Priority To Implement
-
-High. Having the extra information in your stop reply packets makes
-your debug session more reliable and informative.
-
-## qfProcessInfo / qsProcessInfo (Platform Extension)
-
-### Brief
-
-Get the first process info (`qfProcessInfo`) or subsequent process
-info (`qsProcessInfo`) for one or more processes on the remote
-platform. The first call gets the first match and subsequent calls
-to `qsProcessInfo` gets the subsequent matches. Return an error `EXX`,
-where `XX` are two hex digits, when no more matches are available.
-
-### Priority To Implement
-
-Required. The `qfProcessInfo` packet can be followed by a `:` and
-some key value pairs. The key value pairs in the command are:
-
-* `name` - `ascii-hex` -
-  An ASCII hex string that contains the name of the process that will be matched.
-* `name_match` - `enum` -
-  One of:
-    * `equals`
-    * `starts_with`
-    * `ends_with`
-    * `contains`
-    * `regex`
-* `pid` - `integer`- A string value containing the decimal process ID
-* `parent_pid` - `integer` - A string value containing the decimal parent process ID
-* `uid` - `integer` - A string value containing the decimal user ID
-* `gid` - `integer` - A string value containing the decimal group ID
-* `euid` - `integer` - A string value containing the decimal effective user ID
-* `egid` - `integer` - A string value containing the decimal effective group ID
-* `all_users` - `bool` -
-  A boolean value that specifies if processes should
-  be listed for all users, not just the user that the
-  platform is running as
-* `triple` - `string` -
-  An ASCII triple string (`x86_64`, `x86_64-apple-macosx`, `armv7-apple-ios`)
-* `args` - `string` -
-  A string value containing the process arguments separated by the character `-`,
-  where each argument is hex-encoded. It includes `argv[0]`.
-
-The response consists of key/value pairs where the key is separated from the
-values with colons and each pair is terminated with a semi colon. For a list
-of the key/value pairs in the response see the `qProcessInfoPID` packet
-documentation.
-
-Sample packet/response:
-```
-send packet: $qfProcessInfo#00
-read packet: $pid:60001;ppid:59948;uid:7746;gid:11;euid:7746;egid:11;name:6c6c6462;triple:x86_64-apple-macosx;#00
-send packet: $qsProcessInfo#00
-read packet: $pid:59992;ppid:192;uid:7746;gid:11;euid:7746;egid:11;name:6d64776f726b6572;triple:x86_64-apple-macosx;#00
-send packet: $qsProcessInfo#00
-read packet: $E04#00
-```
-
-## qLaunchGDBServer (Platform Extension)
-
-### Brief
-
-Have the remote platform launch a GDB server.
-
-### Priority To Implement
-
-Required. The `qLaunchGDBServer` packet must be followed by a `:` and
-some key value pairs. The key value pairs in the command are:
-* `port` - `integer` -
-  A string value containing the decimal port ID or zero if the port should be
-  bound and returned
-* `host` - `integer` -
-  The host that connections should be limited to when the GDB server is connected to.
-
-### Description
-
-The response consists of key/value pairs where the key is separated from the
-values with colons and each pair is terminated with a semi colon.
-
-Sample packet/response:
-```
-send packet: $qLaunchGDBServer:port:0;host:lldb.apple.com;#00
-read packet: $pid:60025;port:50776;#00
-```
-
-The `pid` key/value pair is only specified if the remote platform launched
-a separate process for the GDB remote server and can be omitted if no
-process was separately launched.
-
-The `port` key/value pair in the response lets clients know what port number
-to attach to in case zero was specified as the "port" in the sent command.
-
-
-## qProcessInfoPID:PID (Platform Extension)
-
-### Brief
-
-Have the remote platform get detailed information on a process by
-ID. PID is specified as a decimal integer.
-
-### Priority To Implement
-
-Optional.
-
-### Description
-
-The response consists of key/value pairs where the key is separated from the
-values with colons and each pair is terminated with a semi colon.
-
-The key value pairs in the response are:
-* `pid` - `integer` - Process ID as a decimal integer string
-* `ppid` - `integer` - Parent process ID as a decimal integer string
-* `uid` - `integer` - A string value containing the decimal user ID
-* `gid` - `integer` - A string value containing the decimal group ID
-* `euid` - `integer` - A string value containing the decimal effective user ID
-* `egid` - `integer` - A string value containing the decimal effective group ID
-* `name` - `ascii-hex` - An ASCII hex string that contains the name of the process
-* `triple` - `string` - A target triple (`x86_64-apple-macosx`, `armv7-apple-ios`)
-
-Sample packet/response:
-```
-send packet: $qProcessInfoPID:60050#00
-read packet: $pid:60050;ppid:59948;uid:7746;gid:11;euid:7746;egid:11;name:6c6c6462;triple:x86_64-apple-macosx;#00
-```
+**Priority To Implement:** High. Having the extra information in your stop reply
+packets makes your debug session more reliable and informative.
 
 ## vAttachName
 
-### Brief
-
 Same as `vAttach`, except instead of a `pid` you send a process name.
 
-### Priority To Implement
-
-Low. Only needed for `process attach -n`.  If the packet isn't supported
-then `process attach -n` will fail gracefully.  So you need only to support
-it if attaching to a process by name makes sense for your environment.
-
-## vAttachWait
-
-### Brief
-
-Same as `vAttachName`, except that the stub should wait for the next instance
-of a process by that name to be launched and attach to that.
-
-### Priority To Implement
-
-Low. Only needed to support `process attach -w -n` which will fail
-gracefully if the packet is not supported.
-
-## qAttachOrWaitSupported
-
-### Brief
-
-This is a binary "is it supported" query. Return OK if you support
-`vAttachOrWait`.
-
-### Priority To Implement
-
-Low. This is required if you support `vAttachOrWait`, otherwise no support
-is needed since the standard "I don't recognize this packet" response
-will do the right thing.
+**Priority To Implement:** Low. Only needed for `process attach -n`. If the
+packet isn't supported then `process attach -n` will fail gracefully. So you need
+only to support it if attaching to a process by name makes sense for your environment.
 
 ## vAttachOrWait
 
-### Brief
-
 Same as `vAttachWait`, except that the stub will attach to a process
 by name if it exists, and if it does not, it will wait for a process
 of that name to appear and attach to it.
 
-### Priority To Implement
+**Priority To Implement:** Low
 
-Low. Only needed to implement `process attach -w -i false -n`.  If
+Only needed to implement `process attach -w -i false -n`.  If
 you don't implement it but do implement `-n` AND lldb can somehow get
 a process list from your device, it will fall back on scanning the
 process list, and sending `vAttach` or `vAttachWait` depending on
@@ -2022,378 +2139,212 @@ whether the requested process exists already.  This is racy,
 however, so if you want to support this behavior it is better to
 support this packet.
 
-## jThreadExtendedInfo
+## vAttachWait
 
-### Brief
+Same as `vAttachName`, except that the stub should wait for the next instance
+of a process by that name to be launched and attach to that.
 
-This packet, which takes its arguments as JSON and sends its reply as
-JSON, allows the gdb remote stub to provide additional information
-about a given thread.
+**Priority To Implement:** Low. Only needed to support `process attach -w -n`
+which will fail gracefully if the packet is not supported.
 
-### Priority To Implement
+## vFile Packets
 
-Low.  This packet is only needed if the gdb remote stub wants to
-provide interesting additional information about a thread for the
-user.
+Though some of these may match the ones described in GDB's protocol
+documentation, we include our own expectations here in case of
+mismatches or extensions.
 
-### Description
+### vFile:chmod / qPlatform_chmod
 
-This packet takes its arguments in [JSON](http://www.json.org).
-At a minimum, a thread must be specified, for example:
-```
-jThreadExtendedInfo:{"thread":612910}
-```
+Change the permissions of a file on the connected remote machine.
 
-Because this is a JSON string, the thread number is provided in base 10.
-Additional key-value pairs may be provided by lldb to the gdb remote
-stub.  For instance, on some versions of macOS, lldb can read offset
-information out of the system libraries.  Using those offsets, debugserver
-is able to find the Thread Specific Address (TSD) for a thread and include
-that in the return information.  So lldb will send these additional fields
-like so:
-```
-jThreadExtendedInfo:{"plo_pthread_tsd_base_address_offset":0,"plo_pthread_tsd_base_offset":224,"plo_pthread_tsd_entry_size":8,"thread":612910}
-```
+Request: `qPlatform_chmod:<hex-file-mode>,<ascii-hex-path>`
 
-There are no requirements for what is included in the response.  A simple
-reply on a OS X Yosemite / iOS 8 may include the pthread_t value, the
-Thread Specific Data (TSD) address, the dispatch_queue_t value if the thread
-is associated with a GCD queue, and the requested Quality of Service (QoS)
-information about that thread.  For instance, a reply may look like:
-```
-{"tsd_address":4371349728,"requested_qos":{"enum_value":33,"constant_name":"QOS_CLASS_USER_INTERACTIVE","printable_name":"User Interactive"},"pthread_t":4371349504,"dispatch_queue_t":140735087127872}
-```
+Reply:
+* `F<chmod-return-code>`
+  (chmod called successfully and returned with the given return code)
+* `Exx` (An error occurred)
 
-`tsd_address`, `pthread_t`, and `dispatch_queue_t` are all simple key-value pairs.
-The JSON standard requires that numbers be expressed in base 10 - so all of
-these are. `requested_qos` is a dictionary with three key-value pairs in it -
-so the UI layer may choose the form most appropriate for displaying to the user.
+### vFile:close
 
-Sending JSON over gdb-remote protocol introduces some problems.  We may be
-sending strings with arbitrary contents in them, including the `#`, `$`, and `*`
-characters that have special meaning in gdb-remote protocol and cannot occur
-in the middle of the string. The standard solution for this would be to require
-ascii-hex encoding of all strings, or ascii-hex encode the entire JSON payload.
+Close a previously opened file descriptor.
 
-Instead, the binary escaping convention is used for JSON data.  This convention
-(e.g. used for the `X` packet) says that if `#`, `$`, `*`, or `}` are to occur in
-the payload, the character `}` (`0x7d`) is emitted, then the metacharacter is emitted
-xor'ed by `0x20`. The `}` character occurs in every JSON payload at least once, and
-`} ^ 0x20` happens to be `]` so the raw packet characters for a request will look
-like:
 ```
-jThreadExtendedInfo:{"thread":612910}]
+receive: vFile:close:7
+send:    F0
 ```
 
-## QEnableCompression
+File descriptor is in base 16. `F-1,errno` with the errno if an error occurs,
+errno is base 16.
 
-### Brief
+### vFile:exists
 
-This packet enables compression of the packets that the debug stub sends to lldb.
-If the debug stub can support compression, it indictes this in the reply of the
-"qSupported" packet. For example:
-```
-LLDB SENDS:    qSupported:xmlRegisters=i386,arm,mips
-STUB REPLIES:  qXfer:features:read+;SupportedCompressions=lzfse,zlib-deflate,lz4,lzma;
-```
+Check whether the file at the given path exists.
 
-If lldb knows how to use any of these compression algorithms, it can ask that this
-compression mode be enabled.
 ```
-QEnableCompression:type:zlib-deflate;
+receive: vFile:exists:2f746d702f61
+send         (exists): F,1
+send (does not exist): F,0
 ```
 
-The debug stub should reply with an uncompressed `OK` packet to indicate that the
-request was accepted.  All further packets the stub sends will use this compression.
+Request packet contains the ASCII hex encoded filename.
 
-Packets are compressed as the last step before they are sent from the stub, and
-decompressed as the first step after they are received.  The packet format in compressed
-mode becomes one of two:
-```
-$N<uncompressed payload>#00
+The response is a return code where 1 means the file exists
+and 0 means it does not.
 
-$C<size of uncompressed payload in base 10>:<compressed payload>#00
-```
+**Priority To Implement:** Low
 
-Where `#00` is the actual checksum value if noack mode is not enabled. The checksum
-value is for the `N<uncompressed payload>` or
-`C<size of uncompressed payload in base 10>:<compressed payload>` bytes in the packet.
+### vFile:MD5
 
-The size of the uncompressed payload in base 10 is provided because it will simplify
-decompression if the final buffer size needed is known ahead of time.
+Generate an MD5 hash of the file at the given path.
 
-Compression on low-latency connections is unlikely to be an improvement. Particularly
-when the debug stub and lldb are running on the same host. It should only be used
-for slow connections, and likely only for larger packets.
+```
+receive: vFile:MD5:2f746d702f61
+send (success): F,00000000000000001111111111111111
+send (failure): F,x
+```
 
-Example compression algorithms that may be used include:
-* `zlib-deflate` -
-  The raw DEFLATE format as described in IETF RFC 1951.  With the ZLIB library, you
-  can compress to this format with an initialization like
-      deflateInit2 (&stream, 5, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY)
-  and you can decompress with an initialization like
-      inflateInit2 (&stream, -15).
-* `lz4` -
-  https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)
-  https://github.com/Cyan4973/lz4
-  The libcompression APIs on darwin systems call this `COMPRESSION_LZ4_RAW`.
-* `lzfse` -
-  Compression algorithm added in macOS 10.11, with open source C reference
-  implementation on github.
-  https://en.wikipedia.org/wiki/LZFSE
-  https://github.com/lzfse/lzfse
-* `lzma` -
-  libcompression implements "LZMA level 6", the default compression for the
-  open source LZMA implementation.
+Request packet contains the ASCII hex encoded filename.
 
-## jGetLoadedDynamicLibrariesInfos
+If the hash succeeded, the response is `F,` followed by the low 64
+bits of the result, and finally the high 64 bits of the result. Both are in
+hex format without a prefix.
 
-### Brief
+The response is `F,`, followed by `x` if the file did not exist
+or failed to hash.
 
-This packet asks the remote debug stub to send the details about libraries
-being added/removed from the process as a performance optimization.
+### vFile:mode
 
-There are two ways this packet can be used.  Both return a dictionary of
-binary images formatted the same way.
+Get the mode bits of a file on the target system, filename in ASCII hex.
 
-One requests information on all shared libraries:
 ```
-jGetLoadedDynamicLibrariesInfos:{"fetch_all_solibs":true}
+receive: vFile:mode:2f746d702f61
+send:    F1ed
 ```
-with an optional `"report_load_commands":false` which can be added, asking
-that only the dyld SPI information (load addresses, filenames) be returned.
-The default behavior is that debugserver scans the mach-o header and load
-commands of each binary, and returns it in the JSON reply.
 
-And the second requests information about a list of shared libraries, given their load addresses:
-```
-jGetLoadedDynamicLibrariesInfos:{"solib_addresses":[8382824135,3258302053,830202858503]}
-```
+response is `F` followed by the mode bits in base 16, this `0x1ed` would
+correspond to `0755` in octal.
+`F-1,errno` with the errno if an error occurs, base 16.
 
-The second call is both a performance optimization (instead of having lldb read the mach-o header/load commands
-out of memory with generic read packets) but also adds additional information in the form of the
-filename of the shared libraries (which is not available in the mach-o header/load commands.)
+### vFile:open
 
-An example using the OS X 10.11 style call:
-```
-LLDB SENDS: jGetLoadedDynamicLibrariesInfos:{"image_count":1,"image_list_address":140734800075128}
-STUB REPLIES: ${"images":[{"load_address":4294967296,"mod_date":0,"pathname":"/tmp/a.out","uuid":"02CF262C-ED6F-3965-9E14-63538B465CFF","mach_header":{"magic":4277009103,"cputype":16777223,"cpusubtype":18446744071562067971,"filetype":2},"segments":{"name":"__PAGEZERO","vmaddr":0,"vmsize":4294967296,"fileoff":0,"filesize":0,"maxprot":0},{"name":"__TEXT","vmaddr":4294967296,"vmsize":4096,"fileoff":0,"filesize":4096,"maxprot":7},{"name":"__LINKEDIT","vmaddr":4294971392,"vmsize":4096,"fileoff":4096,"filesize":152,"maxprot":7}}]}#00
-```
+Open a file on the remote system and return the file descriptor of it.
 
-Or pretty-printed:
 ```
-STUB REPLIES: ${"images":
-                [
-                    {"load_address":4294967296,
-                     "mod_date":0,
-                     "pathname":"/tmp/a.out",
-                     "uuid":"02CF262C-ED6F-3965-9E14-63538B465CFF",
-                     "mach_header":
-                        {"magic":4277009103,
-                         "cputype":16777223,
-                         "cpusubtype":18446744071562067971,
-                         "filetype":2
-                         },
-                     "segments":
-                      [
-                        {"name":"__PAGEZERO",
-                         "vmaddr":0,
-                         "vmsize":4294967296,
-                         "fileoff":0,
-                         "filesize":0,
-                         "maxprot":0
-                        },
-                        {"name":"__TEXT",
-                         "vmaddr":4294967296,
-                         "vmsize":4096,
-                         "fileoff":0,
-                         "filesize":4096,
-                         "maxprot":7
-                        },
-                        {"name":"__LINKEDIT",
-                         "vmaddr":4294971392,
-                         "vmsize":4096,
-                         "fileoff":4096,
-                         "filesize":152,
-                         "maxprot":7
-                        }
-                      ]
-                    }
-                ]
-            }
+receive: vFile:open:2f746d702f61,00000001,00000180
+send:    F8
 ```
 
-### Description
-
-This is similar to the `qXfer:libraries:read` packet, and it could
-be argued that it should be merged into that packet.  A separate
-packet was created primarily because lldb needs to specify the
-number of images to be read and the address from which the initial
-information is read.  Also the XML DTD would need to be extended
-quite a bit to provide all the information that the `DynamicLoaderMacOSX`
-would need to work correctly on this platform.
+request packet has the fields:
+   1. ASCII hex encoded filename
+   2. Flags passed to the open call, base 16.
+      Note that these are not the `oflags` that `open(2)` takes, but
+      are the constant values in `enum OpenOptions` from LLDB's
+      [`File.h`](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Host/File.h).
+   3. Mode bits, base 16
 
-### Priority To Implement
+response is `F` followed by the opened file descriptor in base 16.
+`F-1,errno` with the errno if an error occurs, base 16.
 
-On OS X 10.11, iOS 9, tvOS 9, watchOS 2 and older: Low.  If this packet is absent,
-lldb will read the Mach-O headers/load commands out of memory.
-On macOS 10.12, iOS 10, tvOS 10, watchOS 3 and newer: High.  If this packet is absent,
-lldb will not know anything about shared libraries in the inferior, or where the main
-executable loaded.
+### vFile:pread
 
-## jThreadsInfo
+Read data from an opened file descriptor.
 
-### Brief
+```
+receive: vFile:pread:7,1024,0
+send:    F4;a'b\00
+```
 
-Ask for the server for thread stop information of all threads.
+Request packet has the fields:
+   1. File descriptor, base 16
+   2. Number of bytes to be read, base 16
+   3. Offset into file to start from, base 16
 
-### Priority To Implement
+Response is `F`, followed by the number of bytes read (base 16), a
+semicolon, followed by the data in the binary-escaped-data encoding.
 
-Low. This is a performance optimization, which speeds up debugging by avoiding
-multiple round-trips for retrieving thread information. The information from this
-packet can be retrieved using a combination of `qThreadStopInfo` and `m` packets.
+### vFile:pwrite
 
-### Description
+Write data to a previously opened file descriptor.
 
-The data in this packet is very similar to the stop reply packets, but is packaged in
-JSON and uses JSON arrays where applicable. The JSON output looks like:
 ```
-    [
-      { "tid":1580681,
-        "metype":6,
-        "medata":[2,0],
-        "reason":"exception",
-        "qaddr":140735118423168,
-        "registers": {
-          "0":"8000000000000000",
-          "1":"0000000000000000",
-          "2":"20fabf5fff7f0000",
-          "3":"e8f8bf5fff7f0000",
-          "4":"0100000000000000",
-          "5":"d8f8bf5fff7f0000",
-          "6":"b0f8bf5fff7f0000",
-          "7":"20f4bf5fff7f0000",
-          "8":"8000000000000000",
-          "9":"61a8db78a61500db",
-          "10":"3200000000000000",
-          "11":"4602000000000000",
-          "12":"0000000000000000",
-          "13":"0000000000000000",
-          "14":"0000000000000000",
-          "15":"0000000000000000",
-          "16":"960b000001000000",
-          "17":"0202000000000000",
-          "18":"2b00000000000000",
-          "19":"0000000000000000",
-          "20":"0000000000000000"
-        },
-        "memory":[
-          {"address":140734799804592,"bytes":"c8f8bf5fff7f0000c9a59e8cff7f0000"},
-          {"address":140734799804616,"bytes":"00000000000000000100000000000000"}
-        ]
-      }
-    ]
+receive: vFile:pwrite:8,0,\cf\fa\ed\fe\0c\00\00
+send:    F1024
 ```
 
-It contains an array of dictionaries with all of the key value pairs that are
-normally in the stop reply packet, including the expedited registers. The registers are
-passed as hex-encoded JSON string in debuggee-endian byte order. Note that the register
-numbers are decimal numbers, unlike the stop-reply packet, where they are written in
-hex. The packet also contains expedited memory in the `memory` key.  This allows the
-server to expedite memory that the client is likely to use (e.g., areas around the
-stack pointer, which are needed for computing backtraces) and it reduces the packet
-count.
+Request packet has the fields:
+   1. File descriptor, base 16
+   2. Offset into file to start from, base 16
+   3. binary-escaped-data to be written
 
-On macOS with debugserver, we expedite the frame pointer backchain for a thread
-(up to 256 entries) by reading 2 pointers worth of bytes at the frame pointer (for
-the previous FP and PC), and follow the backchain. Most backtraces on macOS and
-iOS now don't require us to read any memory!
-
-## jGetSharedCacheInfo
+Response is `F`, followed by the number of bytes written (base 16).
 
-### Brief
+### vFile:size
 
-This packet asks the remote debug stub to send the details about the inferior's
-shared cache. The shared cache is a collection of common libraries/frameworks that
-are mapped into every process at the same address on Darwin systems, and can be
-identified by a load address and UUID.
+Get the size of a file on the target system, filename in ASCII hex.
 
 ```
-LLDB SENDS: jGetSharedCacheInfo:{}
-STUB REPLIES: ${"shared_cache_base_address":140735683125248,"shared_cache_uuid":"DDB8D70C-C9A2-3561-B2C8-BE48A4F33F96","no_shared_cache":false,"shared_cache_private_cache":false]}#00
+receive: vFile:size:2f746d702f61
+send:    Fc008
 ```
 
-### Priority To Implement
+response is `F` followed by the file size in base 16.
+`F-1,errno` with the errno if an error occurs, base 16.
 
-Low.  When both lldb and the inferior process are running on the same computer, and lldb
-and the inferior process have the same shared cache, lldb may (as an optimization) read
-the shared cache out of its own memory instead of using gdb-remote read packets to read
-them from the inferior process.
+### vFile:symlink
 
-## qQueryGDBServer
-
-### Brief
-
-Ask the platform for the list of gdbservers we have to connect
+Create a symbolic link (symlink, soft-link) on the target system.
 
-### Priority To Implement
+```
+receive: vFile:symlink:<SRC-FILE>,<DST-NAME>
+send:    F0,0
+```
 
-Low. The packet is required to support connecting to gdbserver started
-by the platform instance automatically.
+Argument file paths are in ascii-hex encoding.
+Response is `F` plus the return value of `symlink()`, base 16 encoding,
+optionally followed by the value of errno if it failed, also base 16.
 
-### Description
+### vFile:unlink
 
-If the remote platform automatically started one or more gdbserver instance (without
-lldb asking it) then it have to return the list of port number or socket name for
-each of them what can be used by lldb to connect to those instances.
-
-The data in this packet is a JSON array of JSON objects with the following keys:
-* `port`: `<the port number to connect>` (optional)
-* `socket_name`: `<the name of the socket to connect>` (optional)
+Remove a file on the target system.
 
-Example packet:
 ```
-[
-    { "port": 1234 },
-    { "port": 5432 },
-    { "socket_name": "foo" }
-]
+receive: vFile:unlink:2f746d702f61
+send:    F0
 ```
 
-## QSetDetachOnError
-
-### Brief
+Argument is a file path in ascii-hex encoding.
+Response is `F` plus the return value of `unlink()`, base 16 encoding.
+Return value may optionally be followed by a comma and the base16
+value of errno if unlink failed.
 
-Sets what the server should do when the communication channel with LLDB
-goes down. Either kill the inferior process (`0`) or remove breakpoints and
-detach (`1`).
-
-### Priority To Implement
-
-Low. Only required if the target wants to keep the inferior process alive
-when the communication channel goes down.
+## "x" - Binary memory read
 
-### Description
+Like the `m` (read) and `M` (write) packets, this is a partner to the
+`X` (write binary data) packet, `x`.
 
-The data in this packet is a single a character, which should be `0` if the
-inferior process should be killed, or `1` if the server should remove all
-breakpoints and detach from the inferior.
+It is called like
+```
+xADDRESS,LENGTH
+```
 
-## jGetDyldProcessState
+where both `ADDRESS` and `LENGTH` are big-endian base 16 values.
 
-### Brief
+To test if this packet is available, send a addr/len of 0:
+```
+x0,0
+```
+You will get an `OK` response if it is supported.
 
-This packet fetches the process launch state, as reported by libdyld on
-Darwin systems, most importantly to indicate when the system libraries
-have initialized sufficiently to safely call utility functions.
+The reply will be the data requested in 8-bit binary data format.
+The standard quoting is applied to the payload. Characters `}  #  $  *`
+will all be escaped with `}` (`0x7d`) character and then XOR'ed with `0x20`.
 
+A typical use to read 512 bytes at 0x1000 would look like:
 ```
-LLDB SENDS: jGetDyldProcessState
-STUB REPLIES: {"process_state_value":48,"process_state string":"dyld_process_state_libSystem_initialized"}
+x0x1000,0x200
 ```
+The `0x` prefixes are optional - like most of the gdb-remote packets,
+omitting them will work fine; these numbers are always base 16.
 
-### Priority To Implement
-
-Low. This packet is needed to prevent lldb's utility functions for
-scanning the Objective-C class list from running very early in
-process startup.
+The length of the payload is not provided.  A reliable, 8-bit clean,
+transport layer is assumed.
+\ No newline at end of file
diff --git a/lldb/docs/resources/lldbplatformpackets.md b/lldb/docs/resources/lldbplatformpackets.md
new file mode 100644
index 000000000000..fea225528c9a
--- /dev/null
+++ b/lldb/docs/resources/lldbplatformpackets.md
@@ -0,0 +1,49 @@
+# LLDB Platform Packets
+
+This is a list of the packets that an lldb platform server
+needs to implement for the lldb testsuite to be run on a remote
+target device/system.
+
+These are almost all lldb extensions to the gdb-remote serial
+protocol. Many of the `vFile:` packets are also described in the "Host
+I/O Packets" detailed in the gdb-remote protocol documentation,
+although the lldb platform extensions include packets that are not
+defined there (`vFile:size:`, `vFile:mode:`, `vFile:symlink`, `vFile:chmod:`).
+
+Most importantly, the flags that LLDB passes to `vFile:open:` are
+incompatible with the flags that GDB specifies.
+
+* [QSetWorkingDir](./lldbgdbremote.md#qsetworkingdir-ascii-hex-path)
+* [QStartNoAckMode](./lldbgdbremote.md#qstartnoackmode)
+* [qGetWorkingDir](./lldbgdbremote.md#qgetworkingdir)
+* [qHostInfo](./lldbgdbremote.md#qhostinfo)
+* [qKillSpawnedProcess](./lldbgdbremote.md#qkillspawnedprocess-platform-extension)
+* [qLaunchGDBServer](./lldbgdbremote.md#qlaunchgdbserver-platform-extension)
+* [qModuleInfo](./lldbgdbremote.md#qmoduleinfo-module-path-arch-triple)
+* [qPathComplete](./lldbgdbremote.md#qpathcomplete-platform-extension)
+* [qPlatform_mkdir](./lldbgdbremote.md#qplatform-mkdir)
+* [qPlatform_shell](./lldbgdbremote.md#qplatform-shell)
+* [qProcessInfo](./lldbgdbremote.md#qprocessinfo)
+  * The lldb test suite currently only uses `name_match:equals` and the no-criteria mode to list every process.
+* [qProcessInfoPID](./lldbgdbremote.md#qprocessinfopid-pid-platform-extension)
+  * It is likely that you only need to support the `pid` and `name` fields.
+* [vFile:chmod](./lldbgdbremote.md#vfile-chmod-qplatform-chmod)
+* [vFile:close](./lldbgdbremote.md#vfile-close)
+* [vFile:mode](./lldbgdbremote.md#vfile-mode)
+* [vFile:open](./lldbgdbremote.md#vfile-open)
+* [vFile:pread](./lldbgdbremote.md#vfile-pread)
+* [vFile:pwrite](./lldbgdbremote.md#vfile-pwrite)
+* [vFile:size](./lldbgdbremote.md#vfile-size)
+* [vFile:symlink](./lldbgdbremote.md#vfile-symlink)
+* [vFile:unlink](./lldbgdbremote.md#vfile-unlink)
+
+The remote platform must be able to launch processes so that debugserver
+can attach to them. This requires the following packets in addition to the
+previous list:
+* [A](./lldbgdbremote.md#a-launch-args-packet)
+* [QEnvironment](./lldbgdbremote.md#qenvironment-name-value)
+* [QEnvironmentHexEncoded](./lldbgdbremote.md#qenvironmenthexencoded-hex-encoding-name-value)
+* [QSetDetatchOnError](./lldbgdbremote.md#qsetdetachonerror)
+* [QSetDisableASLR](./lldbgdbremote.md#qsetdisableaslr-bool)
+* [QSetSTDIN / QSetSTDOUT / QSetSTDERR](./lldbgdbremote.md#qsetstdin-ascii-hex-path-qsetstdout-ascii-hex-path-qsetstderr-ascii-hex-path) (all 3)
+* [qLaunchSuccess](./lldbgdbremote.md#qlaunchsuccess)
diff --git a/lldb/docs/resources/test.rst b/lldb/docs/resources/test.rst
index 094fde8b1b5a..382e42bf22b1 100644
--- a/lldb/docs/resources/test.rst
+++ b/lldb/docs/resources/test.rst
@@ -618,9 +618,9 @@ On non-Windows platforms, you can use the ``-d`` option to ``dotest.py`` which
 will cause the script to print out the pid of the test and wait for a while
 until a debugger is attached. Then run ``lldb -p <pid>`` to attach.
 
-To instead debug a test's python source, edit the test and insert
-``import pdb; pdb.set_trace()`` at the point you want to start debugging. In
-addition to pdb's debugging facilities, lldb commands can be executed with the
+To instead debug a test's python source, edit the test and insert ``import pdb; pdb.set_trace()`` or ``breakpoint()`` (Python 3 only) at the point you want to start debugging. The ``breakpoint()`` command can be used for any LLDB Python script, not just for API tests.
+
+In addition to pdb's debugging facilities, lldb commands can be executed with the
 help of a pdb alias. For example ``lldb bt`` and ``lldb v some_var``. Add this
 line to your ``~/.pdbrc``:
 
diff --git a/lldb/docs/use/tutorial.rst b/lldb/docs/use/tutorial.rst
index ff956d750c29..c7f89976156c 100644
--- a/lldb/docs/use/tutorial.rst
+++ b/lldb/docs/use/tutorial.rst
@@ -1,89 +1,94 @@
 Tutorial
 ========
 
-Here's a short precis of how to run lldb if you are familiar with the gdb
-command set. We will start with some details on lldb command structure and
-syntax to help orient you.
+This document describes how to use lldb if you are already familiar with
+gdb's command set. We will start with some details on lldb command structure and
+syntax.
 
 Command Structure
 -----------------
 
-Unlike gdb's command set, which is rather free-form, we tried to make the lldb command syntax fairly structured. The commands are all of the form:
+Unlike gdb's quite free-form commands, lldb's are more structured. All commands
+are of the form:
 
 ::
 
    <noun> <verb> [-options [option-value]] [argument [argument...]]
 
-The command line parsing is done before command execution, so it is uniform
-across all the commands. The command syntax for basic commands is very simple,
-arguments, options and option values are all white-space separated, and
-either single or double-quotes (in pairs) are used to protect white-spaces
-in an argument.  If you need to put a backslash or double-quote character in an
-argument you back-slash it in the argument. That makes the command syntax more
-regular, but it also means you may have to quote some arguments in lldb that
-you wouldn't in gdb.
+The command line parsing is done before command execution, so it is the same for
+all commands. The command syntax for basic commands is very simple.
 
-There is one other special quote character in lldb - the backtick.
+* Arguments, options and option values are all white-space separated.
+* Either single ``'`` or double-quotes ``"`` (in pairs) are used to protect white-spaces
+  in an argument.
+* Escape backslashes and double quotes within arguments should be escaped
+  with a backslash ``\``.
+
+This makes lldb's commands more regular, but it also means you may have to quote
+some arguments in lldb that you would not in gdb.
+
+There is one other special quote character in lldb - the backtick `````.
 If you put backticks around an argument or option value, lldb will run the text
 of the value through the expression parser, and the result of the expression
-will be passed to the command.  So for instance, if "len" is a local
-int variable with the value 5, then the command:
+will be passed to the command.  So for instance, if ``len`` is a local
+``int`` variable with the value ``5``, then the command:
 
 ::
 
    (lldb) memory read -c `len` 0x12345
 
-will receive the value 5 for the count option, rather than the string "len".
-
+Will receive the value ``5`` for the count option, rather than the string ``len``.
 
 Options can be placed anywhere on the command line, but if the arguments begin
-with a "-" then you have to tell lldb that you're done with options for the
-current command by adding an option termination: "--". So for instance, if you
-want to launch a process and give the "process launch" command the
-"--stop-at-entry" option, yet you want the process you are about to launch to
-be launched with the arguments "-program_arg value", you would type:
+with a ``-`` then you have to tell lldb that you are done with options for the
+current command by adding an option termination: ``--``.
+
+So for instance, if you want to launch a process and give the ``process launch``
+command the ``--stop-at-entry`` option, yet you want the process you are about
+to launch to be launched with the arguments ``-program_arg value``, you would type:
 
 ::
 
    (lldb) process launch --stop-at-entry -- -program_arg value
 
 We also tried to reduce the number of special purpose argument parsers, which
-sometimes forces the user to be a little more explicit about stating their
-intentions. The first instance you'll note of this is the breakpoint command.
-In gdb, to set a breakpoint, you might enter
+sometimes forces the user to be explicit about their intentions. The first
+instance you willl see of this is the breakpoint command. In gdb, to set a
+breakpoint, you might enter:
 
 ::
 
    (gdb) break foo.c:12
 
-to break at line 12 of foo.c, and:
+To break at line ``12`` of ``foo.c``, and:
 
 ::
 
    (gdb) break foo
 
-to break at the function foo. As time went on, the parser that tells foo.c:12
-from foo from foo.c::foo (which means the function foo in the file foo.c) got
-more and more complex and bizarre, and especially in C++ there are times where
-there's really no way to specify the function you want to break on. The lldb
-commands are more verbose but also more precise and allow for intelligent auto
-completion.
+To break at the function ``foo``. As time went on, the parser that tells ``foo.c:12``
+from ``foo`` from ``foo.c::foo`` (which means the function ``foo`` in the file ``foo.c``)
+got more and more complex. Especially in C++ there are times where there is
+really no way to specify the function you want to break on.
+
+The lldb commands are more verbose but also more precise and allow for
+intelligent auto completion.
 
-To set the same file and line breakpoint in LLDB you can enter either of:
+To set the same file and line breakpoint in lldb you can enter either of:
 
 ::
 
    (lldb) breakpoint set --file foo.c --line 12
    (lldb) breakpoint set -f foo.c -l 12
 
-To set a breakpoint on a function named foo in LLDB you can enter either of:
+To set a breakpoint on a function named ``foo`` in lldb you can enter either of:
 
 ::
 
    (lldb) breakpoint set --name foo
    (lldb) breakpoint set -n foo
 
-You can use the --name option multiple times to make a breakpoint on a set of
+You can use the ``--name`` option multiple times to make a breakpoint on a set of
 functions as well. This is convenient since it allows you to set common
 conditions or commands without having to specify them multiple times:
 
@@ -91,9 +96,9 @@ conditions or commands without having to specify them multiple times:
 
    (lldb) breakpoint set --name foo --name bar
 
-Setting breakpoints by name is even more specialized in LLDB as you can specify
+Setting breakpoints by name is even more specialized in lldb as you can specify
 that you want to set a breakpoint at a function by method name. To set a
-breakpoint on all C++ methods named foo you can enter either of:
+breakpoint on all C++ methods named ``foo`` you can enter either of:
 
 ::
 
@@ -101,7 +106,7 @@ breakpoint on all C++ methods named foo you can enter either of:
    (lldb) breakpoint set -M foo
 
 
-To set a breakpoint Objective-C selectors named alignLeftEdges: you can enter either of:
+To set a breakpoint Objective-C selectors named ``alignLeftEdges:`` you can enter either of:
 
 ::
 
@@ -109,14 +114,14 @@ To set a breakpoint Objective-C selectors named alignLeftEdges: you can enter ei
    (lldb) breakpoint set -S alignLeftEdges:
 
 You can limit any breakpoints to a specific executable image by using the
-"--shlib <path>" ("-s <path>" for short):
+``--shlib <path>`` (``-s <path>`` for short):
 
 ::
 
    (lldb) breakpoint set --shlib foo.dylib --name foo
    (lldb) breakpoint set -s foo.dylib -n foo
 
-The --shlib option can also be repeated to specify several shared libraries.
+The ``--shlib`` option can also be repeated to specify several shared libraries.
 
 Suggestions on more interesting primitives of this sort are also very welcome.
 
@@ -130,16 +135,16 @@ command:
    (lldb) br s -n "-[SKTGraphicView alignLeftEdges:]"
 
 lldb also supports command completion for source file names, symbol names, file
-names, etc. Completion is initiated by a hitting a TAB. Individual options in a
-command can have different completers, so for instance, the "--file <path>"
-option in "breakpoint" completes to source files, the "--shlib <path>" option
-to currently loaded shared libraries, etc. We can even do things like if you
-specify "--shlib <path>", and are completing on "--file <path>", we will only
-list source files in the shared library specified by "--shlib <path>".
-
-The individual commands are pretty extensively documented. You can use the help
+names, etc. Completion is initiated by hitting TAB. Individual options in a
+command can have different completers, so for instance, the ``--file <path>``
+option in ``breakpoint`` completes to source files, the ``--shlib <path>`` option
+to currently loaded shared libraries, etc. You can even do things like if you
+specify ``--shlib <path>``, and are completing on ``--file <path>``, lldb will only
+list source files in the shared library specified by ``--shlib <path>``.
+
+The individual commands are pretty extensively documented. You can use the ``help``
 command to get an overview of which commands are available or to obtain details
-about specific commands. There is also an apropos command that will search the
+about specific commands. There is also an ``apropos`` command that will search the
 help text for all commands for a particular word and dump a summary help string
 for each matching command.
 
@@ -150,78 +155,75 @@ For instance, if you get annoyed typing:
 
    (lldb) breakpoint set --file foo.c --line 12
 
-you can do:
+You can do:
 
 ::
 
    (lldb) command alias bfl breakpoint set -f %1 -l %2
    (lldb) bfl foo.c 12
 
-We have added a few aliases for commonly used commands (e.g. "step", "next" and
-"continue") but we haven't tried to be exhaustive because in our experience it
+lldb has a few aliases for commonly used commands (e.g. ``step``, ``next`` and
+``continue``) but it does not try to be exhaustive because in our experience it
 is more convenient to make the basic commands unique down to a letter or two,
 and then learn these sequences than to fill the namespace with lots of aliases,
 and then have to type them all the way out.
 
 However, users are free to customize lldb's command set however they like, and
-since lldb reads the file ~/.lldbinit at startup, you can store all your
+since lldb reads the file ``~/.lldbinit`` at startup, you can store all your
 aliases there and they will be generally available to you. Your aliases are
-also documented in the help command so you can remind yourself of what you've
+also documented in the ``help`` command so you can remind yourself of what you have
 set up.
 
-One alias of note that we do include by popular demand is a weak emulator of
-gdb's "break" command. It doesn't try to do everything that gdb's break command
-does (for instance, it doesn't handle foo.c::bar. But it mostly works, and
-makes the transition easier. Also, by popular demand, it is aliased to b. If you
+One alias of note that lldb does include by popular demand is a weak emulator of
+gdb's ``break`` command. It does not try to do everything that gdb's break command
+does (for instance, it does not handle ``foo.c::bar``). But it mostly works, and
+makes the transition easier. Also, by popular demand, it is aliased to ``b``. If you
 actually want to learn the lldb command set natively, that means it will get in
-the way of the rest of the breakpoint commands. Fortunately, if you don't like
-one of our aliases, you can easily get rid of it by running (for example):
+the way of the rest of the breakpoint commands. Fortunately, if you do not like
+one of our aliases, you can easily get rid of it by running, for example:
 
 ::
 
    (lldb) command unalias b
 
-I actually also do:
+You can also do:
 
 ::
 
    (lldb) command alias b breakpoint
 
-so I can run the native lldb breakpoint command with just b
+So you can run the native lldb breakpoint command with just ``b``.
 
 The lldb command parser also supports "raw" commands, where, after command
 options are stripped off, the rest of the command string is passed
 uninterpreted to the command. This is convenient for commands whose arguments
 might be some complex expression that would be painful to backslash protect.
-For instance, the "expression" command is a "raw" command for obvious reasons.
-The "help" output for a command will tell you if it is "raw" or not, so you
+For instance, the ``expression`` command is a "raw" command for obvious reasons.
+The ``help`` output for a command will tell you if it is "raw" or not, so you
 know what to expect. The one thing you have to watch out for is that since raw
 commands still can have options, if your command string has dashes in it,
-you'll have to indicate these are not option markers by putting "--" after the
+you will have to indicate these are not option markers by putting ``--`` after the
 command name, but before your command string.
 
 lldb also has a built-in Python interpreter, which is accessible by the
-"script" command. All the functionality of the debugger is available as classes
+``"script`` command. All the functionality of the debugger is available as classes
 in the Python interpreter, so the more complex commands that in gdb you would
-introduce with the "define" command can be done by writing Python functions
+introduce with the ``define`` command can be done by writing Python functions
 using the lldb-Python library, then loading the scripts into your running
-session and accessing them with the "script" command.
+session and accessing them with the ``script`` command.
 
-Having given an overview of lldb's command syntax, we proceed to lay out the
-stages of a standard debug session.
-
-
-Loading a Program into lldb
+Loading a Program Into lldb
 ---------------------------
 
-First we need to set the program to debug. As with gdb, you can start lldb and specify the file you wish to debug on the command line:
+First you need to set the program to debug. As with gdb, you can start lldb and
+specify the file you wish to debug on the command line:
 
 ::
 
    $ lldb /Projects/Sketch/build/Debug/Sketch.app
    Current executable set to '/Projects/Sketch/build/Debug/Sketch.app' (x86_64).
 
-or you can specify it after the fact with the "file" command:
+Or you can specify it after the fact with the ``file`` command:
 
 ::
 
@@ -232,15 +234,15 @@ or you can specify it after the fact with the "file" command:
 Setting Breakpoints
 -------------------
 
-We've discussed how to set breakpoints above. You can use help breakpoint set
-to see all the options for breakpoint setting. For instance, we might do:
+We have discussed how to set breakpoints above. You can use ``help breakpoint set``
+to see all the options for breakpoint setting. For instance, you could do:
 
 ::
 
    (lldb) breakpoint set --selector alignLeftEdges:
    Breakpoint created: 1: name = 'alignLeftEdges:', locations = 1, resolved = 1
 
-You can find out about the breakpoints you've set with:
+You can find out about the breakpoints you have set with:
 
 ::
 
@@ -257,12 +259,12 @@ your program. Similarly, a file and line breakpoint might result in multiple
 locations if that file and line were inlined in different places in your code.
 
 The logical breakpoint has an integer id, and its locations have an id within
-their parent breakpoint (the two are joined by a ".", e.g. 1.1 in the example
+their parent breakpoint (the two are joined by a ``.``, e.g. ``1.1`` in the example
 above).
 
-Also, the logical breakpoints remain live so that if another shared library were
-to be loaded that had another implementation of the "alignLeftEdges:" selector,
-the new location would be added to breakpoint 1 (e.g. a "1.2" breakpoint would
+Also logical breakpoints remain live so that if another shared library were
+to be loaded that had another implementation of the ``alignLeftEdges:`` selector,
+the new location would be added to breakpoint ``1`` (e.g. a ``1.2`` breakpoint would
 be set on the newly loaded selector).
 
 The other piece of information in the breakpoint listing is whether the
@@ -271,21 +273,19 @@ address it corresponds to gets loaded into the program you are debugging. For
 instance if you set a breakpoint in a shared library that then gets unloaded,
 that breakpoint location will remain, but it will no longer be resolved.
 
-
 One other thing to note for gdb users is that lldb acts like gdb with:
 
 ::
 
    (gdb) set breakpoint pending on
 
-That is, lldb will always make a breakpoint from your specification, even if it
-couldn't find any locations that match the specification. You can tell whether
+Which means that lldb will always make a breakpoint from your specification, even if it
+could not find any locations that match the specification. You can tell whether
 the expression was resolved or not by checking the locations field in
-"breakpoint list", and we report the breakpoint as "pending" when you set it so
-you can tell you've made a typo more easily, if that was indeed the reason no
+``breakpoint list``, and lldb reports the breakpoint as ``pending`` when you set it so
+you can tell you have made a typo more easily, if that was indeed the reason no
 locations were found:
 
-
 ::
 
    (lldb) breakpoint set --file foo.c --line 12
@@ -294,8 +294,8 @@ locations were found:
 
 You can delete, disable, set conditions and ignore counts either on all the
 locations generated by your logical breakpoint, or on any one of the particular
-locations your specification resolved to. For instance, if we wanted to add a
-command to print a backtrace when we hit this breakpoint we could do:
+locations your specification resolved to. For instance, if you wanted to add a
+command to print a backtrace when you hit this breakpoint you could do:
 
 ::
 
@@ -305,12 +305,12 @@ command to print a backtrace when we hit this breakpoint we could do:
    > DONE
 
 By default, the breakpoint command add command takes lldb command line
-commands. You can also specify this explicitly by passing the "--command"
-option. Use "--script" if you want to implement your breakpoint command using
+commands. You can also specify this explicitly by passing the ``--command``
+option. Use ``--script`` if you want to implement your breakpoint command using
 the Python script instead.
 
 This is a convenient point to bring up another feature of the lldb command
-help. Do:
+``help``. Do:
 
 ::
 
@@ -320,8 +320,8 @@ help. Do:
    Syntax: breakpoint command add <cmd-options> <breakpt-id>
    etc...
 
-When you see arguments to commands specified in the Syntax in angle brackets
-like <breakpt-id>, that indicates that that is some common argument type that
+When you see arguments to commands specified in the ``Syntax`` section in angle brackets
+like ``<breakpt-id>``, that indicates that that is some common argument type that
 you can get further help on from the command system. So in this case you could
 do:
 
@@ -330,35 +330,45 @@ do:
    (lldb) help <breakpt-id> <breakpt-id> -- Breakpoint ID's consist major and
    minor numbers; the major etc...
 
-
 Breakpoint Names
 ----------------
 
-Breakpoints carry two orthogonal sets of information: one specifies where to set the breakpoint, and the other how to react when the breakpoint is hit. The latter set of information (e.g. commands, conditions, hit-count, auto-continue...) we call breakpoint options.
+Breakpoints carry two orthogonal sets of information: one specifies where to set
+the breakpoint, and the other how to react when the breakpoint is hit. The latter
+set of information (e.g. commands, conditions, hit-count, auto-continue...) we
+call breakpoint options.
 
-It is fairly common to want to apply one set of options to a number of breakpoints. For instance, you might want to check that self == nil and if it is, print a backtrace and continue, on a number of methods. One convenient way to do that would be to make all the breakpoints, then configure the options with:
+It is fairly common to want to apply one set of options to a number of breakpoints.
+For instance, you might want to check that ``self == nil`` and if it is, print a
+backtrace and continue, on a number of methods. One convenient way to do that would
+be to make all the breakpoints, then configure the options with:
 
 ::
 
    (lldb) breakpoint modify -c "self == nil" -C bt --auto-continue 1 2 3
 
-That's not too bad, but you have to repeat this for every new breakpoint you make, and if you wanted to change the options, you have to remember all the ones you are using this way.
+That is not too bad, but you have to repeat this for every new breakpoint you make,
+and if you wanted to change the options, you have to remember all the ones you are
+using this way.
 
-Breakpoint names provide a convenient solution to this problem. The simple solution would be to use the name to gather the breakpoints you want to affect this way into a group. So when you make the breakpoint you would do:
+Breakpoint names provide a convenient solution to this problem. The simple solution
+would be to use the name to gather the breakpoints you want to affect this way into
+a group. So when you make the breakpoint you would do:
 
 ::
 
    (lldb) breakpoint set -N SelfNil
 
-Then when you've made all your breakpoints, you can set up or modify the options using the name to collect all the relevant breakpoints.
+Then when you have made all your breakpoints, you can set up or modify the options
+using the name to collect all the relevant breakpoints.
 
 ::
 
    (lldb) breakpoint modify -c "self == nil" -C bt --auto-continue SelfNil
 
 That is better, but suffers from the problem that when new breakpoints get
-added, they don't pick up these modifications, and the options only exist in
-the context of actual breakpoints, so they are hard to store & reuse.
+added, they do not pick up these modifications, and the options only exist in
+the context of actual breakpoints, so they are hard to store and reuse.
 
 An even better solution is to make a fully configured breakpoint name:
 
@@ -372,7 +382,7 @@ you change the options configured on the name, all the breakpoints pick up
 those changes. This makes it easy to use configured names to experiment with
 your options.
 
-You can make breakpoint names in your .lldbinit file, so you can use them to
+You can make breakpoint names in your ``.lldbinit`` file, so you can use them to
 can behaviors that you have found useful and reapply them in future sessions.
 
 You can also make a breakpoint name from the options set on a breakpoint:
@@ -387,9 +397,9 @@ Setting Watchpoints
 -------------------
 
 In addition to breakpoints, you can use help watchpoint to see all the commands
-for watchpoint manipulations. For instance, we might do the following to watch
-a variable called 'global' for write operation, but only stop if the condition
-'(global==5)' is true:
+for watchpoint manipulations. For instance, you might do the following to watch
+a variable called ``global`` for write operation, but only stop if the condition
+``(global==5)`` is true:
 
 ::
 
@@ -437,7 +447,8 @@ a variable called 'global' for write operation, but only stop if the condition
 Starting or Attaching to Your Program
 -------------------------------------
 
-To launch a program in lldb we use the "process launch" command or one of its built in aliases:
+To launch a program in lldb you will use the ``process launch`` command or one of
+its built in aliases:
 
 ::
 
@@ -446,7 +457,7 @@ To launch a program in lldb we use the "process launch" command or one of its bu
    (lldb) r
 
 You can also attach to a process by process ID or process name. When attaching
-to a process by name, lldb also supports the "--waitfor" option which waits for
+to a process by name, lldb also supports the ``--waitfor`` option which waits for
 the next process that has that name to show up, and attaches to it
 
 ::
@@ -465,7 +476,7 @@ After you launch or attach to a process, your process might stop somewhere:
    1 of 3 threads stopped with reasons:
    * thread #1: tid = 0x2c03, 0x00007fff85cac76a, where = libSystem.B.dylib`__getdirentries64 + 10, stop reason = signal = SIGSTOP, queue = com.apple.main-thread
 
-Note the line that says "1 of 3 threads stopped with reasons:" and the lines
+Note the line that says ``1 of 3 threads stopped with reasons:`` and the lines
 that follow it. In a multi-threaded environment it is very common for more than
 one thread to hit your breakpoint(s) before the kernel actually returns control
 to the debugger. In that case, you will see all the threads that stopped for
@@ -474,7 +485,8 @@ some interesting reason listed in the stop message.
 Controlling Your Program
 ------------------------
 
-After launching, we can continue until we hit our breakpoint. The primitive commands for process control all exist under the "thread" command:
+After launching, you can continue until you hit your breakpoint. The primitive commands
+for process control all exist under the "thread" command:
 
 ::
 
@@ -483,9 +495,14 @@ After launching, we can continue until we hit our breakpoint. The primitive comm
    Resuming process 46915
    (lldb)
 
-At present you can only operate on one thread at a time, but the design will ultimately support saying "step over the function in Thread 1, and step into the function in Thread 2, and continue Thread 3" etc. When we eventually support keeping some threads running while others are stopped this will be particularly important. For convenience, however, all the stepping commands have easy aliases. So "thread continue" is just "c", etc.
+At present you can only operate on one thread at a time, but the design will
+ultimately support saying "step over the function in Thread 1, and step into the
+function in Thread 2, and continue Thread 3" etc. When lldb eventually supports
+keeping some threads running while others are stopped this will be particularly
+important. For convenience, however, all the stepping commands have easy aliases.
+So ``thread continue`` is just ``c``, etc.
 
-The other program stepping commands are pretty much the same as in gdb. You've got:
+The other program stepping commands are pretty much the same as in gdb. You have got:
 
 ::
 
@@ -493,9 +510,11 @@ The other program stepping commands are pretty much the same as in gdb. You've g
    (lldb) thread step-over  // The same as gdb's "next" or "n"
    (lldb) thread step-out   // The same as gdb's "finish" or "f"
 
-By default, lldb does defined aliases to all common gdb process control commands ("s", "step", "n", "next", "finish"). If we have missed any, please add them to your ~/.lldbinit file using the "command alias" command.
+By default, lldb does defined aliases to all common gdb process control commands
+(``s``, ``step``, ``n``, ``next``, ``finish``). If lldb is missing any, please add
+them to your ``~/.lldbinit`` file using the ``command alias`` command.
 
-lldb also supported the step by instruction versions:
+lldb also supports the step by instruction versions:
 
 ::
 
@@ -509,20 +528,20 @@ Finally, lldb has a run until line or frame exit stepping mode:
 
    (lldb) thread until 100
 
-This command will run the thread in the current frame till it reaches line 100
+This command will run the thread in the current frame until it reaches line 100
 in this frame or stops if it leaves the current frame. This is a pretty close
-equivalent to gdb's "until" command.
+equivalent to gdb's ``until`` command.
 
 A process, by default, will share the lldb terminal with the inferior process.
 When in this mode, much like when debugging with gdb, when the process is
-running anything you type will go to the STDIN of the inferior process. To
-interrupt your inferior program, type CTRL+C.
+running anything you type will go to the ``STDIN`` of the inferior process. To
+interrupt your inferior program, type ``CTRL+C``.
 
-If you attach to a process, or launch a process with the "--no-stdin" option,
+If you attach to a process, or launch a process with the ``--no-stdin`` option,
 the command interpreter is always available to enter commands. It might be a
-little disconcerting to gdb users to always have an (lldb) prompt. This allows
-you to set a breakpoint, etc without having to explicitly interrupt the
-program you are debugging:
+little disconcerting to gdb users to always have an ``(lldb)`` prompt. This allows
+you to set a breakpoint, or use any other command without having to explicitly
+interrupt the program you are debugging:
 
 ::
 
@@ -530,43 +549,40 @@ program you are debugging:
    (lldb) breakpoint set --name stop_here
 
 There are many commands that won't work while running, and the command
-interpreter should do a good job of letting you know when this is the case. If
-you find any instances where the command interpreter isn't doing its job,
-please file a bug. This way of operation will set us up for a future debugging
+interpreter will let you know when this is the case. Please file an issue if
+it does not. This way of operation will set us up for a future debugging
 mode called thread centric debugging. This mode will allow us to run all
 threads and only stop the threads that are at breakpoints or have exceptions or
 signals.
 
 The commands that currently work while running include interrupting the process
-to halt execution ("process interrupt"), getting the process status ("process
-status"), breakpoint setting and clearing (" breakpoint
-[set|clear|enable|disable|list] ..."), and memory reading and writing (" memory
-[read|write] ...").
+to halt execution (``process interrupt``), getting the process status (``process status``),
+breakpoint setting and clearing (``breakpoint [set|clear|enable|disable|list] ...``),
+and memory reading and writing (``memory [read|write] ...``).
 
 The question of disabling stdio when running brings up a good opportunity to
-show how to set debugger properties in general. If you always want to run in
-the --no-stdin mode, you can set this as a generic process property using the
-lldb "settings" command, which is equivalent to gdb's "set" command. For
-instance, in this case you would say:
+show how to set debugger properties. If you always want to run in
+the ``--no-stdin`` mode, you can set this as a generic process property using the
+lldb ``settings`` command, which is equivalent to gdb's ``set`` command.
+In this case you would say:
 
 ::
 
    (lldb) settings set target.process.disable-stdio true
 
-Over time, gdb's "set command became a wilderness of disordered options, so
-that there were useful options that even experienced gdb users didn't know
-about because they were too hard to find. We tried to organize the settings
+Over time, gdb's ``set`` command became a wilderness of disordered options, so
+that there were useful options that even experienced gdb users did not know
+about because they were too hard to find. lldb instead organizes the settings
 hierarchically using the structure of the basic entities in the debugger. For
 the most part anywhere you can specify a setting on a generic entity (threads,
-for example) you can also apply the option to a particular instance, which can
-also be convenient at times. You can view the available settings with "settings
-list" and there is help on the settings command explaining how it works more
-generally.
+for example) you can also apply the option to a particular instance. You can
+view the available settings with the command ``settings list`` and there is help
+on the settings command explaining how it works more generally.
 
 Examining Thread State
 ----------------------
 
-Once you've stopped, lldb will choose a current thread, usually the one that
+Once you have stopped, lldb will choose a current thread, usually the one that
 stopped "for a reason", and a current frame in that thread (on stop this is
 always the bottom-most frame). Many the commands for inspecting state work on
 this current thread/frame.
@@ -600,27 +616,27 @@ that thread, do:
    frame #9: 0x0000000100015ae3, where = Sketch`main + 33 at /Projects/Sketch/SKTMain.m:11
    frame #10: 0x0000000100000f20, where = Sketch`start + 52
 
-You can also provide a list of threads to backtrace, or the keyword "all" to see all threads:
+You can also provide a list of threads to backtrace, or the keyword ``all`` to see all threads:
 
 ::
 
    (lldb) thread backtrace all
 
 You can select the current thread, which will be used by default in all the
-commands in the next section, with the "thread select" command:
+commands in the next section, with the ``thread select`` command:
 
 ::
 
    (lldb) thread select 2
 
-where the thread index is just the one shown in the "thread list" listing.
+where the thread index is just the one shown in the ``thread list`` listing.
 
 
 Examining Stack Frame State
 ---------------------------
 
 The most convenient way to inspect a frame's arguments and local variables is
-to use the "frame variable" command:
+to use the ``frame variable`` command:
 
 ::
 
@@ -632,16 +648,16 @@ to use the "frame variable" command:
    i = (NSUInteger) 0x00000001001264e0
    c = (NSUInteger) 0x00000001001253b0
 
-As you see above, if you don't specify any variable names, all arguments and
-locals will be shown. If you call "frame variable" passing in the names of a
-particular local(s), only those variables will be printed. For instance:
+As you see above, if you do not specify any variable names, all arguments and
+locals will be shown. If you call ``frame variable`` passing in the names of
+particular local variables, only those variables will be printed. For instance:
 
 ::
 
    (lldb) frame variable self
    (SKTGraphicView *) self = 0x0000000100208b40
 
-You can also pass in a path to some subelement of one of the available locals,
+You can also pass in a path to some sub-element of one of the available locals,
 and that sub-element will be printed. For instance:
 
 ::
@@ -649,7 +665,7 @@ and that sub-element will be printed. For instance:
    (lldb) frame variable self.isa
    (struct objc_class *) self.isa = 0x0000000100023730
 
-The "frame variable" command is not a full expression parser but it does
+The ``frame variable`` command is not a full expression parser but it does
 support a few simple operations like ``&``, ``*``, ``->``, ``[]`` (no
 overloaded operators). The array brackets can be used on pointers to treat
 pointers as arrays:
@@ -669,8 +685,8 @@ pointers as arrays:
    (char const *) argv[0] = 0x00007fff5fbffaf8 "/Projects/Sketch/build/Debug/Sketch.app/Contents/MacOS/Sketch"
 
 The frame variable command will also perform "object printing" operations on
-variables (currently we only support ObjC printing, using the object's
-"description" method. Turn this on by passing the -o flag to frame variable:
+variables (currently lldb only supports ObjC printing, using the object's
+``description`` method. Turn this on by passing the ``-o`` flag to frame variable:
 
 ::
 
@@ -680,4 +696,5 @@ variables (currently we only support ObjC printing, using the object's
    (lldb) frame select 9
    frame #9: 0x0000000100015ae3, where = Sketch`function1 + 33 at /Projects/Sketch/SKTFunctions.m:11
 
-You can also move up and down the stack by passing the "--relative" ("-r") option. And we have built-in aliases "u" and "d" which behave like their gdb equivalents.
+You can also move up and down the stack by passing the ``--relative`` (``-r``) option.
+We also have built-in aliases ``u`` and ``d`` which behave like their gdb equivalents.
diff --git a/lldb/include/lldb/API/LLDB.h b/lldb/include/lldb/API/LLDB.h
index c83eb92fcfb3..b256544326a2 100644
--- a/lldb/include/lldb/API/LLDB.h
+++ b/lldb/include/lldb/API/LLDB.h
@@ -40,6 +40,7 @@
 #include "lldb/API/SBInstruction.h"
 #include "lldb/API/SBInstructionList.h"
 #include "lldb/API/SBLanguageRuntime.h"
+#include "lldb/API/SBLanguages.h"
 #include "lldb/API/SBLaunchInfo.h"
 #include "lldb/API/SBLineEntry.h"
 #include "lldb/API/SBListener.h"
diff --git a/lldb/include/lldb/API/SBDebugger.h b/lldb/include/lldb/API/SBDebugger.h
index cf5409a12a05..7333cd57ad31 100644
--- a/lldb/include/lldb/API/SBDebugger.h
+++ b/lldb/include/lldb/API/SBDebugger.h
@@ -42,6 +42,13 @@ public:
 
 class LLDB_API SBDebugger {
 public:
+  FLAGS_ANONYMOUS_ENUM(){
+      eBroadcastBitProgress = lldb::DebuggerBroadcastBit::eBroadcastBitProgress,
+      eBroadcastBitWarning = lldb::DebuggerBroadcastBit::eBroadcastBitWarning,
+      eBroadcastBitError = lldb::DebuggerBroadcastBit::eBroadcastBitError,
+      eBroadcastBitProgressCategory =
+          lldb::DebuggerBroadcastBit::eBroadcastBitProgressCategory,
+  };
   SBDebugger();
 
   SBDebugger(const lldb::SBDebugger &rhs);
diff --git a/lldb/include/lldb/API/SBExpressionOptions.h b/lldb/include/lldb/API/SBExpressionOptions.h
index e0ddfda5ba37..19c416d0f3bc 100644
--- a/lldb/include/lldb/API/SBExpressionOptions.h
+++ b/lldb/include/lldb/API/SBExpressionOptions.h
@@ -10,6 +10,7 @@
 #define LLDB_API_SBEXPRESSIONOPTIONS_H
 
 #include "lldb/API/SBDefines.h"
+#include "lldb/API/SBLanguages.h"
 
 #include <vector>
 
@@ -67,6 +68,10 @@ public:
   void SetTrapExceptions(bool trap_exceptions = true);
 
   void SetLanguage(lldb::LanguageType language);
+  /// Set the language using a pair of language code and version as
+  /// defined by the DWARF 6 specification.
+  /// WARNING: These codes may change until DWARF 6 is finalized.
+  void SetLanguage(SBSourceLanguageName name, uint32_t version);
 
 #ifndef SWIG
   void SetCancelCallback(lldb::ExpressionCancelCallback callback, void *baton);
diff --git a/lldb/include/lldb/API/SBLineEntry.h b/lldb/include/lldb/API/SBLineEntry.h
index 7c2431ba3c8a..d70c4fac6ec7 100644
--- a/lldb/include/lldb/API/SBLineEntry.h
+++ b/lldb/include/lldb/API/SBLineEntry.h
@@ -29,6 +29,9 @@ public:
 
   lldb::SBAddress GetEndAddress() const;
 
+  lldb::SBAddress
+  GetSameLineContiguousAddressRangeEnd(bool include_inlined_functions) const;
+
   explicit operator bool() const;
 
   bool IsValid() const;
diff --git a/lldb/include/lldb/API/SBProcess.h b/lldb/include/lldb/API/SBProcess.h
index 7da3335a7234..f1b5d1fb92ce 100644
--- a/lldb/include/lldb/API/SBProcess.h
+++ b/lldb/include/lldb/API/SBProcess.h
@@ -562,6 +562,8 @@ public:
 
   lldb::SBScriptObject GetScriptedImplementation();
 
+  void GetStatus(SBStream &status);
+
 protected:
   friend class SBAddress;
   friend class SBBreakpoint;
diff --git a/lldb/include/lldb/API/SBSymbolContextList.h b/lldb/include/lldb/API/SBSymbolContextList.h
index 4026afc21357..95100d219df2 100644
--- a/lldb/include/lldb/API/SBSymbolContextList.h
+++ b/lldb/include/lldb/API/SBSymbolContextList.h
@@ -44,6 +44,7 @@ public:
 protected:
   friend class SBModule;
   friend class SBTarget;
+  friend class SBCompileUnit;
 
   lldb_private::SymbolContextList *operator->() const;
 
diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h
index 3644ac056da3..feeaa1cb7113 100644
--- a/lldb/include/lldb/API/SBTarget.h
+++ b/lldb/include/lldb/API/SBTarget.h
@@ -879,6 +879,10 @@ public:
                                            uint32_t count,
                                            const char *flavor_string);
 
+  lldb::SBInstructionList ReadInstructions(lldb::SBAddress start_addr,
+                                           lldb::SBAddress end_addr,
+                                           const char *flavor_string);
+
   lldb::SBInstructionList GetInstructions(lldb::SBAddress base_addr,
                                           const void *buf, size_t size);
 
@@ -954,6 +958,7 @@ protected:
   friend class SBSection;
   friend class SBSourceManager;
   friend class SBSymbol;
+  friend class SBTypeStaticField;
   friend class SBValue;
   friend class SBVariablesOptions;
 
diff --git a/lldb/include/lldb/API/SBType.h b/lldb/include/lldb/API/SBType.h
index 9980fe121830..5b9ff2170b2b 100644
--- a/lldb/include/lldb/API/SBType.h
+++ b/lldb/include/lldb/API/SBType.h
@@ -107,6 +107,35 @@ protected:
   lldb::TypeMemberFunctionImplSP m_opaque_sp;
 };
 
+class LLDB_API SBTypeStaticField {
+public:
+  SBTypeStaticField();
+
+  SBTypeStaticField(const lldb::SBTypeStaticField &rhs);
+  lldb::SBTypeStaticField &operator=(const lldb::SBTypeStaticField &rhs);
+
+  ~SBTypeStaticField();
+
+  explicit operator bool() const;
+
+  bool IsValid() const;
+
+  const char *GetName();
+
+  const char *GetMangledName();
+
+  lldb::SBType GetType();
+
+  lldb::SBValue GetConstantValue(lldb::SBTarget target);
+
+protected:
+  friend class SBType;
+
+  explicit SBTypeStaticField(lldb_private::CompilerDecl decl);
+
+  std::unique_ptr<lldb_private::CompilerDecl> m_opaque_up;
+};
+
 class SBType {
 public:
   SBType();
@@ -182,6 +211,8 @@ public:
 
   lldb::SBTypeMember GetVirtualBaseClassAtIndex(uint32_t idx);
 
+  lldb::SBTypeStaticField GetStaticFieldWithName(const char *name);
+
   lldb::SBTypeEnumMemberList GetEnumMembers();
 
   uint32_t GetNumberOfTemplateArguments();
@@ -242,6 +273,7 @@ protected:
   friend class SBTypeNameSpecifier;
   friend class SBTypeMember;
   friend class SBTypeMemberFunction;
+  friend class SBTypeStaticField;
   friend class SBTypeList;
   friend class SBValue;
   friend class SBWatchpoint;
diff --git a/lldb/include/lldb/API/SBValue.h b/lldb/include/lldb/API/SBValue.h
index bbcccaab51aa..8f4c4fd56dfb 100644
--- a/lldb/include/lldb/API/SBValue.h
+++ b/lldb/include/lldb/API/SBValue.h
@@ -68,6 +68,8 @@ public:
 
   uint64_t GetValueAsUnsigned(uint64_t fail_value = 0);
 
+  lldb::addr_t GetValueAsAddress();
+
   ValueType GetValueType();
 
   // If you call this on a newly created ValueObject, it will always return
@@ -426,6 +428,7 @@ protected:
   friend class SBModule;
   friend class SBTarget;
   friend class SBThread;
+  friend class SBTypeStaticField;
   friend class SBTypeSummary;
   friend class SBValueList;
 
diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h
index 418c2403d020..49ff0737acef 100644
--- a/lldb/include/lldb/Core/Debugger.h
+++ b/lldb/include/lldb/Core/Debugger.h
@@ -89,7 +89,7 @@ public:
 
   using DebuggerList = std::vector<lldb::DebuggerSP>;
 
-  static ConstString GetStaticBroadcasterClass();
+  static llvm::StringRef GetStaticBroadcasterClass();
 
   /// Get the public broadcaster for this debugger.
   Broadcaster &GetBroadcaster() { return m_broadcaster; }
diff --git a/lldb/include/lldb/Core/ThreadedCommunication.h b/lldb/include/lldb/Core/ThreadedCommunication.h
index 7ebb77beb77f..24412b202793 100644
--- a/lldb/include/lldb/Core/ThreadedCommunication.h
+++ b/lldb/include/lldb/Core/ThreadedCommunication.h
@@ -216,9 +216,9 @@ public:
   ///
   void SynchronizeWithReadThread();
 
-  static ConstString &GetStaticBroadcasterClass();
+  static llvm::StringRef GetStaticBroadcasterClass();
 
-  ConstString &GetBroadcasterClass() const override {
+  llvm::StringRef GetBroadcasterClass() const override {
     return GetStaticBroadcasterClass();
   }
 
diff --git a/lldb/include/lldb/Expression/Expression.h b/lldb/include/lldb/Expression/Expression.h
index 3e61d78828bb..356fe4b82ae4 100644
--- a/lldb/include/lldb/Expression/Expression.h
+++ b/lldb/include/lldb/Expression/Expression.h
@@ -47,11 +47,8 @@ public:
   /// expression.  Text() should contain the definition of this function.
   virtual const char *FunctionName() = 0;
 
-  /// Return the language that should be used when parsing.  To use the
-  /// default, return eLanguageTypeUnknown.
-  virtual lldb::LanguageType Language() const {
-    return lldb::eLanguageTypeUnknown;
-  }
+  /// Return the language that should be used when parsing.
+  virtual SourceLanguage Language() const { return {}; }
 
   /// Return the Materializer that the parser should use when registering
   /// external values.
diff --git a/lldb/include/lldb/Expression/LLVMUserExpression.h b/lldb/include/lldb/Expression/LLVMUserExpression.h
index 7d32d17dbf54..40b463933c07 100644
--- a/lldb/include/lldb/Expression/LLVMUserExpression.h
+++ b/lldb/include/lldb/Expression/LLVMUserExpression.h
@@ -52,7 +52,7 @@ public:
   };
 
   LLVMUserExpression(ExecutionContextScope &exe_scope, llvm::StringRef expr,
-                     llvm::StringRef prefix, lldb::LanguageType language,
+                     llvm::StringRef prefix, SourceLanguage language,
                      ResultType desired_type,
                      const EvaluateExpressionOptions &options);
   ~LLVMUserExpression() override;
diff --git a/lldb/include/lldb/Expression/UserExpression.h b/lldb/include/lldb/Expression/UserExpression.h
index b6cfeec7e899..b04d00b72e8f 100644
--- a/lldb/include/lldb/Expression/UserExpression.h
+++ b/lldb/include/lldb/Expression/UserExpression.h
@@ -56,7 +56,7 @@ public:
   ///     If not eResultTypeAny, the type to use for the expression
   ///     result.
   UserExpression(ExecutionContextScope &exe_scope, llvm::StringRef expr,
-                 llvm::StringRef prefix, lldb::LanguageType language,
+                 llvm::StringRef prefix, SourceLanguage language,
                  ResultType desired_type,
                  const EvaluateExpressionOptions &options);
 
@@ -202,7 +202,7 @@ public:
   virtual bool IsParseCacheable() { return true; }
   /// Return the language that should be used when parsing.  To use the
   /// default, return eLanguageTypeUnknown.
-  lldb::LanguageType Language() const override { return m_language; }
+  SourceLanguage Language() const override { return m_language; }
 
   /// Return the desired result type of the function, or eResultTypeAny if
   /// indifferent.
@@ -315,19 +315,22 @@ protected:
                            lldb::ProcessSP &process_sp,
                            lldb::StackFrameSP &frame_sp);
 
-  Address m_address;       ///< The address the process is stopped in.
-  std::string m_expr_text; ///< The text of the expression, as typed by the user
-  std::string m_expr_prefix; ///< The text of the translation-level definitions,
-                             ///as provided by the user
-  std::string m_fixed_text; ///< The text of the expression with fix-its applied
-                            ///- this won't be set if the fixed text doesn't
-                            ///parse.
-  lldb::LanguageType m_language; ///< The language to use when parsing
-                                 ///(eLanguageTypeUnknown means use defaults)
-  ResultType m_desired_type; ///< The type to coerce the expression's result to.
-                             ///If eResultTypeAny, inferred from the expression.
-  EvaluateExpressionOptions
-      m_options; ///< Additional options provided by the user.
+  /// The address the process is stopped in.
+  Address m_address;
+  /// The text of the expression, as typed by the user.
+  std::string m_expr_text;
+  /// The text of the translation-level definitions, as provided by the user.
+  std::string m_expr_prefix;
+  /// The text of the expression with fix-its applied this won't be set if the
+  /// fixed text doesn't parse.
+  std::string m_fixed_text;
+  /// The language to use when parsing (unknown means use defaults).
+  SourceLanguage m_language;
+  /// The type to coerce the expression's result to. If eResultTypeAny, inferred
+  /// from the expression.
+  ResultType m_desired_type;
+  /// Additional options provided by the user.
+  EvaluateExpressionOptions m_options;
 };
 
 } // namespace lldb_private
diff --git a/lldb/include/lldb/Interpreter/CommandInterpreter.h b/lldb/include/lldb/Interpreter/CommandInterpreter.h
index d190bcdcab44..70a55a77465b 100644
--- a/lldb/include/lldb/Interpreter/CommandInterpreter.h
+++ b/lldb/include/lldb/Interpreter/CommandInterpreter.h
@@ -255,9 +255,9 @@ public:
 
   // These two functions fill out the Broadcaster interface:
 
-  static ConstString &GetStaticBroadcasterClass();
+  static llvm::StringRef GetStaticBroadcasterClass();
 
-  ConstString &GetBroadcasterClass() const override {
+  llvm::StringRef GetBroadcasterClass() const override {
     return GetStaticBroadcasterClass();
   }
 
diff --git a/lldb/include/lldb/Symbol/CompilerDecl.h b/lldb/include/lldb/Symbol/CompilerDecl.h
index 825a4f15836f..5c99cae3781c 100644
--- a/lldb/include/lldb/Symbol/CompilerDecl.h
+++ b/lldb/include/lldb/Symbol/CompilerDecl.h
@@ -73,6 +73,9 @@ public:
 
   CompilerDeclContext GetDeclContext() const;
 
+  // If this decl has a type, return it.
+  CompilerType GetType() const;
+
   // If this decl represents a function, return the return type
   CompilerType GetFunctionReturnType() const;
 
@@ -91,6 +94,10 @@ public:
   /// the subsequent entry, so the topmost entry is the global namespace.
   std::vector<lldb_private::CompilerContext> GetCompilerContext() const;
 
+  // If decl represents a constant value, return it. Otherwise, return an
+  // invalid/empty Scalar.
+  Scalar GetConstantValue() const;
+
 private:
   TypeSystem *m_type_system = nullptr;
   void *m_opaque_decl = nullptr;
diff --git a/lldb/include/lldb/Symbol/CompilerType.h b/lldb/include/lldb/Symbol/CompilerType.h
index b71c531f2163..28c723abf279 100644
--- a/lldb/include/lldb/Symbol/CompilerType.h
+++ b/lldb/include/lldb/Symbol/CompilerType.h
@@ -262,6 +262,12 @@ public:
   size_t GetPointerByteSize() const;
   /// \}
 
+  unsigned GetPtrAuthKey() const;
+
+  unsigned GetPtrAuthDiscriminator() const;
+
+  bool GetPtrAuthAddressDiversity() const;
+
   /// Accessors.
   /// \{
 
@@ -369,6 +375,12 @@ public:
 
   /// Create related types using the current type's AST
   CompilerType GetBasicTypeFromAST(lldb::BasicType basic_type) const;
+
+  /// Return a new CompilerType adds a ptrauth modifier from the given 32-bit
+  /// opaque payload to this type if this type is valid and the type system
+  /// supports ptrauth modifiers, else return an invalid type. Note that this
+  /// does not check if this type is a pointer.
+  CompilerType AddPtrAuthModifier(uint32_t payload) const;
   /// \}
 
   /// Exploring the type.
@@ -416,6 +428,8 @@ public:
   CompilerType GetVirtualBaseClassAtIndex(size_t idx,
                                           uint32_t *bit_offset_ptr) const;
 
+  CompilerDecl GetStaticFieldWithName(llvm::StringRef name) const;
+
   uint32_t GetIndexOfFieldWithName(const char *name,
                                    CompilerType *field_compiler_type = nullptr,
                                    uint64_t *bit_offset_ptr = nullptr,
diff --git a/lldb/include/lldb/Symbol/Type.h b/lldb/include/lldb/Symbol/Type.h
index b5eac5fa732d..1c4f7b5601b0 100644
--- a/lldb/include/lldb/Symbol/Type.h
+++ b/lldb/include/lldb/Symbol/Type.h
@@ -401,7 +401,9 @@ public:
     /// This type is the type whose UID is m_encoding_uid as an atomic type.
     eEncodingIsAtomicUID,
     /// This type is the synthetic type whose UID is m_encoding_uid.
-    eEncodingIsSyntheticUID
+    eEncodingIsSyntheticUID,
+    /// This type is a signed pointer.
+    eEncodingIsLLVMPtrAuthUID
   };
 
   enum class ResolveState : unsigned char {
diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h
index 3a927d313b82..7bcb8d69387a 100644
--- a/lldb/include/lldb/Symbol/TypeSystem.h
+++ b/lldb/include/lldb/Symbol/TypeSystem.h
@@ -27,6 +27,8 @@
 #include "lldb/Symbol/CompilerDecl.h"
 #include "lldb/Symbol/CompilerDeclContext.h"
 #include "lldb/Symbol/Type.h"
+#include "lldb/Utility/Scalar.h"
+#include "lldb/lldb-forward.h"
 #include "lldb/lldb-private.h"
 #include "lldb/lldb-types.h"
 
@@ -110,6 +112,8 @@ public:
   virtual std::vector<lldb_private::CompilerContext>
   DeclGetCompilerContext(void *opaque_decl);
 
+  virtual Scalar DeclGetConstantValue(void *opaque_decl) { return Scalar(); }
+
   virtual CompilerType GetTypeForDecl(void *opaque_decl) = 0;
 
   // CompilerDeclContext functions
@@ -217,6 +221,14 @@ public:
 
   virtual uint32_t GetPointerByteSize() = 0;
 
+  virtual unsigned GetPtrAuthKey(lldb::opaque_compiler_type_t type) = 0;
+
+  virtual unsigned
+  GetPtrAuthDiscriminator(lldb::opaque_compiler_type_t type) = 0;
+
+  virtual bool
+  GetPtrAuthAddressDiversity(lldb::opaque_compiler_type_t type) = 0;
+
   // Accessors
 
   virtual ConstString GetTypeName(lldb::opaque_compiler_type_t type,
@@ -281,6 +293,9 @@ public:
 
   virtual CompilerType AddRestrictModifier(lldb::opaque_compiler_type_t type);
 
+  virtual CompilerType AddPtrAuthModifier(lldb::opaque_compiler_type_t type,
+                                          uint32_t payload);
+
   /// \param opaque_payload      The m_payload field of Type, which may
   /// carry TypeSystem-specific extra information.
   virtual CompilerType CreateTypedef(lldb::opaque_compiler_type_t type,
@@ -339,6 +354,11 @@ public:
   GetVirtualBaseClassAtIndex(lldb::opaque_compiler_type_t type, size_t idx,
                              uint32_t *bit_offset_ptr) = 0;
 
+  virtual CompilerDecl GetStaticFieldWithName(lldb::opaque_compiler_type_t type,
+                                              llvm::StringRef name) {
+    return CompilerDecl();
+  }
+
   virtual CompilerType GetChildCompilerTypeAtIndex(
       lldb::opaque_compiler_type_t type, ExecutionContext *exe_ctx, size_t idx,
       bool transparent_pointers, bool omit_empty_base_classes,
@@ -474,12 +494,10 @@ public:
     return IsPointerOrReferenceType(type, nullptr);
   }
 
-  virtual UserExpression *
-  GetUserExpression(llvm::StringRef expr, llvm::StringRef prefix,
-                    lldb::LanguageType language,
-                    Expression::ResultType desired_type,
-                    const EvaluateExpressionOptions &options,
-                    ValueObject *ctx_obj) {
+  virtual UserExpression *GetUserExpression(
+      llvm::StringRef expr, llvm::StringRef prefix, SourceLanguage language,
+      Expression::ResultType desired_type,
+      const EvaluateExpressionOptions &options, ValueObject *ctx_obj) {
     return nullptr;
   }
 
diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h
index 2f3a3c22422e..aac0cf51680a 100644
--- a/lldb/include/lldb/Target/Process.h
+++ b/lldb/include/lldb/Target/Process.h
@@ -381,7 +381,7 @@ public:
 
   // These two functions fill out the Broadcaster interface:
 
-  static ConstString &GetStaticBroadcasterClass();
+  static llvm::StringRef GetStaticBroadcasterClass();
 
   static constexpr llvm::StringRef AttachSynchronousHijackListenerName =
       "lldb.internal.Process.AttachSynchronous.hijack";
@@ -390,7 +390,7 @@ public:
   static constexpr llvm::StringRef ResumeSynchronousHijackListenerName =
       "lldb.internal.Process.ResumeSynchronous.hijack";
 
-  ConstString &GetBroadcasterClass() const override {
+  llvm::StringRef GetBroadcasterClass() const override {
     return GetStaticBroadcasterClass();
   }
 
diff --git a/lldb/include/lldb/Target/StackFrame.h b/lldb/include/lldb/Target/StackFrame.h
index 6c18511c6e1a..52f0a1ee6621 100644
--- a/lldb/include/lldb/Target/StackFrame.h
+++ b/lldb/include/lldb/Target/StackFrame.h
@@ -1,3 +1,4 @@
+
 //===-- StackFrame.h --------------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@@ -446,13 +447,12 @@ public:
   /// Query this frame to determine what the default language should be when
   /// parsing expressions given the execution context.
   ///
-  /// \return
-  ///   The language of the frame if known, else lldb::eLanguageTypeUnknown.
-  lldb::LanguageType GetLanguage();
+  /// \return   The language of the frame if known.
+  SourceLanguage GetLanguage();
 
-  // similar to GetLanguage(), but is allowed to take a potentially incorrect
-  // guess if exact information is not available
-  lldb::LanguageType GuessLanguage();
+  /// Similar to GetLanguage(), but is allowed to take a potentially incorrect
+  /// guess if exact information is not available.
+  SourceLanguage GuessLanguage();
 
   /// Attempt to econstruct the ValueObject for a given raw address touched by
   /// the current instruction.  The ExpressionPath should indicate how to get
diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h
index 2c2e6b2831cc..7ad9f3358605 100644
--- a/lldb/include/lldb/Target/Target.h
+++ b/lldb/include/lldb/Target/Target.h
@@ -200,7 +200,7 @@ public:
 
   bool GetBreakpointsConsultPlatformAvoidList();
 
-  lldb::LanguageType GetLanguage() const;
+  SourceLanguage GetLanguage() const;
 
   llvm::StringRef GetExpressionPrefixContents();
 
@@ -310,9 +310,18 @@ public:
     m_execution_policy = policy;
   }
 
-  lldb::LanguageType GetLanguage() const { return m_language; }
+  SourceLanguage GetLanguage() const { return m_language; }
 
-  void SetLanguage(lldb::LanguageType language) { m_language = language; }
+  void SetLanguage(lldb::LanguageType language_type) {
+    m_language = SourceLanguage(language_type);
+  }
+
+  /// Set the language using a pair of language code and version as
+  /// defined by the DWARF 6 specification.
+  /// WARNING: These codes may change until DWARF 6 is finalized.
+  void SetLanguage(uint16_t name, uint32_t version) {
+    m_language = SourceLanguage(name, version);
+  }
 
   bool DoesCoerceToId() const { return m_coerce_to_id; }
 
@@ -445,7 +454,7 @@ public:
 
 private:
   ExecutionPolicy m_execution_policy = default_execution_policy;
-  lldb::LanguageType m_language = lldb::eLanguageTypeUnknown;
+  SourceLanguage m_language;
   std::string m_prefix;
   bool m_coerce_to_id = false;
   bool m_unwind_on_error = true;
@@ -499,9 +508,9 @@ public:
 
   // These two functions fill out the Broadcaster interface:
 
-  static ConstString &GetStaticBroadcasterClass();
+  static llvm::StringRef GetStaticBroadcasterClass();
 
-  ConstString &GetBroadcasterClass() const override {
+  llvm::StringRef GetBroadcasterClass() const override {
     return GetStaticBroadcasterClass();
   }
 
@@ -1160,7 +1169,7 @@ public:
 
   UserExpression *
   GetUserExpressionForLanguage(llvm::StringRef expr, llvm::StringRef prefix,
-                               lldb::LanguageType language,
+                               SourceLanguage language,
                                Expression::ResultType desired_type,
                                const EvaluateExpressionOptions &options,
                                ValueObject *ctx_obj, Status &error);
diff --git a/lldb/include/lldb/Target/TargetList.h b/lldb/include/lldb/Target/TargetList.h
index a0bc6f1f820b..a0cddc6b2966 100644
--- a/lldb/include/lldb/Target/TargetList.h
+++ b/lldb/include/lldb/Target/TargetList.h
@@ -37,9 +37,9 @@ public:
 
   // These two functions fill out the Broadcaster interface:
 
-  static ConstString &GetStaticBroadcasterClass();
+  static llvm::StringRef GetStaticBroadcasterClass();
 
-  ConstString &GetBroadcasterClass() const override {
+  llvm::StringRef GetBroadcasterClass() const override {
     return GetStaticBroadcasterClass();
   }
 
diff --git a/lldb/include/lldb/Target/Thread.h b/lldb/include/lldb/Target/Thread.h
index 1efef93b17de..c17bddf4d98b 100644
--- a/lldb/include/lldb/Target/Thread.h
+++ b/lldb/include/lldb/Target/Thread.h
@@ -74,9 +74,9 @@ public:
     eBroadcastBitThreadSelected = (1 << 4)
   };
 
-  static ConstString &GetStaticBroadcasterClass();
+  static llvm::StringRef GetStaticBroadcasterClass();
 
-  ConstString &GetBroadcasterClass() const override {
+  llvm::StringRef GetBroadcasterClass() const override {
     return GetStaticBroadcasterClass();
   }
 
diff --git a/lldb/include/lldb/Utility/Broadcaster.h b/lldb/include/lldb/Utility/Broadcaster.h
index f39e677fe9ee..58436ddb9f26 100644
--- a/lldb/include/lldb/Utility/Broadcaster.h
+++ b/lldb/include/lldb/Utility/Broadcaster.h
@@ -39,12 +39,12 @@ namespace lldb_private {
 /// Debugger maintains a list of BroadcastEventSpec's and when it is made
 class BroadcastEventSpec {
 public:
-  BroadcastEventSpec(const ConstString &broadcaster_class, uint32_t event_bits)
+  BroadcastEventSpec(llvm::StringRef broadcaster_class, uint32_t event_bits)
       : m_broadcaster_class(broadcaster_class), m_event_bits(event_bits) {}
 
   ~BroadcastEventSpec() = default;
 
-  ConstString GetBroadcasterClass() const { return m_broadcaster_class; }
+  const std::string &GetBroadcasterClass() const { return m_broadcaster_class; }
 
   uint32_t GetEventBits() const { return m_event_bits; }
 
@@ -67,7 +67,7 @@ public:
   bool operator<(const BroadcastEventSpec &rhs) const;
 
 private:
-  ConstString m_broadcaster_class;
+  std::string m_broadcaster_class;
   uint32_t m_event_bits;
 };
 
@@ -307,7 +307,7 @@ public:
   /// FIXME: Probably should make a ManagedBroadcaster subclass with all the
   /// bits needed to work with the BroadcasterManager, so that it is clearer
   /// how to add one.
-  virtual ConstString &GetBroadcasterClass() const;
+  virtual llvm::StringRef GetBroadcasterClass() const;
 
   lldb::BroadcasterManagerSP GetManager();
 
diff --git a/lldb/include/lldb/Utility/ProcessInfo.h b/lldb/include/lldb/Utility/ProcessInfo.h
index e9fe71e1b851..54ac000dc7fc 100644
--- a/lldb/include/lldb/Utility/ProcessInfo.h
+++ b/lldb/include/lldb/Utility/ProcessInfo.h
@@ -234,7 +234,7 @@ public:
 
   bool CumulativeSystemTimeIsValid() const {
     return m_cumulative_system_time.tv_sec > 0 ||
-           m_cumulative_system_time.tv_sec > 0;
+           m_cumulative_system_time.tv_usec > 0;
   }
 
   void Dump(Stream &s, UserIDResolver &resolver) const;
diff --git a/lldb/include/lldb/lldb-private-types.h b/lldb/include/lldb/lldb-private-types.h
index 7d301666df1a..055eea9f6456 100644
--- a/lldb/include/lldb/lldb-private-types.h
+++ b/lldb/include/lldb/lldb-private-types.h
@@ -96,6 +96,25 @@ struct RegisterSet {
   const uint32_t *registers;
 };
 
+/// A type-erased pair of llvm::dwarf::SourceLanguageName and version.
+struct SourceLanguage {
+  SourceLanguage() = default;
+  SourceLanguage(lldb::LanguageType language_type);
+  SourceLanguage(uint16_t name, uint32_t version)
+      : name(name), version(version) {}
+  SourceLanguage(std::optional<std::pair<uint16_t, uint32_t>> name_vers)
+      : name(name_vers ? name_vers->first : 0),
+        version(name_vers ? name_vers->second : 0) {}
+  operator bool() const { return name > 0; }
+  lldb::LanguageType AsLanguageType() const;
+  llvm::StringRef GetDescription() const;
+  bool IsC() const;
+  bool IsObjC() const;
+  bool IsCPlusPlus() const;
+  uint16_t name = 0;
+  uint32_t version = 0;
+};
+
 struct OptionEnumValueElement {
   int64_t value;
   const char *string_value;
diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py
index 823a982e6743..21ea3530e24f 100644
--- a/lldb/packages/Python/lldbsuite/test/builders/builder.py
+++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py
@@ -148,6 +148,9 @@ class Builder:
             return libcpp_args
         return []
 
+    def getLLDBObjRoot(self):
+        return ["LLDB_OBJ_ROOT={}".format(configuration.lldb_obj_root)]
+
     def _getDebugInfoArgs(self, debug_info):
         if debug_info is None:
             return []
@@ -185,6 +188,7 @@ class Builder:
             self.getSDKRootSpec(),
             self.getModuleCacheSpec(),
             self.getLibCxxArgs(),
+            self.getLLDBObjRoot(),
             self.getCmdLine(dictionary),
         ]
         command = list(itertools.chain(*command_parts))
diff --git a/lldb/packages/Python/lldbsuite/test/configuration.py b/lldb/packages/Python/lldbsuite/test/configuration.py
index 685f491c85fe..dbd4a2d72a15 100644
--- a/lldb/packages/Python/lldbsuite/test/configuration.py
+++ b/lldb/packages/Python/lldbsuite/test/configuration.py
@@ -119,6 +119,7 @@ all_tests = set()
 
 # LLDB library directory.
 lldb_libs_dir = None
+lldb_obj_root = None
 
 libcxx_include_dir = None
 libcxx_include_target_dir = None
diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py
index 8e13aa6a1388..79cc0a2aeacb 100644
--- a/lldb/packages/Python/lldbsuite/test/decorators.py
+++ b/lldb/packages/Python/lldbsuite/test/decorators.py
@@ -206,6 +206,7 @@ def _decorateTest(
     remote=None,
     dwarf_version=None,
     setting=None,
+    asan=None,
 ):
     def fn(actual_debug_info=None):
         skip_for_os = _match_decorator_property(
@@ -256,6 +257,7 @@ def _decorateTest(
             )
         )
         skip_for_setting = (setting is None) or (setting in configuration.settings)
+        skip_for_asan = (asan is None) or is_running_under_asan()
 
         # For the test to be skipped, all specified (e.g. not None) parameters must be True.
         # An unspecified parameter means "any", so those are marked skip by default.  And we skip
@@ -273,6 +275,7 @@ def _decorateTest(
             (remote, skip_for_remote, "platform locality (remote/local)"),
             (dwarf_version, skip_for_dwarf_version, "dwarf version"),
             (setting, skip_for_setting, "setting"),
+            (asan, skip_for_asan, "running under asan"),
         ]
         reasons = []
         final_skip_result = True
@@ -331,6 +334,7 @@ def expectedFailureAll(
     remote=None,
     dwarf_version=None,
     setting=None,
+    asan=None,
 ):
     return _decorateTest(
         DecorateMode.Xfail,
@@ -348,6 +352,7 @@ def expectedFailureAll(
         remote=remote,
         dwarf_version=dwarf_version,
         setting=setting,
+        asan=asan,
     )
 
 
@@ -356,7 +361,7 @@ def expectedFailureAll(
 # for example,
 # @skipIf, skip for all platform/compiler/arch,
 # @skipIf(compiler='gcc'), skip for gcc on all platform/architecture
-# @skipIf(bugnumber, ["linux"], "gcc", ['>=', '4.9'], ['i386']), skip for gcc>=4.9 on linux with i386
+# @skipIf(bugnumber, ["linux"], "gcc", ['>=', '4.9'], ['i386']), skip for gcc>=4.9 on linux with i386 (all conditions must be true)
 def skipIf(
     bugnumber=None,
     oslist=None,
@@ -372,6 +377,7 @@ def skipIf(
     remote=None,
     dwarf_version=None,
     setting=None,
+    asan=None,
 ):
     return _decorateTest(
         DecorateMode.Skip,
@@ -389,6 +395,7 @@ def skipIf(
         remote=remote,
         dwarf_version=dwarf_version,
         setting=setting,
+        asan=asan,
     )
 
 
@@ -1091,7 +1098,7 @@ def skipUnlessFeature(feature):
                 ).decode("utf-8")
                 # If 'feature: 1' was output, then this feature is available and
                 # the test should not be skipped.
-                if re.match("%s: 1\s*" % feature, output):
+                if re.match(r"%s: 1\s*" % feature, output):
                     return None
                 else:
                     return "%s is not supported on this system." % feature
diff --git a/lldb/packages/Python/lldbsuite/test/dotest.py b/lldb/packages/Python/lldbsuite/test/dotest.py
index 2ec4a840b916..ebabf348643e 100644
--- a/lldb/packages/Python/lldbsuite/test/dotest.py
+++ b/lldb/packages/Python/lldbsuite/test/dotest.py
@@ -423,6 +423,7 @@ def parseOptionsAndInitTestdirs():
         configuration.lldb_module_cache_dir = os.path.join(
             configuration.test_build_dir, "module-cache-lldb"
         )
+
     if args.clang_module_cache_dir:
         configuration.clang_module_cache_dir = args.clang_module_cache_dir
     else:
@@ -432,6 +433,8 @@ def parseOptionsAndInitTestdirs():
 
     if args.lldb_libs_dir:
         configuration.lldb_libs_dir = args.lldb_libs_dir
+    if args.lldb_obj_root:
+        configuration.lldb_obj_root = args.lldb_obj_root
 
     if args.enabled_plugins:
         configuration.enabled_plugins = args.enabled_plugins
diff --git a/lldb/packages/Python/lldbsuite/test/dotest_args.py b/lldb/packages/Python/lldbsuite/test/dotest_args.py
index e4de786ec054..8b00c7a4d56e 100644
--- a/lldb/packages/Python/lldbsuite/test/dotest_args.py
+++ b/lldb/packages/Python/lldbsuite/test/dotest_args.py
@@ -237,10 +237,16 @@ def create_parser():
         help="The clang module cache directory used in the Make files by Clang while building tests. Defaults to <test build directory>/module-cache-clang.",
     )
     group.add_argument(
+        "--lldb-obj-root",
+        dest="lldb_obj_root",
+        metavar="path",
+        help="The path to the LLDB object files.",
+    )
+    group.add_argument(
         "--lldb-libs-dir",
         dest="lldb_libs_dir",
         metavar="path",
-        help="The path to LLDB library directory (containing liblldb)",
+        help="The path to LLDB library directory (containing liblldb).",
     )
     group.add_argument(
         "--enable-plugin",
diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index 7a7afec73457..5fd686c143e9 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -1473,11 +1473,12 @@ class Base(unittest.TestCase):
             d = {
                 "CXX_SOURCES": sources,
                 "EXE": exe_name,
-                "CFLAGS_EXTRAS": "%s %s -I%s"
+                "CFLAGS_EXTRAS": "%s %s -I%s -I%s"
                 % (
                     stdflag,
                     stdlibflag,
                     os.path.join(os.environ["LLDB_SRC"], "include"),
+                    os.path.join(configuration.lldb_obj_root, "include"),
                 ),
                 "LD_EXTRAS": "-L%s -lliblldb" % lib_dir,
             }
@@ -1485,11 +1486,12 @@ class Base(unittest.TestCase):
             d = {
                 "CXX_SOURCES": sources,
                 "EXE": exe_name,
-                "CFLAGS_EXTRAS": "%s %s -I%s"
+                "CFLAGS_EXTRAS": "%s %s -I%s -I%s"
                 % (
                     stdflag,
                     stdlibflag,
                     os.path.join(os.environ["LLDB_SRC"], "include"),
+                    os.path.join(configuration.lldb_obj_root, "include"),
                 ),
                 "LD_EXTRAS": "-L%s -llldb -Wl,-rpath,%s" % (lib_dir, lib_dir),
             }
@@ -1508,7 +1510,8 @@ class Base(unittest.TestCase):
             d = {
                 "DYLIB_CXX_SOURCES": sources,
                 "DYLIB_NAME": lib_name,
-                "CFLAGS_EXTRAS": "%s -stdlib=libc++" % stdflag,
+                "CFLAGS_EXTRAS": "%s -stdlib=libc++ -I%s"
+                % (stdflag, os.path.join(configuration.lldb_obj_root, "include")),
                 "FRAMEWORK_INCLUDES": "-F%s" % self.framework_dir,
                 "LD_EXTRAS": "%s -Wl,-rpath,%s -dynamiclib"
                 % (self.lib_lldb, self.framework_dir),
@@ -1517,16 +1520,24 @@ class Base(unittest.TestCase):
             d = {
                 "DYLIB_CXX_SOURCES": sources,
                 "DYLIB_NAME": lib_name,
-                "CFLAGS_EXTRAS": "%s -I%s "
-                % (stdflag, os.path.join(os.environ["LLDB_SRC"], "include")),
-                "LD_EXTRAS": "-shared -l%s\liblldb.lib" % lib_dir,
+                "CFLAGS_EXTRAS": "%s -I%s -I%s"
+                % (
+                    stdflag,
+                    os.path.join(os.environ["LLDB_SRC"], "include"),
+                    os.path.join(configuration.lldb_obj_root, "include"),
+                ),
+                "LD_EXTRAS": "-shared -l%s\\liblldb.lib" % lib_dir,
             }
         else:
             d = {
                 "DYLIB_CXX_SOURCES": sources,
                 "DYLIB_NAME": lib_name,
-                "CFLAGS_EXTRAS": "%s -I%s -fPIC"
-                % (stdflag, os.path.join(os.environ["LLDB_SRC"], "include")),
+                "CFLAGS_EXTRAS": "%s -I%s -I%s -fPIC"
+                % (
+                    stdflag,
+                    os.path.join(os.environ["LLDB_SRC"], "include"),
+                    os.path.join(configuration.lldb_obj_root, "include"),
+                ),
                 "LD_EXTRAS": "-shared -L%s -llldb -Wl,-rpath,%s" % (lib_dir, lib_dir),
             }
         if self.TraceOn():
diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
index bfd249ccd43f..bd8eea3d6f5a 100644
--- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
+++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
@@ -238,10 +238,11 @@ ifeq "$(OS)" "Darwin"
 endif
 
 ifeq "$(OS)" "Darwin"
-	CFLAGS += $(ARCHFLAG) $(ARCH) $(FRAMEWORK_INCLUDES) -I$(LLDB_BASE_DIR)include
+	CFLAGS += $(ARCHFLAG) $(ARCH) $(FRAMEWORK_INCLUDES)
 else
-	CFLAGS += $(ARCHFLAG)$(ARCH) $(FRAMEWORK_INCLUDES) -I$(LLDB_BASE_DIR)include
+	CFLAGS += $(ARCHFLAG)$(ARCH)
 endif
+CFLAGS += -I$(LLDB_BASE_DIR)include -I$(LLDB_OBJ_ROOT)/include
 
 CFLAGS += -I$(SRCDIR) -I$(THIS_FILE_DIR)
 
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index 27a76a652f40..5838281bcb1a 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -811,23 +811,34 @@ class DebugCommunication(object):
         command_dict = {"command": "next", "type": "request", "arguments": args_dict}
         return self.send_recv(command_dict)
 
-    def request_stepIn(self, threadId):
+    def request_stepIn(self, threadId, targetId):
         if self.exit_status is not None:
-            raise ValueError("request_continue called after process exited")
-        args_dict = {"threadId": threadId}
+            raise ValueError("request_stepIn called after process exited")
+        args_dict = {"threadId": threadId, "targetId": targetId}
         command_dict = {"command": "stepIn", "type": "request", "arguments": args_dict}
         return self.send_recv(command_dict)
 
+    def request_stepInTargets(self, frameId):
+        if self.exit_status is not None:
+            raise ValueError("request_stepInTargets called after process exited")
+        args_dict = {"frameId": frameId}
+        command_dict = {
+            "command": "stepInTargets",
+            "type": "request",
+            "arguments": args_dict,
+        }
+        return self.send_recv(command_dict)
+
     def request_stepOut(self, threadId):
         if self.exit_status is not None:
-            raise ValueError("request_continue called after process exited")
+            raise ValueError("request_stepOut called after process exited")
         args_dict = {"threadId": threadId}
         command_dict = {"command": "stepOut", "type": "request", "arguments": args_dict}
         return self.send_recv(command_dict)
 
     def request_pause(self, threadId=None):
         if self.exit_status is not None:
-            raise ValueError("request_continue called after process exited")
+            raise ValueError("request_pause called after process exited")
         if threadId is None:
             threadId = self.get_thread_id()
         args_dict = {"threadId": threadId}
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
index 23f650d2d36f..d56ea5dca14b 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
@@ -218,8 +218,8 @@ class DAPTestCaseBase(TestBase):
         """Set a top level global variable only."""
         return self.dap_server.request_setVariable(2, name, str(value), id=id)
 
-    def stepIn(self, threadId=None, waitForStop=True):
-        self.dap_server.request_stepIn(threadId=threadId)
+    def stepIn(self, threadId=None, targetId=None, waitForStop=True):
+        self.dap_server.request_stepIn(threadId=threadId, targetId=targetId)
         if waitForStop:
             return self.dap_server.wait_for_stopped()
         return None
diff --git a/lldb/scripts/generate-sbapi-dwarf-enum.py b/lldb/scripts/generate-sbapi-dwarf-enum.py
new file mode 100755
index 000000000000..464eb2afff7d
--- /dev/null
+++ b/lldb/scripts/generate-sbapi-dwarf-enum.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+
+import argparse
+import re
+
+HEADER = """\
+//===-- SBLanguages.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_API_SBLANGUAGE_H
+#define LLDB_API_SBLANGUAGE_H
+/// Used by \\ref SBExpressionOptions.
+/// These enumerations use the same language enumerations as the DWARF
+/// specification for ease of use and consistency.
+enum SBSourceLanguageName : uint16_t {
+"""
+
+FOOTER = """\
+};
+
+#endif
+"""
+
+REGEX = re.compile(
+    r'^ *HANDLE_DW_LNAME *\( *(?P<value>[^,]+), (?P<name>.*), "(?P<comment>[^"]+)",.*\)'
+)
+
+
+def emit_enum(input, output):
+    # Read the input and break it up by lines.
+    lines = []
+    with open(input, "r") as f:
+        lines = f.readlines()
+
+    # Write the output.
+    with open(output, "w") as f:
+        # Emit the header.
+        f.write(HEADER)
+
+        # Emit the enum values.
+        for line in lines:
+            match = REGEX.match(line)
+            if not match:
+                continue
+            f.write(f"  /// {match.group('comment')}.\n")
+            f.write(f"  eLanguageName{match.group('name')} = {match.group('value')},\n")
+
+        # Emit the footer
+        f.write(FOOTER)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--output", "-o")
+    parser.add_argument("input")
+    args = parser.parse_args()
+
+    emit_enum(args.input, args.output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt
index 57cc44f76467..a64c0d4a3334 100644
--- a/lldb/source/API/CMakeLists.txt
+++ b/lldb/source/API/CMakeLists.txt
@@ -20,6 +20,20 @@ if(LLDB_ENABLE_LUA)
   set(lldb_lua_wrapper ${lua_bindings_dir}/LLDBWrapLua.cpp)
 endif()
 
+# Target to generate SBLanguages.h from Dwarf.def.
+set(sb_languages_file
+  ${CMAKE_CURRENT_BINARY_DIR}/../../include/lldb/API/SBLanguages.h)
+add_custom_target(
+  lldb-sbapi-dwarf-enums
+  "${Python3_EXECUTABLE}"
+      ${LLDB_SOURCE_DIR}/scripts/generate-sbapi-dwarf-enum.py
+      ${LLVM_MAIN_INCLUDE_DIR}/llvm/BinaryFormat/Dwarf.def
+      -o ${sb_languages_file}
+  BYPRODUCTS ${sb_languages_file}
+  DEPENDS ${LLVM_MAIN_INCLUDE_DIR}/llvm/BinaryFormat/Dwarf.def
+  WORKING_DIRECTORY ${LLVM_LIBRARY_OUTPUT_INTDIR}
+)
+
 add_lldb_library(liblldb SHARED ${option_framework}
   SBAddress.cpp
   SBAttachInfo.cpp
@@ -100,6 +114,9 @@ add_lldb_library(liblldb SHARED ${option_framework}
   ${lldb_python_wrapper}
   ${lldb_lua_wrapper}
 
+  DEPENDS
+    lldb-sbapi-dwarf-enums
+
   LINK_LIBS
     lldbBreakpoint
     lldbCore
diff --git a/lldb/source/API/SBCommandInterpreter.cpp b/lldb/source/API/SBCommandInterpreter.cpp
index 7b87dc507e4b..83c0951c56db 100644
--- a/lldb/source/API/SBCommandInterpreter.cpp
+++ b/lldb/source/API/SBCommandInterpreter.cpp
@@ -512,7 +512,8 @@ SBBroadcaster SBCommandInterpreter::GetBroadcaster() {
 const char *SBCommandInterpreter::GetBroadcasterClass() {
   LLDB_INSTRUMENT();
 
-  return CommandInterpreter::GetStaticBroadcasterClass().AsCString();
+  return ConstString(CommandInterpreter::GetStaticBroadcasterClass())
+      .AsCString();
 }
 
 const char *SBCommandInterpreter::GetArgumentTypeAsCString(
diff --git a/lldb/source/API/SBCommunication.cpp b/lldb/source/API/SBCommunication.cpp
index f93898718be6..ee33e2abd854 100644
--- a/lldb/source/API/SBCommunication.cpp
+++ b/lldb/source/API/SBCommunication.cpp
@@ -170,5 +170,6 @@ SBBroadcaster SBCommunication::GetBroadcaster() {
 const char *SBCommunication::GetBroadcasterClass() {
   LLDB_INSTRUMENT();
 
-  return ThreadedCommunication::GetStaticBroadcasterClass().AsCString();
+  return ConstString(ThreadedCommunication::GetStaticBroadcasterClass())
+      .AsCString();
 }
diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp
index fbcf30e67fc1..9c662dfbf441 100644
--- a/lldb/source/API/SBDebugger.cpp
+++ b/lldb/source/API/SBDebugger.cpp
@@ -112,7 +112,7 @@ SBDebugger &SBDebugger::operator=(const SBDebugger &rhs) {
 const char *SBDebugger::GetBroadcasterClass() {
   LLDB_INSTRUMENT();
 
-  return Debugger::GetStaticBroadcasterClass().AsCString();
+  return ConstString(Debugger::GetStaticBroadcasterClass()).AsCString();
 }
 
 const char *SBDebugger::GetProgressFromEvent(const lldb::SBEvent &event,
diff --git a/lldb/source/API/SBEvent.cpp b/lldb/source/API/SBEvent.cpp
index cc611449e250..aa9c0ff097d4 100644
--- a/lldb/source/API/SBEvent.cpp
+++ b/lldb/source/API/SBEvent.cpp
@@ -95,7 +95,8 @@ const char *SBEvent::GetBroadcasterClass() const {
 
   const Event *lldb_event = get();
   if (lldb_event)
-    return lldb_event->GetBroadcaster()->GetBroadcasterClass().AsCString();
+    return ConstString(lldb_event->GetBroadcaster()->GetBroadcasterClass())
+        .AsCString();
   else
     return "unknown class";
 }
diff --git a/lldb/source/API/SBExpressionOptions.cpp b/lldb/source/API/SBExpressionOptions.cpp
index bd81a04596b1..ce686112ff71 100644
--- a/lldb/source/API/SBExpressionOptions.cpp
+++ b/lldb/source/API/SBExpressionOptions.cpp
@@ -156,6 +156,13 @@ void SBExpressionOptions::SetLanguage(lldb::LanguageType language) {
   m_opaque_up->SetLanguage(language);
 }
 
+void SBExpressionOptions::SetLanguage(SBSourceLanguageName name,
+                                      uint32_t version) {
+  LLDB_INSTRUMENT_VA(this, name, version);
+
+  m_opaque_up->SetLanguage(name, version);
+}
+
 void SBExpressionOptions::SetCancelCallback(
     lldb::ExpressionCancelCallback callback, void *baton) {
   LLDB_INSTRUMENT_VA(this, callback, baton);
diff --git a/lldb/source/API/SBFrame.cpp b/lldb/source/API/SBFrame.cpp
index a16bbc2ae756..47fc88625e30 100644
--- a/lldb/source/API/SBFrame.cpp
+++ b/lldb/source/API/SBFrame.cpp
@@ -1024,10 +1024,10 @@ SBValue SBFrame::EvaluateExpression(const char *expr) {
     options.SetFetchDynamicValue(fetch_dynamic_value);
     options.SetUnwindOnError(true);
     options.SetIgnoreBreakpoints(true);
-    if (target->GetLanguage() != eLanguageTypeUnknown)
-      options.SetLanguage(target->GetLanguage());
-    else
-      options.SetLanguage(frame->GetLanguage());
+    SourceLanguage language = target->GetLanguage();
+    if (!language)
+      language = frame->GetLanguage();
+    options.SetLanguage((SBSourceLanguageName)language.name, language.version);
     return EvaluateExpression(expr, options);
   } else {
     Status error;
@@ -1053,10 +1053,12 @@ SBFrame::EvaluateExpression(const char *expr,
 
   StackFrame *frame = exe_ctx.GetFramePtr();
   Target *target = exe_ctx.GetTargetPtr();
-  if (target && target->GetLanguage() != eLanguageTypeUnknown)
-    options.SetLanguage(target->GetLanguage());
-  else if (frame)
-    options.SetLanguage(frame->GetLanguage());
+  SourceLanguage language;
+  if (target)
+    language = target->GetLanguage();
+  if (!language && frame)
+    language = frame->GetLanguage();
+  options.SetLanguage((SBSourceLanguageName)language.name, language.version);
   return EvaluateExpression(expr, options);
 }
 
@@ -1074,10 +1076,12 @@ SBValue SBFrame::EvaluateExpression(const char *expr,
   options.SetIgnoreBreakpoints(true);
   StackFrame *frame = exe_ctx.GetFramePtr();
   Target *target = exe_ctx.GetTargetPtr();
-  if (target && target->GetLanguage() != eLanguageTypeUnknown)
-    options.SetLanguage(target->GetLanguage());
-  else if (frame)
-    options.SetLanguage(frame->GetLanguage());
+  SourceLanguage language;
+  if (target)
+    language = target->GetLanguage();
+  if (!language && frame)
+    language = frame->GetLanguage();
+  options.SetLanguage((SBSourceLanguageName)language.name, language.version);
   return EvaluateExpression(expr, options);
 }
 
@@ -1218,7 +1222,7 @@ lldb::LanguageType SBFrame::GuessLanguage() const {
     if (stop_locker.TryLock(&process->GetRunLock())) {
       frame = exe_ctx.GetFramePtr();
       if (frame) {
-        return frame->GuessLanguage();
+        return frame->GuessLanguage().AsLanguageType();
       }
     }
   }
diff --git a/lldb/source/API/SBLineEntry.cpp b/lldb/source/API/SBLineEntry.cpp
index 99a7b8fe644c..216ea6d18eab 100644
--- a/lldb/source/API/SBLineEntry.cpp
+++ b/lldb/source/API/SBLineEntry.cpp
@@ -67,6 +67,21 @@ SBAddress SBLineEntry::GetEndAddress() const {
   return sb_address;
 }
 
+SBAddress SBLineEntry::GetSameLineContiguousAddressRangeEnd(
+    bool include_inlined_functions) const {
+  LLDB_INSTRUMENT_VA(this);
+
+  SBAddress sb_address;
+  if (m_opaque_up) {
+    AddressRange line_range = m_opaque_up->GetSameLineContiguousAddressRange(
+        include_inlined_functions);
+
+    sb_address.SetAddress(line_range.GetBaseAddress());
+    sb_address.OffsetAddress(line_range.GetByteSize());
+  }
+  return sb_address;
+}
+
 bool SBLineEntry::IsValid() const {
   LLDB_INSTRUMENT_VA(this);
   return this->operator bool();
diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp
index b80664882ebc..c37c111c5a58 100644
--- a/lldb/source/API/SBProcess.cpp
+++ b/lldb/source/API/SBProcess.cpp
@@ -77,7 +77,7 @@ SBProcess::~SBProcess() = default;
 const char *SBProcess::GetBroadcasterClassName() {
   LLDB_INSTRUMENT();
 
-  return Process::GetStaticBroadcasterClass().AsCString();
+  return ConstString(Process::GetStaticBroadcasterClass()).AsCString();
 }
 
 const char *SBProcess::GetPluginName() {
@@ -807,7 +807,7 @@ SBBroadcaster SBProcess::GetBroadcaster() const {
 const char *SBProcess::GetBroadcasterClass() {
   LLDB_INSTRUMENT();
 
-  return Process::GetStaticBroadcasterClass().AsCString();
+  return ConstString(Process::GetStaticBroadcasterClass()).AsCString();
 }
 
 size_t SBProcess::ReadMemory(addr_t addr, void *dst, size_t dst_len,
@@ -928,6 +928,14 @@ size_t SBProcess::WriteMemory(addr_t addr, const void *src, size_t src_len,
   return bytes_written;
 }
 
+void SBProcess::GetStatus(SBStream &status) {
+  LLDB_INSTRUMENT_VA(this, status);
+
+  ProcessSP process_sp(GetSP());
+  if (process_sp)
+    process_sp->GetStatus(status.ref());
+}
+
 bool SBProcess::GetDescription(SBStream &description) {
   LLDB_INSTRUMENT_VA(this, description);
 
diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp
index cc9f1fdd76af..962ce9ba83cc 100644
--- a/lldb/source/API/SBTarget.cpp
+++ b/lldb/source/API/SBTarget.cpp
@@ -147,7 +147,7 @@ SBModule SBTarget::GetModuleAtIndexFromEvent(const uint32_t idx,
 const char *SBTarget::GetBroadcasterClassName() {
   LLDB_INSTRUMENT();
 
-  return Target::GetStaticBroadcasterClass().AsCString();
+  return ConstString(Target::GetStaticBroadcasterClass()).AsCString();
 }
 
 bool SBTarget::IsValid() const {
@@ -2011,6 +2011,30 @@ lldb::SBInstructionList SBTarget::ReadInstructions(lldb::SBAddress base_addr,
   return sb_instructions;
 }
 
+lldb::SBInstructionList SBTarget::ReadInstructions(lldb::SBAddress start_addr,
+                                                   lldb::SBAddress end_addr,
+                                                   const char *flavor_string) {
+  LLDB_INSTRUMENT_VA(this, start_addr, end_addr, flavor_string);
+
+  SBInstructionList sb_instructions;
+
+  TargetSP target_sp(GetSP());
+  if (target_sp) {
+    lldb::addr_t start_load_addr = start_addr.GetLoadAddress(*this);
+    lldb::addr_t end_load_addr = end_addr.GetLoadAddress(*this);
+    if (end_load_addr > start_load_addr) {
+      lldb::addr_t size = end_load_addr - start_load_addr;
+
+      AddressRange range(start_load_addr, size);
+      const bool force_live_memory = true;
+      sb_instructions.SetDisassembler(Disassembler::DisassembleRange(
+          target_sp->GetArchitecture(), nullptr, flavor_string, *target_sp,
+          range, force_live_memory));
+    }
+  }
+  return sb_instructions;
+}
+
 lldb::SBInstructionList SBTarget::GetInstructions(lldb::SBAddress base_addr,
                                                   const void *buf,
                                                   size_t size) {
diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp
index eb9cf063802c..ac3e2cd25daa 100644
--- a/lldb/source/API/SBThread.cpp
+++ b/lldb/source/API/SBThread.cpp
@@ -53,7 +53,7 @@ using namespace lldb_private;
 const char *SBThread::GetBroadcasterClassName() {
   LLDB_INSTRUMENT();
 
-  return Thread::GetStaticBroadcasterClass().AsCString();
+  return ConstString(Thread::GetStaticBroadcasterClass()).AsCString();
 }
 
 // Constructors
diff --git a/lldb/source/API/SBType.cpp b/lldb/source/API/SBType.cpp
index ac0e56303fae..6cecb5c9ea81 100644
--- a/lldb/source/API/SBType.cpp
+++ b/lldb/source/API/SBType.cpp
@@ -7,16 +7,21 @@
 //===----------------------------------------------------------------------===//
 
 #include "lldb/API/SBType.h"
+#include "Utils.h"
 #include "lldb/API/SBDefines.h"
 #include "lldb/API/SBModule.h"
 #include "lldb/API/SBStream.h"
 #include "lldb/API/SBTypeEnumMember.h"
 #include "lldb/Core/Mangled.h"
+#include "lldb/Core/ValueObjectConstResult.h"
+#include "lldb/Symbol/CompilerDecl.h"
 #include "lldb/Symbol/CompilerType.h"
 #include "lldb/Symbol/Type.h"
 #include "lldb/Symbol/TypeSystem.h"
 #include "lldb/Utility/ConstString.h"
+#include "lldb/Utility/DataExtractor.h"
 #include "lldb/Utility/Instrumentation.h"
+#include "lldb/Utility/Scalar.h"
 #include "lldb/Utility/Stream.h"
 
 #include "llvm/ADT/APSInt.h"
@@ -325,6 +330,79 @@ lldb::SBTypeMemberFunction SBType::GetMemberFunctionAtIndex(uint32_t idx) {
   return sb_func_type;
 }
 
+SBTypeStaticField::SBTypeStaticField() { LLDB_INSTRUMENT_VA(this); }
+
+SBTypeStaticField::SBTypeStaticField(lldb_private::CompilerDecl decl)
+    : m_opaque_up(decl ? std::make_unique<CompilerDecl>(decl) : nullptr) {}
+
+SBTypeStaticField::SBTypeStaticField(const SBTypeStaticField &rhs) {
+  LLDB_INSTRUMENT_VA(this, rhs);
+
+  m_opaque_up = clone(rhs.m_opaque_up);
+}
+
+SBTypeStaticField &SBTypeStaticField::operator=(const SBTypeStaticField &rhs) {
+  LLDB_INSTRUMENT_VA(this, rhs);
+
+  m_opaque_up = clone(rhs.m_opaque_up);
+  return *this;
+}
+
+SBTypeStaticField::~SBTypeStaticField() { LLDB_INSTRUMENT_VA(this); }
+
+SBTypeStaticField::operator bool() const {
+  LLDB_INSTRUMENT_VA(this);
+
+  return IsValid();
+}
+
+bool SBTypeStaticField::IsValid() const {
+  LLDB_INSTRUMENT_VA(this);
+
+  return m_opaque_up != nullptr;
+}
+
+const char *SBTypeStaticField::GetName() {
+  LLDB_INSTRUMENT_VA(this);
+
+  if (!IsValid())
+    return "";
+  return m_opaque_up->GetName().GetCString();
+}
+
+const char *SBTypeStaticField::GetMangledName() {
+  LLDB_INSTRUMENT_VA(this);
+
+  if (!IsValid())
+    return "";
+  return m_opaque_up->GetMangledName().GetCString();
+}
+
+SBType SBTypeStaticField::GetType() {
+  LLDB_INSTRUMENT_VA(this);
+
+  if (!IsValid())
+    return SBType();
+  return SBType(m_opaque_up->GetType());
+}
+
+SBValue SBTypeStaticField::GetConstantValue(lldb::SBTarget target) {
+  LLDB_INSTRUMENT_VA(this, target);
+
+  if (!IsValid())
+    return SBValue();
+
+  Scalar value = m_opaque_up->GetConstantValue();
+  if (!value.IsValid())
+    return SBValue();
+  DataExtractor data;
+  value.GetData(data);
+  auto value_obj_sp = ValueObjectConstResult::Create(
+      target.GetSP().get(), m_opaque_up->GetType(), m_opaque_up->GetName(),
+      data);
+  return SBValue(std::move(value_obj_sp));
+}
+
 lldb::SBType SBType::GetUnqualifiedType() {
   LLDB_INSTRUMENT_VA(this);
 
@@ -438,6 +516,16 @@ SBTypeMember SBType::GetVirtualBaseClassAtIndex(uint32_t idx) {
   return sb_type_member;
 }
 
+SBTypeStaticField SBType::GetStaticFieldWithName(const char *name) {
+  LLDB_INSTRUMENT_VA(this, name);
+
+  if (!IsValid() || !name)
+    return SBTypeStaticField();
+
+  return SBTypeStaticField(m_opaque_sp->GetCompilerType(/*prefer_dynamic=*/true)
+                               .GetStaticFieldWithName(name));
+}
+
 SBTypeEnumMemberList SBType::GetEnumMembers() {
   LLDB_INSTRUMENT_VA(this);
 
diff --git a/lldb/source/API/SBValue.cpp b/lldb/source/API/SBValue.cpp
index 94a8f3ea319e..c53ec5a74648 100644
--- a/lldb/source/API/SBValue.cpp
+++ b/lldb/source/API/SBValue.cpp
@@ -909,6 +909,25 @@ uint64_t SBValue::GetValueAsUnsigned(uint64_t fail_value) {
   return fail_value;
 }
 
+lldb::addr_t SBValue::GetValueAsAddress() {
+  addr_t fail_value = LLDB_INVALID_ADDRESS;
+  ValueLocker locker;
+  lldb::ValueObjectSP value_sp(GetSP(locker));
+  if (value_sp) {
+    bool success = true;
+    uint64_t ret_val = fail_value;
+    ret_val = value_sp->GetValueAsUnsigned(fail_value, &success);
+    if (!success)
+      return fail_value;
+    ProcessSP process_sp = m_opaque_sp->GetProcessSP();
+    if (!process_sp)
+      return ret_val;
+    return process_sp->FixDataAddress(ret_val);
+  }
+
+  return fail_value;
+}
+
 bool SBValue::MightHaveChildren() {
   LLDB_INSTRUMENT_VA(this);
 
diff --git a/lldb/source/Breakpoint/Watchpoint.cpp b/lldb/source/Breakpoint/Watchpoint.cpp
index a128ced57504..edb1a0e93460 100644
--- a/lldb/source/Breakpoint/Watchpoint.cpp
+++ b/lldb/source/Breakpoint/Watchpoint.cpp
@@ -460,9 +460,8 @@ void Watchpoint::SetCondition(const char *condition) {
     // Pass nullptr for expr_prefix (no translation-unit level definitions).
     Status error;
     m_condition_up.reset(m_target.GetUserExpressionForLanguage(
-        condition, llvm::StringRef(), lldb::eLanguageTypeUnknown,
-        UserExpression::eResultTypeAny, EvaluateExpressionOptions(), nullptr,
-        error));
+        condition, {}, {}, UserExpression::eResultTypeAny,
+        EvaluateExpressionOptions(), nullptr, error));
     if (error.Fail()) {
       // FIXME: Log something...
       m_condition_up.reset();
diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
index e1255f37d9bc..57a372a762e1 100644
--- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp
+++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
@@ -93,10 +93,10 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
 
   StackFrame *frame = m_exe_ctx.GetFramePtr();
 
-  // Either Swift was explicitly specified, or the frame is Swift.
+  // Either the language was explicitly specified, or we check the frame.
   lldb::LanguageType language = m_expr_options.language;
   if (language == lldb::eLanguageTypeUnknown && frame)
-    language = frame->GuessLanguage();
+    language = frame->GuessLanguage().AsLanguageType();
 
   // Add a hint if object description was requested, but no description
   // function was implemented.
diff --git a/lldb/source/Commands/CommandObjectType.cpp b/lldb/source/Commands/CommandObjectType.cpp
index 97489bdc2d9c..46537dd1b98a 100644
--- a/lldb/source/Commands/CommandObjectType.cpp
+++ b/lldb/source/Commands/CommandObjectType.cpp
@@ -2509,7 +2509,7 @@ protected:
     if (!frame)
       return lang_type;
 
-    lang_type = frame->GuessLanguage();
+    lang_type = frame->GuessLanguage().AsLanguageType();
     if (lang_type != lldb::eLanguageTypeUnknown)
       return lang_type;
 
diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp
index 19b3cf3bbf46..cac4642873b7 100644
--- a/lldb/source/Core/Debugger.cpp
+++ b/lldb/source/Core/Debugger.cpp
@@ -823,8 +823,8 @@ TargetSP Debugger::FindTargetWithProcess(Process *process) {
   return target_sp;
 }
 
-ConstString Debugger::GetStaticBroadcasterClass() {
-  static ConstString class_name("lldb.debugger");
+llvm::StringRef Debugger::GetStaticBroadcasterClass() {
+  static constexpr llvm::StringLiteral class_name("lldb.debugger");
   return class_name;
 }
 
@@ -846,7 +846,7 @@ Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton)
       m_loaded_plugins(), m_event_handler_thread(), m_io_handler_thread(),
       m_sync_broadcaster(nullptr, "lldb.debugger.sync"),
       m_broadcaster(m_broadcaster_manager_sp,
-                    GetStaticBroadcasterClass().AsCString()),
+                    GetStaticBroadcasterClass().str()),
       m_forward_listener_sp(), m_clear_once() {
   // Initialize the debugger properties as early as possible as other parts of
   // LLDB will start querying them during construction.
diff --git a/lldb/source/Core/ThreadedCommunication.cpp b/lldb/source/Core/ThreadedCommunication.cpp
index 7d8aae5d8ff6..2f3dada3ac93 100644
--- a/lldb/source/Core/ThreadedCommunication.cpp
+++ b/lldb/source/Core/ThreadedCommunication.cpp
@@ -32,8 +32,8 @@
 using namespace lldb;
 using namespace lldb_private;
 
-ConstString &ThreadedCommunication::GetStaticBroadcasterClass() {
-  static ConstString class_name("lldb.communication");
+llvm::StringRef ThreadedCommunication::GetStaticBroadcasterClass() {
+  static constexpr llvm::StringLiteral class_name("lldb.communication");
   return class_name;
 }
 
diff --git a/lldb/source/Expression/LLVMUserExpression.cpp b/lldb/source/Expression/LLVMUserExpression.cpp
index 9c31cc84bf8f..1434011c80ad 100644
--- a/lldb/source/Expression/LLVMUserExpression.cpp
+++ b/lldb/source/Expression/LLVMUserExpression.cpp
@@ -42,7 +42,7 @@ char LLVMUserExpression::ID;
 LLVMUserExpression::LLVMUserExpression(ExecutionContextScope &exe_scope,
                                        llvm::StringRef expr,
                                        llvm::StringRef prefix,
-                                       lldb::LanguageType language,
+                                       SourceLanguage language,
                                        ResultType desired_type,
                                        const EvaluateExpressionOptions &options)
     : UserExpression(exe_scope, expr, prefix, language, desired_type, options),
diff --git a/lldb/source/Expression/UserExpression.cpp b/lldb/source/Expression/UserExpression.cpp
index c181712a2f0b..5658426c8891 100644
--- a/lldb/source/Expression/UserExpression.cpp
+++ b/lldb/source/Expression/UserExpression.cpp
@@ -39,6 +39,7 @@
 #include "lldb/Utility/Log.h"
 #include "lldb/Utility/State.h"
 #include "lldb/Utility/StreamString.h"
+#include "llvm/BinaryFormat/Dwarf.h"
 
 using namespace lldb_private;
 
@@ -46,8 +47,7 @@ char UserExpression::ID;
 
 UserExpression::UserExpression(ExecutionContextScope &exe_scope,
                                llvm::StringRef expr, llvm::StringRef prefix,
-                               lldb::LanguageType language,
-                               ResultType desired_type,
+                               SourceLanguage language, ResultType desired_type,
                                const EvaluateExpressionOptions &options)
     : Expression(exe_scope), m_expr_text(std::string(expr)),
       m_expr_prefix(std::string(prefix)), m_language(language),
@@ -176,7 +176,7 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx,
   }
 
   lldb_private::ExecutionPolicy execution_policy = options.GetExecutionPolicy();
-  lldb::LanguageType language = options.GetLanguage();
+  SourceLanguage language = options.GetLanguage();
   const ResultType desired_type = options.DoesCoerceToId()
                                       ? UserExpression::eResultTypeId
                                       : UserExpression::eResultTypeAny;
@@ -242,7 +242,7 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx,
   // If the language was not specified in the expression command, set it to the
   // language in the target's properties if specified, else default to the
   // langage for the frame.
-  if (language == lldb::eLanguageTypeUnknown) {
+  if (!language) {
     if (target->GetLanguage() != lldb::eLanguageTypeUnknown)
       language = target->GetLanguage();
     else if (StackFrame *frame = exe_ctx.GetFramePtr())
@@ -384,7 +384,8 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx,
       } else {
         if (expr_result) {
           result_valobj_sp = expr_result->GetValueObject();
-          result_valobj_sp->SetPreferredDisplayLanguage(language);
+          result_valobj_sp->SetPreferredDisplayLanguage(
+              language.AsLanguageType());
 
           LLDB_LOG(log,
                    "== [UserExpression::Evaluate] Execution completed "
@@ -426,7 +427,8 @@ UserExpression::Execute(DiagnosticManager &diagnostic_manager,
   Target *target = exe_ctx.GetTargetPtr();
   if (options.GetSuppressPersistentResult() && result_var && target) {
     if (auto *persistent_state =
-            target->GetPersistentExpressionStateForLanguage(m_language))
+            target->GetPersistentExpressionStateForLanguage(
+                m_language.AsLanguageType()))
       persistent_state->RemovePersistentVariable(result_var);
   }
   return expr_result;
diff --git a/lldb/source/Expression/UtilityFunction.cpp b/lldb/source/Expression/UtilityFunction.cpp
index d7a3c9d41d04..7b34c2c2ff76 100644
--- a/lldb/source/Expression/UtilityFunction.cpp
+++ b/lldb/source/Expression/UtilityFunction.cpp
@@ -80,8 +80,8 @@ FunctionCaller *UtilityFunction::MakeFunctionCaller(
   name.append("-caller");
 
   m_caller_up.reset(process_sp->GetTarget().GetFunctionCallerForLanguage(
-      Language(), return_type, impl_code_address, arg_value_list, name.c_str(),
-      error));
+      Language().AsLanguageType(), return_type, impl_code_address,
+      arg_value_list, name.c_str(), error));
   if (error.Fail()) {
 
     return nullptr;
diff --git a/lldb/source/Host/posix/PipePosix.cpp b/lldb/source/Host/posix/PipePosix.cpp
index afd3fe39059a..f35c348990df 100644
--- a/lldb/source/Host/posix/PipePosix.cpp
+++ b/lldb/source/Host/posix/PipePosix.cpp
@@ -108,7 +108,7 @@ Status PipePosix::CreateNew(bool child_processes_inherit) {
 }
 
 Status PipePosix::CreateNew(llvm::StringRef name, bool child_process_inherit) {
-  std::scoped_lock<std::mutex, std::mutex> (m_read_mutex, m_write_mutex);
+  std::scoped_lock<std::mutex, std::mutex> guard(m_read_mutex, m_write_mutex);
   if (CanReadUnlocked() || CanWriteUnlocked())
     return Status("Pipe is already opened");
 
@@ -146,7 +146,7 @@ Status PipePosix::CreateWithUniqueName(llvm::StringRef prefix,
 
 Status PipePosix::OpenAsReader(llvm::StringRef name,
                                bool child_process_inherit) {
-  std::scoped_lock<std::mutex, std::mutex> (m_read_mutex, m_write_mutex);
+  std::scoped_lock<std::mutex, std::mutex> guard(m_read_mutex, m_write_mutex);
 
   if (CanReadUnlocked() || CanWriteUnlocked())
     return Status("Pipe is already opened");
diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp
index 8c3972a2ba4c..4c58ecc3c184 100644
--- a/lldb/source/Interpreter/CommandInterpreter.cpp
+++ b/lldb/source/Interpreter/CommandInterpreter.cpp
@@ -119,15 +119,15 @@ enum {
 #include "InterpreterPropertiesEnum.inc"
 };
 
-ConstString &CommandInterpreter::GetStaticBroadcasterClass() {
-  static ConstString class_name("lldb.commandInterpreter");
+llvm::StringRef CommandInterpreter::GetStaticBroadcasterClass() {
+  static constexpr llvm::StringLiteral class_name("lldb.commandInterpreter");
   return class_name;
 }
 
 CommandInterpreter::CommandInterpreter(Debugger &debugger,
                                        bool synchronous_execution)
     : Broadcaster(debugger.GetBroadcasterManager(),
-                  CommandInterpreter::GetStaticBroadcasterClass().AsCString()),
+                  CommandInterpreter::GetStaticBroadcasterClass().str()),
       Properties(
           OptionValuePropertiesSP(new OptionValueProperties("interpreter"))),
       IOHandlerDelegate(IOHandlerDelegate::Completion::LLDBCommand),
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h b/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h
index cefec15a7980..17f1506036c6 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h
@@ -30,7 +30,7 @@ public:
 
   ~ExternalASTSourceWrapper() override;
 
-  clang::Decl *GetExternalDecl(uint32_t ID) override {
+  clang::Decl *GetExternalDecl(clang::GlobalDeclID ID) override {
     return m_Source->GetExternalDecl(ID);
   }
 
@@ -266,7 +266,7 @@ public:
   // ExternalASTSource.
   //===--------------------------------------------------------------------===//
 
-  clang::Decl *GetExternalDecl(uint32_t ID) override {
+  clang::Decl *GetExternalDecl(clang::GlobalDeclID ID) override {
     for (size_t i = 0; i < Sources.size(); ++i)
       if (clang::Decl *Result = Sources[i]->GetExternalDecl(ID))
         return Result;
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h
index f34e4661a81c..83c910477acc 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h
@@ -49,7 +49,7 @@ public:
   ~ClangASTSource() override;
 
   /// Interface stubs.
-  clang::Decl *GetExternalDecl(uint32_t) override { return nullptr; }
+  clang::Decl *GetExternalDecl(clang::GlobalDeclID) override { return nullptr; }
   clang::Stmt *GetExternalDeclStmt(uint64_t) override { return nullptr; }
   clang::Selector GetExternalSelector(uint32_t) override {
     return clang::Selector();
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
index f48bdc730d91..72c7cda13ecb 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
@@ -392,8 +392,8 @@ ClangExpressionParser::ClangExpressionParser(
   // Make sure clang uses the same VFS as LLDB.
   m_compiler->createFileManager(FileSystem::Instance().GetVirtualFileSystem());
 
-  lldb::LanguageType frame_lang =
-      expr.Language(); // defaults to lldb::eLanguageTypeUnknown
+  // Defaults to lldb::eLanguageTypeUnknown.
+  lldb::LanguageType frame_lang = expr.Language().AsLanguageType();
 
   std::string abi;
   ArchSpec target_arch;
@@ -410,7 +410,7 @@ ClangExpressionParser::ClangExpressionParser(
   // Make sure the user hasn't provided a preferred execution language with
   // `expression --language X -- ...`
   if (frame_sp && frame_lang == lldb::eLanguageTypeUnknown)
-    frame_lang = frame_sp->GetLanguage();
+    frame_lang = frame_sp->GetLanguage().AsLanguageType();
 
   if (process_sp && frame_lang != lldb::eLanguageTypeUnknown) {
     LLDB_LOGF(log, "Frame has language of type %s",
@@ -479,7 +479,7 @@ ClangExpressionParser::ClangExpressionParser(
   assert(m_compiler->hasTarget());
 
   // 4. Set language options.
-  lldb::LanguageType language = expr.Language();
+  lldb::LanguageType language = expr.Language().AsLanguageType();
   LangOptions &lang_opts = m_compiler->getLangOpts();
 
   switch (language) {
@@ -1344,10 +1344,10 @@ lldb_private::Status ClangExpressionParser::PrepareForExecution(
   {
     auto lang = m_expr.Language();
     LLDB_LOGF(log, "%s - Current expression language is %s\n", __FUNCTION__,
-              Language::GetNameForLanguageType(lang));
+              lang.GetDescription().data());
     lldb::ProcessSP process_sp = exe_ctx.GetProcessSP();
     if (process_sp && lang != lldb::eLanguageTypeUnknown) {
-      auto runtime = process_sp->GetLanguageRuntime(lang);
+      auto runtime = process_sp->GetLanguageRuntime(lang.AsLanguageType());
       if (runtime)
         runtime->GetIRPasses(custom_passes);
     }
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
index 5776b1e94e07..5ea7bc02a6e4 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
@@ -56,6 +56,7 @@
 #include "clang/AST/DeclObjC.h"
 
 #include "llvm/ADT/ScopeExit.h"
+#include "llvm/BinaryFormat/Dwarf.h"
 
 using namespace lldb_private;
 
@@ -63,22 +64,21 @@ char ClangUserExpression::ID;
 
 ClangUserExpression::ClangUserExpression(
     ExecutionContextScope &exe_scope, llvm::StringRef expr,
-    llvm::StringRef prefix, lldb::LanguageType language,
-    ResultType desired_type, const EvaluateExpressionOptions &options,
-    ValueObject *ctx_obj)
+    llvm::StringRef prefix, SourceLanguage language, ResultType desired_type,
+    const EvaluateExpressionOptions &options, ValueObject *ctx_obj)
     : LLVMUserExpression(exe_scope, expr, prefix, language, desired_type,
                          options),
       m_type_system_helper(*m_target_wp.lock(), options.GetExecutionPolicy() ==
                                                     eExecutionPolicyTopLevel),
       m_result_delegate(exe_scope.CalculateTarget()), m_ctx_obj(ctx_obj) {
-  switch (m_language) {
-  case lldb::eLanguageTypeC_plus_plus:
+  switch (m_language.name) {
+  case llvm::dwarf::DW_LNAME_C_plus_plus:
     m_allow_cxx = true;
     break;
-  case lldb::eLanguageTypeObjC:
+  case llvm::dwarf::DW_LNAME_ObjC:
     m_allow_objc = true;
     break;
-  case lldb::eLanguageTypeObjC_plus_plus:
+  case llvm::dwarf::DW_LNAME_ObjC_plus_plus:
   default:
     m_allow_cxx = true;
     m_allow_objc = true;
@@ -624,7 +624,8 @@ bool ClangUserExpression::TryParse(
 void ClangUserExpression::SetupCppModuleImports(ExecutionContext &exe_ctx) {
   Log *log = GetLog(LLDBLog::Expressions);
 
-  CppModuleConfiguration module_config = GetModuleConfig(m_language, exe_ctx);
+  CppModuleConfiguration module_config =
+      GetModuleConfig(m_language.AsLanguageType(), exe_ctx);
   m_imported_cpp_modules = module_config.GetImportedModules();
   m_include_directories = module_config.GetIncludeDirs();
 
@@ -734,7 +735,7 @@ bool ClangUserExpression::Parse(DiagnosticManager &diagnostic_manager,
     if (register_execution_unit) {
       if (auto *persistent_state =
               exe_ctx.GetTargetPtr()->GetPersistentExpressionStateForLanguage(
-                  m_language))
+                  m_language.AsLanguageType()))
         persistent_state->RegisterExecutionUnit(m_execution_unit_sp);
     }
   }
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h
index bc07cbcf9e64..09604feea5de 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h
@@ -106,8 +106,8 @@ public:
   ///     definitions to be included when the expression is parsed.
   ///
   /// \param[in] language
-  ///     If not eLanguageTypeUnknown, a language to use when parsing
-  ///     the expression.  Currently restricted to those languages
+  ///     If not unknown, a language to use when parsing the
+  ///     expression.  Currently restricted to those languages
   ///     supported by Clang.
   ///
   /// \param[in] desired_type
@@ -122,7 +122,7 @@ public:
   ///     must be evaluated. For details see the comment to
   ///     `UserExpression::Evaluate`.
   ClangUserExpression(ExecutionContextScope &exe_scope, llvm::StringRef expr,
-                      llvm::StringRef prefix, lldb::LanguageType language,
+                      llvm::StringRef prefix, SourceLanguage language,
                       ResultType desired_type,
                       const EvaluateExpressionOptions &options,
                       ValueObject *ctx_obj);
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
index d3fc487aed43..9409497f1c81 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
@@ -1869,15 +1869,15 @@ AppleObjCRuntimeV2::DynamicClassInfoExtractor::ComputeHelper(
       if (loader->IsFullyInitialized()) {
         switch (exe_ctx.GetTargetRef().GetDynamicClassInfoHelper()) {
         case eDynamicClassInfoHelperAuto:
-          LLVM_FALLTHROUGH;
+          [[fallthrough]];
         case eDynamicClassInfoHelperGetRealizedClassList:
           if (m_runtime.m_has_objc_getRealizedClassList_trylock)
             return DynamicClassInfoExtractor::objc_getRealizedClassList_trylock;
-          LLVM_FALLTHROUGH;
+          [[fallthrough]];
         case eDynamicClassInfoHelperCopyRealizedClassList:
           if (m_runtime.m_has_objc_copyRealizedClassList)
             return DynamicClassInfoExtractor::objc_copyRealizedClassList;
-          LLVM_FALLTHROUGH;
+          [[fallthrough]];
         case eDynamicClassInfoHelperRealizedClassesStruct:
           return DynamicClassInfoExtractor::gdb_objc_realized_classes;
         }
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
index 0d95a1c12bde..16f6d2e884b5 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
@@ -1854,6 +1854,39 @@ public:
 };
 }
 
+// We have to do this because ELF doesn't have section IDs, and also
+// doesn't require section names to be unique.  (We use the section index
+// for section IDs, but that isn't guaranteed to be the same in separate
+// debug images.)
+static SectionSP FindMatchingSection(const SectionList &section_list,
+                                     SectionSP section) {
+  SectionSP sect_sp;
+
+  addr_t vm_addr = section->GetFileAddress();
+  ConstString name = section->GetName();
+  offset_t byte_size = section->GetByteSize();
+  bool thread_specific = section->IsThreadSpecific();
+  uint32_t permissions = section->GetPermissions();
+  uint32_t alignment = section->GetLog2Align();
+
+  for (auto sect : section_list) {
+    if (sect->GetName() == name &&
+        sect->IsThreadSpecific() == thread_specific &&
+        sect->GetPermissions() == permissions &&
+        sect->GetByteSize() == byte_size && sect->GetFileAddress() == vm_addr &&
+        sect->GetLog2Align() == alignment) {
+      sect_sp = sect;
+      break;
+    } else {
+      sect_sp = FindMatchingSection(sect->GetChildren(), section);
+      if (sect_sp)
+        break;
+    }
+  }
+
+  return sect_sp;
+}
+
 void ObjectFileELF::CreateSections(SectionList &unified_section_list) {
   if (m_sections_up)
     return;
@@ -2067,10 +2100,12 @@ unsigned ObjectFileELF::ParseSymbols(Symtab *symtab, user_id_t start_id,
   SectionList *module_section_list =
       module_sp ? module_sp->GetSectionList() : nullptr;
 
-  // Local cache to avoid doing a FindSectionByName for each symbol. The "const
-  // char*" key must came from a ConstString object so they can be compared by
-  // pointer
-  std::unordered_map<const char *, lldb::SectionSP> section_name_to_section;
+  // We might have debug information in a separate object, in which case
+  // we need to map the sections from that object to the sections in the
+  // main object during symbol lookup.  If we had to compare the sections
+  // for every single symbol, that would be expensive, so this map is
+  // used to accelerate the process.
+  std::unordered_map<lldb::SectionSP, lldb::SectionSP> section_map;
 
   unsigned i;
   for (i = 0; i < num_symbols; ++i) {
@@ -2275,14 +2310,14 @@ unsigned ObjectFileELF::ParseSymbols(Symtab *symtab, user_id_t start_id,
 
     if (symbol_section_sp && module_section_list &&
         module_section_list != section_list) {
-      ConstString sect_name = symbol_section_sp->GetName();
-      auto section_it = section_name_to_section.find(sect_name.GetCString());
-      if (section_it == section_name_to_section.end())
-        section_it =
-            section_name_to_section
-                .emplace(sect_name.GetCString(),
-                         module_section_list->FindSectionByName(sect_name))
-                .first;
+      auto section_it = section_map.find(symbol_section_sp);
+      if (section_it == section_map.end()) {
+        section_it = section_map
+                         .emplace(symbol_section_sp,
+                                  FindMatchingSection(*module_section_list,
+                                                      symbol_section_sp))
+                         .first;
+      }
       if (section_it->second)
         symbol_section_sp = section_it->second;
     }
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp
index 3d37bb226a65..ae1a77e5be83 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp
@@ -2087,7 +2087,7 @@ void GDBRemoteCommunicationServerLLGS::AddProcessThreads(
 GDBRemoteCommunication::PacketResult
 GDBRemoteCommunicationServerLLGS::Handle_qfThreadInfo(
     StringExtractorGDBRemote &packet) {
-  assert(m_debugged_processes.size() == 1 ||
+  assert(m_debugged_processes.size() <= 1 ||
          bool(m_extensions_supported &
               NativeProcessProtocol::Extension::multiprocess));
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 41d81fbcf1b0..bea11e0e3840 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -495,6 +495,7 @@ TypeSP DWARFASTParserClang::ParseTypeFromDWARF(const SymbolContext &sc,
   case DW_TAG_const_type:
   case DW_TAG_restrict_type:
   case DW_TAG_volatile_type:
+  case DW_TAG_LLVM_ptrauth_type:
   case DW_TAG_atomic_type:
   case DW_TAG_unspecified_type: {
     type_sp = ParseTypeModifier(sc, die, attrs);
@@ -569,6 +570,39 @@ ExtractDataMemberLocation(DWARFDIE const &die, DWARFFormValue const &form_value,
   return memberOffset.ResolveValue(nullptr).UInt();
 }
 
+static TypePayloadClang GetPtrAuthMofidierPayload(const DWARFDIE &die) {
+  auto getAttr = [&](llvm::dwarf::Attribute Attr, unsigned defaultValue = 0) {
+    return die.GetAttributeValueAsUnsigned(Attr, defaultValue);
+  };
+  const unsigned key = getAttr(DW_AT_LLVM_ptrauth_key);
+  const bool addr_disc = getAttr(DW_AT_LLVM_ptrauth_address_discriminated);
+  const unsigned extra = getAttr(DW_AT_LLVM_ptrauth_extra_discriminator);
+  const bool isapointer = getAttr(DW_AT_LLVM_ptrauth_isa_pointer);
+  const bool authenticates_null_values =
+      getAttr(DW_AT_LLVM_ptrauth_authenticates_null_values);
+  const unsigned authentication_mode_int = getAttr(
+      DW_AT_LLVM_ptrauth_authentication_mode,
+      static_cast<unsigned>(clang::PointerAuthenticationMode::SignAndAuth));
+  clang::PointerAuthenticationMode authentication_mode =
+      clang::PointerAuthenticationMode::SignAndAuth;
+  if (authentication_mode_int >=
+          static_cast<unsigned>(clang::PointerAuthenticationMode::None) &&
+      authentication_mode_int <=
+          static_cast<unsigned>(
+              clang::PointerAuthenticationMode::SignAndAuth)) {
+    authentication_mode =
+        static_cast<clang::PointerAuthenticationMode>(authentication_mode_int);
+  } else {
+    die.GetDWARF()->GetObjectFile()->GetModule()->ReportError(
+        "[{0:x16}]: invalid pointer authentication mode method {1:x4}",
+        die.GetOffset(), authentication_mode_int);
+  }
+  auto ptr_auth = clang::PointerAuthQualifier::Create(
+      key, addr_disc, extra, authentication_mode, isapointer,
+      authenticates_null_values);
+  return TypePayloadClang(ptr_auth.getAsOpaqueValue());
+}
+
 lldb::TypeSP
 DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc,
                                        const DWARFDIE &die,
@@ -579,6 +613,7 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc,
   LanguageType cu_language = SymbolFileDWARF::GetLanguage(*die.GetCU());
   Type::ResolveState resolve_state = Type::ResolveState::Unresolved;
   Type::EncodingDataType encoding_data_type = Type::eEncodingIsUID;
+  TypePayloadClang payload(GetOwningClangModule(die));
   TypeSP type_sp;
   CompilerType clang_type;
 
@@ -676,6 +711,10 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc,
   case DW_TAG_volatile_type:
     encoding_data_type = Type::eEncodingIsVolatileUID;
     break;
+  case DW_TAG_LLVM_ptrauth_type:
+    encoding_data_type = Type::eEncodingIsLLVMPtrAuthUID;
+    payload = GetPtrAuthMofidierPayload(die);
+    break;
   case DW_TAG_atomic_type:
     encoding_data_type = Type::eEncodingIsAtomicUID;
     break;
@@ -785,8 +824,7 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc,
 
   type_sp = dwarf->MakeType(die.GetID(), attrs.name, attrs.byte_size, nullptr,
                             attrs.type.Reference().GetID(), encoding_data_type,
-                            &attrs.decl, clang_type, resolve_state,
-                            TypePayloadClang(GetOwningClangModule(die)));
+                            &attrs.decl, clang_type, resolve_state, payload);
 
   dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get();
   return type_sp;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h
index dd130977d4b1..b8344f548ac3 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h
@@ -32,7 +32,7 @@ public:
 
 private:
   DWARFCompileUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid,
-                   const DWARFUnitHeader &header,
+                   const llvm::DWARFUnitHeader &header,
                    const llvm::DWARFAbbreviationDeclarationSet &abbrevs,
                    DIERef::Section section, bool is_dwo)
       : DWARFUnit(dwarf, uid, header, abbrevs, section, is_dwo) {}
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h
index 7b58c632c6c5..8c1f932d8c7f 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h
@@ -24,15 +24,15 @@ public:
 
   void Dump(Stream *s) const override;
 
-  uint64_t GetTypeHash() { return m_header.GetTypeHash(); }
+  uint64_t GetTypeHash() { return m_header.getTypeHash(); }
 
-  dw_offset_t GetTypeOffset() { return GetOffset() + m_header.GetTypeOffset(); }
+  dw_offset_t GetTypeOffset() { return GetOffset() + m_header.getTypeOffset(); }
 
   static bool classof(const DWARFUnit *unit) { return unit->IsTypeUnit(); }
 
 private:
   DWARFTypeUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid,
-                const DWARFUnitHeader &header,
+                const llvm::DWARFUnitHeader &header,
                 const llvm::DWARFAbbreviationDeclarationSet &abbrevs,
                 DIERef::Section section, bool is_dwo)
       : DWARFUnit(dwarf, uid, header, abbrevs, section, is_dwo) {}
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
index e28036d34b34..dabc595427df 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
@@ -33,12 +33,12 @@ using namespace lldb_private::plugin::dwarf;
 extern int g_verbose;
 
 DWARFUnit::DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid,
-                     const DWARFUnitHeader &header,
+                     const llvm::DWARFUnitHeader &header,
                      const llvm::DWARFAbbreviationDeclarationSet &abbrevs,
                      DIERef::Section section, bool is_dwo)
     : UserID(uid), m_dwarf(dwarf), m_header(header), m_abbrevs(&abbrevs),
       m_cancel_scopes(false), m_section(section), m_is_dwo(is_dwo),
-      m_has_parsed_non_skeleton_unit(false), m_dwo_id(header.GetDWOId()) {}
+      m_has_parsed_non_skeleton_unit(false), m_dwo_id(header.getDWOId()) {}
 
 DWARFUnit::~DWARFUnit() = default;
 
@@ -345,7 +345,7 @@ void DWARFUnit::ExtractDIEsRWLocked() {
 void DWARFUnit::SetDwoStrOffsetsBase() {
   lldb::offset_t baseOffset = 0;
 
-  if (const llvm::DWARFUnitIndex::Entry *entry = m_header.GetIndexEntry()) {
+  if (const llvm::DWARFUnitIndex::Entry *entry = m_header.getIndexEntry()) {
     if (const auto *contribution =
             entry->getContribution(llvm::DW_SECT_STR_OFFSETS))
       baseOffset = contribution->getOffset();
@@ -489,7 +489,7 @@ ParseListTableHeader(const llvm::DWARFDataExtractor &data, uint64_t offset,
 
 void DWARFUnit::SetLoclistsBase(dw_addr_t loclists_base) {
   uint64_t offset = 0;
-  if (const llvm::DWARFUnitIndex::Entry *entry = m_header.GetIndexEntry()) {
+  if (const llvm::DWARFUnitIndex::Entry *entry = m_header.getIndexEntry()) {
     const auto *contribution = entry->getContribution(llvm::DW_SECT_LOCLISTS);
     if (!contribution) {
       GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError(
@@ -533,7 +533,7 @@ DWARFDataExtractor DWARFUnit::GetLocationData() const {
   DWARFContext &Ctx = GetSymbolFileDWARF().GetDWARFContext();
   const DWARFDataExtractor &data =
       GetVersion() >= 5 ? Ctx.getOrLoadLocListsData() : Ctx.getOrLoadLocData();
-  if (const llvm::DWARFUnitIndex::Entry *entry = m_header.GetIndexEntry()) {
+  if (const llvm::DWARFUnitIndex::Entry *entry = m_header.getIndexEntry()) {
     if (const auto *contribution = entry->getContribution(
             GetVersion() >= 5 ? llvm::DW_SECT_LOCLISTS : llvm::DW_SECT_EXT_LOC))
       return DWARFDataExtractor(data, contribution->getOffset(),
@@ -546,7 +546,7 @@ DWARFDataExtractor DWARFUnit::GetLocationData() const {
 DWARFDataExtractor DWARFUnit::GetRnglistData() const {
   DWARFContext &Ctx = GetSymbolFileDWARF().GetDWARFContext();
   const DWARFDataExtractor &data = Ctx.getOrLoadRngListsData();
-  if (const llvm::DWARFUnitIndex::Entry *entry = m_header.GetIndexEntry()) {
+  if (const llvm::DWARFUnitIndex::Entry *entry = m_header.getIndexEntry()) {
     if (const auto *contribution =
             entry->getContribution(llvm::DW_SECT_RNGLISTS))
       return DWARFDataExtractor(data, contribution->getOffset(),
@@ -924,84 +924,6 @@ const DWARFDebugAranges &DWARFUnit::GetFunctionAranges() {
   return *m_func_aranges_up;
 }
 
-llvm::Error DWARFUnitHeader::ApplyIndexEntry(
-    const llvm::DWARFUnitIndex::Entry *index_entry) {
-  // We should only be calling this function when the index entry is not set and
-  // we have a valid one to set it to.
-  assert(index_entry);
-  assert(!m_index_entry);
-
-  if (m_abbr_offset)
-    return llvm::createStringError(
-        llvm::inconvertibleErrorCode(),
-        "Package unit with a non-zero abbreviation offset");
-
-  auto *unit_contrib = index_entry->getContribution();
-  if (!unit_contrib || unit_contrib->getLength32() != m_length + 4)
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "Inconsistent DWARF package unit index");
-
-  auto *abbr_entry = index_entry->getContribution(llvm::DW_SECT_ABBREV);
-  if (!abbr_entry)
-    return llvm::createStringError(
-        llvm::inconvertibleErrorCode(),
-        "DWARF package index missing abbreviation column");
-
-  m_abbr_offset = abbr_entry->getOffset();
-  m_index_entry = index_entry;
-  return llvm::Error::success();
-}
-
-llvm::Expected<DWARFUnitHeader>
-DWARFUnitHeader::extract(const DWARFDataExtractor &data,
-                         DIERef::Section section, DWARFContext &context,
-                         lldb::offset_t *offset_ptr) {
-  DWARFUnitHeader header;
-  header.m_offset = *offset_ptr;
-  header.m_length = data.GetDWARFInitialLength(offset_ptr);
-  header.m_version = data.GetU16(offset_ptr);
-  if (header.m_version == 5) {
-    header.m_unit_type = data.GetU8(offset_ptr);
-    header.m_addr_size = data.GetU8(offset_ptr);
-    header.m_abbr_offset = data.GetDWARFOffset(offset_ptr);
-    if (header.m_unit_type == llvm::dwarf::DW_UT_skeleton ||
-        header.m_unit_type == llvm::dwarf::DW_UT_split_compile)
-      header.m_dwo_id = data.GetU64(offset_ptr);
-  } else {
-    header.m_abbr_offset = data.GetDWARFOffset(offset_ptr);
-    header.m_addr_size = data.GetU8(offset_ptr);
-    header.m_unit_type =
-        section == DIERef::Section::DebugTypes ? DW_UT_type : DW_UT_compile;
-  }
-
-  if (header.IsTypeUnit()) {
-    header.m_type_hash = data.GetU64(offset_ptr);
-    header.m_type_offset = data.GetDWARFOffset(offset_ptr);
-  }
-
-  bool length_OK = data.ValidOffset(header.GetNextUnitOffset() - 1);
-  bool version_OK = SymbolFileDWARF::SupportedVersion(header.m_version);
-  bool addr_size_OK = (header.m_addr_size == 2) || (header.m_addr_size == 4) ||
-                      (header.m_addr_size == 8);
-  bool type_offset_OK =
-      !header.IsTypeUnit() || (header.m_type_offset <= header.GetLength());
-
-  if (!length_OK)
-    return llvm::make_error<llvm::object::GenericBinaryError>(
-        "Invalid unit length");
-  if (!version_OK)
-    return llvm::make_error<llvm::object::GenericBinaryError>(
-        "Unsupported unit version");
-  if (!addr_size_OK)
-    return llvm::make_error<llvm::object::GenericBinaryError>(
-        "Invalid unit address size");
-  if (!type_offset_OK)
-    return llvm::make_error<llvm::object::GenericBinaryError>(
-        "Type offset out of range");
-
-  return header;
-}
-
 llvm::Expected<DWARFUnitSP>
 DWARFUnit::extract(SymbolFileDWARF &dwarf, user_id_t uid,
                    const DWARFDataExtractor &debug_info,
@@ -1009,26 +931,35 @@ DWARFUnit::extract(SymbolFileDWARF &dwarf, user_id_t uid,
   assert(debug_info.ValidOffset(*offset_ptr));
 
   DWARFContext &context = dwarf.GetDWARFContext();
-  auto expected_header =
-      DWARFUnitHeader::extract(debug_info, section, context, offset_ptr);
-  if (!expected_header)
-    return expected_header.takeError();
+
+  // FIXME: Either properly map between DIERef::Section and
+  // llvm::DWARFSectionKind or switch to llvm's definition entirely.
+  llvm::DWARFSectionKind section_kind_llvm =
+      section == DIERef::Section::DebugInfo
+          ? llvm::DWARFSectionKind::DW_SECT_INFO
+          : llvm::DWARFSectionKind::DW_SECT_EXT_TYPES;
+
+  llvm::DWARFDataExtractor debug_info_llvm = debug_info.GetAsLLVMDWARF();
+  llvm::DWARFUnitHeader header;
+  if (llvm::Error extract_err = header.extract(
+          context.GetAsLLVM(), debug_info_llvm, offset_ptr, section_kind_llvm))
+    return std::move(extract_err);
 
   if (context.isDwo()) {
     const llvm::DWARFUnitIndex::Entry *entry = nullptr;
-    const llvm::DWARFUnitIndex &index = expected_header->IsTypeUnit()
+    const llvm::DWARFUnitIndex &index = header.isTypeUnit()
                                             ? context.GetAsLLVM().getTUIndex()
                                             : context.GetAsLLVM().getCUIndex();
     if (index) {
-      if (expected_header->IsTypeUnit())
-        entry = index.getFromHash(expected_header->GetTypeHash());
-      else if (auto dwo_id = expected_header->GetDWOId())
+      if (header.isTypeUnit())
+        entry = index.getFromHash(header.getTypeHash());
+      else if (auto dwo_id = header.getDWOId())
         entry = index.getFromHash(*dwo_id);
     }
     if (!entry)
-      entry = index.getFromOffset(expected_header->GetOffset());
+      entry = index.getFromOffset(header.getOffset());
     if (entry)
-      if (llvm::Error err = expected_header->ApplyIndexEntry(entry))
+      if (llvm::Error err = header.applyIndexEntry(entry))
         return std::move(err);
   }
 
@@ -1039,13 +970,13 @@ DWARFUnit::extract(SymbolFileDWARF &dwarf, user_id_t uid,
 
   bool abbr_offset_OK =
       dwarf.GetDWARFContext().getOrLoadAbbrevData().ValidOffset(
-          expected_header->GetAbbrOffset());
+          header.getAbbrOffset());
   if (!abbr_offset_OK)
     return llvm::make_error<llvm::object::GenericBinaryError>(
         "Abbreviation offset for unit is not valid");
 
   llvm::Expected<const llvm::DWARFAbbreviationDeclarationSet *> abbrevs_or_err =
-      abbr->getAbbreviationDeclarationSet(expected_header->GetAbbrOffset());
+      abbr->getAbbreviationDeclarationSet(header.getAbbrOffset());
   if (!abbrevs_or_err)
     return abbrevs_or_err.takeError();
 
@@ -1055,11 +986,11 @@ DWARFUnit::extract(SymbolFileDWARF &dwarf, user_id_t uid,
         "No abbrev exists at the specified offset.");
 
   bool is_dwo = dwarf.GetDWARFContext().isDwo();
-  if (expected_header->IsTypeUnit())
-    return DWARFUnitSP(new DWARFTypeUnit(dwarf, uid, *expected_header, *abbrevs,
-                                         section, is_dwo));
-  return DWARFUnitSP(new DWARFCompileUnit(dwarf, uid, *expected_header,
-                                          *abbrevs, section, is_dwo));
+  if (header.isTypeUnit())
+    return DWARFUnitSP(
+        new DWARFTypeUnit(dwarf, uid, header, *abbrevs, section, is_dwo));
+  return DWARFUnitSP(
+      new DWARFCompileUnit(dwarf, uid, header, *abbrevs, section, is_dwo));
 }
 
 const lldb_private::DWARFDataExtractor &DWARFUnit::GetData() const {
@@ -1069,7 +1000,7 @@ const lldb_private::DWARFDataExtractor &DWARFUnit::GetData() const {
 }
 
 uint32_t DWARFUnit::GetHeaderByteSize() const {
-  switch (m_header.GetUnitType()) {
+  switch (m_header.getUnitType()) {
   case llvm::dwarf::DW_UT_compile:
   case llvm::dwarf::DW_UT_partial:
     return GetVersion() < 5 ? 11 : 12;
@@ -1106,7 +1037,7 @@ DWARFUnit::FindRnglistFromOffset(dw_offset_t offset) {
   llvm::DWARFDataExtractor data = GetRnglistData().GetAsLLVMDWARF();
 
   // As DW_AT_rnglists_base may be missing we need to call setAddressSize.
-  data.setAddressSize(m_header.GetAddressByteSize());
+  data.setAddressSize(m_header.getAddressByteSize());
   auto range_list_or_error = GetRnglistTable()->findList(data, offset);
   if (!range_list_or_error)
     return range_list_or_error.takeError();
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
index 28981b51bfcb..85c37971ced8 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
@@ -38,54 +38,6 @@ enum DWARFProducer {
   eProducerOther
 };
 
-/// Base class describing the header of any kind of "unit."  Some information
-/// is specific to certain unit types.  We separate this class out so we can
-/// parse the header before deciding what specific kind of unit to construct.
-class DWARFUnitHeader {
-  dw_offset_t m_offset = 0;
-  dw_offset_t m_length = 0;
-  uint16_t m_version = 0;
-  dw_offset_t m_abbr_offset = 0;
-
-  const llvm::DWARFUnitIndex::Entry *m_index_entry = nullptr;
-
-  uint8_t m_unit_type = 0;
-  uint8_t m_addr_size = 0;
-
-  uint64_t m_type_hash = 0;
-  uint32_t m_type_offset = 0;
-
-  std::optional<uint64_t> m_dwo_id;
-
-  DWARFUnitHeader() = default;
-
-public:
-  dw_offset_t GetOffset() const { return m_offset; }
-  uint16_t GetVersion() const { return m_version; }
-  uint16_t GetAddressByteSize() const { return m_addr_size; }
-  dw_offset_t GetLength() const { return m_length; }
-  dw_offset_t GetAbbrOffset() const { return m_abbr_offset; }
-  uint8_t GetUnitType() const { return m_unit_type; }
-  const llvm::DWARFUnitIndex::Entry *GetIndexEntry() const {
-    return m_index_entry;
-  }
-  uint64_t GetTypeHash() const { return m_type_hash; }
-  dw_offset_t GetTypeOffset() const { return m_type_offset; }
-  std::optional<uint64_t> GetDWOId() const { return m_dwo_id; }
-  bool IsTypeUnit() const {
-    return m_unit_type == llvm::dwarf::DW_UT_type ||
-           m_unit_type == llvm::dwarf::DW_UT_split_type;
-  }
-  dw_offset_t GetNextUnitOffset() const { return m_offset + m_length + 4; }
-
-  llvm::Error ApplyIndexEntry(const llvm::DWARFUnitIndex::Entry *index_entry);
-
-  static llvm::Expected<DWARFUnitHeader> extract(const DWARFDataExtractor &data,
-                                                 DIERef::Section section,
-                                                 DWARFContext &dwarf_context,
-                                                 lldb::offset_t *offset_ptr);
-};
-
 class DWARFUnit : public UserID {
   using die_iterator_range =
       llvm::iterator_range<DWARFDebugInfoEntry::collection::iterator>;
@@ -105,7 +57,7 @@ public:
   /// the DWO ID in the compile unit header and we sometimes only want to access
   /// this cheap value without causing the more expensive attribute fetches that
   /// GetDWOId() uses.
-  std::optional<uint64_t> GetHeaderDWOId() { return m_header.GetDWOId(); }
+  std::optional<uint64_t> GetHeaderDWOId() { return m_header.getDWOId(); }
   void ExtractUnitDIEIfNeeded();
   void ExtractUnitDIENoDwoIfNeeded();
   void ExtractDIEsIfNeeded();
@@ -143,7 +95,7 @@ public:
   uint32_t GetHeaderByteSize() const;
 
   // Offset of the initial length field.
-  dw_offset_t GetOffset() const { return m_header.GetOffset(); }
+  dw_offset_t GetOffset() const { return m_header.getOffset(); }
   /// Get the size in bytes of the length field in the header.
   ///
   /// In DWARF32 this is just 4 bytes
@@ -159,15 +111,15 @@ public:
   dw_offset_t GetFirstDIEOffset() const {
     return GetOffset() + GetHeaderByteSize();
   }
-  dw_offset_t GetNextUnitOffset() const { return m_header.GetNextUnitOffset(); }
+  dw_offset_t GetNextUnitOffset() const { return m_header.getNextUnitOffset(); }
   // Size of the CU data (without initial length and without header).
   size_t GetDebugInfoSize() const;
   // Size of the CU data incl. header but without initial length.
-  dw_offset_t GetLength() const { return m_header.GetLength(); }
-  uint16_t GetVersion() const { return m_header.GetVersion(); }
+  dw_offset_t GetLength() const { return m_header.getLength(); }
+  uint16_t GetVersion() const { return m_header.getVersion(); }
   const llvm::DWARFAbbreviationDeclarationSet *GetAbbreviations() const;
   dw_offset_t GetAbbrevOffset() const;
-  uint8_t GetAddressByteSize() const { return m_header.GetAddressByteSize(); }
+  uint8_t GetAddressByteSize() const { return m_header.getAddressByteSize(); }
   dw_addr_t GetAddrBase() const { return m_addr_base.value_or(0); }
   dw_addr_t GetBaseAddress() const { return m_base_addr; }
   dw_offset_t GetLineTableOffset();
@@ -250,8 +202,8 @@ public:
 
   DIERef::Section GetDebugSection() const { return m_section; }
 
-  uint8_t GetUnitType() const { return m_header.GetUnitType(); }
-  bool IsTypeUnit() const { return m_header.IsTypeUnit(); }
+  uint8_t GetUnitType() const { return m_header.getUnitType(); }
+  bool IsTypeUnit() const { return m_header.isTypeUnit(); }
   /// Note that this check only works for DWARF5+.
   bool IsSkeletonUnit() const {
     return GetUnitType() == llvm::dwarf::DW_UT_skeleton;
@@ -320,7 +272,7 @@ public:
 
 protected:
   DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid,
-            const DWARFUnitHeader &header,
+            const llvm::DWARFUnitHeader &header,
             const llvm::DWARFAbbreviationDeclarationSet &abbrevs,
             DIERef::Section section, bool is_dwo);
 
@@ -352,7 +304,7 @@ protected:
 
   SymbolFileDWARF &m_dwarf;
   std::shared_ptr<DWARFUnit> m_dwo;
-  DWARFUnitHeader m_header;
+  llvm::DWARFUnitHeader m_header;
   const llvm::DWARFAbbreviationDeclarationSet *m_abbrevs = nullptr;
   lldb_private::CompileUnit *m_lldb_cu = nullptr;
   // If this is a DWO file, we have a backlink to our skeleton compile unit.
diff --git a/lldb/source/Plugins/SymbolLocator/Default/SymbolLocatorDefault.cpp b/lldb/source/Plugins/SymbolLocator/Default/SymbolLocatorDefault.cpp
index 6f0126b16cdc..edb1d59cf42f 100644
--- a/lldb/source/Plugins/SymbolLocator/Default/SymbolLocatorDefault.cpp
+++ b/lldb/source/Plugins/SymbolLocator/Default/SymbolLocatorDefault.cpp
@@ -36,6 +36,10 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ThreadPool.h"
 
+#if defined(__FreeBSD__)
+#include <sys/sysctl.h>
+#endif
+
 // From MacOSX system header "mach/machine.h"
 typedef int cpu_type_t;
 typedef int cpu_subtype_t;
@@ -141,6 +145,24 @@ std::optional<FileSpec> SymbolLocatorDefault::LocateExecutableSymbolFile(
       FileSystem::Instance().Resolve(file_spec);
       debug_file_search_paths.AppendIfUnique(file_spec);
     }
+#if defined(__FreeBSD__)
+    // Add $LOCALBASE/lib/debug directory, where LOCALBASE is
+    // usually /usr/local, but may be adjusted by the end user.
+    {
+      int mib[2];
+      char buf[PATH_MAX];
+      size_t len = PATH_MAX;
+
+      mib[0] = CTL_USER;
+      mib[1] = USER_LOCALBASE;
+      if (::sysctl(mib, 2, buf, &len, NULL, 0) == 0) {
+        FileSpec file_spec("/lib/debug");
+        file_spec.PrependPathComponent(StringRef(buf));
+        FileSystem::Instance().Resolve(file_spec);
+        debug_file_search_paths.AppendIfUnique(file_spec);
+      }
+    }
+#endif // __FreeBSD__
 #endif
 #endif // _WIN32
   }
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 662da313af59..3bdb288e97dd 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -9,6 +9,7 @@
 #include "TypeSystemClang.h"
 
 #include "clang/AST/DeclBase.h"
+#include "clang/AST/ExprCXX.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/FormatAdapters.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -1148,6 +1149,8 @@ CompilerType TypeSystemClang::GetTypeForDecl(clang::NamedDecl *decl) {
     return GetTypeForDecl(interface_decl);
   if (clang::TagDecl *tag_decl = llvm::dyn_cast<clang::TagDecl>(decl))
     return GetTypeForDecl(tag_decl);
+  if (clang::ValueDecl *value_decl = llvm::dyn_cast<clang::ValueDecl>(decl))
+    return GetTypeForDecl(value_decl);
   return CompilerType();
 }
 
@@ -1159,6 +1162,10 @@ CompilerType TypeSystemClang::GetTypeForDecl(ObjCInterfaceDecl *decl) {
   return GetType(getASTContext().getObjCInterfaceType(decl));
 }
 
+CompilerType TypeSystemClang::GetTypeForDecl(clang::ValueDecl *value_decl) {
+  return GetType(value_decl->getType());
+}
+
 #pragma mark Structure, Unions, Classes
 
 void TypeSystemClang::SetOwningModule(clang::Decl *decl,
@@ -1230,7 +1237,7 @@ CompilerType TypeSystemClang::CreateRecordType(
   // complete definition just in case.
 
   bool has_name = !name.empty();
-  CXXRecordDecl *decl = CXXRecordDecl::CreateDeserialized(ast, 0);
+  CXXRecordDecl *decl = CXXRecordDecl::CreateDeserialized(ast, GlobalDeclID());
   decl->setTagKind(static_cast<TagDecl::TagKind>(kind));
   decl->setDeclContext(decl_ctx);
   if (has_name)
@@ -1402,7 +1409,7 @@ clang::FunctionTemplateDecl *TypeSystemClang::CreateFunctionTemplateDecl(
   TemplateParameterList *template_param_list = CreateTemplateParameterList(
       ast, template_param_infos, template_param_decls);
   FunctionTemplateDecl *func_tmpl_decl =
-      FunctionTemplateDecl::CreateDeserialized(ast, 0);
+      FunctionTemplateDecl::CreateDeserialized(ast, GlobalDeclID());
   func_tmpl_decl->setDeclContext(decl_ctx);
   func_tmpl_decl->setLocation(func_decl->getLocation());
   func_tmpl_decl->setDeclName(func_decl->getDeclName());
@@ -1564,7 +1571,8 @@ ClassTemplateDecl *TypeSystemClang::CreateClassTemplateDecl(
   TemplateParameterList *template_param_list = CreateTemplateParameterList(
       ast, template_param_infos, template_param_decls);
 
-  CXXRecordDecl *template_cxx_decl = CXXRecordDecl::CreateDeserialized(ast, 0);
+  CXXRecordDecl *template_cxx_decl =
+      CXXRecordDecl::CreateDeserialized(ast, GlobalDeclID());
   template_cxx_decl->setTagKind(static_cast<TagDecl::TagKind>(kind));
   // What decl context do we use here? TU? The actual decl context?
   template_cxx_decl->setDeclContext(decl_ctx);
@@ -1581,7 +1589,8 @@ ClassTemplateDecl *TypeSystemClang::CreateClassTemplateDecl(
   // template_cxx_decl->startDefinition();
   // template_cxx_decl->completeDefinition();
 
-  class_template_decl = ClassTemplateDecl::CreateDeserialized(ast, 0);
+  class_template_decl =
+      ClassTemplateDecl::CreateDeserialized(ast, GlobalDeclID());
   // What decl context do we use here? TU? The actual decl context?
   class_template_decl->setDeclContext(decl_ctx);
   class_template_decl->setDeclName(decl_name);
@@ -1642,7 +1651,7 @@ TypeSystemClang::CreateClassTemplateSpecializationDecl(
         ast, template_param_infos.GetParameterPackArgs());
   }
   ClassTemplateSpecializationDecl *class_template_specialization_decl =
-      ClassTemplateSpecializationDecl::CreateDeserialized(ast, 0);
+      ClassTemplateSpecializationDecl::CreateDeserialized(ast, GlobalDeclID());
   class_template_specialization_decl->setTagKind(
       static_cast<TagDecl::TagKind>(kind));
   class_template_specialization_decl->setDeclContext(decl_ctx);
@@ -1792,7 +1801,8 @@ CompilerType TypeSystemClang::CreateObjCClass(
   if (!decl_ctx)
     decl_ctx = ast.getTranslationUnitDecl();
 
-  ObjCInterfaceDecl *decl = ObjCInterfaceDecl::CreateDeserialized(ast, 0);
+  ObjCInterfaceDecl *decl =
+      ObjCInterfaceDecl::CreateDeserialized(ast, GlobalDeclID());
   decl->setDeclContext(decl_ctx);
   decl->setDeclName(&ast.Idents.get(name));
   /*isForwardDecl,*/
@@ -1900,7 +1910,7 @@ TypeSystemClang::CreateBlockDeclaration(clang::DeclContext *ctx,
                                         OptionalClangModuleID owning_module) {
   if (ctx) {
     clang::BlockDecl *decl =
-        clang::BlockDecl::CreateDeserialized(getASTContext(), 0);
+        clang::BlockDecl::CreateDeserialized(getASTContext(), GlobalDeclID());
     decl->setDeclContext(ctx);
     ctx->addDecl(decl);
     SetOwningModule(decl, owning_module);
@@ -1969,7 +1979,7 @@ clang::VarDecl *TypeSystemClang::CreateVariableDeclaration(
     const char *name, clang::QualType type) {
   if (decl_context) {
     clang::VarDecl *var_decl =
-        clang::VarDecl::CreateDeserialized(getASTContext(), 0);
+        clang::VarDecl::CreateDeserialized(getASTContext(), GlobalDeclID());
     var_decl->setDeclContext(decl_context);
     if (name && name[0])
       var_decl->setDeclName(&getASTContext().Idents.getOwn(name));
@@ -2129,7 +2139,7 @@ FunctionDecl *TypeSystemClang::CreateFunctionDeclaration(
 
   clang::DeclarationName declarationName =
       GetDeclarationName(name, function_clang_type);
-  func_decl = FunctionDecl::CreateDeserialized(ast, 0);
+  func_decl = FunctionDecl::CreateDeserialized(ast, GlobalDeclID());
   func_decl->setDeclContext(decl_ctx);
   func_decl->setDeclName(declarationName);
   func_decl->setType(ClangUtil::GetQualType(function_clang_type));
@@ -2190,7 +2200,7 @@ ParmVarDecl *TypeSystemClang::CreateParameterDeclaration(
     const char *name, const CompilerType &param_type, int storage,
     bool add_decl) {
   ASTContext &ast = getASTContext();
-  auto *decl = ParmVarDecl::CreateDeserialized(ast, 0);
+  auto *decl = ParmVarDecl::CreateDeserialized(ast, GlobalDeclID());
   decl->setDeclContext(decl_ctx);
   if (name && name[0])
     decl->setDeclName(&ast.Idents.get(name));
@@ -2295,7 +2305,7 @@ CompilerType TypeSystemClang::CreateEnumerationType(
 
   // TODO: ask about these...
   //    const bool IsFixed = false;
-  EnumDecl *enum_decl = EnumDecl::CreateDeserialized(ast, 0);
+  EnumDecl *enum_decl = EnumDecl::CreateDeserialized(ast, GlobalDeclID());
   enum_decl->setDeclContext(decl_ctx);
   if (!name.empty())
     enum_decl->setDeclName(&ast.Idents.get(name));
@@ -2913,6 +2923,35 @@ bool TypeSystemClang::IsCStringType(lldb::opaque_compiler_type_t type,
   return false;
 }
 
+unsigned TypeSystemClang::GetPtrAuthKey(lldb::opaque_compiler_type_t type) {
+  if (type) {
+    clang::QualType qual_type(GetCanonicalQualType(type));
+    if (auto pointer_auth = qual_type.getPointerAuth())
+      return pointer_auth.getKey();
+  }
+  return 0;
+}
+
+unsigned
+TypeSystemClang::GetPtrAuthDiscriminator(lldb::opaque_compiler_type_t type) {
+  if (type) {
+    clang::QualType qual_type(GetCanonicalQualType(type));
+    if (auto pointer_auth = qual_type.getPointerAuth())
+      return pointer_auth.getExtraDiscriminator();
+  }
+  return 0;
+}
+
+bool TypeSystemClang::GetPtrAuthAddressDiversity(
+    lldb::opaque_compiler_type_t type) {
+  if (type) {
+    clang::QualType qual_type(GetCanonicalQualType(type));
+    if (auto pointer_auth = qual_type.getPointerAuth())
+      return pointer_auth.isAddressDiscriminated();
+  }
+  return false;
+}
+
 bool TypeSystemClang::IsFunctionType(lldb::opaque_compiler_type_t type) {
   auto isFunctionType = [&](clang::QualType qual_type) {
     return qual_type->isFunctionType();
@@ -4502,6 +4541,19 @@ TypeSystemClang::AddConstModifier(lldb::opaque_compiler_type_t type) {
 }
 
 CompilerType
+TypeSystemClang::AddPtrAuthModifier(lldb::opaque_compiler_type_t type,
+                                    uint32_t payload) {
+  if (type) {
+    clang::ASTContext &clang_ast = getASTContext();
+    auto pauth = PointerAuthQualifier::fromOpaqueValue(payload);
+    clang::QualType result =
+        clang_ast.getPointerAuthType(GetQualType(type), pauth);
+    return GetType(result);
+  }
+  return CompilerType();
+}
+
+CompilerType
 TypeSystemClang::AddVolatileModifier(lldb::opaque_compiler_type_t type) {
   if (type) {
     clang::QualType result(GetQualType(type));
@@ -4534,7 +4586,7 @@ CompilerType TypeSystemClang::CreateTypedef(
       decl_ctx = getASTContext().getTranslationUnitDecl();
 
     clang::TypedefDecl *decl =
-        clang::TypedefDecl::CreateDeserialized(clang_ast, 0);
+        clang::TypedefDecl::CreateDeserialized(clang_ast, GlobalDeclID());
     decl->setDeclContext(decl_ctx);
     decl->setDeclName(&clang_ast.Idents.get(typedef_name));
     decl->setTypeSourceInfo(clang_ast.getTrivialTypeSourceInfo(qual_type));
@@ -4851,7 +4903,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type,
     case clang::BuiltinType::Kind::OCLQueue:
     case clang::BuiltinType::Kind::OCLReserveID:
     case clang::BuiltinType::Kind::OCLSampler:
-    case clang::BuiltinType::Kind::OMPArraySection:
+    case clang::BuiltinType::Kind::ArraySection:
     case clang::BuiltinType::Kind::OMPArrayShaping:
     case clang::BuiltinType::Kind::OMPIterator:
     case clang::BuiltinType::Kind::Overload:
@@ -5903,6 +5955,36 @@ CompilerType TypeSystemClang::GetVirtualBaseClassAtIndex(
   return CompilerType();
 }
 
+CompilerDecl
+TypeSystemClang::GetStaticFieldWithName(lldb::opaque_compiler_type_t type,
+                                        llvm::StringRef name) {
+  clang::QualType qual_type = RemoveWrappingTypes(GetCanonicalQualType(type));
+  switch (qual_type->getTypeClass()) {
+  case clang::Type::Record: {
+    if (!GetCompleteType(type))
+      return CompilerDecl();
+
+    const clang::RecordType *record_type =
+        llvm::cast<clang::RecordType>(qual_type.getTypePtr());
+    const clang::RecordDecl *record_decl = record_type->getDecl();
+
+    clang::DeclarationName decl_name(&getASTContext().Idents.get(name));
+    for (NamedDecl *decl : record_decl->lookup(decl_name)) {
+      auto *var_decl = dyn_cast<clang::VarDecl>(decl);
+      if (!var_decl || var_decl->getStorageClass() != clang::SC_Static)
+        continue;
+
+      return CompilerDecl(this, var_decl);
+    }
+    break;
+  }
+
+  default:
+    break;
+  }
+  return CompilerDecl();
+}
+
 // If a pointer to a pointee type (the clang_type arg) says that it has no
 // children, then we either need to trust it, or override it and return a
 // different result. For example, an "int *" has one child that is an integer,
@@ -5973,7 +6055,7 @@ uint32_t TypeSystemClang::GetNumPointeeChildren(clang::QualType type) {
     case clang::BuiltinType::ARCUnbridgedCast:
     case clang::BuiltinType::PseudoObject:
     case clang::BuiltinType::BuiltinFn:
-    case clang::BuiltinType::OMPArraySection:
+    case clang::BuiltinType::ArraySection:
       return 1;
     default:
       return 0;
@@ -7291,7 +7373,7 @@ clang::FieldDecl *TypeSystemClang::AddFieldToRecordType(
 
   clang::RecordDecl *record_decl = ast->GetAsRecordDecl(type);
   if (record_decl) {
-    field = clang::FieldDecl::CreateDeserialized(clang_ast, 0);
+    field = clang::FieldDecl::CreateDeserialized(clang_ast, GlobalDeclID());
     field->setDeclContext(record_decl);
     field->setDeclName(ident);
     field->setType(ClangUtil::GetQualType(field_clang_type));
@@ -7338,7 +7420,8 @@ clang::FieldDecl *TypeSystemClang::AddFieldToRecordType(
 
       field_clang_type.GetCompleteType();
 
-      auto *ivar = clang::ObjCIvarDecl::CreateDeserialized(clang_ast, 0);
+      auto *ivar =
+          clang::ObjCIvarDecl::CreateDeserialized(clang_ast, GlobalDeclID());
       ivar->setDeclContext(class_interface_decl);
       ivar->setDeclName(ident);
       ivar->setType(ClangUtil::GetQualType(field_clang_type));
@@ -7504,7 +7587,8 @@ clang::VarDecl *TypeSystemClang::AddVariableToRecordType(
   if (!name.empty())
     ident = &ast->getASTContext().Idents.get(name);
 
-  var_decl = clang::VarDecl::CreateDeserialized(ast->getASTContext(), 0);
+  var_decl =
+      clang::VarDecl::CreateDeserialized(ast->getASTContext(), GlobalDeclID());
   var_decl->setDeclContext(record_decl);
   var_decl->setDeclName(ident);
   var_decl->setType(ClangUtil::GetQualType(var_type));
@@ -7605,8 +7689,8 @@ clang::CXXMethodDecl *TypeSystemClang::AddMethodToCXXRecordType(
                                     : clang::ExplicitSpecKind::ResolvedFalse);
 
   if (name.starts_with("~")) {
-    cxx_dtor_decl =
-        clang::CXXDestructorDecl::CreateDeserialized(getASTContext(), 0);
+    cxx_dtor_decl = clang::CXXDestructorDecl::CreateDeserialized(
+        getASTContext(), GlobalDeclID());
     cxx_dtor_decl->setDeclContext(cxx_record_decl);
     cxx_dtor_decl->setDeclName(
         getASTContext().DeclarationNames.getCXXDestructorName(
@@ -7618,7 +7702,7 @@ clang::CXXMethodDecl *TypeSystemClang::AddMethodToCXXRecordType(
     cxx_method_decl = cxx_dtor_decl;
   } else if (decl_name == cxx_record_decl->getDeclName()) {
     cxx_ctor_decl = clang::CXXConstructorDecl::CreateDeserialized(
-        getASTContext(), 0, 0);
+        getASTContext(), GlobalDeclID(), 0);
     cxx_ctor_decl->setDeclContext(cxx_record_decl);
     cxx_ctor_decl->setDeclName(
         getASTContext().DeclarationNames.getCXXConstructorName(
@@ -7644,8 +7728,8 @@ clang::CXXMethodDecl *TypeSystemClang::AddMethodToCXXRecordType(
         if (!TypeSystemClang::CheckOverloadedOperatorKindParameterCount(
                 is_method, op_kind, num_params))
           return nullptr;
-        cxx_method_decl =
-            clang::CXXMethodDecl::CreateDeserialized(getASTContext(), 0);
+        cxx_method_decl = clang::CXXMethodDecl::CreateDeserialized(
+            getASTContext(), GlobalDeclID());
         cxx_method_decl->setDeclContext(cxx_record_decl);
         cxx_method_decl->setDeclName(
             getASTContext().DeclarationNames.getCXXOperatorName(op_kind));
@@ -7656,7 +7740,8 @@ clang::CXXMethodDecl *TypeSystemClang::AddMethodToCXXRecordType(
       } else if (num_params == 0) {
         // Conversion operators don't take params...
         auto *cxx_conversion_decl =
-            clang::CXXConversionDecl::CreateDeserialized(getASTContext(), 0);
+            clang::CXXConversionDecl::CreateDeserialized(getASTContext(),
+                                                         GlobalDeclID());
         cxx_conversion_decl->setDeclContext(cxx_record_decl);
         cxx_conversion_decl->setDeclName(
             getASTContext().DeclarationNames.getCXXConversionFunctionName(
@@ -7671,8 +7756,8 @@ clang::CXXMethodDecl *TypeSystemClang::AddMethodToCXXRecordType(
     }
 
     if (cxx_method_decl == nullptr) {
-      cxx_method_decl =
-          clang::CXXMethodDecl::CreateDeserialized(getASTContext(), 0);
+      cxx_method_decl = clang::CXXMethodDecl::CreateDeserialized(
+          getASTContext(), GlobalDeclID());
       cxx_method_decl->setDeclContext(cxx_record_decl);
       cxx_method_decl->setDeclName(decl_name);
       cxx_method_decl->setType(method_qual_type);
@@ -7855,7 +7940,7 @@ bool TypeSystemClang::AddObjCClassProperty(
         ClangUtil::GetQualType(property_clang_type));
 
   clang::ObjCPropertyDecl *property_decl =
-      clang::ObjCPropertyDecl::CreateDeserialized(clang_ast, 0);
+      clang::ObjCPropertyDecl::CreateDeserialized(clang_ast, GlobalDeclID());
   property_decl->setDeclContext(class_interface_decl);
   property_decl->setDeclName(&clang_ast.Idents.get(property_name));
   property_decl->setType(ivar_decl
@@ -7944,7 +8029,8 @@ bool TypeSystemClang::AddObjCClassProperty(
         clang::ObjCImplementationControl::None;
     const bool HasRelatedResultType = false;
 
-    getter = clang::ObjCMethodDecl::CreateDeserialized(clang_ast, 0);
+    getter =
+        clang::ObjCMethodDecl::CreateDeserialized(clang_ast, GlobalDeclID());
     getter->setDeclName(getter_sel);
     getter->setReturnType(ClangUtil::GetQualType(property_clang_type_to_access));
     getter->setDeclContext(class_interface_decl);
@@ -7986,7 +8072,8 @@ bool TypeSystemClang::AddObjCClassProperty(
         clang::ObjCImplementationControl::None;
     const bool HasRelatedResultType = false;
 
-    setter = clang::ObjCMethodDecl::CreateDeserialized(clang_ast, 0);
+    setter =
+        clang::ObjCMethodDecl::CreateDeserialized(clang_ast, GlobalDeclID());
     setter->setDeclName(setter_sel);
     setter->setReturnType(result_type);
     setter->setDeclContext(class_interface_decl);
@@ -8114,7 +8201,8 @@ clang::ObjCMethodDecl *TypeSystemClang::AddMethodToObjCObjectType(
     return nullptr; // some debug information is corrupt.  We are not going to
                     // deal with it.
 
-  auto *objc_method_decl = clang::ObjCMethodDecl::CreateDeserialized(ast, 0);
+  auto *objc_method_decl =
+      clang::ObjCMethodDecl::CreateDeserialized(ast, GlobalDeclID());
   objc_method_decl->setDeclName(method_selector);
   objc_method_decl->setReturnType(method_function_prototype->getReturnType());
   objc_method_decl->setDeclContext(
@@ -8360,7 +8448,8 @@ clang::EnumConstantDecl *TypeSystemClang::AddEnumerationValueToEnumerationType(
     return nullptr;
 
   clang::EnumConstantDecl *enumerator_decl =
-      clang::EnumConstantDecl::CreateDeserialized(getASTContext(), 0);
+      clang::EnumConstantDecl::CreateDeserialized(getASTContext(),
+                                                  GlobalDeclID());
   enumerator_decl->setDeclContext(enutype->getDecl());
   if (name && name[0])
     enumerator_decl->setDeclName(&getASTContext().Idents.get(name));
@@ -9077,6 +9166,21 @@ CompilerType TypeSystemClang::DeclGetFunctionArgumentType(void *opaque_decl,
   return CompilerType();
 }
 
+Scalar TypeSystemClang::DeclGetConstantValue(void *opaque_decl) {
+  clang::Decl *decl = static_cast<clang::Decl *>(opaque_decl);
+  clang::VarDecl *var_decl = llvm::dyn_cast<clang::VarDecl>(decl);
+  if (!var_decl)
+    return Scalar();
+  clang::Expr *init_expr = var_decl->getInit();
+  if (!init_expr)
+    return Scalar();
+  std::optional<llvm::APSInt> value =
+      init_expr->getIntegerConstantExpr(getASTContext());
+  if (!value)
+    return Scalar();
+  return Scalar(*value);
+}
+
 // CompilerDeclContext functions
 
 std::vector<CompilerDecl> TypeSystemClang::DeclContextFindDeclByName(
@@ -9534,7 +9638,7 @@ void ScratchTypeSystemClang::Dump(llvm::raw_ostream &output) {
 }
 
 UserExpression *ScratchTypeSystemClang::GetUserExpression(
-    llvm::StringRef expr, llvm::StringRef prefix, lldb::LanguageType language,
+    llvm::StringRef expr, llvm::StringRef prefix, SourceLanguage language,
     Expression::ResultType desired_type,
     const EvaluateExpressionOptions &options, ValueObject *ctx_obj) {
   TargetSP target_sp = m_target_wp.lock();
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
index 68b82e9688f1..59ca69622d9e 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
@@ -22,6 +22,7 @@
 
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTFwd.h"
+#include "clang/AST/Decl.h"
 #include "clang/AST/TemplateBase.h"
 #include "clang/AST/Type.h"
 #include "clang/Basic/TargetInfo.h"
@@ -66,11 +67,13 @@ public:
 
 /// The implementation of lldb::Type's m_payload field for TypeSystemClang.
 class TypePayloadClang {
-  /// The Layout is as follows:
+  /// The payload is used for typedefs and ptrauth types.
+  /// For typedefs, the Layout is as follows:
   /// \verbatim
   /// bit 0..30 ... Owning Module ID.
   /// bit 31 ...... IsCompleteObjCClass.
   /// \endverbatim
+  /// For ptrauth types, we store the PointerAuthQualifier as an opaque value.
   Type::Payload m_payload = 0;
 
 public:
@@ -251,6 +254,8 @@ public:
 
   CompilerType GetTypeForDecl(clang::ObjCInterfaceDecl *objc_decl);
 
+  CompilerType GetTypeForDecl(clang::ValueDecl *value_decl);
+
   template <typename RecordDeclType>
   CompilerType
   GetTypeForIdentifier(llvm::StringRef type_name,
@@ -559,6 +564,8 @@ public:
   std::vector<lldb_private::CompilerContext>
   DeclGetCompilerContext(void *opaque_decl) override;
 
+  Scalar DeclGetConstantValue(void *opaque_decl) override;
+
   CompilerType GetTypeForDecl(void *opaque_decl) override;
 
   // CompilerDeclContext override functions
@@ -648,6 +655,10 @@ public:
   bool IsFloatingPointType(lldb::opaque_compiler_type_t type, uint32_t &count,
                            bool &is_complex) override;
 
+  unsigned GetPtrAuthKey(lldb::opaque_compiler_type_t type) override;
+  unsigned GetPtrAuthDiscriminator(lldb::opaque_compiler_type_t type) override;
+  bool GetPtrAuthAddressDiversity(lldb::opaque_compiler_type_t type) override;
+
   bool IsFunctionType(lldb::opaque_compiler_type_t type) override;
 
   uint32_t IsHomogeneousAggregate(lldb::opaque_compiler_type_t type,
@@ -788,6 +799,9 @@ public:
 
   CompilerType AddConstModifier(lldb::opaque_compiler_type_t type) override;
 
+  CompilerType AddPtrAuthModifier(lldb::opaque_compiler_type_t type,
+                                  uint32_t payload) override;
+
   CompilerType AddVolatileModifier(lldb::opaque_compiler_type_t type) override;
 
   CompilerType AddRestrictModifier(lldb::opaque_compiler_type_t type) override;
@@ -868,6 +882,9 @@ public:
                                           size_t idx,
                                           uint32_t *bit_offset_ptr) override;
 
+  CompilerDecl GetStaticFieldWithName(lldb::opaque_compiler_type_t type,
+                                      llvm::StringRef name) override;
+
   static uint32_t GetNumPointeeChildren(clang::QualType type);
 
   CompilerType GetChildCompilerTypeAtIndex(
@@ -1272,12 +1289,12 @@ public:
   /// \see lldb_private::TypeSystem::Dump
   void Dump(llvm::raw_ostream &output) override;
 
-  UserExpression *
-  GetUserExpression(llvm::StringRef expr, llvm::StringRef prefix,
-                    lldb::LanguageType language,
-                    Expression::ResultType desired_type,
-                    const EvaluateExpressionOptions &options,
-                    ValueObject *ctx_obj) override;
+  UserExpression *GetUserExpression(llvm::StringRef expr,
+                                    llvm::StringRef prefix,
+                                    SourceLanguage language,
+                                    Expression::ResultType desired_type,
+                                    const EvaluateExpressionOptions &options,
+                                    ValueObject *ctx_obj) override;
 
   FunctionCaller *GetFunctionCaller(const CompilerType &return_type,
                                     const Address &function_address,
diff --git a/lldb/source/Symbol/CompilerDecl.cpp b/lldb/source/Symbol/CompilerDecl.cpp
index 0eb630e5b9e1..5fa0a32f041a 100644
--- a/lldb/source/Symbol/CompilerDecl.cpp
+++ b/lldb/source/Symbol/CompilerDecl.cpp
@@ -9,6 +9,7 @@
 #include "lldb/Symbol/CompilerDecl.h"
 #include "lldb/Symbol/CompilerDeclContext.h"
 #include "lldb/Symbol/TypeSystem.h"
+#include "lldb/Utility/Scalar.h"
 
 using namespace lldb_private;
 
@@ -24,6 +25,10 @@ CompilerDeclContext CompilerDecl::GetDeclContext() const {
   return m_type_system->DeclGetDeclContext(m_opaque_decl);
 }
 
+CompilerType CompilerDecl::GetType() const {
+  return m_type_system->GetTypeForDecl(m_opaque_decl);
+}
+
 CompilerType CompilerDecl::GetFunctionReturnType() const {
   return m_type_system->DeclGetFunctionReturnType(m_opaque_decl);
 }
@@ -52,3 +57,7 @@ std::vector<lldb_private::CompilerContext>
 CompilerDecl::GetCompilerContext() const {
   return m_type_system->DeclGetCompilerContext(m_opaque_decl);
 }
+
+Scalar CompilerDecl::GetConstantValue() const {
+  return m_type_system->DeclGetConstantValue(m_opaque_decl);
+}
diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp
index 96e74b890d2d..072dbccec44f 100644
--- a/lldb/source/Symbol/CompilerType.cpp
+++ b/lldb/source/Symbol/CompilerType.cpp
@@ -108,6 +108,27 @@ bool CompilerType::IsConst() const {
   return false;
 }
 
+unsigned CompilerType::GetPtrAuthKey() const {
+  if (IsValid())
+    if (auto type_system_sp = GetTypeSystem())
+      return type_system_sp->GetPtrAuthKey(m_type);
+  return 0;
+}
+
+unsigned CompilerType::GetPtrAuthDiscriminator() const {
+  if (IsValid())
+    if (auto type_system_sp = GetTypeSystem())
+      return type_system_sp->GetPtrAuthDiscriminator(m_type);
+  return 0;
+}
+
+bool CompilerType::GetPtrAuthAddressDiversity() const {
+  if (IsValid())
+    if (auto type_system_sp = GetTypeSystem())
+      return type_system_sp->GetPtrAuthAddressDiversity(m_type);
+  return false;
+}
+
 bool CompilerType::IsFunctionType() const {
   if (IsValid())
     if (auto type_system_sp = GetTypeSystem())
@@ -664,6 +685,13 @@ CompilerType CompilerType::GetPointerType() const {
   return CompilerType();
 }
 
+CompilerType CompilerType::AddPtrAuthModifier(uint32_t payload) const {
+  if (IsValid())
+    if (auto type_system_sp = GetTypeSystem())
+      return type_system_sp->AddPtrAuthModifier(m_type, payload);
+  return CompilerType();
+}
+
 CompilerType CompilerType::GetLValueReferenceType() const {
   if (IsValid())
     if (auto type_system_sp = GetTypeSystem())
@@ -849,6 +877,12 @@ CompilerType::GetVirtualBaseClassAtIndex(size_t idx,
   return CompilerType();
 }
 
+CompilerDecl CompilerType::GetStaticFieldWithName(llvm::StringRef name) const {
+  if (IsValid())
+    return GetTypeSystem()->GetStaticFieldWithName(m_type, name);
+  return CompilerDecl();
+}
+
 uint32_t CompilerType::GetIndexOfFieldWithName(
     const char *name, CompilerType *field_compiler_type_ptr,
     uint64_t *bit_offset_ptr, uint32_t *bitfield_bit_size_ptr,
diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp
index dcfd1238f6d7..b85c38097ebe 100644
--- a/lldb/source/Symbol/Type.cpp
+++ b/lldb/source/Symbol/Type.cpp
@@ -355,6 +355,9 @@ void Type::GetDescription(Stream *s, lldb::DescriptionLevel level,
     case eEncodingIsSyntheticUID:
       s->PutCString(" (synthetic type)");
       break;
+    case eEncodingIsLLVMPtrAuthUID:
+      s->PutCString(" (ptrauth type)");
+      break;
     }
   }
 }
@@ -416,6 +419,8 @@ void Type::Dump(Stream *s, bool show_context, lldb::DescriptionLevel level) {
     case eEncodingIsSyntheticUID:
       s->PutCString(" (synthetic type)");
       break;
+    case eEncodingIsLLVMPtrAuthUID:
+      s->PutCString(" (ptrauth type)");
     }
   }
 
@@ -477,7 +482,8 @@ std::optional<uint64_t> Type::GetByteSize(ExecutionContextScope *exe_scope) {
     // If we are a pointer or reference, then this is just a pointer size;
     case eEncodingIsPointerUID:
     case eEncodingIsLValueReferenceUID:
-    case eEncodingIsRValueReferenceUID: {
+    case eEncodingIsRValueReferenceUID:
+    case eEncodingIsLLVMPtrAuthUID: {
       if (ArchSpec arch = m_symbol_file->GetObjectFile()->GetArchitecture()) {
         m_byte_size = arch.GetAddressByteSize();
         m_byte_size_has_value = true;
@@ -621,6 +627,12 @@ bool Type::ResolveCompilerType(ResolveState compiler_type_resolve_state) {
             encoding_type->GetForwardCompilerType().GetRValueReferenceType();
         break;
 
+      case eEncodingIsLLVMPtrAuthUID:
+        m_compiler_type =
+            encoding_type->GetForwardCompilerType().AddPtrAuthModifier(
+                m_payload);
+        break;
+
       default:
         llvm_unreachable("Unhandled encoding_data_type.");
       }
@@ -676,6 +688,10 @@ bool Type::ResolveCompilerType(ResolveState compiler_type_resolve_state) {
           m_compiler_type = void_compiler_type.GetRValueReferenceType();
           break;
 
+        case eEncodingIsLLVMPtrAuthUID:
+          llvm_unreachable("Cannot handle eEncodingIsLLVMPtrAuthUID without "
+                           "valid encoding_type");
+
         default:
           llvm_unreachable("Unhandled encoding_data_type.");
         }
diff --git a/lldb/source/Symbol/TypeSystem.cpp b/lldb/source/Symbol/TypeSystem.cpp
index 59b1b39e635a..3665771b1889 100644
--- a/lldb/source/Symbol/TypeSystem.cpp
+++ b/lldb/source/Symbol/TypeSystem.cpp
@@ -93,6 +93,11 @@ CompilerType TypeSystem::AddConstModifier(lldb::opaque_compiler_type_t type) {
   return CompilerType();
 }
 
+CompilerType TypeSystem::AddPtrAuthModifier(lldb::opaque_compiler_type_t type,
+                                            uint32_t payload) {
+  return CompilerType();
+}
+
 CompilerType
 TypeSystem::AddVolatileModifier(lldb::opaque_compiler_type_t type) {
   return CompilerType();
diff --git a/lldb/source/Target/Language.cpp b/lldb/source/Target/Language.cpp
index 1542c8cb68ce..d0bffe441f63 100644
--- a/lldb/source/Target/Language.cpp
+++ b/lldb/source/Target/Language.cpp
@@ -19,6 +19,7 @@
 #include "lldb/Target/Target.h"
 #include "lldb/Utility/Stream.h"
 
+#include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/Support/Threading.h"
 
 using namespace lldb;
@@ -532,3 +533,36 @@ Language::Language() = default;
 
 // Destructor
 Language::~Language() = default;
+
+SourceLanguage::SourceLanguage(lldb::LanguageType language_type) {
+  auto lname =
+      llvm::dwarf::toDW_LNAME((llvm::dwarf::SourceLanguage)language_type);
+  if (!lname)
+    return;
+  name = lname->first;
+  version = lname->second;
+}
+
+lldb::LanguageType SourceLanguage::AsLanguageType() const {
+  if (auto lang = llvm::dwarf::toDW_LANG((llvm::dwarf::SourceLanguageName)name,
+                                         version))
+    return (lldb::LanguageType)*lang;
+  return lldb::eLanguageTypeUnknown;
+}
+
+llvm::StringRef SourceLanguage::GetDescription() const {
+  LanguageType type = AsLanguageType();
+  if (type)
+    return Language::GetNameForLanguageType(type);
+  return llvm::dwarf::LanguageDescription(
+      (llvm::dwarf::SourceLanguageName)name);
+}
+bool SourceLanguage::IsC() const { return name == llvm::dwarf::DW_LNAME_C; }
+
+bool SourceLanguage::IsObjC() const {
+  return name == llvm::dwarf::DW_LNAME_ObjC;
+}
+
+bool SourceLanguage::IsCPlusPlus() const {
+  return name == llvm::dwarf::DW_LNAME_C_plus_plus;
+}
diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp
index 606518ca5412..30c240b064b5 100644
--- a/lldb/source/Target/Process.cpp
+++ b/lldb/source/Target/Process.cpp
@@ -408,8 +408,8 @@ ProcessSP Process::FindPlugin(lldb::TargetSP target_sp,
   return process_sp;
 }
 
-ConstString &Process::GetStaticBroadcasterClass() {
-  static ConstString class_name("lldb.process");
+llvm::StringRef Process::GetStaticBroadcasterClass() {
+  static constexpr llvm::StringLiteral class_name("lldb.process");
   return class_name;
 }
 
@@ -423,7 +423,7 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp,
                  const UnixSignalsSP &unix_signals_sp)
     : ProcessProperties(this),
       Broadcaster((target_sp->GetDebugger().GetBroadcasterManager()),
-                  Process::GetStaticBroadcasterClass().AsCString()),
+                  Process::GetStaticBroadcasterClass().str()),
       m_target_wp(target_sp), m_public_state(eStateUnloaded),
       m_private_state(eStateUnloaded),
       m_private_state_broadcaster(nullptr,
diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp
index 03a74f29e76e..246871d5abaa 100644
--- a/lldb/source/Target/StackFrame.cpp
+++ b/lldb/source/Target/StackFrame.cpp
@@ -1203,26 +1203,23 @@ bool StackFrame::IsArtificial() const {
   return m_stack_frame_kind == StackFrame::Kind::Artificial;
 }
 
-lldb::LanguageType StackFrame::GetLanguage() {
+SourceLanguage StackFrame::GetLanguage() {
   CompileUnit *cu = GetSymbolContext(eSymbolContextCompUnit).comp_unit;
   if (cu)
     return cu->GetLanguage();
-  return lldb::eLanguageTypeUnknown;
+  return {};
 }
 
-lldb::LanguageType StackFrame::GuessLanguage() {
-  LanguageType lang_type = GetLanguage();
+SourceLanguage StackFrame::GuessLanguage() {
+  SourceLanguage lang_type = GetLanguage();
 
   if (lang_type == eLanguageTypeUnknown) {
-    SymbolContext sc = GetSymbolContext(eSymbolContextFunction
-                                        | eSymbolContextSymbol);
-    if (sc.function) {
-      lang_type = sc.function->GetMangled().GuessLanguage();
-    }
+    SymbolContext sc =
+        GetSymbolContext(eSymbolContextFunction | eSymbolContextSymbol);
+    if (sc.function)
+      lang_type = LanguageType(sc.function->GetMangled().GuessLanguage());
     else if (sc.symbol)
-    {
-      lang_type = sc.symbol->GetMangled().GuessLanguage();
-    }
+      lang_type = SourceLanguage(sc.symbol->GetMangled().GuessLanguage());
   }
 
   return lang_type;
@@ -1302,7 +1299,7 @@ GetBaseExplainingDereference(const Instruction::Operand &operand,
   }
   return std::make_pair(nullptr, 0);
 }
-}
+} // namespace
 
 lldb::ValueObjectSP StackFrame::GuessValueForAddress(lldb::addr_t addr) {
   TargetSP target_sp = CalculateTarget();
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index 09b0ac42631d..82f3040e539a 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -87,8 +87,8 @@ const Target::Arch &Target::Arch::operator=(const ArchSpec &spec) {
   return *this;
 }
 
-ConstString &Target::GetStaticBroadcasterClass() {
-  static ConstString class_name("lldb.target");
+llvm::StringRef Target::GetStaticBroadcasterClass() {
+  static constexpr llvm::StringLiteral class_name("lldb.target");
   return class_name;
 }
 
@@ -96,7 +96,7 @@ Target::Target(Debugger &debugger, const ArchSpec &target_arch,
                const lldb::PlatformSP &platform_sp, bool is_dummy_target)
     : TargetProperties(this),
       Broadcaster(debugger.GetBroadcasterManager(),
-                  Target::GetStaticBroadcasterClass().AsCString()),
+                  Target::GetStaticBroadcasterClass().str()),
       ExecutionContextScope(), m_debugger(debugger), m_platform_sp(platform_sp),
       m_mutex(), m_arch(target_arch), m_images(this), m_section_load_history(),
       m_breakpoint_list(false), m_internal_breakpoint_list(true),
@@ -504,7 +504,7 @@ BreakpointSP Target::CreateBreakpoint(
     if (skip_prologue == eLazyBoolCalculate)
       skip_prologue = GetSkipPrologue() ? eLazyBoolYes : eLazyBoolNo;
     if (language == lldb::eLanguageTypeUnknown)
-      language = GetLanguage();
+      language = GetLanguage().AsLanguageType();
 
     BreakpointResolverSP resolver_sp(new BreakpointResolverName(
         nullptr, func_name, func_name_type_mask, language, Breakpoint::Exact,
@@ -530,7 +530,7 @@ Target::CreateBreakpoint(const FileSpecList *containingModules,
     if (skip_prologue == eLazyBoolCalculate)
       skip_prologue = GetSkipPrologue() ? eLazyBoolYes : eLazyBoolNo;
     if (language == lldb::eLanguageTypeUnknown)
-      language = GetLanguage();
+      language = GetLanguage().AsLanguageType();
 
     BreakpointResolverSP resolver_sp(
         new BreakpointResolverName(nullptr, func_names, func_name_type_mask,
@@ -559,7 +559,7 @@ Target::CreateBreakpoint(const FileSpecList *containingModules,
         skip_prologue = eLazyBoolNo;
     }
     if (language == lldb::eLanguageTypeUnknown)
-      language = GetLanguage();
+      language = GetLanguage().AsLanguageType();
 
     BreakpointResolverSP resolver_sp(new BreakpointResolverName(
         nullptr, func_names, num_names, func_name_type_mask, language, offset,
@@ -2504,15 +2504,16 @@ Target::GetPersistentExpressionStateForLanguage(lldb::LanguageType language) {
 }
 
 UserExpression *Target::GetUserExpressionForLanguage(
-    llvm::StringRef expr, llvm::StringRef prefix, lldb::LanguageType language,
+    llvm::StringRef expr, llvm::StringRef prefix, SourceLanguage language,
     Expression::ResultType desired_type,
     const EvaluateExpressionOptions &options, ValueObject *ctx_obj,
     Status &error) {
-  auto type_system_or_err = GetScratchTypeSystemForLanguage(language);
+  auto type_system_or_err =
+      GetScratchTypeSystemForLanguage(language.AsLanguageType());
   if (auto err = type_system_or_err.takeError()) {
     error.SetErrorStringWithFormat(
         "Could not find type system for language %s: %s",
-        Language::GetNameForLanguageType(language),
+        Language::GetNameForLanguageType(language.AsLanguageType()),
         llvm::toString(std::move(err)).c_str());
     return nullptr;
   }
@@ -2521,7 +2522,7 @@ UserExpression *Target::GetUserExpressionForLanguage(
   if (!ts) {
     error.SetErrorStringWithFormat(
         "Type system for language %s is no longer live",
-        Language::GetNameForLanguageType(language));
+        language.GetDescription().data());
     return nullptr;
   }
 
@@ -2530,7 +2531,7 @@ UserExpression *Target::GetUserExpressionForLanguage(
   if (!user_expr)
     error.SetErrorStringWithFormat(
         "Could not create an expression for language %s",
-        Language::GetNameForLanguageType(language));
+        language.GetDescription().data());
 
   return user_expr;
 }
@@ -4646,9 +4647,9 @@ void TargetProperties::SetStandardErrorPath(llvm::StringRef path) {
   SetPropertyAtIndex(idx, path);
 }
 
-LanguageType TargetProperties::GetLanguage() const {
+SourceLanguage TargetProperties::GetLanguage() const {
   const uint32_t idx = ePropertyLanguage;
-  return GetPropertyAtIndexAs<LanguageType>(idx, {});
+  return {GetPropertyAtIndexAs<LanguageType>(idx, {})};
 }
 
 llvm::StringRef TargetProperties::GetExpressionPrefixContents() {
diff --git a/lldb/source/Target/TargetList.cpp b/lldb/source/Target/TargetList.cpp
index b5d308739d0f..10467753666f 100644
--- a/lldb/source/Target/TargetList.cpp
+++ b/lldb/source/Target/TargetList.cpp
@@ -29,15 +29,15 @@
 using namespace lldb;
 using namespace lldb_private;
 
-ConstString &TargetList::GetStaticBroadcasterClass() {
-  static ConstString class_name("lldb.targetList");
+llvm::StringRef TargetList::GetStaticBroadcasterClass() {
+  static constexpr llvm::StringLiteral class_name("lldb.targetList");
   return class_name;
 }
 
 // TargetList constructor
 TargetList::TargetList(Debugger &debugger)
     : Broadcaster(debugger.GetBroadcasterManager(),
-                  TargetList::GetStaticBroadcasterClass().AsCString()),
+                  TargetList::GetStaticBroadcasterClass().str()),
       m_target_list(), m_target_list_mutex(), m_selected_target_idx(0) {
   CheckInWithManager();
 }
diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp
index 412e44ede9c1..e75f5a356cec 100644
--- a/lldb/source/Target/Thread.cpp
+++ b/lldb/source/Target/Thread.cpp
@@ -205,15 +205,15 @@ Thread::ThreadEventData::GetStackFrameFromEvent(const Event *event_ptr) {
 
 // Thread class
 
-ConstString &Thread::GetStaticBroadcasterClass() {
-  static ConstString class_name("lldb.thread");
+llvm::StringRef Thread::GetStaticBroadcasterClass() {
+  static constexpr llvm::StringLiteral class_name("lldb.thread");
   return class_name;
 }
 
 Thread::Thread(Process &process, lldb::tid_t tid, bool use_invalid_index_id)
     : ThreadProperties(false), UserID(tid),
       Broadcaster(process.GetTarget().GetDebugger().GetBroadcasterManager(),
-                  Thread::GetStaticBroadcasterClass().AsCString()),
+                  Thread::GetStaticBroadcasterClass().str()),
       m_process_wp(process.shared_from_this()), m_stop_info_sp(),
       m_stop_info_stop_id(0), m_stop_info_override_stop_id(0),
       m_should_run_before_public_stop(false),
diff --git a/lldb/source/Utility/Broadcaster.cpp b/lldb/source/Utility/Broadcaster.cpp
index 12903edc36b1..bd65ffd86a1d 100644
--- a/lldb/source/Utility/Broadcaster.cpp
+++ b/lldb/source/Utility/Broadcaster.cpp
@@ -373,8 +373,8 @@ void Broadcaster::BroadcasterImpl::RestoreBroadcaster() {
     m_hijacking_masks.pop_back();
 }
 
-ConstString &Broadcaster::GetBroadcasterClass() const {
-  static ConstString class_name("lldb.anonymous");
+llvm::StringRef Broadcaster::GetBroadcasterClass() const {
+  static constexpr llvm::StringLiteral class_name("lldb.anonymous");
   return class_name;
 }
 
diff --git a/lldb/test/API/clear-sbvalue-nonaddressable-bits/Makefile b/lldb/test/API/clear-sbvalue-nonaddressable-bits/Makefile
new file mode 100644
index 000000000000..10495940055b
--- /dev/null
+++ b/lldb/test/API/clear-sbvalue-nonaddressable-bits/Makefile
@@ -0,0 +1,3 @@
+C_SOURCES := main.c
+
+include Makefile.rules
diff --git a/lldb/test/API/clear-sbvalue-nonaddressable-bits/TestClearSBValueNonAddressableBits.py b/lldb/test/API/clear-sbvalue-nonaddressable-bits/TestClearSBValueNonAddressableBits.py
new file mode 100644
index 000000000000..382b0e7a81d2
--- /dev/null
+++ b/lldb/test/API/clear-sbvalue-nonaddressable-bits/TestClearSBValueNonAddressableBits.py
@@ -0,0 +1,59 @@
+"""Test that SBValue clears non-addressable bits"""
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class TestClearSBValueNonAddressableBits(TestBase):
+    NO_DEBUG_INFO_TESTCASE = True
+
+    # On AArch64 systems, the top bits that are not used for
+    # addressing may be used for TBI, MTE, and/or pointer
+    # authentication.
+    @skipIf(archs=no_match(["aarch64", "arm64", "arm64e"]))
+
+    # Only run this test on systems where TBI is known to be
+    # enabled, so the address mask will clear the TBI bits.
+    @skipUnlessPlatform(["linux"] + lldbplatformutil.getDarwinOSTriples())
+    def test(self):
+        self.source = "main.c"
+        self.build()
+        (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(
+            self, "break here", lldb.SBFileSpec(self.source, False)
+        )
+
+        if self.TraceOn():
+            self.runCmd("frame variable")
+            self.runCmd("frame variable &count &global")
+
+        frame = thread.GetFrameAtIndex(0)
+
+        count_p = frame.FindVariable("count_p")
+        count_invalid_p = frame.FindVariable("count_invalid_p")
+        self.assertEqual(
+            count_p.GetValueAsUnsigned(), count_invalid_p.GetValueAsAddress()
+        )
+        self.assertNotEqual(
+            count_invalid_p.GetValueAsUnsigned(), count_invalid_p.GetValueAsAddress()
+        )
+        self.assertEqual(5, count_p.Dereference().GetValueAsUnsigned())
+        self.assertEqual(5, count_invalid_p.Dereference().GetValueAsUnsigned())
+
+        global_p = frame.FindVariable("global_p")
+        global_invalid_p = frame.FindVariable("global_invalid_p")
+        self.assertEqual(
+            global_p.GetValueAsUnsigned(), global_invalid_p.GetValueAsAddress()
+        )
+        self.assertNotEqual(
+            global_invalid_p.GetValueAsUnsigned(), global_invalid_p.GetValueAsAddress()
+        )
+        self.assertEqual(10, global_p.Dereference().GetValueAsUnsigned())
+        self.assertEqual(10, global_invalid_p.Dereference().GetValueAsUnsigned())
+
+        main_p = frame.FindVariable("main_p")
+        main_invalid_p = frame.FindVariable("main_invalid_p")
+        self.assertEqual(
+            main_p.GetValueAsUnsigned(), main_invalid_p.GetValueAsAddress()
+        )
diff --git a/lldb/test/API/clear-sbvalue-nonaddressable-bits/main.c b/lldb/test/API/clear-sbvalue-nonaddressable-bits/main.c
new file mode 100644
index 000000000000..1b0e42c50dd6
--- /dev/null
+++ b/lldb/test/API/clear-sbvalue-nonaddressable-bits/main.c
@@ -0,0 +1,27 @@
+#include <stdint.h>
+
+int global = 10;
+
+int main() {
+  int count = 5;
+  int *count_p = &count;
+
+  // Add some metadata in the top byte (this will crash unless the
+  // test is running with TBI enabled, but we won't dereference it)
+
+  intptr_t scratch = (intptr_t)count_p;
+  scratch |= (3ULL << 60);
+  int *count_invalid_p = (int *)scratch;
+
+  int (*main_p)() = main;
+  scratch = (intptr_t)main_p;
+  scratch |= (3ULL << 60);
+  int (*main_invalid_p)() = (int (*)())scratch;
+
+  int *global_p = &global;
+  scratch = (intptr_t)global_p;
+  scratch |= (3ULL << 60);
+  int *global_invalid_p = (int *)scratch;
+
+  return count; // break here
+}
diff --git a/lldb/test/API/driver/batch_mode/TestBatchMode.py b/lldb/test/API/driver/batch_mode/TestBatchMode.py
index 642dd47c6d45..bc6f2daebbab 100644
--- a/lldb/test/API/driver/batch_mode/TestBatchMode.py
+++ b/lldb/test/API/driver/batch_mode/TestBatchMode.py
@@ -13,6 +13,7 @@ from lldbsuite.test.lldbpexpect import PExpectTest
 class DriverBatchModeTest(PExpectTest):
     source = "main.c"
 
+    @skipIf(macos_version=["<", "14.0"], asan=True)
     @skipIf(oslist=["linux"], archs=["arm", "aarch64"])  # Randomly fails on buildbot
     @expectedFlakeyFreeBSD("llvm.org/pr25172 fails rarely on the buildbot")
     def test_batch_mode_run_crash(self):
@@ -50,6 +51,7 @@ class DriverBatchModeTest(PExpectTest):
         self.expect_prompt()
         self.expect("frame variable touch_me_not", substrs=["(char *) touch_me_not"])
 
+    @skipIf(macos_version=["<", "14.0"], asan=True)
     @skipIf(oslist=["linux"], archs=["arm", "aarch64"])  # Randomly fails on buildbot
     @expectedFlakeyFreeBSD("llvm.org/pr25172 fails rarely on the buildbot")
     def test_batch_mode_run_exit(self):
@@ -86,6 +88,7 @@ class DriverBatchModeTest(PExpectTest):
 
         child.expect(pexpect.EOF)
 
+    @skipIf(macos_version=["<", "14.0"], asan=True)
     @skipIf(oslist=["linux"], archs=["arm", "aarch64"])  # Randomly fails on buildbot
     @expectedFlakeyFreeBSD("llvm.org/pr25172 fails rarely on the buildbot")
     def test_batch_mode_launch_stop_at_entry(self):
@@ -125,6 +128,7 @@ class DriverBatchModeTest(PExpectTest):
             self.victim.close()
             self.victim = None
 
+    @skipIf(macos_version=["<", "14.0"], asan=True)
     @skipIf(oslist=["linux"], archs=["arm", "aarch64"])  # Randomly fails on buildbot
     @expectedFlakeyFreeBSD("llvm.org/pr25172 fails rarely on the buildbot")
     @expectedFailureNetBSD
diff --git a/lldb/test/API/driver/job_control/TestJobControl.py b/lldb/test/API/driver/job_control/TestJobControl.py
index 1a1739f4cb39..648acb1d4730 100644
--- a/lldb/test/API/driver/job_control/TestJobControl.py
+++ b/lldb/test/API/driver/job_control/TestJobControl.py
@@ -8,6 +8,7 @@ from lldbsuite.test.lldbpexpect import PExpectTest
 
 
 class JobControlTest(PExpectTest):
+    @skipIf(macos_version=["<", "14.0"], asan=True)
     @skipIf(oslist=["linux"], archs=["arm", "aarch64"])
     def test_job_control(self):
         def post_spawn():
diff --git a/lldb/test/API/driver/quit_speed/TestQuitWithProcess.py b/lldb/test/API/driver/quit_speed/TestQuitWithProcess.py
index 42527c88b992..5cfcf5d69fd2 100644
--- a/lldb/test/API/driver/quit_speed/TestQuitWithProcess.py
+++ b/lldb/test/API/driver/quit_speed/TestQuitWithProcess.py
@@ -12,6 +12,7 @@ from lldbsuite.test.lldbpexpect import PExpectTest
 class DriverQuitSpeedTest(PExpectTest):
     source = "main.c"
 
+    @skipIfAsan
     def test_run_quit(self):
         """Test that the lldb driver's batch mode works correctly."""
         import pexpect
diff --git a/lldb/test/API/functionalities/diagnostic_reporting/TestDiagnosticReporting.py b/lldb/test/API/functionalities/diagnostic_reporting/TestDiagnosticReporting.py
index 6353e3e8cbed..36a3be695628 100644
--- a/lldb/test/API/functionalities/diagnostic_reporting/TestDiagnosticReporting.py
+++ b/lldb/test/API/functionalities/diagnostic_reporting/TestDiagnosticReporting.py
@@ -15,7 +15,7 @@ class TestDiagnosticReporting(TestBase):
         self.broadcaster = self.dbg.GetBroadcaster()
         self.listener = lldbutil.start_listening_from(
             self.broadcaster,
-            lldb.eBroadcastBitWarning | lldb.eBroadcastBitError,
+            lldb.SBDebugger.eBroadcastBitWarning | lldb.SBDebugger.eBroadcastBitError,
         )
 
     def test_dwarf_symbol_loading_diagnostic_report(self):
diff --git a/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py b/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py
index 2dcbb728549f..dd9500c186b2 100644
--- a/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py
+++ b/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py
@@ -48,6 +48,8 @@ class TestConcurrentVFork(TestBase):
         self.expect("continue", patterns=[r"exited with status = 1[0-4]"])
 
     @skipUnlessPlatform(["linux"])
+    # https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_parent_vfork_no_exec(self):
         """
         Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent.
@@ -56,6 +58,8 @@ class TestConcurrentVFork(TestBase):
         self.follow_parent_helper(use_fork=False, call_exec=False)
 
     @skipUnlessPlatform(["linux"])
+    # https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_parent_fork_no_exec(self):
         """
         Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-parent.
@@ -64,6 +68,8 @@ class TestConcurrentVFork(TestBase):
         self.follow_parent_helper(use_fork=True, call_exec=False)
 
     @skipUnlessPlatform(["linux"])
+    # https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_parent_vfork_call_exec(self):
         """
         Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent.
@@ -72,6 +78,8 @@ class TestConcurrentVFork(TestBase):
         self.follow_parent_helper(use_fork=False, call_exec=True)
 
     @skipUnlessPlatform(["linux"])
+    # https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_parent_fork_call_exec(self):
         """
         Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent.
@@ -80,6 +88,8 @@ class TestConcurrentVFork(TestBase):
         self.follow_parent_helper(use_fork=True, call_exec=True)
 
     @skipUnlessPlatform(["linux"])
+    # https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_child_vfork_no_exec(self):
         """
         Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-child.
@@ -88,6 +98,8 @@ class TestConcurrentVFork(TestBase):
         self.follow_child_helper(use_fork=False, call_exec=False)
 
     @skipUnlessPlatform(["linux"])
+    # https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_child_fork_no_exec(self):
         """
         Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-child.
@@ -96,6 +108,8 @@ class TestConcurrentVFork(TestBase):
         self.follow_child_helper(use_fork=True, call_exec=False)
 
     @skipUnlessPlatform(["linux"])
+    # https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_child_vfork_call_exec(self):
         """
         Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-child.
@@ -104,6 +118,8 @@ class TestConcurrentVFork(TestBase):
         self.follow_child_helper(use_fork=False, call_exec=True)
 
     @skipUnlessPlatform(["linux"])
+    # https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_child_fork_call_exec(self):
         """
         Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-child.
diff --git a/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py b/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py
index 98988d7624da..9af53845ca1b 100644
--- a/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py
+++ b/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py
@@ -13,7 +13,7 @@ class TestProgressReporting(TestBase):
         TestBase.setUp(self)
         self.broadcaster = self.dbg.GetBroadcaster()
         self.listener = lldbutil.start_listening_from(
-            self.broadcaster, lldb.eBroadcastBitProgress
+            self.broadcaster, lldb.SBDebugger.eBroadcastBitProgress
         )
 
     def test_dwarf_symbol_loading_progress_report(self):
diff --git a/lldb/test/API/functionalities/progress_reporting/clang_modules/TestClangModuleBuildProgress.py b/lldb/test/API/functionalities/progress_reporting/clang_modules/TestClangModuleBuildProgress.py
index 33c7c269c081..228f676aedf6 100644
--- a/lldb/test/API/functionalities/progress_reporting/clang_modules/TestClangModuleBuildProgress.py
+++ b/lldb/test/API/functionalities/progress_reporting/clang_modules/TestClangModuleBuildProgress.py
@@ -34,7 +34,7 @@ class TestCase(TestBase):
         # other unrelated progress events.
         broadcaster = self.dbg.GetBroadcaster()
         listener = lldbutil.start_listening_from(
-            broadcaster, lldb.eBroadcastBitProgress
+            broadcaster, lldb.SBDebugger.eBroadcastBitProgress
         )
 
         # Trigger module builds.
diff --git a/lldb/test/API/iohandler/sigint/TestProcessIOHandlerInterrupt.py b/lldb/test/API/iohandler/sigint/TestProcessIOHandlerInterrupt.py
index 8e19d56cd0c2..75ac0f6c0289 100644
--- a/lldb/test/API/iohandler/sigint/TestProcessIOHandlerInterrupt.py
+++ b/lldb/test/API/iohandler/sigint/TestProcessIOHandlerInterrupt.py
@@ -11,6 +11,7 @@ from lldbsuite.test.lldbpexpect import PExpectTest
 
 
 class TestCase(PExpectTest):
+    @skipIf(macos_version=["<", "14.0"], asan=True)
     @skipIf(compiler="clang", compiler_version=["<", "11.0"])
     @skipIf(oslist=["linux"], archs=["arm", "aarch64"])
     def test(self):
diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py
index 9d6775917e13..d934349fe3ca 100644
--- a/lldb/test/API/lit.cfg.py
+++ b/lldb/test/API/lit.cfg.py
@@ -239,6 +239,9 @@ if is_configured("llvm_tools_dir"):
 if is_configured("server"):
     dotest_cmd += ["--server", config.server]
 
+if is_configured("lldb_obj_root"):
+    dotest_cmd += ["--lldb-obj-root", config.lldb_obj_root]
+
 if is_configured("lldb_libs_dir"):
     dotest_cmd += ["--lldb-libs-dir", config.lldb_libs_dir]
 
diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in
index 053331dc4881..8b2d09ae41cd 100644
--- a/lldb/test/API/lit.site.cfg.py.in
+++ b/lldb/test/API/lit.site.cfg.py.in
@@ -8,7 +8,7 @@ config.llvm_include_dir = lit_config.substitute("@LLVM_INCLUDE_DIR@")
 config.llvm_shlib_dir = lit_config.substitute("@SHLIBDIR@")
 config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@")
 config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
-config.lldb_obj_root = "@LLDB_BINARY_DIR@"
+config.lldb_obj_root = lit_config.substitute("@LLDB_BINARY_DIR@")
 config.lldb_src_root = "@LLDB_SOURCE_DIR@"
 config.lldb_libs_dir = lit_config.substitute("@LLDB_LIBS_DIR@")
 config.lldb_framework_dir = lit_config.substitute("@LLDB_FRAMEWORK_DIR@")
diff --git a/lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py b/lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py
index 15d9feb54389..a7c749e1528b 100644
--- a/lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py
+++ b/lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py
@@ -20,8 +20,6 @@ from lldbsuite.test import lldbtest_config
 class DarwinNSLogOutputTestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
-    @skipUnlessDarwin
-    @skipIfRemote  # this test is currently written using lldb commands & assumes running on local system
     def setUp(self):
         # Call super's setUp().
         TestBase.setUp(self)
@@ -119,6 +117,10 @@ class DarwinNSLogOutputTestCase(TestBase):
         self.runCmd("process continue")
         self.expect(expect_regexes)
 
+    @skipIfAsan # avoid dealing with pexpect timeout flakyness on bots
+    @skipIf(oslist=["linux"], archs=["arm", "aarch64"])
+    @skipUnlessDarwin
+    @skipIfRemote  # this test is currently written using lldb commands & assumes running on local system
     def test_nslog_output_is_displayed(self):
         """Test that NSLog() output shows up in the command-line debugger."""
         self.do_test(
@@ -131,6 +133,10 @@ class DarwinNSLogOutputTestCase(TestBase):
         self.assertGreater(len(self.child.match.groups()), 0)
         self.assertEqual("This is a message from NSLog", self.child.match.group(1))
 
+    @skipIfAsan # avoid dealing with pexpect timeout flakyness on bots
+    @skipIf(oslist=["linux"], archs=["arm", "aarch64"])
+    @skipUnlessDarwin
+    @skipIfRemote  # this test is currently written using lldb commands & assumes running on local system
     def test_nslog_output_is_suppressed_with_env_var(self):
         """Test that NSLog() output does not show up with the ignore env var."""
         # This test will only work properly on macOS 10.12+.  Skip it on earlier versions.
diff --git a/lldb/test/API/macosx/rosetta/TestRosetta.py b/lldb/test/API/macosx/rosetta/TestRosetta.py
index 669db95a1624..ce40de475ef1 100644
--- a/lldb/test/API/macosx/rosetta/TestRosetta.py
+++ b/lldb/test/API/macosx/rosetta/TestRosetta.py
@@ -49,7 +49,7 @@ class TestRosetta(TestBase):
         if rosetta_debugserver_installed():
             broadcaster = self.dbg.GetBroadcaster()
             listener = lldbutil.start_listening_from(
-                broadcaster, lldb.eBroadcastBitWarning
+                broadcaster, lldb.SBDebugger.eBroadcastBitWarning
             )
 
             target, process, thread, bkpt = lldbutil.run_to_source_breakpoint(
diff --git a/lldb/test/API/python_api/type/TestTypeList.py b/lldb/test/API/python_api/type/TestTypeList.py
index c647c2bcdccb..81c44f7a39d6 100644
--- a/lldb/test/API/python_api/type/TestTypeList.py
+++ b/lldb/test/API/python_api/type/TestTypeList.py
@@ -33,6 +33,32 @@ class TypeAndTypeListTestCase(TestBase):
         self.assertTrue(pointer_masks2_type)
         self.DebugSBType(pointer_masks2_type)
 
+    def _find_static_field_in_Task_pointer(self, task_pointer):
+        self.assertTrue(task_pointer)
+        self.DebugSBType(task_pointer)
+
+        task_type = task_pointer.GetPointeeType()
+        self.assertTrue(task_type)
+        self.DebugSBType(task_type)
+
+        static_constexpr_field = task_type.GetStaticFieldWithName(
+            "static_constexpr_field"
+        )
+        self.assertTrue(static_constexpr_field)
+        self.assertEqual(static_constexpr_field.GetName(), "static_constexpr_field")
+        self.assertEqual(static_constexpr_field.GetType().GetName(), "const long")
+
+        value = static_constexpr_field.GetConstantValue(self.target())
+        self.DebugSBValue(value)
+        self.assertEqual(value.GetValueAsSigned(), 47)
+
+        static_mutable_field = task_type.GetStaticFieldWithName("static_mutable_field")
+        self.assertTrue(static_mutable_field)
+        self.assertEqual(static_mutable_field.GetName(), "static_mutable_field")
+        self.assertEqual(static_mutable_field.GetType().GetName(), "int")
+
+        self.assertFalse(static_mutable_field.GetConstantValue(self.target()))
+
     @skipIf(compiler="clang", compiler_version=["<", "17.0"])
     def test(self):
         """Exercise SBType and SBTypeList API."""
@@ -175,6 +201,13 @@ class TypeAndTypeListTestCase(TestBase):
             frame0.EvaluateExpression("pointer").GetType()
         )
 
+        self._find_static_field_in_Task_pointer(
+            frame0.FindVariable("task_head").GetType()
+        )
+        self._find_static_field_in_Task_pointer(
+            frame0.EvaluateExpression("task_head").GetType()
+        )
+
         # We'll now get the child member 'id' from 'task_head'.
         id = task_head.GetChildMemberWithName("id")
         self.DebugSBValue(id)
diff --git a/lldb/test/API/python_api/type/main.cpp b/lldb/test/API/python_api/type/main.cpp
index 391f58e3e587..c86644d91827 100644
--- a/lldb/test/API/python_api/type/main.cpp
+++ b/lldb/test/API/python_api/type/main.cpp
@@ -27,12 +27,15 @@ public:
     enum E : unsigned char {} e;
     union U {
     } u;
+    static constexpr long static_constexpr_field = 47;
+    static int static_mutable_field;
     Task(int i, Task *n):
         id(i),
         next(n),
         type(TASK_TYPE_1)
     {}
 };
+int Task::static_mutable_field = 42;
 
 template <unsigned Value> struct PointerInfo {
   enum Masks1 { pointer_mask };
diff --git a/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py b/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py
index 21aca5fc85d5..313a265319db 100644
--- a/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py
+++ b/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py
@@ -19,6 +19,7 @@ class TestSTTYBeforeAndAfter(TestBase):
         cls.RemoveTempFile("child_send2.txt")
         cls.RemoveTempFile("child_read2.txt")
 
+    @skipIf(macos_version=["<", "14.0"], asan=True)
     @add_test_categories(["pexpect"])
     @no_debug_info_test
     def test_stty_dash_a_before_and_afetr_invoking_lldb_command(self):
diff --git a/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py b/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
index ffa0dc943e06..8f456aaf890c 100644
--- a/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
+++ b/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
@@ -4,11 +4,23 @@ Test lldb-dap setBreakpoints request
 
 import dap_server
 import lldbdap_testcase
+import psutil
+from collections import deque
 from lldbsuite.test import lldbutil
 from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *
 
 
+def get_subprocess(process_name):
+    queue = deque([psutil.Process(os.getpid())])
+    while queue:
+        process = queue.popleft()
+        if process.name() == process_name:
+            return process
+        queue.extend(process.children())
+
+    self.assertTrue(False, "No subprocess with name %s found" % process_name)
+
 class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
     def check_lldb_command(
         self, lldb_command, contains_string, assert_msg, command_escape_prefix="`"
@@ -104,3 +116,49 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
             "Help can be invoked",
             command_escape_prefix="",
         )
+
+    @skipIfWindows
+    @skipIfRemote
+    def test_exit_status_message_sigterm(self):
+        source = "main.cpp"
+        program = self.getBuildArtifact("a.out")
+        self.build_and_launch(program, commandEscapePrefix="")
+        breakpoint1_line = line_number(source, "// breakpoint 1")
+        breakpoint_ids = self.set_source_breakpoints(source, [breakpoint1_line])
+        self.continue_to_breakpoints(breakpoint_ids)
+
+        # Kill lldb-server process.
+        process_name = (
+            "debugserver" if platform.system() in ["Darwin"] else "lldb-server"
+        )
+        process = get_subprocess(process_name)
+        process.terminate()
+        process.wait()
+
+        # Get the console output
+        console_output = self.collect_console(1.0)
+
+        # Verify the exit status message is printed.
+        self.assertIn(
+            "exited with status = -1 (0xffffffff) debugserver died with signal SIGTERM",
+            console_output,
+            "Exit status does not contain message 'exited with status'",
+        )
+
+    @skipIfWindows
+    @skipIfRemote
+    def test_exit_status_message_ok(self):
+        source = "main.cpp"
+        program = self.getBuildArtifact("a.out")
+        self.build_and_launch(program, commandEscapePrefix="")
+        self.continue_to_exit()
+
+        # Get the console output
+        console_output = self.collect_console(1.0)
+
+        # Verify the exit status message is printed.
+        self.assertIn(
+            "exited with status = 0 (0x00000000)",
+            console_output,
+            "Exit status does not contain message 'exited with status'",
+        )
diff --git a/lldb/test/API/tools/lldb-dap/stepInTargets/Makefile b/lldb/test/API/tools/lldb-dap/stepInTargets/Makefile
new file mode 100644
index 000000000000..f772575cd561
--- /dev/null
+++ b/lldb/test/API/tools/lldb-dap/stepInTargets/Makefile
@@ -0,0 +1,6 @@
+	
+ENABLE_THREADS := YES
+
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
new file mode 100644
index 000000000000..6296f6554d07
--- /dev/null
+++ b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
@@ -0,0 +1,68 @@
+"""
+Test lldb-dap stepInTargets request
+"""
+
+import dap_server
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+import lldbdap_testcase
+from lldbsuite.test import lldbutil
+
+
+class TestDAP_stepInTargets(lldbdap_testcase.DAPTestCaseBase):
+    @skipIf(
+        archs=no_match(["x86_64"])
+    )  # InstructionControlFlowKind for ARM is not supported yet.
+    def test_basic(self):
+        """
+        Tests the basic stepping in targets with directly calls.
+        """
+        program = self.getBuildArtifact("a.out")
+        self.build_and_launch(program)
+        source = "main.cpp"
+
+        breakpoint_line = line_number(source, "// set breakpoint here")
+        lines = [breakpoint_line]
+        # Set breakpoint in the thread function so we can step the threads
+        breakpoint_ids = self.set_source_breakpoints(source, lines)
+        self.assertEqual(
+            len(breakpoint_ids), len(lines), "expect correct number of breakpoints"
+        )
+        self.continue_to_breakpoints(breakpoint_ids)
+
+        threads = self.dap_server.get_threads()
+        self.assertEqual(len(threads), 1, "expect one thread")
+        tid = threads[0]["id"]
+
+        leaf_frame = self.dap_server.get_stackFrame()
+        self.assertIsNotNone(leaf_frame, "expect a leaf frame")
+
+        # Request all step in targets list and verify the response.
+        step_in_targets_response = self.dap_server.request_stepInTargets(
+            leaf_frame["id"]
+        )
+        self.assertEqual(step_in_targets_response["success"], True, "expect success")
+        self.assertIn(
+            "body", step_in_targets_response, "expect body field in response body"
+        )
+        self.assertIn(
+            "targets",
+            step_in_targets_response["body"],
+            "expect targets field in response body",
+        )
+
+        step_in_targets = step_in_targets_response["body"]["targets"]
+        self.assertEqual(len(step_in_targets), 3, "expect 3 step in targets")
+
+        # Verify the target names are correct.
+        self.assertEqual(step_in_targets[0]["label"], "bar()", "expect bar()")
+        self.assertEqual(step_in_targets[1]["label"], "bar2()", "expect bar2()")
+        self.assertEqual(
+            step_in_targets[2]["label"], "foo(int, int)", "expect foo(int, int)"
+        )
+
+        # Choose to step into second target and verify that we are in bar2()
+        self.stepIn(threadId=tid, targetId=step_in_targets[1]["id"], waitForStop=True)
+        leaf_frame = self.dap_server.get_stackFrame()
+        self.assertIsNotNone(leaf_frame, "expect a leaf frame")
+        self.assertEqual(leaf_frame["name"], "bar2()")
diff --git a/lldb/test/API/tools/lldb-dap/stepInTargets/main.cpp b/lldb/test/API/tools/lldb-dap/stepInTargets/main.cpp
new file mode 100644
index 000000000000..d3c3dbcc139e
--- /dev/null
+++ b/lldb/test/API/tools/lldb-dap/stepInTargets/main.cpp
@@ -0,0 +1,11 @@
+
+int foo(int val, int extra) { return val + extra; }
+
+int bar() { return 22; }
+
+int bar2() { return 54; }
+
+int main(int argc, char const *argv[]) {
+  foo(bar(), bar2()); // set breakpoint here
+  return 0;
+}
diff --git a/lldb/test/Shell/ObjectFile/ELF/two-text-sections.yaml b/lldb/test/Shell/ObjectFile/ELF/two-text-sections.yaml
new file mode 100644
index 000000000000..8b2fd47df1a1
--- /dev/null
+++ b/lldb/test/Shell/ObjectFile/ELF/two-text-sections.yaml
@@ -0,0 +1,48 @@
+# Test handling of object files that have duplicate sections.  This is legal,
+# according to the System V ABI (Edition 4.1); see 4-20 where it says:
+#
+#   Section names with a dot (.) prefix are reserved for the system,
+#   although applications may use these sections if their existing
+#   meanings are satisfactory. ... **An object file may have more than
+#   one section with the same name.**
+#
+# (See https://github.com/llvm/llvm-project/issues/88001)
+
+# RUN: yaml2obj %s -o %t
+# RUN: lldb-test symbols %t | FileCheck %s
+
+# CHECK: 0x0000000000400010 {{.*}} my_function
+# CHECK: 0x0000000000401020 {{.*}} my_other_function
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+ProgramHeaders:
+  - Type:            PT_LOAD
+    Flags:           [ PF_X, PF_R ]
+    FirstSec:        .text
+    LastSec:         '.text (1)'
+    VAddr:           0x400000
+    Align:           0x1000
+    Offset:          0x0
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x400010
+    AddressAlign:    0x10
+  - Name:            '.text (1)'
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR, SHF_GNU_RETAIN ]
+    Address:         0x401000
+    AddressAlign:    0x10
+Symbols:
+  - Name:            my_function
+    Section:         .text
+    Value:           0x400010
+  - Name:            my_other_function
+    Section:         '.text (1)'
+    Value:           0x401020
diff --git a/lldb/test/Shell/Unwind/eh-frame-dwarf-unwind-abort.test b/lldb/test/Shell/Unwind/eh-frame-dwarf-unwind-abort.test
index 477a656a711f..d5e66ca5e263 100644
--- a/lldb/test/Shell/Unwind/eh-frame-dwarf-unwind-abort.test
+++ b/lldb/test/Shell/Unwind/eh-frame-dwarf-unwind-abort.test
@@ -9,12 +9,12 @@ process launch
 # CHECK: stop reason = signal SIGTRAP
 
 thread backtrace
-# CHECK: frame #0: {{.*}}`asm_main + 23
+# CHECK: frame #0: {{.*}}`asm_main + 19
 # CHECK: frame #1: {{.*}}`main + {{.*}}
 
 target modules show-unwind -n asm_main
 # CHECK: eh_frame UnwindPlan:
 # CHECK: row[0]:    0: CFA=rsp +8 => rip=[CFA-8]
-# CHECK: row[1]:   14: CFA=rsp+16 => rbp=[CFA-16] rip=[CFA-8]
-# CHECK: row[2]:   17: CFA=rbp+16 => rbp=[CFA-16] rip=[CFA-8]
-# CHECK: row[3]:   22: CFA=rsp +8 => rip=[CFA-8]
+# CHECK: row[1]:   10: CFA=rsp+16 => rbp=[CFA-16] rip=[CFA-8]
+# CHECK: row[2]:   13: CFA=rbp+16 => rbp=[CFA-16] rip=[CFA-8]
+# CHECK: row[3]:   18: CFA=rsp +8 => rip=[CFA-8]
diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h
index 8015dec9ba8f..5c70a056fea4 100644
--- a/lldb/tools/lldb-dap/DAP.h
+++ b/lldb/tools/lldb-dap/DAP.h
@@ -162,6 +162,8 @@ struct DAP {
   std::vector<std::string> exit_commands;
   std::vector<std::string> stop_commands;
   std::vector<std::string> terminate_commands;
+  // Map step in target id to list of function targets that user can choose.
+  llvm::DenseMap<lldb::addr_t, std::string> step_in_targets;
   // A copy of the last LaunchRequest or AttachRequest so we can reuse its
   // arguments if we get a RestartRequest.
   std::optional<llvm::json::Object> last_launch_or_attach_request;
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index 25c5ad56e3d6..8000d68dea7e 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -50,6 +50,7 @@
 #include <thread>
 #include <vector>
 
+#include "lldb/API/SBStream.h"
 #include "lldb/Host/Config.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -420,8 +421,8 @@ void SendStdOutStdErr(lldb::SBProcess &process) {
 
 void ProgressEventThreadFunction() {
   lldb::SBListener listener("lldb-dap.progress.listener");
-  g_dap.debugger.GetBroadcaster().AddListener(listener,
-                                              lldb::eBroadcastBitProgress);
+  g_dap.debugger.GetBroadcaster().AddListener(
+      listener, lldb::SBDebugger::eBroadcastBitProgress);
   g_dap.broadcaster.AddListener(listener, eBroadcastBitStopProgressThread);
   lldb::SBEvent event;
   bool done = false;
@@ -503,6 +504,10 @@ void EventThreadFunction() {
             SendContinuedEvent();
             break;
           case lldb::eStateExited:
+            lldb::SBStream stream;
+            process.GetStatus(stream);
+            g_dap.SendOutput(OutputType::Console, stream.GetData());
+
             // When restarting, we can get an "exited" event for the process we
             // just killed with the old PID, or even with no PID. In that case
             // we don't have to terminate the session.
@@ -1645,7 +1650,7 @@ void request_initialize(const llvm::json::Object &request) {
   // The debug adapter supports the gotoTargetsRequest.
   body.try_emplace("supportsGotoTargetsRequest", false);
   // The debug adapter supports the stepInTargetsRequest.
-  body.try_emplace("supportsStepInTargetsRequest", false);
+  body.try_emplace("supportsStepInTargetsRequest", true);
   // The debug adapter supports the completions request.
   body.try_emplace("supportsCompletionsRequest", true);
   // The debug adapter supports the disassembly request.
@@ -3180,14 +3185,155 @@ void request_stepIn(const llvm::json::Object &request) {
   llvm::json::Object response;
   FillResponse(request, response);
   auto arguments = request.getObject("arguments");
+
+  std::string step_in_target;
+  uint64_t target_id = GetUnsigned(arguments, "targetId", 0);
+  auto it = g_dap.step_in_targets.find(target_id);
+  if (it != g_dap.step_in_targets.end())
+    step_in_target = it->second;
+
+  const bool single_thread = GetBoolean(arguments, "singleThread", false);
+  lldb::RunMode run_mode =
+      single_thread ? lldb::eOnlyThisThread : lldb::eOnlyDuringStepping;
   lldb::SBThread thread = g_dap.GetLLDBThread(*arguments);
   if (thread.IsValid()) {
     // Remember the thread ID that caused the resume so we can set the
     // "threadCausedFocus" boolean value in the "stopped" events.
     g_dap.focus_tid = thread.GetThreadID();
-    thread.StepInto();
+    thread.StepInto(step_in_target.c_str(), run_mode);
+  } else {
+    response["success"] = llvm::json::Value(false);
+  }
+  g_dap.SendJSON(llvm::json::Value(std::move(response)));
+}
+
+// "StepInTargetsRequest": {
+//   "allOf": [ { "$ref": "#/definitions/Request" }, {
+//     "type": "object",
+//     "description": "This request retrieves the possible step-in targets for
+//     the specified stack frame.\nThese targets can be used in the `stepIn`
+//     request.\nClients should only call this request if the corresponding
+//     capability `supportsStepInTargetsRequest` is true.", "properties": {
+//       "command": {
+//         "type": "string",
+//         "enum": [ "stepInTargets" ]
+//       },
+//       "arguments": {
+//         "$ref": "#/definitions/StepInTargetsArguments"
+//       }
+//     },
+//     "required": [ "command", "arguments"  ]
+//   }]
+// },
+// "StepInTargetsArguments": {
+//   "type": "object",
+//   "description": "Arguments for `stepInTargets` request.",
+//   "properties": {
+//     "frameId": {
+//       "type": "integer",
+//       "description": "The stack frame for which to retrieve the possible
+//       step-in targets."
+//     }
+//   },
+//   "required": [ "frameId" ]
+// },
+// "StepInTargetsResponse": {
+//   "allOf": [ { "$ref": "#/definitions/Response" }, {
+//     "type": "object",
+//     "description": "Response to `stepInTargets` request.",
+//     "properties": {
+//       "body": {
+//         "type": "object",
+//         "properties": {
+//           "targets": {
+//             "type": "array",
+//             "items": {
+//               "$ref": "#/definitions/StepInTarget"
+//             },
+//             "description": "The possible step-in targets of the specified
+//             source location."
+//           }
+//         },
+//         "required": [ "targets" ]
+//       }
+//     },
+//     "required": [ "body" ]
+//   }]
+// }
+void request_stepInTargets(const llvm::json::Object &request) {
+  llvm::json::Object response;
+  FillResponse(request, response);
+  auto arguments = request.getObject("arguments");
+
+  g_dap.step_in_targets.clear();
+  lldb::SBFrame frame = g_dap.GetLLDBFrame(*arguments);
+  if (frame.IsValid()) {
+    lldb::SBAddress pc_addr = frame.GetPCAddress();
+    lldb::SBAddress line_end_addr =
+        pc_addr.GetLineEntry().GetSameLineContiguousAddressRangeEnd(true);
+    lldb::SBInstructionList insts = g_dap.target.ReadInstructions(
+        pc_addr, line_end_addr, /*flavor_string=*/nullptr);
+
+    if (!insts.IsValid()) {
+      response["success"] = false;
+      response["message"] = "Failed to get instructions for frame.";
+      g_dap.SendJSON(llvm::json::Value(std::move(response)));
+      return;
+    }
+
+    llvm::json::Array step_in_targets;
+    const auto num_insts = insts.GetSize();
+    for (size_t i = 0; i < num_insts; ++i) {
+      lldb::SBInstruction inst = insts.GetInstructionAtIndex(i);
+      if (!inst.IsValid())
+        break;
+
+      lldb::addr_t inst_addr = inst.GetAddress().GetLoadAddress(g_dap.target);
+
+      // Note: currently only x86/x64 supports flow kind.
+      lldb::InstructionControlFlowKind flow_kind =
+          inst.GetControlFlowKind(g_dap.target);
+      if (flow_kind == lldb::eInstructionControlFlowKindCall) {
+        // Use call site instruction address as id which is easy to debug.
+        llvm::json::Object step_in_target;
+        step_in_target["id"] = inst_addr;
+
+        llvm::StringRef call_operand_name = inst.GetOperands(g_dap.target);
+        lldb::addr_t call_target_addr;
+        if (call_operand_name.getAsInteger(0, call_target_addr))
+          continue;
+
+        lldb::SBAddress call_target_load_addr =
+            g_dap.target.ResolveLoadAddress(call_target_addr);
+        if (!call_target_load_addr.IsValid())
+          continue;
+
+        // The existing ThreadPlanStepInRange only accept step in target
+        // function with debug info.
+        lldb::SBSymbolContext sc = g_dap.target.ResolveSymbolContextForAddress(
+            call_target_load_addr, lldb::eSymbolContextFunction);
+
+        // The existing ThreadPlanStepInRange only accept step in target
+        // function with debug info.
+        std::string step_in_target_name;
+        if (sc.IsValid() && sc.GetFunction().IsValid())
+          step_in_target_name = sc.GetFunction().GetDisplayName();
+
+        // Skip call sites if we fail to resolve its symbol name.
+        if (step_in_target_name.empty())
+          continue;
+
+        g_dap.step_in_targets.try_emplace(inst_addr, step_in_target_name);
+        step_in_target.try_emplace("label", step_in_target_name);
+        step_in_targets.emplace_back(std::move(step_in_target));
+      }
+    }
+    llvm::json::Object body;
+    body.try_emplace("targets", std::move(step_in_targets));
+    response.try_emplace("body", std::move(body));
   } else {
     response["success"] = llvm::json::Value(false);
+    response["message"] = "Failed to get frame for input frameId.";
   }
   g_dap.SendJSON(llvm::json::Value(std::move(response)));
 }
@@ -3904,6 +4050,7 @@ void RegisterRequestCallbacks() {
   g_dap.RegisterRequestCallback("source", request_source);
   g_dap.RegisterRequestCallback("stackTrace", request_stackTrace);
   g_dap.RegisterRequestCallback("stepIn", request_stepIn);
+  g_dap.RegisterRequestCallback("stepInTargets", request_stepInTargets);
   g_dap.RegisterRequestCallback("stepOut", request_stepOut);
   g_dap.RegisterRequestCallback("threads", request_threads);
   g_dap.RegisterRequestCallback("variables", request_variables);
diff --git a/lldb/unittests/Host/HostTest.cpp b/lldb/unittests/Host/HostTest.cpp
index 5e01a6835c03..a1d8a3b7f485 100644
--- a/lldb/unittests/Host/HostTest.cpp
+++ b/lldb/unittests/Host/HostTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "lldb/Host/Host.h"
+#include "lldb/Utility/ProcessInfo.h"
 #include "gtest/gtest.h"
 
 using namespace lldb_private;
@@ -25,3 +26,23 @@ TEST(Host, GetEnvironment) {
   ASSERT_EQ("Host::GetEnvironment",
             Host::GetEnvironment().lookup("LLDB_TEST_ENVIRONMENT_VAR"));
 }
+
+TEST(Host, ProcessInstanceInfoCumulativeUserTimeIsValid) {
+  ProcessInstanceInfo info;
+  info.SetCumulativeUserTime(ProcessInstanceInfo::timespec{0, 0});
+  EXPECT_FALSE(info.CumulativeUserTimeIsValid());
+  info.SetCumulativeUserTime(ProcessInstanceInfo::timespec{0, 1});
+  EXPECT_TRUE(info.CumulativeUserTimeIsValid());
+  info.SetCumulativeUserTime(ProcessInstanceInfo::timespec{1, 0});
+  EXPECT_TRUE(info.CumulativeUserTimeIsValid());
+}
+
+TEST(Host, ProcessInstanceInfoCumulativeSystemTimeIsValid) {
+  ProcessInstanceInfo info;
+  info.SetCumulativeSystemTime(ProcessInstanceInfo::timespec{0, 0});
+  EXPECT_FALSE(info.CumulativeSystemTimeIsValid());
+  info.SetCumulativeSystemTime(ProcessInstanceInfo::timespec{0, 1});
+  EXPECT_TRUE(info.CumulativeSystemTimeIsValid());
+  info.SetCumulativeSystemTime(ProcessInstanceInfo::timespec{1, 0});
+  EXPECT_TRUE(info.CumulativeSystemTimeIsValid());
+}
+\ No newline at end of file
diff --git a/lldb/unittests/Host/linux/HostTest.cpp b/lldb/unittests/Host/linux/HostTest.cpp
index 733909902474..8ecaf3ec0dec 100644
--- a/lldb/unittests/Host/linux/HostTest.cpp
+++ b/lldb/unittests/Host/linux/HostTest.cpp
@@ -69,17 +69,21 @@ TEST_F(HostTest, GetProcessInfo) {
   EXPECT_EQ(HostInfo::GetArchitecture(HostInfo::eArchKindDefault),
             Info.GetArchitecture());
   // Test timings
-  /*
-   * This is flaky in the buildbots on all archs
+  // In some sense this is a pretty trivial test. What it is trying to
+  // accomplish is just to validate that these values are never decreasing
+  // which would be unambiguously wrong. We can not reliably show them
+  // to be always increasing because the microsecond granularity means that,
+  // with hardware variations the number of loop iterations need to always
+  // be increasing for faster and faster machines.
   ASSERT_TRUE(Host::GetProcessInfo(getpid(), Info));
   ProcessInstanceInfo::timespec user_time = Info.GetUserTime();
   static volatile unsigned u = 0;
   for (unsigned i = 0; i < 10'000'000; i++) {
-    u = i;
+    u += i;
   }
+  ASSERT_TRUE(u > 0);
   ASSERT_TRUE(Host::GetProcessInfo(getpid(), Info));
   ProcessInstanceInfo::timespec next_user_time = Info.GetUserTime();
-  ASSERT_TRUE(user_time.tv_sec < next_user_time.tv_sec ||
-              user_time.tv_usec < next_user_time.tv_usec);
-  */
+  ASSERT_TRUE(user_time.tv_sec <= next_user_time.tv_sec ||
+              user_time.tv_usec <= next_user_time.tv_usec);
 }
diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
index 0c2a0d735dc8..ee90855437a7 100644
--- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
+++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp
@@ -287,6 +287,141 @@ DWARF:
   ASSERT_EQ(found_function_types, expected_function_types);
 }
 
+TEST_F(DWARFASTParserClangTests, TestPtrAuthParsing) {
+  // Tests parsing values with type DW_TAG_LLVM_ptrauth_type corresponding to
+  // explicitly signed raw function pointers
+
+  // This is Dwarf for the following C code:
+  // ```
+  // void (*__ptrauth(0, 0, 42) a)();
+  // ```
+
+  const char *yamldata = R"(
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_AARCH64
+DWARF:
+  debug_str:
+    - a
+  debug_abbrev:
+    - ID:              0
+      Table:
+        - Code:            0x01
+          Tag:             DW_TAG_compile_unit
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_language
+              Form:            DW_FORM_data2
+        - Code:            0x02
+          Tag:             DW_TAG_variable
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_strp
+            - Attribute:       DW_AT_type
+              Form:            DW_FORM_ref4
+            - Attribute:       DW_AT_external
+              Form:            DW_FORM_flag_present
+        - Code:            0x03
+          Tag:             DW_TAG_LLVM_ptrauth_type
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_type
+              Form:            DW_FORM_ref4
+            - Attribute:       DW_AT_LLVM_ptrauth_key
+              Form:            DW_FORM_data1
+            - Attribute:       DW_AT_LLVM_ptrauth_extra_discriminator
+              Form:            DW_FORM_data2
+        - Code:            0x04
+          Tag:             DW_TAG_pointer_type
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_type
+              Form:            DW_FORM_ref4
+        - Code:            0x05
+          Tag:             DW_TAG_subroutine_type
+          Children:        DW_CHILDREN_yes
+        - Code:            0x06
+          Tag:             DW_TAG_unspecified_parameters
+          Children:        DW_CHILDREN_no
+
+  debug_info:
+    - Version:         5
+      UnitType:        DW_UT_compile
+      AddrSize:        8
+      Entries:
+# 0x0c: DW_TAG_compile_unit
+#         DW_AT_language [DW_FORM_data2]    (DW_LANG_C99)
+        - AbbrCode:        0x01
+          Values:
+            - Value:           0x0c
+
+# 0x0f:   DW_TAG_variable
+#           DW_AT_name [DW_FORM_strp]       (\"a\")
+#           DW_AT_type [DW_FORM_ref4]       (0x00000018 \"void (*__ptrauth(0, 0, 0x02a)\")
+#           DW_AT_external [DW_FORM_flag_present]   (true)
+        - AbbrCode:        0x02
+          Values:
+            - Value:           0x00
+            - Value:           0x18
+
+# 0x18:   DW_TAG_LLVM_ptrauth_type
+#           DW_AT_type [DW_FORM_ref4]       (0x00000020 \"void (*)(...)\")
+#           DW_AT_LLVM_ptrauth_key [DW_FORM_data1]  (0x00)
+#           DW_AT_LLVM_ptrauth_extra_discriminator [DW_FORM_data2]  (0x002a)
+        - AbbrCode:        0x03
+          Values:
+            - Value:           0x20
+            - Value:           0x00
+            - Value:           0x2a
+
+# 0x20:   DW_TAG_pointer_type
+#           DW_AT_type [DW_AT_type [DW_FORM_ref4]       (0x00000025 \"void (...)\")
+        - AbbrCode:        0x04
+          Values:
+            - Value:           0x25
+
+# 0x25:   DW_TAG_subroutine_type
+        - AbbrCode:        0x05
+
+# 0x26:     DW_TAG_unspecified_parameters
+        - AbbrCode:        0x06
+
+        - AbbrCode:        0x00 # end of child tags of 0x25
+        - AbbrCode:        0x00 # end of child tags of 0x0c
+...
+)";
+  YAMLModuleTester t(yamldata);
+
+  DWARFUnit *unit = t.GetDwarfUnit();
+  ASSERT_NE(unit, nullptr);
+  const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE();
+  ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit);
+  DWARFDIE cu_die(unit, cu_entry);
+
+  auto holder = std::make_unique<clang_utils::TypeSystemClangHolder>("ast");
+  auto &ast_ctx = *holder->GetAST();
+  DWARFASTParserClangStub ast_parser(ast_ctx);
+
+  DWARFDIE ptrauth_variable = cu_die.GetFirstChild();
+  ASSERT_EQ(ptrauth_variable.Tag(), DW_TAG_variable);
+  DWARFDIE ptrauth_type =
+      ptrauth_variable.GetAttributeValueAsReferenceDIE(DW_AT_type);
+  ASSERT_EQ(ptrauth_type.Tag(), DW_TAG_LLVM_ptrauth_type);
+
+  SymbolContext sc;
+  bool new_type = false;
+  lldb::TypeSP type_sp =
+      ast_parser.ParseTypeFromDWARF(sc, ptrauth_type, &new_type);
+  CompilerType compiler_type = type_sp->GetForwardCompilerType();
+  ASSERT_EQ(compiler_type.GetPtrAuthKey(), 0U);
+  ASSERT_EQ(compiler_type.GetPtrAuthAddressDiversity(), false);
+  ASSERT_EQ(compiler_type.GetPtrAuthDiscriminator(), 42U);
+}
+
 struct ExtractIntFromFormValueTest : public testing::Test {
   SubsystemRAII<FileSystem, HostInfo> subsystems;
   clang_utils::TypeSystemClangHolder holder;
diff --git a/lldb/utils/TableGen/LLDBTableGen.cpp b/lldb/utils/TableGen/LLDBTableGen.cpp
index abb6589f0ca6..c63ca76c0d48 100644
--- a/lldb/utils/TableGen/LLDBTableGen.cpp
+++ b/lldb/utils/TableGen/LLDBTableGen.cpp
@@ -67,7 +67,6 @@ int main(int argc, char **argv) {
   sys::PrintStackTraceOnErrorSignal(argv[0]);
   PrettyStackTraceProgram X(argc, argv);
   cl::ParseCommandLineOptions(argc, argv);
-
   llvm_shutdown_obj Y;
 
   return TableGenMain(argv[0], &LLDBTableGenMain);
diff --git a/lldb/utils/TableGen/LLDBTableGenBackends.h b/lldb/utils/TableGen/LLDBTableGenBackends.h
index 88ae0888c22d..b60c4705de3a 100644
--- a/lldb/utils/TableGen/LLDBTableGenBackends.h
+++ b/lldb/utils/TableGen/LLDBTableGenBackends.h
@@ -32,6 +32,7 @@ namespace lldb_private {
 void EmitOptionDefs(RecordKeeper &RK, raw_ostream &OS);
 void EmitPropertyDefs(RecordKeeper &RK, raw_ostream &OS);
 void EmitPropertyEnumDefs(RecordKeeper &RK, raw_ostream &OS);
+int EmitSBAPIDWARFEnum(int argc, char **argv);
 
 } // namespace lldb_private
 
diff --git a/lldb/utils/lldb-dotest/lldb-dotest.in b/lldb/utils/lldb-dotest/lldb-dotest.in
index 022425591f45..0e9648a6e6dc 100755
--- a/lldb/utils/lldb-dotest/lldb-dotest.in
+++ b/lldb/utils/lldb-dotest/lldb-dotest.in
@@ -14,6 +14,7 @@ lldb_build_intel_pt = "@LLDB_BUILD_INTEL_PT@"
 lldb_framework_dir = "@LLDB_FRAMEWORK_DIR_CONFIGURED@"
 lldb_libs_dir = "@LLDB_LIBS_DIR_CONFIGURED@"
 llvm_tools_dir = "@LLVM_TOOLS_DIR_CONFIGURED@"
+lldb_obj_root = "@LLDB_BINARY_DIR@"
 has_libcxx = @LLDB_HAS_LIBCXX@
 libcxx_libs_dir = "@LIBCXX_LIBRARY_DIR@"
 libcxx_include_dir = "@LIBCXX_GENERATED_INCLUDE_DIR@"
@@ -45,6 +46,7 @@ if __name__ == '__main__':
         cmd.extend(['--framework', lldb_framework_dir])
     if lldb_build_intel_pt == "1":
         cmd.extend(['--enable-plugin', 'intel-pt'])
+    cmd.extend(['--lldb-obj-root', lldb_obj_root])
     cmd.extend(wrapper_args)
     # Invoke dotest.py and return exit code.
     print(' '.join(cmd))
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 43181af3bc19..c06e661573ed 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -1156,6 +1156,11 @@ if (CMAKE_SYSTEM_NAME MATCHES "OS390")
   add_compile_definitions(_OPEN_SYS_FILE_EXT) # Needed for EBCDIC I/O.
   add_compile_definitions(_EXT) # Needed for file data.
   add_compile_definitions(_UNIX03_THREADS) # Multithreading support.
+  # Need to build LLVM as ASCII application.
+  # This can't be a global setting because other projects may
+  # need to be built in EBCDIC mode.
+  append("-fzos-le-char-mode=ascii" CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+  append("-m64" CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
 endif()
 
 # Build with _FILE_OFFSET_BITS=64 on Solaris to match g++ >= 9.
diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in
index 770a9caea322..397bd5815b64 100644
--- a/llvm/cmake/modules/LLVMConfig.cmake.in
+++ b/llvm/cmake/modules/LLVMConfig.cmake.in
@@ -162,6 +162,12 @@ endif()
 if(NOT TARGET acc_gen)
   add_custom_target(acc_gen)
 endif()
+if(NOT TARGET ARMTargetParserTableGen)
+  add_custom_target(ARMTargetParserTableGen)
+endif()
+if(NOT TARGET AArch64TargetParserTableGen)
+  add_custom_target(AArch64TargetParserTableGen)
+endif()
 if(NOT TARGET RISCVTargetParserTableGen)
   add_custom_target(RISCVTargetParserTableGen)
 endif()
diff --git a/llvm/cmake/modules/llvm-driver-template.cpp.in b/llvm/cmake/modules/llvm-driver-template.cpp.in
index 71aca6cd140c..1470ef1f0616 100644
--- a/llvm/cmake/modules/llvm-driver-template.cpp.in
+++ b/llvm/cmake/modules/llvm-driver-template.cpp.in
@@ -6,9 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Support/LLVMDriver.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/LLVMDriver.h"
 
 int @TOOL_NAME@_main(int argc, char **, const llvm::ToolContext &);
 
diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst
index 57d6280d57c8..a62acfc8fdcd 100644
--- a/llvm/docs/CommandGuide/llvm-objcopy.rst
+++ b/llvm/docs/CommandGuide/llvm-objcopy.rst
@@ -366,12 +366,12 @@ them.
 
 .. option:: --keep-global-symbol <symbol>, -G
 
- Make all symbols local in the output, except for symbols with the name
+ Mark all symbols local in the output, except for symbols with the name
  ``<symbol>``. Can be specified multiple times to ignore multiple symbols.
 
 .. option:: --keep-global-symbols <filename>
 
- Make all symbols local in the output, except for symbols named in the file
+ Mark all symbols local in the output, except for symbols named in the file
  ``<filename>``. In the file, each line represents a single symbol, with leading
  and trailing whitespace ignored, as is anything following a '#'. Can be
  specified multiple times to read names from multiple files.
@@ -395,7 +395,7 @@ them.
 
 .. option:: --localize-hidden
 
- Make all symbols with hidden or internal visibility local in the output.
+ Mark all symbols with hidden or internal visibility local in the output.
 
 .. option:: --localize-symbol <symbol>, -L
 
diff --git a/llvm/docs/ConvergenceAndUniformity.rst b/llvm/docs/ConvergenceAndUniformity.rst
index 0e97595508f9..863cebd91a20 100644
--- a/llvm/docs/ConvergenceAndUniformity.rst
+++ b/llvm/docs/ConvergenceAndUniformity.rst
@@ -10,34 +10,61 @@ Convergence And Uniformity
 Introduction
 ============
 
-Some parallel environments execute threads in groups that allow
-communication within the group using special primitives called
-*convergent* operations. The outcome of a convergent operation is
-sensitive to the set of threads that executes it "together", i.e.,
-convergently.
-
-A value is said to be *uniform* across a set of threads if it is the
-same across those threads, and *divergent* otherwise. Correspondingly,
-a branch is said to be a uniform branch if its condition is uniform,
-and it is a divergent branch otherwise.
-
-Whether threads are *converged* or not depends on the paths they take
-through the control flow graph. Threads take different outgoing edges
-at a *divergent branch*. Divergent branches constrain
+In some environments, groups of threads execute the same program in parallel,
+where efficient communication within a group is established using special
+primitives called :ref:`convergent operations<convergent_operations>`. The
+outcome of a convergent operation is sensitive to the set of threads that
+participate in it.
+
+The intuitive picture of *convergence* is built around threads executing in
+"lock step" --- a set of threads is thought of as *converged* if they are all
+executing "the same sequence of instructions together". Such threads may
+*diverge* at a *divergent branch*, and they may later *reconverge* at some
+common program point.
+
+In this intuitive picture, when converged threads execute an instruction, the
+resulting value is said to be *uniform* if it is the same in those threads, and
+*divergent* otherwise. Correspondingly, a branch is said to be a uniform branch
+if its condition is uniform, and it is a divergent branch otherwise.
+
+But the assumption of lock-step execution is not necessary for describing
+communication at convergent operations. It also constrains the implementation
+(compiler as well as hardware) by overspecifying how threads execute in such a
+parallel environment. To eliminate this assumption:
+
+- We define convergence as a relation between the execution of each instruction
+  by different threads and not as a relation between the threads themselves.
+  This definition is reasonable for known targets and is compatible with the
+  semantics of :ref:`convergent operations<convergent_operations>` in LLVM IR.
+- We also define uniformity in terms of this convergence. The output of an
+  instruction can be examined for uniformity across multiple threads only if the
+  corresponding executions of that instruction are converged.
+
+This document decribes a static analysis for determining convergence at each
+instruction in a function. The analysis extends previous work on divergence
+analysis [DivergenceSPMD]_ to cover irreducible control-flow. The described
+analysis is used in LLVM to implement a UniformityAnalysis that determines the
+uniformity of value(s) computed at each instruction in an LLVM IR or MIR
+function.
+
+.. [DivergenceSPMD] Julian Rosemann, Simon Moll, and Sebastian
+   Hack. 2021. An Abstract Interpretation for SPMD Divergence on
+   Reducible Control Flow Graphs. Proc. ACM Program. Lang. 5, POPL,
+   Article 31 (January 2021), 35 pages.
+   https://doi.org/10.1145/3434312
+
+Motivation
+==========
+
+Divergent branches constrain
 program transforms such as changing the CFG or moving a convergent
 operation to a different point of the CFG. Performing these
 transformations across a divergent branch can change the sets of
 threads that execute convergent operations convergently. While these
-constraints are out of scope for this document, the described
-*uniformity analysis* allows these transformations to identify
+constraints are out of scope for this document,
+uniformity analysis allows these transformations to identify
 uniform branches where these constraints do not hold.
 
-Convergence and
-uniformity are inter-dependent: When threads diverge at a divergent
-branch, they may later *reconverge* at a common program point.
-Subsequent operations are performed convergently, but the inputs may
-be non-uniform, thus producing divergent outputs.
-
 Uniformity is also useful by itself on targets that execute threads in
 groups with shared execution resources (e.g. waves, warps, or
 subgroups):
@@ -50,18 +77,6 @@ subgroups):
   branches, since the whole group of threads follows either one side
   of the branch or the other.
 
-This document presents a definition of convergence that is reasonable
-for real targets and is compatible with the currently implicit
-semantics of convergent operations in LLVM IR. This is accompanied by
-a *uniformity analysis* that extends previous work on divergence analysis
-[DivergenceSPMD]_ to cover irreducible control-flow.
-
-.. [DivergenceSPMD] Julian Rosemann, Simon Moll, and Sebastian
-   Hack. 2021. An Abstract Interpretation for SPMD Divergence on
-   Reducible Control Flow Graphs. Proc. ACM Program. Lang. 5, POPL,
-   Article 31 (January 2021), 35 pages.
-   https://doi.org/10.1145/3434312
-
 Terminology
 ===========
 
@@ -133,12 +148,6 @@ meaning. Dynamic instances listed in the same column are converged.
 Convergence
 ===========
 
-*Converged-with* is a transitive symmetric relation over dynamic
-instances produced by *different threads* for the *same static
-instance*. Informally, two threads that produce converged dynamic
-instances are said to be *converged*, and they are said to execute
-that static instance *convergently*, at that point in the execution.
-
 *Convergence-before* is a strict partial order over dynamic instances
 that is defined as the transitive closure of:
 
@@ -171,11 +180,16 @@ to be converged (i.e., related to each other in the converged-with
 relation). The resulting convergence order includes the edges ``P ->
 Q2``, ``Q1 -> R``, ``P -> R``, ``P -> T``, etc.
 
-The fact that *convergence-before* is a strict partial order is a
-constraint on the *converged-with* relation. It is trivially satisfied
-if different dynamic instances are never converged. It is also
-trivially satisfied for all known implementations for which
-convergence plays some role.
+*Converged-with* is a transitive symmetric relation over dynamic instances
+produced by *different threads* for the *same static instance*.
+
+It is impractical to provide any one definition for the *converged-with*
+relation, since different environments may wish to relate dynamic instances in
+different ways. The fact that *convergence-before* is a strict partial order is
+a constraint on the *converged-with* relation. It is trivially satisfied if
+different dynamic instances are never converged. Below, we provide a relation
+called :ref:`maximal converged-with<convergence-maximal>`, which satisifies
+*convergence-before* and is suitable for known targets.
 
 .. _convergence-note-convergence:
 
@@ -217,14 +231,16 @@ iterations of parent cycles as well.
 
    Dynamic instances ``X1`` and ``X2`` produced by different threads
    for the same static instance ``X`` are converged in the maximal
-   converged-with relation if and only if for every cycle ``C`` with
-   header ``H`` that contains ``X``:
-
-   - every dynamic instance ``H1`` of ``H`` that precedes ``X1`` in
-     the respective thread is convergence-before ``X2``, and,
-   - every dynamic instance ``H2`` of ``H`` that precedes ``X2`` in
-     the respective thread is convergence-before ``X1``,
-   - without assuming that ``X1`` is converged with ``X2``.
+   converged-with relation if and only if:
+
+   - ``X`` is not contained in any cycle, or,
+   - For every cycle ``C`` with header ``H`` that contains ``X``:
+
+     - every dynamic instance ``H1`` of ``H`` that precedes ``X1`` in
+       the respective thread is convergence-before ``X2``, and,
+     - every dynamic instance ``H2`` of ``H`` that precedes ``X2`` in
+       the respective thread is convergence-before ``X1``,
+     - without assuming that ``X1`` is converged with ``X2``.
 
 .. note::
 
diff --git a/llvm/docs/ConvergentOperations.rst b/llvm/docs/ConvergentOperations.rst
index 332675f3edef..5081efffc89a 100644
--- a/llvm/docs/ConvergentOperations.rst
+++ b/llvm/docs/ConvergentOperations.rst
@@ -936,7 +936,8 @@ property <uniformity-analysis>` of static instances in the convergence region of
      1. Both threads executed converged dynamic instances of every token
         definition ``D`` such that ``X`` is in the convergence region of ``D``,
         and,
-     2. For every cycle ``C`` with header ``H`` that contains ``X``:
+     2. Either ``X`` is not contained in any cycle, or, for every cycle ``C``
+        with header ``H`` that contains ``X``:
 
         - every dynamic instance ``H1`` of ``H`` that precedes ``X1`` in the
           respective thread is convergence-before ``X2``, and,
diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index a4247796cb65..a45d73a9a3d4 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -158,11 +158,6 @@ what to add to your calendar invite.
      - `ics <https://calendar.google.com/calendar/ical/lowrisc.org_0n5pkesfjcnp0bh5hps1p0bd80%40group.calendar.google.com/public/basic.ics>`__
        `gcal <https://calendar.google.com/calendar/b/1?cid=bG93cmlzYy5vcmdfMG41cGtlc2ZqY25wMGJoNWhwczFwMGJkODBAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ>`__
      - `Minutes/docs <https://docs.google.com/document/d/1G3ocHm2zE6AYTS2N3_3w2UxFnSEyKkcF57siLWe-NVs>`__
-   * - Scalable Vectors and Arm SVE
-     - Monthly, every 3rd Tuesday
-     - `ics <https://calendar.google.com/calendar/ical/bjms39pe6k6bo5egtsp7don414%40group.calendar.google.com/public/basic.ics>`__
-       `gcal <https://calendar.google.com/calendar/u/0/embed?src=bjms39pe6k6bo5egtsp7don414@group.calendar.google.com>`__
-     - `Minutes/docs <https://docs.google.com/document/d/1UPH2Hzou5RgGT8XfO39OmVXKEibWPfdYLELSaHr3xzo/edit>`__
    * - ML Guided Compiler Optimizations
      - Monthly
      -
@@ -176,10 +171,6 @@ what to add to your calendar invite.
      - Weekly, on Wednesday
      -
      - `Minutes/docs <https://docs.google.com/document/d/1fOSRdyZR2w75D87yU2Ma9h2-_lEPL4NxvhJGJd-s5pk/edit#heading=h.mulvhjtr8dk9>`__
-   * - `MLIR <https://mlir.llvm.org>`__ design meetings
-     - Weekly, on Thursdays
-     -
-     - `Minutes/docs <https://docs.google.com/document/d/1y_9f1AbfgcoVdJh4_aM6-BaSHvrHl8zuA5G4jv_94K8/edit#heading=h.cite1kolful9>`__
    * - flang
      - Multiple meeting series, `documented here <https://github.com/llvm/llvm-project/blob/main/flang/docs/GettingInvolved.md#calls>`__
      -
@@ -192,19 +183,10 @@ what to add to your calendar invite.
      - Every 4 weeks on Tuesdays
      - `ics <http://lists.llvm.org/pipermail/llvm-dev/attachments/20201103/a3499a67/attachment-0001.ics>`__
      - `Minutes/docs <https://docs.google.com/document/d/17U-WvX8qyKc3S36YUKr3xfF-GHunWyYowXbxEdpHscw>`__
-   * - Vector Predication
-     - Every 2 weeks on Tuesdays, 3pm UTC
-     -
-     - `Minutes/docs <https://docs.google.com/document/d/1q26ToudQjnqN5x31zk8zgq_s0lem1-BF8pQmciLa4k8/edit?usp=sharing>`__
    * - LLVM Pointer Authentication
      - Every month on Mondays
      - `ics <https://calendar.google.com/calendar/ical/fr1qtmrmt2s9odufjvurkb6j70%40group.calendar.google.com/public/basic.ics>`__
      - `Minutes/docs <https://discourse.llvm.org/t/llvm-pointer-authentication-sync-ups/62661>`__
-   * - MemorySSA in LLVM
-     - Every 8 weeks on Mondays
-     - `ics <https://calendar.google.com/calendar/ical/c_1mincouiltpa24ac14of14lhi4%40group.calendar.google.com/public/basic.ics>`__
-       `gcal <https://calendar.google.com/calendar/embed?src=c_1mincouiltpa24ac14of14lhi4%40group.calendar.google.com>`__
-     - `Minutes/docs <https://docs.google.com/document/d/1-uEEZfmRdPThZlctOq9eXlmUaSSAAi8oKxhrPY_lpjk/edit#>`__
    * - LLVM Embedded Toolchains
      - Every 4 weeks on Thursdays
      - `ics <https://drive.google.com/file/d/1uNa-PFYkhAfT83kR2Nc4Fi706TAQFBEL/view?usp=sharing>`__
@@ -222,14 +204,49 @@ what to add to your calendar invite.
      - Every 2 weeks on Mondays
      - `gcal <https://calendar.google.com/calendar/u/0?cid=c3ljbC5sbHZtLndnQGdtYWlsLmNvbQ>`__
      - `Meeting details/agenda <https://docs.google.com/document/d/1ivYDSn_5ChTeiZ7TiO64WC_jYJnGwAUiT9Ngi9cAdFU/edit?usp=sharing>`__
+   * - Floating Point Working Group
+     - Every 3rd Wednesday of the month
+     - `ics <https://calendar.google.com/calendar/ical/02582507bac79d186900712566ec3fc69b33ac24d7de0a8c76c7b19976f190c0%40group.calendar.google.com/private-6e35506dbfe13812e92e9aa8cd5d761d/basic.ics>`__
+       `gcal <https://calendar.google.com/calendar/u/0?cid=MDI1ODI1MDdiYWM3OWQxODY5MDA3MTI1NjZlYzNmYzY5YjMzYWMyNGQ3ZGUwYThjNzZjN2IxOTk3NmYxOTBjMEBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__
+     - `Meeting details/agenda: <https://docs.google.com/document/d/1QcmUlWftPlBi-Wz6b6PipqJfvjpJ-OuRMRnN9Dm2t0c>`__
+
+Past online sync-ups
+^^^^^^^^^^^^^^^^^^^^
+
+Some online sync-ups are no longer happening. We keep pointing to them here to
+keep track of the meeting notes and in case anyone would want to revive them in
+the future.
+
+.. list-table:: LLVM no-longer-happening sync-up calls
+   :widths: 25 25 25 25
+   :header-rows: 1
+
+   * - Topic
+     - Frequency
+     - Calendar link
+     - Minutes/docs link
+   * - Scalable Vectors and Arm SVE
+     - Monthly, every 3rd Tuesday
+     - `ics <https://calendar.google.com/calendar/ical/bjms39pe6k6bo5egtsp7don414%40group.calendar.google.com/public/basic.ics>`__
+       `gcal <https://calendar.google.com/calendar/u/0/embed?src=bjms39pe6k6bo5egtsp7don414@group.calendar.google.com>`__
+     - `Minutes/docs <https://docs.google.com/document/d/1UPH2Hzou5RgGT8XfO39OmVXKEibWPfdYLELSaHr3xzo/edit>`__
+   * - MemorySSA in LLVM
+     - Every 8 weeks on Mondays
+     - `ics <https://calendar.google.com/calendar/ical/c_1mincouiltpa24ac14of14lhi4%40group.calendar.google.com/public/basic.ics>`__
+       `gcal <https://calendar.google.com/calendar/embed?src=c_1mincouiltpa24ac14of14lhi4%40group.calendar.google.com>`__
+     - `Minutes/docs <https://docs.google.com/document/d/1-uEEZfmRdPThZlctOq9eXlmUaSSAAi8oKxhrPY_lpjk/edit#>`__
    * - GlobalISel
      - Every 2nd Tuesday of the month
      - `gcal <https://calendar.google.com/calendar/u/0?cid=ZDcyMjc0ZjZiZjNhMzFlYmE3NTNkMWM2MGM2NjM5ZWU3ZDE2MjM4MGFlZDc2ZjViY2UyYzMwNzVhZjk4MzQ4ZEBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__
      - `Meeting details/agenda <https://docs.google.com/document/d/1Ry8O4-Tm5BFj9AMjr8qTQFU80z-ptiNQ62687NaIvLs/edit?usp=sharing>`__
-   * - Floating Point Working Group
-     - Every 3rd Wednesday of the month
-     - `gcal <https://calendar.google.com/calendar/u/0?cid=MDI1ODI1MDdiYWM3OWQxODY5MDA3MTI1NjZlYzNmYzY5YjMzYWMyNGQ3ZGUwYThjNzZjN2IxOTk3NmYxOTBjMEBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__
-     - `Meeting details/agenda: <https://docs.google.com/document/d/1QcmUlWftPlBi-Wz6b6PipqJfvjpJ-OuRMRnN9Dm2t0c>`__
+   * - Vector Predication
+     - Every 2 weeks on Tuesdays, 3pm UTC
+     -
+     - `Minutes/docs <https://docs.google.com/document/d/1q26ToudQjnqN5x31zk8zgq_s0lem1-BF8pQmciLa4k8/edit?usp=sharing>`__
+   * - `MLIR <https://mlir.llvm.org>`__ design meetings
+     - Weekly, on Thursdays
+     -
+     - `Minutes/docs <https://docs.google.com/document/d/1y_9f1AbfgcoVdJh4_aM6-BaSHvrHl8zuA5G4jv_94K8/edit#heading=h.cite1kolful9>`__
 
 .. _office-hours:
 
diff --git a/llvm/docs/GlobalISel/MIRPatterns.rst b/llvm/docs/GlobalISel/MIRPatterns.rst
index 728e32470144..d7dce1b978cd 100644
--- a/llvm/docs/GlobalISel/MIRPatterns.rst
+++ b/llvm/docs/GlobalISel/MIRPatterns.rst
@@ -514,3 +514,40 @@ of operands.
     (match (does_not_bind $tmp, $x)
            (G_MUL $dst, $x, $tmp)),
     (apply (COPY $dst, $x))>;
+
+
+
+
+Gallery
+=======
+
+We should use precise patterns that state our intentions. Please avoid
+using wip_match_opcode in patterns.
+
+.. code-block:: text
+  :caption: Example fold zext(trunc:nuw)
+
+  // Imprecise: matches any G_ZEXT
+  def zext : GICombineRule<
+    (defs root:$root),
+    (match (wip_match_opcode G_ZEXT):$root,
+    [{ return Helper.matchZextOfTrunc(*${root}, ${matchinfo}); }]),
+    (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
+
+  // Imprecise: matches G_ZEXT of G_TRUNC
+  def zext_of_trunc : GICombineRule<
+    (defs root:$root),
+    (match (G_TRUNC $src, $x),
+           (G_ZEXT $root, $src),
+    [{ return Helper.matchZextOfTrunc(${root}, ${matchinfo}); }]),
+    (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
+
+
+  // Precise: matches G_ZEXT of G_TRUNC with nuw flag
+  def zext_of_trunc_nuw : GICombineRule<
+    (defs root:$root),
+    (match (G_TRUNC $src, $x, (MIFlags NoUWrap)),
+           (G_ZEXT $root, $src),
+    [{ return Helper.matchZextOfTrunc(${root}, ${matchinfo}); }]),
+    (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 0e87a8e2ace0..6291a4e57919 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -2447,12 +2447,14 @@ example:
     memory access, I/O, or other synchronization.  The ``mustprogress``
     attribute is intended to model the requirements of the first section of
     [intro.progress] of the C++ Standard. As a consequence, a loop in a
-    function with the `mustprogress` attribute can be assumed to terminate if
+    function with the ``mustprogress`` attribute can be assumed to terminate if
     it does not interact with the environment in an observable way, and
-    terminating loops without side-effects can be removed. If a `mustprogress`
-    function does not satisfy this contract, the behavior is undefined.  This
-    attribute does not apply transitively to callees, but does apply to call
-    sites within the function. Note that `willreturn` implies `mustprogress`.
+    terminating loops without side-effects can be removed. If a ``mustprogress``
+    function does not satisfy this contract, the behavior is undefined. If a
+    ``mustprogress`` function calls a function not marked ``mustprogress``,
+    and that function never returns, the program is well-defined even if there
+    isn't any other observable progress.  Note that ``willreturn`` implies
+    ``mustprogress``.
 ``"warn-stack-size"="<threshold>"``
     This attribute sets a threshold to emit diagnostics once the frame size is
     known should the frame size exceed the specified value.  It takes one
@@ -14139,6 +14141,41 @@ Semantics:
 """"""""""
 See description of '``llvm.instrprof.increment``' intrinsic.
 
+'``llvm.instrprof.callsite``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare void @llvm.instrprof.callsite(ptr <name>, i64 <hash>,
+                                            i32 <num-counters>,
+                                            i32 <index>, ptr <callsite>)
+
+Overview:
+"""""""""
+
+.. FIXME: detail when it's emitted once the support is added
+
+The '``llvm.instrprof.callsite``' intrinsic should be emitted before a callsite
+that's not to a "fake" callee (like another intrinsic or asm).
+
+Arguments:
+""""""""""
+The first 4 arguments are similar to ``llvm.instrprof.increment``. The indexing
+is specific to callsites, meaning callsites are indexed from 0, independent from
+the indexes used by the other intrinsics (such as 
+``llvm.instrprof.increment[.step]``).
+
+The last argument is the called value of the callsite this intrinsic precedes.
+
+Semantics:
+""""""""""
+.. FIXME: detail how when the lowering pass is added.
+
+This is lowered by contextual profiling.
+
 '``llvm.instrprof.timestamp``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -18770,7 +18807,7 @@ runtime, then the result vector is a :ref:`poison value <poisonvalues>`. The
 ``idx`` parameter must be a vector index constant type (for most targets this
 will be an integer pointer type).
 
-'``llvm.experimental.vector.reverse``' Intrinsic
+'``llvm.vector.reverse``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
@@ -18779,25 +18816,26 @@ This is an overloaded intrinsic.
 
 ::
 
-      declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
-      declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+      declare <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8> %a)
+      declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reverse.*``' intrinsics reverse a vector.
+The '``llvm.vector.reverse.*``' intrinsics reverse a vector.
 The intrinsic takes a single vector and returns a vector of matching type but
 with the original lane order reversed. These intrinsics work for both fixed
-and scalable vectors. While this intrinsic is marked as experimental the
-recommended way to express reverse operations for fixed-width vectors is still
-to use a shufflevector, as that may allow for more optimization opportunities.
+and scalable vectors. While this intrinsic supports all vector types
+the recommended way to express this operation for fixed-width vectors is
+still to use a shufflevector, as that may allow for more optimization
+opportunities.
 
 Arguments:
 """"""""""
 
 The argument to this intrinsic must be a vector.
 
-'``llvm.experimental.vector.deinterleave2``' Intrinsic
+'``llvm.vector.deinterleave2``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
@@ -18806,13 +18844,13 @@ This is an overloaded intrinsic.
 
 ::
 
-      declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec1)
-      declare {<vscale x 4 x i32>, <vscale x 4 x i32>}  @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec1)
+      declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec1)
+      declare {<vscale x 4 x i32>, <vscale x 4 x i32>}  @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec1)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.deinterleave2``' intrinsic constructs two
+The '``llvm.vector.deinterleave2``' intrinsic constructs two
 vectors by deinterleaving the even and odd lanes of the input vector.
 
 This intrinsic works for both fixed and scalable vectors. While this intrinsic
@@ -18824,7 +18862,7 @@ For example:
 
 .. code-block:: text
 
-  {<2 x i64>, <2 x i64>} llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> <i64 0, i64 1, i64 2, i64 3>); ==> {<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>}
+  {<2 x i64>, <2 x i64>} llvm.vector.deinterleave2.v4i64(<4 x i64> <i64 0, i64 1, i64 2, i64 3>); ==> {<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>}
 
 Arguments:
 """"""""""
@@ -18832,7 +18870,7 @@ Arguments:
 The argument is a vector whose type corresponds to the logical concatenation of
 the two result types.
 
-'``llvm.experimental.vector.interleave2``' Intrinsic
+'``llvm.vector.interleave2``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
@@ -18841,13 +18879,13 @@ This is an overloaded intrinsic.
 
 ::
 
-      declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %vec1, <2 x double> %vec2)
-      declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2)
+      declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %vec1, <2 x double> %vec2)
+      declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.interleave2``' intrinsic constructs a vector
+The '``llvm.vector.interleave2``' intrinsic constructs a vector
 by interleaving two input vectors.
 
 This intrinsic works for both fixed and scalable vectors. While this intrinsic
@@ -18859,7 +18897,7 @@ For example:
 
 .. code-block:: text
 
-   <4 x i64> llvm.experimental.vector.interleave2.v4i64(<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>); ==> <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+   <4 x i64> llvm.vector.interleave2.v4i64(<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>); ==> <4 x i64> <i64 0, i64 1, i64 2, i64 3>
 
 Arguments:
 """"""""""
@@ -18905,7 +18943,7 @@ The '``llvm.experimental.cttz.elts``' intrinsic counts the trailing (least
 significant) zero elements in a vector. If ``src == 0`` the result is the
 number of elements in the input vector.
 
-'``llvm.experimental.vector.splice``' Intrinsic
+'``llvm.vector.splice``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
@@ -18914,13 +18952,13 @@ This is an overloaded intrinsic.
 
 ::
 
-      declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %vec1, <2 x double> %vec2, i32 %imm)
-      declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, i32 %imm)
+      declare <2 x double> @llvm.vector.splice.v2f64(<2 x double> %vec1, <2 x double> %vec2, i32 %imm)
+      declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, i32 %imm)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.splice.*``' intrinsics construct a vector by
+The '``llvm.vector.splice.*``' intrinsics construct a vector by
 concatenating elements from the first input vector with elements of the second
 input vector, returning a vector of the same type as the input vectors. The
 signed immediate, modulo the number of elements in the vector, is the index
@@ -18931,7 +18969,7 @@ immediate, it extracts ``-imm`` trailing elements from the first vector, and
 the remaining elements from ``%vec2``.
 
 These intrinsics work for both fixed and scalable vectors. While this intrinsic
-is marked as experimental, the recommended way to express this operation for
+supports all vector types the recommended way to express this operation for
 fixed-width vectors is still to use a shufflevector, as that may allow for more
 optimization opportunities.
 
@@ -18939,8 +18977,8 @@ For example:
 
 .. code-block:: text
 
- llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, 1);  ==> <B, C, D, E> index
- llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, -3); ==> <B, C, D, E> trailing elements
+ llvm.vector.splice(<A,B,C,D>, <E,F,G,H>, 1);  ==> <B, C, D, E> index
+ llvm.vector.splice(<A,B,C,D>, <E,F,G,H>, -3); ==> <B, C, D, E> trailing elements
 
 
 Arguments:
@@ -22163,7 +22201,7 @@ Overview:
 """""""""
 
 The '``llvm.experimental.vp.splice.*``' intrinsic is the vector length
-predicated version of the '``llvm.experimental.vector.splice.*``' intrinsic.
+predicated version of the '``llvm.vector.splice.*``' intrinsic.
 
 Arguments:
 """"""""""
@@ -22222,7 +22260,7 @@ Overview:
 """""""""
 
 The '``llvm.experimental.vp.reverse.*``' intrinsic is the vector length
-predicated version of the '``llvm.experimental.vector.reverse.*``' intrinsic.
+predicated version of the '``llvm.vector.reverse.*``' intrinsic.
 
 Arguments:
 """"""""""
@@ -23965,6 +24003,54 @@ Examples:
       %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison
 
 
+.. _int_vp_cttz_elts:
+
+'``llvm.vp.cttz.elts.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. You can use ```llvm.vp.cttz.elts``` on any
+vector of integer elements, both fixed width and scalable.
+
+::
+
+      declare i32  @llvm.vp.cttz.elts.i32.v16i32 (<16 x i32> <op>, i1 <is_zero_poison>, <16 x i1> <mask>, i32 <vector_length>)
+      declare i64  @llvm.vp.cttz.elts.i64.nxv4i32 (<vscale x 4 x i32> <op>, i1 <is_zero_poison>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare i64  @llvm.vp.cttz.elts.i64.v256i1 (<256 x i1> <op>, i1 <is_zero_poison>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+This '```llvm.vp.cttz.elts```' intrinsic counts the number of trailing zero
+elements of a vector. This is basically the vector-predicated version of
+'```llvm.experimental.cttz.elts```'.
+
+Arguments:
+""""""""""
+
+The first argument is the vector to be counted. This argument must be a vector
+with integer element type. The return type must also be an integer type which is
+wide enough to hold the maximum number of elements of the source vector. The
+behavior of this intrinsic is undefined if the return type is not wide enough
+for the number of elements in the input vector.
+
+The second argument is a constant flag that indicates whether the intrinsic
+returns a valid result if the first argument is all zero.
+
+The third operand is the vector mask and has the same number of elements as the
+input vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.cttz.elts``' intrinsic counts the trailing (least
+significant / lowest-numbered) zero elements in the first operand on each
+enabled lane. If the first argument is all zero and the second argument is true,
+the result is poison. Otherwise, it returns the explicit vector length (i.e. the
+fourth operand).
+
 .. _int_vp_sadd_sat:
 
 '``llvm.vp.sadd.sat.*``' Intrinsics
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 6ef6ec20da67..d8cc667723f5 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -50,7 +50,11 @@ Update on required toolchains to build LLVM
 Changes to the LLVM IR
 ----------------------
 
-- Added Memory Model Relaxation Annotations (MMRAs).
+* Added Memory Model Relaxation Annotations (MMRAs).
+* Renamed ``llvm.experimental.vector.reverse`` intrinsic to ``llvm.vector.reverse``.
+* Renamed ``llvm.experimental.vector.splice`` intrinsic to ``llvm.vector.splice``.
+* Renamed ``llvm.experimental.vector.interleave2`` intrinsic to ``llvm.vector.interleave2``.
+* Renamed ``llvm.experimental.vector.deinterleave2`` intrinsic to ``llvm.vector.deinterleave2``.
 
 Changes to LLVM infrastructure
 ------------------------------
@@ -69,7 +73,8 @@ Changes to Interprocedural Optimizations
 Changes to the AArch64 Backend
 ------------------------------
 
-* Added support for Cortex-A78AE, Cortex-A520AE and Cortex-A720AE CPUs.
+* Added support for Cortex-A78AE, Cortex-A520AE, Cortex-A720AE,
+  Cortex-R82AE, Neoverse-N3, Neoverse-V3 and Neoverse-V3AE CPUs.
 
 Changes to the AMDGPU Backend
 -----------------------------
@@ -112,6 +117,8 @@ Changes to the RISC-V Backend
 * The experimental Ssqosid extension is supported.
 * Zacas is no longer experimental.
 * Added the CSR names from the Resumable Non-Maskable Interrupts (Smrnmi) extension.
+* llvm-objdump now prints disassembled opcode bytes in groups of 2 or 4 bytes to
+  match GNU objdump. The bytes within the groups are in big endian order.
 
 Changes to the WebAssembly Backend
 ----------------------------------
diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h
index dab1d697761b..2c3c75e246c0 100644
--- a/llvm/include/llvm-c/DebugInfo.h
+++ b/llvm/include/llvm-c/DebugInfo.h
@@ -125,7 +125,20 @@ typedef enum {
   LLVMDWARFSourceLanguageFortran18,
   LLVMDWARFSourceLanguageAda2005,
   LLVMDWARFSourceLanguageAda2012,
+  LLVMDWARFSourceLanguageHIP,
+  LLVMDWARFSourceLanguageAssembly,
+  LLVMDWARFSourceLanguageC_sharp,
   LLVMDWARFSourceLanguageMojo,
+  LLVMDWARFSourceLanguageGLSL,
+  LLVMDWARFSourceLanguageGLSL_ES,
+  LLVMDWARFSourceLanguageHLSL,
+  LLVMDWARFSourceLanguageOpenCL_CPP,
+  LLVMDWARFSourceLanguageCPP_for_OpenCL,
+  LLVMDWARFSourceLanguageSYCL,
+  LLVMDWARFSourceLanguageRuby,
+  LLVMDWARFSourceLanguageMove,
+  LLVMDWARFSourceLanguageHylo,
+
   // Vendor extensions:
   LLVMDWARFSourceLanguageMips_Assembler,
   LLVMDWARFSourceLanguageGOOGLE_RenderScript,
diff --git a/llvm/include/llvm/ADT/FoldingSet.h b/llvm/include/llvm/ADT/FoldingSet.h
index 28a84d965883..f82eabd5044b 100644
--- a/llvm/include/llvm/ADT/FoldingSet.h
+++ b/llvm/include/llvm/ADT/FoldingSet.h
@@ -16,7 +16,6 @@
 #ifndef LLVM_ADT_FOLDINGSET_H
 #define LLVM_ADT_FOLDINGSET_H
 
-#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/STLForwardCompat.h"
 #include "llvm/ADT/SmallVector.h"
@@ -355,13 +354,6 @@ public:
     AddInteger(unsigned(I));
     AddInteger(unsigned(I >> 32));
   }
-  void AddInteger(const APInt &Int) {
-    AddInteger(Int.getBitWidth());
-    const auto *Parts = Int.getRawData();
-    for (int i = 0, N = Int.getNumWords(); i < N; ++i) {
-      AddInteger(Parts[i]);
-    }
-  }
 
   void AddBoolean(bool B) { AddInteger(B ? 1U : 0U); }
   void AddString(StringRef String);
diff --git a/llvm/include/llvm/ADT/StringRef.h b/llvm/include/llvm/ADT/StringRef.h
index 0360174c5231..04496c76e072 100644
--- a/llvm/include/llvm/ADT/StringRef.h
+++ b/llvm/include/llvm/ADT/StringRef.h
@@ -258,6 +258,9 @@ namespace llvm {
       return Length >= Prefix.Length &&
              compareMemory(Data, Prefix.Data, Prefix.Length) == 0;
     }
+    [[nodiscard]] bool starts_with(char Prefix) const {
+      return !empty() && front() == Prefix;
+    }
 
     /// Check if this string starts with the given \p Prefix, ignoring case.
     [[nodiscard]] bool starts_with_insensitive(StringRef Prefix) const;
@@ -268,6 +271,9 @@ namespace llvm {
              compareMemory(end() - Suffix.Length, Suffix.Data, Suffix.Length) ==
                  0;
     }
+    [[nodiscard]] bool ends_with(char Suffix) const {
+      return !empty() && back() == Suffix;
+    }
 
     /// Check if this string ends with the given \p Suffix, ignoring case.
     [[nodiscard]] bool ends_with_insensitive(StringRef Suffix) const;
diff --git a/llvm/include/llvm/ADT/TypeSwitch.h b/llvm/include/llvm/ADT/TypeSwitch.h
index 10a2d48e918d..5bbbdf23b257 100644
--- a/llvm/include/llvm/ADT/TypeSwitch.h
+++ b/llvm/include/llvm/ADT/TypeSwitch.h
@@ -61,29 +61,9 @@ public:
   }
 
 protected:
-  /// Trait to check whether `ValueT` provides a 'dyn_cast' method with type
-  /// `CastT`.
-  template <typename ValueT, typename CastT>
-  using has_dyn_cast_t =
-      decltype(std::declval<ValueT &>().template dyn_cast<CastT>());
-
-  /// Attempt to dyn_cast the given `value` to `CastT`. This overload is
-  /// selected if `value` already has a suitable dyn_cast method.
+  /// Attempt to dyn_cast the given `value` to `CastT`.
   template <typename CastT, typename ValueT>
-  static decltype(auto) castValue(
-      ValueT &&value,
-      std::enable_if_t<is_detected<has_dyn_cast_t, ValueT, CastT>::value> * =
-          nullptr) {
-    return value.template dyn_cast<CastT>();
-  }
-
-  /// Attempt to dyn_cast the given `value` to `CastT`. This overload is
-  /// selected if llvm::dyn_cast should be used.
-  template <typename CastT, typename ValueT>
-  static decltype(auto) castValue(
-      ValueT &&value,
-      std::enable_if_t<!is_detected<has_dyn_cast_t, ValueT, CastT>::value> * =
-          nullptr) {
+  static decltype(auto) castValue(ValueT &&value) {
     return dyn_cast<CastT>(value);
   }
 
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def
index 37221eb9e471..717693a7cf63 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def
@@ -471,6 +471,11 @@ TLI_DEFINE_ENUM_INTERNAL(cxa_atexit)
 TLI_DEFINE_STRING_INTERNAL("__cxa_atexit")
 TLI_DEFINE_SIG_INTERNAL(Int, Ptr, Ptr, Ptr)
 
+/// int atexit(void (*f)(void));
+TLI_DEFINE_ENUM_INTERNAL(atexit)
+TLI_DEFINE_STRING_INTERNAL("atexit")
+TLI_DEFINE_SIG_INTERNAL(Int, Ptr)
+
 /// void __cxa_guard_abort(guard_t *guard);
 /// guard_t is int64_t in Itanium ABI or int32_t on ARM eabi.
 TLI_DEFINE_ENUM_INTERNAL(cxa_guard_abort)
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 571e44cdac26..afd18e7e56ba 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -860,7 +860,8 @@ enum class OverflowResult {
 };
 
 OverflowResult computeOverflowForUnsignedMul(const Value *LHS, const Value *RHS,
-                                             const SimplifyQuery &SQ);
+                                             const SimplifyQuery &SQ,
+                                             bool IsNSW = false);
 OverflowResult computeOverflowForSignedMul(const Value *LHS, const Value *RHS,
                                            const SimplifyQuery &SQ);
 OverflowResult
diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.def b/llvm/include/llvm/BinaryFormat/Dwarf.def
index 460a9264536b..adcf24eb83b0 100644
--- a/llvm/include/llvm/BinaryFormat/Dwarf.def
+++ b/llvm/include/llvm/BinaryFormat/Dwarf.def
@@ -11,19 +11,20 @@
 //===----------------------------------------------------------------------===//
 
 // TODO: Add other DW-based macros.
-#if !(                                                                         \
-    defined HANDLE_DW_TAG || defined HANDLE_DW_AT || defined HANDLE_DW_FORM || \
-    defined HANDLE_DW_OP || defined HANDLE_DW_OP_LLVM_USEROP ||                    \
-    defined HANDLE_DW_LANG || defined HANDLE_DW_ATE ||                         \
-    defined HANDLE_DW_VIRTUALITY || defined HANDLE_DW_DEFAULTED ||             \
-    defined HANDLE_DW_CC || defined HANDLE_DW_LNS || defined HANDLE_DW_LNE ||  \
-    defined HANDLE_DW_LNCT || defined HANDLE_DW_MACRO ||                       \
-    defined HANDLE_DW_MACRO_GNU || defined HANDLE_MACRO_FLAG ||                \
-    defined HANDLE_DW_RLE || defined HANDLE_DW_LLE ||                          \
-    (defined HANDLE_DW_CFA && defined HANDLE_DW_CFA_PRED) ||                   \
-    defined HANDLE_DW_APPLE_PROPERTY || defined HANDLE_DW_UT ||                \
-    defined HANDLE_DWARF_SECTION || defined HANDLE_DW_IDX ||                   \
-    defined HANDLE_DW_END || defined HANDLE_DW_SECT)
+#if !(defined HANDLE_DW_TAG || defined HANDLE_DW_AT ||                         \
+      defined HANDLE_DW_FORM || defined HANDLE_DW_OP ||                        \
+      defined HANDLE_DW_OP_LLVM_USEROP || defined HANDLE_DW_LANG ||            \
+      defined HANDLE_DW_LNAME || defined HANDLE_DW_ATE ||                      \
+      defined HANDLE_DW_VIRTUALITY || defined HANDLE_DW_DEFAULTED ||           \
+      defined HANDLE_DW_CC || defined HANDLE_DW_LNS ||                         \
+      defined HANDLE_DW_LNE || defined HANDLE_DW_LNCT ||                       \
+      defined HANDLE_DW_MACRO || defined HANDLE_DW_MACRO_GNU ||                \
+      defined HANDLE_MACRO_FLAG || defined HANDLE_DW_RLE ||                    \
+      defined HANDLE_DW_LLE ||                                                 \
+      (defined HANDLE_DW_CFA && defined HANDLE_DW_CFA_PRED) ||                 \
+      defined HANDLE_DW_APPLE_PROPERTY || defined HANDLE_DW_UT ||              \
+      defined HANDLE_DWARF_SECTION || defined HANDLE_DW_IDX ||                 \
+      defined HANDLE_DW_END || defined HANDLE_DW_SECT)
 #error "Missing macro definition of HANDLE_DW*"
 #endif
 
@@ -61,6 +62,10 @@
 #define HANDLE_DW_LANG(ID, NAME, LOWER_BOUND, VERSION, VENDOR)
 #endif
 
+#ifndef HANDLE_DW_LNAME
+#define HANDLE_DW_LNAME(ID, NAME, DESC, LOWER_BOUND)
+#endif
+
 #ifndef HANDLE_DW_ATE
 #define HANDLE_DW_ATE(ID, NAME, VERSION, VENDOR)
 #endif
@@ -950,12 +955,81 @@ HANDLE_DW_LANG(0x002c, C17, 0, 0, DWARF)
 HANDLE_DW_LANG(0x002d, Fortran18, 0, 0, DWARF)
 HANDLE_DW_LANG(0x002e, Ada2005, 0, 0, DWARF)
 HANDLE_DW_LANG(0x002f, Ada2012, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0030, HIP, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0031, Assembly, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0032, C_sharp, 0, 0, DWARF)
 HANDLE_DW_LANG(0x0033, Mojo, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0034, GLSL, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0035, GLSL_ES, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0036, HLSL, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0037, OpenCL_CPP, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0038, CPP_for_OpenCL, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0039, SYCL, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0040, Ruby, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0041, Move, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0042, Hylo, 0, 0, DWARF)
+
 // Vendor extensions:
 HANDLE_DW_LANG(0x8001, Mips_Assembler, std::nullopt, 0, MIPS)
 HANDLE_DW_LANG(0x8e57, GOOGLE_RenderScript, 0, 0, GOOGLE)
 HANDLE_DW_LANG(0xb000, BORLAND_Delphi, 0, 0, BORLAND)
 
+// Tentative DWARF 6 language codes. This list is subject to change.
+HANDLE_DW_LNAME(0x0001, Ada, "ISO Ada", 1) // YYYY
+HANDLE_DW_LNAME(0x0002, BLISS, "BLISS", 0)
+//     YYYYMM
+// K&R 000000
+// C89 198912
+// C99 199901
+// C11 201112
+// C17 201710
+// C23 202311
+HANDLE_DW_LNAME(0x0003, C, "C (K&R and ISO)", 0)
+//       YYYYMM
+// C++98 199711
+// C++03 200310
+// C++11 201103
+// C++14 201402
+// C++17 201703
+// C++20 202002
+HANDLE_DW_LNAME(0x0004, C_plus_plus, "ISO C++", 0)
+HANDLE_DW_LNAME(0x0005, Cobol, "ISO Cobol", 1) // YYYY
+HANDLE_DW_LNAME(0x0006, Crystal, "Crystal", 0)
+HANDLE_DW_LNAME(0x0007, D, "D", 0)
+HANDLE_DW_LNAME(0x0008, Dylan, "Dylan", 0)
+HANDLE_DW_LNAME(0x0009, Fortran, "ISO Fortran", 1) // YYYY
+HANDLE_DW_LNAME(0x000a, Go, "Go", 0)
+HANDLE_DW_LNAME(0x000b, Haskell, "Haskell", 0)
+HANDLE_DW_LNAME(0x000c, Java, "Java", 0)
+HANDLE_DW_LNAME(0x000d, Julia, "Julia", 1)
+HANDLE_DW_LNAME(0x000e, Kotlin, "Kotlin", 0)
+HANDLE_DW_LNAME(0x000f, Modula2, "Modula 2", 1)
+HANDLE_DW_LNAME(0x0010, Modula3, "Modula 3", 1)
+HANDLE_DW_LNAME(0x0011, ObjC, "Objective C", 0) // YYYYMM
+HANDLE_DW_LNAME(0x0012, ObjC_plus_plus, "Objective C++", 0) // YYYYMM
+HANDLE_DW_LNAME(0x0013, OCaml, "OCaml", 0)
+HANDLE_DW_LNAME(0x0014, OpenCL_C, "OpenCL C", 0)
+HANDLE_DW_LNAME(0x0015, Pascal, "ISO Pascal", 1) // YYYY
+HANDLE_DW_LNAME(0x0016, PLI, "ANSI PL/I", 1)
+HANDLE_DW_LNAME(0x0017, Python, "Python", 0)
+HANDLE_DW_LNAME(0x0018, RenderScript, "RenderScript Kernel Language", 0)
+HANDLE_DW_LNAME(0x0019, Rust, "Rust", 0)
+HANDLE_DW_LNAME(0x001a, Swift, "Swift", 0) // VVMM
+HANDLE_DW_LNAME(0x001b, UPC, "Unified Parallel C (UPC)", 0)
+HANDLE_DW_LNAME(0x001c, Zig, "Zig", 0)
+HANDLE_DW_LNAME(0x001d, Assembly, "Assembly", 0)
+// Conflict: HANDLE_DW_LNAME(0x001d, HIP, "HIP", 0)
+HANDLE_DW_LNAME(0x001e, C_sharp, "C#", 0)
+HANDLE_DW_LNAME(0x001f, Mojo, "Mojo", 0)
+HANDLE_DW_LNAME(0x0020, GLSL, "OpenGL Shading Language", 0) // VVMMPP
+HANDLE_DW_LNAME(0x0021, GLSL_ES, "OpenGL ES Shading Language", 0) // VVMMPP
+HANDLE_DW_LNAME(0x0022, HLSL, "High Level Shading Language", 0) // YYYY
+HANDLE_DW_LNAME(0x0023, OpenCL_CPP, "OpenCL C++", 0) // VVMM
+HANDLE_DW_LNAME(0x0024, CPP_for_OpenCL, "C++ for OpenCL", 0) // VVMM
+HANDLE_DW_LNAME(0x0025, SYCL, "SYCL", 0) // YYYYRR
+HANDLE_DW_LNAME(0x0026, Ruby, "Ruby", 0) // VVMMPP
+HANDLE_DW_LNAME(0x0027, Move, "Move", 0) // YYYYMM
+HANDLE_DW_LNAME(0x0028, Hylo, "Hylo", 0)
 
 // DWARF attribute type encodings.
 HANDLE_DW_ATE(0x01, address, 2, DWARF)
@@ -1267,6 +1341,7 @@ HANDLE_DW_SECT(8, RNGLISTS)
 #undef HANDLE_DW_OP
 #undef HANDLE_DW_OP_LLVM_USEROP
 #undef HANDLE_DW_LANG
+#undef HANDLE_DW_LNAME
 #undef HANDLE_DW_ATE
 #undef HANDLE_DW_VIRTUALITY
 #undef HANDLE_DW_DEFAULTED
diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.h b/llvm/include/llvm/BinaryFormat/Dwarf.h
index 298700c8941e..74c4d6ff3a71 100644
--- a/llvm/include/llvm/BinaryFormat/Dwarf.h
+++ b/llvm/include/llvm/BinaryFormat/Dwarf.h
@@ -209,6 +209,284 @@ enum SourceLanguage {
   DW_LANG_hi_user = 0xffff
 };
 
+enum SourceLanguageName : uint16_t {
+#define HANDLE_DW_LNAME(ID, NAME, DESC, LOWER_BOUND) DW_LNAME_##NAME = ID,
+#include "llvm/BinaryFormat/Dwarf.def"
+};
+
+/// Convert a DWARF 6 pair of language name and version to a DWARF 5 DW_LANG.
+/// If the version number doesn't exactly match a known version it is
+/// rounded up to the next-highest known version number.
+inline std::optional<SourceLanguage> toDW_LANG(SourceLanguageName name,
+                                               uint32_t version) {
+  switch (name) {
+  case DW_LNAME_Ada: // YYYY
+    if (version <= 1983)
+      return DW_LANG_Ada83;
+    if (version <= 1995)
+      return DW_LANG_Ada95;
+    if (version <= 2005)
+      return DW_LANG_Ada2005;
+    if (version <= 2012)
+      return DW_LANG_Ada2012;
+    return {};
+  case DW_LNAME_BLISS:
+    return DW_LANG_BLISS;
+  case DW_LNAME_C: // YYYYMM, K&R 000000
+    if (version == 0)
+      return DW_LANG_C;
+    if (version <= 198912)
+      return DW_LANG_C89;
+    if (version <= 199901)
+      return DW_LANG_C99;
+    if (version <= 201112)
+      return DW_LANG_C11;
+    if (version <= 201710)
+      return DW_LANG_C17;
+    return {};
+  case DW_LNAME_C_plus_plus: // YYYYMM
+    if (version == 0)
+      return DW_LANG_C_plus_plus;
+    if (version <= 199711)
+      return DW_LANG_C_plus_plus;
+    if (version <= 200310)
+      return DW_LANG_C_plus_plus_03;
+    if (version <= 201103)
+      return DW_LANG_C_plus_plus_11;
+    if (version <= 201402)
+      return DW_LANG_C_plus_plus_14;
+    if (version <= 201703)
+      return DW_LANG_C_plus_plus_17;
+    if (version <= 202002)
+      return DW_LANG_C_plus_plus_20;
+    return {};
+  case DW_LNAME_Cobol: // YYYY
+    if (version <= 1974)
+      return DW_LANG_Cobol74;
+    if (version <= 1985)
+      return DW_LANG_Cobol85;
+    return {};
+  case DW_LNAME_Crystal:
+    return DW_LANG_Crystal;
+  case DW_LNAME_D:
+    return DW_LANG_D;
+  case DW_LNAME_Dylan:
+    return DW_LANG_Dylan;
+  case DW_LNAME_Fortran: // YYYY
+    if (version <= 1977)
+      return DW_LANG_Fortran77;
+    if (version <= 1990)
+      return DW_LANG_Fortran90;
+    if (version <= 1995)
+      return DW_LANG_Fortran95;
+    if (version <= 2003)
+      return DW_LANG_Fortran03;
+    if (version <= 2008)
+      return DW_LANG_Fortran08;
+    if (version <= 2018)
+      return DW_LANG_Fortran18;
+    return {};
+  case DW_LNAME_Go:
+    return DW_LANG_Go;
+  case DW_LNAME_Haskell:
+    return DW_LANG_Haskell;
+  // case DW_LNAME_HIP:
+  //   return DW_LANG_HIP;
+  case DW_LNAME_Java:
+    return DW_LANG_Java;
+  case DW_LNAME_Julia:
+    return DW_LANG_Julia;
+  case DW_LNAME_Kotlin:
+    return DW_LANG_Kotlin;
+  case DW_LNAME_Modula2:
+    return DW_LANG_Modula2;
+  case DW_LNAME_Modula3:
+    return DW_LANG_Modula3;
+  case DW_LNAME_ObjC:
+    return DW_LANG_ObjC;
+  case DW_LNAME_ObjC_plus_plus:
+    return DW_LANG_ObjC_plus_plus;
+  case DW_LNAME_OCaml:
+    return DW_LANG_OCaml;
+  case DW_LNAME_OpenCL_C:
+    return DW_LANG_OpenCL;
+  case DW_LNAME_Pascal:
+    return DW_LANG_Pascal83;
+  case DW_LNAME_PLI:
+    return DW_LANG_PLI;
+  case DW_LNAME_Python:
+    return DW_LANG_Python;
+  case DW_LNAME_RenderScript:
+    return DW_LANG_RenderScript;
+  case DW_LNAME_Rust:
+    return DW_LANG_Rust;
+  case DW_LNAME_Swift:
+    return DW_LANG_Swift;
+  case DW_LNAME_UPC:
+    return DW_LANG_UPC;
+  case DW_LNAME_Zig:
+    return DW_LANG_Zig;
+  case DW_LNAME_Assembly:
+    return DW_LANG_Assembly;
+  case DW_LNAME_C_sharp:
+    return DW_LANG_C_sharp;
+  case DW_LNAME_Mojo:
+    return DW_LANG_Mojo;
+  case DW_LNAME_GLSL:
+    return DW_LANG_GLSL;
+  case DW_LNAME_GLSL_ES:
+    return DW_LANG_GLSL_ES;
+  case DW_LNAME_HLSL:
+    return DW_LANG_HLSL;
+  case DW_LNAME_OpenCL_CPP:
+    return DW_LANG_OpenCL_CPP;
+  case DW_LNAME_CPP_for_OpenCL:
+    return {};
+  case DW_LNAME_SYCL:
+    return DW_LANG_SYCL;
+  case DW_LNAME_Ruby:
+    return DW_LANG_Ruby;
+  case DW_LNAME_Move:
+    return DW_LANG_Move;
+  case DW_LNAME_Hylo:
+    return DW_LANG_Hylo;
+  }
+  return {};
+}
+
+/// Convert a DWARF 5 DW_LANG to a DWARF 6 pair of language name and version.
+inline std::optional<std::pair<SourceLanguageName, uint32_t>>
+toDW_LNAME(SourceLanguage language) {
+  switch (language) {
+  case DW_LANG_Ada83:
+    return {{DW_LNAME_Ada, 1983}};
+  case DW_LANG_Ada95:
+    return {{DW_LNAME_Ada, 1995}};
+  case DW_LANG_Ada2005:
+    return {{DW_LNAME_Ada, 2005}};
+  case DW_LANG_Ada2012:
+    return {{DW_LNAME_Ada, 2012}};
+  case DW_LANG_BLISS:
+    return {{DW_LNAME_BLISS, 0}};
+  case DW_LANG_C:
+    return {{DW_LNAME_C, 0}};
+  case DW_LANG_C89:
+    return {{DW_LNAME_C, 198912}};
+  case DW_LANG_C99:
+    return {{DW_LNAME_C, 199901}};
+  case DW_LANG_C11:
+    return {{DW_LNAME_C, 201112}};
+  case DW_LANG_C17:
+    return {{DW_LNAME_C, 201712}};
+  case DW_LANG_C_plus_plus:
+    return {{DW_LNAME_C_plus_plus, 0}};
+  case DW_LANG_C_plus_plus_03:
+    return {{DW_LNAME_C_plus_plus, 200310}};
+  case DW_LANG_C_plus_plus_11:
+    return {{DW_LNAME_C_plus_plus, 201103}};
+  case DW_LANG_C_plus_plus_14:
+    return {{DW_LNAME_C_plus_plus, 201402}};
+  case DW_LANG_C_plus_plus_17:
+    return {{DW_LNAME_C_plus_plus, 201703}};
+  case DW_LANG_C_plus_plus_20:
+    return {{DW_LNAME_C_plus_plus, 202002}};
+  case DW_LANG_Cobol74:
+    return {{DW_LNAME_Cobol, 1974}};
+  case DW_LANG_Cobol85:
+    return {{DW_LNAME_Cobol, 1985}};
+  case DW_LANG_Crystal:
+    return {{DW_LNAME_Crystal, 0}};
+  case DW_LANG_D:
+    return {{DW_LNAME_D, 0}};
+  case DW_LANG_Dylan:
+    return {{DW_LNAME_Dylan, 0}};
+  case DW_LANG_Fortran77:
+    return {{DW_LNAME_Fortran, 1977}};
+  case DW_LANG_Fortran90:
+    return {{DW_LNAME_Fortran, 1990}};
+  case DW_LANG_Fortran95:
+    return {{DW_LNAME_Fortran, 1995}};
+  case DW_LANG_Fortran03:
+    return {{DW_LNAME_Fortran, 2003}};
+  case DW_LANG_Fortran08:
+    return {{DW_LNAME_Fortran, 2008}};
+  case DW_LANG_Fortran18:
+    return {{DW_LNAME_Fortran, 2018}};
+  case DW_LANG_Go:
+    return {{DW_LNAME_Go, 0}};
+  case DW_LANG_Haskell:
+    return {{DW_LNAME_Haskell, 0}};
+  case DW_LANG_HIP:
+    return {}; // return {{DW_LNAME_HIP, 0}};
+  case DW_LANG_Java:
+    return {{DW_LNAME_Java, 0}};
+  case DW_LANG_Julia:
+    return {{DW_LNAME_Julia, 0}};
+  case DW_LANG_Kotlin:
+    return {{DW_LNAME_Kotlin, 0}};
+  case DW_LANG_Modula2:
+    return {{DW_LNAME_Modula2, 0}};
+  case DW_LANG_Modula3:
+    return {{DW_LNAME_Modula3, 0}};
+  case DW_LANG_ObjC:
+    return {{DW_LNAME_ObjC, 0}};
+  case DW_LANG_ObjC_plus_plus:
+    return {{DW_LNAME_ObjC_plus_plus, 0}};
+  case DW_LANG_OCaml:
+    return {{DW_LNAME_OCaml, 0}};
+  case DW_LANG_OpenCL:
+    return {{DW_LNAME_OpenCL_C, 0}};
+  case DW_LANG_Pascal83:
+    return {{DW_LNAME_Pascal, 1983}};
+  case DW_LANG_PLI:
+    return {{DW_LNAME_PLI, 0}};
+  case DW_LANG_Python:
+    return {{DW_LNAME_Python, 0}};
+  case DW_LANG_RenderScript:
+  case DW_LANG_GOOGLE_RenderScript:
+    return {{DW_LNAME_RenderScript, 0}};
+  case DW_LANG_Rust:
+    return {{DW_LNAME_Rust, 0}};
+  case DW_LANG_Swift:
+    return {{DW_LNAME_Swift, 0}};
+  case DW_LANG_UPC:
+    return {{DW_LNAME_UPC, 0}};
+  case DW_LANG_Zig:
+    return {{DW_LNAME_Zig, 0}};
+  case DW_LANG_Assembly:
+  case DW_LANG_Mips_Assembler:
+    return {{DW_LNAME_Assembly, 0}};
+  case DW_LANG_C_sharp:
+    return {{DW_LNAME_C_sharp, 0}};
+  case DW_LANG_Mojo:
+    return {{DW_LNAME_Mojo, 0}};
+  case DW_LANG_GLSL:
+    return {{DW_LNAME_GLSL, 0}};
+  case DW_LANG_GLSL_ES:
+    return {{DW_LNAME_GLSL_ES, 0}};
+  case DW_LANG_HLSL:
+    return {{DW_LNAME_HLSL, 0}};
+  case DW_LANG_OpenCL_CPP:
+    return {{DW_LNAME_OpenCL_CPP, 0}};
+  case DW_LANG_SYCL:
+    return {{DW_LNAME_SYCL, 0}};
+  case DW_LANG_Ruby:
+    return {{DW_LNAME_Ruby, 0}};
+  case DW_LANG_Move:
+    return {{DW_LNAME_Move, 0}};
+  case DW_LANG_Hylo:
+    return {{DW_LNAME_Hylo, 0}};
+  case DW_LANG_BORLAND_Delphi:
+  case DW_LANG_CPP_for_OpenCL:
+  case DW_LANG_lo_user:
+  case DW_LANG_hi_user:
+    return {};
+  }
+  return {};
+}
+
+llvm::StringRef LanguageDescription(SourceLanguageName name);
+
 inline bool isCPlusPlus(SourceLanguage S) {
   bool result = false;
   // Deliberately enumerate all the language options so we get a warning when
@@ -268,7 +546,19 @@ inline bool isCPlusPlus(SourceLanguage S) {
   case DW_LANG_Fortran18:
   case DW_LANG_Ada2005:
   case DW_LANG_Ada2012:
+  case DW_LANG_HIP:
+  case DW_LANG_Assembly:
+  case DW_LANG_C_sharp:
   case DW_LANG_Mojo:
+  case DW_LANG_GLSL:
+  case DW_LANG_GLSL_ES:
+  case DW_LANG_HLSL:
+  case DW_LANG_OpenCL_CPP:
+  case DW_LANG_CPP_for_OpenCL:
+  case DW_LANG_SYCL:
+  case DW_LANG_Ruby:
+  case DW_LANG_Move:
+  case DW_LANG_Hylo:
     result = false;
     break;
   }
@@ -335,7 +625,19 @@ inline bool isFortran(SourceLanguage S) {
   case DW_LANG_C17:
   case DW_LANG_Ada2005:
   case DW_LANG_Ada2012:
+  case DW_LANG_HIP:
+  case DW_LANG_Assembly:
+  case DW_LANG_C_sharp:
   case DW_LANG_Mojo:
+  case DW_LANG_GLSL:
+  case DW_LANG_GLSL_ES:
+  case DW_LANG_HLSL:
+  case DW_LANG_OpenCL_CPP:
+  case DW_LANG_CPP_for_OpenCL:
+  case DW_LANG_SYCL:
+  case DW_LANG_Ruby:
+  case DW_LANG_Move:
+  case DW_LANG_Hylo:
     result = false;
     break;
   }
@@ -400,7 +702,19 @@ inline bool isC(SourceLanguage S) {
   case DW_LANG_Fortran18:
   case DW_LANG_Ada2005:
   case DW_LANG_Ada2012:
+  case DW_LANG_HIP:
+  case DW_LANG_Assembly:
+  case DW_LANG_C_sharp:
   case DW_LANG_Mojo:
+  case DW_LANG_GLSL:
+  case DW_LANG_GLSL_ES:
+  case DW_LANG_HLSL:
+  case DW_LANG_OpenCL_CPP:
+  case DW_LANG_CPP_for_OpenCL:
+  case DW_LANG_SYCL:
+  case DW_LANG_Ruby:
+  case DW_LANG_Move:
+  case DW_LANG_Hylo:
     return false;
   }
   llvm_unreachable("Unknown language kind.");
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 56b5d4e399c6..f296acc2ca4b 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -1939,11 +1939,12 @@ uint16_t convertArchNameToEMachine(StringRef Arch);
 /// Convert an ELF's e_machine value into an architecture name.
 StringRef convertEMachineToArchName(uint16_t EMachine);
 
-/// Convert a OS into ELF's EI_OSABI value.
-uint8_t convertOSToOSAbi(StringRef OS);
+// Convert a lowercase string identifier into an OSABI value.
+uint8_t convertNameToOSABI(StringRef Name);
 
-/// Convert an ELF's e_machine value into an architecture name.
-StringRef convertOSAbiToOS(uint8_t OSAbi);
+// Convert an OSABI value into a string that identifies the OS- or ABI-
+// specific ELF extension.
+StringRef convertOSABIToName(uint8_t OSABI);
 
 } // end namespace ELF
 } // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 4a3a03dc5ad4..92b51438b4cb 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1662,12 +1662,12 @@ public:
           TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
           std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
     }
-    case Intrinsic::experimental_vector_reverse: {
+    case Intrinsic::vector_reverse: {
       return thisT()->getShuffleCost(
           TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
           CostKind, 0, cast<VectorType>(RetTy));
     }
-    case Intrinsic::experimental_vector_splice: {
+    case Intrinsic::vector_splice: {
       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
       return thisT()->getShuffleCost(
           TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
diff --git a/llvm/include/llvm/CodeGen/FreeMachineFunction.h b/llvm/include/llvm/CodeGen/FreeMachineFunction.h
deleted file mode 100644
index 5f21c6720350..000000000000
--- a/llvm/include/llvm/CodeGen/FreeMachineFunction.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===- llvm/CodeGen/FreeMachineFunction.h -----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CODEGEN_FREEMACHINEFUNCTION_H
-#define LLVM_CODEGEN_FREEMACHINEFUNCTION_H
-
-#include "llvm/CodeGen/MachinePassManager.h"
-
-namespace llvm {
-
-class FreeMachineFunctionPass : public PassInfoMixin<FreeMachineFunctionPass> {
-public:
-  PreservedAnalyses run(MachineFunction &MF,
-                        MachineFunctionAnalysisManager &MFAM);
-};
-
-} // namespace llvm
-
-#endif // LLVM_CODEGEN_FREEMACHINEFUNCTION_H
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 4b8aec8e8a5d..76e8d1166ae0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -853,6 +853,9 @@ public:
   bool matchExtractVectorElementWithDifferentIndices(const MachineOperand &MO,
                                                      BuildFnTy &MatchInfo);
 
+  /// Combine insert vector element OOB.
+  bool matchInsertVectorElementOOB(MachineInstr &MI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 5f28908e998a..deae2c55d26e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -247,8 +247,8 @@ private:
   bool translateTrap(const CallInst &U, MachineIRBuilder &MIRBuilder,
                      unsigned Opcode);
 
-  // Translate @llvm.experimental.vector.interleave2 and
-  // @llvm.experimental.vector.deinterleave2 intrinsics for fixed-width vector
+  // Translate @llvm.vector.interleave2 and
+  // @llvm.vector.deinterleave2 intrinsics for fixed-width vector
   // types into vector shuffles.
   bool translateVectorInterleave2Intrinsic(const CallInst &CI,
                                            MachineIRBuilder &MIRBuilder);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h b/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h
index 0f20a33f3a75..7990997835d0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h
@@ -35,11 +35,23 @@ struct LegalityQuery;
 class MachineRegisterInfo;
 namespace GISelAddressing {
 /// Helper struct to store a base, index and offset that forms an address
-struct BaseIndexOffset {
+class BaseIndexOffset {
+private:
   Register BaseReg;
   Register IndexReg;
-  int64_t Offset = 0;
-  bool IsIndexSignExt = false;
+  std::optional<int64_t> Offset;
+
+public:
+  BaseIndexOffset() = default;
+  Register getBase() { return BaseReg; }
+  Register getBase() const { return BaseReg; }
+  Register getIndex() { return IndexReg; }
+  Register getIndex() const { return IndexReg; }
+  void setBase(Register NewBase) { BaseReg = NewBase; }
+  void setIndex(Register NewIndex) { IndexReg = NewIndex; }
+  void setOffset(std::optional<int64_t> NewOff) { Offset = NewOff; }
+  bool hasValidOffset() const { return Offset.has_value(); }
+  int64_t getOffset() const { return *Offset; }
 };
 
 /// Returns a BaseIndexOffset which describes the pointer in \p Ptr.
@@ -89,7 +101,7 @@ private:
     // order stores are writing to incremeneting consecutive addresses. So when
     // we walk the block in reverse order, the next eligible store must write to
     // an offset one store width lower than CurrentLowestOffset.
-    uint64_t CurrentLowestOffset;
+    int64_t CurrentLowestOffset;
     SmallVector<GStore *> Stores;
     // A vector of MachineInstr/unsigned pairs to denote potential aliases that
     // need to be checked before the candidate is considered safe to merge. The
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
index c4174cee5e10..70421a518ab7 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -559,5 +559,31 @@ void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI);
 /// having only floating-point operands.
 bool isPreISelGenericFloatingPointOpcode(unsigned Opc);
 
+/// Returns true if \p Reg can create undef or poison from non-undef &
+/// non-poison operands. \p ConsiderFlagsAndMetadata controls whether poison
+/// producing flags and metadata on the instruction are considered. This can be
+/// used to see if the instruction could still introduce undef or poison even
+/// without poison generating flags and metadata which might be on the
+/// instruction.
+bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI,
+                            bool ConsiderFlagsAndMetadata = true);
+
+/// Returns true if \p Reg can create poison from non-poison operands.
+bool canCreatePoison(Register Reg, const MachineRegisterInfo &MRI,
+                     bool ConsiderFlagsAndMetadata = true);
+
+/// Returns true if \p Reg cannot be poison and undef.
+bool isGuaranteedNotToBeUndefOrPoison(Register Reg,
+                                      const MachineRegisterInfo &MRI,
+                                      unsigned Depth = 0);
+
+/// Returns true if \p Reg cannot be poison, but may be undef.
+bool isGuaranteedNotToBePoison(Register Reg, const MachineRegisterInfo &MRI,
+                               unsigned Depth = 0);
+
+/// Returns true if \p Reg cannot be undef, but may be poison.
+bool isGuaranteedNotToBeUndef(Register Reg, const MachineRegisterInfo &MRI,
+                              unsigned Depth = 0);
+
 } // End namespace llvm.
 #endif
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 078a936b061a..6429947958ee 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -205,6 +205,7 @@ enum NodeType {
   /// CopyFromReg - This node indicates that the input value is a virtual or
   /// physical register that is defined outside of the scope of this
   /// SelectionDAG.  The register is available from the RegisterSDNode object.
+  /// Note that CopyFromReg is considered as also freezing the value.
   CopyFromReg,
 
   /// UNDEF - An undefined node.
diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIRParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIRParser.h
index e1606e7c0ea7..ae0938a48a71 100644
--- a/llvm/include/llvm/CodeGen/MIRParser/MIRParser.h
+++ b/llvm/include/llvm/CodeGen/MIRParser/MIRParser.h
@@ -34,6 +34,9 @@ class MachineModuleInfo;
 class SMDiagnostic;
 class StringRef;
 
+template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager;
+using ModuleAnalysisManager = AnalysisManager<Module>;
+
 typedef llvm::function_ref<std::optional<std::string>(StringRef, StringRef)>
     DataLayoutCallbackTy;
 
@@ -60,6 +63,15 @@ public:
   ///
   /// \returns true if an error occurred.
   bool parseMachineFunctions(Module &M, MachineModuleInfo &MMI);
+
+  /// Parses MachineFunctions in the MIR file and add them as the result
+  /// of MachineFunctionAnalysis in ModulePassManager \p MAM.
+  /// User should register at least MachineFunctionAnalysis,
+  /// MachineModuleAnalysis, FunctionAnalysisManagerModuleProxy and
+  /// PassInstrumentationAnalysis in \p MAM before parsing MIR.
+  ///
+  /// \returns true if an error occurred.
+  bool parseMachineFunctions(Module &M, ModuleAnalysisManager &MAM);
 };
 
 /// This function is the main interface to the MIR serialization format parser.
diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index d71cdc63414d..36896a84a712 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -645,6 +645,7 @@ struct MachineFrameInfo {
   bool HasVAStart = false;
   bool HasMustTailInVarArgFunc = false;
   bool HasTailCall = false;
+  bool IsCalleeSavedInfoValid = false;
   unsigned LocalFrameSize = 0;
   StringValue SavePoint;
   StringValue RestorePoint;
@@ -668,7 +669,8 @@ struct MachineFrameInfo {
            HasMustTailInVarArgFunc == Other.HasMustTailInVarArgFunc &&
            HasTailCall == Other.HasTailCall &&
            LocalFrameSize == Other.LocalFrameSize &&
-           SavePoint == Other.SavePoint && RestorePoint == Other.RestorePoint;
+           SavePoint == Other.SavePoint && RestorePoint == Other.RestorePoint &&
+           IsCalleeSavedInfoValid == Other.IsCalleeSavedInfoValid;
   }
 };
 
@@ -696,6 +698,8 @@ template <> struct MappingTraits<MachineFrameInfo> {
     YamlIO.mapOptional("hasMustTailInVarArgFunc", MFI.HasMustTailInVarArgFunc,
                        false);
     YamlIO.mapOptional("hasTailCall", MFI.HasTailCall, false);
+    YamlIO.mapOptional("isCalleeSavedInfoValid", MFI.IsCalleeSavedInfoValid,
+                       false);
     YamlIO.mapOptional("localFrameSize", MFI.LocalFrameSize, (unsigned)0);
     YamlIO.mapOptional("savePoint", MFI.SavePoint,
                        StringValue()); // Don't print it out when it's empty.
diff --git a/llvm/include/llvm/CodeGen/MachineFunctionAnalysis.h b/llvm/include/llvm/CodeGen/MachineFunctionAnalysis.h
new file mode 100644
index 000000000000..f54d455292da
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/MachineFunctionAnalysis.h
@@ -0,0 +1,50 @@
+//===- llvm/CodeGen/MachineFunctionAnalysis.h -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MachineFunctionAnalysis class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MACHINEFUNCTIONANALYSIS
+#define LLVM_CODEGEN_MACHINEFUNCTIONANALYSIS
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class MachineFunction;
+class LLVMTargetMachine;
+
+/// This analysis create MachineFunction for given Function.
+/// To release the MachineFunction, users should invalidate it explicitly.
+class MachineFunctionAnalysis
+    : public AnalysisInfoMixin<MachineFunctionAnalysis> {
+  friend AnalysisInfoMixin<MachineFunctionAnalysis>;
+
+  static AnalysisKey Key;
+
+  const LLVMTargetMachine *TM;
+
+public:
+  class Result {
+    std::unique_ptr<MachineFunction> MF;
+
+  public:
+    Result(std::unique_ptr<MachineFunction> MF) : MF(std::move(MF)) {}
+    MachineFunction &getMF() { return *MF; };
+    bool invalidate(Function &, const PreservedAnalyses &PA,
+                    FunctionAnalysisManager::Invalidator &);
+  };
+
+  MachineFunctionAnalysis(const LLVMTargetMachine *TM) : TM(TM){};
+  Result run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_MachineFunctionAnalysis
diff --git a/llvm/include/llvm/CodeGen/MachineModuleInfo.h b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
index 5f66d1ada0d0..92ea3c902ce9 100644
--- a/llvm/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineModuleInfo.h
@@ -147,10 +147,14 @@ public:
 
   /// Returns the MachineFunction constructed for the IR function \p F.
   /// Creates a new MachineFunction if none exists yet.
+  /// NOTE: New pass manager clients shall not use this method to get
+  /// the `MachineFunction`, use `MachineFunctionAnalysis` instead.
   MachineFunction &getOrCreateMachineFunction(Function &F);
 
   /// \brief Returns the MachineFunction associated to IR function \p F if there
   /// is one, otherwise nullptr.
+  /// NOTE: New pass manager clients shall not use this method to get
+  /// the `MachineFunction`, use `MachineFunctionAnalysis` instead.
   MachineFunction *getMachineFunction(const Function &F) const;
 
   /// Delete the MachineFunction \p MF and reset the link in the IR Function to
diff --git a/llvm/include/llvm/CodeGen/MachinePassManager.h b/llvm/include/llvm/CodeGen/MachinePassManager.h
index 4f0b6ba2b1e7..852b1a0f4161 100644
--- a/llvm/include/llvm/CodeGen/MachinePassManager.h
+++ b/llvm/include/llvm/CodeGen/MachinePassManager.h
@@ -108,6 +108,15 @@ bool MachineFunctionAnalysisManagerModuleProxy::Result::invalidate(
     ModuleAnalysisManager::Invalidator &Inv);
 extern template class InnerAnalysisManagerProxy<MachineFunctionAnalysisManager,
                                                 Module>;
+using MachineFunctionAnalysisManagerFunctionProxy =
+    InnerAnalysisManagerProxy<MachineFunctionAnalysisManager, Function>;
+
+template <>
+bool MachineFunctionAnalysisManagerFunctionProxy::Result::invalidate(
+    Function &F, const PreservedAnalyses &PA,
+    FunctionAnalysisManager::Invalidator &Inv);
+extern template class InnerAnalysisManagerProxy<MachineFunctionAnalysisManager,
+                                                Function>;
 
 extern template class OuterAnalysisManagerProxy<ModuleAnalysisManager,
                                                 MachineFunction>;
@@ -129,16 +138,6 @@ public:
       Arg.FAM = nullptr;
     }
 
-    ~Result() {
-      // FAM is cleared in a moved from state where there is nothing to do.
-      if (!FAM)
-        return;
-
-      // Clear out the analysis manager if we're being destroyed -- it means we
-      // didn't even see an invalidate call when we got invalidated.
-      FAM->clear();
-    }
-
     Result &operator=(Result &&RHS) {
       FAM = RHS.FAM;
       // We have to null out the analysis manager in the moved-from state
@@ -187,18 +186,18 @@ private:
   FunctionAnalysisManager *FAM;
 };
 
-class ModuleToMachineFunctionPassAdaptor
-    : public PassInfoMixin<ModuleToMachineFunctionPassAdaptor> {
+class FunctionToMachineFunctionPassAdaptor
+    : public PassInfoMixin<FunctionToMachineFunctionPassAdaptor> {
 public:
   using PassConceptT =
       detail::PassConcept<MachineFunction, MachineFunctionAnalysisManager>;
 
-  explicit ModuleToMachineFunctionPassAdaptor(
+  explicit FunctionToMachineFunctionPassAdaptor(
       std::unique_ptr<PassConceptT> Pass)
       : Pass(std::move(Pass)) {}
 
-  /// Runs the function pass across every function in the module.
-  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+  /// Runs the function pass across every function in the function.
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
   void printPipeline(raw_ostream &OS,
                      function_ref<StringRef(StringRef)> MapClassName2PassName);
 
@@ -209,14 +208,14 @@ private:
 };
 
 template <typename MachineFunctionPassT>
-ModuleToMachineFunctionPassAdaptor
-createModuleToMachineFunctionPassAdaptor(MachineFunctionPassT &&Pass) {
+FunctionToMachineFunctionPassAdaptor
+createFunctionToMachineFunctionPassAdaptor(MachineFunctionPassT &&Pass) {
   using PassModelT = detail::PassModel<MachineFunction, MachineFunctionPassT,
                                        MachineFunctionAnalysisManager>;
   // Do not use make_unique, it causes too many template instantiations,
   // causing terrible compile times.
-  return ModuleToMachineFunctionPassAdaptor(
-      std::unique_ptr<ModuleToMachineFunctionPassAdaptor::PassConceptT>(
+  return FunctionToMachineFunctionPassAdaptor(
+      std::unique_ptr<FunctionToMachineFunctionPassAdaptor::PassConceptT>(
           new PassModelT(std::forward<MachineFunctionPassT>(Pass))));
 }
 
@@ -244,6 +243,10 @@ extern template class PassManager<MachineFunction>;
 /// Convenience typedef for a pass manager over functions.
 using MachineFunctionPassManager = PassManager<MachineFunction>;
 
+/// Returns the minimum set of Analyses that all machine function passes must
+/// preserve.
+PreservedAnalyses getMachineFunctionPassPreservedAnalyses();
+
 } // end namespace llvm
 
 #endif // LLVM_CODEGEN_MACHINEPASSMANAGER_H
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 9cca6b3a571c..b15abf040058 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -807,7 +807,7 @@ public:
   // constructor for empty set
   explicit ResourceSegments(){};
   bool empty() const { return _Intervals.empty(); }
-  explicit ResourceSegments(std::list<IntervalTy> Intervals)
+  explicit ResourceSegments(const std::list<IntervalTy> &Intervals)
       : _Intervals(Intervals) {
     sortAndMerge();
   }
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 4cc7bb9c3b55..c581eb7a60aa 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -716,7 +716,17 @@ inline SpecificInt_match m_SpecificInt(uint64_t V) {
 
 inline SpecificInt_match m_Zero() { return m_SpecificInt(0U); }
 inline SpecificInt_match m_One() { return m_SpecificInt(1U); }
-inline SpecificInt_match m_AllOnes() { return m_SpecificInt(~0U); }
+
+struct AllOnes_match {
+
+  AllOnes_match() = default;
+
+  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+    return isAllOnesOrAllOnesSplat(N);
+  }
+};
+
+inline AllOnes_match m_AllOnes() { return AllOnes_match(); }
 
 /// Match true boolean value based on the information provided by
 /// TargetLowering.
@@ -766,7 +776,7 @@ inline BinaryOpc_match<SpecificInt_match, ValTy> m_Neg(const ValTy &V) {
 
 /// Match a Not as a xor(v, -1) or xor(-1, v)
 template <typename ValTy>
-inline BinaryOpc_match<ValTy, SpecificInt_match, true> m_Not(const ValTy &V) {
+inline BinaryOpc_match<ValTy, AllOnes_match, true> m_Not(const ValTy &V) {
   return m_Xor(V, m_AllOnes());
 }
 
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index f353aef1f446..4b1b58d4af0b 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -2083,8 +2083,9 @@ public:
   /// Return true if the specified operand is an ISD::OR or ISD::XOR node
   /// that can be treated as an ISD::ADD node.
   /// or(x,y) == add(x,y) iff haveNoCommonBitsSet(x,y)
-  /// xor(x,y) == add(x,y) iff isMinSignedConstant(y)
-  bool isADDLike(SDValue Op) const;
+  /// xor(x,y) == add(x,y) iff isMinSignedConstant(y) && !NoWrap
+  /// If \p NoWrap is true, this will not match ISD::XOR.
+  bool isADDLike(SDValue Op, bool NoWrap = false) const;
 
   /// Return true if the specified operand is an ISD::ADD with a ConstantSDNode
   /// on the right-hand side, or if it is an ISD::OR with a ConstantSDNode that
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 261f7e49e5c8..e7c710414545 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -999,6 +999,13 @@ public:
   /// If Flags is not in a defined state then this has no effect.
   void intersectFlagsWith(const SDNodeFlags Flags);
 
+  bool hasPoisonGeneratingFlags() const {
+    SDNodeFlags Flags = getFlags();
+    return Flags.hasNoUnsignedWrap() || Flags.hasNoSignedWrap() ||
+           Flags.hasExact() || Flags.hasDisjoint() || Flags.hasNonNeg() ||
+           Flags.hasNoNaNs() || Flags.hasNoInfs();
+  }
+
   void setCFIType(uint32_t Type) { CFIType = Type; }
   uint32_t getCFIType() const { return CFIType; }
 
@@ -1283,8 +1290,10 @@ private:
   unsigned DestAddrSpace;
 
 public:
-  AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl, EVT VT,
-                      unsigned SrcAS, unsigned DestAS);
+  AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
+                      unsigned SrcAS, unsigned DestAS)
+      : SDNode(ISD::ADDRSPACECAST, Order, dl, VTs), SrcAddrSpace(SrcAS),
+        DestAddrSpace(DestAS) {}
 
   unsigned getSrcAddressSpace() const { return SrcAddrSpace; }
   unsigned getDestAddressSpace() const { return DestAddrSpace; }
@@ -1573,8 +1582,9 @@ class ShuffleVectorSDNode : public SDNode {
 protected:
   friend class SelectionDAG;
 
-  ShuffleVectorSDNode(EVT VT, unsigned Order, const DebugLoc &dl, const int *M)
-      : SDNode(ISD::VECTOR_SHUFFLE, Order, dl, getSDVTList(VT)), Mask(M) {}
+  ShuffleVectorSDNode(SDVTList VTs, unsigned Order, const DebugLoc &dl,
+                      const int *M)
+      : SDNode(ISD::VECTOR_SHUFFLE, Order, dl, VTs), Mask(M) {}
 
 public:
   ArrayRef<int> getMask() const {
@@ -1628,9 +1638,10 @@ class ConstantSDNode : public SDNode {
 
   const ConstantInt *Value;
 
-  ConstantSDNode(bool isTarget, bool isOpaque, const ConstantInt *val, EVT VT)
+  ConstantSDNode(bool isTarget, bool isOpaque, const ConstantInt *val,
+                 SDVTList VTs)
       : SDNode(isTarget ? ISD::TargetConstant : ISD::Constant, 0, DebugLoc(),
-               getSDVTList(VT)),
+               VTs),
         Value(val) {
     ConstantSDNodeBits.IsOpaque = isOpaque;
   }
@@ -1681,9 +1692,9 @@ class ConstantFPSDNode : public SDNode {
 
   const ConstantFP *Value;
 
-  ConstantFPSDNode(bool isTarget, const ConstantFP *val, EVT VT)
+  ConstantFPSDNode(bool isTarget, const ConstantFP *val, SDVTList VTs)
       : SDNode(isTarget ? ISD::TargetConstantFP : ISD::ConstantFP, 0,
-               DebugLoc(), getSDVTList(VT)),
+               DebugLoc(), VTs),
         Value(val) {}
 
 public:
@@ -1816,8 +1827,10 @@ class GlobalAddressSDNode : public SDNode {
   unsigned TargetFlags;
 
   GlobalAddressSDNode(unsigned Opc, unsigned Order, const DebugLoc &DL,
-                      const GlobalValue *GA, EVT VT, int64_t o,
-                      unsigned TF);
+                      const GlobalValue *GA, SDVTList VTs, int64_t o,
+                      unsigned TF)
+      : SDNode(Opc, Order, DL, VTs), TheGlobal(GA), Offset(o), TargetFlags(TF) {
+  }
 
 public:
   const GlobalValue *getGlobal() const { return TheGlobal; }
@@ -1839,10 +1852,10 @@ class FrameIndexSDNode : public SDNode {
 
   int FI;
 
-  FrameIndexSDNode(int fi, EVT VT, bool isTarg)
-    : SDNode(isTarg ? ISD::TargetFrameIndex : ISD::FrameIndex,
-      0, DebugLoc(), getSDVTList(VT)), FI(fi) {
-  }
+  FrameIndexSDNode(int fi, SDVTList VTs, bool isTarg)
+      : SDNode(isTarg ? ISD::TargetFrameIndex : ISD::FrameIndex, 0, DebugLoc(),
+               VTs),
+        FI(fi) {}
 
 public:
   int getIndex() const { return FI; }
@@ -1917,10 +1930,10 @@ class JumpTableSDNode : public SDNode {
   int JTI;
   unsigned TargetFlags;
 
-  JumpTableSDNode(int jti, EVT VT, bool isTarg, unsigned TF)
-    : SDNode(isTarg ? ISD::TargetJumpTable : ISD::JumpTable,
-      0, DebugLoc(), getSDVTList(VT)), JTI(jti), TargetFlags(TF) {
-  }
+  JumpTableSDNode(int jti, SDVTList VTs, bool isTarg, unsigned TF)
+      : SDNode(isTarg ? ISD::TargetJumpTable : ISD::JumpTable, 0, DebugLoc(),
+               VTs),
+        JTI(jti), TargetFlags(TF) {}
 
 public:
   int getIndex() const { return JTI; }
@@ -1943,19 +1956,19 @@ class ConstantPoolSDNode : public SDNode {
   Align Alignment; // Minimum alignment requirement of CP.
   unsigned TargetFlags;
 
-  ConstantPoolSDNode(bool isTarget, const Constant *c, EVT VT, int o,
+  ConstantPoolSDNode(bool isTarget, const Constant *c, SDVTList VTs, int o,
                      Align Alignment, unsigned TF)
       : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0,
-               DebugLoc(), getSDVTList(VT)),
+               DebugLoc(), VTs),
         Offset(o), Alignment(Alignment), TargetFlags(TF) {
     assert(Offset >= 0 && "Offset is too large");
     Val.ConstVal = c;
   }
 
-  ConstantPoolSDNode(bool isTarget, MachineConstantPoolValue *v, EVT VT, int o,
-                     Align Alignment, unsigned TF)
+  ConstantPoolSDNode(bool isTarget, MachineConstantPoolValue *v, SDVTList VTs,
+                     int o, Align Alignment, unsigned TF)
       : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0,
-               DebugLoc(), getSDVTList(VT)),
+               DebugLoc(), VTs),
         Offset(o), Alignment(Alignment), TargetFlags(TF) {
     assert(Offset >= 0 && "Offset is too large");
     Val.MachineCPVal = v;
@@ -2003,9 +2016,9 @@ class TargetIndexSDNode : public SDNode {
   int64_t Offset;
 
 public:
-  TargetIndexSDNode(int Idx, EVT VT, int64_t Ofs, unsigned TF)
-      : SDNode(ISD::TargetIndex, 0, DebugLoc(), getSDVTList(VT)),
-        TargetFlags(TF), Index(Idx), Offset(Ofs) {}
+  TargetIndexSDNode(int Idx, SDVTList VTs, int64_t Ofs, unsigned TF)
+      : SDNode(ISD::TargetIndex, 0, DebugLoc(), VTs), TargetFlags(TF),
+        Index(Idx), Offset(Ofs) {}
 
   unsigned getTargetFlags() const { return TargetFlags; }
   int getIndex() const { return Index; }
@@ -2215,8 +2228,8 @@ class RegisterSDNode : public SDNode {
 
   Register Reg;
 
-  RegisterSDNode(Register reg, EVT VT)
-    : SDNode(ISD::Register, 0, DebugLoc(), getSDVTList(VT)), Reg(reg) {}
+  RegisterSDNode(Register reg, SDVTList VTs)
+      : SDNode(ISD::Register, 0, DebugLoc(), VTs), Reg(reg) {}
 
 public:
   Register getReg() const { return Reg; }
@@ -2251,10 +2264,10 @@ class BlockAddressSDNode : public SDNode {
   int64_t Offset;
   unsigned TargetFlags;
 
-  BlockAddressSDNode(unsigned NodeTy, EVT VT, const BlockAddress *ba,
+  BlockAddressSDNode(unsigned NodeTy, SDVTList VTs, const BlockAddress *ba,
                      int64_t o, unsigned Flags)
-    : SDNode(NodeTy, 0, DebugLoc(), getSDVTList(VT)),
-             BA(ba), Offset(o), TargetFlags(Flags) {}
+      : SDNode(NodeTy, 0, DebugLoc(), VTs), BA(ba), Offset(o),
+        TargetFlags(Flags) {}
 
 public:
   const BlockAddress *getBlockAddress() const { return BA; }
@@ -2292,9 +2305,10 @@ class ExternalSymbolSDNode : public SDNode {
   const char *Symbol;
   unsigned TargetFlags;
 
-  ExternalSymbolSDNode(bool isTarget, const char *Sym, unsigned TF, EVT VT)
+  ExternalSymbolSDNode(bool isTarget, const char *Sym, unsigned TF,
+                       SDVTList VTs)
       : SDNode(isTarget ? ISD::TargetExternalSymbol : ISD::ExternalSymbol, 0,
-               DebugLoc(), getSDVTList(VT)),
+               DebugLoc(), VTs),
         Symbol(Sym), TargetFlags(TF) {}
 
 public:
@@ -2312,8 +2326,8 @@ class MCSymbolSDNode : public SDNode {
 
   MCSymbol *Symbol;
 
-  MCSymbolSDNode(MCSymbol *Symbol, EVT VT)
-      : SDNode(ISD::MCSymbol, 0, DebugLoc(), getSDVTList(VT)), Symbol(Symbol) {}
+  MCSymbolSDNode(MCSymbol *Symbol, SDVTList VTs)
+      : SDNode(ISD::MCSymbol, 0, DebugLoc(), VTs), Symbol(Symbol) {}
 
 public:
   MCSymbol *getMCSymbol() const { return Symbol; }
@@ -3026,8 +3040,8 @@ class AssertAlignSDNode : public SDNode {
   Align Alignment;
 
 public:
-  AssertAlignSDNode(unsigned Order, const DebugLoc &DL, EVT VT, Align A)
-      : SDNode(ISD::AssertAlign, Order, DL, getSDVTList(VT)), Alignment(A) {}
+  AssertAlignSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs, Align A)
+      : SDNode(ISD::AssertAlign, Order, DL, VTs), Alignment(A) {}
 
   Align getAlign() const { return Alignment; }
 
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2dd978c7b584..7ed08cfa8a20 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3146,7 +3146,7 @@ public:
 
   /// Lower a deinterleave intrinsic to a target specific load intrinsic.
   /// Return true on success. Currently only supports
-  /// llvm.experimental.vector.deinterleave2
+  /// llvm.vector.deinterleave2
   ///
   /// \p DI is the deinterleave intrinsic.
   /// \p LI is the accompanying load instruction
@@ -3157,7 +3157,7 @@ public:
 
   /// Lower an interleave intrinsic to a target specific store intrinsic.
   /// Return true on success. Currently only supports
-  /// llvm.experimental.vector.interleave2
+  /// llvm.vector.interleave2
   ///
   /// \p II is the interleave intrinsic.
   /// \p SI is the accompanying store instruction
@@ -5238,6 +5238,9 @@ public:
   /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
   SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
 
+  /// Expand fminimum/fmaximum into multiple comparison with selects.
+  SDValue expandFMINIMUM_FMAXIMUM(SDNode *N, SelectionDAG &DAG) const;
+
   /// Expand FP_TO_[US]INT_SAT into FP_TO_[US]INT and selects or min/max.
   /// \param N Node to expand
   /// \returns The expansion result
@@ -5304,6 +5307,11 @@ public:
   /// \returns The expansion result or SDValue() if it fails.
   SDValue expandVPCTTZ(SDNode *N, SelectionDAG &DAG) const;
 
+  /// Expand VP_CTTZ_ELTS/VP_CTTZ_ELTS_ZERO_UNDEF nodes.
+  /// \param N Node to expand
+  /// \returns The expansion result or SDValue() if it fails.
+  SDValue expandVPCTTZElements(SDNode *N, SelectionDAG &DAG) const;
+
   /// Expand ABS nodes. Expands vector/scalar ABS nodes,
   /// vector nodes can only succeed if all operations are legal/custom.
   /// (ABS x) -> (XOR (ADD x, (SRA x, type_size)), (SRA x, type_size))
diff --git a/llvm/include/llvm/DebugInfo/GSYM/OutputAggregator.h b/llvm/include/llvm/DebugInfo/GSYM/OutputAggregator.h
index 8deea3bff1ed..35ef0a8bc890 100644
--- a/llvm/include/llvm/DebugInfo/GSYM/OutputAggregator.h
+++ b/llvm/include/llvm/DebugInfo/GSYM/OutputAggregator.h
@@ -57,13 +57,12 @@ public:
   }
 
   // For multi-threaded usage, we can collect stuff in another aggregator,
-  // then merge it in here
+  // then merge it in here. Note that this is *not* thread safe. It is up to
+  // the caller to ensure that this is only called from one thread at a time.
   void Merge(const OutputAggregator &other) {
     for (auto &&[name, count] : other.Aggregation) {
-      auto it = Aggregation.find(name);
-      if (it == Aggregation.end())
-        Aggregation.emplace(name, count);
-      else
+      auto [it, inserted] = Aggregation.emplace(name, count);
+      if (!inserted)
         it->second += count;
     }
   }
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
index 34f2e0789462..e452b90598a0 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
@@ -122,7 +122,7 @@ public:
   }
 
   /// Add a pass-config modifier.
-  ObjectLinkingLayer &addPlugin(std::unique_ptr<Plugin> P) {
+  ObjectLinkingLayer &addPlugin(std::shared_ptr<Plugin> P) {
     std::lock_guard<std::mutex> Lock(LayerMutex);
     Plugins.push_back(std::move(P));
     return *this;
@@ -181,11 +181,8 @@ public:
 private:
   using FinalizedAlloc = jitlink::JITLinkMemoryManager::FinalizedAlloc;
 
-  void modifyPassConfig(MaterializationResponsibility &MR,
-                        jitlink::LinkGraph &G,
-                        jitlink::PassConfiguration &PassConfig);
-  void notifyLoaded(MaterializationResponsibility &MR);
-  Error notifyEmitted(MaterializationResponsibility &MR, FinalizedAlloc FA);
+  Error recordFinalizedAlloc(MaterializationResponsibility &MR,
+                             FinalizedAlloc FA);
 
   Error handleRemoveResources(JITDylib &JD, ResourceKey K) override;
   void handleTransferResources(JITDylib &JD, ResourceKey DstKey,
@@ -198,7 +195,7 @@ private:
   bool AutoClaimObjectSymbols = false;
   ReturnObjectBufferFunction ReturnObjectBuffer;
   DenseMap<ResourceKey, std::vector<FinalizedAlloc>> Allocs;
-  std::vector<std::unique_ptr<Plugin>> Plugins;
+  std::vector<std::shared_ptr<Plugin>> Plugins;
 };
 
 class EHFrameRegistrationPlugin : public ObjectLinkingLayer::Plugin {
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index d22d2a8e948b..fe09bb8177c2 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -436,7 +436,6 @@ __OMP_RTL(__tgt_target_kernel, false, Int32, IdentPtr, Int64, Int32, Int32,
           VoidPtr, KernelArgsPtr)
 __OMP_RTL(__tgt_target_kernel_nowait, false, Int32, IdentPtr, Int64, Int32,
           Int32, VoidPtr, KernelArgsPtr, Int32, VoidPtr, Int32, VoidPtr)
-__OMP_RTL(__tgt_register_requires, false, Void, Int64)
 __OMP_RTL(__tgt_target_data_begin_mapper, false, Void, IdentPtr, Int64, Int32, VoidPtrPtr,
           VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr, VoidPtrPtr)
 __OMP_RTL(__tgt_target_data_begin_nowait_mapper, false, Void, IdentPtr, Int64, Int32,
@@ -1025,8 +1024,6 @@ __OMP_RTL_ATTRS(__tgt_target_kernel_nowait, ForkAttrs, SExt,
                 ParamAttrs(AttributeSet(), AttributeSet(), SExt, SExt,
                            AttributeSet(), AttributeSet(), SExt, AttributeSet(),
                            SExt))
-__OMP_RTL_ATTRS(__tgt_register_requires, ForkAttrs, AttributeSet(),
-                ParamAttrs())
 __OMP_RTL_ATTRS(__tgt_target_data_begin_mapper, ForkAttrs, AttributeSet(),
                 ParamAttrs(AttributeSet(), AttributeSet(), SExt))
 __OMP_RTL_ATTRS(__tgt_target_data_begin_nowait_mapper, ForkAttrs, AttributeSet(),
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index 60f41b30e91c..cb514cde95b5 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -46,6 +46,7 @@ typedef unsigned ID;
 
 class AssemblyAnnotationWriter;
 class Constant;
+class ConstantRange;
 struct DenormalMode;
 class DISubprogram;
 enum LibFunc : unsigned;
@@ -462,6 +463,9 @@ public:
   /// attributes for the given arg.
   void addDereferenceableOrNullParamAttr(unsigned ArgNo, uint64_t Bytes);
 
+  /// adds the range attribute to the list of attributes for the return value.
+  void addRangeRetAttr(const ConstantRange &CR);
+
   MaybeAlign getParamAlign(unsigned ArgNo) const {
     return AttributeSets.getParamAlignment(ArgNo);
   }
diff --git a/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h b/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h
index a3ebde709ae6..7525c9eb758b 100644
--- a/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h
+++ b/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h
@@ -76,7 +76,7 @@ void GenericConvergenceVerifier<ContextT>::visit(const InstructionT &I) {
           "Entry intrinsic cannot be preceded by a convergent operation in the "
           "same basic block.",
           {Context.print(&I)});
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   case CONV_ANCHOR:
     Check(!TokenDef,
           "Entry or anchor intrinsic cannot have a convergencectrl token "
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 4f22720f1c55..2e99c9e2ee3e 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -311,7 +311,8 @@ public:
 
   Value *getVariableLocationOp(unsigned OpIdx) const;
 
-  void replaceVariableLocationOp(Value *OldValue, Value *NewValue);
+  void replaceVariableLocationOp(Value *OldValue, Value *NewValue,
+                                 bool AllowEmpty = false);
   void replaceVariableLocationOp(unsigned OpIdx, Value *NewValue);
   /// Adding a new location operand will always result in this intrinsic using
   /// an ArgList, and must always be accompanied by a new expression that uses
@@ -1435,6 +1436,7 @@ protected:
     case Intrinsic::instrprof_cover:
     case Intrinsic::instrprof_increment:
     case Intrinsic::instrprof_increment_step:
+    case Intrinsic::instrprof_callsite:
     case Intrinsic::instrprof_timestamp:
     case Intrinsic::instrprof_value_profile:
       return true;
@@ -1519,6 +1521,21 @@ public:
   }
 };
 
+/// This represents the llvm.instrprof.callsite intrinsic.
+/// It is structurally like the increment or step counters, hence the
+/// inheritance relationship, albeit somewhat tenuous (it's not 'counting' per
+/// se)
+class InstrProfCallsite : public InstrProfCntrInstBase {
+public:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::instrprof_callsite;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+  Value *getCallee() const;
+};
+
 /// This represents the llvm.instrprof.timestamp intrinsic.
 class InstrProfTimestampInst : public InstrProfCntrInstBase {
 public:
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 1d20f7e1b198..28116e5316c9 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -914,6 +914,11 @@ def int_instrprof_increment_step : Intrinsic<[],
                                         [llvm_ptr_ty, llvm_i64_ty,
                                          llvm_i32_ty, llvm_i32_ty, llvm_i64_ty]>;
 
+// Callsite instrumentation for contextual profiling
+def int_instrprof_callsite : Intrinsic<[],
+                                        [llvm_ptr_ty, llvm_i64_ty,
+                                         llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty]>;
+
 // A timestamp for instrumentation based profiling.
 def int_instrprof_timestamp : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty,
                                              llvm_i32_ty, llvm_i32_ty]>;
@@ -2250,6 +2255,12 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<1>>
                                llvm_i1_ty,
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                llvm_i32_ty]>;
+
+  def int_vp_cttz_elts : DefaultAttrsIntrinsic<[ llvm_anyint_ty ],
+                                  [ llvm_anyvector_ty,
+                                    llvm_i1_ty,
+                                    LLVMScalarOrSameVectorWidth<1, llvm_i1_ty>,
+                                    llvm_i32_ty]>;
 }
 
 def int_get_active_lane_mask:
@@ -2572,15 +2583,15 @@ def int_preserve_static_offset : DefaultAttrsIntrinsic<[llvm_ptr_ty],
 
 //===------------ Intrinsics to perform common vector shuffles ------------===//
 
-def int_experimental_vector_reverse : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                                                            [LLVMMatchType<0>],
-                                                            [IntrNoMem]>;
+def int_vector_reverse : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                                               [LLVMMatchType<0>],
+                                               [IntrNoMem]>;
 
-def int_experimental_vector_splice : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                                                           [LLVMMatchType<0>,
-                                                            LLVMMatchType<0>,
-                                                            llvm_i32_ty],
-                                                           [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+def int_vector_splice : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                                              [LLVMMatchType<0>,
+                                               LLVMMatchType<0>,
+                                               llvm_i32_ty],
+                                              [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
 //===---------- Intrinsics to query properties of scalable vectors --------===//
 def int_vscale : DefaultAttrsIntrinsic<[llvm_anyint_ty], [], [IntrNoMem]>;
@@ -2595,15 +2606,15 @@ def int_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                                                [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>;
 
 
-def int_experimental_vector_interleave2   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                                                                  [LLVMHalfElementsVectorType<0>,
-                                                                   LLVMHalfElementsVectorType<0>],
-                                                                  [IntrNoMem]>;
+def int_vector_interleave2   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                                                     [LLVMHalfElementsVectorType<0>,
+                                                      LLVMHalfElementsVectorType<0>],
+                                                     [IntrNoMem]>;
 
-def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType<0>,
-                                                                   LLVMHalfElementsVectorType<0>],
-                                                                  [llvm_anyvector_ty],
-                                                                  [IntrNoMem]>;
+def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType<0>,
+                                                      LLVMHalfElementsVectorType<0>],
+                                                     [llvm_anyvector_ty],
+                                                     [IntrNoMem]>;
 
 //===----------------- Pointer Authentication Intrinsics ------------------===//
 //
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index bcaa37de74b6..e31e00a9c76f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1762,6 +1762,7 @@ def int_aarch64_sve_uqsub_x   : AdvSIMD_2VectorArg_Intrinsic;
 def int_aarch64_sve_orqv      : AdvSIMD_SVE_V128_Reduce_Intrinsic;
 def int_aarch64_sve_eorqv     : AdvSIMD_SVE_V128_Reduce_Intrinsic;
 def int_aarch64_sve_andqv     : AdvSIMD_SVE_V128_Reduce_Intrinsic;
+def int_aarch64_sve_addqv     : AdvSIMD_SVE_V128_Reduce_Intrinsic;
 def int_aarch64_sve_smaxqv    : AdvSIMD_SVE_V128_Reduce_Intrinsic;
 def int_aarch64_sve_umaxqv    : AdvSIMD_SVE_V128_Reduce_Intrinsic;
 def int_aarch64_sve_sminqv    : AdvSIMD_SVE_V128_Reduce_Intrinsic;
@@ -2079,11 +2080,12 @@ def int_aarch64_sve_fmaxv   : AdvSIMD_SVE_Reduce_Intrinsic;
 def int_aarch64_sve_fmaxnmv : AdvSIMD_SVE_Reduce_Intrinsic;
 def int_aarch64_sve_fminv   : AdvSIMD_SVE_Reduce_Intrinsic;
 def int_aarch64_sve_fminnmv : AdvSIMD_SVE_Reduce_Intrinsic;
-def int_aarch64_sve_addqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
+
+def int_aarch64_sve_faddqv   : AdvSIMD_SVE_V128_Reduce_Intrinsic;
 def int_aarch64_sve_fmaxnmqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
 def int_aarch64_sve_fminnmqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
-def int_aarch64_sve_fmaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
-def int_aarch64_sve_fminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
+def int_aarch64_sve_fmaxqv   : AdvSIMD_SVE_V128_Reduce_Intrinsic;
+def int_aarch64_sve_fminqv   : AdvSIMD_SVE_V128_Reduce_Intrinsic;
 
 //
 // Floating-point conversions
@@ -3646,4 +3648,4 @@ def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
 
 def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic;
    
-def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
-\ No newline at end of file
+def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
index 1d58860a0afc..4c4e7351212f 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -661,7 +661,7 @@ let TargetPrefix = "riscv" in {
         : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                     [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty,
                      llvm_anyint_ty],
-                    [IntrNoMem, IntrHasSideEffects]>, RISCVVIntrinsic {
+                    [IntrNoMem]>, RISCVVIntrinsic {
     let ScalarOperand = 2;
     let VLOperand = 3;
   }
@@ -684,7 +684,7 @@ let TargetPrefix = "riscv" in {
                     [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty,
                      LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
                      LLVMMatchType<2>],
-                    [ImmArg<ArgIndex<5>>, IntrNoMem, IntrHasSideEffects]>, RISCVVIntrinsic {
+                    [ImmArg<ArgIndex<5>>, IntrNoMem]>, RISCVVIntrinsic {
     let ScalarOperand = 2;
     let VLOperand = 4;
   }
@@ -708,7 +708,7 @@ let TargetPrefix = "riscv" in {
         : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                     [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty,
                      llvm_anyint_ty, LLVMMatchType<2>],
-                    [ImmArg<ArgIndex<3>>, IntrNoMem, IntrHasSideEffects]>,
+                    [ImmArg<ArgIndex<3>>, IntrNoMem]>,
                     RISCVVIntrinsic {
     let VLOperand = 4;
   }
@@ -721,7 +721,7 @@ let TargetPrefix = "riscv" in {
                     [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty,
                      LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
                      LLVMMatchType<2>, LLVMMatchType<2>],
-                    [ImmArg<ArgIndex<4>>,ImmArg<ArgIndex<6>>, IntrNoMem, IntrHasSideEffects]>,
+                    [ImmArg<ArgIndex<4>>,ImmArg<ArgIndex<6>>, IntrNoMem]>,
                     RISCVVIntrinsic {
     let VLOperand = 5;
   }
@@ -733,7 +733,7 @@ let TargetPrefix = "riscv" in {
         : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                     [LLVMMatchType<0>, llvm_anyvector_ty, llvm_any_ty,
                      llvm_anyint_ty, LLVMMatchType<3>],
-                    [ImmArg<ArgIndex<3>>, IntrNoMem, IntrHasSideEffects]>,
+                    [ImmArg<ArgIndex<3>>, IntrNoMem]>,
                     RISCVVIntrinsic {
     let VLOperand = 4;
   }
@@ -746,8 +746,7 @@ let TargetPrefix = "riscv" in {
                     [LLVMMatchType<0>, llvm_anyvector_ty, llvm_any_ty,
                      LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty,
                      LLVMMatchType<3>, LLVMMatchType<3>],
-                    [ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<6>>, IntrNoMem,
-                     IntrHasSideEffects]>, RISCVVIntrinsic {
+                    [ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<6>>, IntrNoMem]>, RISCVVIntrinsic {
     let VLOperand = 5;
   }
   // Input: (vector_in, vector_in, scalar_in, vl, policy)
diff --git a/llvm/include/llvm/IR/LLVMContext.h b/llvm/include/llvm/IR/LLVMContext.h
index 20a0bf70edec..5d79407ae8ce 100644
--- a/llvm/include/llvm/IR/LLVMContext.h
+++ b/llvm/include/llvm/IR/LLVMContext.h
@@ -156,6 +156,10 @@ public:
   void enableDebugTypeODRUniquing();
   void disableDebugTypeODRUniquing();
 
+  /// generateMachineFunctionNum - Get a unique number for MachineFunction
+  /// that associated with the given Function.
+  unsigned generateMachineFunctionNum(Function &);
+
   /// Defines the type of a yield callback.
   /// \see LLVMContext::setYieldCallback.
   using YieldCallbackTy = void (*)(LLVMContext *Context, void *OpaqueHandle);
@@ -333,7 +337,7 @@ private:
   void addModule(Module*);
 
   /// removeModule - Unregister a module from this context.
-  void removeModule(Module*);
+  void removeModule(Module *);
 };
 
 // Create wrappers for C Binding types (see CBindingWrapping.h).
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 5d137d4b3553..fa2a7b42c9aa 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -1423,7 +1423,7 @@ public:
   // in the way some record are interpreted, like flags for instance.
   // Note that incrementing this may require changes in both BitcodeReader.cpp
   // and BitcodeWriter.cpp.
-  static constexpr uint64_t BitcodeSummaryVersion = 9;
+  static constexpr uint64_t BitcodeSummaryVersion = 10;
 
   // Regular LTO module name for ASM writer
   static constexpr const char *getRegularLTOModuleName() {
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 0b13b4aad9c3..739208e74dcb 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2513,7 +2513,7 @@ inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty m_CopySign(const Opnd0 &Op0,
 
 template <typename Opnd0>
 inline typename m_Intrinsic_Ty<Opnd0>::Ty m_VecReverse(const Opnd0 &Op0) {
-  return m_Intrinsic<Intrinsic::experimental_vector_reverse>(Op0);
+  return m_Intrinsic<Intrinsic::vector_reverse>(Op0);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h
index c0897408986f..88fbad4d6b9d 100644
--- a/llvm/include/llvm/IR/ProfDataUtils.h
+++ b/llvm/include/llvm/IR/ProfDataUtils.h
@@ -65,9 +65,14 @@ bool extractBranchWeights(const MDNode *ProfileData,
                           SmallVectorImpl<uint32_t> &Weights);
 
 /// Faster version of extractBranchWeights() that skips checks and must only
-/// be called with "branch_weights" metadata nodes.
-void extractFromBranchWeightMD(const MDNode *ProfileData,
-                               SmallVectorImpl<uint32_t> &Weights);
+/// be called with "branch_weights" metadata nodes. Supports uint32_t.
+void extractFromBranchWeightMD32(const MDNode *ProfileData,
+                                 SmallVectorImpl<uint32_t> &Weights);
+
+/// Faster version of extractBranchWeights() that skips checks and must only
+/// be called with "branch_weights" metadata nodes. Supports uint64_t.
+void extractFromBranchWeightMD64(const MDNode *ProfileData,
+                                 SmallVectorImpl<uint64_t> &Weights);
 
 /// Extract branch weights attatched to an Instruction
 ///
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index 1c2708a9e854..f1cc8bcae467 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -282,6 +282,15 @@ BEGIN_REGISTER_VP_SDNODE(VP_CTTZ_ZERO_UNDEF, -1, vp_cttz_zero_undef, 1, 2)
 END_REGISTER_VP_SDNODE(VP_CTTZ_ZERO_UNDEF)
 END_REGISTER_VP_INTRINSIC(vp_cttz)
 
+// llvm.vp.cttz.elts(x,is_zero_poison,mask,vl)
+BEGIN_REGISTER_VP_INTRINSIC(vp_cttz_elts, 2, 3)
+VP_PROPERTY_NO_FUNCTIONAL
+BEGIN_REGISTER_VP_SDNODE(VP_CTTZ_ELTS, 0, vp_cttz_elts, 1, 2)
+END_REGISTER_VP_SDNODE(VP_CTTZ_ELTS)
+BEGIN_REGISTER_VP_SDNODE(VP_CTTZ_ELTS_ZERO_UNDEF, 0, vp_cttz_elts_zero_undef, 1, 2)
+END_REGISTER_VP_SDNODE(VP_CTTZ_ELTS_ZERO_UNDEF)
+END_REGISTER_VP_INTRINSIC(vp_cttz_elts)
+
 // llvm.vp.fshl(x,y,z,mask,vlen)
 BEGIN_REGISTER_VP(vp_fshl, 3, 4, VP_FSHL, -1)
 VP_PROPERTY_FUNCTIONAL_INTRINSIC(fshl)
diff --git a/llvm/include/llvm/MC/MCELFStreamer.h b/llvm/include/llvm/MC/MCELFStreamer.h
index 1309b17bff9c..1ff029d44d37 100644
--- a/llvm/include/llvm/MC/MCELFStreamer.h
+++ b/llvm/include/llvm/MC/MCELFStreamer.h
@@ -156,7 +156,7 @@ MCELFStreamer *createARMELFStreamer(MCContext &Context,
                                     std::unique_ptr<MCAsmBackend> TAB,
                                     std::unique_ptr<MCObjectWriter> OW,
                                     std::unique_ptr<MCCodeEmitter> Emitter,
-                                    bool RelaxAll, bool IsThumb, bool IsAndroid);
+                                    bool IsThumb, bool IsAndroid);
 
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/MC/TargetRegistry.h b/llvm/include/llvm/MC/TargetRegistry.h
index 47051447404d..5038b87cd1dc 100644
--- a/llvm/include/llvm/MC/TargetRegistry.h
+++ b/llvm/include/llvm/MC/TargetRegistry.h
@@ -92,39 +92,33 @@ createAsmStreamer(MCContext &Ctx, std::unique_ptr<formatted_raw_ostream> OS,
 MCStreamer *createELFStreamer(MCContext &Ctx,
                               std::unique_ptr<MCAsmBackend> &&TAB,
                               std::unique_ptr<MCObjectWriter> &&OW,
-                              std::unique_ptr<MCCodeEmitter> &&CE,
-                              bool RelaxAll);
+                              std::unique_ptr<MCCodeEmitter> &&CE);
 MCStreamer *createGOFFStreamer(MCContext &Ctx,
                                std::unique_ptr<MCAsmBackend> &&TAB,
                                std::unique_ptr<MCObjectWriter> &&OW,
-                               std::unique_ptr<MCCodeEmitter> &&CE,
-                               bool RelaxAll);
+                               std::unique_ptr<MCCodeEmitter> &&CE);
 MCStreamer *createMachOStreamer(MCContext &Ctx,
                                 std::unique_ptr<MCAsmBackend> &&TAB,
                                 std::unique_ptr<MCObjectWriter> &&OW,
                                 std::unique_ptr<MCCodeEmitter> &&CE,
-                                bool RelaxAll, bool DWARFMustBeAtTheEnd,
+                                bool DWARFMustBeAtTheEnd,
                                 bool LabelSections = false);
 MCStreamer *createWasmStreamer(MCContext &Ctx,
                                std::unique_ptr<MCAsmBackend> &&TAB,
                                std::unique_ptr<MCObjectWriter> &&OW,
-                               std::unique_ptr<MCCodeEmitter> &&CE,
-                               bool RelaxAll);
+                               std::unique_ptr<MCCodeEmitter> &&CE);
 MCStreamer *createXCOFFStreamer(MCContext &Ctx,
                                 std::unique_ptr<MCAsmBackend> &&TAB,
                                 std::unique_ptr<MCObjectWriter> &&OW,
-                                std::unique_ptr<MCCodeEmitter> &&CE,
-                                bool RelaxAll);
+                                std::unique_ptr<MCCodeEmitter> &&CE);
 MCStreamer *createSPIRVStreamer(MCContext &Ctx,
                                 std::unique_ptr<MCAsmBackend> &&TAB,
                                 std::unique_ptr<MCObjectWriter> &&OW,
-                                std::unique_ptr<MCCodeEmitter> &&CE,
-                                bool RelaxAll);
+                                std::unique_ptr<MCCodeEmitter> &&CE);
 MCStreamer *createDXContainerStreamer(MCContext &Ctx,
                                       std::unique_ptr<MCAsmBackend> &&TAB,
                                       std::unique_ptr<MCObjectWriter> &&OW,
-                                      std::unique_ptr<MCCodeEmitter> &&CE,
-                                      bool RelaxAll);
+                                      std::unique_ptr<MCCodeEmitter> &&CE);
 
 MCRelocationInfo *createMCRelocationInfo(const Triple &TT, MCContext &Ctx);
 
@@ -199,42 +193,42 @@ public:
       MCStreamer *(*)(const Triple &T, MCContext &Ctx,
                       std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll);
+                      std::unique_ptr<MCCodeEmitter> &&Emitter);
   using GOFFStreamerCtorTy =
       MCStreamer *(*)(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll);
+                      std::unique_ptr<MCCodeEmitter> &&Emitter);
   using MachOStreamerCtorTy =
       MCStreamer *(*)(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll,
+                      std::unique_ptr<MCCodeEmitter> &&Emitter,
                       bool DWARFMustBeAtTheEnd);
   using COFFStreamerCtorTy =
       MCStreamer *(*)(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll,
+                      std::unique_ptr<MCCodeEmitter> &&Emitter,
                       bool IncrementalLinkerCompatible);
   using WasmStreamerCtorTy =
       MCStreamer *(*)(const Triple &T, MCContext &Ctx,
                       std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll);
+                      std::unique_ptr<MCCodeEmitter> &&Emitter);
   using XCOFFStreamerCtorTy =
       MCStreamer *(*)(const Triple &T, MCContext &Ctx,
                       std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll);
+                      std::unique_ptr<MCCodeEmitter> &&Emitter);
   using SPIRVStreamerCtorTy =
       MCStreamer *(*)(const Triple &T, MCContext &Ctx,
                       std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll);
-  
+                      std::unique_ptr<MCCodeEmitter> &&Emitter);
+
   using DXContainerStreamerCtorTy =
       MCStreamer *(*)(const Triple &T, MCContext &Ctx,
                       std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll);
+                      std::unique_ptr<MCCodeEmitter> &&Emitter);
 
   using NullTargetStreamerCtorTy = MCTargetStreamer *(*)(MCStreamer &S);
   using AsmTargetStreamerCtorTy = MCTargetStreamer *(*)(
@@ -566,7 +560,7 @@ public:
                                      std::unique_ptr<MCAsmBackend> &&TAB,
                                      std::unique_ptr<MCObjectWriter> &&OW,
                                      std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                     const MCSubtargetInfo &STI, bool RelaxAll,
+                                     const MCSubtargetInfo &STI, bool,
                                      bool IncrementalLinkerCompatible,
                                      bool DWARFMustBeAtTheEnd) const {
     MCStreamer *S = nullptr;
@@ -577,66 +571,63 @@ public:
       assert((T.isOSWindows() || T.isUEFI()) &&
              "only Windows and UEFI COFF are supported");
       S = COFFStreamerCtorFn(Ctx, std::move(TAB), std::move(OW),
-                             std::move(Emitter), RelaxAll,
-                             IncrementalLinkerCompatible);
+                             std::move(Emitter), IncrementalLinkerCompatible);
       break;
     case Triple::MachO:
       if (MachOStreamerCtorFn)
         S = MachOStreamerCtorFn(Ctx, std::move(TAB), std::move(OW),
-                                std::move(Emitter), RelaxAll,
-                                DWARFMustBeAtTheEnd);
+                                std::move(Emitter), DWARFMustBeAtTheEnd);
       else
         S = createMachOStreamer(Ctx, std::move(TAB), std::move(OW),
-                                std::move(Emitter), RelaxAll,
-                                DWARFMustBeAtTheEnd);
+                                std::move(Emitter), DWARFMustBeAtTheEnd);
       break;
     case Triple::ELF:
       if (ELFStreamerCtorFn)
         S = ELFStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
-                              std::move(Emitter), RelaxAll);
+                              std::move(Emitter));
       else
         S = createELFStreamer(Ctx, std::move(TAB), std::move(OW),
-                              std::move(Emitter), RelaxAll);
+                              std::move(Emitter));
       break;
     case Triple::Wasm:
       if (WasmStreamerCtorFn)
         S = WasmStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
-                               std::move(Emitter), RelaxAll);
+                               std::move(Emitter));
       else
         S = createWasmStreamer(Ctx, std::move(TAB), std::move(OW),
-                               std::move(Emitter), RelaxAll);
+                               std::move(Emitter));
       break;
     case Triple::GOFF:
       if (GOFFStreamerCtorFn)
         S = GOFFStreamerCtorFn(Ctx, std::move(TAB), std::move(OW),
-                               std::move(Emitter), RelaxAll);
+                               std::move(Emitter));
       else
         S = createGOFFStreamer(Ctx, std::move(TAB), std::move(OW),
-                               std::move(Emitter), RelaxAll);
+                               std::move(Emitter));
       break;
     case Triple::XCOFF:
       if (XCOFFStreamerCtorFn)
         S = XCOFFStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
-                                std::move(Emitter), RelaxAll);
+                                std::move(Emitter));
       else
         S = createXCOFFStreamer(Ctx, std::move(TAB), std::move(OW),
-                                std::move(Emitter), RelaxAll);
+                                std::move(Emitter));
       break;
     case Triple::SPIRV:
       if (SPIRVStreamerCtorFn)
         S = SPIRVStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
-                                std::move(Emitter), RelaxAll);
+                                std::move(Emitter));
       else
         S = createSPIRVStreamer(Ctx, std::move(TAB), std::move(OW),
-                                std::move(Emitter), RelaxAll);
+                                std::move(Emitter));
       break;
     case Triple::DXContainer:
       if (DXContainerStreamerCtorFn)
         S = DXContainerStreamerCtorFn(T, Ctx, std::move(TAB), std::move(OW),
-                              std::move(Emitter), RelaxAll);
+                                      std::move(Emitter));
       else
         S = createDXContainerStreamer(Ctx, std::move(TAB), std::move(OW),
-                                      std::move(Emitter), RelaxAll);
+                                      std::move(Emitter));
       break;
     }
     if (ObjectTargetStreamerCtorFn)
diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h
index 1d457be93741..8cc09e7fd7d5 100644
--- a/llvm/include/llvm/Object/ELFObjectFile.h
+++ b/llvm/include/llvm/Object/ELFObjectFile.h
@@ -199,6 +199,14 @@ public:
   }
 };
 
+inline bool operator<(const ELFSymbolRef &A, const ELFSymbolRef &B) {
+  const DataRefImpl &DRIA = A.getRawDataRefImpl();
+  const DataRefImpl &DRIB = B.getRawDataRefImpl();
+  if (DRIA.d.a == DRIB.d.a)
+    return DRIA.d.b < DRIB.d.b;
+  return DRIA.d.a < DRIB.d.a;
+}
+
 class elf_symbol_iterator : public symbol_iterator {
 public:
   elf_symbol_iterator(const basic_symbol_iterator &B)
@@ -801,9 +809,8 @@ Expected<uint32_t> ELFObjectFile<ELFT>::getSymbolFlags(DataRefImpl Sym) const {
   } else if (EF.getHeader().e_machine == ELF::EM_RISCV) {
     if (Expected<StringRef> NameOrErr = getSymbolName(Sym)) {
       StringRef Name = *NameOrErr;
-      // Mark empty name symbols (used for label differences) and mapping
-      // symbols.
-      if (Name.empty() || Name.starts_with("$d") || Name.starts_with("$x"))
+      // Mark fake labels (used for label differences) and mapping symbols.
+      if (Name == ".L0 " || Name.starts_with("$d") || Name.starts_with("$x"))
         Result |= SymbolRef::SF_FormatSpecific;
     } else {
       // TODO: Actually report errors helpfully.
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 2e94a1950213..17bea5da48ce 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -29,7 +29,6 @@
 #include "llvm/CodeGen/DwarfEHPrepare.h"
 #include "llvm/CodeGen/ExpandMemCmp.h"
 #include "llvm/CodeGen/ExpandReductions.h"
-#include "llvm/CodeGen/FreeMachineFunction.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GlobalMerge.h"
 #include "llvm/CodeGen/IndirectBrExpand.h"
@@ -38,6 +37,8 @@
 #include "llvm/CodeGen/JMCInstrumenter.h"
 #include "llvm/CodeGen/LowerEmuTLS.h"
 #include "llvm/CodeGen/MIRPrinter.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/CodeGen/PreISelIntrinsicLowering.h"
 #include "llvm/CodeGen/ReplaceWithVeclib.h"
@@ -199,8 +200,13 @@ protected:
     AddMachinePass(ModulePassManager &MPM, const DerivedT &PB)
         : MPM(MPM), PB(PB) {}
     ~AddMachinePass() {
-      if (!MFPM.isEmpty())
-        MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
+      if (!MFPM.isEmpty()) {
+        FunctionPassManager FPM;
+        FPM.addPass(
+            createFunctionToMachineFunctionPassAdaptor(std::move(MFPM)));
+        FPM.addPass(InvalidateAnalysisPass<MachineFunctionAnalysis>());
+        MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+      }
     }
 
     template <typename PassT>
@@ -219,8 +225,8 @@ protected:
       } else {
         // Add Module Pass
         if (!MFPM.isEmpty()) {
-          MPM.addPass(
-              createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
+          MPM.addPass(createModuleToFunctionPassAdaptor(
+              createFunctionToMachineFunctionPassAdaptor(std::move(MFPM))));
           MFPM = MachineFunctionPassManager();
         }
 
@@ -512,6 +518,7 @@ Error CodeGenPassBuilder<Derived, TargetMachineT>::buildPipeline(
 
   {
     AddIRPass addIRPass(MPM, derived());
+    addIRPass(RequireAnalysisPass<MachineModuleAnalysis, Module>());
     addIRPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
     addIRPass(RequireAnalysisPass<CollectorMetadataAnalysis, Module>());
     addISelPasses(addIRPass);
@@ -538,7 +545,6 @@ Error CodeGenPassBuilder<Derived, TargetMachineT>::buildPipeline(
   if (PrintMIR)
     addPass(PrintMIRPass(Out), /*Force=*/true);
 
-  addPass(FreeMachineFunctionPass());
   return verifyStartStop(*StartStopInfo);
 }
 
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 2f77ae655d9b..5a14d619ea07 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -124,7 +124,6 @@ MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PI
 #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
 MACHINE_FUNCTION_PASS("dead-mi-elimination", DeadMachineInstructionElimPass())
-// MACHINE_FUNCTION_PASS("free-machine-function", FreeMachineFunctionPass())
 MACHINE_FUNCTION_PASS("no-op-machine-function", NoOpMachineFunctionPass())
 MACHINE_FUNCTION_PASS("print", PrintMIRPass())
 MACHINE_FUNCTION_PASS("require-all-machine-function-properties",
diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h
index b0ae8f364fca..08db8fa6e7ef 100644
--- a/llvm/include/llvm/ProfileData/InstrProfWriter.h
+++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h
@@ -85,11 +85,15 @@ private:
   // The MemProf version we should write.
   memprof::IndexedVersion MemProfVersionRequested;
 
+  // Whether to serialize the full schema.
+  bool MemProfFullSchema;
+
 public:
   InstrProfWriter(
       bool Sparse = false, uint64_t TemporalProfTraceReservoirSize = 0,
       uint64_t MaxTemporalProfTraceLength = 0, bool WritePrevVersion = false,
-      memprof::IndexedVersion MemProfVersionRequested = memprof::Version0);
+      memprof::IndexedVersion MemProfVersionRequested = memprof::Version0,
+      bool MemProfFullSchema = false);
   ~InstrProfWriter();
 
   StringMap<ProfilingData> &getProfileData() { return FunctionData; }
@@ -203,6 +207,7 @@ public:
   void setMemProfVersionRequested(memprof::IndexedVersion Version) {
     MemProfVersionRequested = Version;
   }
+  void setMemProfFullSchema(bool Full) { MemProfFullSchema = Full; }
   // Compute the overlap b/w this object and Other. Program level result is
   // stored in Overlap and function level result is stored in FuncLevelOverlap.
   void overlapRecord(NamedInstrProfRecord &&Other, OverlapStats &Overlap,
diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index 37019bcab544..4274f2a6849b 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -2,6 +2,7 @@
 #define LLVM_PROFILEDATA_MEMPROF_H_
 
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLForwardCompat.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/GlobalValue.h"
@@ -10,6 +11,7 @@
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/raw_ostream.h"
 
+#include <bitset>
 #include <cstdint>
 #include <optional>
 
@@ -44,12 +46,21 @@ enum class Meta : uint64_t {
 
 using MemProfSchema = llvm::SmallVector<Meta, static_cast<int>(Meta::Size)>;
 
+// Returns the full schema currently in use.
+MemProfSchema getFullSchema();
+
+// Returns the schema consisting of the fields used for hot cold memory hinting.
+MemProfSchema getHotColdSchema();
+
 // Holds the actual MemInfoBlock data with all fields. Contents may be read or
 // written partially by providing an appropriate schema to the serialize and
 // deserialize methods.
 struct PortableMemInfoBlock {
   PortableMemInfoBlock() = default;
-  explicit PortableMemInfoBlock(const MemInfoBlock &Block) {
+  explicit PortableMemInfoBlock(const MemInfoBlock &Block,
+                                const MemProfSchema &IncomingSchema) {
+    for (const Meta Id : IncomingSchema)
+      Schema.set(llvm::to_underlying(Id));
 #define MIBEntryDef(NameTag, Name, Type) Name = Block.Name;
 #include "llvm/ProfileData/MIBEntryDef.inc"
 #undef MIBEntryDef
@@ -61,10 +72,12 @@ struct PortableMemInfoBlock {
 
   // Read the contents of \p Ptr based on the \p Schema to populate the
   // MemInfoBlock member.
-  void deserialize(const MemProfSchema &Schema, const unsigned char *Ptr) {
+  void deserialize(const MemProfSchema &IncomingSchema,
+                   const unsigned char *Ptr) {
     using namespace support;
 
-    for (const Meta Id : Schema) {
+    Schema.reset();
+    for (const Meta Id : IncomingSchema) {
       switch (Id) {
 #define MIBEntryDef(NameTag, Name, Type)                                       \
   case Meta::Name: {                                                           \
@@ -76,6 +89,8 @@ struct PortableMemInfoBlock {
         llvm_unreachable("Unknown meta type id, is the profile collected from "
                          "a newer version of the runtime?");
       }
+
+      Schema.set(llvm::to_underlying(Id));
     }
   }
 
@@ -108,26 +123,29 @@ struct PortableMemInfoBlock {
 #undef MIBEntryDef
   }
 
+  // Return the schema, only for unit tests.
+  std::bitset<llvm::to_underlying(Meta::Size)> getSchema() const {
+    return Schema;
+  }
+
   // Define getters for each type which can be called by analyses.
 #define MIBEntryDef(NameTag, Name, Type)                                       \
-  Type get##Name() const { return Name; }
+  Type get##Name() const {                                                     \
+    assert(Schema[llvm::to_underlying(Meta::Name)]);                           \
+    return Name;                                                               \
+  }
 #include "llvm/ProfileData/MIBEntryDef.inc"
 #undef MIBEntryDef
 
   void clear() { *this = PortableMemInfoBlock(); }
 
-  // Returns the full schema currently in use.
-  static MemProfSchema getSchema() {
-    MemProfSchema List;
-#define MIBEntryDef(NameTag, Name, Type) List.push_back(Meta::Name);
-#include "llvm/ProfileData/MIBEntryDef.inc"
-#undef MIBEntryDef
-    return List;
-  }
-
   bool operator==(const PortableMemInfoBlock &Other) const {
+    if (Other.Schema != Schema)
+      return false;
+
 #define MIBEntryDef(NameTag, Name, Type)                                       \
-  if (Other.get##Name() != get##Name())                                        \
+  if (Schema[llvm::to_underlying(Meta::Name)] &&                               \
+      Other.get##Name() != get##Name())                                        \
     return false;
 #include "llvm/ProfileData/MIBEntryDef.inc"
 #undef MIBEntryDef
@@ -158,6 +176,9 @@ struct PortableMemInfoBlock {
   }
 
 private:
+  // The set of available fields, indexed by Meta::Name.
+  std::bitset<llvm::to_underlying(Meta::Size)> Schema;
+
 #define MIBEntryDef(NameTag, Name, Type) Type Name = Type();
 #include "llvm/ProfileData/MIBEntryDef.inc"
 #undef MIBEntryDef
@@ -299,8 +320,9 @@ struct IndexedAllocationInfo {
 
   IndexedAllocationInfo() = default;
   IndexedAllocationInfo(ArrayRef<FrameId> CS, CallStackId CSId,
-                        const MemInfoBlock &MB)
-      : CallStack(CS.begin(), CS.end()), CSId(CSId), Info(MB) {}
+                        const MemInfoBlock &MB,
+                        const MemProfSchema &Schema = getFullSchema())
+      : CallStack(CS.begin(), CS.end()), CSId(CSId), Info(MB, Schema) {}
 
   // Returns the size in bytes when this allocation info struct is serialized.
   size_t serializedSize(const MemProfSchema &Schema,
@@ -740,6 +762,64 @@ public:
 // Compute a CallStackId for a given call stack.
 CallStackId hashCallStack(ArrayRef<FrameId> CS);
 
+namespace detail {
+// "Dereference" the iterator from DenseMap or OnDiskChainedHashTable.  We have
+// to do so in one of two different ways depending on the type of the hash
+// table.
+template <typename value_type, typename IterTy>
+value_type DerefIterator(IterTy Iter) {
+  using deref_type = llvm::remove_cvref_t<decltype(*Iter)>;
+  if constexpr (std::is_same_v<deref_type, value_type>)
+    return *Iter;
+  else
+    return Iter->second;
+}
+} // namespace detail
+
+// A function object that returns a frame for a given FrameId.
+template <typename MapTy> struct FrameIdConverter {
+  std::optional<FrameId> LastUnmappedId;
+  MapTy &Map;
+
+  FrameIdConverter() = delete;
+  FrameIdConverter(MapTy &Map) : Map(Map) {}
+
+  Frame operator()(FrameId Id) {
+    auto Iter = Map.find(Id);
+    if (Iter == Map.end()) {
+      LastUnmappedId = Id;
+      return Frame(0, 0, 0, false);
+    }
+    return detail::DerefIterator<Frame>(Iter);
+  }
+};
+
+// A function object that returns a call stack for a given CallStackId.
+template <typename MapTy> struct CallStackIdConverter {
+  std::optional<CallStackId> LastUnmappedId;
+  MapTy &Map;
+  std::function<Frame(FrameId)> FrameIdToFrame;
+
+  CallStackIdConverter() = delete;
+  CallStackIdConverter(MapTy &Map, std::function<Frame(FrameId)> FrameIdToFrame)
+      : Map(Map), FrameIdToFrame(FrameIdToFrame) {}
+
+  llvm::SmallVector<Frame> operator()(CallStackId CSId) {
+    llvm::SmallVector<Frame> Frames;
+    auto CSIter = Map.find(CSId);
+    if (CSIter == Map.end()) {
+      LastUnmappedId = CSId;
+    } else {
+      llvm::SmallVector<FrameId> CS =
+          detail::DerefIterator<llvm::SmallVector<FrameId>>(CSIter);
+      Frames.reserve(CS.size());
+      for (FrameId Id : CS)
+        Frames.push_back(FrameIdToFrame(Id));
+    }
+    return Frames;
+  }
+};
+
 // Verify that each CallStackId is computed with hashCallStack.  This function
 // is intended to help transition from CallStack to CSId in
 // IndexedAllocationInfo.
diff --git a/llvm/include/llvm/ProfileData/MemProfReader.h b/llvm/include/llvm/ProfileData/MemProfReader.h
index 444c58e8bdc8..b42e4f597774 100644
--- a/llvm/include/llvm/ProfileData/MemProfReader.h
+++ b/llvm/include/llvm/ProfileData/MemProfReader.h
@@ -76,20 +76,16 @@ public:
       Callback =
           std::bind(&MemProfReader::idToFrame, this, std::placeholders::_1);
 
-    auto CallStackCallback = [&](CallStackId CSId) {
-      llvm::SmallVector<Frame> CallStack;
-      auto Iter = CSIdToCallStack.find(CSId);
-      assert(Iter != CSIdToCallStack.end());
-      for (FrameId Id : Iter->second)
-        CallStack.push_back(Callback(Id));
-      return CallStack;
-    };
+    memprof::CallStackIdConverter<decltype(CSIdToCallStack)> CSIdConv(
+        CSIdToCallStack, Callback);
 
     const IndexedMemProfRecord &IndexedRecord = Iter->second;
     GuidRecord = {
         Iter->first,
-        IndexedRecord.toMemProfRecord(CallStackCallback),
+        IndexedRecord.toMemProfRecord(CSIdConv),
     };
+    if (CSIdConv.LastUnmappedId)
+      return make_error<InstrProfError>(instrprof_error::hash_mismatch);
     Iter++;
     return Error::success();
   }
diff --git a/llvm/include/llvm/Support/Allocator.h b/llvm/include/llvm/Support/Allocator.h
index c1e5c6d2853b..eb6c4d366855 100644
--- a/llvm/include/llvm/Support/Allocator.h
+++ b/llvm/include/llvm/Support/Allocator.h
@@ -159,9 +159,9 @@ public:
 #endif
 
     // Check if we have enough space.
-    if (Adjustment + SizeToAllocate <= size_t(End - CurPtr)
-        // We can't return nullptr even for a zero-sized allocation!
-        && CurPtr != nullptr) {
+    if (LLVM_LIKELY(Adjustment + SizeToAllocate <= size_t(End - CurPtr)
+                    // We can't return nullptr even for a zero-sized allocation!
+                    && CurPtr != nullptr)) {
       char *AlignedPtr = CurPtr + Adjustment;
       CurPtr = AlignedPtr + SizeToAllocate;
       // Update the allocation point of this memory block in MemorySanitizer.
@@ -173,6 +173,11 @@ public:
       return AlignedPtr;
     }
 
+    return AllocateSlow(Size, SizeToAllocate, Alignment);
+  }
+
+  LLVM_ATTRIBUTE_RETURNS_NONNULL LLVM_ATTRIBUTE_NOINLINE void *
+  AllocateSlow(size_t Size, size_t SizeToAllocate, Align Alignment) {
     // If Size is really big, allocate a separate slab for it.
     size_t PaddedSize = SizeToAllocate + Alignment.value() - 1;
     if (PaddedSize > SizeThreshold) {
diff --git a/llvm/include/llvm/Support/RISCVISAUtils.h b/llvm/include/llvm/Support/RISCVISAUtils.h
index 94aedb75faa2..77f8c3e45f1a 100644
--- a/llvm/include/llvm/Support/RISCVISAUtils.h
+++ b/llvm/include/llvm/Support/RISCVISAUtils.h
@@ -14,6 +14,7 @@
 #define LLVM_SUPPORT_RISCVISAUTILS_H
 
 #include "llvm/ADT/StringRef.h"
+#include <map>
 #include <string>
 
 namespace llvm {
@@ -35,6 +36,12 @@ struct ExtensionComparator {
     return compareExtension(LHS, RHS);
   }
 };
+
+/// OrderedExtensionMap is std::map, it's specialized to keep entries
+/// in canonical order of extension.
+typedef std::map<std::string, ExtensionVersion, ExtensionComparator>
+    OrderedExtensionMap;
+
 } // namespace RISCVISAUtils
 
 } // namespace llvm
diff --git a/llvm/include/llvm/Support/RWMutex.h b/llvm/include/llvm/Support/RWMutex.h
index 32987c3b98f1..8d221aaab9ab 100644
--- a/llvm/include/llvm/Support/RWMutex.h
+++ b/llvm/include/llvm/Support/RWMutex.h
@@ -63,6 +63,10 @@ public:
   /// Unconditionally release the lock in reader mode.
   bool unlock_shared();
 
+  /// Attempts to acquire the lock in reader mode. Returns immediately.
+  /// @returns true on successful lock acquisition, false otherwise.
+  bool try_lock_shared();
+
   /// Attempts to unconditionally acquire the lock in reader mode. If the
   /// lock is held by any readers, this method will wait until it can
   /// acquire the lock.
@@ -75,6 +79,10 @@ public:
   /// Unconditionally release the lock in write mode.
   bool unlock();
 
+  /// Attempts to acquire the lock in writer mode. Returns immediately.
+  /// @returns true on successful lock acquisition, false otherwise.
+  bool try_lock();
+
   //@}
   /// @name Platform Dependent Data
   /// @{
@@ -123,6 +131,8 @@ public:
     return true;
   }
 
+  bool try_lock_shared() { return impl.try_lock_shared(); }
+
   bool lock() {
     if (!mt_only || llvm_is_multithreaded()) {
       impl.lock();
@@ -148,6 +158,8 @@ public:
     --writers;
     return true;
   }
+
+  bool try_lock() { return impl.try_lock(); }
 };
 
 typedef SmartRWMutex<false> RWMutex;
diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h
index 68dbe1ea3062..c6779e258be7 100644
--- a/llvm/include/llvm/Support/TypeSize.h
+++ b/llvm/include/llvm/Support/TypeSize.h
@@ -181,6 +181,18 @@ public:
     return getKnownMinValue() % RHS == 0;
   }
 
+  /// Returns whether or not the callee is known to be a multiple of RHS.
+  constexpr bool isKnownMultipleOf(const FixedOrScalableQuantity &RHS) const {
+    // x % y == 0 => x % y == 0
+    // x % y == 0 => (vscale * x) % y == 0
+    // x % y == 0 => (vscale * x) % (vscale * y) == 0
+    // but
+    // x % y == 0 !=> x % (vscale * y) == 0
+    if (!isScalable() && RHS.isScalable())
+      return false;
+    return getKnownMinValue() % RHS.getKnownMinValue() == 0;
+  }
+
   // Return the minimum value with the assumption that the count is exact.
   // Use in places where a scalable count doesn't make sense (e.g. non-vector
   // types, or vectors in backends which don't support scalable vectors).
diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h
index 3b1f4bad57fc..33aeb039320d 100644
--- a/llvm/include/llvm/Support/YAMLTraits.h
+++ b/llvm/include/llvm/Support/YAMLTraits.h
@@ -671,7 +671,11 @@ inline bool isBool(StringRef S) {
 // (except for TAB #x9, LF #xA, and CR #xD which are allowed), DEL #x7F, the C1
 // control block #x80-#x9F (except for NEL #x85 which is allowed), the surrogate
 // block #xD800-#xDFFF, #xFFFE, and #xFFFF.
-inline QuotingType needsQuotes(StringRef S) {
+//
+// Some strings are valid YAML values even unquoted, but without quotes are
+// interpreted as non-string type, for instance null, boolean or numeric values.
+// If ForcePreserveAsString is set, such strings are quoted.
+inline QuotingType needsQuotes(StringRef S, bool ForcePreserveAsString = true) {
   if (S.empty())
     return QuotingType::Single;
 
@@ -679,12 +683,14 @@ inline QuotingType needsQuotes(StringRef S) {
   if (isSpace(static_cast<unsigned char>(S.front())) ||
       isSpace(static_cast<unsigned char>(S.back())))
     MaxQuotingNeeded = QuotingType::Single;
-  if (isNull(S))
-    MaxQuotingNeeded = QuotingType::Single;
-  if (isBool(S))
-    MaxQuotingNeeded = QuotingType::Single;
-  if (isNumeric(S))
-    MaxQuotingNeeded = QuotingType::Single;
+  if (ForcePreserveAsString) {
+    if (isNull(S))
+      MaxQuotingNeeded = QuotingType::Single;
+    if (isBool(S))
+      MaxQuotingNeeded = QuotingType::Single;
+    if (isNumeric(S))
+      MaxQuotingNeeded = QuotingType::Single;
+  }
 
   // 7.3.3 Plain Style
   // Plain scalars must not begin with most indicators, as this would cause
@@ -1636,6 +1642,7 @@ public:
 
 private:
   void output(StringRef s);
+  void output(StringRef, QuotingType);
   void outputUpToEndOfLine(StringRef s);
   void newLineCheck(bool EmptySequence = false);
   void outputNewLine();
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 31b903e63d99..dbbb3abaa830 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1525,11 +1525,39 @@ def combine_shuffle_concat : GICombineRule<
         [{ return Helper.matchCombineShuffleConcat(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyCombineShuffleConcat(*${root}, ${matchinfo}); }])>;
 
-// match_extract_of_element must be the first!
+def insert_vector_element_idx_undef : GICombineRule<
+   (defs root:$root, build_fn_matchinfo:$matchinfo),
+   (match (G_IMPLICIT_DEF $idx),
+          (G_INSERT_VECTOR_ELT $root, $src, $elt, $idx)),
+   (apply (G_IMPLICIT_DEF $root))>;
+
+def insert_vector_element_elt_undef : GICombineRule<
+   (defs root:$root, build_fn_matchinfo:$matchinfo),
+   (match (G_IMPLICIT_DEF $elt),
+          (G_INSERT_VECTOR_ELT $root, $src, $elt, $idx),
+          [{ return isGuaranteedNotToBePoison(${src}.getReg(), MRI); }]),
+   (apply (GIReplaceReg $root, $src))>;
+
+def insert_vector_element_extract_vector_element : GICombineRule<
+   (defs root:$root, build_fn_matchinfo:$matchinfo),
+   (match (G_EXTRACT_VECTOR_ELT $elt, $src, $idx),
+          (G_INSERT_VECTOR_ELT $root, $src, $elt, $idx)),
+   (apply (GIReplaceReg $root, $src))>;
+
+def insert_vector_elt_oob : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_INSERT_VECTOR_ELT):$root,
+         [{ return Helper.matchInsertVectorElementOOB(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
+// match_extract_of_element and insert_vector_elt_oob must be the first!
 def vector_ops_combines: GICombineGroup<[
 match_extract_of_element_undef_vector,
 match_extract_of_element_undef_index,
+insert_vector_element_idx_undef,
+insert_vector_element_elt_undef,
 match_extract_of_element,
+insert_vector_elt_oob,
 extract_vector_element_not_const,
 extract_vector_element_different_indices,
 extract_vector_element_build_vector2,
@@ -1553,7 +1581,8 @@ extract_vector_element_build_vector_trunc5,
 extract_vector_element_build_vector_trunc6,
 extract_vector_element_build_vector_trunc7,
 extract_vector_element_build_vector_trunc8,
-extract_vector_element_freeze
+extract_vector_element_freeze,
+insert_vector_element_extract_vector_element
 ]>;
 
 // FIXME: These should use the custom predicate feature once it lands.
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index ea3520835fa0..1684b424e3b4 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1979,6 +1979,7 @@ class Pattern<dag patternToMatch, list<dag> resultInstrs> {
   list<dag>       ResultInstrs    = resultInstrs;
   list<Predicate> Predicates      = [];  // See class Instruction in Target.td.
   int             AddedComplexity = 0;   // See class Instruction in Target.td.
+  bit           GISelShouldIgnore = 0;
 }
 
 // Pat - A simple (but common) form of a pattern, which produces a simple result
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 805b963a7a13..04fbaf07adfb 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -103,90 +103,31 @@ enum CPUFeatures {
 static_assert(FEAT_MAX < 62,
               "Number of features in CPUFeatures are limited to 62 entries");
 
-// Arch extension modifiers for CPUs. These are labelled with their Arm ARM
-// feature name (though the canonical reference for those is AArch64.td)
-// clang-format off
+// Each ArchExtKind correponds directly to a possible -target-feature.
 enum ArchExtKind : unsigned {
-  AEK_NONE =          1,
-  AEK_CRC =           2,  // FEAT_CRC32
-  AEK_CRYPTO =        3,
-  AEK_FP =            4,  // FEAT_FP
-  AEK_SIMD =          5,  // FEAT_AdvSIMD
-  AEK_FP16 =          6,  // FEAT_FP16
-  AEK_PROFILE =       7,  // FEAT_SPE
-  AEK_RAS =           8,  // FEAT_RAS, FEAT_RASv1p1
-  AEK_LSE =           9,  // FEAT_LSE
-  AEK_SVE =           10, // FEAT_SVE
-  AEK_DOTPROD =       11, // FEAT_DotProd
-  AEK_RCPC =          12, // FEAT_LRCPC
-  AEK_RDM =           13, // FEAT_RDM
-  AEK_SM4 =           14, // FEAT_SM4, FEAT_SM3
-  AEK_SHA3 =          15, // FEAT_SHA3, FEAT_SHA512
-  AEK_SHA2 =          16, // FEAT_SHA1, FEAT_SHA256
-  AEK_AES =           17, // FEAT_AES, FEAT_PMULL
-  AEK_FP16FML =       18, // FEAT_FHM
-  AEK_RAND =          19, // FEAT_RNG
-  AEK_MTE =           20, // FEAT_MTE, FEAT_MTE2
-  AEK_SSBS =          21, // FEAT_SSBS, FEAT_SSBS2
-  AEK_SB =            22, // FEAT_SB
-  AEK_PREDRES =       23, // FEAT_SPECRES
-  AEK_SVE2 =          24, // FEAT_SVE2
-  AEK_SVE2AES =       25, // FEAT_SVE_AES, FEAT_SVE_PMULL128
-  AEK_SVE2SM4 =       26, // FEAT_SVE_SM4
-  AEK_SVE2SHA3 =      27, // FEAT_SVE_SHA3
-  AEK_SVE2BITPERM =   28, // FEAT_SVE_BitPerm
-  AEK_TME =           29, // FEAT_TME
-  AEK_BF16 =          30, // FEAT_BF16
-  AEK_I8MM =          31, // FEAT_I8MM
-  AEK_F32MM =         32, // FEAT_F32MM
-  AEK_F64MM =         33, // FEAT_F64MM
-  AEK_LS64 =          34, // FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA
-  AEK_BRBE =          35, // FEAT_BRBE
-  AEK_PAUTH =         36, // FEAT_PAuth
-  AEK_FLAGM =         37, // FEAT_FlagM
-  AEK_SME =           38, // FEAT_SME
-  AEK_SMEF64F64 =     39, // FEAT_SME_F64F64
-  AEK_SMEI16I64 =     40, // FEAT_SME_I16I64
-  AEK_HBC =           41, // FEAT_HBC
-  AEK_MOPS =          42, // FEAT_MOPS
-  AEK_PERFMON =       43, // FEAT_PMUv3
-  AEK_SME2 =          44, // FEAT_SME2
-  AEK_SVE2p1 =        45, // FEAT_SVE2p1
-  AEK_SME2p1 =        46, // FEAT_SME2p1
-  AEK_B16B16 =        47, // FEAT_B16B16
-  AEK_SMEF16F16 =     48, // FEAT_SMEF16F16
-  AEK_CSSC =          49, // FEAT_CSSC
-  AEK_RCPC3 =         50, // FEAT_LRCPC3
-  AEK_THE =           51, // FEAT_THE
-  AEK_D128 =          52, // FEAT_D128
-  AEK_LSE128 =        53, // FEAT_LSE128
-  AEK_SPECRES2 =      54, // FEAT_SPECRES2
-  AEK_RASv2 =         55, // FEAT_RASv2
-  AEK_ITE =           56, // FEAT_ITE
-  AEK_GCS =           57, // FEAT_GCS
-  AEK_FPMR =          58, // FEAT_FPMR
-  AEK_FP8 =           59, // FEAT_FP8
-  AEK_FAMINMAX =      60, // FEAT_FAMINMAX
-  AEK_FP8FMA =        61, // FEAT_FP8FMA
-  AEK_SSVE_FP8FMA =   62, // FEAT_SSVE_FP8FMA
-  AEK_FP8DOT2 =       63, // FEAT_FP8DOT2
-  AEK_SSVE_FP8DOT2 =  64, // FEAT_SSVE_FP8DOT2
-  AEK_FP8DOT4 =       65, // FEAT_FP8DOT4
-  AEK_SSVE_FP8DOT4 =  66, // FEAT_SSVE_FP8DOT4
-  AEK_LUT =           67, // FEAT_LUT
-  AEK_SME_LUTv2 =     68, // FEAT_SME_LUTv2
-  AEK_SMEF8F16 =      69, // FEAT_SME_F8F16
-  AEK_SMEF8F32 =      70, // FEAT_SME_F8F32
-  AEK_SMEFA64 =       71, // FEAT_SME_FA64
-  AEK_CPA =           72, // FEAT_CPA
-  AEK_PAUTHLR =       73, // FEAT_PAuth_LR
-  AEK_TLBIW =         74, // FEAT_TLBIW
-  AEK_JSCVT =         75, // FEAT_JSCVT
-  AEK_FCMA =          76, // FEAT_FCMA
-  AEK_NUM_EXTENSIONS
+  AEK_NONE = 1,
+#define ARM_EXTENSION(NAME, ENUM) ENUM,
+#include "llvm/TargetParser/AArch64TargetParserDef.inc"
+  AEK_NUM_EXTENSIONS,
+
+  // FIXME temporary fixes for inconsistent naming.
+  AEK_F32MM = AEK_MATMULFP32,
+  AEK_F64MM = AEK_MATMULFP64,
+  AEK_FCMA = AEK_COMPLXNUM,
+  AEK_FP = AEK_FPARMV8,
+  AEK_FP16 = AEK_FULLFP16,
+  AEK_I8MM = AEK_MATMULINT8,
+  AEK_JSCVT = AEK_JS,
+  AEK_PROFILE = AEK_SPE,
+  AEK_RASv2 = AEK_RASV2,
+  AEK_RAND = AEK_RANDGEN,
+  AEK_SIMD = AEK_NEON,
+  AEK_SME2p1 = AEK_SME2P1,
+  AEK_SVE2p1 = AEK_SVE2P1,
+  AEK_SME_LUTv2 = AEK_SME_LUTV2,
+
 };
 using ExtensionBitset = Bitset<AEK_NUM_EXTENSIONS>;
-// clang-format on
 
 // Represents an extension that can be enabled with -march=<arch>+<extension>.
 // Typically these correspond to Arm Architecture extensions, unlike
@@ -268,7 +209,7 @@ inline constexpr ExtensionInfo Extensions[] = {
     {"sha3", AArch64::AEK_SHA3, "+sha3", "-sha3", FEAT_SHA3, "+sha3,+sha2,+fp-armv8,+neon", 140},
     {"simd", AArch64::AEK_SIMD, "+neon", "-neon", FEAT_SIMD, "+fp-armv8,+neon", 100},
     {"sm4", AArch64::AEK_SM4, "+sm4", "-sm4", FEAT_SM4, "+sm4,+fp-armv8,+neon", 106},
-    {"sme-f16f16", AArch64::AEK_SMEF16F16, "+sme-f16f16", "-sme-f16f16", FEAT_INIT, "", 0},
+    {"sme-f16f16", AArch64::AEK_SMEF16F16, "+sme-f16f16", "-sme-f16f16", FEAT_INIT, "+sme2,+sme-f16f16", 0},
     {"sme-f64f64", AArch64::AEK_SMEF64F64, "+sme-f64f64", "-sme-f64f64", FEAT_SME_F64, "+sme,+sme-f64f64,+bf16", 560},
     {"sme-i16i64", AArch64::AEK_SMEI16I64, "+sme-i16i64", "-sme-i16i64", FEAT_SME_I64, "+sme,+sme-i16i64,+bf16", 570},
     {"sme", AArch64::AEK_SME, "+sme", "-sme", FEAT_SME, "+sme,+bf16", 430},
@@ -302,7 +243,7 @@ inline constexpr ExtensionInfo Extensions[] = {
     {"ssve-fp8dot4", AArch64::AEK_SSVE_FP8DOT4, "+ssve-fp8dot4", "-ssve-fp8dot4", FEAT_INIT, "+sme2", 0},
     {"lut", AArch64::AEK_LUT, "+lut", "-lut", FEAT_INIT, "", 0},
     {"sme-lutv2", AArch64::AEK_SME_LUTv2, "+sme-lutv2", "-sme-lutv2", FEAT_INIT, "", 0},
-    {"sme-f8f16", AArch64::AEK_SMEF8F16, "+sme-f8f16", "-sme-f8f16", FEAT_INIT, "+sme2,+fp8", 0},
+    {"sme-f8f16", AArch64::AEK_SMEF8F16, "+sme-f8f16", "-sme-f8f16", FEAT_INIT, "+fp8,+sme2", 0},
     {"sme-f8f32", AArch64::AEK_SMEF8F32, "+sme-f8f32", "-sme-f8f32", FEAT_INIT, "+sme2,+fp8", 0},
     {"sme-fa64",  AArch64::AEK_SMEFA64,  "+sme-fa64", "-sme-fa64",  FEAT_INIT, "", 0},
     {"cpa", AArch64::AEK_CPA, "+cpa", "-cpa", FEAT_INIT, "", 0},
@@ -632,7 +573,12 @@ inline constexpr CpuInfo CpuInfos[] = {
                                AArch64::AEK_PAUTH, AArch64::AEK_SVE2BITPERM,
                                AArch64::AEK_FLAGM, AArch64::AEK_PERFMON,
                                AArch64::AEK_PREDRES, AArch64::AEK_PROFILE})},
-    {"cortex-r82", ARMV8R, AArch64::ExtensionBitset({AArch64::AEK_LSE})},
+    {"cortex-r82", ARMV8R,
+     AArch64::ExtensionBitset({AArch64::AEK_LSE, AArch64::AEK_FLAGM,
+                               AArch64::AEK_PERFMON, AArch64::AEK_PREDRES})},
+    {"cortex-r82ae", ARMV8R,
+     AArch64::ExtensionBitset({AArch64::AEK_LSE, AArch64::AEK_FLAGM,
+                               AArch64::AEK_PERFMON, AArch64::AEK_PREDRES})},
     {"cortex-x1", ARMV8_2A,
      AArch64::ExtensionBitset({AArch64::AEK_AES, AArch64::AEK_SHA2,
                                AArch64::AEK_FP16, AArch64::AEK_DOTPROD,
@@ -677,6 +623,13 @@ inline constexpr CpuInfo CpuInfos[] = {
           AArch64::AEK_FP16FML, AArch64::AEK_I8MM, AArch64::AEK_MTE,
           AArch64::AEK_SB, AArch64::AEK_SSBS, AArch64::AEK_SVE,
           AArch64::AEK_SVE2, AArch64::AEK_SVE2BITPERM})},
+    {"neoverse-n3", ARMV9_2A,
+     AArch64::ExtensionBitset({AArch64::AEK_MTE, AArch64::AEK_SSBS,
+                               AArch64::AEK_SB, AArch64::AEK_PREDRES,
+                               AArch64::AEK_FP16FML, AArch64::AEK_PAUTH,
+                               AArch64::AEK_FLAGM, AArch64::AEK_PERFMON,
+                               AArch64::AEK_RAND, AArch64::AEK_SVE2BITPERM,
+                               AArch64::AEK_PROFILE, AArch64::AEK_PERFMON})},
     {"neoverse-512tvb", ARMV8_4A,
      AArch64::ExtensionBitset(
          {AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_SHA3,
@@ -697,6 +650,20 @@ inline constexpr CpuInfo CpuInfos[] = {
           AArch64::AEK_FP16, AArch64::AEK_BF16, AArch64::AEK_RAND,
           AArch64::AEK_DOTPROD, AArch64::AEK_PROFILE, AArch64::AEK_SVE2BITPERM,
           AArch64::AEK_FP16FML, AArch64::AEK_I8MM, AArch64::AEK_MTE})},
+    {"neoverse-v3", ARMV9_2A,
+     AArch64::ExtensionBitset(
+         {AArch64::AEK_PROFILE, AArch64::AEK_MTE, AArch64::AEK_SSBS,
+          AArch64::AEK_SB, AArch64::AEK_PREDRES, AArch64::AEK_LS64,
+          AArch64::AEK_BRBE, AArch64::AEK_PAUTH, AArch64::AEK_FLAGM,
+          AArch64::AEK_PERFMON, AArch64::AEK_RAND, AArch64::AEK_SVE2BITPERM,
+          AArch64::AEK_FP16FML})},
+    {"neoverse-v3ae", ARMV9_2A,
+     (AArch64::ExtensionBitset(
+         {AArch64::AEK_PROFILE, AArch64::AEK_MTE, AArch64::AEK_SSBS,
+          AArch64::AEK_SB, AArch64::AEK_PREDRES, AArch64::AEK_LS64,
+          AArch64::AEK_BRBE, AArch64::AEK_PAUTH, AArch64::AEK_FLAGM,
+          AArch64::AEK_PERFMON, AArch64::AEK_RAND, AArch64::AEK_SVE2BITPERM,
+          AArch64::AEK_FP16FML}))},
     {"cyclone", ARMV8A,
      AArch64::ExtensionBitset(
          {AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_NONE})},
diff --git a/llvm/include/llvm/TargetParser/RISCVISAInfo.h b/llvm/include/llvm/TargetParser/RISCVISAInfo.h
index 83c4f1e620fc..36617a9b6259 100644
--- a/llvm/include/llvm/TargetParser/RISCVISAInfo.h
+++ b/llvm/include/llvm/TargetParser/RISCVISAInfo.h
@@ -26,13 +26,7 @@ public:
   RISCVISAInfo(const RISCVISAInfo &) = delete;
   RISCVISAInfo &operator=(const RISCVISAInfo &) = delete;
 
-  /// OrderedExtensionMap is std::map, it's specialized to keep entries
-  /// in canonical order of extension.
-  typedef std::map<std::string, RISCVISAUtils::ExtensionVersion,
-                   RISCVISAUtils::ExtensionComparator>
-      OrderedExtensionMap;
-
-  RISCVISAInfo(unsigned XLen, OrderedExtensionMap &Exts)
+  RISCVISAInfo(unsigned XLen, RISCVISAUtils::OrderedExtensionMap &Exts)
       : XLen(XLen), FLen(0), MinVLen(0), MaxELen(0), MaxELenFp(0), Exts(Exts) {}
 
   /// Parse RISC-V ISA info from arch string.
@@ -59,7 +53,9 @@ public:
   std::vector<std::string> toFeatures(bool AddAllExtensions = false,
                                       bool IgnoreUnknown = true) const;
 
-  const OrderedExtensionMap &getExtensions() const { return Exts; }
+  const RISCVISAUtils::OrderedExtensionMap &getExtensions() const {
+    return Exts;
+  }
 
   unsigned getXLen() const { return XLen; }
   unsigned getFLen() const { return FLen; }
@@ -82,15 +78,14 @@ public:
   static std::string getTargetFeatureForExtension(StringRef Ext);
 
 private:
-  RISCVISAInfo(unsigned XLen)
-      : XLen(XLen), FLen(0), MinVLen(0), MaxELen(0), MaxELenFp(0) {}
+  RISCVISAInfo(unsigned XLen) : XLen(XLen) {}
 
   unsigned XLen;
-  unsigned FLen;
-  unsigned MinVLen;
-  unsigned MaxELen, MaxELenFp;
+  unsigned FLen = 0;
+  unsigned MinVLen = 0;
+  unsigned MaxELen = 0, MaxELenFp = 0;
 
-  OrderedExtensionMap Exts;
+  RISCVISAUtils::OrderedExtensionMap Exts;
 
   void addExtension(StringRef ExtName, RISCVISAUtils::ExtensionVersion Version);
 
@@ -98,9 +93,9 @@ private:
 
   void updateImplication();
   void updateCombination();
-  void updateFLen();
-  void updateMinVLen();
-  void updateMaxELen();
+
+  /// Update FLen, MinVLen, MaxELen, and MaxELenFp.
+  void updateImpliedLengths();
 };
 
 } // namespace llvm
diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
index ea1f4fc3b85d..855d1aeddfae 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
+++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
@@ -461,9 +461,10 @@ public:
 
   OverflowResult computeOverflowForUnsignedMul(const Value *LHS,
                                                const Value *RHS,
-                                               const Instruction *CxtI) const {
-    return llvm::computeOverflowForUnsignedMul(LHS, RHS,
-                                               SQ.getWithInstruction(CxtI));
+                                               const Instruction *CxtI,
+                                               bool IsNSW = false) const {
+    return llvm::computeOverflowForUnsignedMul(
+        LHS, RHS, SQ.getWithInstruction(CxtI), IsNSW);
   }
 
   OverflowResult computeOverflowForSignedMul(const Value *LHS, const Value *RHS,
diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h
new file mode 100644
index 000000000000..38afa0c6fd32
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h
@@ -0,0 +1,24 @@
+//===-- PGOCtxProfLowering.h - Contextual PGO Instr. Lowering ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PGOCtxProfLoweringPass class.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_PGOCTXPROFLOWERING_H
+#define LLVM_TRANSFORMS_INSTRUMENTATION_PGOCTXPROFLOWERING_H
+
+namespace llvm {
+class Type;
+
+class PGOCtxProfLoweringPass {
+public:
+  explicit PGOCtxProfLoweringPass() = default;
+  static bool isContextualIRPGOEnabled();
+};
+} // namespace llvm
+#endif
diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index 4ba9b74ccb00..debe2ee79917 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -232,13 +232,67 @@ private:
 
   /// A mapping from value numbers to lists of Value*'s that
   /// have that value number.  Use findLeader to query it.
-  struct LeaderTableEntry {
-    Value *Val;
-    const BasicBlock *BB;
-    LeaderTableEntry *Next;
+  class LeaderMap {
+  public:
+    struct LeaderTableEntry {
+      Value *Val;
+      const BasicBlock *BB;
+    };
+
+  private:
+    struct LeaderListNode {
+      LeaderTableEntry Entry;
+      LeaderListNode *Next;
+    };
+    DenseMap<uint32_t, LeaderListNode> NumToLeaders;
+    BumpPtrAllocator TableAllocator;
+
+  public:
+    class leader_iterator {
+      const LeaderListNode *Current;
+
+    public:
+      using iterator_category = std::forward_iterator_tag;
+      using value_type = const LeaderTableEntry;
+      using difference_type = std::ptrdiff_t;
+      using pointer = value_type *;
+      using reference = value_type &;
+
+      leader_iterator(const LeaderListNode *C) : Current(C) {}
+      leader_iterator &operator++() {
+        assert(Current && "Dereferenced end of leader list!");
+        Current = Current->Next;
+        return *this;
+      }
+      bool operator==(const leader_iterator &Other) const {
+        return Current == Other.Current;
+      }
+      bool operator!=(const leader_iterator &Other) const {
+        return Current != Other.Current;
+      }
+      reference operator*() const { return Current->Entry; }
+    };
+
+    iterator_range<leader_iterator> getLeaders(uint32_t N) {
+      auto I = NumToLeaders.find(N);
+      if (I == NumToLeaders.end()) {
+        return iterator_range(leader_iterator(nullptr),
+                              leader_iterator(nullptr));
+      }
+
+      return iterator_range(leader_iterator(&I->second),
+                            leader_iterator(nullptr));
+    }
+
+    void insert(uint32_t N, Value *V, const BasicBlock *BB);
+    void erase(uint32_t N, Instruction *I, const BasicBlock *BB);
+    void verifyRemoved(const Value *Inst) const;
+    void clear() {
+      NumToLeaders.clear();
+      TableAllocator.Reset();
+    }
   };
-  DenseMap<uint32_t, LeaderTableEntry> LeaderTable;
-  BumpPtrAllocator TableAllocator;
+  LeaderMap LeaderTable;
 
   // Block-local map of equivalent values to their leader, does not
   // propagate to any successors. Entries added mid-block are applied
@@ -264,51 +318,6 @@ private:
                MemoryDependenceResults *RunMD, LoopInfo &LI,
                OptimizationRemarkEmitter *ORE, MemorySSA *MSSA = nullptr);
 
-  /// Push a new Value to the LeaderTable onto the list for its value number.
-  void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) {
-    LeaderTableEntry &Curr = LeaderTable[N];
-    if (!Curr.Val) {
-      Curr.Val = V;
-      Curr.BB = BB;
-      return;
-    }
-
-    LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>();
-    Node->Val = V;
-    Node->BB = BB;
-    Node->Next = Curr.Next;
-    Curr.Next = Node;
-  }
-
-  /// Scan the list of values corresponding to a given
-  /// value number, and remove the given instruction if encountered.
-  void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) {
-    LeaderTableEntry *Prev = nullptr;
-    LeaderTableEntry *Curr = &LeaderTable[N];
-
-    while (Curr && (Curr->Val != I || Curr->BB != BB)) {
-      Prev = Curr;
-      Curr = Curr->Next;
-    }
-
-    if (!Curr)
-      return;
-
-    if (Prev) {
-      Prev->Next = Curr->Next;
-    } else {
-      if (!Curr->Next) {
-        Curr->Val = nullptr;
-        Curr->BB = nullptr;
-      } else {
-        LeaderTableEntry *Next = Curr->Next;
-        Curr->Val = Next->Val;
-        Curr->BB = Next->BB;
-        Curr->Next = Next->Next;
-      }
-    }
-  }
-
   // List of critical edges to be split between iterations.
   SmallVector<std::pair<Instruction *, unsigned>, 4> toSplit;
 
diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
index 3364d7eaee42..f7358ac9b1ee 100644
--- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
+++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
@@ -22,6 +22,7 @@
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 #include <optional>
 #include <utility>
 
@@ -114,11 +115,10 @@ public:
   bool processBlock(BasicBlock *BB);
   bool maybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB);
   void updateSSA(BasicBlock *BB, BasicBlock *NewBB,
-                 DenseMap<Instruction *, Value *> &ValueMapping);
-  DenseMap<Instruction *, Value *> cloneInstructions(BasicBlock::iterator BI,
-                                                     BasicBlock::iterator BE,
-                                                     BasicBlock *NewBB,
-                                                     BasicBlock *PredBB);
+                 ValueToValueMapTy &ValueMapping);
+  void cloneInstructions(ValueToValueMapTy &ValueMapping,
+                         BasicBlock::iterator BI, BasicBlock::iterator BE,
+                         BasicBlock *NewBB, BasicBlock *PredBB);
   bool tryThreadEdge(BasicBlock *BB,
                      const SmallVectorImpl<BasicBlock *> &PredBBs,
                      BasicBlock *SuccBB);
diff --git a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
index 9ebb95007774..429d6a2e0523 100644
--- a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -62,6 +62,13 @@ namespace llvm {
                      LibFunc TheLibFunc, AttributeList AttributeList,
                      FunctionType *Invalid, ArgsTy... Args) = delete;
 
+  // Handle -mregparm for the given function.
+  // Note that this function is a rough approximation that only works for simple
+  // function signatures; it does not apply other relevant attributes for
+  // function signatures, including sign/zero-extension for arguments and return
+  // values.
+  void markRegisterParameterAttributes(Function *F);
+
   /// Check whether the library function is available on target and also that
   /// it in the current Module is a Function with the right type.
   bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI,
diff --git a/llvm/include/llvm/Transforms/Utils/GlobalStatus.h b/llvm/include/llvm/Transforms/Utils/GlobalStatus.h
index 60c91fc30174..c001e587313c 100644
--- a/llvm/include/llvm/Transforms/Utils/GlobalStatus.h
+++ b/llvm/include/llvm/Transforms/Utils/GlobalStatus.h
@@ -24,9 +24,9 @@ class Value;
 ///
 bool isSafeToDestroyConstant(const Constant *C);
 
-/// As we analyze each global, keep track of some information about it.  If we
-/// find out that the address of the global is taken, none of this info will be
-/// accurate.
+/// As we analyze each global or thread-local variable, keep track of some
+/// information about it.  If we find out that the address of the global is
+/// taken, none of this info will be accurate.
 struct GlobalStatus {
   /// True if the global's address is used in a comparison.
   bool IsCompared = false;
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index e2143b5bfbe2..6937ec8dfd21 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -18,6 +18,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 #include <cstdint>
 
 namespace llvm {
@@ -490,6 +491,10 @@ void hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt,
 DIExpression *getExpressionForConstant(DIBuilder &DIB, const Constant &C,
                                        Type &Ty);
 
+/// Remap the operands of the debug records attached to \p Inst, and the
+/// operands of \p Inst itself if it's a debug intrinsic.
+void remapDebugVariable(ValueToValueMapTy &Mapping, Instruction *Inst);
+
 //===----------------------------------------------------------------------===//
 //  Intrinsic pattern matching
 //
diff --git a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
index fb3ab33a0629..16589a605e60 100644
--- a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
+++ b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
@@ -86,6 +86,8 @@ Value *getFP(IRBuilder<> &IRB);
 Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB);
 Value *getAndroidSlotPtr(IRBuilder<> &IRB, int Slot);
 
+void annotateDebugRecords(AllocaInfo &Info, unsigned int Tag);
+
 } // namespace memtag
 } // namespace llvm
 
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 326006fbb880..4f99d171469e 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -153,10 +153,15 @@ private:
   /// a vectorization chain.
   bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
 
-  bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,
-                           unsigned Idx, unsigned MinVF);
-
-  bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);
+  std::optional<bool> vectorizeStoreChain(ArrayRef<Value *> Chain,
+                                          slpvectorizer::BoUpSLP &R,
+                                          unsigned Idx, unsigned MinVF,
+                                          unsigned &Size);
+
+  bool vectorizeStores(
+      ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R,
+      DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
+          &Visited);
 
   /// The store instructions in a basic block organized by base pointer.
   StoreListMap Stores;
diff --git a/llvm/include/module.install.modulemap b/llvm/include/module.install.modulemap
index f7302830f561..b917cddc7803 100644
--- a/llvm/include/module.install.modulemap
+++ b/llvm/include/module.install.modulemap
@@ -31,5 +31,7 @@ module LLVM_Extern_Utils_DataTypes {
 }
 
 module LLVM_Extern_TargetParser_Gen {
+  textual header "llvm/TargetParser/ARMTargetParserDef.inc"
+  textual header "llvm/TargetParser/AArch64TargetParserDef.inc"
   textual header "llvm/TargetParser/RISCVTargetParserDef.inc"
 }
diff --git a/llvm/include/module.modulemap b/llvm/include/module.modulemap
index e60e03a282ac..b00da6d7cd28 100644
--- a/llvm/include/module.modulemap
+++ b/llvm/include/module.modulemap
@@ -345,6 +345,11 @@ extern module LLVM_Extern_Utils_DataTypes "module.extern.modulemap"
 // Build the module with the tablegen-generated files needed by the
 // TargetParser module before building the TargetParser module itself.
 module TargetParserGen {
+  module AArch64TargetParserDef {
+    header "llvm/TargetParser/AArch64TargetParser.h"
+    extern module LLVM_Extern_TargetParser_Gen "module.extern.modulemap"
+    export *
+  }
   module RISCVTargetParserDef {
     header "llvm/TargetParser/RISCVTargetParser.h"
     extern module LLVM_Extern_TargetParser_Gen "module.extern.modulemap"
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index c06984c0d494..4061dae83c10 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -6281,11 +6281,11 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0,
                m_Intrinsic<Intrinsic::pow>(m_SpecificFP(10.0), m_Value(X)))))
       return X;
     break;
-  case Intrinsic::experimental_vector_reverse:
-    // experimental.vector.reverse(experimental.vector.reverse(x)) -> x
+  case Intrinsic::vector_reverse:
+    // vector.reverse(vector.reverse(x)) -> x
     if (match(Op0, m_VecReverse(m_Value(X))))
       return X;
-    // experimental.vector.reverse(splat(X)) -> splat(X)
+    // vector.reverse(splat(X)) -> splat(X)
     if (isSplatValue(Op0))
       return Op0;
     break;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index b1ba8e7c0f60..b0d29e2409f7 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1805,20 +1805,20 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
 }
 
 /// Given a dependence-distance \p Dist between two
-/// memory accesses, that have the same stride whose absolute value is given
-/// in \p Stride, and that have the same type size \p TypeByteSize,
-/// in a loop whose takenCount is \p BackedgeTakenCount, check if it is
-/// possible to prove statically that the dependence distance is larger
-/// than the range that the accesses will travel through the execution of
-/// the loop. If so, return true; false otherwise. This is useful for
-/// example in loops such as the following (PR31098):
+/// memory accesses, that have strides in the same direction whose absolute
+/// value of the maximum stride is given in \p MaxStride, and that have the same
+/// type size \p TypeByteSize, in a loop whose takenCount is \p
+/// BackedgeTakenCount, check if it is possible to prove statically that the
+/// dependence distance is larger than the range that the accesses will travel
+/// through the execution of the loop. If so, return true; false otherwise. This
+/// is useful for example in loops such as the following (PR31098):
 ///     for (i = 0; i < D; ++i) {
 ///                = out[i];
 ///       out[i+D] =
 ///     }
 static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
                                      const SCEV &BackedgeTakenCount,
-                                     const SCEV &Dist, uint64_t Stride,
+                                     const SCEV &Dist, uint64_t MaxStride,
                                      uint64_t TypeByteSize) {
 
   // If we can prove that
@@ -1838,7 +1838,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
   // will be executed only if LoopCount >= VF, proving distance >= LoopCount
   // also guarantees that distance >= VF.
   //
-  const uint64_t ByteStride = Stride * TypeByteSize;
+  const uint64_t ByteStride = MaxStride * TypeByteSize;
   const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
   const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
 
@@ -1920,20 +1920,21 @@ isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects,
 namespace {
 struct DepDistanceStrideAndSizeInfo {
   const SCEV *Dist;
-  uint64_t Stride;
+  uint64_t StrideA;
+  uint64_t StrideB;
   uint64_t TypeByteSize;
   bool AIsWrite;
   bool BIsWrite;
 
-  DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t Stride,
-                               uint64_t TypeByteSize, bool AIsWrite,
-                               bool BIsWrite)
-      : Dist(Dist), Stride(Stride), TypeByteSize(TypeByteSize),
-        AIsWrite(AIsWrite), BIsWrite(BIsWrite) {}
+  DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t StrideA,
+                               uint64_t StrideB, uint64_t TypeByteSize,
+                               bool AIsWrite, bool BIsWrite)
+      : Dist(Dist), StrideA(StrideA), StrideB(StrideB),
+        TypeByteSize(TypeByteSize), AIsWrite(AIsWrite), BIsWrite(BIsWrite) {}
 };
 } // namespace
 
-// Get the dependence distance, stride, type size and whether it is a write for
+// Get the dependence distance, strides, type size and whether it is a write for
 // the dependence between A and B. Returns a DepType, if we can prove there's
 // no dependence or the analysis fails. Outlined to lambda to limit he scope
 // of various temporary variables, like A/BPtr, StrideA/BPtr and others.
@@ -1995,10 +1996,11 @@ getDependenceDistanceStrideAndSize(
                                    InnermostLoop))
     return MemoryDepChecker::Dependence::IndirectUnsafe;
 
-  // Need accesses with constant stride. We don't want to vectorize
-  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap
-  // in the address space.
-  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
+  // Need accesses with constant strides and the same direction. We don't want
+  // to vectorize "A[B[i]] += ..." and similar code or pointer arithmetic that
+  // could wrap in the address space.
+  if (!StrideAPtr || !StrideBPtr || (StrideAPtr > 0 && StrideBPtr < 0) ||
+      (StrideAPtr < 0 && StrideBPtr > 0)) {
     LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
     return MemoryDepChecker::Dependence::Unknown;
   }
@@ -2008,9 +2010,9 @@ getDependenceDistanceStrideAndSize(
       DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
   if (!HasSameSize)
     TypeByteSize = 0;
-  uint64_t Stride = std::abs(StrideAPtr);
-  return DepDistanceStrideAndSizeInfo(Dist, Stride, TypeByteSize, AIsWrite,
-                                      BIsWrite);
+  return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtr),
+                                      std::abs(StrideBPtr), TypeByteSize,
+                                      AIsWrite, BIsWrite);
 }
 
 MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
@@ -2028,41 +2030,64 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
   if (std::holds_alternative<Dependence::DepType>(Res))
     return std::get<Dependence::DepType>(Res);
 
-  const auto &[Dist, Stride, TypeByteSize, AIsWrite, BIsWrite] =
+  const auto &[Dist, StrideA, StrideB, TypeByteSize, AIsWrite, BIsWrite] =
       std::get<DepDistanceStrideAndSizeInfo>(Res);
   bool HasSameSize = TypeByteSize > 0;
 
+  std::optional<uint64_t> CommonStride =
+      StrideA == StrideB ? std::make_optional(StrideA) : std::nullopt;
+  if (isa<SCEVCouldNotCompute>(Dist)) {
+    // TODO: Relax requirement that there is a common stride to retry with
+    // non-constant distance dependencies.
+    FoundNonConstantDistanceDependence |= !!CommonStride;
+    LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n");
+    return Dependence::Unknown;
+  }
+
   ScalarEvolution &SE = *PSE.getSE();
   auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
-  // If the distance between the acecsses is larger than their absolute stride
-  // multiplied by the backedge taken count, the accesses are independet, i.e.
-  // they are far enough appart that accesses won't access the same location
-  // across all loop ierations.
-  if (!isa<SCEVCouldNotCompute>(Dist) && HasSameSize &&
+  uint64_t MaxStride = std::max(StrideA, StrideB);
+
+  // If the distance between the acecsses is larger than their maximum absolute
+  // stride multiplied by the backedge taken count, the accesses are independet,
+  // i.e. they are far enough appart that accesses won't access the same
+  // location across all loop ierations.
+  if (HasSameSize &&
       isSafeDependenceDistance(DL, SE, *(PSE.getBackedgeTakenCount()), *Dist,
-                               Stride, TypeByteSize))
+                               MaxStride, TypeByteSize))
     return Dependence::NoDep;
 
   const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
-  if (!C) {
-    LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
-    FoundNonConstantDistanceDependence = true;
-    return Dependence::Unknown;
-  }
 
-  const APInt &Val = C->getAPInt();
-  int64_t Distance = Val.getSExtValue();
-
-  // If the distance between accesses and their strides are known constants,
-  // check whether the accesses interlace each other.
-  if (std::abs(Distance) > 0 && Stride > 1 && HasSameSize &&
-      areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) {
-    LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
-    return Dependence::NoDep;
+  // Attempt to prove strided accesses independent.
+  if (C) {
+    const APInt &Val = C->getAPInt();
+    int64_t Distance = Val.getSExtValue();
+
+    // If the distance between accesses and their strides are known constants,
+    // check whether the accesses interlace each other.
+    if (std::abs(Distance) > 0 && CommonStride && *CommonStride > 1 &&
+        HasSameSize &&
+        areStridedAccessesIndependent(std::abs(Distance), *CommonStride,
+                                      TypeByteSize)) {
+      LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
+      return Dependence::NoDep;
+    }
   }
 
   // Negative distances are not plausible dependencies.
-  if (Val.isNegative()) {
+  if (SE.isKnownNonPositive(Dist)) {
+    if (SE.isKnownNonNegative(Dist)) {
+      if (HasSameSize) {
+        // Write to the same location with the same size.
+        return Dependence::Forward;
+      } else {
+        LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
+                             "different type sizes\n");
+        return Dependence::Unknown;
+      }
+    }
+
     bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
     // Check if the first access writes to a location that is read in a later
     // iteration, where the distance between them is not a multiple of a vector
@@ -2071,27 +2096,40 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
     // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to
     // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a
     // forward dependency will allow vectorization using any width.
-    if (IsTrueDataDependence && EnableForwardingConflictDetection &&
-        (!HasSameSize || couldPreventStoreLoadForward(Val.abs().getZExtValue(),
-                                                      TypeByteSize))) {
-      LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
-      return Dependence::ForwardButPreventsForwarding;
+
+    if (IsTrueDataDependence && EnableForwardingConflictDetection) {
+      if (!C) {
+        // TODO: FoundNonConstantDistanceDependence is used as a necessary
+        // condition to consider retrying with runtime checks. Historically, we
+        // did not set it when strides were different but there is no inherent
+        // reason to.
+        FoundNonConstantDistanceDependence |= CommonStride.has_value();
+        return Dependence::Unknown;
+      }
+      if (!HasSameSize ||
+          couldPreventStoreLoadForward(C->getAPInt().abs().getZExtValue(),
+                                       TypeByteSize)) {
+        LLVM_DEBUG(
+            dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
+        return Dependence::ForwardButPreventsForwarding;
+      }
     }
 
     LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
     return Dependence::Forward;
   }
 
-  // Write to the same location with the same size.
-  if (Val == 0) {
-    if (HasSameSize)
-      return Dependence::Forward;
-    LLVM_DEBUG(
-        dbgs() << "LAA: Zero dependence difference but different type sizes\n");
+  if (!C) {
+    // TODO: FoundNonConstantDistanceDependence is used as a necessary condition
+    // to consider retrying with runtime checks. Historically, we did not set it
+    // when strides were different but there is no inherent reason to.
+    FoundNonConstantDistanceDependence |= CommonStride.has_value();
+    LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
     return Dependence::Unknown;
   }
 
-  assert(Val.isStrictlyPositive() && "Expect a positive value");
+  if (!SE.isKnownPositive(Dist))
+    return Dependence::Unknown;
 
   if (!HasSameSize) {
     LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
@@ -2099,6 +2137,14 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
     return Dependence::Unknown;
   }
 
+  // The logic below currently only supports StrideA ==  StrideB, i.e. there's a
+  // common stride.
+  if (!CommonStride)
+    return Dependence::Unknown;
+
+  const APInt &Val = C->getAPInt();
+  int64_t Distance = Val.getSExtValue();
+
   // Bail out early if passed-in parameters make vectorization not feasible.
   unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
                            VectorizerParams::VectorizationFactor : 1);
@@ -2134,7 +2180,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
   // the minimum distance needed is 28, which is greater than distance. It is
   // not safe to do vectorization.
   uint64_t MinDistanceNeeded =
-      TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
+      TypeByteSize * (*CommonStride) * (MinNumIter - 1) + TypeByteSize;
   if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
     LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
                       << Distance << '\n');
@@ -2183,7 +2229,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
 
   // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
   // since there is a backwards dependency.
-  uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * Stride);
+  uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * (*CommonStride));
   LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
                     << " with max VF = " << MaxVF << '\n');
   uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index de38eddaa98f..fed2061aae3a 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -6686,9 +6686,15 @@ llvm::computeConstantRangeIncludingKnownBits(const WithCache<const Value *> &V,
 
 OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS,
                                                    const Value *RHS,
-                                                   const SimplifyQuery &SQ) {
+                                                   const SimplifyQuery &SQ,
+                                                   bool IsNSW) {
   KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, SQ);
   KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, SQ);
+
+  // mul nsw of two non-negative numbers is also nuw.
+  if (IsNSW && LHSKnown.isNonNegative() && RHSKnown.isNonNegative())
+    return OverflowResult::NeverOverflows;
+
   ConstantRange LHSRange = ConstantRange::fromKnownBits(LHSKnown, false);
   ConstantRange RHSRange = ConstantRange::fromKnownBits(RHSKnown, false);
   return mapOverflowResult(LHSRange.unsignedMulMayOverflow(RHSRange));
@@ -7283,31 +7289,35 @@ static bool isGuaranteedNotToBeUndefOrPoison(
   // BB1:
   //   CtxI ; V cannot be undef or poison here
   auto *Dominator = DNode->getIDom();
-  while (Dominator) {
-    auto *TI = Dominator->getBlock()->getTerminator();
-
-    Value *Cond = nullptr;
-    if (auto BI = dyn_cast_or_null<BranchInst>(TI)) {
-      if (BI->isConditional())
-        Cond = BI->getCondition();
-    } else if (auto SI = dyn_cast_or_null<SwitchInst>(TI)) {
-      Cond = SI->getCondition();
-    }
+  // This check is purely for compile time reasons: we can skip the IDom walk
+  // if what we are checking for includes undef and the value is not an integer.
+  if (!includesUndef(Kind) || V->getType()->isIntegerTy())
+    while (Dominator) {
+      auto *TI = Dominator->getBlock()->getTerminator();
+
+      Value *Cond = nullptr;
+      if (auto BI = dyn_cast_or_null<BranchInst>(TI)) {
+        if (BI->isConditional())
+          Cond = BI->getCondition();
+      } else if (auto SI = dyn_cast_or_null<SwitchInst>(TI)) {
+        Cond = SI->getCondition();
+      }
 
-    if (Cond) {
-      if (Cond == V)
-        return true;
-      else if (!includesUndef(Kind) && isa<Operator>(Cond)) {
-        // For poison, we can analyze further
-        auto *Opr = cast<Operator>(Cond);
-        if (any_of(Opr->operands(),
-                   [V](const Use &U) { return V == U && propagatesPoison(U); }))
+      if (Cond) {
+        if (Cond == V)
           return true;
+        else if (!includesUndef(Kind) && isa<Operator>(Cond)) {
+          // For poison, we can analyze further
+          auto *Opr = cast<Operator>(Cond);
+          if (any_of(Opr->operands(), [V](const Use &U) {
+                return V == U && propagatesPoison(U);
+              }))
+            return true;
+        }
       }
-    }
 
-    Dominator = Dominator->getIDom();
-  }
+      Dominator = Dominator->getIDom();
+    }
 
   if (getKnowledgeValidInContext(V, {Attribute::NoUndef}, CtxI, DT, AC))
     return true;
diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp
index e4e5b5dd8c0e..732426617268 100644
--- a/llvm/lib/BinaryFormat/Dwarf.cpp
+++ b/llvm/lib/BinaryFormat/Dwarf.cpp
@@ -411,6 +411,16 @@ llvm::dwarf::LanguageLowerBound(dwarf::SourceLanguage Lang) {
   }
 }
 
+StringRef llvm::dwarf::LanguageDescription(dwarf::SourceLanguageName lname) {
+  switch (lname) {
+#define HANDLE_DW_LNAME(ID, NAME, DESC, LOWER_BOUND)                           \
+  case DW_LNAME_##NAME:                                                        \
+    return DESC;
+#include "llvm/BinaryFormat/Dwarf.def"
+  }
+  return "Unknown";
+}
+
 StringRef llvm::dwarf::CaseString(unsigned Case) {
   switch (Case) {
   case DW_ID_case_sensitive:
diff --git a/llvm/lib/BinaryFormat/ELF.cpp b/llvm/lib/BinaryFormat/ELF.cpp
index 8c10ed1a980b..9878f5769087 100644
--- a/llvm/lib/BinaryFormat/ELF.cpp
+++ b/llvm/lib/BinaryFormat/ELF.cpp
@@ -568,12 +568,11 @@ StringRef ELF::convertEMachineToArchName(uint16_t EMachine) {
   }
 }
 
-uint8_t ELF::convertOSToOSAbi(StringRef OS) {
-  std::string LowerOS = OS.lower();
-  return StringSwitch<uint16_t>(LowerOS)
+uint8_t ELF::convertNameToOSABI(StringRef Name) {
+  return StringSwitch<uint16_t>(Name)
       .StartsWith("hpux", ELFOSABI_HPUX)
       .StartsWith("netbsd", ELFOSABI_NETBSD)
-      .StartsWith("linux", ELFOSABI_LINUX)
+      .StartsWith("gnu", ELFOSABI_GNU)
       .StartsWith("hurd", ELFOSABI_HURD)
       .StartsWith("solaris", ELFOSABI_SOLARIS)
       .StartsWith("aix", ELFOSABI_AIX)
@@ -597,14 +596,14 @@ uint8_t ELF::convertOSToOSAbi(StringRef OS) {
       .Default(ELFOSABI_NONE);
 }
 
-StringRef ELF::convertOSAbiToOS(uint8_t OSAbi) {
-  switch (OSAbi) {
+StringRef ELF::convertOSABIToName(uint8_t OSABI) {
+  switch (OSABI) {
   case ELFOSABI_HPUX:
     return "hpux";
   case ELFOSABI_NETBSD:
     return "netbsd";
-  case ELFOSABI_LINUX:
-    return "linux";
+  case ELFOSABI_GNU:
+    return "gnu";
   case ELFOSABI_HURD:
     return "hurd";
   case ELFOSABI_SOLARIS:
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 0b7fcd884188..a0779f955cf2 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -7513,9 +7513,14 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       TheIndex.setFlags(Record[0]);
       break;
     }
-    case bitc::FS_VALUE_GUID: { // [valueid, refguid]
+    case bitc::FS_VALUE_GUID: { // [valueid, refguid_upper32, refguid_lower32]
       uint64_t ValueID = Record[0];
-      GlobalValue::GUID RefGUID = Record[1];
+      GlobalValue::GUID RefGUID;
+      if (Version >= 10) {
+        RefGUID = Record[1] << 32 | Record[2];
+      } else {
+        RefGUID = Record[1];
+      }
       ValueIdToValueInfoMap[ValueID] = std::make_tuple(
           TheIndex.getOrInsertValueInfo(RefGUID), RefGUID, RefGUID);
       break;
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 6d01e3b4d821..1aaf160e91ca 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -4299,9 +4299,20 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
     return;
   }
 
+  auto Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::FS_VALUE_GUID));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+  // GUIDS often use up most of 64-bits, so encode as two Fixed 32.
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
+  unsigned ValueGuidAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
   for (const auto &GVI : valueIds()) {
     Stream.EmitRecord(bitc::FS_VALUE_GUID,
-                      ArrayRef<uint64_t>{GVI.second, GVI.first});
+                      ArrayRef<uint32_t>{GVI.second,
+                                         static_cast<uint32_t>(GVI.first >> 32),
+                                         static_cast<uint32_t>(GVI.first)},
+                      ValueGuidAbbrev);
   }
 
   if (!Index->stackIds().empty()) {
@@ -4315,7 +4326,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
   }
 
   // Abbrev for FS_PERMODULE_PROFILE.
-  auto Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // flags
@@ -4467,9 +4478,20 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   // Write the index flags.
   Stream.EmitRecord(bitc::FS_FLAGS, ArrayRef<uint64_t>{Index.getFlags()});
 
+  auto Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::FS_VALUE_GUID));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+  // GUIDS often use up most of 64-bits, so encode as two Fixed 32.
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
+  unsigned ValueGuidAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
   for (const auto &GVI : valueIds()) {
     Stream.EmitRecord(bitc::FS_VALUE_GUID,
-                      ArrayRef<uint64_t>{GVI.second, GVI.first});
+                      ArrayRef<uint32_t>{GVI.second,
+                                         static_cast<uint32_t>(GVI.first >> 32),
+                                         static_cast<uint32_t>(GVI.first)},
+                      ValueGuidAbbrev);
   }
 
   if (!StackIdIndices.empty()) {
@@ -4488,7 +4510,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   }
 
   // Abbrev for FS_COMBINED_PROFILE.
-  auto Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_PROFILE));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // modid
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 721d144d7f4c..869670d43a17 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -114,6 +114,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Timer.h"
+#include "llvm/Support/VCSRevision.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
@@ -497,12 +498,15 @@ bool AsmPrinter::doInitialization(Module &M) {
     else
       FileName = M.getSourceFileName();
     if (MAI->hasFourStringsDotFile()) {
-#ifdef PACKAGE_VENDOR
       const char VerStr[] =
-          PACKAGE_VENDOR " " PACKAGE_NAME " version " PACKAGE_VERSION;
-#else
-      const char VerStr[] = PACKAGE_NAME " version " PACKAGE_VERSION;
+#ifdef PACKAGE_VENDOR
+          PACKAGE_VENDOR " "
+#endif
+          PACKAGE_NAME " version " PACKAGE_VERSION
+#ifdef LLVM_REVISION
+                         " (" LLVM_REVISION ")"
 #endif
+          ;
       // TODO: Add timestamp and description.
       OutStreamer->emitFileDirective(FileName, VerStr, "", "");
     } else {
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 2c24de60edd4..77bf1b165d0c 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -65,8 +65,8 @@ add_llvm_component_library(LLVMCodeGen
   FEntryInserter.cpp
   FinalizeISel.cpp
   FixupStatepointCallerSaved.cpp
-  FreeMachineFunction.cpp
   FuncletLayout.cpp
+  MachineFunctionAnalysis.cpp
   GCMetadata.cpp
   GCMetadataPrinter.cpp
   GCRootLowering.cpp
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 22a766f8d625..339a1f1f2f00 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1431,10 +1431,8 @@ static bool SinkCast(CastInst *CI) {
     if (!InsertedCast) {
       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
       assert(InsertPt != UserBB->end());
-      InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
-                                      CI->getType(), "");
+      InsertedCast = cast<CastInst>(CI->clone());
       InsertedCast->insertBefore(*UserBB, InsertPt);
-      InsertedCast->setDebugLoc(CI->getDebugLoc());
     }
 
     // Replace a use of the cast with a use of the new cast.
@@ -8272,6 +8270,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
       IRBuilder<> Builder(Branch);
       if (UI->getParent() != Branch->getParent())
         UI->moveBefore(Branch);
+      UI->dropPoisonGeneratingFlags();
       Value *NewCmp = Builder.CreateCmp(ICmpInst::ICMP_EQ, UI,
                                         ConstantInt::get(UI->getType(), 0));
       LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
@@ -8285,6 +8284,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
       IRBuilder<> Builder(Branch);
       if (UI->getParent() != Branch->getParent())
         UI->moveBefore(Branch);
+      UI->dropPoisonGeneratingFlags();
       Value *NewCmp = Builder.CreateCmp(Cmp->getPredicate(), UI,
                                         ConstantInt::get(UI->getType(), 0));
       LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 031a271de5bd..8573b016d1e5 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -1639,8 +1639,7 @@ bool ComplexDeinterleavingGraph::checkNodes() {
 ComplexDeinterleavingGraph::NodePtr
 ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) {
   if (auto *Intrinsic = dyn_cast<IntrinsicInst>(RootI)) {
-    if (Intrinsic->getIntrinsicID() !=
-        Intrinsic::experimental_vector_interleave2)
+    if (Intrinsic->getIntrinsicID() != Intrinsic::vector_interleave2)
       return nullptr;
 
     auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(0));
@@ -1675,7 +1674,7 @@ ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real,
   Value *FinalValue = nullptr;
   if (match(Real, m_ExtractValue<0>(m_Instruction(I))) &&
       match(Imag, m_ExtractValue<1>(m_Specific(I))) &&
-      match(I, m_Intrinsic<Intrinsic::experimental_vector_deinterleave2>(
+      match(I, m_Intrinsic<Intrinsic::vector_deinterleave2>(
                    m_Value(FinalValue)))) {
     NodePtr PlaceholderNode = prepareCompositeNode(
         llvm::ComplexDeinterleavingOperation::Deinterleave, Real, Imag);
@@ -1960,13 +1959,11 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
       // Splats that are not constant are interleaved where they are located
       Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode();
       IRBuilder<> IRB(InsertPoint);
-      ReplacementNode =
-          IRB.CreateIntrinsic(Intrinsic::experimental_vector_interleave2, NewTy,
-                              {Node->Real, Node->Imag});
+      ReplacementNode = IRB.CreateIntrinsic(Intrinsic::vector_interleave2,
+                                            NewTy, {Node->Real, Node->Imag});
     } else {
-      ReplacementNode =
-          Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2,
-                                  NewTy, {Node->Real, Node->Imag});
+      ReplacementNode = Builder.CreateIntrinsic(
+          Intrinsic::vector_interleave2, NewTy, {Node->Real, Node->Imag});
     }
     break;
   }
@@ -1991,9 +1988,8 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
     auto *B = replaceNode(Builder, Node->Operands[1]);
     auto *NewMaskTy = VectorType::getDoubleElementsVectorType(
         cast<VectorType>(MaskReal->getType()));
-    auto *NewMask =
-        Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2,
-                                NewMaskTy, {MaskReal, MaskImag});
+    auto *NewMask = Builder.CreateIntrinsic(Intrinsic::vector_interleave2,
+                                            NewMaskTy, {MaskReal, MaskImag});
     ReplacementNode = Builder.CreateSelect(NewMask, A, B);
     break;
   }
@@ -2021,8 +2017,8 @@ void ComplexDeinterleavingGraph::processReductionOperation(
   Value *InitImag = OldPHIImag->getIncomingValueForBlock(Incoming);
 
   IRBuilder<> Builder(Incoming->getTerminator());
-  auto *NewInit = Builder.CreateIntrinsic(
-      Intrinsic::experimental_vector_interleave2, NewVTy, {InitReal, InitImag});
+  auto *NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
+                                          {InitReal, InitImag});
 
   NewPHI->addIncoming(NewInit, Incoming);
   NewPHI->addIncoming(OperationReplacement, BackEdge);
@@ -2034,9 +2030,9 @@ void ComplexDeinterleavingGraph::processReductionOperation(
 
   Builder.SetInsertPoint(
       &*FinalReductionReal->getParent()->getFirstInsertionPt());
-  auto *Deinterleave = Builder.CreateIntrinsic(
-      Intrinsic::experimental_vector_deinterleave2,
-      OperationReplacement->getType(), OperationReplacement);
+  auto *Deinterleave = Builder.CreateIntrinsic(Intrinsic::vector_deinterleave2,
+                                               OperationReplacement->getType(),
+                                               OperationReplacement);
 
   auto *NewReal = Builder.CreateExtractValue(Deinterleave, (uint64_t)0);
   FinalReductionReal->replaceUsesOfWith(Real, NewReal);
diff --git a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
index facc01452d2f..578854cdb4a5 100644
--- a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -68,7 +68,7 @@ DeadMachineInstructionElimPass::run(MachineFunction &MF,
                                     MachineFunctionAnalysisManager &) {
   if (!DeadMachineInstructionElimImpl().runImpl(MF))
     return PreservedAnalyses::all();
-  PreservedAnalyses PA;
+  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
   PA.preserveSet<CFGAnalyses>();
   return PA;
 }
diff --git a/llvm/lib/CodeGen/FreeMachineFunction.cpp b/llvm/lib/CodeGen/FreeMachineFunction.cpp
deleted file mode 100644
index f38f3e63a3f3..000000000000
--- a/llvm/lib/CodeGen/FreeMachineFunction.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//===- FreeMachineFunction.cpp --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/FreeMachineFunction.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-
-using namespace llvm;
-
-PreservedAnalyses
-FreeMachineFunctionPass::run(MachineFunction &MF,
-                             MachineFunctionAnalysisManager &MFAM) {
-  auto &MMI = MF.getMMI();
-  MFAM.invalidate(MF, PreservedAnalyses::none());
-  MMI.deleteMachineFunctionFor(MF.getFunction()); // MF is dangling now.
-  return PreservedAnalyses::none();
-}
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 050f42e9039b..653e7689b577 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5069,6 +5069,9 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
   const unsigned EltBits = ScalarTy.getScalarSizeInBits();
   LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
   LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
+
+  unsigned KnownLeadingZeros =
+      KB ? KB->getKnownBits(LHS).countMinLeadingZeros() : 0;
   auto &MIB = Builder;
 
   bool UseNPQ = false;
@@ -5086,8 +5089,12 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
     // at the end.
     // TODO: Use undef values for divisor of 1.
     if (!Divisor.isOne()) {
+
+      // UnsignedDivisionByConstantInfo doesn't work correctly if leading zeros
+      // in the dividend exceeds the leading zeros for the divisor.
       UnsignedDivisionByConstantInfo magics =
-          UnsignedDivisionByConstantInfo::get(Divisor);
+          UnsignedDivisionByConstantInfo::get(
+              Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero()));
 
       Magic = std::move(magics.Magic);
 
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
index 123bf21f657c..fb33801a3a33 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
@@ -77,7 +77,7 @@ bool CombinerHelper::matchExtractVectorElement(MachineInstr &MI,
 
   // Fold extractVectorElement(Vector, TOOLARGE) -> undef
   if (IndexC && VectorTy.isFixedVector() &&
-      IndexC->getZExtValue() >= VectorTy.getNumElements() &&
+      IndexC->uge(VectorTy.getNumElements()) &&
       isLegalOrBeforeLegalizer({TargetOpcode::G_IMPLICIT_DEF, {DstTy}})) {
     // For fixed-length vectors, it's invalid to extract out-of-range elements.
     MatchInfo = [=](MachineIRBuilder &B) { B.buildUndef(Dst); };
@@ -324,3 +324,26 @@ bool CombinerHelper::matchExtractVectorElementWithBuildVectorTrunc(
 
   return true;
 }
+
+bool CombinerHelper::matchInsertVectorElementOOB(MachineInstr &MI,
+                                                 BuildFnTy &MatchInfo) {
+  GInsertVectorElement *Insert = cast<GInsertVectorElement>(&MI);
+
+  Register Dst = Insert->getReg(0);
+  LLT DstTy = MRI.getType(Dst);
+  Register Index = Insert->getIndexReg();
+
+  if (!DstTy.isFixedVector())
+    return false;
+
+  std::optional<ValueAndVReg> MaybeIndex =
+      getIConstantVRegValWithLookThrough(Index, MRI);
+
+  if (MaybeIndex && MaybeIndex->Value.uge(DstTy.getNumElements()) &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_IMPLICIT_DEF, {DstTy}})) {
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildUndef(Dst); };
+    return true;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 51ab7b6262c6..529e50c8ebe0 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -694,6 +694,20 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
     const MachineMemOperand *MMO = *MI.memoperands_begin();
     return TyBits - MMO->getSizeInBits().getValue();
   }
+  case TargetOpcode::G_AND:
+  case TargetOpcode::G_OR:
+  case TargetOpcode::G_XOR: {
+    Register Src1 = MI.getOperand(1).getReg();
+    unsigned Src1NumSignBits =
+        computeNumSignBits(Src1, DemandedElts, Depth + 1);
+    if (Src1NumSignBits != 1) {
+      Register Src2 = MI.getOperand(2).getReg();
+      unsigned Src2NumSignBits =
+          computeNumSignBits(Src2, DemandedElts, Depth + 1);
+      FirstAnswer = std::min(Src1NumSignBits, Src2NumSignBits);
+    }
+    break;
+  }
   case TargetOpcode::G_TRUNC: {
     Register Src = MI.getOperand(1).getReg();
     LLT SrcTy = MRI.getType(Src);
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 8cf392ab0567..e26c6ca3d616 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1804,7 +1804,7 @@ bool IRTranslator::translateTrap(const CallInst &CI,
 
 bool IRTranslator::translateVectorInterleave2Intrinsic(
     const CallInst &CI, MachineIRBuilder &MIRBuilder) {
-  assert(CI.getIntrinsicID() == Intrinsic::experimental_vector_interleave2 &&
+  assert(CI.getIntrinsicID() == Intrinsic::vector_interleave2 &&
          "This function can only be called on the interleave2 intrinsic!");
   // Canonicalize interleave2 to G_SHUFFLE_VECTOR (similar to SelectionDAG).
   Register Op0 = getOrCreateVReg(*CI.getOperand(0));
@@ -1820,7 +1820,7 @@ bool IRTranslator::translateVectorInterleave2Intrinsic(
 
 bool IRTranslator::translateVectorDeinterleave2Intrinsic(
     const CallInst &CI, MachineIRBuilder &MIRBuilder) {
-  assert(CI.getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2 &&
+  assert(CI.getIntrinsicID() == Intrinsic::vector_deinterleave2 &&
          "This function can only be called on the deinterleave2 intrinsic!");
   // Canonicalize deinterleave2 to shuffles that extract sub-vectors (similar to
   // SelectionDAG).
@@ -2223,7 +2223,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     // addresses. We can treat it like a normal dbg_value intrinsic here; to
     // benefit from the full analysis of stack/SSA locations, GlobalISel would
     // need to register for and use the AssignmentTrackingAnalysis pass.
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   case Intrinsic::dbg_value: {
     // This form of DBG_VALUE is target-independent.
     const DbgValueInst &DI = cast<DbgValueInst>(CI);
@@ -2572,15 +2572,15 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     return true;
   }
 
-  case Intrinsic::experimental_vector_interleave2:
-  case Intrinsic::experimental_vector_deinterleave2: {
+  case Intrinsic::vector_interleave2:
+  case Intrinsic::vector_deinterleave2: {
     // Both intrinsics have at least one operand.
     Value *Op0 = CI.getOperand(0);
     LLT ResTy = getLLTForType(*Op0->getType(), MIRBuilder.getDataLayout());
     if (!ResTy.isFixedVector())
       return false;
 
-    if (CI.getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+    if (CI.getIntrinsicID() == Intrinsic::vector_interleave2)
       return translateVectorInterleave2Intrinsic(CI, MIRBuilder);
 
     return translateVectorDeinterleave2Intrinsic(CI, MIRBuilder);
diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
index fb9656c09ca3..526ea0868af5 100644
--- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
@@ -84,21 +84,20 @@ BaseIndexOffset GISelAddressing::getPointerInfo(Register Ptr,
                                                 MachineRegisterInfo &MRI) {
   BaseIndexOffset Info;
   Register PtrAddRHS;
-  if (!mi_match(Ptr, MRI, m_GPtrAdd(m_Reg(Info.BaseReg), m_Reg(PtrAddRHS)))) {
-    Info.BaseReg = Ptr;
-    Info.IndexReg = Register();
-    Info.IsIndexSignExt = false;
+  Register BaseReg;
+  if (!mi_match(Ptr, MRI, m_GPtrAdd(m_Reg(BaseReg), m_Reg(PtrAddRHS)))) {
+    Info.setBase(Ptr);
+    Info.setOffset(0);
     return Info;
   }
-
+  Info.setBase(BaseReg);
   auto RHSCst = getIConstantVRegValWithLookThrough(PtrAddRHS, MRI);
   if (RHSCst)
-    Info.Offset = RHSCst->Value.getSExtValue();
+    Info.setOffset(RHSCst->Value.getSExtValue());
 
   // Just recognize a simple case for now. In future we'll need to match
   // indexing patterns for base + index + constant.
-  Info.IndexReg = PtrAddRHS;
-  Info.IsIndexSignExt = false;
+  Info.setIndex(PtrAddRHS);
   return Info;
 }
 
@@ -114,15 +113,16 @@ bool GISelAddressing::aliasIsKnownForLoadStore(const MachineInstr &MI1,
   BaseIndexOffset BasePtr0 = getPointerInfo(LdSt1->getPointerReg(), MRI);
   BaseIndexOffset BasePtr1 = getPointerInfo(LdSt2->getPointerReg(), MRI);
 
-  if (!BasePtr0.BaseReg.isValid() || !BasePtr1.BaseReg.isValid())
+  if (!BasePtr0.getBase().isValid() || !BasePtr1.getBase().isValid())
     return false;
 
   LocationSize Size1 = LdSt1->getMemSize();
   LocationSize Size2 = LdSt2->getMemSize();
 
   int64_t PtrDiff;
-  if (BasePtr0.BaseReg == BasePtr1.BaseReg) {
-    PtrDiff = BasePtr1.Offset - BasePtr0.Offset;
+  if (BasePtr0.getBase() == BasePtr1.getBase() && BasePtr0.hasValidOffset() &&
+      BasePtr1.hasValidOffset()) {
+    PtrDiff = BasePtr1.getOffset() - BasePtr0.getOffset();
     // If the size of memory access is unknown, do not use it to do analysis.
     // One example of unknown size memory access is to load/store scalable
     // vector objects on the stack.
@@ -149,8 +149,8 @@ bool GISelAddressing::aliasIsKnownForLoadStore(const MachineInstr &MI1,
   // able to calculate their relative offset if at least one arises
   // from an alloca. However, these allocas cannot overlap and we
   // can infer there is no alias.
-  auto *Base0Def = getDefIgnoringCopies(BasePtr0.BaseReg, MRI);
-  auto *Base1Def = getDefIgnoringCopies(BasePtr1.BaseReg, MRI);
+  auto *Base0Def = getDefIgnoringCopies(BasePtr0.getBase(), MRI);
+  auto *Base1Def = getDefIgnoringCopies(BasePtr1.getBase(), MRI);
   if (!Base0Def || !Base1Def)
     return false; // Couldn't tell anything.
 
@@ -534,16 +534,20 @@ bool LoadStoreOpt::addStoreToCandidate(GStore &StoreMI,
 
   Register StoreAddr = StoreMI.getPointerReg();
   auto BIO = getPointerInfo(StoreAddr, *MRI);
-  Register StoreBase = BIO.BaseReg;
-  uint64_t StoreOffCst = BIO.Offset;
+  Register StoreBase = BIO.getBase();
   if (C.Stores.empty()) {
+    C.BasePtr = StoreBase;
+    if (!BIO.hasValidOffset()) {
+      C.CurrentLowestOffset = 0;
+    } else {
+      C.CurrentLowestOffset = BIO.getOffset();
+    }
     // This is the first store of the candidate.
     // If the offset can't possibly allow for a lower addressed store with the
     // same base, don't bother adding it.
-    if (StoreOffCst < ValueTy.getSizeInBytes())
+    if (BIO.hasValidOffset() &&
+        BIO.getOffset() < static_cast<int64_t>(ValueTy.getSizeInBytes()))
       return false;
-    C.BasePtr = StoreBase;
-    C.CurrentLowestOffset = StoreOffCst;
     C.Stores.emplace_back(&StoreMI);
     LLVM_DEBUG(dbgs() << "Starting a new merge candidate group with: "
                       << StoreMI);
@@ -563,8 +567,12 @@ bool LoadStoreOpt::addStoreToCandidate(GStore &StoreMI,
   // writes to the next lowest adjacent address.
   if (C.BasePtr != StoreBase)
     return false;
-  if ((C.CurrentLowestOffset - ValueTy.getSizeInBytes()) !=
-      static_cast<uint64_t>(StoreOffCst))
+  // If we don't have a valid offset, we can't guarantee to be an adjacent
+  // offset.
+  if (!BIO.hasValidOffset())
+    return false;
+  if ((C.CurrentLowestOffset -
+       static_cast<int64_t>(ValueTy.getSizeInBytes())) != BIO.getOffset())
     return false;
 
   // This writes to an adjacent address. Allow it.
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index ae43e9ccf611..4e3781cb4e9d 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -12,6 +12,7 @@
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/CodeGenCommonISel.h"
 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
@@ -28,6 +29,7 @@
 #include "llvm/CodeGen/StackProtector.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/Constants.h"
@@ -1709,3 +1711,84 @@ bool llvm::isPreISelGenericFloatingPointOpcode(unsigned Opc) {
     return false;
   }
 }
+
+namespace {
+enum class UndefPoisonKind {
+  PoisonOnly = (1 << 0),
+  UndefOnly = (1 << 1),
+  UndefOrPoison = PoisonOnly | UndefOnly,
+};
+}
+
+[[maybe_unused]] static bool includesPoison(UndefPoisonKind Kind) {
+  return (unsigned(Kind) & unsigned(UndefPoisonKind::PoisonOnly)) != 0;
+}
+
+[[maybe_unused]] static bool includesUndef(UndefPoisonKind Kind) {
+  return (unsigned(Kind) & unsigned(UndefPoisonKind::UndefOnly)) != 0;
+}
+
+static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI,
+                                   bool ConsiderFlagsAndMetadata,
+                                   UndefPoisonKind Kind) {
+  MachineInstr *RegDef = MRI.getVRegDef(Reg);
+
+  switch (RegDef->getOpcode()) {
+  case TargetOpcode::G_FREEZE:
+    return false;
+  default:
+    return true;
+  }
+}
+
+static bool isGuaranteedNotToBeUndefOrPoison(Register Reg,
+                                             const MachineRegisterInfo &MRI,
+                                             unsigned Depth,
+                                             UndefPoisonKind Kind) {
+  if (Depth >= MaxAnalysisRecursionDepth)
+    return false;
+
+  MachineInstr *RegDef = MRI.getVRegDef(Reg);
+
+  switch (RegDef->getOpcode()) {
+  case TargetOpcode::G_FREEZE:
+    return true;
+  case TargetOpcode::G_IMPLICIT_DEF:
+    return !includesUndef(Kind);
+  default:
+    return false;
+  }
+}
+
+bool llvm::canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI,
+                                  bool ConsiderFlagsAndMetadata) {
+  return ::canCreateUndefOrPoison(Reg, MRI, ConsiderFlagsAndMetadata,
+                                  UndefPoisonKind::UndefOrPoison);
+}
+
+bool canCreatePoison(Register Reg, const MachineRegisterInfo &MRI,
+                     bool ConsiderFlagsAndMetadata = true) {
+  return ::canCreateUndefOrPoison(Reg, MRI, ConsiderFlagsAndMetadata,
+                                  UndefPoisonKind::PoisonOnly);
+}
+
+bool llvm::isGuaranteedNotToBeUndefOrPoison(Register Reg,
+                                            const MachineRegisterInfo &MRI,
+                                            unsigned Depth) {
+  return ::isGuaranteedNotToBeUndefOrPoison(Reg, MRI, Depth,
+                                            UndefPoisonKind::UndefOrPoison);
+}
+
+bool llvm::isGuaranteedNotToBePoison(Register Reg,
+                                     const MachineRegisterInfo &MRI,
+                                     unsigned Depth) {
+  return ::isGuaranteedNotToBeUndefOrPoison(Reg, MRI, Depth,
+                                            UndefPoisonKind::PoisonOnly);
+}
+
+bool llvm::isGuaranteedNotToBeUndef(Register Reg,
+                                    const MachineRegisterInfo &MRI,
+                                    unsigned Depth) {
+  return ::isGuaranteedNotToBeUndefOrPoison(Reg, MRI, Depth,
+                                            UndefPoisonKind::UndefOnly);
+}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 8989eabbe6df..8c9065aec7fa 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -535,9 +535,9 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
       // At present, we only have intrinsics to represent (de)interleaving
       // with a factor of 2.
-      if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
+      if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
         Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
-      if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+      if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
         Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
     }
   }
diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
index f2d5c3c867c2..e5f164b18272 100644
--- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
+++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -890,7 +890,7 @@ public:
           ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0),
           ConstantInt::get(Type::getInt32Ty(LI->getContext()), i),
       };
-      int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, ArrayRef(Idx, 2));
+      int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, Idx);
       Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr);
     }
 
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 4737ff5cb43a..d758e633fc2d 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
@@ -97,13 +98,15 @@ public:
   /// Create an empty function with the given name.
   Function *createDummyFunction(StringRef Name, Module &M);
 
-  bool parseMachineFunctions(Module &M, MachineModuleInfo &MMI);
+  bool parseMachineFunctions(Module &M, MachineModuleInfo &MMI,
+                             ModuleAnalysisManager *FAM = nullptr);
 
   /// Parse the machine function in the current YAML document.
   ///
   ///
   /// Return true if an error occurred.
-  bool parseMachineFunction(Module &M, MachineModuleInfo &MMI);
+  bool parseMachineFunction(Module &M, MachineModuleInfo &MMI,
+                            ModuleAnalysisManager *FAM);
 
   /// Initialize the machine function to the state that's described in the MIR
   /// file.
@@ -275,13 +278,14 @@ MIRParserImpl::parseIRModule(DataLayoutCallbackTy DataLayoutCallback) {
   return M;
 }
 
-bool MIRParserImpl::parseMachineFunctions(Module &M, MachineModuleInfo &MMI) {
+bool MIRParserImpl::parseMachineFunctions(Module &M, MachineModuleInfo &MMI,
+                                          ModuleAnalysisManager *MAM) {
   if (NoMIRDocuments)
     return false;
 
   // Parse the machine functions.
   do {
-    if (parseMachineFunction(M, MMI))
+    if (parseMachineFunction(M, MMI, MAM))
       return true;
     In.nextDocument();
   } while (In.setCurrentDocument());
@@ -303,7 +307,8 @@ Function *MIRParserImpl::createDummyFunction(StringRef Name, Module &M) {
   return F;
 }
 
-bool MIRParserImpl::parseMachineFunction(Module &M, MachineModuleInfo &MMI) {
+bool MIRParserImpl::parseMachineFunction(Module &M, MachineModuleInfo &MMI,
+                                         ModuleAnalysisManager *MAM) {
   // Parse the yaml.
   yaml::MachineFunction YamlMF;
   yaml::EmptyContext Ctx;
@@ -327,14 +332,28 @@ bool MIRParserImpl::parseMachineFunction(Module &M, MachineModuleInfo &MMI) {
                    "' isn't defined in the provided LLVM IR");
     }
   }
-  if (MMI.getMachineFunction(*F) != nullptr)
-    return error(Twine("redefinition of machine function '") + FunctionName +
-                 "'");
 
-  // Create the MachineFunction.
-  MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
-  if (initializeMachineFunction(YamlMF, MF))
-    return true;
+  if (!MAM) {
+    if (MMI.getMachineFunction(*F) != nullptr)
+      return error(Twine("redefinition of machine function '") + FunctionName +
+                   "'");
+
+    // Create the MachineFunction.
+    MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
+    if (initializeMachineFunction(YamlMF, MF))
+      return true;
+  } else {
+    auto &FAM =
+        MAM->getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+    if (FAM.getCachedResult<MachineFunctionAnalysis>(*F))
+      return error(Twine("redefinition of machine function '") + FunctionName +
+                   "'");
+
+    // Create the MachineFunction.
+    MachineFunction &MF = FAM.getResult<MachineFunctionAnalysis>(*F).getMF();
+    if (initializeMachineFunction(YamlMF, MF))
+      return true;
+  }
 
   return false;
 }
@@ -766,6 +785,7 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS,
   MFI.setHasVAStart(YamlMFI.HasVAStart);
   MFI.setHasMustTailInVarArgFunc(YamlMFI.HasMustTailInVarArgFunc);
   MFI.setHasTailCall(YamlMFI.HasTailCall);
+  MFI.setCalleeSavedInfoValid(YamlMFI.IsCalleeSavedInfoValid);
   MFI.setLocalFrameSize(YamlMFI.LocalFrameSize);
   if (!YamlMFI.SavePoint.Value.empty()) {
     MachineBasicBlock *MBB = nullptr;
@@ -1107,6 +1127,11 @@ bool MIRParser::parseMachineFunctions(Module &M, MachineModuleInfo &MMI) {
   return Impl->parseMachineFunctions(M, MMI);
 }
 
+bool MIRParser::parseMachineFunctions(Module &M, ModuleAnalysisManager &MAM) {
+  auto &MMI = MAM.getResult<MachineModuleAnalysis>(M).getMMI();
+  return Impl->parseMachineFunctions(M, MMI, &MAM);
+}
+
 std::unique_ptr<MIRParser> llvm::createMIRParserFromFile(
     StringRef Filename, SMDiagnostic &Error, LLVMContext &Context,
     std::function<void(Function &)> ProcessIRFunction) {
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index e49d54f31c3e..a3bff83333b2 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -368,6 +368,7 @@ void MIRPrinter::convert(ModuleSlotTracker &MST,
   YamlMFI.HasVAStart = MFI.hasVAStart();
   YamlMFI.HasMustTailInVarArgFunc = MFI.hasMustTailInVarArgFunc();
   YamlMFI.HasTailCall = MFI.hasTailCall();
+  YamlMFI.IsCalleeSavedInfoValid = MFI.isCalleeSavedInfoValid();
   YamlMFI.LocalFrameSize = MFI.getLocalFrameSize();
   if (MFI.getSavePoint()) {
     raw_string_ostream StrOS(YamlMFI.SavePoint.Value);
diff --git a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
new file mode 100644
index 000000000000..519a77c44acf
--- /dev/null
+++ b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
@@ -0,0 +1,46 @@
+//===- MachineFunctionAnalysis.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the MachineFunctionAnalysis
+// members.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+AnalysisKey MachineFunctionAnalysis::Key;
+
+bool MachineFunctionAnalysis::Result::invalidate(
+    Function &, const PreservedAnalyses &PA,
+    FunctionAnalysisManager::Invalidator &) {
+  // Unless it is invalidated explicitly, it should remain preserved.
+  auto PAC = PA.getChecker<MachineFunctionAnalysis>();
+  return !PAC.preservedWhenStateless();
+}
+
+MachineFunctionAnalysis::Result
+MachineFunctionAnalysis::run(Function &F, FunctionAnalysisManager &FAM) {
+  auto &Context = F.getContext();
+  const TargetSubtargetInfo &STI = *TM->getSubtargetImpl(F);
+  auto &MMI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F)
+                  .getCachedResult<MachineModuleAnalysis>(*F.getParent())
+                  ->getMMI();
+  auto MF = std::make_unique<MachineFunction>(
+      F, *TM, STI, Context.generateMachineFunctionNum(F), MMI);
+  MF->initTargetMachineFunctionInfo(STI);
+
+  // MRI callback for target specific initializations.
+  TM->registerMachineRegisterInfoCallback(*MF);
+
+  return Result(std::move(MF));
+}
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 997f6eb08512..727a98c41bce 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1264,26 +1264,32 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
 
   // If we have a COPY with other uses in the loop, hoist to allow the users to
   // also be hoisted.
-  Register DefReg;
-  if (MI.isCopy() && MI.getOperand(0).isReg() &&
-      (DefReg = MI.getOperand(0).getReg()).isVirtual() &&
-      MI.getOperand(1).isReg() && MI.getOperand(1).getReg().isVirtual() &&
-      IsLoopInvariantInst(MI, CurLoop) &&
-      any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
-             [&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
-               if (!CurLoop->contains(&UseMI))
-                 return false;
-
-               // COPY is a cheap instruction, but if moving it won't cause high
-               // RP we're fine to hoist it even if the user can't be hoisted
-               // later Otherwise we want to check the user if it's hoistable
-               if (CanCauseHighRegPressure(Cost, false) &&
-                   !CurLoop->isLoopInvariant(UseMI, DefReg))
-                 return false;
-
-               return true;
-             }))
-    return true;
+  // TODO: Handle all isCopyLike?
+  if (MI.isCopy() || MI.isRegSequence()) {
+    Register DefReg = MI.getOperand(0).getReg();
+    if (DefReg.isVirtual() &&
+        all_of(MI.uses(),
+               [](const MachineOperand &UseOp) {
+                 return !UseOp.isReg() || UseOp.getReg().isVirtual();
+               }) &&
+        IsLoopInvariantInst(MI, CurLoop) &&
+        any_of(MRI->use_nodbg_instructions(DefReg),
+               [&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
+                 if (!CurLoop->contains(&UseMI))
+                   return false;
+
+                 // COPY is a cheap instruction, but if moving it won't cause
+                 // high RP we're fine to hoist it even if the user can't be
+                 // hoisted later Otherwise we want to check the user if it's
+                 // hoistable
+                 if (CanCauseHighRegPressure(Cost, false) &&
+                     !CurLoop->isLoopInvariant(UseMI, DefReg))
+                   return false;
+
+                 return true;
+               }))
+      return true;
+  }
 
   // High register pressure situation, only hoist if the instruction is going
   // to be remat'ed.
diff --git a/llvm/lib/CodeGen/MachinePassManager.cpp b/llvm/lib/CodeGen/MachinePassManager.cpp
index 2763193b2c30..6d540808d4cc 100644
--- a/llvm/lib/CodeGen/MachinePassManager.cpp
+++ b/llvm/lib/CodeGen/MachinePassManager.cpp
@@ -12,21 +12,24 @@
 
 #include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/IR/PassManagerImpl.h"
 
 using namespace llvm;
 
-namespace llvm {
-
 AnalysisKey FunctionAnalysisManagerMachineFunctionProxy::Key;
 
+namespace llvm {
 template class AnalysisManager<MachineFunction>;
 template class PassManager<MachineFunction>;
 template class InnerAnalysisManagerProxy<MachineFunctionAnalysisManager,
                                          Module>;
+template class InnerAnalysisManagerProxy<MachineFunctionAnalysisManager,
+                                         Function>;
 template class OuterAnalysisManagerProxy<ModuleAnalysisManager,
                                          MachineFunction>;
+} // namespace llvm
 
 bool FunctionAnalysisManagerMachineFunctionProxy::Result::invalidate(
     MachineFunction &IR, const PreservedAnalyses &PA,
@@ -69,38 +72,65 @@ bool MachineFunctionAnalysisManagerModuleProxy::Result::invalidate(
   return false;
 }
 
+template <>
+bool MachineFunctionAnalysisManagerFunctionProxy::Result::invalidate(
+    Function &F, const PreservedAnalyses &PA,
+    FunctionAnalysisManager::Invalidator &Inv) {
+  // If literally everything is preserved, we're done.
+  if (PA.areAllPreserved())
+    return false; // This is still a valid proxy.
+
+  // If this proxy isn't marked as preserved, then even if the result remains
+  // valid, the key itself may no longer be valid, so we clear everything.
+  //
+  // Note that in order to preserve this proxy, a module pass must ensure that
+  // the MFAM has been completely updated to handle the deletion of functions.
+  // Specifically, any MFAM-cached results for those functions need to have been
+  // forcibly cleared. When preserved, this proxy will only invalidate results
+  // cached on functions *still in the module* at the end of the module pass.
+  auto PAC = PA.getChecker<MachineFunctionAnalysisManagerFunctionProxy>();
+  if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>()) {
+    InnerAM->clear();
+    return true;
+  }
+
+  // FIXME: be more precise, see
+  // FunctionAnalysisManagerModuleProxy::Result::invalidate.
+  if (!PA.allAnalysesInSetPreserved<AllAnalysesOn<MachineFunction>>()) {
+    InnerAM->clear();
+    return true;
+  }
+
+  // Return false to indicate that this result is still a valid proxy.
+  return false;
+}
+
 PreservedAnalyses
-ModuleToMachineFunctionPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) {
-  auto &MMI = AM.getResult<MachineModuleAnalysis>(M).getMMI();
+FunctionToMachineFunctionPassAdaptor::run(Function &F,
+                                          FunctionAnalysisManager &FAM) {
   MachineFunctionAnalysisManager &MFAM =
-      AM.getResult<MachineFunctionAnalysisManagerModuleProxy>(M).getManager();
-  PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(M);
+      FAM.getResult<MachineFunctionAnalysisManagerFunctionProxy>(F)
+          .getManager();
+  PassInstrumentation PI = FAM.getResult<PassInstrumentationAnalysis>(F);
   PreservedAnalyses PA = PreservedAnalyses::all();
-  for (Function &F : M) {
-    // Do not codegen any 'available_externally' functions at all, they have
-    // definitions outside the translation unit.
-    if (F.isDeclaration() || F.hasAvailableExternallyLinkage())
-      continue;
+  // Do not codegen any 'available_externally' functions at all, they have
+  // definitions outside the translation unit.
+  if (F.isDeclaration() || F.hasAvailableExternallyLinkage())
+    return PreservedAnalyses::all();
 
-    MachineFunction &MF = MMI.getOrCreateMachineFunction(F);
+  MachineFunction &MF = FAM.getResult<MachineFunctionAnalysis>(F).getMF();
 
-    if (!PI.runBeforePass<MachineFunction>(*Pass, MF))
-      continue;
-    PreservedAnalyses PassPA = Pass->run(MF, MFAM);
-    if (MMI.getMachineFunction(F)) {
-      MFAM.invalidate(MF, PassPA);
-      PI.runAfterPass(*Pass, MF, PassPA);
-    } else {
-      MFAM.clear(MF, F.getName());
-      PI.runAfterPassInvalidated<MachineFunction>(*Pass, PassPA);
-    }
-    PA.intersect(std::move(PassPA));
-  }
+  if (!PI.runBeforePass<MachineFunction>(*Pass, MF))
+    return PreservedAnalyses::all();
+  PreservedAnalyses PassPA = Pass->run(MF, MFAM);
+  MFAM.invalidate(MF, PassPA);
+  PI.runAfterPass(*Pass, MF, PassPA);
+  PA.intersect(std::move(PassPA));
 
   return PA;
 }
 
-void ModuleToMachineFunctionPassAdaptor::printPipeline(
+void FunctionToMachineFunctionPassAdaptor::printPipeline(
     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
   OS << "machine-function(";
   Pass->printPipeline(OS, MapClassName2PassName);
@@ -112,27 +142,24 @@ PreservedAnalyses
 PassManager<MachineFunction>::run(MachineFunction &MF,
                                   AnalysisManager<MachineFunction> &MFAM) {
   PassInstrumentation PI = MFAM.getResult<PassInstrumentationAnalysis>(MF);
-  Function &F = MF.getFunction();
-  MachineModuleInfo &MMI =
-      MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
-          .getCachedResult<MachineModuleAnalysis>(*F.getParent())
-          ->getMMI();
   PreservedAnalyses PA = PreservedAnalyses::all();
   for (auto &Pass : Passes) {
     if (!PI.runBeforePass<MachineFunction>(*Pass, MF))
       continue;
 
     PreservedAnalyses PassPA = Pass->run(MF, MFAM);
-    if (MMI.getMachineFunction(F)) {
-      MFAM.invalidate(MF, PassPA);
-      PI.runAfterPass(*Pass, MF, PassPA);
-    } else {
-      MFAM.clear(MF, F.getName());
-      PI.runAfterPassInvalidated<MachineFunction>(*Pass, PassPA);
-    }
+    MFAM.invalidate(MF, PassPA);
+    PI.runAfterPass(*Pass, MF, PassPA);
     PA.intersect(std::move(PassPA));
   }
   return PA;
 }
 
-} // namespace llvm
+PreservedAnalyses llvm::getMachineFunctionPassPreservedAnalyses() {
+  PreservedAnalyses PA;
+  // Machine function passes are not allowed to modify the LLVM
+  // representation, therefore we should preserve all IR analyses.
+  PA.template preserveSet<AllAnalysesOn<Module>>();
+  PA.template preserveSet<AllAnalysesOn<Function>>();
+  return PA;
+}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fd265b12d73c..c0bbea16a642 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -530,6 +530,7 @@ namespace {
     bool refineExtractVectorEltIntoMultipleNarrowExtractVectorElts(SDNode *N);
 
     SDValue visitSTORE(SDNode *N);
+    SDValue visitATOMIC_STORE(SDNode *N);
     SDValue visitLIFETIME_END(SDNode *N);
     SDValue visitINSERT_VECTOR_ELT(SDNode *N);
     SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
@@ -1909,6 +1910,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::BR_CC:              return visitBR_CC(N);
   case ISD::LOAD:               return visitLOAD(N);
   case ISD::STORE:              return visitSTORE(N);
+  case ISD::ATOMIC_STORE:       return visitATOMIC_STORE(N);
   case ISD::INSERT_VECTOR_ELT:  return visitINSERT_VECTOR_ELT(N);
   case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
   case ISD::BUILD_VECTOR:       return visitBUILD_VECTOR(N);
@@ -7620,6 +7622,8 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, const SDLoc &DL) {
 static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
                                   SDNode *N) {
   EVT VT = N0.getValueType();
+  unsigned BW = VT.getScalarSizeInBits();
+  SDLoc DL(N);
 
   auto peekThroughResize = [](SDValue V) {
     if (V->getOpcode() == ISD::ZERO_EXTEND || V->getOpcode() == ISD::TRUNCATE)
@@ -7642,16 +7646,16 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
     if (SDValue NotOperand = getBitwiseNotOperand(N01, N00,
                                                   /* AllowUndefs */ false)) {
       if (peekThroughResize(NotOperand) == N1Resized)
-        return DAG.getNode(ISD::OR, SDLoc(N), VT,
-                           DAG.getZExtOrTrunc(N00, SDLoc(N), VT), N1);
+        return DAG.getNode(ISD::OR, DL, VT, DAG.getZExtOrTrunc(N00, DL, VT),
+                           N1);
     }
 
     // fold (or (and (xor Y, -1), X), Y) -> (or X, Y)
     if (SDValue NotOperand = getBitwiseNotOperand(N00, N01,
                                                   /* AllowUndefs */ false)) {
       if (peekThroughResize(NotOperand) == N1Resized)
-        return DAG.getNode(ISD::OR, SDLoc(N), VT,
-                           DAG.getZExtOrTrunc(N01, SDLoc(N), VT), N1);
+        return DAG.getNode(ISD::OR, DL, VT, DAG.getZExtOrTrunc(N01, DL, VT),
+                           N1);
     }
   }
 
@@ -7659,13 +7663,13 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
 
   // fold or (xor X, N1), N1 --> or X, N1
   if (sd_match(N0, m_Xor(m_Value(X), m_Specific(N1))))
-    return DAG.getNode(ISD::OR, SDLoc(N), VT, X, N1);
+    return DAG.getNode(ISD::OR, DL, VT, X, N1);
 
   // fold or (xor x, y), (x and/or y) --> or x, y
   if (sd_match(N0, m_Xor(m_Value(X), m_Value(Y))) &&
       (sd_match(N1, m_And(m_Specific(X), m_Specific(Y))) ||
        sd_match(N1, m_Or(m_Specific(X), m_Specific(Y)))))
-    return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Y);
+    return DAG.getNode(ISD::OR, DL, VT, X, Y);
 
   if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG))
     return R;
@@ -7688,6 +7692,26 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
       peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1)))
     return N0;
 
+  // Attempt to match a legalized build_pair-esque pattern:
+  // or(shl(aext(Hi),BW/2),zext(Lo))
+  SDValue Lo, Hi;
+  if (sd_match(N0,
+               m_OneUse(m_Shl(m_AnyExt(m_Value(Hi)), m_SpecificInt(BW / 2)))) &&
+      sd_match(N1, m_ZExt(m_Value(Lo))) &&
+      Lo.getScalarValueSizeInBits() == (BW / 2) &&
+      Lo.getValueType() == Hi.getValueType()) {
+    // Fold build_pair(not(Lo),not(Hi)) -> not(build_pair(Lo,Hi)).
+    SDValue NotLo, NotHi;
+    if (sd_match(Lo, m_OneUse(m_Not(m_Value(NotLo)))) &&
+        sd_match(Hi, m_OneUse(m_Not(m_Value(NotHi))))) {
+      Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotLo);
+      Hi = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NotHi);
+      Hi = DAG.getNode(ISD::SHL, DL, VT, Hi,
+                       DAG.getShiftAmountConstant(BW / 2, VT, DL));
+      return DAG.getNOT(DL, DAG.getNode(ISD::OR, DL, VT, Lo, Hi), VT);
+    }
+  }
+
   return SDValue();
 }
 
@@ -9716,17 +9740,18 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   if (SDValue V = DAG.simplifyShift(N0, N1))
     return V;
 
+  SDLoc DL(N);
   EVT VT = N0.getValueType();
   EVT ShiftVT = N1.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
   // fold (shl c1, c2) -> c1<<c2
-  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1}))
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, DL, VT, {N0, N1}))
     return C;
 
   // fold vector ops
   if (VT.isVector()) {
-    if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
+    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
       return FoldedVOp;
 
     BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1);
@@ -9742,8 +9767,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
             TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
                 TargetLowering::ZeroOrNegativeOneBooleanContent) {
           if (SDValue C =
-                  DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N01, N1}))
-            return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
+                  DAG.FoldConstantArithmetic(ISD::SHL, DL, VT, {N01, N1}))
+            return DAG.getNode(ISD::AND, DL, VT, N00, C);
         }
       }
     }
@@ -9754,13 +9779,13 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
 
   // if (shl x, c) is known to be zero, return 0
   if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(OpSizeInBits)))
-    return DAG.getConstant(0, SDLoc(N), VT);
+    return DAG.getConstant(0, DL, VT);
 
   // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
   if (N1.getOpcode() == ISD::TRUNCATE &&
       N1.getOperand(0).getOpcode() == ISD::AND) {
     if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
-      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
+      return DAG.getNode(ISD::SHL, DL, VT, N0, NewOp1);
   }
 
   // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
@@ -9773,7 +9798,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
       return (c1 + c2).uge(OpSizeInBits);
     };
     if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
-      return DAG.getConstant(0, SDLoc(N), VT);
+      return DAG.getConstant(0, DL, VT);
 
     auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
                                        ConstantSDNode *RHS) {
@@ -9783,7 +9808,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
       return (c1 + c2).ult(OpSizeInBits);
     };
     if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
-      SDLoc DL(N);
       SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
       return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum);
     }
@@ -9814,7 +9838,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchOutOfRange,
                                   /*AllowUndefs*/ false,
                                   /*AllowTypeMismatch*/ true))
-      return DAG.getConstant(0, SDLoc(N), VT);
+      return DAG.getConstant(0, DL, VT);
 
     auto MatchInRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
                                                       ConstantSDNode *RHS) {
@@ -9827,7 +9851,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchInRange,
                                   /*AllowUndefs*/ false,
                                   /*AllowTypeMismatch*/ true)) {
-      SDLoc DL(N);
       SDValue Ext = DAG.getNode(N0.getOpcode(), DL, VT, N0Op0.getOperand(0));
       SDValue Sum = DAG.getZExtOrTrunc(InnerShiftAmt, DL, ShiftVT);
       Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, Sum, N1);
@@ -9852,7 +9875,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchEqual,
                                   /*AllowUndefs*/ false,
                                   /*AllowTypeMismatch*/ true)) {
-      SDLoc DL(N);
       EVT InnerShiftAmtVT = N0Op0.getOperand(1).getValueType();
       SDValue NewSHL = DAG.getZExtOrTrunc(N1, DL, InnerShiftAmtVT);
       NewSHL = DAG.getNode(ISD::SHL, DL, N0Op0.getValueType(), N0Op0, NewSHL);
@@ -9869,9 +9891,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
       return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
              LHSC.getZExtValue() <= RHSC.getZExtValue();
     };
-
-    SDLoc DL(N);
-
+    
     // fold (shl (sr[la] exact X,  C1), C2) -> (shl    X, (C2-C1)) if C1 <= C2
     // fold (shl (sr[la] exact X,  C1), C2) -> (sr[la] X, (C2-C1)) if C1 >= C2
     if (N0->getFlags().hasExact()) {
@@ -9925,7 +9945,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
   if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
       isConstantOrConstantVector(N1, /* No Opaques */ true)) {
-    SDLoc DL(N);
     SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
     SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
     return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
@@ -9946,7 +9965,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
       // Preserve the disjoint flag for Or.
       if (N0.getOpcode() == ISD::OR && N0->getFlags().hasDisjoint())
         Flags.setDisjoint(true);
-      return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1, Flags);
+      return DAG.getNode(N0.getOpcode(), DL, VT, Shl0, Shl1, Flags);
     }
   }
 
@@ -9976,7 +9995,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     SDValue N01 = N0.getOperand(1);
     if (SDValue Shl =
             DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N1), VT, {N01, N1}))
-      return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl);
+      return DAG.getNode(ISD::MUL, DL, VT, N0.getOperand(0), Shl);
   }
 
   ConstantSDNode *N1C = isConstOrConstSplat(N1);
@@ -9991,7 +10010,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   if (N0.getOpcode() == ISD::VSCALE && N1C) {
     const APInt &C0 = N0.getConstantOperandAPInt(0);
     const APInt &C1 = N1C->getAPIntValue();
-    return DAG.getVScale(SDLoc(N), VT, C0 << C1);
+    return DAG.getVScale(DL, VT, C0 << C1);
   }
 
   // Fold (shl step_vector(C0), C1) to (step_vector(C0 << C1)).
@@ -10001,7 +10020,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     const APInt &C0 = N0.getConstantOperandAPInt(0);
     if (ShlVal.ult(C0.getBitWidth())) {
       APInt NewStep = C0 << ShlVal;
-      return DAG.getStepVector(SDLoc(N), VT, NewStep);
+      return DAG.getStepVector(DL, VT, NewStep);
     }
   }
 
@@ -10012,7 +10031,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
 // Examples:
 // (srl (mul (zext i32:$a to i64), (zext i32:$a to i64)), 32) -> (mulhu $a, $b)
 // (sra (mul (sext i32:$a to i64), (sext i32:$a to i64)), 32) -> (mulhs $a, $b)
-static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG,
+static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
                                   const TargetLowering &TLI) {
   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
          "SRL or SRA node is required here!");
@@ -10023,8 +10042,6 @@ static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG,
   if (!ShiftAmtSrc)
     return SDValue();
 
-  SDLoc DL(N);
-
   // The operation feeding into the shift must be a multiply.
   SDValue ShiftOperand = N->getOperand(0);
   if (ShiftOperand.getOpcode() != ISD::MUL)
@@ -10166,11 +10183,12 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   if (SDValue V = DAG.simplifyShift(N0, N1))
     return V;
 
+  SDLoc DL(N);
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
   // fold (sra c1, c2) -> (sra c1, c2)
-  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1}))
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, DL, VT, {N0, N1}))
     return C;
 
   // Arithmetic shifting an all-sign-bit value is a no-op.
@@ -10181,7 +10199,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
 
   // fold vector ops
   if (VT.isVector())
-    if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
+    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
       return FoldedVOp;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
@@ -10192,7 +10210,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
   // clamp (add c1, c2) to max shift.
   if (N0.getOpcode() == ISD::SRA) {
-    SDLoc DL(N);
     EVT ShiftVT = N1.getValueType();
     EVT ShiftSVT = ShiftVT.getScalarType();
     SmallVector<SDValue, 16> ShiftValues;
@@ -10249,7 +10266,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
           TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
           TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
           TLI.isTruncateFree(VT, TruncVT)) {
-        SDLoc DL(N);
         SDValue Amt = DAG.getConstant(ShiftAmt, DL,
             getShiftAmountTy(N0.getOperand(0).getValueType()));
         SDValue Shift = DAG.getNode(ISD::SRL, DL, VT,
@@ -10290,7 +10306,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
         //       that restriction may conflict with other transforms.
         if (TruncVT.isSimple() && isTypeLegal(TruncVT) &&
             TLI.isTruncateFree(VT, TruncVT)) {
-          SDLoc DL(N);
           SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT);
           SDValue ShiftC =
               DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt).trunc(
@@ -10311,7 +10326,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   if (N1.getOpcode() == ISD::TRUNCATE &&
       N1.getOperand(0).getOpcode() == ISD::AND) {
     if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
-      return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1);
+      return DAG.getNode(ISD::SRA, DL, VT, N0, NewOp1);
   }
 
   // fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
@@ -10328,7 +10343,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
       EVT LargeVT = N0Op0.getValueType();
       unsigned TruncBits = LargeVT.getScalarSizeInBits() - OpSizeInBits;
       if (LargeShift->getAPIntValue() == TruncBits) {
-        SDLoc DL(N);
         EVT LargeShiftVT = getShiftAmountTy(LargeVT);
         SDValue Amt = DAG.getZExtOrTrunc(N1, DL, LargeShiftVT);
         Amt = DAG.getNode(ISD::ADD, DL, LargeShiftVT, Amt,
@@ -10346,7 +10360,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
 
   // If the sign bit is known to be zero, switch this to a SRL.
   if (DAG.SignBitIsZero(N0))
-    return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
+    return DAG.getNode(ISD::SRL, DL, VT, N0, N1);
 
   if (N1C && !N1C->isOpaque())
     if (SDValue NewSRA = visitShiftByConstant(N))
@@ -10354,7 +10368,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
 
   // Try to transform this shift into a multiply-high if
   // it matches the appropriate pattern detected in combineShiftToMULH.
-  if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
+  if (SDValue MULH = combineShiftToMULH(N, DL, DAG, TLI))
     return MULH;
 
   // Attempt to convert a sra of a load into a narrower sign-extending load.
@@ -10370,17 +10384,18 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   if (SDValue V = DAG.simplifyShift(N0, N1))
     return V;
 
+  SDLoc DL(N);
   EVT VT = N0.getValueType();
   EVT ShiftVT = N1.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
   // fold (srl c1, c2) -> c1 >>u c2
-  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1}))
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, DL, VT, {N0, N1}))
     return C;
 
   // fold vector ops
   if (VT.isVector())
-    if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
+    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
       return FoldedVOp;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
@@ -10390,7 +10405,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N1C &&
       DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(OpSizeInBits)))
-    return DAG.getConstant(0, SDLoc(N), VT);
+    return DAG.getConstant(0, DL, VT);
 
   // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
   if (N0.getOpcode() == ISD::SRL) {
@@ -10402,7 +10417,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
       return (c1 + c2).uge(OpSizeInBits);
     };
     if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
-      return DAG.getConstant(0, SDLoc(N), VT);
+      return DAG.getConstant(0, DL, VT);
 
     auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
                                        ConstantSDNode *RHS) {
@@ -10412,7 +10427,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
       return (c1 + c2).ult(OpSizeInBits);
     };
     if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
-      SDLoc DL(N);
       SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
       return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
     }
@@ -10431,7 +10445,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
       // srl (trunc (srl x, c1)), c2 --> 0 or (trunc (srl x, (add c1, c2)))
       // This is only valid if the OpSizeInBits + c1 = size of inner shift.
       if (c1 + OpSizeInBits == InnerShiftSize) {
-        SDLoc DL(N);
         if (c1 + c2 >= InnerShiftSize)
           return DAG.getConstant(0, DL, VT);
         SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
@@ -10443,7 +10456,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
       // srl (trunc (srl x, c1)), c2 --> trunc (and (srl x, (c1+c2)), Mask)
       if (N0.hasOneUse() && InnerShift.hasOneUse() &&
           c1 + c2 < InnerShiftSize) {
-        SDLoc DL(N);
         SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
         SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
                                        InnerShift.getOperand(0), NewShiftAmt);
@@ -10471,7 +10483,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
     if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
                                   /*AllowUndefs*/ false,
                                   /*AllowTypeMismatch*/ true)) {
-      SDLoc DL(N);
       SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
       SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
       SDValue Mask = DAG.getAllOnesConstant(DL, VT);
@@ -10483,7 +10494,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
     if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
                                   /*AllowUndefs*/ false,
                                   /*AllowTypeMismatch*/ true)) {
-      SDLoc DL(N);
       SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
       SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
       SDValue Mask = DAG.getAllOnesConstant(DL, VT);
@@ -10511,7 +10521,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
                                           getShiftAmountTy(SmallVT)));
       AddToWorklist(SmallShift.getNode());
       APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
-      SDLoc DL(N);
       return DAG.getNode(ISD::AND, DL, VT,
                          DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
                          DAG.getConstant(Mask, DL, VT));
@@ -10522,7 +10531,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   // bit, which is unmodified by sra.
   if (N1C && N1C->getAPIntValue() == (OpSizeInBits - 1)) {
     if (N0.getOpcode() == ISD::SRA)
-      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1);
+      return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
   }
 
   // fold (srl (ctlz x), "5") -> x  iff x has one bit set (the low bit), and x has a power
@@ -10557,10 +10566,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
                                   getShiftAmountTy(Op.getValueType())));
         AddToWorklist(Op.getNode());
       }
-
-      SDLoc DL(N);
-      return DAG.getNode(ISD::XOR, DL, VT,
-                         Op, DAG.getConstant(1, DL, VT));
+      return DAG.getNode(ISD::XOR, DL, VT, Op, DAG.getConstant(1, DL, VT));
     }
   }
 
@@ -10568,7 +10574,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   if (N1.getOpcode() == ISD::TRUNCATE &&
       N1.getOperand(0).getOpcode() == ISD::AND) {
     if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
-      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
+      return DAG.getNode(ISD::SRL, DL, VT, N0, NewOp1);
   }
 
   // fold operands of srl based on knowledge that the low bits are not
@@ -10622,7 +10628,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
 
   // Try to transform this shift into a multiply-high if
   // it matches the appropriate pattern detected in combineShiftToMULH.
-  if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
+  if (SDValue MULH = combineShiftToMULH(N, DL, DAG, TLI))
     return MULH;
 
   return SDValue();
@@ -10748,11 +10754,11 @@ SDValue DAGCombiner::visitSHLSAT(SDNode *N) {
   if (SDValue V = DAG.simplifyShift(N0, N1))
     return V;
 
+  SDLoc DL(N);
   EVT VT = N0.getValueType();
 
   // fold (*shlsat c1, c2) -> c1<<c2
-  if (SDValue C =
-          DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, {N0, N1}))
+  if (SDValue C = DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1}))
     return C;
 
   ConstantSDNode *N1C = isConstOrConstSplat(N1);
@@ -10761,13 +10767,13 @@ SDValue DAGCombiner::visitSHLSAT(SDNode *N) {
     // fold (sshlsat x, c) -> (shl x, c)
     if (N->getOpcode() == ISD::SSHLSAT && N1C &&
         N1C->getAPIntValue().ult(DAG.ComputeNumSignBits(N0)))
-      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1);
+      return DAG.getNode(ISD::SHL, DL, VT, N0, N1);
 
     // fold (ushlsat x, c) -> (shl x, c)
     if (N->getOpcode() == ISD::USHLSAT && N1C &&
         N1C->getAPIntValue().ule(
             DAG.computeKnownBits(N0).countMinLeadingZeros()))
-      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1);
+      return DAG.getNode(ISD::SHL, DL, VT, N0, N1);
   }
 
   return SDValue();
@@ -11161,7 +11167,8 @@ SDValue DAGCombiner::combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
 
 /// If a (v)select has a condition value that is a sign-bit test, try to smear
 /// the condition operand sign-bit across the value width and use it as a mask.
-static SDValue foldSelectOfConstantsUsingSra(SDNode *N, SelectionDAG &DAG) {
+static SDValue foldSelectOfConstantsUsingSra(SDNode *N, const SDLoc &DL,
+                                             SelectionDAG &DAG) {
   SDValue Cond = N->getOperand(0);
   SDValue C1 = N->getOperand(1);
   SDValue C2 = N->getOperand(2);
@@ -11181,14 +11188,12 @@ static SDValue foldSelectOfConstantsUsingSra(SDNode *N, SelectionDAG &DAG) {
   if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(CondC) &&
       isAllOnesOrAllOnesSplat(C2)) {
     // i32 X > -1 ? C1 : -1 --> (X >>s 31) | C1
-    SDLoc DL(N);
     SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
     return DAG.getNode(ISD::OR, DL, VT, Sra, C1);
   }
   if (CC == ISD::SETLT && isNullOrNullSplat(CondC) && isNullOrNullSplat(C2)) {
     // i8 X < 0 ? C1 : 0 --> (X >>s 7) & C1
-    SDLoc DL(N);
     SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
     return DAG.getNode(ISD::AND, DL, VT, Sra, C1);
@@ -11328,7 +11333,7 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
     return DAG.getNode(ISD::OR, DL, VT, NotCond, N1);
   }
 
-  if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
+  if (SDValue V = foldSelectOfConstantsUsingSra(N, DL, DAG))
     return V;
 
   return SDValue();
@@ -12049,7 +12054,7 @@ SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
     return DAG.getNode(ISD::SHL, DL, VT, ZextCond, ShAmtC);
   }
 
-  if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
+  if (SDValue V = foldSelectOfConstantsUsingSra(N, DL, DAG))
     return V;
 
   // The general case for select-of-constants:
@@ -15435,6 +15440,12 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
     return N0;
 
+  // We currently avoid folding freeze over SRA/SRL, due to the problems seen
+  // with (freeze (assert ext)) blocking simplifications of SRA/SRL. See for
+  // example https://reviews.llvm.org/D136529#4120959.
+  if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
+    return SDValue();
+
   // Fold freeze(op(x, ...)) -> op(freeze(x), ...).
   // Try to push freeze through instructions that propagate but don't produce
   // poison as far as possible. If an operand of freeze follows three
@@ -15451,6 +15462,26 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
                                           N0.getOpcode() == ISD::BUILD_PAIR ||
                                           N0.getOpcode() == ISD::CONCAT_VECTORS;
 
+  // Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
+  // ones" or "constant" into something that depends on FrozenUndef. We can
+  // instead pick undef values to keep those properties, while at the same time
+  // folding away the freeze.
+  // If we implement a more general solution for folding away freeze(undef) in
+  // the future, then this special handling can be removed.
+  if (N0.getOpcode() == ISD::BUILD_VECTOR) {
+    SDLoc DL(N0);
+    EVT VT = N0.getValueType();
+    if (llvm::ISD::isBuildVectorAllOnes(N0.getNode()))
+      return DAG.getAllOnesConstant(DL, VT);
+    if (llvm::ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
+      SmallVector<SDValue, 8> NewVecC;
+      for (const SDValue &Op : N0->op_values())
+        NewVecC.push_back(
+            Op.isUndef() ? DAG.getConstant(0, DL, Op.getValueType()) : Op);
+      return DAG.getBuildVector(VT, DL, NewVecC);
+    }
+  }
+
   SmallSetVector<SDValue, 8> MaybePoisonOperands;
   for (SDValue Op : N0->ops()) {
     if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
@@ -21095,6 +21126,24 @@ SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) {
                       ST->getMemOperand()->getFlags());
 }
 
+SDValue DAGCombiner::visitATOMIC_STORE(SDNode *N) {
+  AtomicSDNode *ST = cast<AtomicSDNode>(N);
+  SDValue Val = ST->getVal();
+  EVT VT = Val.getValueType();
+  EVT MemVT = ST->getMemoryVT();
+
+  if (MemVT.bitsLT(VT)) { // Is truncating store
+    APInt TruncDemandedBits = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
+                                                   MemVT.getScalarSizeInBits());
+    // See if we can simplify the operation with SimplifyDemandedBits, which
+    // only works if the value has a single use.
+    if (SimplifyDemandedBits(Val, TruncDemandedBits))
+      return SDValue(N, 0);
+  }
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitSTORE(SDNode *N) {
   StoreSDNode *ST  = cast<StoreSDNode>(N);
   SDValue Chain = ST->getChain();
@@ -28045,7 +28094,10 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
 #endif
 
   if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() &&
-      Size0.hasValue() && Size1.hasValue()) {
+      Size0.hasValue() && Size1.hasValue() &&
+      // Can't represent a scalable size + fixed offset in LocationSize
+      (!Size0.isScalable() || SrcValOffset0 == 0) &&
+      (!Size1.isScalable() || SrcValOffset1 == 0)) {
     // Use alias analysis information.
     int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
     int64_t Overlap0 =
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index aefedd060f89..ef9f78335519 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1424,7 +1424,7 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
     // happened (such as an optimised function being always-inlined into an
     // optnone function). We will not be using the extra information in the
     // dbg.assign in that case, just use its dbg.value fields.
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   case Intrinsic::dbg_value: {
     // This form of DBG_VALUE is target-independent.
     const DbgValueInst *DI = cast<DbgValueInst>(II);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 24f69ea1b742..bfc3e08c1632 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -180,6 +180,13 @@ private:
                              SmallVectorImpl<SDValue> &Results);
   SDValue PromoteLegalFP_TO_INT_SAT(SDNode *Node, const SDLoc &dl);
 
+  /// Implements vector reduce operation promotion.
+  ///
+  /// All vector operands are promoted to a vector type with larger element
+  /// type, and the start value is promoted to a larger scalar type. Then the
+  /// result is truncated back to the original scalar type.
+  SDValue PromoteReduction(SDNode *Node);
+
   SDValue ExpandPARITY(SDValue Op, const SDLoc &dl);
 
   SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
@@ -1220,6 +1227,11 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
     Action = TLI.getOperationAction(
         Node->getOpcode(), Node->getOperand(1).getValueType());
     break;
+  case ISD::VP_CTTZ_ELTS:
+  case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
+    Action = TLI.getOperationAction(Node->getOpcode(),
+                                    Node->getOperand(0).getValueType());
+    break;
   default:
     if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
       Action = TLI.getCustomOperationAction(*Node);
@@ -2979,6 +2991,44 @@ SDValue SelectionDAGLegalize::ExpandPARITY(SDValue Op, const SDLoc &dl) {
   return DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(1, dl, VT));
 }
 
+SDValue SelectionDAGLegalize::PromoteReduction(SDNode *Node) {
+  MVT VecVT = Node->getOperand(1).getSimpleValueType();
+  MVT NewVecVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VecVT);
+  MVT ScalarVT = Node->getSimpleValueType(0);
+  MVT NewScalarVT = NewVecVT.getVectorElementType();
+
+  SDLoc DL(Node);
+  SmallVector<SDValue, 4> Operands(Node->getNumOperands());
+
+  // promote the initial value.
+  // FIXME: Support integer.
+  assert(Node->getOperand(0).getValueType().isFloatingPoint() &&
+         "Only FP promotion is supported");
+  Operands[0] =
+      DAG.getNode(ISD::FP_EXTEND, DL, NewScalarVT, Node->getOperand(0));
+
+  for (unsigned j = 1; j != Node->getNumOperands(); ++j)
+    if (Node->getOperand(j).getValueType().isVector() &&
+        !(ISD::isVPOpcode(Node->getOpcode()) &&
+          ISD::getVPMaskIdx(Node->getOpcode()) == j)) { // Skip mask operand.
+      // promote the vector operand.
+      // FIXME: Support integer.
+      assert(Node->getOperand(j).getValueType().isFloatingPoint() &&
+             "Only FP promotion is supported");
+      Operands[j] =
+          DAG.getNode(ISD::FP_EXTEND, DL, NewVecVT, Node->getOperand(j));
+    } else {
+      Operands[j] = Node->getOperand(j); // Skip VL operand.
+    }
+
+  SDValue Res = DAG.getNode(Node->getOpcode(), DL, NewScalarVT, Operands,
+                            Node->getFlags());
+
+  assert(ScalarVT.isFloatingPoint() && "Only FP promotion is supported");
+  return DAG.getNode(ISD::FP_ROUND, DL, ScalarVT, Res,
+                     DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
+}
+
 bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   LLVM_DEBUG(dbgs() << "Trying to expand node\n");
   SmallVector<SDValue, 8> Results;
@@ -3195,7 +3245,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
       break;
     }
 
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   }
   case ISD::BITCAST:
     if ((Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0),
@@ -3556,6 +3606,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
       Results.push_back(Expanded);
     break;
   }
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM: {
+    if (SDValue Expanded = TLI.expandFMINIMUM_FMAXIMUM(Node, DAG))
+      Results.push_back(Expanded);
+    break;
+  }
   case ISD::FSIN:
   case ISD::FCOS: {
     EVT VT = Node->getValueType(0);
@@ -4228,6 +4284,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   case ISD::VECREDUCE_FMINIMUM:
     Results.push_back(TLI.expandVecReduce(Node, DAG));
     break;
+  case ISD::VP_CTTZ_ELTS:
+  case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
+    Results.push_back(TLI.expandVPCTTZElements(Node, DAG));
+    break;
   case ISD::GLOBAL_OFFSET_TABLE:
   case ISD::GlobalAddress:
   case ISD::GlobalTLSAddress:
@@ -4949,7 +5009,12 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
   if (Node->getOpcode() == ISD::STRICT_UINT_TO_FP ||
       Node->getOpcode() == ISD::STRICT_SINT_TO_FP ||
       Node->getOpcode() == ISD::STRICT_FSETCC ||
-      Node->getOpcode() == ISD::STRICT_FSETCCS)
+      Node->getOpcode() == ISD::STRICT_FSETCCS ||
+      Node->getOpcode() == ISD::VP_REDUCE_FADD ||
+      Node->getOpcode() == ISD::VP_REDUCE_FMUL ||
+      Node->getOpcode() == ISD::VP_REDUCE_FMAX ||
+      Node->getOpcode() == ISD::VP_REDUCE_FMIN ||
+      Node->getOpcode() == ISD::VP_REDUCE_SEQ_FADD)
     OVT = Node->getOperand(1).getSimpleValueType();
   if (Node->getOpcode() == ISD::BR_CC ||
       Node->getOpcode() == ISD::SELECT_CC)
@@ -5575,6 +5640,21 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     Results.push_back(NewAtomic.getValue(1));
     break;
   }
+  case ISD::ATOMIC_LOAD: {
+    AtomicSDNode *AM = cast<AtomicSDNode>(Node);
+    SDLoc SL(Node);
+    assert(NVT.getSizeInBits() == OVT.getSizeInBits() &&
+           "unexpected promotion type");
+    assert(AM->getMemoryVT().getSizeInBits() == NVT.getSizeInBits() &&
+           "unexpected atomic_load with illegal type");
+
+    SDValue NewAtomic =
+        DAG.getAtomic(ISD::ATOMIC_LOAD, SL, NVT, DAG.getVTList(NVT, MVT::Other),
+                      {AM->getChain(), AM->getBasePtr()}, AM->getMemOperand());
+    Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewAtomic));
+    Results.push_back(NewAtomic.getValue(1));
+    break;
+  }
   case ISD::SPLAT_VECTOR: {
     SDValue Scalar = Node->getOperand(0);
     MVT ScalarType = Scalar.getSimpleValueType();
@@ -5592,6 +5672,13 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
                     DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)));
     break;
   }
+  case ISD::VP_REDUCE_FADD:
+  case ISD::VP_REDUCE_FMUL:
+  case ISD::VP_REDUCE_FMAX:
+  case ISD::VP_REDUCE_FMIN:
+  case ISD::VP_REDUCE_SEQ_FADD:
+    Results.push_back(PromoteReduction(Node));
+    break;
   }
 
   // Replace the original node with the legalized result.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 7685bc73cf96..abe5be763825 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2449,6 +2449,9 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
       R = PromoteFloatRes_STRICT_FP_ROUND(N);
       break;
     case ISD::LOAD:       R = PromoteFloatRes_LOAD(N); break;
+    case ISD::ATOMIC_LOAD:
+      R = PromoteFloatRes_ATOMIC_LOAD(N);
+      break;
     case ISD::SELECT:     R = PromoteFloatRes_SELECT(N); break;
     case ISD::SELECT_CC:  R = PromoteFloatRes_SELECT_CC(N); break;
 
@@ -2695,6 +2698,25 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) {
   return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT, newL);
 }
 
+SDValue DAGTypeLegalizer::PromoteFloatRes_ATOMIC_LOAD(SDNode *N) {
+  AtomicSDNode *AM = cast<AtomicSDNode>(N);
+  EVT VT = AM->getValueType(0);
+
+  // Load the value as an integer value with the same number of bits.
+  EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+  SDValue newL = DAG.getAtomic(
+      ISD::ATOMIC_LOAD, SDLoc(N), IVT, DAG.getVTList(IVT, MVT::Other),
+      {AM->getChain(), AM->getBasePtr()}, AM->getMemOperand());
+
+  // Legalize the chain result by replacing uses of the old value chain with the
+  // new one
+  ReplaceValueWith(SDValue(N, 1), newL.getValue(1));
+
+  // Convert the integer value to the desired FP type
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  return DAG.getNode(GetPromotionOpcode(VT, IVT), SDLoc(N), NVT, newL);
+}
+
 // Construct a new SELECT node with the promoted true- and false- values.
 SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT(SDNode *N) {
   SDValue TrueVal = GetPromotedFloat(N->getOperand(1));
@@ -2855,6 +2877,9 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
   case ISD::FFREXP:      R = SoftPromoteHalfRes_FFREXP(N); break;
 
   case ISD::LOAD:        R = SoftPromoteHalfRes_LOAD(N); break;
+  case ISD::ATOMIC_LOAD:
+    R = SoftPromoteHalfRes_ATOMIC_LOAD(N);
+    break;
   case ISD::SELECT:      R = SoftPromoteHalfRes_SELECT(N); break;
   case ISD::SELECT_CC:   R = SoftPromoteHalfRes_SELECT_CC(N); break;
   case ISD::SINT_TO_FP:
@@ -3039,6 +3064,20 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_LOAD(SDNode *N) {
   return NewL;
 }
 
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_ATOMIC_LOAD(SDNode *N) {
+  AtomicSDNode *AM = cast<AtomicSDNode>(N);
+
+  // Load the value as an integer value with the same number of bits.
+  SDValue NewL = DAG.getAtomic(
+      ISD::ATOMIC_LOAD, SDLoc(N), MVT::i16, DAG.getVTList(MVT::i16, MVT::Other),
+      {AM->getChain(), AM->getBasePtr()}, AM->getMemOperand());
+
+  // Legalize the chain result by replacing uses of the old value chain with the
+  // new one
+  ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+  return NewL;
+}
+
 SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT(SDNode *N) {
   SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
   SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 55f9737bc94d..0aa36deda79d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -76,6 +76,10 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::VP_CTTZ:
   case ISD::CTTZ_ZERO_UNDEF:
   case ISD::CTTZ:        Res = PromoteIntRes_CTTZ(N); break;
+  case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
+  case ISD::VP_CTTZ_ELTS:
+    Res = PromoteIntRes_VP_CttzElements(N);
+    break;
   case ISD::EXTRACT_VECTOR_ELT:
                          Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
   case ISD::LOAD:        Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N)); break;
@@ -724,6 +728,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
                      N->getOperand(2));
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_VP_CttzElements(SDNode *N) {
+  SDLoc DL(N);
+  EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  return DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N) {
   SDLoc dl(N);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9c855e558553..49be824deb51 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -309,6 +309,7 @@ private:
   SDValue PromoteIntRes_CTLZ(SDNode *N);
   SDValue PromoteIntRes_CTPOP_PARITY(SDNode *N);
   SDValue PromoteIntRes_CTTZ(SDNode *N);
+  SDValue PromoteIntRes_VP_CttzElements(SDNode *N);
   SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
   SDValue PromoteIntRes_FP_TO_XINT_SAT(SDNode *N);
@@ -691,6 +692,7 @@ private:
   SDValue PromoteFloatRes_FP_ROUND(SDNode *N);
   SDValue PromoteFloatRes_STRICT_FP_ROUND(SDNode *N);
   SDValue PromoteFloatRes_LOAD(SDNode *N);
+  SDValue PromoteFloatRes_ATOMIC_LOAD(SDNode *N);
   SDValue PromoteFloatRes_SELECT(SDNode *N);
   SDValue PromoteFloatRes_SELECT_CC(SDNode *N);
   SDValue PromoteFloatRes_UnaryOp(SDNode *N);
@@ -734,6 +736,7 @@ private:
   SDValue SoftPromoteHalfRes_FFREXP(SDNode *N);
   SDValue SoftPromoteHalfRes_FP_ROUND(SDNode *N);
   SDValue SoftPromoteHalfRes_LOAD(SDNode *N);
+  SDValue SoftPromoteHalfRes_ATOMIC_LOAD(SDNode *N);
   SDValue SoftPromoteHalfRes_SELECT(SDNode *N);
   SDValue SoftPromoteHalfRes_SELECT_CC(SDNode *N);
   SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N);
@@ -910,6 +913,7 @@ private:
   SDValue SplitVecOp_FP_ROUND(SDNode *N);
   SDValue SplitVecOp_FPOpDifferentTypes(SDNode *N);
   SDValue SplitVecOp_FP_TO_XINT_SAT(SDNode *N);
+  SDValue SplitVecOp_VP_CttzElements(SDNode *N);
 
   //===--------------------------------------------------------------------===//
   // Vector Widening Support: LegalizeVectorTypes.cpp
@@ -1017,6 +1021,7 @@ private:
   SDValue WidenVecOp_VECREDUCE_SEQ(SDNode *N);
   SDValue WidenVecOp_VP_REDUCE(SDNode *N);
   SDValue WidenVecOp_ExpOp(SDNode *N);
+  SDValue WidenVecOp_VP_CttzElements(SDNode *N);
 
   /// Helper function to generate a set of operations to perform
   /// a vector operation for a wider type.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 1de43a4f60e3..423df9ae6b2a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -176,13 +176,6 @@ class VectorLegalizer {
   /// truncated back to the original type.
   void PromoteFP_TO_INT(SDNode *Node, SmallVectorImpl<SDValue> &Results);
 
-  /// Implements vector reduce operation promotion.
-  ///
-  /// All vector operands are promoted to a vector type with larger element
-  /// type, and the start value is promoted to a larger scalar type. Then the
-  /// result is truncated back to the original scalar type.
-  void PromoteReduction(SDNode *Node, SmallVectorImpl<SDValue> &Results);
-
   /// Implements vector setcc operation promotion.
   ///
   /// All vector operands are promoted to a vector type with larger element
@@ -510,6 +503,11 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
       if (Action != TargetLowering::Legal)                                     \
         break;                                                                 \
     }                                                                          \
+    /* Defer non-vector results to LegalizeDAG. */                             \
+    if (!Node->getValueType(0).isVector()) {                                   \
+      Action = TargetLowering::Legal;                                          \
+      break;                                                                   \
+    }                                                                          \
     Action = TLI.getOperationAction(Node->getOpcode(), LegalizeVT);            \
   } break;
 #include "llvm/IR/VPIntrinsics.def"
@@ -580,50 +578,6 @@ bool VectorLegalizer::LowerOperationWrapper(SDNode *Node,
   return true;
 }
 
-void VectorLegalizer::PromoteReduction(SDNode *Node,
-                                       SmallVectorImpl<SDValue> &Results) {
-  MVT VecVT = Node->getOperand(1).getSimpleValueType();
-  MVT NewVecVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VecVT);
-  MVT ScalarVT = Node->getSimpleValueType(0);
-  MVT NewScalarVT = NewVecVT.getVectorElementType();
-
-  SDLoc DL(Node);
-  SmallVector<SDValue, 4> Operands(Node->getNumOperands());
-
-  // promote the initial value.
-  if (Node->getOperand(0).getValueType().isFloatingPoint())
-    Operands[0] =
-        DAG.getNode(ISD::FP_EXTEND, DL, NewScalarVT, Node->getOperand(0));
-  else
-    Operands[0] =
-        DAG.getNode(ISD::ANY_EXTEND, DL, NewScalarVT, Node->getOperand(0));
-
-  for (unsigned j = 1; j != Node->getNumOperands(); ++j)
-    if (Node->getOperand(j).getValueType().isVector() &&
-        !(ISD::isVPOpcode(Node->getOpcode()) &&
-          ISD::getVPMaskIdx(Node->getOpcode()) == j)) // Skip mask operand.
-      // promote the vector operand.
-      if (Node->getOperand(j).getValueType().isFloatingPoint())
-        Operands[j] =
-            DAG.getNode(ISD::FP_EXTEND, DL, NewVecVT, Node->getOperand(j));
-      else
-        Operands[j] =
-            DAG.getNode(ISD::ANY_EXTEND, DL, NewVecVT, Node->getOperand(j));
-    else
-      Operands[j] = Node->getOperand(j); // Skip VL operand.
-
-  SDValue Res = DAG.getNode(Node->getOpcode(), DL, NewScalarVT, Operands,
-                            Node->getFlags());
-
-  if (ScalarVT.isFloatingPoint())
-    Res = DAG.getNode(ISD::FP_ROUND, DL, ScalarVT, Res,
-                      DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
-  else
-    Res = DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, Res);
-
-  Results.push_back(Res);
-}
-
 void VectorLegalizer::PromoteSETCC(SDNode *Node,
                                    SmallVectorImpl<SDValue> &Results) {
   MVT VecVT = Node->getOperand(0).getSimpleValueType();
@@ -708,23 +662,6 @@ void VectorLegalizer::Promote(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
     // Promote the operation by extending the operand.
     PromoteFP_TO_INT(Node, Results);
     return;
-  case ISD::VP_REDUCE_ADD:
-  case ISD::VP_REDUCE_MUL:
-  case ISD::VP_REDUCE_AND:
-  case ISD::VP_REDUCE_OR:
-  case ISD::VP_REDUCE_XOR:
-  case ISD::VP_REDUCE_SMAX:
-  case ISD::VP_REDUCE_SMIN:
-  case ISD::VP_REDUCE_UMAX:
-  case ISD::VP_REDUCE_UMIN:
-  case ISD::VP_REDUCE_FADD:
-  case ISD::VP_REDUCE_FMUL:
-  case ISD::VP_REDUCE_FMAX:
-  case ISD::VP_REDUCE_FMIN:
-  case ISD::VP_REDUCE_SEQ_FADD:
-    // Promote the operation by extending the operand.
-    PromoteReduction(Node, Results);
-    return;
   case ISD::VP_SETCC:
   case ISD::SETCC:
     // Promote the operation by extending the operand.
@@ -1049,6 +986,13 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
       return;
     }
     break;
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
+    if (SDValue Expanded = TLI.expandFMINIMUM_FMAXIMUM(Node, DAG)) {
+      Results.push_back(Expanded);
+      return;
+    }
+    break;
   case ISD::SMIN:
   case ISD::SMAX:
   case ISD::UMIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 985c9f16ab97..cab4dc5f3c15 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3098,6 +3098,10 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::VP_REDUCE_FMIN:
     Res = SplitVecOp_VP_REDUCE(N, OpNo);
     break;
+  case ISD::VP_CTTZ_ELTS:
+  case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
+    Res = SplitVecOp_VP_CttzElements(N);
+    break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -4056,6 +4060,29 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_TO_XINT_SAT(SDNode *N) {
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_VP_CttzElements(SDNode *N) {
+  SDLoc DL(N);
+  EVT ResVT = N->getValueType(0);
+
+  SDValue Lo, Hi;
+  SDValue VecOp = N->getOperand(0);
+  GetSplitVector(VecOp, Lo, Hi);
+
+  auto [MaskLo, MaskHi] = SplitMask(N->getOperand(1));
+  auto [EVLLo, EVLHi] =
+      DAG.SplitEVL(N->getOperand(2), VecOp.getValueType(), DL);
+  SDValue VLo = DAG.getZExtOrTrunc(EVLLo, DL, ResVT);
+
+  // if VP_CTTZ_ELTS(Lo) != EVLLo => VP_CTTZ_ELTS(Lo).
+  // else => EVLLo + (VP_CTTZ_ELTS(Hi) or VP_CTTZ_ELTS_ZERO_UNDEF(Hi)).
+  SDValue ResLo = DAG.getNode(ISD::VP_CTTZ_ELTS, DL, ResVT, Lo, MaskLo, EVLLo);
+  SDValue ResLoNotEVL =
+      DAG.getSetCC(DL, getSetCCResultType(ResVT), ResLo, VLo, ISD::SETNE);
+  SDValue ResHi = DAG.getNode(N->getOpcode(), DL, ResVT, Hi, MaskHi, EVLHi);
+  return DAG.getSelect(DL, ResVT, ResLoNotEVL, ResLo,
+                       DAG.getNode(ISD::ADD, DL, ResVT, VLo, ResHi));
+}
+
 //===----------------------------------------------------------------------===//
 //  Result Vector Widening
 //===----------------------------------------------------------------------===//
@@ -6161,6 +6188,10 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::VP_REDUCE_FMIN:
     Res = WidenVecOp_VP_REDUCE(N);
     break;
+  case ISD::VP_CTTZ_ELTS:
+  case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
+    Res = WidenVecOp_VP_CttzElements(N);
+    break;
   }
 
   // If Res is null, the sub-method took care of registering the result.
@@ -6924,6 +6955,17 @@ SDValue DAGTypeLegalizer::WidenVecOp_VSELECT(SDNode *N) {
                      DAG.getVectorIdxConstant(0, DL));
 }
 
+SDValue DAGTypeLegalizer::WidenVecOp_VP_CttzElements(SDNode *N) {
+  SDLoc DL(N);
+  SDValue Source = GetWidenedVector(N->getOperand(0));
+  EVT SrcVT = Source.getValueType();
+  SDValue Mask =
+      GetWidenedMask(N->getOperand(1), SrcVT.getVectorElementCount());
+
+  return DAG.getNode(N->getOpcode(), DL, N->getValueType(0),
+                     {Source, Mask, N->getOperand(2)}, N->getFlags());
+}
+
 //===----------------------------------------------------------------------===//
 // Vector Widening Utilities
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 23ebfe466c74..d92976bc2f30 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1701,8 +1701,9 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
   assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
          "APInt size does not match type size!");
   unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant;
+  SDVTList VTs = getVTList(EltVT);
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, Opc, getVTList(EltVT), std::nullopt);
+  AddNodeIDNode(ID, Opc, VTs, std::nullopt);
   ID.AddPointer(Elt);
   ID.AddBoolean(isO);
   void *IP = nullptr;
@@ -1712,7 +1713,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
       return SDValue(N, 0);
 
   if (!N) {
-    N = newSDNode<ConstantSDNode>(isT, isO, Elt, EltVT);
+    N = newSDNode<ConstantSDNode>(isT, isO, Elt, VTs);
     CSEMap.InsertNode(N, IP);
     InsertNode(N);
     NewSDValueDbgMsg(SDValue(N, 0), "Creating constant: ", this);
@@ -1762,8 +1763,9 @@ SDValue SelectionDAG::getConstantFP(const ConstantFP &V, const SDLoc &DL,
   // value, so that we don't have problems with 0.0 comparing equal to -0.0, and
   // we don't have issues with SNANs.
   unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP;
+  SDVTList VTs = getVTList(EltVT);
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, Opc, getVTList(EltVT), std::nullopt);
+  AddNodeIDNode(ID, Opc, VTs, std::nullopt);
   ID.AddPointer(&V);
   void *IP = nullptr;
   SDNode *N = nullptr;
@@ -1772,7 +1774,7 @@ SDValue SelectionDAG::getConstantFP(const ConstantFP &V, const SDLoc &DL,
       return SDValue(N, 0);
 
   if (!N) {
-    N = newSDNode<ConstantFPSDNode>(isTarget, &V, EltVT);
+    N = newSDNode<ConstantFPSDNode>(isTarget, &V, VTs);
     CSEMap.InsertNode(N, IP);
     InsertNode(N);
   }
@@ -1819,8 +1821,9 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL,
   else
     Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress;
 
+  SDVTList VTs = getVTList(VT);
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, Opc, getVTList(VT), std::nullopt);
+  AddNodeIDNode(ID, Opc, VTs, std::nullopt);
   ID.AddPointer(GV);
   ID.AddInteger(Offset);
   ID.AddInteger(TargetFlags);
@@ -1829,7 +1832,7 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL,
     return SDValue(E, 0);
 
   auto *N = newSDNode<GlobalAddressSDNode>(
-      Opc, DL.getIROrder(), DL.getDebugLoc(), GV, VT, Offset, TargetFlags);
+      Opc, DL.getIROrder(), DL.getDebugLoc(), GV, VTs, Offset, TargetFlags);
   CSEMap.InsertNode(N, IP);
     InsertNode(N);
   return SDValue(N, 0);
@@ -1837,14 +1840,15 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL,
 
 SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
   unsigned Opc = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex;
+  SDVTList VTs = getVTList(VT);
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, Opc, getVTList(VT), std::nullopt);
+  AddNodeIDNode(ID, Opc, VTs, std::nullopt);
   ID.AddInteger(FI);
   void *IP = nullptr;
   if (SDNode *E = FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  auto *N = newSDNode<FrameIndexSDNode>(FI, VT, isTarget);
+  auto *N = newSDNode<FrameIndexSDNode>(FI, VTs, isTarget);
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
   return SDValue(N, 0);
@@ -1855,15 +1859,16 @@ SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
   assert((TargetFlags == 0 || isTarget) &&
          "Cannot set target flags on target-independent jump tables");
   unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable;
+  SDVTList VTs = getVTList(VT);
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, Opc, getVTList(VT), std::nullopt);
+  AddNodeIDNode(ID, Opc, VTs, std::nullopt);
   ID.AddInteger(JTI);
   ID.AddInteger(TargetFlags);
   void *IP = nullptr;
   if (SDNode *E = FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  auto *N = newSDNode<JumpTableSDNode>(JTI, VT, isTarget, TargetFlags);
+  auto *N = newSDNode<JumpTableSDNode>(JTI, VTs, isTarget, TargetFlags);
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
   return SDValue(N, 0);
@@ -1886,8 +1891,9 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
                     ? getDataLayout().getABITypeAlign(C->getType())
                     : getDataLayout().getPrefTypeAlign(C->getType());
   unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+  SDVTList VTs = getVTList(VT);
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, Opc, getVTList(VT), std::nullopt);
+  AddNodeIDNode(ID, Opc, VTs, std::nullopt);
   ID.AddInteger(Alignment->value());
   ID.AddInteger(Offset);
   ID.AddPointer(C);
@@ -1896,7 +1902,7 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
   if (SDNode *E = FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, *Alignment,
+  auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VTs, Offset, *Alignment,
                                           TargetFlags);
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
@@ -1913,8 +1919,9 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
   if (!Alignment)
     Alignment = getDataLayout().getPrefTypeAlign(C->getType());
   unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+  SDVTList VTs = getVTList(VT);
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, Opc, getVTList(VT), std::nullopt);
+  AddNodeIDNode(ID, Opc, VTs, std::nullopt);
   ID.AddInteger(Alignment->value());
   ID.AddInteger(Offset);
   C->addSelectionDAGCSEId(ID);
@@ -1923,7 +1930,7 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
   if (SDNode *E = FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, *Alignment,
+  auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VTs, Offset, *Alignment,
                                           TargetFlags);
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
@@ -1961,7 +1968,7 @@ SDValue SelectionDAG::getValueType(EVT VT) {
 SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) {
   SDNode *&N = ExternalSymbols[Sym];
   if (N) return SDValue(N, 0);
-  N = newSDNode<ExternalSymbolSDNode>(false, Sym, 0, VT);
+  N = newSDNode<ExternalSymbolSDNode>(false, Sym, 0, getVTList(VT));
   InsertNode(N);
   return SDValue(N, 0);
 }
@@ -1970,7 +1977,7 @@ SDValue SelectionDAG::getMCSymbol(MCSymbol *Sym, EVT VT) {
   SDNode *&N = MCSymbols[Sym];
   if (N)
     return SDValue(N, 0);
-  N = newSDNode<MCSymbolSDNode>(Sym, VT);
+  N = newSDNode<MCSymbolSDNode>(Sym, getVTList(VT));
   InsertNode(N);
   return SDValue(N, 0);
 }
@@ -1980,7 +1987,7 @@ SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
   SDNode *&N =
       TargetExternalSymbols[std::pair<std::string, unsigned>(Sym, TargetFlags)];
   if (N) return SDValue(N, 0);
-  N = newSDNode<ExternalSymbolSDNode>(true, Sym, TargetFlags, VT);
+  N = newSDNode<ExternalSymbolSDNode>(true, Sym, TargetFlags, getVTList(VT));
   InsertNode(N);
   return SDValue(N, 0);
 }
@@ -2196,9 +2203,10 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
     }
   }
 
+  SDVTList VTs = getVTList(VT);
   FoldingSetNodeID ID;
   SDValue Ops[2] = { N1, N2 };
-  AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops);
+  AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, VTs, Ops);
   for (int i = 0; i != NElts; ++i)
     ID.AddInteger(MaskVec[i]);
 
@@ -2212,7 +2220,7 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
   int *MaskAlloc = OperandAllocator.Allocate<int>(NElts);
   llvm::copy(MaskVec, MaskAlloc);
 
-  auto *N = newSDNode<ShuffleVectorSDNode>(VT, dl.getIROrder(),
+  auto *N = newSDNode<ShuffleVectorSDNode>(VTs, dl.getIROrder(),
                                            dl.getDebugLoc(), MaskAlloc);
   createOperands(N, Ops);
 
@@ -2234,14 +2242,15 @@ SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) {
 }
 
 SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
+  SDVTList VTs = getVTList(VT);
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, ISD::Register, getVTList(VT), std::nullopt);
+  AddNodeIDNode(ID, ISD::Register, VTs, std::nullopt);
   ID.AddInteger(RegNo);
   void *IP = nullptr;
   if (SDNode *E = FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  auto *N = newSDNode<RegisterSDNode>(RegNo, VT);
+  auto *N = newSDNode<RegisterSDNode>(RegNo, VTs);
   N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, UA);
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
@@ -2290,9 +2299,10 @@ SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
                                       int64_t Offset, bool isTarget,
                                       unsigned TargetFlags) {
   unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress;
+  SDVTList VTs = getVTList(VT);
 
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, Opc, getVTList(VT), std::nullopt);
+  AddNodeIDNode(ID, Opc, VTs, std::nullopt);
   ID.AddPointer(BA);
   ID.AddInteger(Offset);
   ID.AddInteger(TargetFlags);
@@ -2300,7 +2310,7 @@ SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
   if (SDNode *E = FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  auto *N = newSDNode<BlockAddressSDNode>(Opc, VT, BA, Offset, TargetFlags);
+  auto *N = newSDNode<BlockAddressSDNode>(Opc, VTs, BA, Offset, TargetFlags);
   CSEMap.InsertNode(N, IP);
   InsertNode(N);
   return SDValue(N, 0);
@@ -2345,9 +2355,10 @@ SDValue SelectionDAG::getBitcast(EVT VT, SDValue V) {
 
 SDValue SelectionDAG::getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr,
                                        unsigned SrcAS, unsigned DestAS) {
+  SDVTList VTs = getVTList(VT);
   SDValue Ops[] = {Ptr};
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, ISD::ADDRSPACECAST, getVTList(VT), Ops);
+  AddNodeIDNode(ID, ISD::ADDRSPACECAST, VTs, Ops);
   ID.AddInteger(SrcAS);
   ID.AddInteger(DestAS);
 
@@ -2356,7 +2367,7 @@ SDValue SelectionDAG::getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr,
     return SDValue(E, 0);
 
   auto *N = newSDNode<AddrSpaceCastSDNode>(dl.getIROrder(), dl.getDebugLoc(),
-                                           VT, SrcAS, DestAS);
+                                           VTs, SrcAS, DestAS);
   createOperands(N, Ops);
 
   CSEMap.InsertNode(N, IP);
@@ -5052,6 +5063,7 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
   case ISD::VALUETYPE:
   case ISD::FrameIndex:
   case ISD::TargetFrameIndex:
+  case ISD::CopyFromReg:
     return true;
 
   case ISD::UNDEF:
@@ -5117,11 +5129,24 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
   if (VT.isScalableVector())
     return true;
 
+  if (ConsiderFlags && Op->hasPoisonGeneratingFlags())
+    return true;
+
   unsigned Opcode = Op.getOpcode();
   switch (Opcode) {
   case ISD::FREEZE:
   case ISD::CONCAT_VECTORS:
   case ISD::INSERT_SUBVECTOR:
+  case ISD::SADDSAT:
+  case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT:
+  case ISD::MULHU:
+  case ISD::MULHS:
+  case ISD::SMIN:
+  case ISD::SMAX:
+  case ISD::UMIN:
+  case ISD::UMAX:
   case ISD::AND:
   case ISD::XOR:
   case ISD::ROTL:
@@ -5142,6 +5167,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
   case ISD::BUILD_PAIR:
     return false;
 
+  case ISD::SELECT_CC:
   case ISD::SETCC: {
     // Integer setcc cannot create undef or poison.
     if (Op.getOperand(0).getValueType().isInteger())
@@ -5151,39 +5177,28 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
     // based on options and flags. The options and flags also cause special
     // nonan condition codes to be used. Those condition codes may be preserved
     // even if the nonan flag is dropped somewhere.
-    ISD::CondCode CCCode = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+    unsigned CCOp = Opcode == ISD::SETCC ? 2 : 4;
+    ISD::CondCode CCCode = cast<CondCodeSDNode>(Op.getOperand(CCOp))->get();
     if (((unsigned)CCCode & 0x10U))
       return true;
 
     const TargetOptions &Options = getTarget().Options;
-    return Options.NoNaNsFPMath || Options.NoInfsFPMath ||
-           (ConsiderFlags &&
-            (Op->getFlags().hasNoNaNs() || Op->getFlags().hasNoInfs()));
+    return Options.NoNaNsFPMath || Options.NoInfsFPMath;
   }
 
-  // Matches hasPoisonGeneratingFlags().
+  case ISD::OR:
   case ISD::ZERO_EXTEND:
-    return ConsiderFlags && Op->getFlags().hasNonNeg();
-
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
-    // Matches hasPoisonGeneratingFlags().
-    return ConsiderFlags && (Op->getFlags().hasNoSignedWrap() ||
-                             Op->getFlags().hasNoUnsignedWrap());
+    // No poison except from flags (which is handled above)
+    return false;
 
   case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:
     // If the max shift amount isn't in range, then the shift can create poison.
-    if (!getValidMaximumShiftAmountConstant(Op, DemandedElts))
-      return true;
-
-    // Matches hasPoisonGeneratingFlags().
-    return ConsiderFlags && (Op->getFlags().hasNoSignedWrap() ||
-                             Op->getFlags().hasNoUnsignedWrap());
-
-  // Matches hasPoisonGeneratingFlags().
-  case ISD::OR:
-    return ConsiderFlags && Op->getFlags().hasDisjoint();
+    return !getValidMaximumShiftAmountConstant(Op, DemandedElts);
 
   case ISD::SCALAR_TO_VECTOR:
     // Check if we demand any upper (undef) elements.
@@ -5216,13 +5231,13 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
   return true;
 }
 
-bool SelectionDAG::isADDLike(SDValue Op) const {
+bool SelectionDAG::isADDLike(SDValue Op, bool NoWrap) const {
   unsigned Opcode = Op.getOpcode();
   if (Opcode == ISD::OR)
     return Op->getFlags().hasDisjoint() ||
            haveNoCommonBitsSet(Op.getOperand(0), Op.getOperand(1));
   if (Opcode == ISD::XOR)
-    return isMinSignedConstant(Op.getOperand(1));
+    return !NoWrap && isMinSignedConstant(Op.getOperand(1));
   return false;
 }
 
@@ -5713,14 +5728,14 @@ static SDValue foldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
 
 /// Gets or creates the specified node.
 SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) {
+  SDVTList VTs = getVTList(VT);
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, Opcode, getVTList(VT), std::nullopt);
+  AddNodeIDNode(ID, Opcode, VTs, std::nullopt);
   void *IP = nullptr;
   if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
     return SDValue(E, 0);
 
-  auto *N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(),
-                              getVTList(VT));
+  auto *N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
   CSEMap.InsertNode(N, IP);
 
   InsertNode(N);
@@ -6667,16 +6682,17 @@ SDValue SelectionDAG::getAssertAlign(const SDLoc &DL, SDValue Val, Align A) {
   if (A == Align(1))
     return Val;
 
+  SDVTList VTs = getVTList(Val.getValueType());
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, ISD::AssertAlign, getVTList(Val.getValueType()), {Val});
+  AddNodeIDNode(ID, ISD::AssertAlign, VTs, {Val});
   ID.AddInteger(A.value());
 
   void *IP = nullptr;
   if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
     return SDValue(E, 0);
 
-  auto *N = newSDNode<AssertAlignSDNode>(DL.getIROrder(), DL.getDebugLoc(),
-                                         Val.getValueType(), A);
+  auto *N =
+      newSDNode<AssertAlignSDNode>(DL.getIROrder(), DL.getDebugLoc(), VTs, A);
   createOperands(N, {Val});
 
   CSEMap.InsertNode(N, IP);
@@ -11795,20 +11811,6 @@ HandleSDNode::~HandleSDNode() {
   DropOperands();
 }
 
-GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order,
-                                         const DebugLoc &DL,
-                                         const GlobalValue *GA, EVT VT,
-                                         int64_t o, unsigned TF)
-    : SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
-  TheGlobal = GA;
-}
-
-AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl,
-                                         EVT VT, unsigned SrcAS,
-                                         unsigned DestAS)
-    : SDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT)),
-      SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {}
-
 MemSDNode::MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl,
                      SDVTList VTs, EVT memvt, MachineMemOperand *mmo)
     : SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1348fc08cd02..3d00ccaa2fa1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7931,19 +7931,19 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
              DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ResultVT, Vec, Index));
     return;
   }
-  case Intrinsic::experimental_vector_reverse:
+  case Intrinsic::vector_reverse:
     visitVectorReverse(I);
     return;
-  case Intrinsic::experimental_vector_splice:
+  case Intrinsic::vector_splice:
     visitVectorSplice(I);
     return;
   case Intrinsic::callbr_landingpad:
     visitCallBrLandingPad(I);
     return;
-  case Intrinsic::experimental_vector_interleave2:
+  case Intrinsic::vector_interleave2:
     visitVectorInterleave(I);
     return;
-  case Intrinsic::experimental_vector_deinterleave2:
+  case Intrinsic::vector_deinterleave2:
     visitVectorDeinterleave(I);
     return;
   case Intrinsic::experimental_convergence_anchor:
@@ -8077,6 +8077,11 @@ static unsigned getISDForVPIntrinsic(const VPIntrinsic &VPIntrin) {
     ResOPC = IsZeroUndef ? ISD::VP_CTTZ_ZERO_UNDEF : ISD::VP_CTTZ;
     break;
   }
+  case Intrinsic::vp_cttz_elts: {
+    bool IsZeroPoison = cast<ConstantInt>(VPIntrin.getArgOperand(1))->isOne();
+    ResOPC = IsZeroPoison ? ISD::VP_CTTZ_ELTS_ZERO_UNDEF : ISD::VP_CTTZ_ELTS;
+    break;
+  }
 #define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
   case Intrinsic::VPID:                                                        \
     ResOPC = ISD::VPSD;                                                        \
@@ -8429,7 +8434,9 @@ void SelectionDAGBuilder::visitVectorPredicationIntrinsic(
   case ISD::VP_CTLZ:
   case ISD::VP_CTLZ_ZERO_UNDEF:
   case ISD::VP_CTTZ:
-  case ISD::VP_CTTZ_ZERO_UNDEF: {
+  case ISD::VP_CTTZ_ZERO_UNDEF:
+  case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
+  case ISD::VP_CTTZ_ELTS: {
     SDValue Result =
         DAG.getNode(Opcode, DL, VTs, {OpValues[0], OpValues[2], OpValues[3]});
     setValue(&VPIntrin, Result);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c938b3996be3..336d89fbcf63 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8381,6 +8381,64 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
   return SDValue();
 }
 
+SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(N);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  unsigned Opc = N->getOpcode();
+  EVT VT = N->getValueType(0);
+  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+  bool IsMax = Opc == ISD::FMAXIMUM;
+
+  if (VT.isVector() &&
+      isOperationLegalOrCustomOrPromote(Opc, VT.getScalarType()))
+    return SDValue();
+
+  // First, implement comparison not propagating NaN. If no native fmin or fmax
+  // available, use plain select with setcc instead.
+  SDValue MinMax;
+  unsigned CompOpcIeee = IsMax ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
+  unsigned CompOpc = IsMax ? ISD::FMAXNUM : ISD::FMINNUM;
+  if (isOperationLegalOrCustom(CompOpcIeee, VT)) {
+    MinMax = DAG.getNode(CompOpcIeee, DL, VT, LHS, RHS);
+  } else if (isOperationLegalOrCustom(CompOpc, VT)) {
+    MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS);
+  } else {
+    // NaN (if exists) will be propagated later, so orderness doesn't matter.
+    SDValue Compare =
+        DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);
+    MinMax = DAG.getSelect(DL, VT, Compare, LHS, RHS);
+  }
+
+  // Propagate any NaN of both operands
+  if (!N->getFlags().hasNoNaNs() &&
+      (!DAG.isKnownNeverNaN(RHS) || !DAG.isKnownNeverNaN(LHS))) {
+    ConstantFP *FPNaN = ConstantFP::get(
+        *DAG.getContext(), APFloat::getNaN(DAG.EVTToAPFloatSemantics(VT)));
+    MinMax = DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, LHS, RHS, ISD::SETUO),
+                           DAG.getConstantFP(*FPNaN, DL, VT), MinMax);
+  }
+
+  // fminimum/fmaximum requires -0.0 less than +0.0
+  if (!N->getFlags().hasNoSignedZeros() && !DAG.isKnownNeverZeroFloat(RHS) &&
+      !DAG.isKnownNeverZeroFloat(LHS)) {
+    SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
+                                  DAG.getConstantFP(0.0, DL, VT), ISD::SETEQ);
+    SDValue TestZero =
+        DAG.getTargetConstant(IsMax ? fcPosZero : fcNegZero, DL, MVT::i32);
+    SDValue LCmp = DAG.getSelect(
+        DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHS, TestZero), LHS,
+        MinMax);
+    SDValue RCmp = DAG.getSelect(
+        DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, RHS, TestZero), RHS,
+        LCmp);
+    MinMax = DAG.getSelect(DL, VT, IsZero, RCmp, MinMax);
+  }
+
+  return MinMax;
+}
+
 /// Returns a true value if if this FPClassTest can be performed with an ordered
 /// fcmp to 0, and a false value if it's an unordered fcmp to 0. Returns
 /// std::nullopt if it cannot be performed as a compare with 0.
@@ -9016,6 +9074,39 @@ SDValue TargetLowering::expandVPCTTZ(SDNode *Node, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::VP_CTPOP, dl, VT, Tmp, Mask, VL);
 }
 
+SDValue TargetLowering::expandVPCTTZElements(SDNode *N,
+                                             SelectionDAG &DAG) const {
+  // %cond = to_bool_vec %source
+  // %splat = splat /*val=*/VL
+  // %tz = step_vector
+  // %v = vp.select %cond, /*true=*/tz, /*false=*/%splat
+  // %r = vp.reduce.umin %v
+  SDLoc DL(N);
+  SDValue Source = N->getOperand(0);
+  SDValue Mask = N->getOperand(1);
+  SDValue EVL = N->getOperand(2);
+  EVT SrcVT = Source.getValueType();
+  EVT ResVT = N->getValueType(0);
+  EVT ResVecVT =
+      EVT::getVectorVT(*DAG.getContext(), ResVT, SrcVT.getVectorElementCount());
+
+  // Convert to boolean vector.
+  if (SrcVT.getScalarType() != MVT::i1) {
+    SDValue AllZero = DAG.getConstant(0, DL, SrcVT);
+    SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                             SrcVT.getVectorElementCount());
+    Source = DAG.getNode(ISD::VP_SETCC, DL, SrcVT, Source, AllZero,
+                         DAG.getCondCode(ISD::SETNE), Mask, EVL);
+  }
+
+  SDValue ExtEVL = DAG.getZExtOrTrunc(EVL, DL, ResVT);
+  SDValue Splat = DAG.getSplat(ResVecVT, DL, ExtEVL);
+  SDValue StepVec = DAG.getStepVector(DL, ResVecVT);
+  SDValue Select =
+      DAG.getNode(ISD::VP_SELECT, DL, ResVecVT, Source, StepVec, Splat, EVL);
+  return DAG.getNode(ISD::VP_REDUCE_UMIN, DL, ResVT, ExtEVL, Select, Mask, EVL);
+}
+
 SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG,
                                   bool IsNegative) const {
   SDLoc dl(N);
diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
index ff6b560d1172..601686fdd3dd 100644
--- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
+++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
@@ -612,7 +612,7 @@ Error DwarfTransformer::convert(uint32_t NumThreads, OutputAggregator &Out) {
       DWARFDie Die = getDie(*CU);
       if (Die) {
         CUInfo CUI(DICtx, dyn_cast<DWARFCompileUnit>(CU.get()));
-        pool.async([this, CUI, &LogMutex, Out, Die]() mutable {
+        pool.async([this, CUI, &LogMutex, &Out, Die]() mutable {
           std::string storage;
           raw_string_ostream StrStream(storage);
           OutputAggregator ThreadOut(Out.GetOS() ? &StrStream : nullptr);
diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
index 2c87b344083e..bba3329e8cc2 100644
--- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
@@ -16,6 +16,10 @@
 #if defined(LLVM_ON_UNIX) && !defined(__ANDROID__)
 #include <fcntl.h>
 #include <sys/mman.h>
+#if defined(__MVS__)
+#include "llvm/Support/BLAKE3.h"
+#include <sys/shm.h>
+#endif
 #include <unistd.h>
 #elif defined(_WIN32)
 #include <windows.h>
@@ -239,6 +243,24 @@ void SharedMemoryMapper::reserve(size_t NumBytes,
 
 #if defined(LLVM_ON_UNIX)
 
+#if defined(__MVS__)
+        ArrayRef<uint8_t> Data(
+            reinterpret_cast<const uint8_t *>(SharedMemoryName.c_str()),
+            SharedMemoryName.size());
+        auto HashedName = BLAKE3::hash<sizeof(key_t)>(Data);
+        key_t Key = *reinterpret_cast<key_t *>(HashedName.data());
+        int SharedMemoryId =
+            shmget(Key, NumBytes, IPC_CREAT | __IPC_SHAREAS | 0700);
+        if (SharedMemoryId < 0) {
+          return OnReserved(errorCodeToError(
+              std::error_code(errno, std::generic_category())));
+        }
+        LocalAddr = shmat(SharedMemoryId, nullptr, 0);
+        if (LocalAddr == reinterpret_cast<void *>(-1)) {
+          return OnReserved(errorCodeToError(
+              std::error_code(errno, std::generic_category())));
+        }
+#else
         int SharedMemoryFile = shm_open(SharedMemoryName.c_str(), O_RDWR, 0700);
         if (SharedMemoryFile < 0) {
           return OnReserved(errorCodeToError(errnoAsErrorCode()));
@@ -254,6 +276,7 @@ void SharedMemoryMapper::reserve(size_t NumBytes,
         }
 
         close(SharedMemoryFile);
+#endif
 
 #elif defined(_WIN32)
 
@@ -373,8 +396,13 @@ void SharedMemoryMapper::release(ArrayRef<ExecutorAddr> Bases,
 
 #if defined(LLVM_ON_UNIX)
 
+#if defined(__MVS__)
+      if (shmdt(Reservations[Base].LocalAddr) < 0)
+        Err = joinErrors(std::move(Err), errorCodeToError(errnoAsErrorCode()));
+#else
       if (munmap(Reservations[Base].LocalAddr, Reservations[Base].Size) != 0)
         Err = joinErrors(std::move(Err), errorCodeToError(errnoAsErrorCode()));
+#endif
 
 #elif defined(_WIN32)
 
@@ -415,7 +443,11 @@ SharedMemoryMapper::~SharedMemoryMapper() {
 
 #if defined(LLVM_ON_UNIX) && !defined(__ANDROID__)
 
+#if defined(__MVS__)
+    shmdt(R.second.LocalAddr);
+#else
     munmap(R.second.LocalAddr, R.second.Size);
+#endif
 
 #elif defined(_WIN32)
 
diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
index 131728fd7e7e..57ade9476786 100644
--- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
@@ -156,7 +156,10 @@ public:
       std::unique_ptr<MaterializationResponsibility> MR,
       std::unique_ptr<MemoryBuffer> ObjBuffer)
       : JITLinkContext(&MR->getTargetJITDylib()), Layer(Layer),
-        MR(std::move(MR)), ObjBuffer(std::move(ObjBuffer)) {}
+        MR(std::move(MR)), ObjBuffer(std::move(ObjBuffer)) {
+    std::lock_guard<std::mutex> Lock(Layer.LayerMutex);
+    Plugins = Layer.Plugins;
+  }
 
   ~ObjectLinkingLayerJITLinkContext() {
     // If there is an object buffer return function then use it to
@@ -168,14 +171,14 @@ public:
   JITLinkMemoryManager &getMemoryManager() override { return Layer.MemMgr; }
 
   void notifyMaterializing(LinkGraph &G) {
-    for (auto &P : Layer.Plugins)
+    for (auto &P : Plugins)
       P->notifyMaterializing(*MR, G, *this,
                              ObjBuffer ? ObjBuffer->getMemBufferRef()
                              : MemoryBufferRef());
   }
 
   void notifyFailed(Error Err) override {
-    for (auto &P : Layer.Plugins)
+    for (auto &P : Plugins)
       Err = joinErrors(std::move(Err), P->notifyFailed(*MR));
     Layer.getExecutionSession().reportError(std::move(Err));
     MR->failMaterialization();
@@ -317,12 +320,12 @@ public:
     if (auto Err = MR->notifyResolved(InternedResult))
       return Err;
 
-    Layer.notifyLoaded(*MR);
+    notifyLoaded();
     return Error::success();
   }
 
   void notifyFinalized(JITLinkMemoryManager::FinalizedAlloc A) override {
-    if (auto Err = Layer.notifyEmitted(*MR, std::move(A))) {
+    if (auto Err = notifyEmitted(std::move(A))) {
       Layer.getExecutionSession().reportError(std::move(Err));
       MR->failMaterialization();
       return;
@@ -344,7 +347,8 @@ public:
       return claimOrExternalizeWeakAndCommonSymbols(G);
     });
 
-    Layer.modifyPassConfig(*MR, LG, Config);
+    for (auto &P : Plugins)
+      P->modifyPassConfig(*MR, LG, Config);
 
     Config.PreFixupPasses.push_back(
         [this](LinkGraph &G) { return registerDependencies(G); });
@@ -352,6 +356,29 @@ public:
     return Error::success();
   }
 
+  void notifyLoaded() {
+    for (auto &P : Plugins)
+      P->notifyLoaded(*MR);
+  }
+
+  Error notifyEmitted(jitlink::JITLinkMemoryManager::FinalizedAlloc FA) {
+    Error Err = Error::success();
+    for (auto &P : Plugins)
+      Err = joinErrors(std::move(Err), P->notifyEmitted(*MR));
+
+    if (Err) {
+      if (FA)
+        Err =
+            joinErrors(std::move(Err), Layer.MemMgr.deallocate(std::move(FA)));
+      return Err;
+    }
+
+    if (FA)
+      return Layer.recordFinalizedAlloc(*MR, std::move(FA));
+
+    return Error::success();
+  }
+
 private:
   // Symbol name dependencies:
   // Internal: Defined in this graph.
@@ -522,7 +549,7 @@ private:
 
     SymbolDependenceGroup SynthSDG;
 
-    for (auto &P : Layer.Plugins) {
+    for (auto &P : Plugins) {
       auto SynthDeps = P->getSyntheticSymbolDependencies(*MR);
       if (SynthDeps.empty())
         continue;
@@ -636,6 +663,7 @@ private:
   }
 
   ObjectLinkingLayer &Layer;
+  std::vector<std::shared_ptr<ObjectLinkingLayer::Plugin>> Plugins;
   std::unique_ptr<MaterializationResponsibility> MR;
   std::unique_ptr<MemoryBuffer> ObjBuffer;
   DenseMap<Block *, SymbolNameSet> ExternalBlockDeps;
@@ -702,34 +730,9 @@ void ObjectLinkingLayer::emit(std::unique_ptr<MaterializationResponsibility> R,
   link(std::move(G), std::move(Ctx));
 }
 
-void ObjectLinkingLayer::modifyPassConfig(MaterializationResponsibility &MR,
-                                          LinkGraph &G,
-                                          PassConfiguration &PassConfig) {
-  for (auto &P : Plugins)
-    P->modifyPassConfig(MR, G, PassConfig);
-}
-
-void ObjectLinkingLayer::notifyLoaded(MaterializationResponsibility &MR) {
-  for (auto &P : Plugins)
-    P->notifyLoaded(MR);
-}
-
-Error ObjectLinkingLayer::notifyEmitted(MaterializationResponsibility &MR,
-                                        FinalizedAlloc FA) {
-  Error Err = Error::success();
-  for (auto &P : Plugins)
-    Err = joinErrors(std::move(Err), P->notifyEmitted(MR));
-
-  if (Err) {
-    if (FA)
-      Err = joinErrors(std::move(Err), MemMgr.deallocate(std::move(FA)));
-    return Err;
-  }
-
-  if (!FA)
-    return Error::success();
-
-  Err = MR.withResourceKeyDo(
+Error ObjectLinkingLayer::recordFinalizedAlloc(
+    MaterializationResponsibility &MR, FinalizedAlloc FA) {
+  auto Err = MR.withResourceKeyDo(
       [&](ResourceKey K) { Allocs[K].push_back(std::move(FA)); });
 
   if (Err)
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp
index 6614beec760f..f5118c0f2bfa 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp
@@ -18,6 +18,10 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <sys/mman.h>
+#if defined(__MVS__)
+#include "llvm/Support/BLAKE3.h"
+#include <sys/shm.h>
+#endif
 #include <unistd.h>
 #endif
 
@@ -59,6 +63,21 @@ ExecutorSharedMemoryMapperService::reserve(uint64_t Size) {
     SharedMemoryName = SharedMemoryNameStream.str();
   }
 
+#if defined(__MVS__)
+  ArrayRef<uint8_t> Data(
+      reinterpret_cast<const uint8_t *>(SharedMemoryName.c_str()),
+      SharedMemoryName.size());
+  auto HashedName = BLAKE3::hash<sizeof(key_t)>(Data);
+  key_t Key = *reinterpret_cast<key_t *>(HashedName.data());
+  int SharedMemoryId =
+      shmget(Key, Size, IPC_CREAT | IPC_EXCL | __IPC_SHAREAS | 0700);
+  if (SharedMemoryId < 0)
+    return errorCodeToError(errnoAsErrorCode());
+
+  void *Addr = shmat(SharedMemoryId, nullptr, 0);
+  if (Addr == reinterpret_cast<void *>(-1))
+    return errorCodeToError(errnoAsErrorCode());
+#else
   int SharedMemoryFile =
       shm_open(SharedMemoryName.c_str(), O_RDWR | O_CREAT | O_EXCL, 0700);
   if (SharedMemoryFile < 0)
@@ -73,6 +92,7 @@ ExecutorSharedMemoryMapperService::reserve(uint64_t Size) {
     return errorCodeToError(errnoAsErrorCode());
 
   close(SharedMemoryFile);
+#endif
 
 #elif defined(_WIN32)
 
@@ -131,6 +151,9 @@ Expected<ExecutorAddr> ExecutorSharedMemoryMapperService::initialize(
 
 #if defined(LLVM_ON_UNIX)
 
+#if defined(__MVS__)
+      // TODO Is it possible to change the protection level?
+#else
     int NativeProt = 0;
     if ((Segment.RAG.Prot & MemProt::Read) == MemProt::Read)
       NativeProt |= PROT_READ;
@@ -141,6 +164,7 @@ Expected<ExecutorAddr> ExecutorSharedMemoryMapperService::initialize(
 
     if (mprotect(Segment.Addr.toPtr<void *>(), Segment.Size, NativeProt))
       return errorCodeToError(errnoAsErrorCode());
+#endif
 
 #elif defined(_WIN32)
 
@@ -239,8 +263,15 @@ Error ExecutorSharedMemoryMapperService::release(
 
 #if defined(LLVM_ON_UNIX)
 
+#if defined(__MVS__)
+    (void)Size;
+
+    if (shmdt(Base.toPtr<void *>()) < 0)
+      Err = joinErrors(std::move(Err), errorCodeToError(errnoAsErrorCode()));
+#else
     if (munmap(Base.toPtr<void *>(), Size) != 0)
       Err = joinErrors(std::move(Err), errorCodeToError(errnoAsErrorCode()));
+#endif
 
 #elif defined(_WIN32)
     (void)Size;
diff --git a/llvm/lib/IR/AttributeImpl.h b/llvm/lib/IR/AttributeImpl.h
index 9a6427bbc3d5..58dc14588f41 100644
--- a/llvm/lib/IR/AttributeImpl.h
+++ b/llvm/lib/IR/AttributeImpl.h
@@ -121,8 +121,8 @@ public:
   static void Profile(FoldingSetNodeID &ID, Attribute::AttrKind Kind,
                       const ConstantRange &CR) {
     ID.AddInteger(Kind);
-    ID.AddInteger(CR.getLower());
-    ID.AddInteger(CR.getUpper());
+    CR.getLower().Profile(ID);
+    CR.getUpper().Profile(ID);
   }
 };
 
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index 9c48a481de1f..c6e511b46e51 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -173,8 +173,8 @@ Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind,
   LLVMContextImpl *pImpl = Context.pImpl;
   FoldingSetNodeID ID;
   ID.AddInteger(Kind);
-  ID.AddInteger(CR.getLower());
-  ID.AddInteger(CR.getUpper());
+  CR.getLower().Profile(ID);
+  CR.getUpper().Profile(ID);
 
   void *InsertPoint;
   AttributeImpl *PA = pImpl->AttrsSet.FindNodeOrInsertPos(ID, InsertPoint);
@@ -2282,7 +2282,7 @@ struct StrBoolAttr {
   static bool isSet(const Function &Fn,
                     StringRef Kind) {
     auto A = Fn.getFnAttribute(Kind);
-    return A.getValueAsString().equals("true");
+    return A.getValueAsString() == "true";
   }
 
   static void set(Function &Fn,
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 634b2dd5119e..5c65efba9e50 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -846,6 +846,18 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
         return false; // No other 'aarch64.sve.bf*'.
       }
 
+      if (Name.consume_front("addqv")) {
+        // 'aarch64.sve.addqv'.
+        if (!F->getReturnType()->isFPOrFPVectorTy())
+          return false;
+
+        auto Args = F->getFunctionType()->params();
+        Type *Tys[] = {F->getReturnType(), Args[1]};
+        NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                          Intrinsic::aarch64_sve_faddqv, Tys);
+        return true;
+      }
+
       if (Name.consume_front("ld")) {
         // 'aarch64.sve.ld*'.
         static const Regex LdRegex("^[234](.nxv[a-z0-9]+|$)");
@@ -1047,7 +1059,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
       }
     }
 
-    if (F->arg_size() == 2 && Name.equals("coro.end")) {
+    if (F->arg_size() == 2 && Name == "coro.end") {
       rename(F);
       NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::coro_end);
       return true;
@@ -1080,17 +1092,24 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
     break;
   case 'e':
     if (Name.consume_front("experimental.vector.")) {
-      Intrinsic::ID ID = StringSwitch<Intrinsic::ID>(Name)
-                             .StartsWith("extract.", Intrinsic::vector_extract)
-                             .StartsWith("insert.", Intrinsic::vector_insert)
-                             .Default(Intrinsic::not_intrinsic);
+      Intrinsic::ID ID =
+          StringSwitch<Intrinsic::ID>(Name)
+              .StartsWith("extract.", Intrinsic::vector_extract)
+              .StartsWith("insert.", Intrinsic::vector_insert)
+              .StartsWith("splice.", Intrinsic::vector_splice)
+              .StartsWith("reverse.", Intrinsic::vector_reverse)
+              .StartsWith("interleave2.", Intrinsic::vector_interleave2)
+              .StartsWith("deinterleave2.", Intrinsic::vector_deinterleave2)
+              .Default(Intrinsic::not_intrinsic);
       if (ID != Intrinsic::not_intrinsic) {
         const auto *FT = F->getFunctionType();
         SmallVector<Type *, 2> Tys;
-        if (ID == Intrinsic::vector_extract)
+        if (ID == Intrinsic::vector_extract ||
+            ID == Intrinsic::vector_interleave2)
           // Extracting overloads the return type.
           Tys.push_back(FT->getReturnType());
-        Tys.push_back(FT->getParamType(0));
+        if (ID != Intrinsic::vector_interleave2)
+          Tys.push_back(FT->getParamType(0));
         if (ID == Intrinsic::vector_insert)
           // Inserting overloads the inserted type.
           Tys.push_back(FT->getParamType(1));
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 6e62767c99e2..29f2cbf611fa 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -682,7 +682,7 @@ std::optional<uint64_t> BasicBlock::getIrrLoopHeaderWeight() const {
   if (MDNode *MDIrrLoopHeader =
       TI->getMetadata(LLVMContext::MD_irr_loop)) {
     MDString *MDName = cast<MDString>(MDIrrLoopHeader->getOperand(0));
-    if (MDName->getString().equals("loop_header_weight")) {
+    if (MDName->getString() == "loop_header_weight") {
       auto *CI = mdconst::extract<ConstantInt>(MDIrrLoopHeader->getOperand(1));
       return std::optional<uint64_t>(CI->getValue().getZExtValue());
     }
@@ -1009,9 +1009,9 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
     // generate the iterator with begin() / getFirstInsertionPt(), it means
     // any trailing debug-info at the end of the block would "normally" have
     // been pushed in front of "First". Move it there now.
-    DbgMarker *FirstMarker = getMarker(First);
     DbgMarker *TrailingDbgRecords = getTrailingDbgRecords();
     if (TrailingDbgRecords) {
+      DbgMarker *FirstMarker = createMarker(First);
       FirstMarker->absorbDebugValues(*TrailingDbgRecords, true);
       TrailingDbgRecords->eraseFromParent();
       deleteTrailingDbgRecords();
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 4206162d1768..7976904b1fe9 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -80,8 +80,7 @@ TinyPtrVector<DbgVariableRecord *> llvm::findDVRDeclares(Value *V) {
   return Declares;
 }
 
-template <typename IntrinsicT, DbgVariableRecord::LocationType Type =
-                                   DbgVariableRecord::LocationType::Any>
+template <typename IntrinsicT, bool DbgAssignAndValuesOnly>
 static void
 findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
                   SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
@@ -114,8 +113,7 @@ findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
     // Get DbgVariableRecords that use this as a single value.
     if (LocalAsMetadata *L = dyn_cast<LocalAsMetadata>(MD)) {
       for (DbgVariableRecord *DVR : L->getAllDbgVariableRecordUsers()) {
-        if (Type == DbgVariableRecord::LocationType::Any ||
-            DVR->getType() == Type)
+        if (!DbgAssignAndValuesOnly || DVR->isDbgValue() || DVR->isDbgAssign())
           if (EncounteredDbgVariableRecords.insert(DVR).second)
             DbgVariableRecords->push_back(DVR);
       }
@@ -130,8 +128,7 @@ findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
         continue;
       DIArgList *DI = cast<DIArgList>(AL);
       for (DbgVariableRecord *DVR : DI->getAllDbgVariableRecordUsers())
-        if (Type == DbgVariableRecord::LocationType::Any ||
-            DVR->getType() == Type)
+        if (!DbgAssignAndValuesOnly || DVR->isDbgValue() || DVR->isDbgAssign())
           if (EncounteredDbgVariableRecords.insert(DVR).second)
             DbgVariableRecords->push_back(DVR);
     }
@@ -141,14 +138,14 @@ findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
 void llvm::findDbgValues(
     SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V,
     SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
-  findDbgIntrinsics<DbgValueInst, DbgVariableRecord::LocationType::Value>(
+  findDbgIntrinsics<DbgValueInst, /*DbgAssignAndValuesOnly=*/true>(
       DbgValues, V, DbgVariableRecords);
 }
 
 void llvm::findDbgUsers(
     SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers, Value *V,
     SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
-  findDbgIntrinsics<DbgVariableIntrinsic, DbgVariableRecord::LocationType::Any>(
+  findDbgIntrinsics<DbgVariableIntrinsic, /*DbgAssignAndValuesOnly=*/false>(
       DbgUsers, V, DbgVariableRecords);
 }
 
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index e66fe73425e8..690184080657 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -735,6 +735,10 @@ void Function::addDereferenceableOrNullParamAttr(unsigned ArgNo,
                                                                   ArgNo, Bytes);
 }
 
+void Function::addRangeRetAttr(const ConstantRange &CR) {
+  AttributeSets = AttributeSets.addRangeRetAttr(getContext(), CR);
+}
+
 DenormalMode Function::getDenormalMode(const fltSemantics &FPType) const {
   if (&FPType == &APFloat::IEEEsingle()) {
     DenormalMode Mode = getDenormalModeF32Raw();
@@ -1843,8 +1847,8 @@ bool Function::hasAddressTaken(const User **PutOffender,
         if (llvm::all_of(FUU->users(), [](const User *U) {
               if (const auto *GV = dyn_cast<GlobalVariable>(U))
                 return GV->hasName() &&
-                       (GV->getName().equals("llvm.compiler.used") ||
-                        GV->getName().equals("llvm.used"));
+                       (GV->getName() == "llvm.compiler.used" ||
+                        GV->getName() == "llvm.used");
               return false;
             }))
           continue;
@@ -1989,7 +1993,7 @@ std::optional<ProfileCount> Function::getEntryCount(bool AllowSynthetic) const {
   MDNode *MD = getMetadata(LLVMContext::MD_prof);
   if (MD && MD->getOperand(0))
     if (MDString *MDS = dyn_cast<MDString>(MD->getOperand(0))) {
-      if (MDS->getString().equals("function_entry_count")) {
+      if (MDS->getString() == "function_entry_count") {
         ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(1));
         uint64_t Count = CI->getValue().getZExtValue();
         // A value of -1 is used for SamplePGO when there were no samples.
@@ -1998,7 +2002,7 @@ std::optional<ProfileCount> Function::getEntryCount(bool AllowSynthetic) const {
           return std::nullopt;
         return ProfileCount(Count, PCT_Real);
       } else if (AllowSynthetic &&
-                 MDS->getString().equals("synthetic_function_entry_count")) {
+                 MDS->getString() == "synthetic_function_entry_count") {
         ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(1));
         uint64_t Count = CI->getValue().getZExtValue();
         return ProfileCount(Count, PCT_Synthetic);
@@ -2011,7 +2015,7 @@ DenseSet<GlobalValue::GUID> Function::getImportGUIDs() const {
   DenseSet<GlobalValue::GUID> R;
   if (MDNode *MD = getMetadata(LLVMContext::MD_prof))
     if (MDString *MDS = dyn_cast<MDString>(MD->getOperand(0)))
-      if (MDS->getString().equals("function_entry_count"))
+      if (MDS->getString() == "function_entry_count")
         for (unsigned i = 2; i < MD->getNumOperands(); i++)
           R.insert(mdconst::extract<ConstantInt>(MD->getOperand(i))
                        ->getValue()
@@ -2027,9 +2031,8 @@ void Function::setSectionPrefix(StringRef Prefix) {
 
 std::optional<StringRef> Function::getSectionPrefix() const {
   if (MDNode *MD = getMetadata(LLVMContext::MD_section_prefix)) {
-    assert(cast<MDString>(MD->getOperand(0))
-               ->getString()
-               .equals("function_section_prefix") &&
+    assert(cast<MDString>(MD->getOperand(0))->getString() ==
+               "function_section_prefix" &&
            "Metadata not match");
     return cast<MDString>(MD->getOperand(1))->getString();
   }
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index d6746d1d4382..9ec5a7deeec6 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -1171,8 +1171,7 @@ Value *IRBuilderBase::CreateVectorReverse(Value *V, const Twine &Name) {
   auto *Ty = cast<VectorType>(V->getType());
   if (isa<ScalableVectorType>(Ty)) {
     Module *M = BB->getParent()->getParent();
-    Function *F = Intrinsic::getDeclaration(
-        M, Intrinsic::experimental_vector_reverse, Ty);
+    Function *F = Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, Ty);
     return Insert(CallInst::Create(F, V), Name);
   }
   // Keep the original behaviour for fixed vector
@@ -1191,8 +1190,7 @@ Value *IRBuilderBase::CreateVectorSplice(Value *V1, Value *V2, int64_t Imm,
 
   if (auto *VTy = dyn_cast<ScalableVectorType>(V1->getType())) {
     Module *M = BB->getParent()->getParent();
-    Function *F = Intrinsic::getDeclaration(
-        M, Intrinsic::experimental_vector_splice, VTy);
+    Function *F = Intrinsic::getDeclaration(M, Intrinsic::vector_splice, VTy);
 
     Value *Ops[] = {V1, V2, getInt32(Imm)};
     return Insert(CallInst::Create(F, Ops), Name);
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index d2babc748731..7ad1ad4cddb7 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2889,7 +2889,7 @@ bool ShuffleVectorInst::isOneUseSingleSourceMask(int VF) const {
 bool ShuffleVectorInst::isInterleave(unsigned Factor) {
   FixedVectorType *OpTy = dyn_cast<FixedVectorType>(getOperand(0)->getType());
   // shuffle_vector can only interleave fixed length vectors - for scalable
-  // vectors, see the @llvm.experimental.vector.interleave2 intrinsic
+  // vectors, see the @llvm.vector.interleave2 intrinsic
   if (!OpTy)
     return false;
   unsigned OpNumElts = OpTy->getNumElements();
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index 89403e1d7fcb..6743b315c74a 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -119,7 +119,8 @@ static ValueAsMetadata *getAsMetadata(Value *V) {
 }
 
 void DbgVariableIntrinsic::replaceVariableLocationOp(Value *OldValue,
-                                                     Value *NewValue) {
+                                                     Value *NewValue,
+                                                     bool AllowEmpty) {
   // If OldValue is used as the address part of a dbg.assign intrinsic replace
   // it with NewValue and return true.
   auto ReplaceDbgAssignAddress = [this, OldValue, NewValue]() -> bool {
@@ -136,6 +137,8 @@ void DbgVariableIntrinsic::replaceVariableLocationOp(Value *OldValue,
   auto Locations = location_ops();
   auto OldIt = find(Locations, OldValue);
   if (OldIt == Locations.end()) {
+    if (AllowEmpty || DbgAssignAddrReplaced)
+      return;
     assert(DbgAssignAddrReplaced &&
            "OldValue must be dbg.assign addr if unused in DIArgList");
     return;
@@ -291,6 +294,12 @@ Value *InstrProfIncrementInst::getStep() const {
   return ConstantInt::get(Type::getInt64Ty(Context), 1);
 }
 
+Value *InstrProfCallsite::getCallee() const {
+  if (isa<InstrProfCallsite>(this))
+    return getArgOperand(4);
+  return nullptr;
+}
+
 std::optional<RoundingMode> ConstrainedFPIntrinsic::getRoundingMode() const {
   unsigned NumOperands = arg_size();
   Metadata *MD = nullptr;
@@ -668,6 +677,7 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID,
   case Intrinsic::vp_inttoptr:
   case Intrinsic::vp_lrint:
   case Intrinsic::vp_llrint:
+  case Intrinsic::vp_cttz_elts:
     VPFunc =
         Intrinsic::getDeclaration(M, VPID, {ReturnType, Params[0]->getType()});
     break;
diff --git a/llvm/lib/IR/LLVMContext.cpp b/llvm/lib/IR/LLVMContext.cpp
index d987acf309ea..b057cc5e92d4 100644
--- a/llvm/lib/IR/LLVMContext.cpp
+++ b/llvm/lib/IR/LLVMContext.cpp
@@ -123,6 +123,13 @@ void LLVMContext::addModule(Module *M) {
 
 void LLVMContext::removeModule(Module *M) {
   pImpl->OwnedModules.erase(M);
+  pImpl->MachineFunctionNums.erase(M);
+}
+
+unsigned LLVMContext::generateMachineFunctionNum(Function &F) {
+  Module *M = F.getParent();
+  assert(pImpl->OwnedModules.contains(M) && "Unexpected module!");
+  return pImpl->MachineFunctionNums[M]++;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 7c67e191348e..2713015c266c 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -1450,6 +1450,10 @@ public:
   /// will be automatically deleted if this context is deleted.
   SmallPtrSet<Module *, 4> OwnedModules;
 
+  /// MachineFunctionNums - Keep the next available unique number available for
+  /// a MachineFunction in given module. Module must in OwnedModules.
+  DenseMap<Module *, unsigned> MachineFunctionNums;
+
   /// The main remark streamer used by all the other streamers (e.g. IR, MIR,
   /// frontends, etc.). This should only be used by the specific streamers, and
   /// never directly.
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index 4472bf128c32..b6c932495a14 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -1195,8 +1195,7 @@ MDNode *MDNode::mergeDirectCallProfMetadata(MDNode *A, MDNode *B,
          "first operand should be a non-null MDString");
   StringRef AProfName = AMDS->getString();
   StringRef BProfName = BMDS->getString();
-  if (AProfName.equals("branch_weights") &&
-      BProfName.equals("branch_weights")) {
+  if (AProfName == "branch_weights" && BProfName == "branch_weights") {
     ConstantInt *AInstrWeight =
         mdconst::dyn_extract<ConstantInt>(A->getOperand(1));
     ConstantInt *BInstrWeight =
diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp
index dc86f4204b1a..3d72418593a7 100644
--- a/llvm/lib/IR/ProfDataUtils.cpp
+++ b/llvm/lib/IR/ProfDataUtils.cpp
@@ -62,7 +62,27 @@ bool isTargetMD(const MDNode *ProfData, const char *Name, unsigned MinOps) {
   if (!ProfDataName)
     return false;
 
-  return ProfDataName->getString().equals(Name);
+  return ProfDataName->getString() == Name;
+}
+
+template <typename T,
+          typename = typename std::enable_if<std::is_arithmetic_v<T>>>
+static void extractFromBranchWeightMD(const MDNode *ProfileData,
+                                      SmallVectorImpl<T> &Weights) {
+  assert(isBranchWeightMD(ProfileData) && "wrong metadata");
+
+  unsigned NOps = ProfileData->getNumOperands();
+  assert(WeightsIdx < NOps && "Weights Index must be less than NOps.");
+  Weights.resize(NOps - WeightsIdx);
+
+  for (unsigned Idx = WeightsIdx, E = NOps; Idx != E; ++Idx) {
+    ConstantInt *Weight =
+        mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(Idx));
+    assert(Weight && "Malformed branch_weight in MD_prof node");
+    assert(Weight->getValue().getActiveBits() <= 32 &&
+           "Too many bits for uint32_t");
+    Weights[Idx - WeightsIdx] = Weight->getZExtValue();
+  }
 }
 
 } // namespace
@@ -100,22 +120,14 @@ MDNode *getValidBranchWeightMDNode(const Instruction &I) {
   return nullptr;
 }
 
-void extractFromBranchWeightMD(const MDNode *ProfileData,
-                               SmallVectorImpl<uint32_t> &Weights) {
-  assert(isBranchWeightMD(ProfileData) && "wrong metadata");
-
-  unsigned NOps = ProfileData->getNumOperands();
-  assert(WeightsIdx < NOps && "Weights Index must be less than NOps.");
-  Weights.resize(NOps - WeightsIdx);
+void extractFromBranchWeightMD32(const MDNode *ProfileData,
+                                 SmallVectorImpl<uint32_t> &Weights) {
+  extractFromBranchWeightMD(ProfileData, Weights);
+}
 
-  for (unsigned Idx = WeightsIdx, E = NOps; Idx != E; ++Idx) {
-    ConstantInt *Weight =
-        mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(Idx));
-    assert(Weight && "Malformed branch_weight in MD_prof node");
-    assert(Weight->getValue().getActiveBits() <= 32 &&
-           "Too many bits for uint32_t");
-    Weights[Idx - WeightsIdx] = Weight->getZExtValue();
-  }
+void extractFromBranchWeightMD64(const MDNode *ProfileData,
+                                 SmallVectorImpl<uint64_t> &Weights) {
+  extractFromBranchWeightMD(ProfileData, Weights);
 }
 
 bool extractBranchWeights(const MDNode *ProfileData,
@@ -161,7 +173,7 @@ bool extractProfTotalWeight(const MDNode *ProfileData, uint64_t &TotalVal) {
   if (!ProfDataName)
     return false;
 
-  if (ProfDataName->getString().equals("branch_weights")) {
+  if (ProfDataName->getString() == "branch_weights") {
     for (unsigned Idx = 1; Idx < ProfileData->getNumOperands(); Idx++) {
       auto *V = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(Idx));
       assert(V && "Malformed branch_weight in MD_prof node");
@@ -170,8 +182,7 @@ bool extractProfTotalWeight(const MDNode *ProfileData, uint64_t &TotalVal) {
     return true;
   }
 
-  if (ProfDataName->getString().equals("VP") &&
-      ProfileData->getNumOperands() > 3) {
+  if (ProfDataName->getString() == "VP" && ProfileData->getNumOperands() > 3) {
     TotalVal = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2))
                    ->getValue()
                    .getZExtValue();
@@ -197,8 +208,8 @@ void scaleProfData(Instruction &I, uint64_t S, uint64_t T) {
     return;
 
   auto *ProfDataName = dyn_cast<MDString>(ProfileData->getOperand(0));
-  if (!ProfDataName || (!ProfDataName->getString().equals("branch_weights") &&
-                        !ProfDataName->getString().equals("VP")))
+  if (!ProfDataName || (ProfDataName->getString() != "branch_weights" &&
+                        ProfDataName->getString() != "VP"))
     return;
 
   LLVMContext &C = I.getContext();
@@ -207,7 +218,7 @@ void scaleProfData(Instruction &I, uint64_t S, uint64_t T) {
   SmallVector<Metadata *, 3> Vals;
   Vals.push_back(ProfileData->getOperand(0));
   APInt APS(128, S), APT(128, T);
-  if (ProfDataName->getString().equals("branch_weights") &&
+  if (ProfDataName->getString() == "branch_weights" &&
       ProfileData->getNumOperands() > 0) {
     // Using APInt::div may be expensive, but most cases should fit 64 bits.
     APInt Val(128, mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(1))
@@ -216,7 +227,7 @@ void scaleProfData(Instruction &I, uint64_t S, uint64_t T) {
     Val *= APS;
     Vals.push_back(MDB.createConstant(ConstantInt::get(
         Type::getInt32Ty(C), Val.udiv(APT).getLimitedValue(UINT32_MAX))));
-  } else if (ProfDataName->getString().equals("VP"))
+  } else if (ProfDataName->getString() == "VP")
     for (unsigned i = 1; i < ProfileData->getNumOperands(); i += 2) {
       // The first value is the key of the value profile, which will not change.
       Vals.push_back(ProfileData->getOperand(i));
diff --git a/llvm/lib/IR/ProfileSummary.cpp b/llvm/lib/IR/ProfileSummary.cpp
index 9f7335ecbe44..acb4c52e8918 100644
--- a/llvm/lib/IR/ProfileSummary.cpp
+++ b/llvm/lib/IR/ProfileSummary.cpp
@@ -110,7 +110,7 @@ static ConstantAsMetadata *getValMD(MDTuple *MD, const char *Key) {
   ConstantAsMetadata *ValMD = dyn_cast<ConstantAsMetadata>(MD->getOperand(1));
   if (!KeyMD || !ValMD)
     return nullptr;
-  if (!KeyMD->getString().equals(Key))
+  if (KeyMD->getString() != Key)
     return nullptr;
   return ValMD;
 }
@@ -140,7 +140,7 @@ static bool isKeyValuePair(MDTuple *MD, const char *Key, const char *Val) {
   MDString *ValMD = dyn_cast<MDString>(MD->getOperand(1));
   if (!KeyMD || !ValMD)
     return false;
-  if (!KeyMD->getString().equals(Key) || !ValMD->getString().equals(Val))
+  if (KeyMD->getString() != Key || ValMD->getString() != Val)
     return false;
   return true;
 }
@@ -150,7 +150,7 @@ static bool getSummaryFromMD(MDTuple *MD, SummaryEntryVector &Summary) {
   if (!MD || MD->getNumOperands() != 2)
     return false;
   MDString *KeyMD = dyn_cast<MDString>(MD->getOperand(0));
-  if (!KeyMD || !KeyMD->getString().equals("DetailedSummary"))
+  if (!KeyMD || KeyMD->getString() != "DetailedSummary")
     return false;
   MDTuple *EntriesMD = dyn_cast<MDTuple>(MD->getOperand(1));
   if (!EntriesMD)
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index c59bc3622fde..5c61ad9f000b 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -834,7 +834,7 @@ struct TargetTypeInfo {
 static TargetTypeInfo getTargetTypeInfo(const TargetExtType *Ty) {
   LLVMContext &C = Ty->getContext();
   StringRef Name = Ty->getName();
-  if (Name.equals("spirv.Image"))
+  if (Name == "spirv.Image")
     return TargetTypeInfo(PointerType::get(C, 0), TargetExtType::CanBeGlobal);
   if (Name.starts_with("spirv."))
     return TargetTypeInfo(PointerType::get(C, 0), TargetExtType::HasZeroInit,
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index de78cd5d5fac..91c259acb2c1 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2379,8 +2379,8 @@ void Verifier::verifyFunctionMetadata(
             "expected string with name of the !prof annotation", MD);
       MDString *MDS = cast<MDString>(MD->getOperand(0));
       StringRef ProfName = MDS->getString();
-      Check(ProfName.equals("function_entry_count") ||
-                ProfName.equals("synthetic_function_entry_count"),
+      Check(ProfName == "function_entry_count" ||
+                ProfName == "synthetic_function_entry_count",
             "first operand should be 'function_entry_count'"
             " or 'synthetic_function_entry_count'",
             MD);
@@ -4789,7 +4789,7 @@ void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) {
   StringRef ProfName = MDS->getString();
 
   // Check consistency of !prof branch_weights metadata.
-  if (ProfName.equals("branch_weights")) {
+  if (ProfName == "branch_weights") {
     if (isa<InvokeInst>(&I)) {
       Check(MD->getNumOperands() == 2 || MD->getNumOperands() == 3,
             "Wrong number of InvokeInst branch_weights operands", MD);
@@ -6023,7 +6023,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
 
     break;
   }
-  case Intrinsic::experimental_vector_splice: {
+  case Intrinsic::vector_splice: {
     VectorType *VecTy = cast<VectorType>(Call.getType());
     int64_t Idx = cast<ConstantInt>(Call.getArgOperand(2))->getSExtValue();
     int64_t KnownMinNumElements = VecTy->getElementCount().getKnownMinValue();
@@ -6230,7 +6230,6 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     break;
   }
   case Intrinsic::experimental_convergence_entry:
-    LLVM_FALLTHROUGH;
   case Intrinsic::experimental_convergence_anchor:
     break;
   case Intrinsic::experimental_convergence_loop:
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 53060df7f503..21cad1de0ced 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -114,12 +114,12 @@ void llvm::computeLTOCacheKey(
   auto AddUnsigned = [&](unsigned I) {
     uint8_t Data[4];
     support::endian::write32le(Data, I);
-    Hasher.update(ArrayRef<uint8_t>{Data, 4});
+    Hasher.update(Data);
   };
   auto AddUint64 = [&](uint64_t I) {
     uint8_t Data[8];
     support::endian::write64le(Data, I);
-    Hasher.update(ArrayRef<uint8_t>{Data, 8});
+    Hasher.update(Data);
   };
   AddString(Conf.CPU);
   // FIXME: Hash more of Options. For now all clients initialize Options from
diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp
index 19b6f7e78792..0611099c4690 100644
--- a/llvm/lib/LTO/LTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/LTOCodeGenerator.cpp
@@ -572,6 +572,9 @@ bool LTOCodeGenerator::optimize() {
   if (!this->determineTarget())
     return false;
 
+  // libLTO parses options late, so re-set them here.
+  Context.setDiscardValueNames(LTODiscardValueNames);
+
   auto DiagFileOrErr = lto::setupLLVMOptimizationRemarks(
       Context, RemarksFilename, RemarksPasses, RemarksFormat,
       RemarksWithHotness, RemarksHotnessThreshold);
diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index 005521bad6e0..b8ef2654ed6e 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -725,7 +725,13 @@ void ELFWriter::computeSymbolTable(
         HasLargeSectionIndex = true;
     }
 
+    // Temporary symbols generated for certain assembler features (.eh_frame,
+    // .debug_line) of an empty name may be referenced by relocations due to
+    // linker relaxation. Rename them to ".L0 " to match the gas fake label name
+    // and allow ld/objcopy --discard-locals to discard such symbols.
     StringRef Name = Symbol.getName();
+    if (Name.empty())
+      Name = ".L0 ";
 
     // Sections have their own string table
     if (Symbol.getType() != ELF::STT_SECTION) {
diff --git a/llvm/lib/MC/MCDXContainerStreamer.cpp b/llvm/lib/MC/MCDXContainerStreamer.cpp
index 3cb452f3dfa5..a596c9a16d49 100644
--- a/llvm/lib/MC/MCDXContainerStreamer.cpp
+++ b/llvm/lib/MC/MCDXContainerStreamer.cpp
@@ -21,11 +21,8 @@ void MCDXContainerStreamer::emitInstToData(const MCInst &,
 
 MCStreamer *llvm::createDXContainerStreamer(
     MCContext &Context, std::unique_ptr<MCAsmBackend> &&MAB,
-    std::unique_ptr<MCObjectWriter> &&OW, std::unique_ptr<MCCodeEmitter> &&CE,
-    bool RelaxAll) {
+    std::unique_ptr<MCObjectWriter> &&OW, std::unique_ptr<MCCodeEmitter> &&CE) {
   auto *S = new MCDXContainerStreamer(Context, std::move(MAB), std::move(OW),
                                       std::move(CE));
-  if (RelaxAll)
-    S->getAssembler().setRelaxAll(true);
   return S;
 }
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index e541090769e9..23e926c3a9d1 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -892,11 +892,8 @@ void MCELFStreamer::createAttributesSection(
 MCStreamer *llvm::createELFStreamer(MCContext &Context,
                                     std::unique_ptr<MCAsmBackend> &&MAB,
                                     std::unique_ptr<MCObjectWriter> &&OW,
-                                    std::unique_ptr<MCCodeEmitter> &&CE,
-                                    bool RelaxAll) {
+                                    std::unique_ptr<MCCodeEmitter> &&CE) {
   MCELFStreamer *S =
       new MCELFStreamer(Context, std::move(MAB), std::move(OW), std::move(CE));
-  if (RelaxAll)
-    S->getAssembler().setRelaxAll(true);
   return S;
 }
diff --git a/llvm/lib/MC/MCGOFFStreamer.cpp b/llvm/lib/MC/MCGOFFStreamer.cpp
index 58d13c9f3788..2b6d5c8e75a7 100644
--- a/llvm/lib/MC/MCGOFFStreamer.cpp
+++ b/llvm/lib/MC/MCGOFFStreamer.cpp
@@ -24,11 +24,8 @@ MCGOFFStreamer::~MCGOFFStreamer() {}
 MCStreamer *llvm::createGOFFStreamer(MCContext &Context,
                                      std::unique_ptr<MCAsmBackend> &&MAB,
                                      std::unique_ptr<MCObjectWriter> &&OW,
-                                     std::unique_ptr<MCCodeEmitter> &&CE,
-                                     bool RelaxAll) {
+                                     std::unique_ptr<MCCodeEmitter> &&CE) {
   MCGOFFStreamer *S =
       new MCGOFFStreamer(Context, std::move(MAB), std::move(OW), std::move(CE));
-  if (RelaxAll)
-    S->getAssembler().setRelaxAll(true);
   return S;
 }
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index d7d343f15eaa..10f9988b9d16 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -564,7 +564,7 @@ MCStreamer *llvm::createMachOStreamer(MCContext &Context,
                                       std::unique_ptr<MCAsmBackend> &&MAB,
                                       std::unique_ptr<MCObjectWriter> &&OW,
                                       std::unique_ptr<MCCodeEmitter> &&CE,
-                                      bool RelaxAll, bool DWARFMustBeAtTheEnd,
+                                      bool DWARFMustBeAtTheEnd,
                                       bool LabelSections) {
   MCMachOStreamer *S =
       new MCMachOStreamer(Context, std::move(MAB), std::move(OW), std::move(CE),
@@ -574,8 +574,6 @@ MCStreamer *llvm::createMachOStreamer(MCContext &Context,
       Target, Context.getObjectFileInfo()->getSDKVersion(),
       Context.getObjectFileInfo()->getDarwinTargetVariantTriple(),
       Context.getObjectFileInfo()->getDarwinTargetVariantSDKVersion());
-  if (RelaxAll)
-    S->getAssembler().setRelaxAll(true);
   return S;
 }
 
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 490e0a4dd404..d2da5d0d3f90 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -34,6 +34,8 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context,
       EmitEHFrame(true), EmitDebugFrame(false) {
   if (Assembler->getBackendPtr())
     setAllowAutoPadding(Assembler->getBackend().allowAutoPadding());
+  if (Context.getTargetOptions() && Context.getTargetOptions()->MCRelaxAll)
+    Assembler->setRelaxAll(true);
 }
 
 MCObjectStreamer::~MCObjectStreamer() = default;
diff --git a/llvm/lib/MC/MCSPIRVStreamer.cpp b/llvm/lib/MC/MCSPIRVStreamer.cpp
index 0bb73c7ff7ee..3b75a2e17a4a 100644
--- a/llvm/lib/MC/MCSPIRVStreamer.cpp
+++ b/llvm/lib/MC/MCSPIRVStreamer.cpp
@@ -34,11 +34,8 @@ void MCSPIRVStreamer::emitInstToData(const MCInst &Inst,
 MCStreamer *llvm::createSPIRVStreamer(MCContext &Context,
                                       std::unique_ptr<MCAsmBackend> &&MAB,
                                       std::unique_ptr<MCObjectWriter> &&OW,
-                                      std::unique_ptr<MCCodeEmitter> &&CE,
-                                      bool RelaxAll) {
+                                      std::unique_ptr<MCCodeEmitter> &&CE) {
   MCSPIRVStreamer *S = new MCSPIRVStreamer(Context, std::move(MAB),
                                            std::move(OW), std::move(CE));
-  if (RelaxAll)
-    S->getAssembler().setRelaxAll(true);
   return S;
 }
diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp
index fbab72fb5f3d..c553ede77555 100644
--- a/llvm/lib/MC/MCWasmStreamer.cpp
+++ b/llvm/lib/MC/MCWasmStreamer.cpp
@@ -275,11 +275,8 @@ void MCWasmStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
 MCStreamer *llvm::createWasmStreamer(MCContext &Context,
                                      std::unique_ptr<MCAsmBackend> &&MAB,
                                      std::unique_ptr<MCObjectWriter> &&OW,
-                                     std::unique_ptr<MCCodeEmitter> &&CE,
-                                     bool RelaxAll) {
+                                     std::unique_ptr<MCCodeEmitter> &&CE) {
   MCWasmStreamer *S =
       new MCWasmStreamer(Context, std::move(MAB), std::move(OW), std::move(CE));
-  if (RelaxAll)
-    S->getAssembler().setRelaxAll(true);
   return S;
 }
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index 458b4be61983..175d7d6b6c31 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -162,12 +162,9 @@ void MCXCOFFStreamer::emitInstToData(const MCInst &Inst,
 MCStreamer *llvm::createXCOFFStreamer(MCContext &Context,
                                       std::unique_ptr<MCAsmBackend> &&MAB,
                                       std::unique_ptr<MCObjectWriter> &&OW,
-                                      std::unique_ptr<MCCodeEmitter> &&CE,
-                                      bool RelaxAll) {
+                                      std::unique_ptr<MCCodeEmitter> &&CE) {
   MCXCOFFStreamer *S = new MCXCOFFStreamer(Context, std::move(MAB),
                                            std::move(OW), std::move(CE));
-  if (RelaxAll)
-    S->getAssembler().setRelaxAll(true);
   return S;
 }
 
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
index 1a82e45763a2..2e3ebe3d9073 100644
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -542,8 +542,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI,
   const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
   if (SCDesc.NumMicroOps == MCSchedClassDesc::InvalidNumMicroOps) {
     return make_error<InstructionError<MCInst>>(
-        "found an unsupported instruction in the input assembly sequence.",
-        MCI);
+        "found an unsupported instruction in the input assembly sequence", MCI);
   }
 
   LLVM_DEBUG(dbgs() << "\n\t\tOpcode Name= " << MCII.getName(Opcode) << '\n');
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 8d408ca2363a..30d3e7a1ec05 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -82,7 +82,6 @@
 #include "llvm/CodeGen/ExpandLargeDivRem.h"
 #include "llvm/CodeGen/ExpandLargeFpConvert.h"
 #include "llvm/CodeGen/ExpandMemCmp.h"
-#include "llvm/CodeGen/FreeMachineFunction.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GlobalMerge.h"
 #include "llvm/CodeGen/HardwareLoops.h"
@@ -92,6 +91,7 @@
 #include "llvm/CodeGen/JMCInstrumenter.h"
 #include "llvm/CodeGen/LowerEmuTLS.h"
 #include "llvm/CodeGen/MIRPrinter.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/CodeGen/SafeStack.h"
 #include "llvm/CodeGen/SelectOptimize.h"
@@ -1230,7 +1230,7 @@ static bool isFunctionPassName(StringRef Name, CallbacksT &Callbacks) {
   StringRef NameNoBracket = Name.take_until([](char C) { return C == '<'; });
   if (NameNoBracket == "function")
     return true;
-  if (Name == "loop" || Name == "loop-mssa")
+  if (Name == "loop" || Name == "loop-mssa" || Name == "machine-function")
     return true;
 
   // Explicitly handle custom-parsed pass names.
@@ -1408,13 +1408,6 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
       MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
       return Error::success();
     }
-    if (Name == "machine-function") {
-      MachineFunctionPassManager MFPM;
-      if (auto Err = parseMachinePassPipeline(MFPM, InnerPipeline))
-        return Err;
-      MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
-      return Error::success();
-    }
     if (auto Params = parseFunctionPipelineName(Name)) {
       if (Params->second)
         return make_error<StringError>(
@@ -1732,6 +1725,13 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
       FPM.addPass(createRepeatedPass(*Count, std::move(NestedFPM)));
       return Error::success();
     }
+    if (Name == "machine-function") {
+      MachineFunctionPassManager MFPM;
+      if (auto Err = parseMachinePassPipeline(MFPM, InnerPipeline))
+        return Err;
+      FPM.addPass(createFunctionToMachineFunctionPassAdaptor(std::move(MFPM)));
+      return Error::success();
+    }
 
     for (auto &C : FunctionPipelineParsingCallbacks)
       if (C(Name, FPM, InnerPipeline))
@@ -1975,6 +1975,8 @@ void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
   if (MFAM) {
     MAM.registerPass(
         [&] { return MachineFunctionAnalysisManagerModuleProxy(*MFAM); });
+    FAM.registerPass(
+        [&] { return MachineFunctionAnalysisManagerFunctionProxy(*MFAM); });
     MFAM->registerPass(
         [&] { return ModuleAnalysisManagerMachineFunctionProxy(MAM); });
     MFAM->registerPass(
@@ -2023,7 +2025,7 @@ Error PassBuilder::parsePassPipeline(ModulePassManager &MPM,
                                  std::move(*Pipeline)}}}};
     } else if (isMachineFunctionPassName(
                    FirstName, MachineFunctionPipelineParsingCallbacks)) {
-      Pipeline = {{"machine-function", std::move(*Pipeline)}};
+      Pipeline = {{"function", {{"machine-function", std::move(*Pipeline)}}}};
     } else {
       for (auto &C : TopLevelPipelineParsingCallbacks)
         if (C(MPM, *Pipeline))
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 90ba3b541553..100889c0845b 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -963,7 +963,8 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level,
   MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
       RequireAnalysisPass<ShouldNotRunFunctionPassesAnalysis, Function>()));
 
-  MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
+  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink)
+    MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
 
   // Make sure we don't affect potential future NoRerun CGSCC adaptors.
   MIWP.addLateModulePass(createModuleToFunctionPassAdaptor(
@@ -1005,8 +1006,9 @@ PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level,
       buildFunctionSimplificationPipeline(Level, Phase),
       PTO.EagerlyInvalidateAnalyses));
 
-  MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
-      CoroSplitPass(Level != OptimizationLevel::O0)));
+  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink)
+    MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+        CoroSplitPass(Level != OptimizationLevel::O0)));
 
   return MPM;
 }
@@ -1183,7 +1185,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   // and argument promotion.
   MPM.addPass(DeadArgumentEliminationPass());
 
-  MPM.addPass(CoroCleanupPass());
+  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink)
+    MPM.addPass(CoroCleanupPass());
 
   // Optimize globals now that functions are fully simplified.
   MPM.addPass(GlobalOptPass());
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 2fbc7f7d88ba..9b670e4e3a44 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -254,6 +254,9 @@ FUNCTION_ANALYSIS("demanded-bits", DemandedBitsAnalysis())
 FUNCTION_ANALYSIS("domfrontier", DominanceFrontierAnalysis())
 FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis())
 FUNCTION_ANALYSIS("func-properties", FunctionPropertiesAnalysis())
+FUNCTION_ANALYSIS(
+    "machine-function-info",
+    MachineFunctionAnalysis(static_cast<const LLVMTargetMachine *>(TM)))
 FUNCTION_ANALYSIS("gc-function", GCFunctionAnalysis())
 FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis())
 FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis())
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index c18b46225862..63490c83e85f 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -297,14 +297,6 @@ void unwrapAndPrint(raw_ostream &OS, Any IR) {
     auto *M = unwrapModule(IR);
     assert(M && "should have unwrapped module");
     printIR(OS, M);
-
-    if (const auto *MF = unwrapIR<MachineFunction>(IR)) {
-      auto &MMI = MF->getMMI();
-      for (const auto &F : *M) {
-        if (auto *MF = MMI.getMachineFunction(F))
-          MF->print(OS);
-      }
-    }
     return;
   }
 
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index cefb6af12d00..ba21e01abfba 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1520,54 +1520,38 @@ IndexedMemProfReader::getMemProfRecord(const uint64_t FuncNameHash) const {
 
   // Setup a callback to convert from frame ids to frame using the on-disk
   // FrameData hash table.
-  std::optional<memprof::FrameId> LastUnmappedFrameId;
-  auto IdToFrameCallback = [&](const memprof::FrameId Id) {
-    auto FrIter = MemProfFrameTable->find(Id);
-    if (FrIter == MemProfFrameTable->end()) {
-      LastUnmappedFrameId = Id;
-      return memprof::Frame(0, 0, 0, false);
-    }
-    return *FrIter;
-  };
-
-  // Setup a callback to convert call stack ids to call stacks using the on-disk
-  // hash table.
-  std::optional<memprof::CallStackId> LastUnmappedCSId;
-  auto CSIdToCallStackCallback = [&](memprof::CallStackId CSId) {
-    llvm::SmallVector<memprof::Frame> Frames;
-    auto CSIter = MemProfCallStackTable->find(CSId);
-    if (CSIter == MemProfCallStackTable->end()) {
-      LastUnmappedCSId = CSId;
-    } else {
-      const llvm::SmallVector<memprof::FrameId> &CS = *CSIter;
-      Frames.reserve(CS.size());
-      for (memprof::FrameId Id : CS)
-        Frames.push_back(IdToFrameCallback(Id));
-    }
-    return Frames;
-  };
+  memprof::FrameIdConverter<MemProfFrameHashTable> FrameIdConv(
+      *MemProfFrameTable.get());
 
   const memprof::IndexedMemProfRecord IndexedRecord = *Iter;
   memprof::MemProfRecord Record;
-  if (MemProfCallStackTable)
-    Record = IndexedRecord.toMemProfRecord(CSIdToCallStackCallback);
-  else
-    Record = memprof::MemProfRecord(IndexedRecord, IdToFrameCallback);
+  if (MemProfCallStackTable) {
+    // Setup a callback to convert call stack ids to call stacks using the
+    // on-disk hash table.
+    memprof::CallStackIdConverter<MemProfCallStackHashTable> CSIdConv(
+        *MemProfCallStackTable.get(), FrameIdConv);
 
-  // Check that all frame ids were successfully converted to frames.
-  if (LastUnmappedFrameId) {
-    return make_error<InstrProfError>(instrprof_error::hash_mismatch,
-                                      "memprof frame not found for frame id " +
-                                          Twine(*LastUnmappedFrameId));
+    Record = IndexedRecord.toMemProfRecord(CSIdConv);
+
+    // Check that all call stack ids were successfully converted to call stacks.
+    if (CSIdConv.LastUnmappedId) {
+      return make_error<InstrProfError>(
+          instrprof_error::hash_mismatch,
+          "memprof call stack not found for call stack id " +
+              Twine(*CSIdConv.LastUnmappedId));
+    }
+  } else {
+    Record = memprof::MemProfRecord(IndexedRecord, FrameIdConv);
   }
 
-  // Check that all call stack ids were successfully converted to call stacks.
-  if (LastUnmappedCSId) {
+  // Check that all frame ids were successfully converted to frames.
+  if (FrameIdConv.LastUnmappedId) {
     return make_error<InstrProfError>(
         instrprof_error::hash_mismatch,
-        "memprof call stack not found for call stack id " +
-            Twine(*LastUnmappedCSId));
+        "memprof frame not found for frame id " +
+            Twine(*FrameIdConv.LastUnmappedId));
   }
+
   return Record;
 }
 
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 4a6fc9d64b69..b61c59aacc0f 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -184,12 +184,13 @@ public:
 InstrProfWriter::InstrProfWriter(
     bool Sparse, uint64_t TemporalProfTraceReservoirSize,
     uint64_t MaxTemporalProfTraceLength, bool WritePrevVersion,
-    memprof::IndexedVersion MemProfVersionRequested)
+    memprof::IndexedVersion MemProfVersionRequested, bool MemProfFullSchema)
     : Sparse(Sparse), MaxTemporalProfTraceLength(MaxTemporalProfTraceLength),
       TemporalProfTraceReservoirSize(TemporalProfTraceReservoirSize),
       InfoObj(new InstrProfRecordWriterTrait()),
       WritePrevVersion(WritePrevVersion),
-      MemProfVersionRequested(MemProfVersionRequested) {}
+      MemProfVersionRequested(MemProfVersionRequested),
+      MemProfFullSchema(MemProfFullSchema) {}
 
 InstrProfWriter::~InstrProfWriter() { delete InfoObj; }
 
@@ -507,7 +508,7 @@ static Error writeMemProfV0(
   OS.write(0ULL); // Reserve space for the memprof frame payload offset.
   OS.write(0ULL); // Reserve space for the memprof frame table offset.
 
-  auto Schema = memprof::PortableMemInfoBlock::getSchema();
+  auto Schema = memprof::getFullSchema();
   writeMemProfSchema(OS, Schema);
 
   uint64_t RecordTableOffset =
@@ -533,7 +534,7 @@ static Error writeMemProfV1(
   OS.write(0ULL); // Reserve space for the memprof frame payload offset.
   OS.write(0ULL); // Reserve space for the memprof frame table offset.
 
-  auto Schema = memprof::PortableMemInfoBlock::getSchema();
+  auto Schema = memprof::getFullSchema();
   writeMemProfSchema(OS, Schema);
 
   uint64_t RecordTableOffset =
@@ -554,7 +555,8 @@ static Error writeMemProfV2(
         &MemProfRecordData,
     llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData,
     llvm::MapVector<memprof::CallStackId, llvm::SmallVector<memprof::FrameId>>
-        &MemProfCallStackData) {
+        &MemProfCallStackData,
+    bool MemProfFullSchema) {
   OS.write(memprof::Version2);
   uint64_t HeaderUpdatePos = OS.tell();
   OS.write(0ULL); // Reserve space for the memprof record table offset.
@@ -563,7 +565,9 @@ static Error writeMemProfV2(
   OS.write(0ULL); // Reserve space for the memprof call stack payload offset.
   OS.write(0ULL); // Reserve space for the memprof call stack table offset.
 
-  auto Schema = memprof::PortableMemInfoBlock::getSchema();
+  auto Schema = memprof::getHotColdSchema();
+  if (MemProfFullSchema)
+    Schema = memprof::getFullSchema();
   writeMemProfSchema(OS, Schema);
 
   uint64_t RecordTableOffset =
@@ -605,7 +609,7 @@ static Error writeMemProf(
     llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData,
     llvm::MapVector<memprof::CallStackId, llvm::SmallVector<memprof::FrameId>>
         &MemProfCallStackData,
-    memprof::IndexedVersion MemProfVersionRequested) {
+    memprof::IndexedVersion MemProfVersionRequested, bool MemProfFullSchema) {
 
   switch (MemProfVersionRequested) {
   case memprof::Version0:
@@ -614,7 +618,7 @@ static Error writeMemProf(
     return writeMemProfV1(OS, MemProfRecordData, MemProfFrameData);
   case memprof::Version2:
     return writeMemProfV2(OS, MemProfRecordData, MemProfFrameData,
-                          MemProfCallStackData);
+                          MemProfCallStackData, MemProfFullSchema);
   }
 
   return make_error<InstrProfError>(
@@ -653,8 +657,8 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
                        : IndexedInstrProf::ProfVersion::CurrentVersion;
   // The WritePrevVersion handling will either need to be removed or updated
   // if the version is advanced beyond 12.
-  assert(IndexedInstrProf::ProfVersion::CurrentVersion ==
-         IndexedInstrProf::ProfVersion::Version12);
+  static_assert(IndexedInstrProf::ProfVersion::CurrentVersion ==
+                IndexedInstrProf::ProfVersion::Version12);
   if (static_cast<bool>(ProfileKind & InstrProfKind::IRInstrumentation))
     Header.Version |= VARIANT_MASK_IR_PROF;
   if (static_cast<bool>(ProfileKind & InstrProfKind::ContextSensitive))
@@ -733,7 +737,8 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
   if (static_cast<bool>(ProfileKind & InstrProfKind::MemProf)) {
     MemProfSectionStart = OS.tell();
     if (auto E = writeMemProf(OS, MemProfRecordData, MemProfFrameData,
-                              MemProfCallStackData, MemProfVersionRequested))
+                              MemProfCallStackData, MemProfVersionRequested,
+                              MemProfFullSchema))
       return E;
   }
 
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp
index 9a46d1151311..4667778ca11d 100644
--- a/llvm/lib/ProfileData/MemProf.cpp
+++ b/llvm/lib/ProfileData/MemProf.cpp
@@ -10,6 +10,19 @@
 
 namespace llvm {
 namespace memprof {
+MemProfSchema getFullSchema() {
+  MemProfSchema List;
+#define MIBEntryDef(NameTag, Name, Type) List.push_back(Meta::Name);
+#include "llvm/ProfileData/MIBEntryDef.inc"
+#undef MIBEntryDef
+  return List;
+}
+
+MemProfSchema getHotColdSchema() {
+  return {Meta::AllocCount, Meta::TotalSize, Meta::TotalLifetime,
+          Meta::TotalLifetimeAccessDensity};
+}
+
 static size_t serializedSizeV0(const IndexedAllocationInfo &IAI,
                                const MemProfSchema &Schema) {
   size_t Size = 0;
diff --git a/llvm/lib/Support/RWMutex.cpp b/llvm/lib/Support/RWMutex.cpp
index 5accf73e5f94..d6fa956da634 100644
--- a/llvm/lib/Support/RWMutex.cpp
+++ b/llvm/lib/Support/RWMutex.cpp
@@ -26,8 +26,10 @@ RWMutexImpl::~RWMutexImpl() = default;
 
 bool RWMutexImpl::lock_shared() { return true; }
 bool RWMutexImpl::unlock_shared() { return true; }
+bool RWMutexImpl::try_lock_shared() { return true; }
 bool RWMutexImpl::lock() { return true; }
 bool RWMutexImpl::unlock() { return true; }
+bool RWMutexImpl::try_lock() { return true; }
 
 #else
 
@@ -87,6 +89,14 @@ RWMutexImpl::unlock_shared()
   return errorcode == 0;
 }
 
+bool RWMutexImpl::try_lock_shared() {
+  pthread_rwlock_t *rwlock = static_cast<pthread_rwlock_t *>(data_);
+  assert(rwlock != nullptr);
+
+  int errorcode = pthread_rwlock_tryrdlock(rwlock);
+  return errorcode == 0;
+}
+
 bool
 RWMutexImpl::lock()
 {
@@ -107,6 +117,14 @@ RWMutexImpl::unlock()
   return errorcode == 0;
 }
 
+bool RWMutexImpl::try_lock() {
+  pthread_rwlock_t *rwlock = static_cast<pthread_rwlock_t *>(data_);
+  assert(rwlock != nullptr);
+
+  int errorcode = pthread_rwlock_trywrlock(rwlock);
+  return errorcode == 0;
+}
+
 #else
 
 RWMutexImpl::RWMutexImpl() : data_(new MutexImpl(false)) { }
@@ -123,6 +141,10 @@ bool RWMutexImpl::unlock_shared() {
   return static_cast<MutexImpl *>(data_)->release();
 }
 
+bool RWMutexImpl::try_lock_shared() {
+  return static_cast<MutexImpl *>(data_)->tryacquire();
+}
+
 bool RWMutexImpl::lock() {
   return static_cast<MutexImpl *>(data_)->acquire();
 }
@@ -131,6 +153,10 @@ bool RWMutexImpl::unlock() {
   return static_cast<MutexImpl *>(data_)->release();
 }
 
+bool RWMutexImpl::try_lock() {
+  return static_cast<MutexImpl *>(data_)->tryacquire();
+}
+
 #endif
 #endif
 #endif
diff --git a/llvm/lib/Support/SuffixTree.cpp b/llvm/lib/Support/SuffixTree.cpp
index eaa653078e09..c00c7989d1a6 100644
--- a/llvm/lib/Support/SuffixTree.cpp
+++ b/llvm/lib/Support/SuffixTree.cpp
@@ -242,8 +242,8 @@ void SuffixTree::RepeatedSubstringIterator::advance() {
     unsigned Length = Curr->getConcatLen();
 
     // Iterate over each child, saving internal nodes for visiting, and
-    // leaf nodes in LeafChildren. Internal nodes represent individual
-    // strings, which may repeat.
+    // leaf nodes' SuffixIdx in RepeatedSubstringStarts. Internal nodes
+    // represent individual strings, which may repeat.
     for (auto &ChildPair : Curr->Children) {
       // Save all of this node's children for processing.
       if (auto *InternalChild =
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index 34635b5aba7a..29ebf7c696e0 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -168,7 +168,8 @@ static bool isDebugHelpInitialized() {
 }
 
 static bool load64BitDebugHelp(void) {
-  HMODULE hLib = ::LoadLibraryW(L"Dbghelp.dll");
+  HMODULE hLib =
+      ::LoadLibraryExA("Dbghelp.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32);
   if (hLib) {
     fMiniDumpWriteDump =
         (fpMiniDumpWriteDump)::GetProcAddress(hLib, "MiniDumpWriteDump");
diff --git a/llvm/lib/Support/YAMLTraits.cpp b/llvm/lib/Support/YAMLTraits.cpp
index 4aaf59be2ce5..7bb60894b335 100644
--- a/llvm/lib/Support/YAMLTraits.cpp
+++ b/llvm/lib/Support/YAMLTraits.cpp
@@ -718,40 +718,8 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) {
     outputUpToEndOfLine("''");
     return;
   }
-  if (MustQuote == QuotingType::None) {
-    // Only quote if we must.
-    outputUpToEndOfLine(S);
-    return;
-  }
-
-  const char *const Quote = MustQuote == QuotingType::Single ? "'" : "\"";
-  output(Quote); // Starting quote.
-
-  // When using double-quoted strings (and only in that case), non-printable characters may be
-  // present, and will be escaped using a variety of unicode-scalar and special short-form
-  // escapes. This is handled in yaml::escape.
-  if (MustQuote == QuotingType::Double) {
-    output(yaml::escape(S, /* EscapePrintable= */ false));
-    outputUpToEndOfLine(Quote);
-    return;
-  }
-
-  unsigned i = 0;
-  unsigned j = 0;
-  unsigned End = S.size();
-  const char *Base = S.data();
-
-  // When using single-quoted strings, any single quote ' must be doubled to be escaped.
-  while (j < End) {
-    if (S[j] == '\'') {                    // Escape quotes.
-      output(StringRef(&Base[i], j - i));  // "flush".
-      output(StringLiteral("''"));         // Print it as ''
-      i = j + 1;
-    }
-    ++j;
-  }
-  output(StringRef(&Base[i], j - i));
-  outputUpToEndOfLine(Quote); // Ending quote.
+  output(S, MustQuote);
+  outputUpToEndOfLine("");
 }
 
 void Output::blockScalarString(StringRef &S) {
@@ -801,6 +769,46 @@ void Output::output(StringRef s) {
   Out << s;
 }
 
+void Output::output(StringRef S, QuotingType MustQuote) {
+  if (MustQuote == QuotingType::None) {
+    // Only quote if we must.
+    output(S);
+    return;
+  }
+
+  StringLiteral Quote = MustQuote == QuotingType::Single ? StringLiteral("'")
+                                                         : StringLiteral("\"");
+  output(Quote); // Starting quote.
+
+  // When using double-quoted strings (and only in that case), non-printable
+  // characters may be present, and will be escaped using a variety of
+  // unicode-scalar and special short-form escapes. This is handled in
+  // yaml::escape.
+  if (MustQuote == QuotingType::Double) {
+    output(yaml::escape(S, /* EscapePrintable= */ false));
+    output(Quote);
+    return;
+  }
+
+  unsigned i = 0;
+  unsigned j = 0;
+  unsigned End = S.size();
+  const char *Base = S.data();
+
+  // When using single-quoted strings, any single quote ' must be doubled to be
+  // escaped.
+  while (j < End) {
+    if (S[j] == '\'') {                   // Escape quotes.
+      output(StringRef(&Base[i], j - i)); // "flush".
+      output(StringLiteral("''"));        // Print it as ''
+      i = j + 1;
+    }
+    ++j;
+  }
+  output(StringRef(&Base[i], j - i));
+  output(Quote); // Ending quote.
+}
+
 void Output::outputUpToEndOfLine(StringRef s) {
   output(s);
   if (StateStack.empty() || (!inFlowSeqAnyElement(StateStack.back()) &&
@@ -853,7 +861,7 @@ void Output::newLineCheck(bool EmptySequence) {
 }
 
 void Output::paddedKey(StringRef key) {
-  output(key);
+  output(key, needsQuotes(key, false));
   output(":");
   const char *spaces = "                ";
   if (key.size() < strlen(spaces))
@@ -872,7 +880,7 @@ void Output::flowKey(StringRef Key) {
     Column = ColumnAtMapFlowStart;
     output("  ");
   }
-  output(Key);
+  output(Key, needsQuotes(Key, false));
   output(": ");
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index f2f1c93ea225..4b2ce0d73949 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -76,7 +76,7 @@ def SME2p1Unsupported : AArch64Unsupported;
 
 def SME2Unsupported : AArch64Unsupported {
   let F = !listconcat([HasSME2, HasSVE2orSME2, HasSVE2p1_or_HasSME2, HasSSVE_FP8FMA,
-                      HasSMEF8F16, HasSMEF8F32],
+                      HasSMEF8F16, HasSMEF8F32, HasSMEF16F16orSMEF8F16],
                       SME2p1Unsupported.F);
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 3bf6283b79e9..dddc181b0314 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -178,13 +178,14 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes(
   }
 
   for (unsigned E = FT->getNumParams(); I != E; ++I) {
-    Align ParamAlign = AttrList.getParamAlignment(I).valueOrOne();
 #if 0
     // FIXME: Need more information about argument size; see
     // https://reviews.llvm.org/D132926
     uint64_t ArgSizeBytes = AttrList.getParamArm64ECArgSizeBytes(I);
+    Align ParamAlign = AttrList.getParamAlignment(I).valueOrOne();
 #else
     uint64_t ArgSizeBytes = 0;
+    Align ParamAlign = Align();
 #endif
     Type *Arm64Ty, *X64Ty;
     canonicalizeThunkType(FT->getParamType(I), ParamAlign,
@@ -294,7 +295,7 @@ void AArch64Arm64ECCallLowering::canonicalizeThunkType(
     uint64_t TotalSizeBytes = ElementCnt * ElementSizePerBytes;
     if (ElementTy->isFloatTy() || ElementTy->isDoubleTy()) {
       Out << (ElementTy->isFloatTy() ? "F" : "D") << TotalSizeBytes;
-      if (Alignment.value() >= 8 && !T->isPointerTy())
+      if (Alignment.value() >= 16 && !Ret)
         Out << "a" << Alignment.value();
       Arm64Ty = T;
       if (TotalSizeBytes <= 8) {
@@ -325,7 +326,7 @@ void AArch64Arm64ECCallLowering::canonicalizeThunkType(
   Out << "m";
   if (TypeSize != 4)
     Out << TypeSize;
-  if (Alignment.value() >= 8 && !T->isPointerTy())
+  if (Alignment.value() >= 16 && !Ret)
     Out << "a" << Alignment.value();
   // FIXME: Try to canonicalize Arm64Ty more thoroughly?
   Arm64Ty = T;
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index efda45a72ef4..b6c8e5f16089 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -9,32 +9,43 @@
 //
 //===----------------------------------------------------------------------===//
 
+// A SubtargetFeature that can be toggled from the command line, and therefore
+// has an AEK_* entry in ArmExtKind.
+class Extension<
+  string TargetFeatureName,            // String used for -target-feature.
+  string Spelling,                     // The XYZ in HasXYZ and AEK_XYZ.
+  string Desc,                         // Description.
+  list<SubtargetFeature> Implies = []  // List of dependent features.
+> : SubtargetFeature<TargetFeatureName, "Has" # Spelling, "true", Desc, Implies>
+{
+    string ArchExtKindSpelling = "AEK_" # Spelling; // ArchExtKind enum name.
+}
+
 // Each SubtargetFeature which corresponds to an Arm Architecture feature should
 // be annotated with the respective FEAT_ feature name from the Architecture
 // Reference Manual. If a SubtargetFeature enables instructions from multiple
 // Arm Architecture Features, it should list all the relevant features. Not all
 // FEAT_ features have a corresponding SubtargetFeature.
 
-def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
-                                       "Enable ARMv8 FP (FEAT_FP)">;
+def FeatureFPARMv8 : Extension<"fp-armv8", "FPARMv8", "Enable ARMv8 (FEAT_FP)">;
 
-def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
+def FeatureNEON : Extension<"neon", "NEON",
   "Enable Advanced SIMD instructions (FEAT_AdvSIMD)", [FeatureFPARMv8]>;
 
-def FeatureSM4 : SubtargetFeature<
-    "sm4", "HasSM4", "true",
+def FeatureSM4 : Extension<
+    "sm4", "SM4",
     "Enable SM3 and SM4 support (FEAT_SM4, FEAT_SM3)", [FeatureNEON]>;
 
-def FeatureSHA2 : SubtargetFeature<
-    "sha2", "HasSHA2", "true",
+def FeatureSHA2 : Extension<
+    "sha2", "SHA2",
     "Enable SHA1 and SHA256 support (FEAT_SHA1, FEAT_SHA256)", [FeatureNEON]>;
 
-def FeatureSHA3 : SubtargetFeature<
-    "sha3", "HasSHA3", "true",
+def FeatureSHA3 : Extension<
+    "sha3", "SHA3",
     "Enable SHA512 and SHA3 support (FEAT_SHA3, FEAT_SHA512)", [FeatureNEON, FeatureSHA2]>;
 
-def FeatureAES : SubtargetFeature<
-    "aes", "HasAES", "true",
+def FeatureAES : Extension<
+    "aes", "AES",
     "Enable AES support (FEAT_AES, FEAT_PMULL)", [FeatureNEON]>;
 
 // Crypto has been split up and any combination is now valid (see the
@@ -45,20 +56,20 @@ def FeatureAES : SubtargetFeature<
 // meaning anymore. We kept the Crypto definition here for backward
 // compatibility, and now imply features SHA2 and AES, which was the
 // "traditional" meaning of Crypto.
-def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
+def FeatureCrypto : Extension<"crypto", "Crypto",
   "Enable cryptographic instructions", [FeatureNEON, FeatureSHA2, FeatureAES]>;
 
-def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+def FeatureCRC : Extension<"crc", "CRC",
   "Enable ARMv8 CRC-32 checksum instructions (FEAT_CRC32)">;
 
-def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
+def FeatureRAS : Extension<"ras", "RAS",
   "Enable ARMv8 Reliability, Availability and Serviceability Extensions (FEAT_RAS, FEAT_RASv1p1)">;
 
-def FeatureRASv2 : SubtargetFeature<"rasv2", "HasRASv2", "true",
+def FeatureRASv2 : Extension<"rasv2", "RASv2",
   "Enable ARMv8.9-A Reliability, Availability and Serviceability Extensions (FEAT_RASv2)",
   [FeatureRAS]>;
 
-def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
+def FeatureLSE : Extension<"lse", "LSE",
   "Enable ARMv8.1 Large System Extension (LSE) atomic instructions (FEAT_LSE)">;
 
 def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true",
@@ -70,7 +81,7 @@ def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics"
 def FeatureFMV : SubtargetFeature<"fmv", "HasFMV", "true",
   "Enable Function Multi Versioning support.">;
 
-def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true",
+def FeatureRDM : Extension<"rdm", "RDM",
   "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions (FEAT_RDM)",
   [FeatureNEON]>;
 
@@ -91,16 +102,16 @@ def FeatureVH : SubtargetFeature<"vh", "HasVH", "true",
 // This SubtargetFeature is special. It controls only whether codegen will turn
 // `llvm.readcyclecounter()` into an access to a PMUv3 System Register. The
 // `FEAT_PMUv3*` system registers are always available for assembly/disassembly.
-def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
+def FeaturePerfMon : Extension<"perfmon", "PerfMon",
   "Enable Code Generation for ARMv8 PMUv3 Performance Monitors extension (FEAT_PMUv3)">;
 
-def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
+def FeatureFullFP16 : Extension<"fullfp16", "FullFP16",
   "Full FP16 (FEAT_FP16)", [FeatureFPARMv8]>;
 
-def FeatureFP16FML : SubtargetFeature<"fp16fml", "HasFP16FML", "true",
+def FeatureFP16FML : Extension<"fp16fml", "FP16FML",
   "Enable FP16 FML instructions (FEAT_FHM)", [FeatureFullFP16]>;
 
-def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true",
+def FeatureSPE : Extension<"spe", "SPE",
   "Enable Statistical Profiling extension (FEAT_SPE)">;
 
 def FeaturePAN_RWV : SubtargetFeature<
@@ -115,13 +126,13 @@ def FeaturePsUAO : SubtargetFeature< "uaops", "HasPsUAO", "true",
 def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP",
     "true", "Enable v8.2 data Cache Clean to Point of Persistence (FEAT_DPB)" >;
 
-def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true",
+def FeatureSVE : Extension<"sve", "SVE",
   "Enable Scalable Vector Extension (SVE) instructions (FEAT_SVE)", [FeatureFullFP16]>;
 
-def FeatureFPMR : SubtargetFeature<"fpmr", "HasFPMR", "true",
+def FeatureFPMR : Extension<"fpmr", "FPMR",
   "Enable FPMR Register (FEAT_FPMR)">;
 
-def FeatureFP8 : SubtargetFeature<"fp8", "HasFP8", "true",
+def FeatureFP8 : Extension<"fp8", "FP8",
   "Enable FP8 instructions (FEAT_FP8)">;
 
 // This flag is currently still labeled as Experimental, but when fully
@@ -145,33 +156,33 @@ def FeatureExperimentalZeroingPseudos
 def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl",
   "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">;
 
-def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
-    "true", "Enable BFloat16 Extension (FEAT_BF16)" >;
+def FeatureBF16 : Extension<"bf16", "BF16",
+    "Enable BFloat16 Extension (FEAT_BF16)" >;
 
 def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r",
   "NoSVEFPLD1R", "true", "Avoid using LD1RX instructions for FP">;
 
-def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true",
+def FeatureSVE2 : Extension<"sve2", "SVE2",
   "Enable Scalable Vector Extension 2 (SVE2) instructions (FEAT_SVE2)",
   [FeatureSVE, FeatureUseScalarIncVL]>;
 
-def FeatureSVE2AES : SubtargetFeature<"sve2-aes", "HasSVE2AES", "true",
+def FeatureSVE2AES : Extension<"sve2-aes", "SVE2AES",
   "Enable AES SVE2 instructions (FEAT_SVE_AES, FEAT_SVE_PMULL128)",
   [FeatureSVE2, FeatureAES]>;
 
-def FeatureSVE2SM4 : SubtargetFeature<"sve2-sm4", "HasSVE2SM4", "true",
+def FeatureSVE2SM4 : Extension<"sve2-sm4", "SVE2SM4",
   "Enable SM4 SVE2 instructions (FEAT_SVE_SM4)", [FeatureSVE2, FeatureSM4]>;
 
-def FeatureSVE2SHA3 : SubtargetFeature<"sve2-sha3", "HasSVE2SHA3", "true",
+def FeatureSVE2SHA3 : Extension<"sve2-sha3", "SVE2SHA3",
   "Enable SHA3 SVE2 instructions (FEAT_SVE_SHA3)", [FeatureSVE2, FeatureSHA3]>;
 
-def FeatureSVE2BitPerm : SubtargetFeature<"sve2-bitperm", "HasSVE2BitPerm", "true",
+def FeatureSVE2BitPerm : Extension<"sve2-bitperm", "SVE2BitPerm",
   "Enable bit permutation SVE2 instructions (FEAT_SVE_BitPerm)", [FeatureSVE2]>;
 
-def FeatureSVE2p1: SubtargetFeature<"sve2p1", "HasSVE2p1", "true",
+def FeatureSVE2p1: Extension<"sve2p1", "SVE2p1",
   "Enable Scalable Vector Extension 2.1 instructions", [FeatureSVE2]>;
 
-def FeatureB16B16 : SubtargetFeature<"b16b16", "HasB16B16", "true",
+def FeatureB16B16 : Extension<"b16b16", "B16B16",
   "Enable SVE2.1 or SME2.1 non-widening BFloat16 to BFloat16 instructions (FEAT_B16B16)", [FeatureBF16]>;
 
 def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
@@ -303,23 +314,23 @@ def FeatureForce32BitJumpTables
    : SubtargetFeature<"force-32bit-jump-tables", "Force32BitJumpTables", "true",
                       "Force jump table entries to be 32-bits wide except at MinSize">;
 
-def FeatureRCPC : SubtargetFeature<"rcpc", "HasRCPC", "true",
-                                   "Enable support for RCPC extension (FEAT_LRCPC)">;
+def FeatureRCPC : Extension<"rcpc", "RCPC",
+    "Enable support for RCPC extension (FEAT_LRCPC)">;
 
 def FeatureUseRSqrt : SubtargetFeature<
     "use-reciprocal-square-root", "UseRSqrt", "true",
     "Use the reciprocal square root approximation">;
 
-def FeatureDotProd : SubtargetFeature<
-    "dotprod", "HasDotProd", "true",
+def FeatureDotProd : Extension<
+    "dotprod", "DotProd",
     "Enable dot product support (FEAT_DotProd)", [FeatureNEON]>;
 
-def FeaturePAuth : SubtargetFeature<
-    "pauth", "HasPAuth", "true",
+def FeaturePAuth : Extension<
+    "pauth", "PAuth",
     "Enable v8.3-A Pointer Authentication extension (FEAT_PAuth)">;
 
-def FeatureJS : SubtargetFeature<
-    "jsconv", "HasJS", "true",
+def FeatureJS : Extension<
+    "jsconv", "JS",
     "Enable v8.3-A JavaScript FP conversion instructions (FEAT_JSCVT)",
     [FeatureFPARMv8]>;
 
@@ -327,8 +338,8 @@ def FeatureCCIDX : SubtargetFeature<
     "ccidx", "HasCCIDX", "true",
     "Enable v8.3-A Extend of the CCSIDR number of sets (FEAT_CCIDX)">;
 
-def FeatureComplxNum : SubtargetFeature<
-    "complxnum", "HasComplxNum", "true",
+def FeatureComplxNum : Extension<
+    "complxnum", "ComplxNum",
     "Enable v8.3-A Floating-point complex number support (FEAT_FCMA)",
     [FeatureNEON]>;
 
@@ -365,8 +376,8 @@ def FeatureTLB_RMI : SubtargetFeature<
     "tlb-rmi", "HasTLB_RMI", "true",
     "Enable v8.4-A TLB Range and Maintenance Instructions (FEAT_TLBIOS, FEAT_TLBIRANGE)">;
 
-def FeatureFlagM : SubtargetFeature<
-    "flagm", "HasFlagM", "true",
+def FeatureFlagM : Extension<
+    "flagm", "FlagM",
     "Enable v8.4-A Flag Manipulation Instructions (FEAT_FlagM)">;
 
 // 8.4 RCPC enchancements: LDAPR & STLR instructions with Immediate Offset
@@ -414,50 +425,50 @@ def FeatureFRInt3264 : SubtargetFeature<"fptoint", "HasFRInt3264", "true",
 def FeatureSpecRestrict : SubtargetFeature<"specrestrict", "HasSpecRestrict",
   "true", "Enable architectural speculation restriction (FEAT_CSV2_2)">;
 
-def FeatureSB : SubtargetFeature<"sb", "HasSB",
-  "true", "Enable v8.5 Speculation Barrier (FEAT_SB)" >;
+def FeatureSB : Extension<"sb", "SB",
+  "Enable v8.5 Speculation Barrier (FEAT_SB)" >;
 
-def FeatureSSBS : SubtargetFeature<"ssbs", "HasSSBS",
-  "true", "Enable Speculative Store Bypass Safe bit (FEAT_SSBS, FEAT_SSBS2)" >;
+def FeatureSSBS : Extension<"ssbs", "SSBS",
+  "Enable Speculative Store Bypass Safe bit (FEAT_SSBS, FEAT_SSBS2)" >;
 
-def FeaturePredRes : SubtargetFeature<"predres", "HasPredRes", "true",
+def FeaturePredRes : Extension<"predres", "PredRes",
   "Enable v8.5a execution and data prediction invalidation instructions (FEAT_SPECRES)" >;
 
-def FeatureCacheDeepPersist : SubtargetFeature<"ccdp", "HasCCDP",
-    "true", "Enable v8.5 Cache Clean to Point of Deep Persistence (FEAT_DPB2)" >;
+def FeatureCacheDeepPersist : Extension<"ccdp", "CCDP",
+    "Enable v8.5 Cache Clean to Point of Deep Persistence (FEAT_DPB2)" >;
 
-def FeatureBranchTargetId : SubtargetFeature<"bti", "HasBTI",
-    "true", "Enable Branch Target Identification (FEAT_BTI)" >;
+def FeatureBranchTargetId : Extension<"bti", "BTI",
+    "Enable Branch Target Identification (FEAT_BTI)" >;
 
-def FeatureRandGen : SubtargetFeature<"rand", "HasRandGen",
-    "true", "Enable Random Number generation instructions (FEAT_RNG)" >;
+def FeatureRandGen : Extension<"rand", "RandGen",
+    "Enable Random Number generation instructions (FEAT_RNG)" >;
 
-def FeatureMTE : SubtargetFeature<"mte", "HasMTE",
-    "true", "Enable Memory Tagging Extension (FEAT_MTE, FEAT_MTE2)" >;
+def FeatureMTE : Extension<"mte", "MTE",
+    "Enable Memory Tagging Extension (FEAT_MTE, FEAT_MTE2)" >;
 
-def FeatureTRBE : SubtargetFeature<"trbe", "HasTRBE",
-    "true", "Enable Trace Buffer Extension (FEAT_TRBE)">;
+def FeatureTRBE : Extension<"trbe", "TRBE",
+    "Enable Trace Buffer Extension (FEAT_TRBE)">;
 
-def FeatureETE : SubtargetFeature<"ete", "HasETE",
-    "true", "Enable Embedded Trace Extension (FEAT_ETE)",
+def FeatureETE : Extension<"ete", "ETE",
+    "Enable Embedded Trace Extension (FEAT_ETE)",
     [FeatureTRBE]>;
 
-def FeatureTME : SubtargetFeature<"tme", "HasTME",
-    "true", "Enable Transactional Memory Extension (FEAT_TME)" >;
+def FeatureTME : Extension<"tme", "TME",
+    "Enable Transactional Memory Extension (FEAT_TME)" >;
 
 def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
     "AllowTaggedGlobals",
     "true", "Use an instruction sequence for taking the address of a global "
     "that allows a memory tag in the upper address bits">;
 
-def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
-    "true", "Enable Matrix Multiply Int8 Extension (FEAT_I8MM)">;
+def FeatureMatMulInt8 : Extension<"i8mm", "MatMulInt8",
+    "Enable Matrix Multiply Int8 Extension (FEAT_I8MM)">;
 
-def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32",
-    "true", "Enable Matrix Multiply FP32 Extension (FEAT_F32MM)", [FeatureSVE]>;
+def FeatureMatMulFP32 : Extension<"f32mm", "MatMulFP32",
+    "Enable Matrix Multiply FP32 Extension (FEAT_F32MM)", [FeatureSVE]>;
 
-def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64",
-    "true", "Enable Matrix Multiply FP64 Extension (FEAT_F64MM)", [FeatureSVE]>;
+def FeatureMatMulFP64 : Extension<"f64mm", "MatMulFP64",
+    "Enable Matrix Multiply FP64 Extension (FEAT_F64MM)", [FeatureSVE]>;
 
 def FeatureXS : SubtargetFeature<"xs", "HasXS",
     "true", "Enable Armv8.7-A limited-TLB-maintenance instruction (FEAT_XS)">;
@@ -468,20 +479,20 @@ def FeatureWFxT : SubtargetFeature<"wfxt", "HasWFxT",
 def FeatureHCX : SubtargetFeature<
     "hcx", "HasHCX", "true", "Enable Armv8.7-A HCRX_EL2 system register (FEAT_HCX)">;
 
-def FeatureLS64 : SubtargetFeature<"ls64", "HasLS64",
-    "true", "Enable Armv8.7-A LD64B/ST64B Accelerator Extension (FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA)">;
+def FeatureLS64 : Extension<"ls64", "LS64",
+    "Enable Armv8.7-A LD64B/ST64B Accelerator Extension (FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA)">;
 
-def FeatureHBC : SubtargetFeature<"hbc", "HasHBC",
-    "true", "Enable Armv8.8-A Hinted Conditional Branches Extension (FEAT_HBC)">;
+def FeatureHBC : Extension<"hbc", "HBC",
+    "Enable Armv8.8-A Hinted Conditional Branches Extension (FEAT_HBC)">;
 
-def FeatureMOPS : SubtargetFeature<"mops", "HasMOPS",
-    "true", "Enable Armv8.8-A memcpy and memset acceleration instructions (FEAT_MOPS)">;
+def FeatureMOPS : Extension<"mops", "MOPS",
+    "Enable Armv8.8-A memcpy and memset acceleration instructions (FEAT_MOPS)">;
 
 def FeatureNMI : SubtargetFeature<"nmi", "HasNMI",
     "true", "Enable Armv8.8-A Non-maskable Interrupts (FEAT_NMI, FEAT_GICv3_NMI)">;
 
-def FeatureBRBE : SubtargetFeature<"brbe", "HasBRBE",
-    "true", "Enable Branch Record Buffer Extension (FEAT_BRBE)">;
+def FeatureBRBE : Extension<"brbe", "BRBE",
+    "Enable Branch Record Buffer Extension (FEAT_BRBE)">;
 
 def FeatureSPE_EEF : SubtargetFeature<"spe-eef", "HasSPE_EEF",
     "true", "Enable extra register in the Statistical Profiling Extension (FEAT_SPEv1p2)">;
@@ -496,57 +507,57 @@ def FeatureEnhancedCounterVirtualization :
 def FeatureRME : SubtargetFeature<"rme", "HasRME",
     "true", "Enable Realm Management Extension (FEAT_RME)">;
 
-def FeatureSME : SubtargetFeature<"sme", "HasSME", "true",
+def FeatureSME : Extension<"sme", "SME",
   "Enable Scalable Matrix Extension (SME) (FEAT_SME)", [FeatureBF16, FeatureUseScalarIncVL]>;
 
-def FeatureSMEF64F64 : SubtargetFeature<"sme-f64f64", "HasSMEF64F64", "true",
+def FeatureSMEF64F64 : Extension<"sme-f64f64", "SMEF64F64",
   "Enable Scalable Matrix Extension (SME) F64F64 instructions (FEAT_SME_F64F64)", [FeatureSME]>;
 
-def FeatureSMEI16I64 : SubtargetFeature<"sme-i16i64", "HasSMEI16I64", "true",
+def FeatureSMEI16I64 : Extension<"sme-i16i64", "SMEI16I64",
   "Enable Scalable Matrix Extension (SME) I16I64 instructions (FEAT_SME_I16I64)", [FeatureSME]>;
 
-def FeatureSMEF16F16 : SubtargetFeature<"sme-f16f16", "HasSMEF16F16", "true",
-  "Enable SME2.1 non-widening Float16 instructions (FEAT_SME_F16F16)", []>;
-
-def FeatureSMEFA64 : SubtargetFeature<"sme-fa64", "HasSMEFA64", "true",
+def FeatureSMEFA64 : Extension<"sme-fa64", "SMEFA64",
   "Enable the full A64 instruction set in streaming SVE mode (FEAT_SME_FA64)", [FeatureSME, FeatureSVE2]>;
 
-def FeatureSME2 : SubtargetFeature<"sme2", "HasSME2", "true",
+def FeatureSME2 : Extension<"sme2", "SME2",
   "Enable Scalable Matrix Extension 2 (SME2) instructions", [FeatureSME]>;
 
-def FeatureSME2p1 : SubtargetFeature<"sme2p1", "HasSME2p1", "true",
+def FeatureSMEF16F16 : Extension<"sme-f16f16", "SMEF16F16",
+  "Enable SME non-widening Float16 instructions (FEAT_SME_F16F16)", [FeatureSME2]>;
+
+def FeatureSME2p1 : Extension<"sme2p1", "SME2p1",
   "Enable Scalable Matrix Extension 2.1 (FEAT_SME2p1) instructions", [FeatureSME2]>;
 
-def FeatureFAMINMAX: SubtargetFeature<"faminmax", "HasFAMINMAX", "true",
+def FeatureFAMINMAX: Extension<"faminmax", "FAMINMAX",
    "Enable FAMIN and FAMAX instructions (FEAT_FAMINMAX)">;
 
-def FeatureFP8FMA : SubtargetFeature<"fp8fma", "HasFP8FMA", "true",
+def FeatureFP8FMA : Extension<"fp8fma", "FP8FMA",
   "Enable fp8 multiply-add instructions (FEAT_FP8FMA)">;
 
-def FeatureSSVE_FP8FMA : SubtargetFeature<"ssve-fp8fma", "HasSSVE_FP8FMA", "true",
+def FeatureSSVE_FP8FMA : Extension<"ssve-fp8fma", "SSVE_FP8FMA",
   "Enable SVE2 fp8 multiply-add instructions (FEAT_SSVE_FP8FMA)", [FeatureSME2]>;
 
-def FeatureFP8DOT2: SubtargetFeature<"fp8dot2", "HasFP8DOT2", "true",
+def FeatureFP8DOT2: Extension<"fp8dot2", "FP8DOT2",
    "Enable fp8 2-way dot instructions (FEAT_FP8DOT2)">;
 
-def FeatureSSVE_FP8DOT2 : SubtargetFeature<"ssve-fp8dot2", "HasSSVE_FP8DOT2", "true",
+def FeatureSSVE_FP8DOT2 : Extension<"ssve-fp8dot2", "SSVE_FP8DOT2",
   "Enable SVE2 fp8 2-way dot product instructions (FEAT_SSVE_FP8DOT2)", [FeatureSME2]>;
 
-def FeatureFP8DOT4: SubtargetFeature<"fp8dot4", "HasFP8DOT4", "true",
+def FeatureFP8DOT4: Extension<"fp8dot4", "FP8DOT4",
    "Enable fp8 4-way dot instructions (FEAT_FP8DOT4)">;
 
-def FeatureSSVE_FP8DOT4 : SubtargetFeature<"ssve-fp8dot4", "HasSSVE_FP8DOT4", "true",
+def FeatureSSVE_FP8DOT4 : Extension<"ssve-fp8dot4", "SSVE_FP8DOT4",
   "Enable SVE2 fp8 4-way dot product instructions (FEAT_SSVE_FP8DOT4)", [FeatureSME2]>;
-def FeatureLUT: SubtargetFeature<"lut", "HasLUT", "true",
+def FeatureLUT: Extension<"lut", "LUT",
    "Enable Lookup Table instructions (FEAT_LUT)">;
 
-def FeatureSME_LUTv2 : SubtargetFeature<"sme-lutv2", "HasSME_LUTv2", "true",
+def FeatureSME_LUTv2 : Extension<"sme-lutv2", "SME_LUTv2",
   "Enable Scalable Matrix Extension (SME) LUTv2 instructions (FEAT_SME_LUTv2)">;
 
-def FeatureSMEF8F16 : SubtargetFeature<"sme-f8f16", "HasSMEF8F16", "true",
+def FeatureSMEF8F16 : Extension<"sme-f8f16", "SMEF8F16",
   "Enable Scalable Matrix Extension (SME) F8F16 instructions(FEAT_SME_F8F16)", [FeatureSME2, FeatureFP8]>;
 
-def FeatureSMEF8F32 : SubtargetFeature<"sme-f8f32", "HasSMEF8F32", "true",
+def FeatureSMEF8F32 : Extension<"sme-f8f32", "SMEF8F32",
   "Enable Scalable Matrix Extension (SME) F8F32 instructions (FEAT_SME_F8F32)", [FeatureSME2, FeatureFP8]>;
 
 def FeatureAppleA7SysReg  : SubtargetFeature<"apple-a7-sysreg", "HasAppleA7SysReg", "true",
@@ -558,7 +569,7 @@ def FeatureEL2VMSA : SubtargetFeature<"el2vmsa", "HasEL2VMSA", "true",
 def FeatureEL3 : SubtargetFeature<"el3", "HasEL3", "true",
   "Enable Exception Level 3">;
 
-def FeatureCSSC : SubtargetFeature<"cssc", "HasCSSC", "true",
+def FeatureCSSC : Extension<"cssc", "CSSC",
   "Enable Common Short Sequence Compression (CSSC) instructions (FEAT_CSSC)">;
 
 def FeatureFixCortexA53_835769 : SubtargetFeature<"fix-cortex-a53-835769",
@@ -572,8 +583,8 @@ def FeatureNoBTIAtReturnTwice : SubtargetFeature<"no-bti-at-return-twice",
 def FeatureCHK : SubtargetFeature<"chk", "HasCHK",
     "true", "Enable Armv8.0-A Check Feature Status Extension (FEAT_CHK)">;
 
-def FeatureGCS : SubtargetFeature<"gcs", "HasGCS",
-    "true", "Enable Armv9.4-A Guarded Call Stack Extension", [FeatureCHK]>;
+def FeatureGCS : Extension<"gcs", "GCS",
+    "Enable Armv9.4-A Guarded Call Stack Extension", [FeatureCHK]>;
 
 def FeatureCLRBHB : SubtargetFeature<"clrbhb", "HasCLRBHB",
     "true", "Enable Clear BHB instruction (FEAT_CLRBHB)">;
@@ -581,32 +592,32 @@ def FeatureCLRBHB : SubtargetFeature<"clrbhb", "HasCLRBHB",
 def FeaturePRFM_SLC : SubtargetFeature<"prfm-slc-target", "HasPRFM_SLC",
     "true", "Enable SLC target for PRFM instruction">;
 
-def FeatureSPECRES2 : SubtargetFeature<"specres2", "HasSPECRES2",
-    "true", "Enable Speculation Restriction Instruction (FEAT_SPECRES2)",
+def FeatureSPECRES2 : Extension<"specres2", "SPECRES2",
+    "Enable Speculation Restriction Instruction (FEAT_SPECRES2)",
     [FeaturePredRes]>;
 
 def FeatureMEC : SubtargetFeature<"mec", "HasMEC",
     "true", "Enable Memory Encryption Contexts Extension", [FeatureRME]>;
 
-def FeatureITE : SubtargetFeature<"ite", "HasITE",
-    "true", "Enable Armv9.4-A Instrumentation Extension FEAT_ITE", [FeatureETE,
+def FeatureITE : Extension<"ite", "ITE",
+    "Enable Armv9.4-A Instrumentation Extension FEAT_ITE", [FeatureETE,
     FeatureTRBE]>;
 
-def FeatureRCPC3 : SubtargetFeature<"rcpc3", "HasRCPC3",
-    "true", "Enable Armv8.9-A RCPC instructions for A64 and Advanced SIMD and floating-point instruction set (FEAT_LRCPC3)",
+def FeatureRCPC3 : Extension<"rcpc3", "RCPC3",
+    "Enable Armv8.9-A RCPC instructions for A64 and Advanced SIMD and floating-point instruction set (FEAT_LRCPC3)",
     [FeatureRCPC_IMMO]>;
 
-def FeatureTHE : SubtargetFeature<"the", "HasTHE",
-    "true", "Enable Armv8.9-A Translation Hardening Extension (FEAT_THE)">;
+def FeatureTHE : Extension<"the", "THE",
+    "Enable Armv8.9-A Translation Hardening Extension (FEAT_THE)">;
 
-def FeatureLSE128 : SubtargetFeature<"lse128", "HasLSE128",
-    "true", "Enable Armv9.4-A 128-bit Atomic Instructions (FEAT_LSE128)",
+def FeatureLSE128 : Extension<"lse128", "LSE128",
+    "Enable Armv9.4-A 128-bit Atomic Instructions (FEAT_LSE128)",
     [FeatureLSE]>;
 
 // FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, and FEAT_SYSINSTR128 are mutually implicit.
 // Therefore group them all under a single feature flag, d128:
-def FeatureD128 : SubtargetFeature<"d128", "HasD128",
-    "true", "Enable Armv9.4-A 128-bit Page Table Descriptors, System Registers "
+def FeatureD128 : Extension<"d128", "D128",
+    "Enable Armv9.4-A 128-bit Page Table Descriptors, System Registers "
     "and Instructions (FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, FEAT_SYSINSTR128)",
     [FeatureLSE128]>;
 
@@ -624,13 +635,13 @@ def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedO
 
 // AArch64 2023 Architecture Extensions (v9.5-A)
 
-def FeatureCPA : SubtargetFeature<"cpa", "HasCPA", "true",
+def FeatureCPA : Extension<"cpa", "CPA",
     "Enable Armv9.5-A Checked Pointer Arithmetic (FEAT_CPA)">;
 
-def FeaturePAuthLR : SubtargetFeature<"pauth-lr", "HasPAuthLR",
-    "true", "Enable Armv9.5-A PAC enhancements (FEAT_PAuth_LR)">;
+def FeaturePAuthLR : Extension<"pauth-lr", "PAuthLR",
+    "Enable Armv9.5-A PAC enhancements (FEAT_PAuth_LR)">;
 
-def FeatureTLBIW : SubtargetFeature<"tlbiw", "HasTLBIW", "true",
+def FeatureTLBIW : Extension<"tlbiw", "TLBIW",
   "Enable ARMv9.5-A TLBI VMALL for Dirty State (FEAT_TLBIW)">;
 
 
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 419c141121c3..c86c98eed24f 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1296,7 +1296,7 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
   }
   case AArch64::LDPQpost:
     Imm = -Imm;
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   case AArch64::STPQpre: {
     unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
     unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 892b5853e00e..2af679e0755b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -842,6 +842,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
   setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
+  setOperationAction(ISD::GET_FPMODE, MVT::i32, Custom);
+  setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
+  setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
 
   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
   if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
@@ -1674,6 +1677,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
 
     setOperationAction(ISD::VSCALE, MVT::i32, Custom);
+
+    for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
+      setOperationAction(ISD::INTRINSIC_WO_CHAIN, VT, Custom);
   }
 
   if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
@@ -4870,6 +4876,65 @@ SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
   return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
 }
 
+SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Chain = Op->getOperand(0);
+
+  // Get current value of FPCR.
+  SDValue Ops[] = {
+      Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
+  SDValue FPCR =
+      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
+  Chain = FPCR.getValue(1);
+  FPCR = FPCR.getValue(0);
+
+  // Truncate FPCR to 32 bits.
+  SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
+
+  return DAG.getMergeValues({Result, Chain}, DL);
+}
+
+SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Chain = Op->getOperand(0);
+  SDValue Mode = Op->getOperand(1);
+
+  // Extend the specified value to 64 bits.
+  SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
+
+  // Set new value of FPCR.
+  SDValue Ops2[] = {
+      Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
+  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
+}
+
+SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Chain = Op->getOperand(0);
+
+  // Get current value of FPCR.
+  SDValue Ops[] = {
+      Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
+  SDValue FPCR =
+      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
+  Chain = FPCR.getValue(1);
+  FPCR = FPCR.getValue(0);
+
+  // Clear bits that are not reserved.
+  SDValue FPSCRMasked = DAG.getNode(
+      ISD::AND, DL, MVT::i64, FPCR,
+      DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64));
+
+  // Set new value of FPCR.
+  SDValue Ops2[] = {Chain,
+                    DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
+                    FPSCRMasked};
+  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
+}
+
 static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
                                  SDLoc DL, bool &IsMLA) {
   bool IsN0SExt = isSignExtended(N0, DAG);
@@ -5686,8 +5751,24 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::get_active_lane_mask: {
     SDValue ID =
         DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
-    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
-                       Op.getOperand(1), Op.getOperand(2));
+
+    EVT VT = Op.getValueType();
+    if (VT.isScalableVector())
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
+                         Op.getOperand(2));
+
+    // We can use the SVE whilelo instruction to lower this intrinsic by
+    // creating the appropriate sequence of scalable vector operations and
+    // then extracting a fixed-width subvector from the scalable vector.
+
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+    EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
+
+    SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
+                               Op.getOperand(1), Op.getOperand(2));
+    SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
+                       DAG.getVectorIdxConstant(0, dl));
   }
   case Intrinsic::aarch64_neon_uaddlv: {
     EVT OpVT = Op.getOperand(1).getValueType();
@@ -6484,6 +6565,12 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerGET_ROUNDING(Op, DAG);
   case ISD::SET_ROUNDING:
     return LowerSET_ROUNDING(Op, DAG);
+  case ISD::GET_FPMODE:
+    return LowerGET_FPMODE(Op, DAG);
+  case ISD::SET_FPMODE:
+    return LowerSET_FPMODE(Op, DAG);
+  case ISD::RESET_FPMODE:
+    return LowerRESET_FPMODE(Op, DAG);
   case ISD::MUL:
     return LowerMUL(Op, DAG);
   case ISD::MULHS:
@@ -16330,7 +16417,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     IntrinsicInst *DI, LoadInst *LI) const {
   // Only deinterleave2 supported at present.
-  if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+  if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
     return false;
 
   // Only a factor of 2 supported at present.
@@ -16405,7 +16492,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
 bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
     IntrinsicInst *II, StoreInst *SI) const {
   // Only interleave2 supported at present.
-  if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+  if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
     return false;
 
   // Only a factor of 2 supported at present.
@@ -17585,12 +17672,32 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
     return false;
   };
 
+  // Can the const C be decomposed into (2^M + 1) * 2^N + 1), eg:
+  // C = 11 is equal to (1+4)*2+1, we don't decompose it into (1+2)*4-1 as
+  // the (2^N - 1) can't be execused via a single instruction.
+  auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
+    APInt CVMinus1 = C - 1;
+    if (CVMinus1.isNegative())
+      return false;
+    unsigned TrailingZeroes = CVMinus1.countr_zero();
+    APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
+    if (SCVMinus1.isPowerOf2()) {
+      unsigned BitWidth = SCVMinus1.getBitWidth();
+      M = APInt(BitWidth, SCVMinus1.logBase2());
+      N = APInt(BitWidth, TrailingZeroes);
+      return true;
+    }
+    return false;
+  };
+
   if (ConstValue.isNonNegative()) {
     // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
     // (mul x, 2^N - 1) => (sub (shl x, N), x)
     // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
     // (mul x, (2^M + 1) * (2^N + 1))
     //     => MV = (add (shl x, M), x); (add (shl MV, N), MV)
+    // (mul x, (2^M + 1) * 2^N + 1))
+    //     =>  MV = add (shl x, M), x); add (shl MV, N), x)
     APInt SCVMinus1 = ShiftedConstValue - 1;
     APInt SCVPlus1 = ShiftedConstValue + 1;
     APInt CVPlus1 = ConstValue + 1;
@@ -17604,18 +17711,29 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
     } else if (SCVPlus1.isPowerOf2()) {
       ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
       return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
-    } else if (Subtarget->hasALULSLFast() &&
-               isPowPlusPlusConst(ConstValue, CVM, CVN)) {
+    }
+    if (Subtarget->hasALULSLFast() &&
+        isPowPlusPlusConst(ConstValue, CVM, CVN)) {
       APInt CVMMinus1 = CVM - 1;
       APInt CVNMinus1 = CVN - 1;
       unsigned ShiftM1 = CVMMinus1.logBase2();
       unsigned ShiftN1 = CVNMinus1.logBase2();
-      // LSLFast implicate that Shifts <= 3 places are fast
-      if (ShiftM1 <= 3 && ShiftN1 <= 3) {
+      // ALULSLFast implicate that Shifts <= 4 places are fast
+      if (ShiftM1 <= 4 && ShiftN1 <= 4) {
         SDValue MVal = Add(Shl(N0, ShiftM1), N0);
         return Add(Shl(MVal, ShiftN1), MVal);
       }
     }
+    if (Subtarget->hasALULSLFast() &&
+        isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
+      unsigned ShiftM = CVM.getZExtValue();
+      unsigned ShiftN = CVN.getZExtValue();
+      // ALULSLFast implicate that Shifts <= 4 places are fast
+      if (ShiftM <= 4 && ShiftN <= 4) {
+        SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
+        return Add(Shl(MVal, CVN.getZExtValue()), N0);
+      }
+    }
   } else {
     // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
     // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
@@ -18632,14 +18750,12 @@ static SDValue performConcatVectorsCombine(SDNode *N,
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
-  // Optimise concat_vectors of two [us]avgceils or [us]avgfloors with a 128-bit
-  // destination size, combine into an avg of two contacts of the source
-  // vectors. eg: concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c),
-  // concat(b, d))
+  // Optimise concat_vectors of two identical binops with a 128-bit destination
+  // size, combine into an binop of two contacts of the source vectors. eg:
+  // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
   if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
-      (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
-       N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS) &&
-      N0->hasOneUse() && N1->hasOneUse()) {
+      DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
+      N1->hasOneUse()) {
     SDValue N00 = N0->getOperand(0);
     SDValue N01 = N0->getOperand(1);
     SDValue N10 = N1->getOperand(0);
@@ -20433,39 +20549,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
   switch (IID) {
   default:
     break;
-  case Intrinsic::get_active_lane_mask: {
-    SDValue Res = SDValue();
-    EVT VT = N->getValueType(0);
-    if (VT.isFixedLengthVector()) {
-      // We can use the SVE whilelo instruction to lower this intrinsic by
-      // creating the appropriate sequence of scalable vector operations and
-      // then extracting a fixed-width subvector from the scalable vector.
-
-      SDLoc DL(N);
-      SDValue ID =
-          DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
-
-      EVT WhileVT = EVT::getVectorVT(
-          *DAG.getContext(), MVT::i1,
-          ElementCount::getScalable(VT.getVectorNumElements()));
-
-      // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
-      EVT PromVT = getPromotedVTForPredicate(WhileVT);
-
-      // Get the fixed-width equivalent of PromVT for extraction.
-      EVT ExtVT =
-          EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
-                           VT.getVectorElementCount());
-
-      Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
-                        N->getOperand(1), N->getOperand(2));
-      Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
-      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
-                        DAG.getConstant(0, DL, MVT::i64));
-      Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
-    }
-    return Res;
-  }
   case Intrinsic::aarch64_neon_vcvtfxs2fp:
   case Intrinsic::aarch64_neon_vcvtfxu2fp:
     return tryCombineFixedPointConvert(N, DCI, DAG);
@@ -25539,8 +25622,6 @@ void AArch64TargetLowering::ReplaceNodeResults(
     return;
   case ISD::INTRINSIC_WO_CHAIN: {
     EVT VT = N->getValueType(0);
-    assert((VT == MVT::i8 || VT == MVT::i16) &&
-           "custom lowering for unexpected type");
 
     Intrinsic::ID IntID =
         static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
@@ -25548,6 +25629,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
     default:
       return;
     case Intrinsic::aarch64_sve_clasta_n: {
+      assert((VT == MVT::i8 || VT == MVT::i16) &&
+             "custom lowering for unexpected type");
       SDLoc DL(N);
       auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
       auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
@@ -25556,6 +25639,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
       return;
     }
     case Intrinsic::aarch64_sve_clastb_n: {
+      assert((VT == MVT::i8 || VT == MVT::i16) &&
+             "custom lowering for unexpected type");
       SDLoc DL(N);
       auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
       auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
@@ -25564,6 +25649,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
       return;
     }
     case Intrinsic::aarch64_sve_lasta: {
+      assert((VT == MVT::i8 || VT == MVT::i16) &&
+             "custom lowering for unexpected type");
       SDLoc DL(N);
       auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
                            N->getOperand(1), N->getOperand(2));
@@ -25571,12 +25658,28 @@ void AArch64TargetLowering::ReplaceNodeResults(
       return;
     }
     case Intrinsic::aarch64_sve_lastb: {
+      assert((VT == MVT::i8 || VT == MVT::i16) &&
+             "custom lowering for unexpected type");
       SDLoc DL(N);
       auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
                            N->getOperand(1), N->getOperand(2));
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
       return;
     }
+    case Intrinsic::get_active_lane_mask: {
+      if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
+        return;
+
+      // NOTE: Only trivial type promotion is supported.
+      EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
+      if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
+        return;
+
+      SDLoc DL(N);
+      auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
+      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
+      return;
+    }
     }
   }
   case ISD::READ_REGISTER: {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 400368a5e130..fbdc4de5617f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -523,6 +523,9 @@ enum Rounding {
 // Bit position of rounding mode bits in FPCR.
 const unsigned RoundingBitsPos = 22;
 
+// Reserved bits should be preserved when modifying FPCR.
+const uint64_t ReservedFPControlBits = 0xfffffffff80040f8;
+
 // Registers used to pass function arguments.
 ArrayRef<MCPhysReg> getGPRArgRegs();
 ArrayRef<MCPhysReg> getFPRArgRegs();
@@ -1128,6 +1131,9 @@ private:
   SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGET_FPMODE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSET_FPMODE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerRESET_FPMODE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 7bf06e71a030..55fecc4b4845 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -6924,19 +6924,26 @@ genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
     assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
            "Unexpected instruction opcode.");
 
+  uint32_t Flags = Root.mergeFlagsWith(*AddMI);
+  Flags &= ~MachineInstr::NoSWrap;
+  Flags &= ~MachineInstr::NoUWrap;
+
   MachineInstrBuilder MIB1 =
       BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
           .addReg(RegA, getKillRegState(RegAIsKill))
-          .addReg(RegB, getKillRegState(RegBIsKill));
+          .addReg(RegB, getKillRegState(RegBIsKill))
+          .setMIFlags(Flags);
   MachineInstrBuilder MIB2 =
       BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
           .addReg(NewVR, getKillRegState(true))
-          .addReg(RegC, getKillRegState(RegCIsKill));
+          .addReg(RegC, getKillRegState(RegCIsKill))
+          .setMIFlags(Flags);
 
   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
   InsInstrs.push_back(MIB1);
   InsInstrs.push_back(MIB2);
   DelInstrs.push_back(AddMI);
+  DelInstrs.push_back(&Root);
 }
 
 /// When getMachineCombinerPatterns() finds potential patterns,
@@ -6966,13 +6973,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
     // ==> (A - B) - C
     genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
                      InstrIdxForVirtReg);
-    break;
+    return;
   case AArch64MachineCombinerPattern::SUBADD_OP2:
     // A - (B + C)
     // ==> (A - C) - B
     genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
                      InstrIdxForVirtReg);
-    break;
+    return;
   case AArch64MachineCombinerPattern::MULADDW_OP1:
   case AArch64MachineCombinerPattern::MULADDX_OP1:
     // MUL I=A,B,0
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index a7abb58064a5..17d96370c04a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -230,6 +230,12 @@ def HasSVE2p1_or_HasSME2
 def HasSVE2p1_or_HasSME2p1
     : Predicate<"Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()">,
                  AssemblerPredicateWithAll<(any_of FeatureSME2p1, FeatureSVE2p1), "sme2p1 or sve2p1">;
+
+def HasSMEF16F16orSMEF8F16
+    : Predicate<"Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16()">,
+                AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),
+                "sme-f16f16 or sme-f8f16">;
+
 // A subset of NEON instructions are legal in Streaming SVE execution mode,
 // they should be enabled if either has been specified.
 def HasNEONorSME
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index c50a8200dd89..f2286ae17dba 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -194,6 +194,11 @@ def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily",
                                "Cortex-R82 ARM processors", [
                                FeaturePostRAScheduler]>;
 
+def TuneR82AE : SubtargetFeature<"cortex-r82ae", "ARMProcFamily",
+                                 "CortexR82AE",
+                                 "Cortex-R82-AE ARM processors",
+                                 [FeaturePostRAScheduler]>;
+
 def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
                                   "Cortex-X1 ARM processors", [
                                   FeatureCmpBccFusion,
@@ -447,6 +452,15 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2
                                       FeatureEnableSelectOptimize,
                                       FeaturePredictableSelectIsExpensive]>;
 
+def TuneNeoverseN3 : SubtargetFeature<"neoversen3", "ARMProcFamily", "NeoverseN3",
+                                      "Neoverse N3 ARM processors", [
+                                      FeatureFuseAES,
+                                      FeaturePostRAScheduler,
+                                      FeatureALULSLFast,
+                                      FeatureFuseAdrpAdd,
+                                      FeatureEnableSelectOptimize,
+                                      FeaturePredictableSelectIsExpensive]>;
+
 def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB",
                                       "Neoverse 512-TVB ARM processors", [
                                       FeatureFuseAES,
@@ -476,6 +490,24 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
                                       FeatureEnableSelectOptimize,
                                       FeaturePredictableSelectIsExpensive]>;
 
+def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",
+                                      "Neoverse V3 ARM processors", [
+                                      FeatureFuseAES,
+                                      FeatureALULSLFast,
+                                      FeatureFuseAdrpAdd,
+                                      FeaturePostRAScheduler,
+                                      FeatureEnableSelectOptimize,
+                                      FeaturePredictableSelectIsExpensive]>;
+
+def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "NeoverseV3",
+                                      "Neoverse V3AE ARM processors", [
+                                      FeatureFuseAES,
+                                      FeatureALULSLFast,
+                                      FeatureFuseAdrpAdd,
+                                      FeaturePostRAScheduler,
+                                      FeatureEnableSelectOptimize,
+                                      FeaturePredictableSelectIsExpensive]>;
+
 def TuneSaphira  : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
                                    "Qualcomm Saphira processors", [
                                    FeaturePostRAScheduler,
@@ -640,7 +672,13 @@ def ProcessorFeatures {
   list<SubtargetFeature> R82  = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16,
                                  FeatureFP16FML, FeatureSSBS, FeaturePredRes,
                                  FeatureSB, FeatureRDM, FeatureDotProd,
-                                 FeatureComplxNum, FeatureJS];
+                                 FeatureComplxNum, FeatureJS,
+                                 FeatureCacheDeepPersist];
+  list<SubtargetFeature> R82AE = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16,
+                                  FeatureFP16FML, FeatureSSBS, FeaturePredRes,
+                                  FeatureSB, FeatureRDM, FeatureDotProd,
+                                  FeatureComplxNum, FeatureJS,
+                                  FeatureCacheDeepPersist];
   list<SubtargetFeature> X1   = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
                                  FeatureNEON, FeatureRCPC, FeaturePerfMon,
                                  FeatureSPE, FeatureFullFP16, FeatureDotProd,
@@ -715,6 +753,10 @@ def ProcessorFeatures {
                                        FeatureMatMulInt8, FeatureMTE, FeatureSVE2,
                                        FeatureSVE2BitPerm, FeatureTRBE,
                                        FeaturePerfMon];
+  list<SubtargetFeature> NeoverseN3 = [HasV9_2aOps, FeatureETE, FeatureFP16FML,
+                                      FeatureFullFP16, FeatureMTE, FeaturePerfMon,
+                                      FeatureRandGen, FeatureSPE, FeatureSPE_EEF,
+                                      FeatureSVE2BitPerm];
   list<SubtargetFeature> Neoverse512TVB = [HasV8_4aOps, FeatureBF16, FeatureCacheDeepPersist,
                                            FeatureCrypto, FeatureFPARMv8, FeatureFP16FML,
                                            FeatureFullFP16, FeatureMatMulInt8, FeatureNEON,
@@ -729,6 +771,14 @@ def ProcessorFeatures {
                                        FeaturePerfMon, FeatureETE, FeatureMatMulInt8,
                                        FeatureNEON, FeatureSVE2BitPerm, FeatureFP16FML,
                                        FeatureMTE, FeatureRandGen];
+  list<SubtargetFeature> NeoverseV3 = [HasV9_2aOps, FeatureETE, FeatureFP16FML,
+                                      FeatureFullFP16, FeatureLS64, FeatureMTE,
+                                      FeaturePerfMon, FeatureRandGen, FeatureSPE,
+                                      FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE];
+  list<SubtargetFeature> NeoverseV3AE = [HasV9_2aOps, FeatureETE, FeatureFP16FML,
+                                      FeatureFullFP16, FeatureLS64, FeatureMTE,
+                                      FeaturePerfMon, FeatureRandGen, FeatureSPE,
+                                      FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE];
   list<SubtargetFeature> Saphira    = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8,
                                        FeatureNEON, FeatureSPE, FeaturePerfMon];
   list<SubtargetFeature> ThunderX   = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
@@ -815,6 +865,8 @@ def : ProcessorModel<"cortex-a720ae", NeoverseN2Model, ProcessorFeatures.A720AE,
                      [TuneA720AE]>;
 def : ProcessorModel<"cortex-r82", CortexA55Model, ProcessorFeatures.R82,
                      [TuneR82]>;
+def : ProcessorModel<"cortex-r82ae", CortexA55Model, ProcessorFeatures.R82AE,
+                     [TuneR82AE]>;
 def : ProcessorModel<"cortex-x1", CortexA57Model, ProcessorFeatures.X1,
                      [TuneX1]>;
 def : ProcessorModel<"cortex-x1c", CortexA57Model, ProcessorFeatures.X1C,
@@ -831,12 +883,18 @@ def : ProcessorModel<"neoverse-n1", NeoverseN1Model,
                      ProcessorFeatures.NeoverseN1, [TuneNeoverseN1]>;
 def : ProcessorModel<"neoverse-n2", NeoverseN2Model,
                      ProcessorFeatures.NeoverseN2, [TuneNeoverseN2]>;
+def : ProcessorModel<"neoverse-n3", NeoverseN2Model,
+                     ProcessorFeatures.NeoverseN3, [TuneNeoverseN3]>;
 def : ProcessorModel<"neoverse-512tvb", NeoverseV1Model,
                      ProcessorFeatures.Neoverse512TVB, [TuneNeoverse512TVB]>;
 def : ProcessorModel<"neoverse-v1", NeoverseV1Model,
                      ProcessorFeatures.NeoverseV1, [TuneNeoverseV1]>;
 def : ProcessorModel<"neoverse-v2", NeoverseV2Model,
                      ProcessorFeatures.NeoverseV2, [TuneNeoverseV2]>;
+def : ProcessorModel<"neoverse-v3", NeoverseV2Model,
+                     ProcessorFeatures.NeoverseV3, [TuneNeoverseV3]>;
+def : ProcessorModel<"neoverse-v3ae", NeoverseV2Model,
+                     ProcessorFeatures.NeoverseV3AE, [TuneNeoverseV3AE]>;
 def : ProcessorModel<"exynos-m3", ExynosM3Model, ProcessorFeatures.ExynosM3,
                      [TuneExynosM3]>;
 def : ProcessorModel<"exynos-m4", ExynosM4Model, ProcessorFeatures.ExynosM4,
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2db0fa253434..574178c8d524 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -792,12 +792,14 @@ defm LUTI4_S_2ZTZI : sme2p1_luti4_vector_vg2_index<"luti4">;
 defm LUTI4_S_4ZTZI : sme2p1_luti4_vector_vg4_index<"luti4">;
 }
 
-let Predicates = [HasSME2p1, HasSMEF16F16] in {
+let Predicates = [HasSMEF16F16orSMEF8F16] in {
 defm FADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0100, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
 defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
 defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
 defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
+}
 
+let Predicates = [HasSMEF16F16] in {
 defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100, ZZ_h_mul_r, ZPR4b16>;
 defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16>;
 defm FMLA_VG2_M2ZZ_H :  sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16>;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 525ae79da996..62e68de1359f 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -508,6 +508,16 @@ def AArch64smulh : PatFrag<(ops node:$op1, node:$op2),
 def AArch64umulh : PatFrag<(ops node:$op1, node:$op2),
                            (AArch64umulh_p (SVEAnyPredicate), node:$op1, node:$op2)>;
 
+
+def AArch64bsl  : PatFrags<(ops node:$Op1, node:$Op2, node:$Op3),
+                           [(int_aarch64_sve_bsl node:$Op1, node:$Op2, node:$Op3),
+                            (AArch64bsp node:$Op3, node:$Op1, node:$Op2)]>;
+
+def AArch64nbsl : PatFrags<(ops node:$Op1, node:$Op2, node:$Op3),
+                           [(int_aarch64_sve_nbsl node:$Op1, node:$Op2, node:$Op3),
+                            (vnot (AArch64bsp node:$Op3, node:$Op1, node:$Op2))]>;
+
+
 let Predicates = [HasSVE] in {
   def RDFFR_PPz  : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
   def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
@@ -3757,10 +3767,10 @@ let Predicates = [HasSVE2orSME] in {
   // SVE2 bitwise ternary operations
   defm EOR3_ZZZZ  : sve2_int_bitwise_ternary_op<0b000, "eor3",  AArch64eor3>;
   defm BCAX_ZZZZ  : sve2_int_bitwise_ternary_op<0b010, "bcax",  AArch64bcax>;
-  defm BSL_ZZZZ   : sve2_int_bitwise_ternary_op<0b001, "bsl",   int_aarch64_sve_bsl, AArch64bsp>;
+  defm BSL_ZZZZ   : sve2_int_bitwise_ternary_op<0b001, "bsl",   AArch64bsl>;
   defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>;
   defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>;
-  defm NBSL_ZZZZ  : sve2_int_bitwise_ternary_op<0b111, "nbsl",  int_aarch64_sve_nbsl>;
+  defm NBSL_ZZZZ  : sve2_int_bitwise_ternary_op<0b111, "nbsl",  AArch64nbsl>;
 
   // SVE2 bitwise xor and rotate right by immediate
   defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>;
@@ -4119,7 +4129,7 @@ defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", AArch64fclamp>;
 // SME2.1 or SVE2.1 instructions
 //===----------------------------------------------------------------------===//
 let Predicates = [HasSVE2p1_or_HasSME2p1] in {
-defm FADDQV   : sve2p1_fp_reduction_q<0b000, "faddqv", int_aarch64_sve_addqv>;
+defm FADDQV   : sve2p1_fp_reduction_q<0b000, "faddqv", int_aarch64_sve_faddqv>;
 defm FMAXNMQV : sve2p1_fp_reduction_q<0b100, "fmaxnmqv", int_aarch64_sve_fmaxnmqv>;
 defm FMINNMQV : sve2p1_fp_reduction_q<0b101, "fminnmqv", int_aarch64_sve_fminnmqv>;
 defm FMAXQV   : sve2p1_fp_reduction_q<0b110, "fmaxqv", int_aarch64_sve_fmaxqv>;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index ef09a3cde495..5d185fcaefc4 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -117,6 +117,8 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
   case CortexA35:
   case CortexA53:
   case CortexA55:
+  case CortexR82:
+  case CortexR82AE:
     PrefFunctionAlignment = Align(16);
     PrefLoopAlignment = Align(16);
     MaxBytesForLoopAlignment = 8;
@@ -142,7 +144,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
   case CortexA78:
   case CortexA78AE:
   case CortexA78C:
-  case CortexR82:
   case CortexX1:
     PrefFunctionAlignment = Align(16);
     PrefLoopAlignment = Align(32);
@@ -234,7 +235,9 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
     MaxBytesForLoopAlignment = 16;
     break;
   case NeoverseN2:
+  case NeoverseN3:
   case NeoverseV2:
+  case NeoverseV3:
     PrefFunctionAlignment = Align(16);
     PrefLoopAlignment = Align(32);
     MaxBytesForLoopAlignment = 16;
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index a3b966aa6155..c9bba9bf6314 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -6136,6 +6136,7 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
   case Match_InvalidSVEVectorListMul2x16:
   case Match_InvalidSVEVectorListMul2x32:
   case Match_InvalidSVEVectorListMul2x64:
+  case Match_InvalidSVEVectorListMul2x128:
     return Error(Loc, "Invalid vector list, expected list with 2 consecutive "
                       "SVE vectors, where the first vector is a multiple of 2 "
                       "and with matching element types");
@@ -6143,6 +6144,7 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
   case Match_InvalidSVEVectorListMul4x16:
   case Match_InvalidSVEVectorListMul4x32:
   case Match_InvalidSVEVectorListMul4x64:
+  case Match_InvalidSVEVectorListMul4x128:
     return Error(Loc, "Invalid vector list, expected list with 4 consecutive "
                       "SVE vectors, where the first vector is a multiple of 4 "
                       "and with matching element types");
@@ -6739,10 +6741,12 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   case Match_InvalidSVEVectorListMul2x16:
   case Match_InvalidSVEVectorListMul2x32:
   case Match_InvalidSVEVectorListMul2x64:
+  case Match_InvalidSVEVectorListMul2x128:
   case Match_InvalidSVEVectorListMul4x8:
   case Match_InvalidSVEVectorListMul4x16:
   case Match_InvalidSVEVectorListMul4x32:
   case Match_InvalidSVEVectorListMul4x64:
+  case Match_InvalidSVEVectorListMul4x128:
   case Match_InvalidSVEVectorListStrided2x8:
   case Match_InvalidSVEVectorListStrided2x16:
   case Match_InvalidSVEVectorListStrided2x32:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index f42c415a9e44..243891249668 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -740,6 +740,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .legalForCartesianProduct({s32, v2s16, v4s8})
       .legalForCartesianProduct({s64, v8s8, v4s16, v2s32})
       .legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0})
+      .lowerIf([=](const LegalityQuery &Query) {
+        return Query.Types[0].isVector() != Query.Types[1].isVector();
+      })
       .moreElementsToNextPow2(0)
       .clampNumElements(0, v8s8, v16s8)
       .clampNumElements(0, v4s16, v8s16)
@@ -1567,7 +1570,7 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     MI.eraseFromParent();
     return true;
   }
-  case Intrinsic::experimental_vector_reverse:
+  case Intrinsic::vector_reverse:
     // TODO: Add support for vector_reverse
     return false;
   }
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index ad21f2673a64..f5bea3336cbf 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -306,13 +306,12 @@ llvm::createAArch64AsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
   return new AArch64TargetAsmStreamer(S, OS);
 }
 
-MCELFStreamer *llvm::createAArch64ELFStreamer(
-    MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
-    std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
-    bool RelaxAll) {
+MCELFStreamer *
+llvm::createAArch64ELFStreamer(MCContext &Context,
+                               std::unique_ptr<MCAsmBackend> TAB,
+                               std::unique_ptr<MCObjectWriter> OW,
+                               std::unique_ptr<MCCodeEmitter> Emitter) {
   AArch64ELFStreamer *S = new AArch64ELFStreamer(
       Context, std::move(TAB), std::move(OW), std::move(Emitter));
-  if (RelaxAll)
-    S->getAssembler().setRelaxAll(true);
   return S;
 }
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
index 25c609ee1496..e6df79ba19d4 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
@@ -20,8 +20,7 @@ namespace llvm {
 MCELFStreamer *createAArch64ELFStreamer(MCContext &Context,
                                         std::unique_ptr<MCAsmBackend> TAB,
                                         std::unique_ptr<MCObjectWriter> OW,
-                                        std::unique_ptr<MCCodeEmitter> Emitter,
-                                        bool RelaxAll);
+                                        std::unique_ptr<MCCodeEmitter> Emitter);
 }
 
 #endif
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 043f0a03b797..0dd4a78f962d 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -378,30 +378,28 @@ static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T,
 static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
                                      std::unique_ptr<MCAsmBackend> &&TAB,
                                      std::unique_ptr<MCObjectWriter> &&OW,
-                                     std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                     bool RelaxAll) {
+                                     std::unique_ptr<MCCodeEmitter> &&Emitter) {
   return createAArch64ELFStreamer(Ctx, std::move(TAB), std::move(OW),
-                                  std::move(Emitter), RelaxAll);
+                                  std::move(Emitter));
 }
 
 static MCStreamer *createMachOStreamer(MCContext &Ctx,
                                        std::unique_ptr<MCAsmBackend> &&TAB,
                                        std::unique_ptr<MCObjectWriter> &&OW,
                                        std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                       bool RelaxAll,
                                        bool DWARFMustBeAtTheEnd) {
   return createMachOStreamer(Ctx, std::move(TAB), std::move(OW),
-                             std::move(Emitter), RelaxAll, DWARFMustBeAtTheEnd,
+                             std::move(Emitter), DWARFMustBeAtTheEnd,
                              /*LabelSections*/ true);
 }
 
 static MCStreamer *
 createWinCOFFStreamer(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll,
+                      std::unique_ptr<MCCodeEmitter> &&Emitter,
                       bool IncrementalLinkerCompatible) {
   return createAArch64WinCOFFStreamer(Ctx, std::move(TAB), std::move(OW),
-                                      std::move(Emitter), RelaxAll,
+                                      std::move(Emitter),
                                       IncrementalLinkerCompatible);
 }
 
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index 438ac6cc4788..c25cc2e99adc 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -294,7 +294,7 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinCFISaveAnyRegQPX(unsigned Reg,
 MCWinCOFFStreamer *llvm::createAArch64WinCOFFStreamer(
     MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
     std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
-    bool RelaxAll, bool IncrementalLinkerCompatible) {
+    bool IncrementalLinkerCompatible) {
   auto *S = new AArch64WinCOFFStreamer(Context, std::move(MAB),
                                        std::move(Emitter), std::move(OW));
   S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
index 8c0656652eed..a13b1a451be5 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
@@ -21,7 +21,7 @@ namespace llvm {
 MCWinCOFFStreamer *createAArch64WinCOFFStreamer(
     MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
     std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
-    bool RelaxAll, bool IncrementalLinkerCompatible);
+    bool IncrementalLinkerCompatible);
 } // end llvm namespace
 
 #endif
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 485b38a29a0f..69c3238c7d61 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -605,13 +605,6 @@ class SVE_2_Op_Fp_Imm_Pat_Zero<ValueType vt, SDPatternOperator op,
                       (vt (splat_vector (it immL))))),
       (inst $Pg, $Zs1, imm)>;
 
-// Used to re-order the operands of BSP when lowering to BSL. BSP has the order:
-// mask, in1, in2 whereas BSL for SVE2 has them ordered in1, in2, mask
-class SVE_3_Op_BSP_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
-                   ValueType vt2, ValueType vt3, Instruction inst>
-: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)),
-      (inst $Op2, $Op3, $Op1)>;
-
 class SVE_Shift_Add_All_Active_Pat<ValueType vtd, SDPatternOperator op, ValueType pt,
                                    ValueType vt1, ValueType vt2, ValueType vt3,
                                    Instruction inst>
@@ -4922,8 +4915,8 @@ class sve2_int_bitwise_ternary_op_d<bits<3> opc, string asm>
   let hasSideEffects = 0;
 }
 
-multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperator op,
-                                       SDPatternOperator ir_op = null_frag> {
+multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm,
+                                       SDPatternOperator op> {
   def NAME : sve2_int_bitwise_ternary_op_d<opc, asm>;
 
   def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
@@ -4937,11 +4930,6 @@ multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperato
   def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
   def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
   def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
-
-  def : SVE_3_Op_BSP_Pat<nxv16i8, ir_op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
-  def : SVE_3_Op_BSP_Pat<nxv8i16, ir_op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
-  def : SVE_3_Op_BSP_Pat<nxv4i32, ir_op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
-  def : SVE_3_Op_BSP_Pat<nxv2i64, ir_op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
 }
 
 class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 2b81f5d51032..8abe9920c02c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1493,6 +1493,7 @@ def FeatureISAVersion11_Generic: FeatureSet<
     [FeatureMSAALoadDstSelBug,
      FeatureVALUTransUseHazard,
      FeatureUserSGPRInit16Bug,
+     FeatureMADIntraFwdBug,
      FeaturePrivEnabledTrap2NopBug,
      FeatureRequiresCOV6])>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f4a747784d1f..3124fb23fb0b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -148,6 +148,19 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::LOAD, MVT::i128, Promote);
   AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
 
+  // TODO: Would be better to consume as directly legal
+  setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
+  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
+
+  setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
+  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
+
+  setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
+  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
+
+  setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
+  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
+
   // There are no 64-bit extloads. These should be done as a 32-bit extload and
   // an extension to 64-bit.
   for (MVT VT : MVT::integer_valuetypes())
@@ -300,11 +313,17 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+  setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
   setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
+  setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
+  setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
+  setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
   setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
+  setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
   setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
 
   setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
@@ -312,6 +331,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 
   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+  setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
 
   setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
@@ -321,17 +341,21 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
   setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
   setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
+  setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
   setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
 
   setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
   setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
+  setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
 
   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
+  setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
 
   setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
+  setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
   setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
   setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
   setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 72661a8d29f8..269c414521db 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -230,6 +230,12 @@ public:
   bool isCheapToSpeculateCtlz(Type *Ty) const override;
 
   bool isSDNodeAlwaysUniform(const SDNode *N) const override;
+
+  // FIXME: This hook should not exist
+  AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override {
+    return AtomicExpansionKind::None;
+  }
+
   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
   static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
index f6a63502ec2c..9415bd3695f0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -83,37 +83,42 @@ public:
         // instruction to be marked as a single use producer.
         bool AllProducerOperandsAreSingleUse = true;
 
-        for (const auto &Operand : MI.operands()) {
-          if (!Operand.isReg())
-            continue;
-          const auto Reg = Operand.getReg();
-
-          // Count the number of times each register is read.
-          if (Operand.readsReg())
-            for (const MCRegUnit &Unit : TRI->regunits(Reg))
-              RegisterUseCount[Unit]++;
-
-          // Do not attempt to optimise across exec mask changes.
-          if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
-            for (auto &UsedReg : RegisterUseCount)
-              UsedReg.second = 2;
-          }
+        // Gather a list of Registers used before updating use counts to avoid
+        // double counting registers that appear multiple times in a single
+        // MachineInstr.
+        SmallVector<MCRegUnit> RegistersUsed;
 
-          // If we are at the point where the register first became live,
-          // check if the operands are single use.
-          if (!MI.modifiesRegister(Reg, TRI))
-            continue;
+        for (const auto &Operand : MI.all_defs()) {
+          const auto Reg = Operand.getReg();
 
           const auto RegUnits = TRI->regunits(Reg);
-          if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit &Unit) {
+          if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit Unit) {
                 return RegisterUseCount[Unit] > 1;
               }))
             AllProducerOperandsAreSingleUse = false;
 
           // Reset uses count when a register is no longer live.
-          for (const MCRegUnit &Unit : RegUnits)
+          for (const MCRegUnit Unit : RegUnits)
             RegisterUseCount.erase(Unit);
         }
+
+        for (const auto &Operand : MI.all_uses()) {
+          const auto Reg = Operand.getReg();
+
+          // Count the number of times each register is read.
+          for (const MCRegUnit Unit : TRI->regunits(Reg)) {
+            if (!is_contained(RegistersUsed, Unit))
+              RegistersUsed.push_back(Unit);
+          }
+        }
+        for (const MCRegUnit Unit : RegistersUsed)
+          RegisterUseCount[Unit]++;
+
+        // Do not attempt to optimise across exec mask changes.
+        if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
+          for (auto &UsedReg : RegisterUseCount)
+            UsedReg.second = 2;
+        }
         if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) {
           // TODO: Replace with candidate logging for instruction grouping
           // later.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index aa4ec785bf02..56345d14a331 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2261,7 +2261,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
   case AMDGPU::G_FCMP:
     if (!Subtarget.hasSALUFloatInsts())
       break;
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   case AMDGPU::G_ICMP:
   case AMDGPU::G_UADDO:
   case AMDGPU::G_USUBO:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 386672352114..4d036fdea63b 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -172,6 +172,7 @@ public:
     ImmTyWaitEXP,
     ImmTyWaitVAVDst,
     ImmTyWaitVMVSrc,
+    ImmTyByteSel,
   };
 
   // Immediate operand kind.
@@ -375,7 +376,6 @@ public:
   }
 
   bool isOModSI() const { return isImmTy(ImmTyOModSI); }
-  bool isDMask() const { return isImmTy(ImmTyDMask); }
   bool isDim() const { return isImmTy(ImmTyDim); }
   bool isR128A16() const { return isImmTy(ImmTyR128A16); }
   bool isOff() const { return isImmTy(ImmTyOff); }
@@ -383,9 +383,6 @@ public:
   bool isOffen() const { return isImmTy(ImmTyOffen); }
   bool isIdxen() const { return isImmTy(ImmTyIdxen); }
   bool isAddr64() const { return isImmTy(ImmTyAddr64); }
-  bool isOffset() const { return isImmTy(ImmTyOffset); }
-  bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<8>(getImm()); }
-  bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
   bool isSMEMOffsetMod() const { return isImmTy(ImmTySMEMOffsetMod); }
   bool isFlatOffset() const { return isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset); }
   bool isGDS() const { return isImmTy(ImmTyGDS); }
@@ -395,9 +392,6 @@ public:
   bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); }
   bool isTFE() const { return isImmTy(ImmTyTFE); }
   bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); }
-  bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); }
-  bool isDppRowMask() const { return isImmTy(ImmTyDppRowMask); }
-  bool isDppBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
   bool isDppFI() const { return isImmTy(ImmTyDppFI); }
   bool isSDWADstSel() const { return isImmTy(ImmTySDWADstSel); }
   bool isSDWASrc0Sel() const { return isImmTy(ImmTySDWASrc0Sel); }
@@ -947,16 +941,10 @@ public:
   bool isDPP8() const;
   bool isDPPCtrl() const;
   bool isBLGP() const;
-  bool isCBSZ() const;
-  bool isABID() const;
   bool isGPRIdxMode() const;
   bool isS16Imm() const;
   bool isU16Imm() const;
   bool isEndpgm() const;
-  bool isWaitVDST() const;
-  bool isWaitEXP() const;
-  bool isWaitVAVDst() const;
-  bool isWaitVMVSrc() const;
 
   auto getPredicate(std::function<bool(const AMDGPUOperand &Op)> P) const {
     return std::bind(P, *this);
@@ -1139,6 +1127,7 @@ public:
     case ImmTyWaitEXP: OS << "WaitEXP"; break;
     case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break;
     case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break;
+    case ImmTyByteSel: OS << "ByteSel" ; break;
     }
     // clang-format on
   }
@@ -8644,6 +8633,13 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
     }
   }
 
+  if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
+    assert(AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in));
+    Inst.addOperand(Inst.getOperand(0));
+    addOptionalImmOperand(Inst, Operands, OptionalIdx,
+                          AMDGPUOperand::ImmTyByteSel);
+  }
+
   if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
     addOptionalImmOperand(Inst, Operands, OptionalIdx,
                           AMDGPUOperand::ImmTyClampSI);
@@ -8680,8 +8676,8 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
 
   if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
       Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
-      Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 ||
-      Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) {
+      Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx12 ||
+      Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx12) {
     Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
     Inst.addOperand(Inst.getOperand(0));
   }
@@ -8692,7 +8688,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
       !(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
         Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
         Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
-        Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) {
+        Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12 ||
+        Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
+        Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
+        Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
+        Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) {
     assert(!IsPacked);
     Inst.addOperand(Inst.getOperand(0));
   }
@@ -8922,14 +8922,6 @@ bool AMDGPUOperand::isBLGP() const {
   return isImm() && getImmTy() == ImmTyBLGP && isUInt<3>(getImm());
 }
 
-bool AMDGPUOperand::isCBSZ() const {
-  return isImm() && getImmTy() == ImmTyCBSZ && isUInt<3>(getImm());
-}
-
-bool AMDGPUOperand::isABID() const {
-  return isImm() && getImmTy() == ImmTyABID && isUInt<4>(getImm());
-}
-
 bool AMDGPUOperand::isS16Imm() const {
   return isImmLiteral() && (isInt<16>(getImm()) || isUInt<16>(getImm()));
 }
@@ -9207,10 +9199,11 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
       Inst.addOperand(Inst.getOperand(0));
     }
 
-    bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
-                          Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 ||
-                          Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
-                          Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12;
+    bool IsVOP3CvtSrDpp =
+        Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 ||
+        Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
+        Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
+        Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12;
     if (IsVOP3CvtSrDpp) {
       if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) {
         Inst.addOperand(MCOperand::createImm(0));
@@ -9243,6 +9236,11 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
       llvm_unreachable("unhandled operand type");
     }
   }
+
+  if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel))
+    addOptionalImmOperand(Inst, Operands, OptionalIdx,
+                          AMDGPUOperand::ImmTyByteSel);
+
   if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
     addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
 
@@ -9645,30 +9643,6 @@ ParseStatus AMDGPUAsmParser::parseEndpgm(OperandVector &Operands) {
 bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); }
 
 //===----------------------------------------------------------------------===//
-// LDSDIR
-//===----------------------------------------------------------------------===//
-
-bool AMDGPUOperand::isWaitVDST() const {
-  return isImmTy(ImmTyWaitVDST) && isUInt<4>(getImm());
-}
-
-bool AMDGPUOperand::isWaitVAVDst() const {
-  return isImmTy(ImmTyWaitVAVDst) && isUInt<4>(getImm());
-}
-
-bool AMDGPUOperand::isWaitVMVSrc() const {
-  return isImmTy(ImmTyWaitVMVSrc) && isUInt<1>(getImm());
-}
-
-//===----------------------------------------------------------------------===//
-// VINTERP
-//===----------------------------------------------------------------------===//
-
-bool AMDGPUOperand::isWaitEXP() const {
-  return isImmTy(ImmTyWaitEXP) && isUInt<3>(getImm());
-}
-
-//===----------------------------------------------------------------------===//
 // Split Barrier
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index dc1bf92771b4..8fd36b84a00c 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -869,10 +869,6 @@ void AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
   if (VDstInIdx != -1)
     insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
 
-  if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
-      MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
-    insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
-
   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
   if (MI.getNumOperands() < DescNumOps &&
       AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
@@ -902,10 +898,6 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
   if (VDstInIdx != -1)
     insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
 
-  if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
-      MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12)
-    insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
-
   unsigned Opc = MI.getOpcode();
   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
   if (MI.getNumOperands() < DescNumOps &&
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 5090b0a07da4..91733c2933b4 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -409,6 +409,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
       if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi)) {
         DPPInst.addImm(NegHiOpr->getImm());
       }
+      auto *ByteSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
+      if (ByteSelOpr &&
+          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) {
+        DPPInst.addImm(ByteSelOpr->getImm());
+      }
     }
     DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
     DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index f0c111eaf060..4fd9ed2a8927 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2342,7 +2342,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
         case 8:
           NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
           break;
-        case 16: [[fallthrough]];
+        case 16:
         default:
           NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
         }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
index 1ce7012040da..4e9a33227a5d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -28,10 +28,11 @@ public:
 
 }
 
-MCELFStreamer *llvm::createAMDGPUELFStreamer(
-    const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
-    std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
-    bool RelaxAll) {
+MCELFStreamer *
+llvm::createAMDGPUELFStreamer(const Triple &T, MCContext &Context,
+                              std::unique_ptr<MCAsmBackend> MAB,
+                              std::unique_ptr<MCObjectWriter> OW,
+                              std::unique_ptr<MCCodeEmitter> Emitter) {
   return new AMDGPUELFStreamer(T, Context, std::move(MAB), std::move(OW),
                                std::move(Emitter));
 }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
index e09e2dca1b47..f9ece5f22b0f 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -26,8 +26,7 @@ class Triple;
 MCELFStreamer *createAMDGPUELFStreamer(const Triple &T, MCContext &Context,
                                        std::unique_ptr<MCAsmBackend> MAB,
                                        std::unique_ptr<MCObjectWriter> OW,
-                                       std::unique_ptr<MCCodeEmitter> Emitter,
-                                       bool RelaxAll);
+                                       std::unique_ptr<MCCodeEmitter> Emitter);
 } // namespace llvm.
 
 #endif
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index b6a95906bc45..883b6c4407fe 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1806,4 +1806,14 @@ void AMDGPUInstPrinter::printEndpgm(const MCInst *MI, unsigned OpNo,
   O << ' ' << formatDec(Imm);
 }
 
+void AMDGPUInstPrinter::printByteSel(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  uint8_t Imm = MI->getOperand(OpNo).getImm();
+  if (!Imm)
+    return;
+
+  O << " byte_sel:" << formatDec(Imm);
+}
+
 #include "AMDGPUGenAsmWriter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index c801eaf1111e..d6d7fd34b68c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -186,6 +186,8 @@ private:
                     const MCSubtargetInfo &STI, raw_ostream &O);
   void printExpTgt(const MCInst *MI, unsigned OpNo,
                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printByteSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
 
 public:
   static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 4700a984770b..30dd384051b9 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -112,10 +112,9 @@ static MCTargetStreamer *createAMDGPUNullTargetStreamer(MCStreamer &S) {
 static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
                                     std::unique_ptr<MCAsmBackend> &&MAB,
                                     std::unique_ptr<MCObjectWriter> &&OW,
-                                    std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                    bool RelaxAll) {
+                                    std::unique_ptr<MCCodeEmitter> &&Emitter) {
   return createAMDGPUELFStreamer(T, Context, std::move(MAB), std::move(OW),
-                                 std::move(Emitter), RelaxAll);
+                                 std::move(Emitter));
 }
 
 namespace {
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 4f106bf0dfb1..eae666ab0e7d 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -683,6 +683,12 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   }
   assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
 
+  if (hasFP(MF)) {
+    Register FPReg = MFI->getFrameOffsetReg();
+    assert(FPReg != AMDGPU::FP_REG);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
+  }
+
   if (requiresStackPointerReference(MF)) {
     Register SPReg = MFI->getStackPtrOffsetReg();
     assert(SPReg != AMDGPU::SP_REG);
@@ -690,12 +696,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
         .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
   }
 
-  if (hasFP(MF)) {
-    Register FPReg = MFI->getFrameOffsetReg();
-    assert(FPReg != AMDGPU::FP_REG);
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
-  }
-
   bool NeedsFlatScratchInit =
       MFI->getUserSGPRInfo().hasFlatScratchInit() &&
       (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 91a1c40dd824..839ac927a0ee 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -187,8 +187,12 @@ VmemType getVmemType(const MachineInstr &Inst) {
   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
   const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
       AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
-  return BaseInfo->BVH ? VMEM_BVH
-                       : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
+  // We have to make an additional check for isVSAMPLE here since some
+  // instructions don't have a sampler, but are still classified as sampler
+  // instructions for the purposes of e.g. waitcnt.
+  return BaseInfo->BVH                                         ? VMEM_BVH
+         : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER
+                                                               : VMEM_NOSAMPLER;
 }
 
 unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
@@ -444,12 +448,19 @@ protected:
   const SIInstrInfo *TII = nullptr;
   AMDGPU::IsaVersion IV;
   InstCounterType MaxCounter;
+  bool OptNone;
 
 public:
   WaitcntGenerator() {}
-  WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
-      : ST(ST), TII(ST->getInstrInfo()),
-        IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
+  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
+      : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
+        IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
+        OptNone(MF.getFunction().hasOptNone() ||
+                MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
+
+  // Return true if the current function should be compiled with no
+  // optimization.
+  bool isOptNone() const { return OptNone; }
 
   // Edits an existing sequence of wait count instructions according
   // to an incoming Waitcnt value, which is itself updated to reflect
@@ -500,8 +511,8 @@ public:
 class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
 public:
   WaitcntGeneratorPreGFX12() {}
-  WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
-      : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
+  WaitcntGeneratorPreGFX12(const MachineFunction &MF)
+      : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
 
   bool
   applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -535,8 +546,9 @@ public:
 class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
 public:
   WaitcntGeneratorGFX12Plus() {}
-  WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
-      : WaitcntGenerator(ST, MaxCounter) {}
+  WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
+                            InstCounterType MaxCounter)
+      : WaitcntGenerator(MF, MaxCounter) {}
 
   bool
   applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -593,8 +605,6 @@ private:
   bool ForceEmitZeroWaitcnts;
   bool ForceEmitWaitcnt[NUM_INST_CNTS];
 
-  bool OptNone;
-
   // In any given run of this pass, WCG will point to one of these two
   // generator objects, which must have been re-initialised before use
   // from a value made using a subtarget constructor.
@@ -1199,19 +1209,19 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
       continue;
 
     unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
-    bool IsSoft = Opcode != II.getOpcode();
+    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
 
     // Update required wait count. If this is a soft waitcnt (= it was added
     // by an earlier pass), it may be entirely removed.
     if (Opcode == AMDGPU::S_WAITCNT) {
       unsigned IEnc = II.getOperand(0).getImm();
       AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
-      if (IsSoft)
+      if (TrySimplify)
         ScoreBrackets.simplifyWaitcnt(OldWait);
       Wait = Wait.combined(OldWait);
 
       // Merge consecutive waitcnt of the same type by erasing multiples.
-      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
+      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
         II.eraseFromParent();
         Modified = true;
       } else
@@ -1222,11 +1232,11 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
 
       unsigned OldVSCnt =
           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
-      if (IsSoft)
+      if (TrySimplify)
         ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
       Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
 
-      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
+      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
         II.eraseFromParent();
         Modified = true;
       } else
@@ -1352,13 +1362,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
     // by an earlier pass), it may be entirely removed.
 
     unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
-    bool IsSoft = Opcode != II.getOpcode();
+    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
 
     if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
       unsigned OldEnc =
           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
       AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
-      if (IsSoft)
+      if (TrySimplify)
         ScoreBrackets.simplifyWaitcnt(OldWait);
       Wait = Wait.combined(OldWait);
       UpdatableInstr = &CombinedLoadDsCntInstr;
@@ -1366,7 +1376,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       unsigned OldEnc =
           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
       AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
-      if (IsSoft)
+      if (TrySimplify)
         ScoreBrackets.simplifyWaitcnt(OldWait);
       Wait = Wait.combined(OldWait);
       UpdatableInstr = &CombinedStoreDsCntInstr;
@@ -1375,7 +1385,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       assert(CT.has_value());
       unsigned OldCnt =
           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
-      if (IsSoft)
+      if (TrySimplify)
         ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
       addWait(Wait, CT.value(), OldCnt);
       UpdatableInstr = &WaitInstrs[CT.value()];
@@ -1645,7 +1655,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // * we are not in Dynamic VGPR mode
   else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
-    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
+    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
         ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
         !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
       ReleaseVGPRInsts.insert(&MI);
@@ -1853,7 +1863,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // not, we need to ensure the subtarget is capable of backing off barrier
   // instructions in case there are any outstanding memory operations that may
   // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
-  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
+  if (TII->isBarrierStart(MI.getOpcode()) &&
       !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
     Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
   }
@@ -2467,11 +2477,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
 
   if (ST->hasExtendedWaitCounts()) {
     MaxCounter = NUM_EXTENDED_INST_CNTS;
-    WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
+    WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
     WCG = &WCGGFX12Plus;
   } else {
     MaxCounter = NUM_NORMAL_INST_CNTS;
-    WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
+    WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
     WCG = &WCGPreGFX12;
   }
 
@@ -2483,9 +2493,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
 
   SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
 
-  OptNone = MF.getFunction().hasOptNone() ||
-            MF.getTarget().getOptLevel() == CodeGenOptLevel::None;
-
   HardwareLimits Limits = {};
   if (ST->hasExtendedWaitCounts()) {
     Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e20fe1b716b6..76b90042d65f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -461,8 +461,10 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
     return true;
   }
 
-  if (isMIMG(LdSt)) {
-    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+  if (isImage(LdSt)) {
+    auto RsrcOpName =
+        isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
+    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
     BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
     if (VAddr0Idx >= 0) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index b314b9b2fb51..84bb73cc9a79 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -925,6 +925,17 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform;
   }
 
+  // Check to see if opcode is for a barrier start. Pre gfx12 this is just the
+  // S_BARRIER, but after support for S_BARRIER_SIGNAL* / S_BARRIER_WAIT we want
+  // to check for the barrier start (S_BARRIER_SIGNAL*)
+  bool isBarrierStart(unsigned Opcode) const {
+    return Opcode == AMDGPU::S_BARRIER ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
+  }
+
   static bool doesNotReadTiedSource(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index f1afbcc060b2..7a8b6c98fc36 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1000,8 +1000,13 @@ def SDWAVopcDst : BoolRC {
 }
 
 class NamedIntOperand<ValueType Type, string Prefix, bit Optional = 1,
-                      string name = NAME, string ConvertMethod = "nullptr">
+                      string name = NAME>
     : CustomOperand<Type, Optional, name> {
+  let PredicateMethod =
+    "getPredicate([](const AMDGPUOperand &Op) -> bool { "#
+    "return Op.isImmTy(AMDGPUOperand::"#ImmTy#"); })";
+  string Validator = "[](int64_t V) { return true; }";
+  string ConvertMethod = "[](int64_t &V) { return "#Validator#"(V); }";
   let ParserMethod =
     "[this](OperandVector &Operands) -> ParseStatus { "#
     "return parseIntWithPrefix(\""#Prefix#"\", Operands, "#
@@ -1045,8 +1050,10 @@ class ArrayOperand0<string Id, string Name = NAME>
 let ImmTy = "ImmTyOffset" in
 def flat_offset : CustomOperand<i32, 1, "FlatOffset">;
 def Offset : NamedIntOperand<i32, "offset">;
+let Validator = "isUInt<8>" in {
 def Offset0 : NamedIntOperand<i8, "offset0">;
 def Offset1 : NamedIntOperand<i8, "offset1">;
+}
 
 def gds : NamedBitOperand<"gds", "GDS">;
 
@@ -1103,25 +1110,41 @@ let DefaultValue = "0xf" in {
 def DppRowMask : NamedIntOperand<i32, "row_mask">;
 def DppBankMask : NamedIntOperand<i32, "bank_mask">;
 }
-def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl", 1, "DppBoundCtrl",
-    "[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }">;
+def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl"> {
+  let ConvertMethod = "[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }";
+}
 
 let DecoderMethod = "decodeDpp8FI" in
 def Dpp8FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
 def Dpp16FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
 
 def blgp : CustomOperand<i32, 1, "BLGP">;
-def CBSZ : NamedIntOperand<i32, "cbsz">;
-def ABID : NamedIntOperand<i32, "abid">;
-
+def CBSZ : NamedIntOperand<i32, "cbsz"> {
+  let Validator = "isUInt<3>";
+}
+def ABID : NamedIntOperand<i32, "abid"> {
+  let Validator = "isUInt<4>";
+}
 def hwreg : CustomOperand<i32, 0, "Hwreg">;
 
 def exp_tgt : CustomOperand<i32, 0, "ExpTgt">;
 
-def WaitVDST : NamedIntOperand<i8, "wait_vdst">;
-def WaitEXP : NamedIntOperand<i8, "wait_exp">;
-def WaitVAVDst : NamedIntOperand<i8, "wait_va_vdst">;
-def WaitVMVSrc : NamedIntOperand<i8, "wait_vm_vsrc">;
+def WaitVDST : NamedIntOperand<i8, "wait_vdst"> {
+  let Validator = "isUInt<4>";
+}
+def WaitEXP : NamedIntOperand<i8, "wait_exp"> {
+  let Validator = "isUInt<3>";
+}
+def WaitVAVDst : NamedIntOperand<i8, "wait_va_vdst"> {
+  let Validator = "isUInt<4>";
+}
+def WaitVMVSrc : NamedIntOperand<i8, "wait_vm_vsrc"> {
+  let Validator = "isUInt<1>";
+}
+
+def ByteSel : NamedIntOperand<i8, "byte_sel"> {
+  let Validator = "isUInt<2>";
+}
 
 class KImmFPOperand<ValueType vt> : ImmOperand<vt> {
   let OperandNamespace = "AMDGPU";
@@ -1700,9 +1723,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
           (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
                Src1Mod:$src1_modifiers, Src1RC:$src1,
                clampmod0:$clamp, omod0:$omod),
-           (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
-               Src1Mod:$src1_modifiers, Src1RC:$src1,
-               clampmod0:$clamp))
+          !con((ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+                    Src1Mod:$src1_modifiers, Src1RC:$src1),
+                !if(HasClamp, (ins clampmod0:$clamp), (ins))))
       /* else */,
         // VOP2 without modifiers
         !if (HasClamp,
@@ -2036,7 +2059,8 @@ class getAsmDPP8 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT
 class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
                        bit HasOpSel, bit HasOMod, bit IsVOP3P,
                        bit HasModifiers, bit Src0HasMods,
-                       bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32> {
+                       bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32,
+                       bit HasByteSel = 0> {
   string dst = !if(HasDst,
                    !if(!eq(DstVT.Size, 1),
                        "$sdst",
@@ -2058,6 +2082,7 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
   string src1 = !if(Src1HasMods, src1mods, src1nomods);
   string src2 = !if(Src2HasMods, src2mods, src2nomods);
   string opsel = !if(HasOpSel, "$op_sel", "");
+  string bytesel = !if(HasByteSel, "$byte_sel", "");
   string 3PMods = !if(IsVOP3P,
                       !if(HasOpSel, "$op_sel_hi", "")
                         #!if(HasModifiers, "$neg_lo$neg_hi", ""),
@@ -2065,7 +2090,7 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
   string clamp = !if(HasClamp, "$clamp", "");
   string omod = !if(HasOMod, "$omod", "");
 
-  string ret = dst#!if(!gt(NumSrcArgs,0),", "#src0#src1#src2#opsel#3PMods#clamp#omod, "");
+  string ret = dst#!if(!gt(NumSrcArgs,0),", "#src0#src1#src2#opsel#bytesel#3PMods#clamp#omod, "");
 
 }
 
@@ -2282,6 +2307,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   field bit IsSWMMAC = 0;
 
   field bit IsFP8 = 0;
+  field bit IsFP8DstByteSel = 0;
 
   field bit HasDst = !ne(DstVT.Value, untyped.Value);
   field bit HasDst32 = HasDst;
@@ -2401,7 +2427,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
   field string AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
    HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
-   HasModifiers, DstVT>.ret;
+   HasModifiers, DstVT, IsFP8DstByteSel>.ret;
   field string Asm64 = AsmVOP3Base;
   field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp, HasOpSel>.ret;
   field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index e2e70ba9733b..ba01b8513dca 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -376,14 +376,14 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
     return 8;
-  case AMDGPU::DS_READ_B32:      [[fallthrough]];
-  case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
-  case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
+  case AMDGPU::DS_READ_B32:
+  case AMDGPU::DS_READ_B32_gfx9:
+  case AMDGPU::DS_WRITE_B32:
   case AMDGPU::DS_WRITE_B32_gfx9:
     return 1;
-  case AMDGPU::DS_READ_B64:      [[fallthrough]];
-  case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
-  case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
+  case AMDGPU::DS_READ_B64:
+  case AMDGPU::DS_READ_B64_gfx9:
+  case AMDGPU::DS_WRITE_B64:
   case AMDGPU::DS_WRITE_B64_gfx9:
     return 2;
   default:
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index bf4a501cc315..072c5aedc220 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -110,7 +110,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   }
 
   if (!AMDGPU::isGraphics(CC) ||
-      ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_CS) &&
+      ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_Gfx) &&
        ST.hasArchitectedSGPRs())) {
     if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
       WorkGroupIDX = true;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 647595d9ccab..616bc7684753 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -580,6 +580,22 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
                                           HasSrc2FloatMods>.ret>.ret);
 }
 
+class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
+  VOP3_Profile<VOPProfile<[i32, SrcVT, i32, untyped]>> {
+  let IsFP8DstByteSel = 1;
+  let HasClamp = 0;
+  defvar bytesel = (ins VGPR_32:$vdst_in, ByteSel:$byte_sel);
+  let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+                            HasClamp, HasModifiers, HasSrc2Mods,
+                            HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
+                   bytesel);
+  let InsVOP3Base = !con(
+    getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
+                   Src2VOP3DPP, NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods, HasOMod,
+                   Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP, HasOpSel>.ret,
+    bytesel);
+}
+
 def IsPow2Plus1: PatLeaf<(i32 imm), [{
   uint32_t V = N->getZExtValue();
   return isPowerOf2_32(V - 1);
@@ -645,12 +661,17 @@ let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
   let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
     defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
     defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>;
+
+    let SubtargetPredicate = isGFX12Plus in {
+      defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
+      defm V_CVT_SR_BF8_F32_gfx12 : VOP3Inst<"v_cvt_sr_bf8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
+    }
   }
 
   // These instructions have non-standard use of op_sel. In particular they are
   // using op_sel bits 2 and 3 while only having two sources. Therefore dummy
   // src2 is used to hold the op_sel value.
-  let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in {
+  let Constraints = "$vdst = $src2", DisableEncoding = "$src2", SubtargetPredicate = isGFX940Plus in {
     defm V_CVT_SR_FP8_F32 : VOP3Inst<"v_cvt_sr_fp8_f32", VOP3_CVT_SR_F8_F32_Profile>;
     defm V_CVT_SR_BF8_F32 : VOP3Inst<"v_cvt_sr_bf8_f32", VOP3_CVT_SR_F8_F32_Profile>;
   }
@@ -667,15 +688,28 @@ class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst>
           !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0)
 >;
 
+class Cvt_SR_F8_ByteSel_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType SrcVT> : GCNPat<
+    (i32 (node (VOP3Mods SrcVT:$src0, i32:$src0_modifiers), (VOP3Mods i32:$src1, i32:$src1_modifiers),
+          i32:$old, timm:$byte_sel)),
+    (inst $src0_modifiers, $src0, $src1_modifiers, $src1, $old, (as_i32timm $byte_sel))
+>;
+
 let OtherPredicates = [HasFP8ConversionInsts] in {
 foreach Index = [0, -1] in {
   def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
   def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
 }
 
-foreach Index = [0, 1, 2, 3] in {
-  def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
-  def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
+let SubtargetPredicate = isGFX940Plus in {
+  foreach Index = [0, 1, 2, 3] in {
+    def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
+    def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
+  }
+}
+
+let SubtargetPredicate = isGFX12Plus in {
+  def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>;
+  def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f32, V_CVT_SR_BF8_F32_gfx12_e64, f32>;
 }
 }
 
@@ -1040,8 +1074,8 @@ defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
 
 defm V_CVT_PK_FP8_F32  : VOP3Only_Realtriple_gfx12<0x369>;
 defm V_CVT_PK_BF8_F32  : VOP3Only_Realtriple_gfx12<0x36a>;
-defm V_CVT_SR_FP8_F32  : VOP3Only_Realtriple_gfx12<0x36b>;
-defm V_CVT_SR_BF8_F32  : VOP3Only_Realtriple_gfx12<0x36c>;
+defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
+defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;
 
 //===----------------------------------------------------------------------===//
 // GFX11, GFX12
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index da16178cb58b..7cdb5cbfe297 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -311,6 +311,14 @@ class VOP3FP8OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
   let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0);
 }
 
+ class VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
+   bits<2> byte_sel;
+
+   let Inst{11} = 0; // op_sel0
+   let Inst{12} = 0; // op_sel1
+   let Inst{14-13} = byte_sel;  // op_sel2/3
+ }
+
 class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11_gfx12<op, p>{
   let Inst{11} = ?;
   let Inst{12} = ?;
@@ -741,6 +749,7 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
   bits<3> src2_modifiers;
   bits<1> clamp;
   bits<2> omod;
+  bits<2> byte_sel;
 
   let Inst{8}     = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
   let Inst{9}     = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
@@ -748,8 +757,8 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
   // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
   let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?);
   let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?);
-  let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?);
-  let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?);
+  let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),!if(P.IsFP8DstByteSel, byte_sel{0}, ?));
+  let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),!if(P.IsFP8DstByteSel, byte_sel{1}, ?));
   let Inst{15}    = !if(P.HasClamp, clamp, 0);
   let Inst{25-16} = op;
   let Inst{31-26} = 0x35;
@@ -1388,7 +1397,11 @@ multiclass VOP3_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
                           bit isSingle = 0> {
   defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
   let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
-    if ps.Pfl.HasOpSel then {
+    if ps.Pfl.IsFP8DstByteSel then {
+      def _e64#Gen.Suffix :
+        VOP3_Real_Gen<ps, Gen>,
+        VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
+    } if ps.Pfl.HasOpSel then {
       def _e64#Gen.Suffix :
         VOP3_Real_Gen<ps, Gen>,
         VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
@@ -1419,6 +1432,10 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
       def _e64#Gen.Suffix :
         VOP3_Real_Gen<ps, Gen>,
         VOP3FP8OpSel_gfx11_gfx12<op, ps.Pfl>;
+    } else if ps.Pfl.IsFP8DstByteSel then {
+      def _e64#Gen.Suffix :
+        VOP3_Real_Gen<ps, Gen>,
+        VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
     } else if ps.Pfl.HasOpSel then {
       def _e64#Gen.Suffix :
         VOP3_Real_Gen<ps, Gen>,
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 30f0730774b7..a3c2684ac1fb 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -683,6 +683,7 @@ static inline bool isIndirectCall(const MachineInstr &MI) {
   case ARM::BX_CALL:
   case ARM::BMOVPCRX_CALL:
   case ARM::TCRETURNri:
+  case ARM::TCRETURNrinotr12:
   case ARM::TAILJMPr:
   case ARM::TAILJMPr4:
   case ARM::tBLXr:
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 0f7858a3be9f..df10613fcc7c 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -2197,7 +2197,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     }
 
     case ARM::TCRETURNdi:
-    case ARM::TCRETURNri: {
+    case ARM::TCRETURNri:
+    case ARM::TCRETURNrinotr12: {
       MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
       if (MBBI->getOpcode() == ARM::SEH_EpilogEnd)
         MBBI--;
@@ -2241,7 +2242,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
         // Add the default predicate in Thumb mode.
         if (STI->isThumb())
           MIB.add(predOps(ARMCC::AL));
-      } else if (RetOpcode == ARM::TCRETURNri) {
+      } else if (RetOpcode == ARM::TCRETURNri ||
+                 RetOpcode == ARM::TCRETURNrinotr12) {
         unsigned Opcode =
           STI->isThumb() ? ARM::tTAILJMPr
                          : (STI->hasV4TOps() ? ARM::TAILJMPr : ARM::TAILJMPr4);
diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td
index 111c87838291..84481af650be 100644
--- a/llvm/lib/Target/ARM/ARMFeatures.td
+++ b/llvm/lib/Target/ARM/ARMFeatures.td
@@ -15,6 +15,18 @@ def ModeSoftFloat         : SubtargetFeature<"soft-float","UseSoftFloat",
 // ARM Subtarget features.
 //
 
+// This is currently only used by AArch64, but is required here because ARM and
+// AArch64 share a tablegen backend for TargetParser.
+class Extension<
+  string TargetFeatureName,            // String used for -target-feature.
+  string Spelling,                     // The XYZ in HasXYZ and AEK_XYZ.
+  string Desc,                         // Description.
+  list<SubtargetFeature> Implies = []  // List of dependent features.
+> : SubtargetFeature<TargetFeatureName, "Has" # Spelling, "true", Desc, Implies>
+{
+    string ArchExtKindSpelling = "AEK_" # Spelling; // ArchExtKind enum name.
+}
+
 // Floating Point, HW Division and Neon Support
 
 // FP loads/stores/moves, shared between VFP and MVE (even in the integer-only
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index a332f743f495..11496a6e032d 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -257,7 +257,8 @@ static int getArgumentStackToRestore(MachineFunction &MF,
   if (MBB.end() != MBBI) {
     unsigned RetOpcode = MBBI->getOpcode();
     IsTailCallReturn = RetOpcode == ARM::TCRETURNdi ||
-                       RetOpcode == ARM::TCRETURNri;
+                       RetOpcode == ARM::TCRETURNri ||
+                       RetOpcode == ARM::TCRETURNrinotr12;
   }
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
 
@@ -486,6 +487,7 @@ static MachineBasicBlock::iterator insertSEH(MachineBasicBlock::iterator MBBI,
 
   case ARM::tBX_RET:
   case ARM::TCRETURNri:
+  case ARM::TCRETURNrinotr12:
     MIB = BuildMI(MF, DL, TII.get(ARM::SEH_Nop_Ret))
               .addImm(/*Wide=*/0)
               .setMIFlags(Flags);
@@ -1615,7 +1617,9 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
   if (MBB.end() != MI) {
     DL = MI->getDebugLoc();
     unsigned RetOpcode = MI->getOpcode();
-    isTailCall = (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri);
+    isTailCall =
+        (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri ||
+         RetOpcode == ARM::TCRETURNrinotr12);
     isInterrupt =
         RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR;
     isTrap =
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index d0e9f61c0bd1..f67a68acbf23 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1555,15 +1555,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
 
   if (Subtarget->hasNEON()) {
     // vmin and vmax aren't available in a scalar form, so we can use
-    // a NEON instruction with an undef lane instead.  This has a performance
-    // penalty on some cores, so we don't do this unless we have been
-    // asked to by the core tuning model.
-    if (Subtarget->useNEONForSinglePrecisionFP()) {
-      setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
-      setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
-      setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
-      setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
-    }
+    // a NEON instruction with an undef lane instead.
+    setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
     setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
     setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
     setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 08b519e4d5cb..1f7bd8dd3121 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -2677,6 +2677,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
   def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, i32imm:$SPDiff), IIC_Br, []>,
                    Sched<[WriteBr]>;
 
+  def TCRETURNrinotr12 : PseudoInst<(outs), (ins tcGPRnotr12:$dst, i32imm:$SPDiff), IIC_Br, []>,
+                   Sched<[WriteBr]>;
+
   def TAILJMPd : ARMPseudoExpand<(outs), (ins arm_br_target:$dst),
                                  4, IIC_Br, [],
                                  (Bcc arm_br_target:$dst, (ops 14, zero_reg))>,
@@ -6081,8 +6084,14 @@ def : ARMPat<(ARMWrapperJT tjumptable:$dst),
 // TODO: add,sub,and, 3-instr forms?
 
 // Tail calls. These patterns also apply to Thumb mode.
+// Regular indirect tail call
 def : Pat<(ARMtcret tcGPR:$dst, (i32 timm:$SPDiff)),
-          (TCRETURNri tcGPR:$dst, timm:$SPDiff)>;
+          (TCRETURNri tcGPR:$dst, timm:$SPDiff)>,
+          Requires<[NoSignRetAddr]>;
+// Indirect tail call when PACBTI is enabled
+def : Pat<(ARMtcret tcGPRnotr12:$dst, (i32 timm:$SPDiff)),
+          (TCRETURNrinotr12 tcGPRnotr12:$dst, timm:$SPDiff)>,
+          Requires<[SignRetAddr]>;
 def : Pat<(ARMtcret (i32 tglobaladdr:$dst), (i32 timm:$SPDiff)),
           (TCRETURNdi texternalsym:$dst, (i32 timm:$SPDiff))>;
 def : Pat<(ARMtcret (i32 texternalsym:$dst), (i32 timm:$SPDiff)),
diff --git a/llvm/lib/Target/ARM/ARMPredicates.td b/llvm/lib/Target/ARM/ARMPredicates.td
index aca970d900a8..ddc5ad8754ee 100644
--- a/llvm/lib/Target/ARM/ARMPredicates.td
+++ b/llvm/lib/Target/ARM/ARMPredicates.td
@@ -228,6 +228,10 @@ def DontGenExecuteOnly : Predicate<"!Subtarget->genExecuteOnly()">;
 def GenT1ExecuteOnly : Predicate<"Subtarget->genExecuteOnly() && "
                                  "Subtarget->isThumb1Only() && "
                                  "!Subtarget->hasV8MBaselineOps()">;
+let RecomputePerFunction = 1 in {
+  def SignRetAddr : Predicate<[{ MF->getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true) }]>;
+  def NoSignRetAddr : Predicate<[{ !MF->getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true) }]>;
+}
 
 // Armv8.5-A extensions
 def HasSB            : Predicate<"Subtarget->hasSB()">,
diff --git a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 5d4ae9a7648e..a6fdece10ba4 100644
--- a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -156,11 +156,6 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI) {
            "Subclass not added?");
     assert(RBGPR.covers(*TRI.getRegClass(ARM::tcGPRRegClassID)) &&
            "Subclass not added?");
-    assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRnoip_and_tcGPRRegClassID)) &&
-           "Subclass not added?");
-    assert(RBGPR.covers(*TRI.getRegClass(
-               ARM::tGPREven_and_GPRnoip_and_tcGPRRegClassID)) &&
-           "Subclass not added?");
     assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPROdd_and_tcGPRRegClassID)) &&
            "Subclass not added?");
     assert(getMaximumSize(RBGPR.getID()) == 32 &&
@@ -188,16 +183,16 @@ ARMRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
   case GPRnoip_and_GPRnopcRegClassID:
   case rGPRRegClassID:
   case GPRspRegClassID:
-  case GPRnoip_and_tcGPRRegClassID:
   case tcGPRRegClassID:
+  case tcGPRnotr12RegClassID:
   case tGPRRegClassID:
   case tGPREvenRegClassID:
   case tGPROddRegClassID:
   case tGPR_and_tGPREvenRegClassID:
   case tGPR_and_tGPROddRegClassID:
   case tGPREven_and_tcGPRRegClassID:
-  case tGPREven_and_GPRnoip_and_tcGPRRegClassID:
   case tGPROdd_and_tcGPRRegClassID:
+  case tGPREven_and_tcGPRnotr12RegClassID:
     return getRegBank(ARM::GPRRegBankID);
   case HPRRegClassID:
   case SPR_8RegClassID:
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td
index 194d65cad8d1..212f22651f9f 100644
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -373,6 +373,16 @@ def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R12)> {
   }];
 }
 
+// Some pointer authentication instructions require the use of R12. When return
+// address signing is enabled, authentication of the caller's return address
+// must be performed before a tail call is made. Therefore, indirect tail call
+// jump cannot be from R12.
+// FIXME: All PACBTI instruction currently implemented in the compiler
+// implicitly use R12. When instructions that allow PAC to be placed in a
+// specific register are implemented the restriction needs to be updated to
+// make sure that PACBTI signature and indirect tail call both use a different register.
+def tcGPRnotr12 : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3)>;
+
 def tGPROdd : RegisterClass<"ARM", [i32], 32, (add R1, R3, R5, R7, R9, R11)> {
   let AltOrders = [(and tGPROdd, tGPR)];
   let AltOrderSelect = [{
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 028db9d17e30..e54314cc7d00 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -11104,7 +11104,7 @@ ARMAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst,
       return Match_MnemonicFail;
     }
   }
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   default:
     return Match_Success;
   }
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 1d80af590d16..afd7dccbeca9 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -1487,8 +1487,7 @@ MCELFStreamer *createARMELFStreamer(MCContext &Context,
                                     std::unique_ptr<MCAsmBackend> TAB,
                                     std::unique_ptr<MCObjectWriter> OW,
                                     std::unique_ptr<MCCodeEmitter> Emitter,
-                                    bool RelaxAll, bool IsThumb,
-                                    bool IsAndroid) {
+                                    bool IsThumb, bool IsAndroid) {
   ARMELFStreamer *S =
       new ARMELFStreamer(Context, std::move(TAB), std::move(OW),
                          std::move(Emitter), IsThumb, IsAndroid);
@@ -1497,8 +1496,6 @@ MCELFStreamer *createARMELFStreamer(MCContext &Context,
   // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
   S->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
 
-  if (RelaxAll)
-    S->getAssembler().setRelaxAll(true);
   return S;
 }
 
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 8d9959a9457d..20603b6cf1b0 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -359,10 +359,9 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI,
 static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
                                      std::unique_ptr<MCAsmBackend> &&MAB,
                                      std::unique_ptr<MCObjectWriter> &&OW,
-                                     std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                     bool RelaxAll) {
+                                     std::unique_ptr<MCCodeEmitter> &&Emitter) {
   return createARMELFStreamer(
-      Ctx, std::move(MAB), std::move(OW), std::move(Emitter), false,
+      Ctx, std::move(MAB), std::move(OW), std::move(Emitter),
       (T.getArch() == Triple::thumb || T.getArch() == Triple::thumbeb),
       T.isAndroid());
 }
@@ -370,10 +369,10 @@ static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
 static MCStreamer *
 createARMMachOStreamer(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&MAB,
                        std::unique_ptr<MCObjectWriter> &&OW,
-                       std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll,
+                       std::unique_ptr<MCCodeEmitter> &&Emitter,
                        bool DWARFMustBeAtTheEnd) {
   return createMachOStreamer(Ctx, std::move(MAB), std::move(OW),
-                             std::move(Emitter), false, DWARFMustBeAtTheEnd);
+                             std::move(Emitter), DWARFMustBeAtTheEnd);
 }
 
 static MCInstPrinter *createARMMCInstPrinter(const Triple &T,
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 3066d9ba6783..a673d590419e 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -94,7 +94,6 @@ MCStreamer *createARMWinCOFFStreamer(MCContext &Context,
                                      std::unique_ptr<MCAsmBackend> &&MAB,
                                      std::unique_ptr<MCObjectWriter> &&OW,
                                      std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                     bool RelaxAll,
                                      bool IncrementalLinkerCompatible);
 
 /// Construct an ELF Mach-O object writer.
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
index cdd7f6fb715a..0fcf6eb1a5ab 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -66,11 +66,12 @@ void ARMWinCOFFStreamer::finishImpl() {
 }
 }
 
-MCStreamer *llvm::createARMWinCOFFStreamer(
-    MCContext &Context, std::unique_ptr<MCAsmBackend> &&MAB,
-    std::unique_ptr<MCObjectWriter> &&OW,
-    std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll,
-    bool IncrementalLinkerCompatible) {
+MCStreamer *
+llvm::createARMWinCOFFStreamer(MCContext &Context,
+                               std::unique_ptr<MCAsmBackend> &&MAB,
+                               std::unique_ptr<MCObjectWriter> &&OW,
+                               std::unique_ptr<MCCodeEmitter> &&Emitter,
+                               bool IncrementalLinkerCompatible) {
   auto *S = new ARMWinCOFFStreamer(Context, std::move(MAB), std::move(Emitter),
                                    std::move(OW));
   S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index 047c6731333c..e908f1fb9512 100644
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -1044,9 +1044,9 @@ static void popRegsFromStack(MachineBasicBlock &MBB,
         continue;
 
       if (Reg == ARM::LR) {
-        if (!MBB.succ_empty() ||
-            MI->getOpcode() == ARM::TCRETURNdi ||
-            MI->getOpcode() == ARM::TCRETURNri)
+        if (!MBB.succ_empty() || MI->getOpcode() == ARM::TCRETURNdi ||
+            MI->getOpcode() == ARM::TCRETURNri ||
+            MI->getOpcode() == ARM::TCRETURNrinotr12)
           // LR may only be popped into PC, as part of return sequence.
           // If this isn't the return sequence, we'll need emitPopSpecialFixUp
           // to restore LR the hard way.
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
index ba370261e284..119baff83dae 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
@@ -72,10 +72,9 @@ static MCInstPrinter *createAVRMCInstPrinter(const Triple &T,
 static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
                                     std::unique_ptr<MCAsmBackend> &&MAB,
                                     std::unique_ptr<MCObjectWriter> &&OW,
-                                    std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                    bool RelaxAll) {
+                                    std::unique_ptr<MCCodeEmitter> &&Emitter) {
   return createELFStreamer(Context, std::move(MAB), std::move(OW),
-                           std::move(Emitter), RelaxAll);
+                           std::move(Emitter));
 }
 
 static MCTargetStreamer *
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index ebd8447eba85..8c9f5c4dc554 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -973,8 +973,7 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
 }
 
 /// Read file contents from the actual file or from the source
-std::string BTFDebug::populateFileContent(const DISubprogram *SP) {
-  auto File = SP->getFile();
+std::string BTFDebug::populateFileContent(const DIFile *File) {
   std::string FileName;
 
   if (!File->getFilename().starts_with("/") && File->getDirectory().size())
@@ -1005,9 +1004,9 @@ std::string BTFDebug::populateFileContent(const DISubprogram *SP) {
   return FileName;
 }
 
-void BTFDebug::constructLineInfo(const DISubprogram *SP, MCSymbol *Label,
+void BTFDebug::constructLineInfo(MCSymbol *Label, const DIFile *File,
                                  uint32_t Line, uint32_t Column) {
-  std::string FileName = populateFileContent(SP);
+  std::string FileName = populateFileContent(File);
   BTFLineInfo LineInfo;
 
   LineInfo.Label = Label;
@@ -1366,10 +1365,10 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
   if (!CurMI) // no debug info
     return;
 
-  // Skip this instruction if no DebugLoc or the DebugLoc
-  // is the same as the previous instruction.
+  // Skip this instruction if no DebugLoc, the DebugLoc
+  // is the same as the previous instruction or Line is 0.
   const DebugLoc &DL = MI->getDebugLoc();
-  if (!DL || PrevInstLoc == DL) {
+  if (!DL || PrevInstLoc == DL || DL.getLine() == 0) {
     // This instruction will be skipped, no LineInfo has
     // been generated, construct one based on function signature.
     if (LineInfoGenerated == false) {
@@ -1377,7 +1376,7 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
       if (!S)
         return;
       MCSymbol *FuncLabel = Asm->getFunctionBegin();
-      constructLineInfo(S, FuncLabel, S->getLine(), 0);
+      constructLineInfo(FuncLabel, S->getFile(), S->getLine(), 0);
       LineInfoGenerated = true;
     }
 
@@ -1389,8 +1388,7 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
   OS.emitLabel(LineSym);
 
   // Construct the lineinfo.
-  auto SP = DL->getScope()->getSubprogram();
-  constructLineInfo(SP, LineSym, DL.getLine(), DL.getCol());
+  constructLineInfo(LineSym, DL->getFile(), DL.getLine(), DL.getCol());
 
   LineInfoGenerated = true;
   PrevInstLoc = DL;
diff --git a/llvm/lib/Target/BPF/BTFDebug.h b/llvm/lib/Target/BPF/BTFDebug.h
index 7536006ed21c..11a0c59ba6c9 100644
--- a/llvm/lib/Target/BPF/BTFDebug.h
+++ b/llvm/lib/Target/BPF/BTFDebug.h
@@ -343,10 +343,10 @@ class BTFDebug : public DebugHandlerBase {
 
   /// Get the file content for the subprogram. Certain lines of the file
   /// later may be put into string table and referenced by line info.
-  std::string populateFileContent(const DISubprogram *SP);
+  std::string populateFileContent(const DIFile *File);
 
   /// Construct a line info.
-  void constructLineInfo(const DISubprogram *SP, MCSymbol *Label, uint32_t Line,
+  void constructLineInfo(MCSymbol *Label, const DIFile *File, uint32_t Line,
                          uint32_t Column);
 
   /// Generate types and variables for globals.
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index 44932383fb43..caf84701b999 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -50,13 +50,13 @@ static MCSubtargetInfo *createBPFMCSubtargetInfo(const Triple &TT,
   return createBPFMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS);
 }
 
-static MCStreamer *createBPFMCStreamer(const Triple &T, MCContext &Ctx,
-                                       std::unique_ptr<MCAsmBackend> &&MAB,
-                                       std::unique_ptr<MCObjectWriter> &&OW,
-                                       std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                       bool RelaxAll) {
-  return createELFStreamer(Ctx, std::move(MAB), std::move(OW), std::move(Emitter),
-                           RelaxAll);
+static MCStreamer *
+createBPFMCStreamer(const Triple &T, MCContext &Ctx,
+                    std::unique_ptr<MCAsmBackend> &&MAB,
+                    std::unique_ptr<MCObjectWriter> &&OW,
+                    std::unique_ptr<MCCodeEmitter> &&Emitter) {
+  return createELFStreamer(Ctx, std::move(MAB), std::move(OW),
+                           std::move(Emitter));
 }
 
 static MCInstPrinter *createBPFMCInstPrinter(const Triple &T,
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
index 64f01cd1c9fa..c3403ade389c 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp
@@ -88,13 +88,10 @@ createCSKYObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
 static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
                                      std::unique_ptr<MCAsmBackend> &&MAB,
                                      std::unique_ptr<MCObjectWriter> &&OW,
-                                     std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                     bool RelaxAll) {
+                                     std::unique_ptr<MCCodeEmitter> &&Emitter) {
   CSKYELFStreamer *S = new CSKYELFStreamer(Ctx, std::move(MAB), std::move(OW),
                                            std::move(Emitter));
 
-  if (RelaxAll)
-    S->getAssembler().setRelaxAll(true);
   return S;
 }
 
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index 65cf1dfdb403..67e04c212a69 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -18,18 +18,25 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/MC/DXContainerPSVInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/MD5.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 
 using namespace llvm;
 using namespace llvm::dxil;
+using namespace llvm::mcdxbc;
 
 namespace {
 class DXContainerGlobals : public llvm::ModulePass {
 
+  GlobalVariable *buildContainerGlobal(Module &M, Constant *Content,
+                                       StringRef Name, StringRef SectionName);
   GlobalVariable *getFeatureFlags(Module &M);
   GlobalVariable *computeShaderHash(Module &M);
+  GlobalVariable *buildSignature(Module &M, Signature &Sig, StringRef Name,
+                                 StringRef SectionName);
+  void addSignature(Module &M, SmallVector<GlobalValue *> &Globals);
 
 public:
   static char ID; // Pass identification, replacement for typeid
@@ -55,7 +62,7 @@ bool DXContainerGlobals::runOnModule(Module &M) {
   llvm::SmallVector<GlobalValue *> Globals;
   Globals.push_back(getFeatureFlags(M));
   Globals.push_back(computeShaderHash(M));
-
+  addSignature(M, Globals);
   appendToCompilerUsed(M, Globals);
   return true;
 }
@@ -68,12 +75,7 @@ GlobalVariable *DXContainerGlobals::getFeatureFlags(Module &M) {
 
   Constant *FeatureFlagsConstant =
       ConstantInt::get(M.getContext(), APInt(64, FeatureFlags));
-  auto *GV = new llvm::GlobalVariable(M, FeatureFlagsConstant->getType(), true,
-                                      GlobalValue::PrivateLinkage,
-                                      FeatureFlagsConstant, "dx.sfi0");
-  GV->setSection("SFI0");
-  GV->setAlignment(Align(4));
-  return GV;
+  return buildContainerGlobal(M, FeatureFlagsConstant, "dx.sfi0", "SFI0");
 }
 
 GlobalVariable *DXContainerGlobals::computeShaderHash(Module &M) {
@@ -96,14 +98,41 @@ GlobalVariable *DXContainerGlobals::computeShaderHash(Module &M) {
 
   Constant *ModuleConstant =
       ConstantDataArray::get(M.getContext(), arrayRefFromStringRef(Data));
-  auto *GV = new llvm::GlobalVariable(M, ModuleConstant->getType(), true,
-                                      GlobalValue::PrivateLinkage,
-                                      ModuleConstant, "dx.hash");
-  GV->setSection("HASH");
+  return buildContainerGlobal(M, ModuleConstant, "dx.hash", "HASH");
+}
+
+GlobalVariable *DXContainerGlobals::buildContainerGlobal(
+    Module &M, Constant *Content, StringRef Name, StringRef SectionName) {
+  auto *GV = new llvm::GlobalVariable(
+      M, Content->getType(), true, GlobalValue::PrivateLinkage, Content, Name);
+  GV->setSection(SectionName);
   GV->setAlignment(Align(4));
   return GV;
 }
 
+GlobalVariable *DXContainerGlobals::buildSignature(Module &M, Signature &Sig,
+                                                   StringRef Name,
+                                                   StringRef SectionName) {
+  SmallString<256> Data;
+  raw_svector_ostream OS(Data);
+  Sig.write(OS);
+  Constant *Constant =
+      ConstantDataArray::getString(M.getContext(), Data, /*AddNull*/ false);
+  return buildContainerGlobal(M, Constant, Name, SectionName);
+}
+
+void DXContainerGlobals::addSignature(Module &M,
+                                      SmallVector<GlobalValue *> &Globals) {
+  // FIXME: support graphics shader.
+  //  see issue https://github.com/llvm/llvm-project/issues/90504.
+
+  Signature InputSig;
+  Globals.emplace_back(buildSignature(M, InputSig, "dx.isg1", "ISG1"));
+
+  Signature OutputSig;
+  Globals.emplace_back(buildSignature(M, OutputSig, "dx.osg1", "OSG1"));
+}
+
 char DXContainerGlobals::ID = 0;
 INITIALIZE_PASS_BEGIN(DXContainerGlobals, "dxil-globals",
                       "DXContainer Global Emitter", false, true)
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index 4d99bc006900..4b162a35365c 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -77,8 +77,8 @@ static bool expandIntegerDot(CallInst *Orig, Intrinsic::ID DotIntrinsic) {
                                    : Intrinsic::dx_umad;
   Value *A = Orig->getOperand(0);
   Value *B = Orig->getOperand(1);
-  Type *ATy = A->getType();
-  Type *BTy = B->getType();
+  [[maybe_unused]] Type *ATy = A->getType();
+  [[maybe_unused]] Type *BTy = B->getType();
   assert(ATy->isVectorTy() && BTy->isVectorTy());
 
   IRBuilder<> Builder(Orig->getParent());
diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt
index cdc062eee72b..9e4ca08aea40 100644
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -26,6 +26,7 @@ add_llvm_target(HexagonCodeGen
   HexagonCommonGEP.cpp
   HexagonConstExtenders.cpp
   HexagonConstPropagation.cpp
+  HexagonCopyHoisting.cpp
   HexagonCopyToCombine.cpp
   HexagonEarlyIfConv.cpp
   HexagonExpandCondsets.cpp
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
new file mode 100644
index 000000000000..97917270601b
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
@@ -0,0 +1,272 @@
+//===--------- HexagonCopyHoisting.cpp - Hexagon Copy Hoisting  ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// The purpose of this pass is to move the copy instructions that are
+// present in all the successor of a basic block (BB) to the end of BB.
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "CopyHoist"
+
+using namespace llvm;
+
+static cl::opt<std::string> CPHoistFn("cphoistfn", cl::Hidden, cl::desc(""),
+                                      cl::init(""));
+
+namespace llvm {
+void initializeHexagonCopyHoistingPass(PassRegistry &Registry);
+FunctionPass *createHexagonCopyHoisting();
+} // namespace llvm
+
+namespace {
+
+class HexagonCopyHoisting : public MachineFunctionPass {
+
+public:
+  static char ID;
+  HexagonCopyHoisting() : MachineFunctionPass(ID), MFN(nullptr), MRI(nullptr) {
+    initializeHexagonCopyHoistingPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "Hexagon Copy Hoisting"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<SlotIndexes>();
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+  void collectCopyInst();
+  void addMItoCopyList(MachineInstr *MI);
+  bool analyzeCopy(MachineBasicBlock *BB);
+  bool isSafetoMove(MachineInstr *CandMI);
+  void moveCopyInstr(MachineBasicBlock *DestBB,
+                     std::pair<Register, Register> Key, MachineInstr *MI);
+
+  MachineFunction *MFN;
+  MachineRegisterInfo *MRI;
+  std::vector<DenseMap<std::pair<Register, Register>, MachineInstr *>>
+      CopyMIList;
+};
+
+} // namespace
+
+char HexagonCopyHoisting::ID = 0;
+
+namespace llvm {
+char &HexagonCopyHoistingID = HexagonCopyHoisting::ID;
+} // namespace llvm
+
+bool HexagonCopyHoisting::runOnMachineFunction(MachineFunction &Fn) {
+
+  if ((CPHoistFn != "") && (CPHoistFn != Fn.getFunction().getName()))
+    return false;
+
+  MFN = &Fn;
+  MRI = &Fn.getRegInfo();
+
+  LLVM_DEBUG(dbgs() << "\nCopy Hoisting:" << "\'" << Fn.getName() << "\'\n");
+
+  CopyMIList.clear();
+  CopyMIList.resize(Fn.getNumBlockIDs());
+
+  // Traverse through all basic blocks and collect copy instructions.
+  collectCopyInst();
+
+  // Traverse through the basic blocks again and move the COPY instructions
+  // that are present in all the successors of BB to BB.
+  bool Changed = false;
+  for (MachineBasicBlock *BB : post_order(&Fn)) {
+    if (!BB->empty()) {
+      if (BB->pred_size() != 1)
+        continue;
+      auto &BBCopyInst = CopyMIList[BB->getNumber()];
+      if (BBCopyInst.size() > 0)
+        Changed |= analyzeCopy(*BB->pred_begin());
+    }
+  }
+  // Re-compute liveness
+  if (Changed) {
+    LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+    SlotIndexes *SI = LIS.getSlotIndexes();
+    SI->releaseMemory();
+    SI->runOnMachineFunction(Fn);
+    LIS.releaseMemory();
+    LIS.runOnMachineFunction(Fn);
+  }
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Save all COPY instructions for each basic block in CopyMIList vector.
+//===----------------------------------------------------------------------===//
+void HexagonCopyHoisting::collectCopyInst() {
+  for (MachineBasicBlock &BB : *MFN) {
+#ifndef NDEBUG
+    auto &BBCopyInst = CopyMIList[BB.getNumber()];
+    LLVM_DEBUG(dbgs() << "Visiting BB#" << BB.getNumber() << ":\n");
+#endif
+
+    for (MachineInstr &MI : BB) {
+      if (MI.getOpcode() == TargetOpcode::COPY)
+        addMItoCopyList(&MI);
+    }
+    LLVM_DEBUG(dbgs() << "\tNumber of copies: " << BBCopyInst.size() << "\n");
+  }
+}
+
+void HexagonCopyHoisting::addMItoCopyList(MachineInstr *MI) {
+  unsigned BBNum = MI->getParent()->getNumber();
+  auto &BBCopyInst = CopyMIList[BBNum];
+  Register DstReg = MI->getOperand(0).getReg();
+  Register SrcReg = MI->getOperand(1).getReg();
+
+  if (!Register::isVirtualRegister(DstReg) ||
+      !Register::isVirtualRegister(SrcReg) ||
+      MRI->getRegClass(DstReg) != &Hexagon::IntRegsRegClass ||
+      MRI->getRegClass(SrcReg) != &Hexagon::IntRegsRegClass)
+    return;
+
+  BBCopyInst.insert(std::pair(std::pair(SrcReg, DstReg), MI));
+#ifndef NDEBUG
+  LLVM_DEBUG(dbgs() << "\tAdding Copy Instr to the list: " << MI << "\n");
+  for (auto II : BBCopyInst) {
+    MachineInstr *TempMI = II.getSecond();
+    LLVM_DEBUG(dbgs() << "\tIn the list: " << TempMI << "\n");
+  }
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Look at the COPY instructions of all the successors of BB. If the same
+// instruction is present in every successor and can be safely moved,
+// pull it into BB.
+//===----------------------------------------------------------------------===//
+bool HexagonCopyHoisting::analyzeCopy(MachineBasicBlock *BB) {
+
+  bool Changed = false;
+  if (BB->succ_size() < 2)
+    return false;
+
+  for (MachineBasicBlock *SB : BB->successors()) {
+    if (SB->pred_size() != 1 || SB->isEHPad() || SB->hasAddressTaken())
+      return false;
+  }
+
+  MachineBasicBlock *SBB1 = *BB->succ_begin();
+  auto &BBCopyInst1 = CopyMIList[SBB1->getNumber()];
+
+  for (auto II : BBCopyInst1) {
+    std::pair<Register, Register> Key = II.getFirst();
+    MachineInstr *MI = II.getSecond();
+    bool IsSafetoMove = true;
+    for (MachineBasicBlock *SuccBB : BB->successors()) {
+      auto &SuccBBCopyInst = CopyMIList[SuccBB->getNumber()];
+      if (!SuccBBCopyInst.count(Key)) {
+        // Same copy not present in this successor
+        IsSafetoMove = false;
+        break;
+      }
+      // If present, make sure that it's safe to pull this copy instruction
+      // into the predecessor.
+      MachineInstr *SuccMI = SuccBBCopyInst[Key];
+      if (!isSafetoMove(SuccMI)) {
+        IsSafetoMove = false;
+        break;
+      }
+    }
+    // If we have come this far, this copy instruction can be safely
+    // moved to the predecessor basic block.
+    if (IsSafetoMove) {
+      LLVM_DEBUG(dbgs() << "\t\t Moving instr to BB#" << BB->getNumber() << ": "
+                        << MI << "\n");
+      moveCopyInstr(BB, Key, MI);
+      // Add my into BB copyMI list.
+      Changed = true;
+    }
+  }
+
+#ifndef NDEBUG
+  auto &BBCopyInst = CopyMIList[BB->getNumber()];
+  for (auto II : BBCopyInst) {
+    MachineInstr *TempMI = II.getSecond();
+    LLVM_DEBUG(dbgs() << "\tIn the list: " << TempMI << "\n");
+  }
+#endif
+  return Changed;
+}
+
+bool HexagonCopyHoisting::isSafetoMove(MachineInstr *CandMI) {
+  // Make sure that it's safe to move this 'copy' instruction to the predecessor
+  // basic block.
+  assert(CandMI->getOperand(0).isReg() && CandMI->getOperand(1).isReg());
+  Register DefR = CandMI->getOperand(0).getReg();
+  Register UseR = CandMI->getOperand(1).getReg();
+
+  MachineBasicBlock *BB = CandMI->getParent();
+  // There should not be a def/use of DefR between the start of BB and CandMI.
+  MachineBasicBlock::iterator MII, MIE;
+  for (MII = BB->begin(), MIE = CandMI; MII != MIE; ++MII) {
+    MachineInstr *OtherMI = &*MII;
+    for (const MachineOperand &Mo : OtherMI->operands())
+      if (Mo.isReg() && Mo.getReg() == DefR)
+        return false;
+  }
+  // There should not be a def of UseR between the start of BB and CandMI.
+  for (MII = BB->begin(), MIE = CandMI; MII != MIE; ++MII) {
+    MachineInstr *OtherMI = &*MII;
+    for (const MachineOperand &Mo : OtherMI->operands())
+      if (Mo.isReg() && Mo.isDef() && Mo.getReg() == UseR)
+        return false;
+  }
+  return true;
+}
+
+void HexagonCopyHoisting::moveCopyInstr(MachineBasicBlock *DestBB,
+                                        std::pair<Register, Register> Key,
+                                        MachineInstr *MI) {
+  MachineBasicBlock::iterator FirstTI = DestBB->getFirstTerminator();
+  assert(FirstTI != DestBB->end());
+
+  DestBB->splice(FirstTI, MI->getParent(), MI);
+
+  addMItoCopyList(MI);
+  for (auto I = ++(DestBB->succ_begin()), E = DestBB->succ_end(); I != E; ++I) {
+    MachineBasicBlock *SuccBB = *I;
+    auto &BBCopyInst = CopyMIList[SuccBB->getNumber()];
+    MachineInstr *SuccMI = BBCopyInst[Key];
+    SuccMI->eraseFromParent();
+    BBCopyInst.erase(Key);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+INITIALIZE_PASS(HexagonCopyHoisting, "hexagon-move-phicopy",
+                "Hexagon move phi copy", false, false)
+
+FunctionPass *llvm::createHexagonCopyHoisting() {
+  return new HexagonCopyHoisting();
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 6b0315bc1bef..31e37dcce415 100644
--- a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -1006,8 +1006,7 @@ bool HexagonHardwareLoops::isInvalidLoopOperation(const MachineInstr *MI,
 
   static const Register Regs01[] = { LC0, SA0, LC1, SA1 };
   static const Register Regs1[]  = { LC1, SA1 };
-  auto CheckRegs = IsInnerHWLoop ? ArrayRef(Regs01, std::size(Regs01))
-                                 : ArrayRef(Regs1, std::size(Regs1));
+  auto CheckRegs = IsInnerHWLoop ? ArrayRef(Regs01) : ArrayRef(Regs1);
   for (Register R : CheckRegs)
     if (MI->modifiesRegister(R, TRI))
       return true;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index e64d7e52a9aa..3a792ecfd03d 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -43,8 +43,9 @@ cl::opt<unsigned> RDFFuncBlockLimit(
     "rdf-bb-limit", cl::Hidden, cl::init(1000),
     cl::desc("Basic block limit for a function for RDF optimizations"));
 
-static cl::opt<bool> DisableHardwareLoops("disable-hexagon-hwloops",
-  cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target"));
+static cl::opt<bool>
+    DisableHardwareLoops("disable-hexagon-hwloops", cl::Hidden,
+                         cl::desc("Disable Hardware Loops for Hexagon target"));
 
 static cl::opt<bool>
     DisableAModeOpt("disable-hexagon-amodeopt", cl::Hidden,
@@ -58,8 +59,9 @@ static cl::opt<bool>
     DisableHCP("disable-hcp", cl::Hidden,
                cl::desc("Disable Hexagon constant propagation"));
 
-static cl::opt<bool> DisableStoreWidening("disable-store-widen",
-  cl::Hidden, cl::init(false), cl::desc("Disable store widening"));
+static cl::opt<bool> DisableStoreWidening("disable-store-widen", cl::Hidden,
+                                          cl::init(false),
+                                          cl::desc("Disable store widening"));
 
 static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets",
                                           cl::init(true), cl::Hidden,
@@ -72,42 +74,53 @@ static cl::opt<bool> EnableTfrCleanup("hexagon-tfr-cleanup", cl::init(true),
 static cl::opt<bool> EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden,
                                    cl::desc("Enable early if-conversion"));
 
-static cl::opt<bool> EnableGenInsert("hexagon-insert", cl::init(true),
-  cl::Hidden, cl::desc("Generate \"insert\" instructions"));
+static cl::opt<bool> EnableCopyHoist("hexagon-copy-hoist", cl::init(true),
+                                     cl::Hidden, cl::ZeroOrMore,
+                                     cl::desc("Enable Hexagon copy hoisting"));
+
+static cl::opt<bool>
+    EnableGenInsert("hexagon-insert", cl::init(true), cl::Hidden,
+                    cl::desc("Generate \"insert\" instructions"));
 
 static cl::opt<bool>
     EnableCommGEP("hexagon-commgep", cl::init(true), cl::Hidden,
                   cl::desc("Enable commoning of GEP instructions"));
 
-static cl::opt<bool> EnableGenExtract("hexagon-extract", cl::init(true),
-  cl::Hidden, cl::desc("Generate \"extract\" instructions"));
+static cl::opt<bool>
+    EnableGenExtract("hexagon-extract", cl::init(true), cl::Hidden,
+                     cl::desc("Generate \"extract\" instructions"));
 
-static cl::opt<bool> EnableGenMux("hexagon-mux", cl::init(true), cl::Hidden,
-  cl::desc("Enable converting conditional transfers into MUX instructions"));
+static cl::opt<bool> EnableGenMux(
+    "hexagon-mux", cl::init(true), cl::Hidden,
+    cl::desc("Enable converting conditional transfers into MUX instructions"));
 
-static cl::opt<bool> EnableGenPred("hexagon-gen-pred", cl::init(true),
-  cl::Hidden, cl::desc("Enable conversion of arithmetic operations to "
-  "predicate instructions"));
+static cl::opt<bool>
+    EnableGenPred("hexagon-gen-pred", cl::init(true), cl::Hidden,
+                  cl::desc("Enable conversion of arithmetic operations to "
+                           "predicate instructions"));
 
 static cl::opt<bool>
     EnableLoopPrefetch("hexagon-loop-prefetch", cl::Hidden,
                        cl::desc("Enable loop data prefetch on Hexagon"));
 
-static cl::opt<bool> DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden,
-  cl::desc("Disable splitting double registers"));
+static cl::opt<bool>
+    DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden,
+                cl::desc("Disable splitting double registers"));
 
 static cl::opt<bool>
     EnableGenMemAbs("hexagon-mem-abs", cl::init(true), cl::Hidden,
                     cl::desc("Generate absolute set instructions"));
 
 static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true),
-  cl::Hidden, cl::desc("Bit simplification"));
+                                       cl::Hidden,
+                                       cl::desc("Bit simplification"));
 
 static cl::opt<bool> EnableLoopResched("hexagon-loop-resched", cl::init(true),
-  cl::Hidden, cl::desc("Loop rescheduling"));
+                                       cl::Hidden,
+                                       cl::desc("Loop rescheduling"));
 
-static cl::opt<bool> HexagonNoOpt("hexagon-noopt", cl::init(false),
-  cl::Hidden, cl::desc("Disable backend optimizations"));
+static cl::opt<bool> HexagonNoOpt("hexagon-noopt", cl::init(false), cl::Hidden,
+                                  cl::desc("Disable backend optimizations"));
 
 static cl::opt<bool>
     EnableVectorPrint("enable-hexagon-vector-print", cl::Hidden,
@@ -148,69 +161,72 @@ static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
 }
 
 static MachineSchedRegistry
-SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
-                    createVLIWMachineSched);
+    SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
+                        createVLIWMachineSched);
 
 namespace llvm {
-  extern char &HexagonExpandCondsetsID;
-  extern char &HexagonTfrCleanupID;
-  void initializeHexagonBitSimplifyPass(PassRegistry&);
-  void initializeHexagonConstExtendersPass(PassRegistry&);
-  void initializeHexagonConstPropagationPass(PassRegistry&);
-  void initializeHexagonCopyToCombinePass(PassRegistry&);
-  void initializeHexagonEarlyIfConversionPass(PassRegistry&);
-  void initializeHexagonExpandCondsetsPass(PassRegistry&);
-  void initializeHexagonGenMemAbsolutePass(PassRegistry &);
-  void initializeHexagonGenMuxPass(PassRegistry&);
-  void initializeHexagonHardwareLoopsPass(PassRegistry&);
-  void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
-  void initializeHexagonLoopAlignPass(PassRegistry &);
-  void initializeHexagonNewValueJumpPass(PassRegistry&);
-  void initializeHexagonOptAddrModePass(PassRegistry&);
-  void initializeHexagonPacketizerPass(PassRegistry&);
-  void initializeHexagonRDFOptPass(PassRegistry&);
-  void initializeHexagonSplitDoubleRegsPass(PassRegistry&);
-  void initializeHexagonTfrCleanupPass(PassRegistry &);
-  void initializeHexagonVExtractPass(PassRegistry &);
-  void initializeHexagonVectorCombineLegacyPass(PassRegistry&);
-  void initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PassRegistry &);
-  Pass *createHexagonLoopIdiomPass();
-  Pass *createHexagonVectorLoopCarriedReuseLegacyPass();
-
-  FunctionPass *createHexagonBitSimplify();
-  FunctionPass *createHexagonBranchRelaxation();
-  FunctionPass *createHexagonCallFrameInformation();
-  FunctionPass *createHexagonCFGOptimizer();
-  FunctionPass *createHexagonCommonGEP();
-  FunctionPass *createHexagonConstExtenders();
-  FunctionPass *createHexagonConstPropagationPass();
-  FunctionPass *createHexagonCopyToCombine();
-  FunctionPass *createHexagonEarlyIfConversion();
-  FunctionPass *createHexagonFixupHwLoops();
-  FunctionPass *createHexagonGenExtract();
-  FunctionPass *createHexagonGenInsert();
-  FunctionPass *createHexagonGenMemAbsolute();
-  FunctionPass *createHexagonGenMux();
-  FunctionPass *createHexagonGenPredicate();
-  FunctionPass *createHexagonHardwareLoops();
-  FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
-                                     CodeGenOptLevel OptLevel);
-  FunctionPass *createHexagonLoopAlign();
-  FunctionPass *createHexagonLoopRescheduling();
-  FunctionPass *createHexagonNewValueJump();
-  FunctionPass *createHexagonOptAddrMode();
-  FunctionPass *createHexagonOptimizeSZextends();
-  FunctionPass *createHexagonPacketizer(bool Minimal);
-  FunctionPass *createHexagonPeephole();
-  FunctionPass *createHexagonRDFOpt();
-  FunctionPass *createHexagonSplitConst32AndConst64();
-  FunctionPass *createHexagonSplitDoubleRegs();
-  FunctionPass *createHexagonStoreWidening();
-  FunctionPass *createHexagonTfrCleanup();
-  FunctionPass *createHexagonVectorCombineLegacyPass();
-  FunctionPass *createHexagonVectorPrint();
-  FunctionPass *createHexagonVExtract();
-} // end namespace llvm;
+extern char &HexagonCopyHoistingID;
+extern char &HexagonExpandCondsetsID;
+extern char &HexagonTfrCleanupID;
+void initializeHexagonBitSimplifyPass(PassRegistry &);
+void initializeHexagonCopyHoistingPass(PassRegistry &);
+void initializeHexagonConstExtendersPass(PassRegistry &);
+void initializeHexagonConstPropagationPass(PassRegistry &);
+void initializeHexagonCopyToCombinePass(PassRegistry &);
+void initializeHexagonEarlyIfConversionPass(PassRegistry &);
+void initializeHexagonExpandCondsetsPass(PassRegistry &);
+void initializeHexagonGenMemAbsolutePass(PassRegistry &);
+void initializeHexagonGenMuxPass(PassRegistry &);
+void initializeHexagonHardwareLoopsPass(PassRegistry &);
+void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
+void initializeHexagonLoopAlignPass(PassRegistry &);
+void initializeHexagonNewValueJumpPass(PassRegistry &);
+void initializeHexagonOptAddrModePass(PassRegistry &);
+void initializeHexagonPacketizerPass(PassRegistry &);
+void initializeHexagonRDFOptPass(PassRegistry &);
+void initializeHexagonSplitDoubleRegsPass(PassRegistry &);
+void initializeHexagonTfrCleanupPass(PassRegistry &);
+void initializeHexagonVExtractPass(PassRegistry &);
+void initializeHexagonVectorCombineLegacyPass(PassRegistry &);
+void initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PassRegistry &);
+Pass *createHexagonLoopIdiomPass();
+Pass *createHexagonVectorLoopCarriedReuseLegacyPass();
+
+FunctionPass *createHexagonBitSimplify();
+FunctionPass *createHexagonBranchRelaxation();
+FunctionPass *createHexagonCallFrameInformation();
+FunctionPass *createHexagonCFGOptimizer();
+FunctionPass *createHexagonCommonGEP();
+FunctionPass *createHexagonConstExtenders();
+FunctionPass *createHexagonConstPropagationPass();
+FunctionPass *createHexagonCopyHoisting();
+FunctionPass *createHexagonCopyToCombine();
+FunctionPass *createHexagonEarlyIfConversion();
+FunctionPass *createHexagonFixupHwLoops();
+FunctionPass *createHexagonGenExtract();
+FunctionPass *createHexagonGenInsert();
+FunctionPass *createHexagonGenMemAbsolute();
+FunctionPass *createHexagonGenMux();
+FunctionPass *createHexagonGenPredicate();
+FunctionPass *createHexagonHardwareLoops();
+FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
+                                   CodeGenOptLevel OptLevel);
+FunctionPass *createHexagonLoopAlign();
+FunctionPass *createHexagonLoopRescheduling();
+FunctionPass *createHexagonNewValueJump();
+FunctionPass *createHexagonOptAddrMode();
+FunctionPass *createHexagonOptimizeSZextends();
+FunctionPass *createHexagonPacketizer(bool Minimal);
+FunctionPass *createHexagonPeephole();
+FunctionPass *createHexagonRDFOpt();
+FunctionPass *createHexagonSplitConst32AndConst64();
+FunctionPass *createHexagonSplitDoubleRegs();
+FunctionPass *createHexagonStoreWidening();
+FunctionPass *createHexagonTfrCleanup();
+FunctionPass *createHexagonVectorCombineLegacyPass();
+FunctionPass *createHexagonVectorPrint();
+FunctionPass *createHexagonVExtract();
+} // namespace llvm
 
 static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
   return RM.value_or(Reloc::Static);
@@ -260,6 +276,7 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
           (HexagonNoOpt ? CodeGenOptLevel::None : OL)),
       TLOF(std::make_unique<HexagonTargetObjectFile>()),
       Subtarget(Triple(TT), CPU, FS, *this) {
+  initializeHexagonCopyHoistingPass(*PassRegistry::getPassRegistry());
   initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
   initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry());
   initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry());
@@ -269,10 +286,8 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
 const HexagonSubtarget *
 HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
   AttributeList FnAttrs = F.getAttributes();
-  Attribute CPUAttr =
-      FnAttrs.getFnAttr("target-cpu");
-  Attribute FSAttr =
-      FnAttrs.getFnAttr("target-features");
+  Attribute CPUAttr = FnAttrs.getFnAttr("target-cpu");
+  Attribute FSAttr = FnAttrs.getFnAttr("target-features");
 
   std::string CPU =
       CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
@@ -331,7 +346,7 @@ namespace {
 class HexagonPassConfig : public TargetPassConfig {
 public:
   HexagonPassConfig(HexagonTargetMachine &TM, PassManagerBase &PM)
-    : TargetPassConfig(TM, PM) {}
+      : TargetPassConfig(TM, PM) {}
 
   HexagonTargetMachine &getHexagonTargetMachine() const {
     return getTM<HexagonTargetMachine>();
@@ -433,6 +448,8 @@ void HexagonPassConfig::addPreRegAlloc() {
       addPass(createHexagonConstExtenders());
     if (EnableExpandCondsets)
       insertPass(&RegisterCoalescerID, &HexagonExpandCondsetsID);
+    if (EnableCopyHoist)
+      insertPass(&RegisterCoalescerID, &HexagonCopyHoistingID);
     if (EnableTfrCleanup)
       insertPass(&VirtRegRewriterID, &HexagonTfrCleanupID);
     if (!DisableStoreWidening)
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index dc8328a6705d..0a948402fb89 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -385,8 +385,7 @@ createMCAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
 static MCStreamer *createMCStreamer(Triple const &T, MCContext &Context,
                                     std::unique_ptr<MCAsmBackend> &&MAB,
                                     std::unique_ptr<MCObjectWriter> &&OW,
-                                    std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                    bool RelaxAll) {
+                                    std::unique_ptr<MCCodeEmitter> &&Emitter) {
   return createHexagonELFStreamer(T, Context, std::move(MAB), std::move(OW),
                                   std::move(Emitter));
 }
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
index 8f83c883e822..4a381c033b38 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -63,13 +63,12 @@ createLanaiMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
 static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
                                     std::unique_ptr<MCAsmBackend> &&MAB,
                                     std::unique_ptr<MCObjectWriter> &&OW,
-                                    std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                    bool RelaxAll) {
+                                    std::unique_ptr<MCCodeEmitter> &&Emitter) {
   if (!T.isOSBinFormatELF())
     llvm_unreachable("OS not supported");
 
   return createELFStreamer(Context, std::move(MAB), std::move(OW),
-                           std::move(Emitter), RelaxAll);
+                           std::move(Emitter));
 }
 
 static MCInstPrinter *createLanaiMCInstPrinter(const Triple & /*T*/,
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index 20284b18428b..73cb80245bca 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -118,6 +118,13 @@ class LoongArchAsmParser : public MCTargetAsmParser {
   // Helper to emit pseudo instruction "la.tls.gd $rd, $rj, sym".
   void emitLoadAddressTLSGDLarge(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
 
+  // Helper to emit pseudo instruction "la.tls.desc $rd, sym".
+  void emitLoadAddressTLSDescAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
+  void emitLoadAddressTLSDescPcrel(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
+  // Helper to emit pseudo instruction "la.tls.desc $rd, $rj, sym".
+  void emitLoadAddressTLSDescPcrelLarge(MCInst &Inst, SMLoc IDLoc,
+                                        MCStreamer &Out);
+
   // Helper to emit pseudo instruction "li.w/d $rd, $imm".
   void emitLoadImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
 
@@ -132,6 +139,7 @@ public:
     Match_RequiresOpnd2NotR0R1,
     Match_RequiresAMORdDifferRkRj,
     Match_RequiresLAORdDifferRj,
+    Match_RequiresLAORdR4,
 #define GET_OPERAND_DIAGNOSTIC_TYPES
 #include "LoongArchGenAsmMatcher.inc"
 #undef GET_OPERAND_DIAGNOSTIC_TYPES
@@ -267,7 +275,9 @@ public:
     bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None ||
                        VK == LoongArchMCExpr::VK_LoongArch_PCALA_LO12 ||
                        VK == LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12 ||
-                       VK == LoongArchMCExpr::VK_LoongArch_TLS_IE_PC_LO12;
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_IE_PC_LO12 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_PC_LO12 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_LD;
     return IsConstantImm
                ? isInt<12>(Imm) && IsValidKind
                : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
@@ -288,7 +298,9 @@ public:
                        VK == LoongArchMCExpr::VK_LoongArch_GOT64_PC_HI12 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_LE64_HI12 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_IE64_HI12 ||
-                       VK == LoongArchMCExpr::VK_LoongArch_TLS_IE64_PC_HI12;
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_IE64_PC_HI12 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC64_HI12 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC64_PC_HI12;
     return IsConstantImm
                ? isInt<12>(Imm) && IsValidKind
                : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
@@ -311,7 +323,8 @@ public:
                        VK == LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_LE_LO12 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_IE_LO12 ||
-                       VK == LoongArchMCExpr::VK_LoongArch_TLS_IE_PC_LO12;
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_IE_PC_LO12 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_LO12;
     return IsConstantImm
                ? isUInt<12>(Imm) && IsValidKind
                : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
@@ -334,7 +347,8 @@ public:
     bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
     bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None ||
                        VK == LoongArchMCExpr::VK_LoongArch_B16 ||
-                       VK == LoongArchMCExpr::VK_LoongArch_PCALA_LO12;
+                       VK == LoongArchMCExpr::VK_LoongArch_PCALA_LO12 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL;
     return IsConstantImm
                ? isShiftedInt<16, 2>(Imm) && IsValidKind
                : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
@@ -355,7 +369,8 @@ public:
                        VK == LoongArchMCExpr::VK_LoongArch_GOT_PC_HI20 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_IE_PC_HI20 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_LD_PC_HI20 ||
-                       VK == LoongArchMCExpr::VK_LoongArch_TLS_GD_PC_HI20;
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_GD_PC_HI20 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_PC_HI20;
     return IsConstantImm
                ? isInt<20>(Imm) && IsValidKind
                : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
@@ -375,7 +390,8 @@ public:
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_GD_HI20 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_LD_HI20 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_IE_HI20 ||
-                       VK == LoongArchMCExpr::VK_LoongArch_TLS_LE_HI20;
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_LE_HI20 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_HI20;
     return IsConstantImm
                ? isInt<20>(Imm) && IsValidKind
                : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
@@ -396,7 +412,9 @@ public:
                        VK == LoongArchMCExpr::VK_LoongArch_GOT64_PC_LO20 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_IE64_LO20 ||
                        VK == LoongArchMCExpr::VK_LoongArch_TLS_IE64_PC_LO20 ||
-                       VK == LoongArchMCExpr::VK_LoongArch_TLS_LE64_LO20;
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_LE64_LO20 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC64_PC_LO20 ||
+                       VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC64_LO20;
 
     return IsConstantImm
                ? isInt<20>(Imm) && IsValidKind
@@ -801,6 +819,13 @@ void LoongArchAsmParser::emitLAInstSeq(MCRegister DestReg, MCRegister TmpReg,
             MCInstBuilder(Opc).addReg(DestReg).addReg(DestReg).addImm(0),
             getSTI());
         continue;
+      } else if (VK == LoongArchMCExpr::VK_LoongArch_TLS_DESC_LD) {
+        Out.emitInstruction(MCInstBuilder(Opc)
+                                .addReg(LoongArch::R1)
+                                .addReg(DestReg)
+                                .addExpr(LE),
+                            getSTI());
+        continue;
       }
       Out.emitInstruction(
           MCInstBuilder(Opc).addReg(DestReg).addReg(DestReg).addExpr(LE),
@@ -833,6 +858,13 @@ void LoongArchAsmParser::emitLAInstSeq(MCRegister DestReg, MCRegister TmpReg,
           MCInstBuilder(Opc).addReg(DestReg).addReg(DestReg).addReg(TmpReg),
           getSTI());
       break;
+    case LoongArch::JIRL:
+      Out.emitInstruction(MCInstBuilder(Opc)
+                              .addReg(LoongArch::R1)
+                              .addReg(LoongArch::R1)
+                              .addExpr(LE),
+                          getSTI());
+      break;
     }
   }
 }
@@ -1116,6 +1148,109 @@ void LoongArchAsmParser::emitLoadAddressTLSGDLarge(MCInst &Inst, SMLoc IDLoc,
   emitLAInstSeq(DestReg, TmpReg, Symbol, Insts, IDLoc, Out);
 }
 
+void LoongArchAsmParser::emitLoadAddressTLSDescAbs(MCInst &Inst, SMLoc IDLoc,
+                                                   MCStreamer &Out) {
+  // `la.tls.desc $rd, sym` with `la-global-with-abs` feature
+  // for la32 expands to:
+  //   lu12i.w $rd, %desc_hi20(sym)
+  //   ori     $rd, $rd, %desc_lo12(sym)
+  //   ld.w    $ra, $rd, %desc_ld(sym)
+  //   jirl    $ra, $ra, %desc_call(sym)
+  //
+  // for la64 expands to:
+  //   lu12i.w $rd, %desc_hi20(sym)
+  //   ori     $rd, $rd, %desc_lo12(sym)
+  //   lu32i.d $rd, %desc64_lo20(sym)
+  //   lu52i.d $rd, $rd, %desc64_hi12(sym)
+  //   ld.d    $ra, $rd, %desc_ld(sym)
+  //   jirl    $ra, $ra, %desc_call(sym)
+  MCRegister DestReg = Inst.getOperand(0).getReg();
+  const MCExpr *Symbol = Inst.getOpcode() == LoongArch::PseudoLA_TLS_DESC_ABS
+                             ? Inst.getOperand(1).getExpr()
+                             : Inst.getOperand(2).getExpr();
+  unsigned LD = is64Bit() ? LoongArch::LD_D : LoongArch::LD_W;
+  InstSeq Insts;
+
+  Insts.push_back(LoongArchAsmParser::Inst(
+      LoongArch::LU12I_W, LoongArchMCExpr::VK_LoongArch_TLS_DESC_HI20));
+  Insts.push_back(LoongArchAsmParser::Inst(
+      LoongArch::ORI, LoongArchMCExpr::VK_LoongArch_TLS_DESC_LO12));
+
+  if (is64Bit()) {
+    Insts.push_back(LoongArchAsmParser::Inst(
+        LoongArch::LU32I_D, LoongArchMCExpr::VK_LoongArch_TLS_DESC64_LO20));
+    Insts.push_back(LoongArchAsmParser::Inst(
+        LoongArch::LU52I_D, LoongArchMCExpr::VK_LoongArch_TLS_DESC64_HI12));
+  }
+
+  Insts.push_back(
+      LoongArchAsmParser::Inst(LD, LoongArchMCExpr::VK_LoongArch_TLS_DESC_LD));
+  Insts.push_back(LoongArchAsmParser::Inst(
+      LoongArch::JIRL, LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL));
+
+  emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out);
+}
+
+void LoongArchAsmParser::emitLoadAddressTLSDescPcrel(MCInst &Inst, SMLoc IDLoc,
+                                                     MCStreamer &Out) {
+  // la.tls.desc $rd, sym
+  // expands to:
+  //   pcalau12i $rd, %desc_pc_hi20(sym)
+  //   addi.w/d  $rd, $rd, %desc_pc_lo12(sym)
+  //   ld.w/d    $ra, $rd, %desc_ld(sym)
+  //   jirl      $ra, $ra, %desc_call(sym)
+  MCRegister DestReg = Inst.getOperand(0).getReg();
+  const MCExpr *Symbol = Inst.getOperand(1).getExpr();
+  unsigned ADDI = is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W;
+  unsigned LD = is64Bit() ? LoongArch::LD_D : LoongArch::LD_W;
+  InstSeq Insts;
+
+  Insts.push_back(LoongArchAsmParser::Inst(
+      LoongArch::PCALAU12I, LoongArchMCExpr::VK_LoongArch_TLS_DESC_PC_HI20));
+  Insts.push_back(LoongArchAsmParser::Inst(
+      ADDI, LoongArchMCExpr::VK_LoongArch_TLS_DESC_PC_LO12));
+  Insts.push_back(
+      LoongArchAsmParser::Inst(LD, LoongArchMCExpr::VK_LoongArch_TLS_DESC_LD));
+  Insts.push_back(LoongArchAsmParser::Inst(
+      LoongArch::JIRL, LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL));
+
+  emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out);
+}
+
+void LoongArchAsmParser::emitLoadAddressTLSDescPcrelLarge(MCInst &Inst,
+                                                          SMLoc IDLoc,
+                                                          MCStreamer &Out) {
+  // la.tls.desc $rd, $rj, sym
+  // expands to:
+  //   pcalau12i $rd, %desc_pc_hi20(sym)
+  //   addi.d    $rj, $r0, %desc_pc_lo12(sym)
+  //   lu32i.d   $rj, %desc64_pc_lo20(sym)
+  //   lu52i.d   $rj, $rj, %desc64_pc_hi12(sym)
+  //   add.d     $rd, $rd, $rj
+  //   ld.w/d    $ra, $rd, %desc_ld(sym)
+  //   jirl      $ra, $ra, %desc_call(sym)
+  MCRegister DestReg = Inst.getOperand(0).getReg();
+  MCRegister TmpReg = Inst.getOperand(1).getReg();
+  const MCExpr *Symbol = Inst.getOperand(2).getExpr();
+  InstSeq Insts;
+
+  Insts.push_back(LoongArchAsmParser::Inst(
+      LoongArch::PCALAU12I, LoongArchMCExpr::VK_LoongArch_TLS_DESC_PC_HI20));
+  Insts.push_back(LoongArchAsmParser::Inst(
+      LoongArch::ADDI_D, LoongArchMCExpr::VK_LoongArch_TLS_DESC_PC_LO12));
+  Insts.push_back(LoongArchAsmParser::Inst(
+      LoongArch::LU32I_D, LoongArchMCExpr::VK_LoongArch_TLS_DESC64_PC_LO20));
+  Insts.push_back(LoongArchAsmParser::Inst(
+      LoongArch::LU52I_D, LoongArchMCExpr::VK_LoongArch_TLS_DESC64_PC_HI12));
+  Insts.push_back(LoongArchAsmParser::Inst(LoongArch::ADD_D));
+  Insts.push_back(LoongArchAsmParser::Inst(
+      LoongArch::LD_D, LoongArchMCExpr::VK_LoongArch_TLS_DESC_LD));
+  Insts.push_back(LoongArchAsmParser::Inst(
+      LoongArch::JIRL, LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL));
+
+  emitLAInstSeq(DestReg, TmpReg, Symbol, Insts, IDLoc, Out);
+}
+
 void LoongArchAsmParser::emitLoadImm(MCInst &Inst, SMLoc IDLoc,
                                      MCStreamer &Out) {
   MCRegister DestReg = Inst.getOperand(0).getReg();
@@ -1211,6 +1346,16 @@ bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
   case LoongArch::PseudoLA_TLS_GD_LARGE:
     emitLoadAddressTLSGDLarge(Inst, IDLoc, Out);
     return false;
+  case LoongArch::PseudoLA_TLS_DESC_ABS:
+  case LoongArch::PseudoLA_TLS_DESC_ABS_LARGE:
+    emitLoadAddressTLSDescAbs(Inst, IDLoc, Out);
+    return false;
+  case LoongArch::PseudoLA_TLS_DESC_PC:
+    emitLoadAddressTLSDescPcrel(Inst, IDLoc, Out);
+    return false;
+  case LoongArch::PseudoLA_TLS_DESC_PC_LARGE:
+    emitLoadAddressTLSDescPcrelLarge(Inst, IDLoc, Out);
+    return false;
   case LoongArch::PseudoLI_W:
   case LoongArch::PseudoLI_D:
     emitLoadImm(Inst, IDLoc, Out);
@@ -1238,6 +1383,15 @@ unsigned LoongArchAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
         return Match_RequiresAMORdDifferRkRj;
     }
     break;
+  case LoongArch::PseudoLA_TLS_DESC_ABS:
+  case LoongArch::PseudoLA_TLS_DESC_ABS_LARGE:
+  case LoongArch::PseudoLA_TLS_DESC_PC:
+  case LoongArch::PseudoLA_TLS_DESC_PC_LARGE: {
+    unsigned Rd = Inst.getOperand(0).getReg();
+    if (Rd != LoongArch::R4)
+      return Match_RequiresLAORdR4;
+    break;
+  }
   case LoongArch::PseudoLA_PCREL_LARGE:
   case LoongArch::PseudoLA_GOT_LARGE:
   case LoongArch::PseudoLA_TLS_IE_LARGE:
@@ -1376,6 +1530,8 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                  "$rd must be different from both $rk and $rj");
   case Match_RequiresLAORdDifferRj:
     return Error(Operands[1]->getStartLoc(), "$rd must be different from $rj");
+  case Match_RequiresLAORdR4:
+    return Error(Operands[1]->getStartLoc(), "$rd must be $r4");
   case Match_InvalidUImm1:
     return generateImmOutOfRangeError(Operands, ErrorInfo, /*Lower=*/0,
                                       /*Upper=*/(1 << 1) - 1);
diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
index ad39658f698e..c136f5b3e515 100644
--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
@@ -80,6 +80,9 @@ private:
   bool expandLoadAddressTLSGD(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
                               MachineBasicBlock::iterator &NextMBBI);
+  bool expandLoadAddressTLSDesc(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                MachineBasicBlock::iterator &NextMBBI);
 };
 
 char LoongArchPreRAExpandPseudo::ID = 0;
@@ -122,6 +125,8 @@ bool LoongArchPreRAExpandPseudo::expandMI(
     return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI);
   case LoongArch::PseudoLA_TLS_GD:
     return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_TLS_DESC_PC:
+    return expandLoadAddressTLSDesc(MBB, MBBI, NextMBBI);
   }
   return false;
 }
@@ -267,6 +272,52 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(
                                  SecondOpcode, LoongArchII::MO_GOT_PC_LO);
 }
 
+bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSDesc(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  // Code Sequence:
+  // pcalau12i $a0, %desc_pc_hi20(sym)
+  // addi.w/d  $a0, $a0, %desc_pc_lo12(sym)
+  // ld.w/d    $ra, $a0, %desc_ld(sym)
+  // jirl      $ra, $ra, %desc_ld(sym)
+  // add.d     $dst, $a0, $tp
+  MachineFunction *MF = MBB.getParent();
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+
+  const auto &STI = MF->getSubtarget<LoongArchSubtarget>();
+  unsigned ADD = STI.is64Bit() ? LoongArch::ADD_D : LoongArch::ADD_W;
+  unsigned ADDI = STI.is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W;
+  unsigned LD = STI.is64Bit() ? LoongArch::LD_D : LoongArch::LD_W;
+
+  Register DestReg = MI.getOperand(0).getReg();
+  Register ScratchReg =
+      MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
+  MachineOperand &Symbol = MI.getOperand(1);
+
+  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), ScratchReg)
+      .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_HI);
+
+  BuildMI(MBB, MBBI, DL, TII->get(ADDI), LoongArch::R4)
+      .addReg(ScratchReg)
+      .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_LO);
+
+  BuildMI(MBB, MBBI, DL, TII->get(LD), LoongArch::R1)
+      .addReg(LoongArch::R4)
+      .addDisp(Symbol, 0, LoongArchII::MO_DESC_LD);
+
+  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PseudoDESC_CALL), LoongArch::R1)
+      .addReg(LoongArch::R1)
+      .addDisp(Symbol, 0, LoongArchII::MO_DESC_CALL);
+
+  BuildMI(MBB, MBBI, DL, TII->get(ADD), DestReg)
+      .addReg(LoongArch::R4)
+      .addReg(LoongArch::R2);
+
+  MI.eraseFromParent();
+  return true;
+}
+
 class LoongArchExpandPseudo : public MachineFunctionPass {
 public:
   const LoongArchInstrInfo *TII;
@@ -313,6 +364,9 @@ private:
   bool expandLoadAddressTLSGDLarge(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI,
                                    MachineBasicBlock::iterator &NextMBBI);
+  bool expandLoadAddressTLSDescPcLarge(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI,
+                                       MachineBasicBlock::iterator &NextMBBI);
   bool expandFunctionCALL(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
                           MachineBasicBlock::iterator &NextMBBI,
@@ -361,6 +415,8 @@ bool LoongArchExpandPseudo::expandMI(MachineBasicBlock &MBB,
     return expandLoadAddressTLSLDLarge(MBB, MBBI, NextMBBI);
   case LoongArch::PseudoLA_TLS_GD_LARGE:
     return expandLoadAddressTLSGDLarge(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_TLS_DESC_PC_LARGE:
+    return expandLoadAddressTLSDescPcLarge(MBB, MBBI, NextMBBI);
   case LoongArch::PseudoCALL:
   case LoongArch::PseudoCALL_MEDIUM:
   case LoongArch::PseudoCALL_LARGE:
@@ -560,6 +616,58 @@ bool LoongArchExpandPseudo::expandLoadAddressTLSGDLarge(
                                 LoongArchII::MO_GD_PC_HI);
 }
 
+bool LoongArchExpandPseudo::expandLoadAddressTLSDescPcLarge(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  // Code Sequence:
+  //
+  // pcalau12i  $a0, %desc_pc_hi20(sym)
+  // addi.d     $t8, $zero, %desc_pc_lo12(sym)
+  // lu32i.d    $t8, %desc64_pc_lo20(sym)
+  // lu52i.d    $t8, $t8, %desc64_pc_hi12(sym)
+  // add.d      $a0, $a0, $t8
+  // ld.d       $ra, $a0, %desc_ld(sym)
+  // jirl       $ra, $ra, %desc_call(sym)
+  // add.d      $dst, $a0, $tp
+
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+  Register DestReg = MI.getOperand(0).getReg();
+  MachineOperand &Symbol = MI.getOperand(2);
+  Register ScratchReg = LoongArch::R20; // $t8
+
+  assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() &&
+         "Large code model requires LA64");
+
+  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), LoongArch::R4)
+      .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_HI);
+  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), ScratchReg)
+      .addReg(LoongArch::R0)
+      .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_LO);
+  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), ScratchReg)
+      .addReg(ScratchReg)
+      .addDisp(Symbol, 0, LoongArchII::MO_DESC64_PC_LO);
+  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), ScratchReg)
+      .addReg(ScratchReg)
+      .addDisp(Symbol, 0, LoongArchII::MO_DESC64_PC_HI);
+  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADD_D), LoongArch::R4)
+      .addReg(ScratchReg)
+      .addReg(LoongArch::R4);
+  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LD_D), LoongArch::R1)
+      .addReg(LoongArch::R4)
+      .addDisp(Symbol, 0, LoongArchII::MO_DESC_LD);
+  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PseudoDESC_CALL), LoongArch::R1)
+      .addReg(LoongArch::R1)
+      .addDisp(Symbol, 0, LoongArchII::MO_DESC_CALL);
+  BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADD_D), DestReg)
+      .addReg(LoongArch::R4)
+      .addReg(LoongArch::R2);
+
+  MI.eraseFromParent();
+
+  return true;
+}
+
 bool LoongArchExpandPseudo::expandFunctionCALL(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) {
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
index 2993726d2b64..4e504729b23e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
@@ -149,7 +149,8 @@ void LoongArchFrameLowering::processFunctionBeforeFrameFinalized(
 
   unsigned ScavSlotsNum = 0;
 
-  // Far branches beyond 27-bit offset require a spill slot for scratch register.
+  // Far branches beyond 27-bit offset require a spill slot for scratch
+  // register.
   bool IsLargeFunction = !isInt<27>(estimateFunctionSizeInBytes(TII, MF));
   if (IsLargeFunction)
     ScavSlotsNum = 1;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 285d5c2a63b2..d83fd2b4d25f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -903,6 +903,24 @@ SDValue LoongArchTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N,
   return LowerCallTo(CLI).first;
 }
 
+SDValue LoongArchTargetLowering::getTLSDescAddr(GlobalAddressSDNode *N,
+                                                SelectionDAG &DAG, unsigned Opc,
+                                                bool Large) const {
+  SDLoc DL(N);
+  EVT Ty = getPointerTy(DAG.getDataLayout());
+  const GlobalValue *GV = N->getGlobal();
+
+  // This is not actually used, but is necessary for successfully matching the
+  // PseudoLA_*_LARGE nodes.
+  SDValue Tmp = DAG.getConstant(0, DL, Ty);
+
+  // Use a PC-relative addressing mode to access the global dynamic GOT address.
+  // This generates the pattern (PseudoLA_TLS_DESC_PC{,LARGE} sym).
+  SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
+  return Large ? SDValue(DAG.getMachineNode(Opc, DL, Ty, Tmp, Addr), 0)
+               : SDValue(DAG.getMachineNode(Opc, DL, Ty, Addr), 0);
+}
+
 SDValue
 LoongArchTargetLowering::lowerGlobalTLSAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
@@ -916,42 +934,46 @@ LoongArchTargetLowering::lowerGlobalTLSAddress(SDValue Op,
   GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
   assert(N->getOffset() == 0 && "unexpected offset in global node");
 
-  SDValue Addr;
+  bool IsDesc = DAG.getTarget().useTLSDESC();
+
   switch (getTargetMachine().getTLSModel(N->getGlobal())) {
   case TLSModel::GeneralDynamic:
     // In this model, application code calls the dynamic linker function
     // __tls_get_addr to locate TLS offsets into the dynamic thread vector at
     // runtime.
-    Addr = getDynamicTLSAddr(N, DAG,
-                             Large ? LoongArch::PseudoLA_TLS_GD_LARGE
-                                   : LoongArch::PseudoLA_TLS_GD,
-                             Large);
+    if (!IsDesc)
+      return getDynamicTLSAddr(N, DAG,
+                               Large ? LoongArch::PseudoLA_TLS_GD_LARGE
+                                     : LoongArch::PseudoLA_TLS_GD,
+                               Large);
     break;
   case TLSModel::LocalDynamic:
     // Same as GeneralDynamic, except for assembly modifiers and relocation
     // records.
-    Addr = getDynamicTLSAddr(N, DAG,
-                             Large ? LoongArch::PseudoLA_TLS_LD_LARGE
-                                   : LoongArch::PseudoLA_TLS_LD,
-                             Large);
+    if (!IsDesc)
+      return getDynamicTLSAddr(N, DAG,
+                               Large ? LoongArch::PseudoLA_TLS_LD_LARGE
+                                     : LoongArch::PseudoLA_TLS_LD,
+                               Large);
     break;
   case TLSModel::InitialExec:
     // This model uses the GOT to resolve TLS offsets.
-    Addr = getStaticTLSAddr(N, DAG,
+    return getStaticTLSAddr(N, DAG,
                             Large ? LoongArch::PseudoLA_TLS_IE_LARGE
                                   : LoongArch::PseudoLA_TLS_IE,
                             Large);
-    break;
   case TLSModel::LocalExec:
     // This model is used when static linking as the TLS offsets are resolved
     // during program linking.
     //
     // This node doesn't need an extra argument for the large code model.
-    Addr = getStaticTLSAddr(N, DAG, LoongArch::PseudoLA_TLS_LE);
-    break;
+    return getStaticTLSAddr(N, DAG, LoongArch::PseudoLA_TLS_LE);
   }
 
-  return Addr;
+  return getTLSDescAddr(N, DAG,
+                        Large ? LoongArch::PseudoLA_TLS_DESC_PC_LARGE
+                              : LoongArch::PseudoLA_TLS_DESC_PC,
+                        Large);
 }
 
 template <unsigned N>
@@ -4928,7 +4950,8 @@ bool LoongArchTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   return TargetLowering::isZExtFree(Val, VT2);
 }
 
-bool LoongArchTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const {
+bool LoongArchTargetLowering::isSExtCheaperThanZExt(EVT SrcVT,
+                                                    EVT DstVT) const {
   return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64;
 }
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 4bb6c049f8d7..31b4d6519563 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -263,6 +263,8 @@ private:
                            unsigned Opc, bool Large = false) const;
   SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,
                             unsigned Opc, bool Large = false) const;
+  SDValue getTLSDescAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,
+                         unsigned Opc, bool Large = false) const;
   SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index 6576100d3b32..babb6632471b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -530,6 +530,12 @@ LoongArchInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
       {MO_IE_PC_LO, "loongarch-ie-pc-lo"},
       {MO_IE_PC64_LO, "loongarch-ie-pc64-lo"},
       {MO_IE_PC64_HI, "loongarch-ie-pc64-hi"},
+      {MO_DESC_PC_HI, "loongarch-desc-pc-hi"},
+      {MO_DESC_PC_LO, "loongarch-desc-pc-lo"},
+      {MO_DESC64_PC_LO, "loongarch-desc64-pc-lo"},
+      {MO_DESC64_PC_HI, "loongarch-desc64-pc-hi"},
+      {MO_DESC_LD, "loongarch-desc-ld"},
+      {MO_DESC_CALL, "loongarch-desc-call"},
       {MO_LD_PC_HI, "loongarch-ld-pc-hi"},
       {MO_GD_PC_HI, "loongarch-gd-pc-hi"}};
   return ArrayRef(TargetFlags);
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 80429bc45be1..a7f6eb9a79eb 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -1607,6 +1607,34 @@ def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst),
 } // Defs = [R20], Size = 20
 }
 
+// Used for expand PseudoLA_TLS_DESC_* instructions.
+let isCall = 1, isBarrier = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0,
+    Defs = [R4], Uses = [R4] in
+def PseudoDESC_CALL : Pseudo<(outs GPR:$rd), (ins GPR:$rj, simm16_lsl2:$imm16)>,
+                      PseudoInstExpansion<(JIRL GPR:$rd, GPR:$rj,
+                                           simm16_lsl2:$imm16)>;
+
+// TLSDESC
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
+    isAsmParserOnly = 1, Defs = [R1] in {
+def PseudoLA_TLS_DESC_ABS : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src),
+                                   [], "la.tls.desc", "$dst, $src">,
+                                   Requires<[IsLA32, HasLaGlobalWithAbs]>;
+def PseudoLA_TLS_DESC_ABS_LARGE : Pseudo<(outs GPR:$dst),
+                                         (ins GPR:$tmp, bare_symbol:$src), [],
+                                         "la.tls.desc", "$dst, $src">,
+                                  Requires<[IsLA64, HasLaGlobalWithAbs]>;
+def PseudoLA_TLS_DESC_PC : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+                                  "la.tls.desc", "$dst, $src">;
+}
+
+let isCall = 1, isBarrier = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0,
+    isCodeGenOnly = 0, isAsmParserOnly = 1, Defs = [R1, R4, R20], Size = 32 in
+def PseudoLA_TLS_DESC_PC_LARGE : Pseudo<(outs GPR:$dst),
+                                        (ins GPR:$tmp, bare_symbol:$src), [],
+                                        "la.tls.desc", "$dst, $tmp, $src">,
+                                 Requires<[IsLA64]>;
+
 // Load address inst alias: "la", "la.global" and "la.local".
 // Default:
 //     la = la.global = la.got
diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
index 98ad49f25e3f..d1d428241ebc 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
@@ -98,6 +98,24 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
   case LoongArchII::MO_CALL36:
     Kind = LoongArchMCExpr::VK_LoongArch_CALL36;
     break;
+  case LoongArchII::MO_DESC_PC_HI:
+    Kind = LoongArchMCExpr::VK_LoongArch_TLS_DESC_PC_HI20;
+    break;
+  case LoongArchII::MO_DESC_PC_LO:
+    Kind = LoongArchMCExpr::VK_LoongArch_TLS_DESC_PC_LO12;
+    break;
+  case LoongArchII::MO_DESC64_PC_LO:
+    Kind = LoongArchMCExpr::VK_LoongArch_TLS_DESC64_PC_LO20;
+    break;
+  case LoongArchII::MO_DESC64_PC_HI:
+    Kind = LoongArchMCExpr::VK_LoongArch_TLS_DESC64_PC_HI12;
+    break;
+  case LoongArchII::MO_DESC_LD:
+    Kind = LoongArchMCExpr::VK_LoongArch_TLS_DESC_LD;
+    break;
+  case LoongArchII::MO_DESC_CALL:
+    Kind = LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL;
+    break;
     // TODO: Handle more target-flags.
   }
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
index cecb4a50aa76..bfdfdcf36d66 100644
--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
+++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
@@ -31,21 +31,11 @@ class StringRef;
 
 class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
   virtual void anchor();
-  bool HasLA32 = false;
-  bool HasLA64 = false;
-  bool HasBasicF = false;
-  bool HasBasicD = false;
-  bool HasExtLSX = false;
-  bool HasExtLASX = false;
-  bool HasExtLVZ = false;
-  bool HasExtLBT = false;
-  bool HasLaGlobalWithPcrel = false;
-  bool HasLaGlobalWithAbs = false;
-  bool HasLaLocalWithAbs = false;
-  bool HasUAL = false;
-  bool HasLinkerRelax = false;
-  bool HasExpAutoVec = false;
-  bool HasFrecipe = false;
+
+#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                    \
+  bool ATTRIBUTE = DEFAULT;
+#include "LoongArchGenSubtargetInfo.inc"
+
   unsigned GRLen = 32;
   MVT GRLenVT = MVT::i32;
   LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown;
@@ -92,20 +82,12 @@ public:
   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
   }
+
+#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                    \
+  bool GETTER() const { return ATTRIBUTE; }
+#include "LoongArchGenSubtargetInfo.inc"
+
   bool is64Bit() const { return HasLA64; }
-  bool hasBasicF() const { return HasBasicF; }
-  bool hasBasicD() const { return HasBasicD; }
-  bool hasExtLSX() const { return HasExtLSX; }
-  bool hasExtLASX() const { return HasExtLASX; }
-  bool hasExtLVZ() const { return HasExtLVZ; }
-  bool hasExtLBT() const { return HasExtLBT; }
-  bool hasLaGlobalWithPcrel() const { return HasLaGlobalWithPcrel; }
-  bool hasLaGlobalWithAbs() const { return HasLaGlobalWithAbs; }
-  bool hasLaLocalWithAbs() const { return HasLaLocalWithAbs; }
-  bool hasUAL() const { return HasUAL; }
-  bool hasLinkerRelax() const { return HasLinkerRelax; }
-  bool hasExpAutoVec() const { return HasExpAutoVec; }
-  bool hasFrecipe() const { return HasFrecipe; }
   MVT getGRLenVT() const { return GRLenVT; }
   unsigned getGRLen() const { return GRLen; }
   LoongArchABI::ABI getTargetABI() const { return TargetABI; }
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
index 0692cb92b694..3c3fed7d43ed 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
@@ -47,7 +47,13 @@ enum {
   MO_IE_PC64_HI,
   MO_LD_PC_HI,
   MO_GD_PC_HI,
-  MO_CALL36
+  MO_CALL36,
+  MO_DESC_PC_HI,
+  MO_DESC_PC_LO,
+  MO_DESC64_PC_HI,
+  MO_DESC64_PC_LO,
+  MO_DESC_LD,
+  MO_DESC_CALL,
   // TODO: Add more flags.
 };
 } // end namespace LoongArchII
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp
index a6e15e09463d..9e56333e5fd9 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp
@@ -82,11 +82,9 @@ namespace llvm {
 MCELFStreamer *createLoongArchELFStreamer(MCContext &C,
                                           std::unique_ptr<MCAsmBackend> MAB,
                                           std::unique_ptr<MCObjectWriter> MOW,
-                                          std::unique_ptr<MCCodeEmitter> MCE,
-                                          bool RelaxAll) {
+                                          std::unique_ptr<MCCodeEmitter> MCE) {
   LoongArchELFStreamer *S = new LoongArchELFStreamer(
       C, std::move(MAB), std::move(MOW), std::move(MCE));
-  S->getAssembler().setRelaxAll(RelaxAll);
   return S;
 }
 } // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.h
index 220b54092c72..e220729d8923 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.h
@@ -25,7 +25,6 @@ public:
 MCELFStreamer *createLoongArchELFStreamer(MCContext &C,
                                           std::unique_ptr<MCAsmBackend> MAB,
                                           std::unique_ptr<MCObjectWriter> MOW,
-                                          std::unique_ptr<MCCodeEmitter> MCE,
-                                          bool RelaxAll);
+                                          std::unique_ptr<MCCodeEmitter> MCE);
 } // end namespace llvm
 #endif
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
index 0d19d2b0fb1f..fb0587bf3bed 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
@@ -114,6 +114,28 @@ enum Fixups {
   // 36-bit fixup corresponding to %call36(foo) for a pair instructions:
   // pcaddu18i+jirl.
   fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36,
+  // 20-bit fixup corresponding to %desc_pc_hi20(foo) for instruction pcalau12i.
+  fixup_loongarch_tls_desc_pc_hi20 =
+      FirstLiteralRelocationKind + ELF::R_LARCH_TLS_DESC_PC_HI20,
+  // 12-bit fixup corresponding to %desc_pc_lo12(foo) for instructions like
+  // addi.w/d.
+  fixup_loongarch_tls_desc_pc_lo12,
+  // 20-bit fixup corresponding to %desc64_pc_lo20(foo) for instruction lu32i.d.
+  fixup_loongarch_tls_desc64_pc_lo20,
+  // 12-bit fixup corresponding to %desc64_pc_hi12(foo) for instruction lu52i.d.
+  fixup_loongarch_tls_desc64_pc_hi12,
+  // 20-bit fixup corresponding to %desc_hi20(foo) for instruction lu12i.w.
+  fixup_loongarch_tls_desc_hi20,
+  // 12-bit fixup corresponding to %desc_lo12(foo) for instruction ori.
+  fixup_loongarch_tls_desc_lo12,
+  // 20-bit fixup corresponding to %desc64_lo20(foo) for instruction lu32i.d.
+  fixup_loongarch_tls_desc64_lo20,
+  // 12-bit fixup corresponding to %desc64_hi12(foo) for instruction lu52i.d.
+  fixup_loongarch_tls_desc64_hi12,
+  // 12-bit fixup corresponding to %desc_ld(foo) for instruction ld.w/d.
+  fixup_loongarch_tls_desc_ld,
+  // 12-bit fixup corresponding to %desc_call(foo) for instruction jirl.
+  fixup_loongarch_tls_desc_call,
 };
 } // end namespace LoongArch
 } // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
index 9ac0128f2517..83812dc3c62a 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -244,6 +244,36 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,
     case LoongArchMCExpr::VK_LoongArch_CALL36:
       FixupKind = LoongArch::fixup_loongarch_call36;
       break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_DESC_PC_HI20:
+      FixupKind = LoongArch::fixup_loongarch_tls_desc_pc_hi20;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_DESC_PC_LO12:
+      FixupKind = LoongArch::fixup_loongarch_tls_desc_pc_lo12;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_DESC64_PC_LO20:
+      FixupKind = LoongArch::fixup_loongarch_tls_desc64_pc_lo20;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_DESC64_PC_HI12:
+      FixupKind = LoongArch::fixup_loongarch_tls_desc64_pc_hi12;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_DESC_HI20:
+      FixupKind = LoongArch::fixup_loongarch_tls_desc_hi20;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_DESC_LO12:
+      FixupKind = LoongArch::fixup_loongarch_tls_desc_lo12;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_DESC64_LO20:
+      FixupKind = LoongArch::fixup_loongarch_tls_desc64_lo20;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_DESC64_HI12:
+      FixupKind = LoongArch::fixup_loongarch_tls_desc64_hi12;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_DESC_LD:
+      FixupKind = LoongArch::fixup_loongarch_tls_desc_ld;
+      break;
+    case LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL:
+      FixupKind = LoongArch::fixup_loongarch_tls_desc_call;
+      break;
     }
   } else if (Kind == MCExpr::SymbolRef &&
              cast<MCSymbolRefExpr>(Expr)->getKind() ==
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
index d6fa3b6e5096..34f9bc65ec77 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
@@ -140,6 +140,26 @@ StringRef LoongArchMCExpr::getVariantKindName(VariantKind Kind) {
     return "gd_hi20";
   case VK_LoongArch_CALL36:
     return "call36";
+  case VK_LoongArch_TLS_DESC_PC_HI20:
+    return "desc_pc_hi20";
+  case VK_LoongArch_TLS_DESC_PC_LO12:
+    return "desc_pc_lo12";
+  case VK_LoongArch_TLS_DESC64_PC_LO20:
+    return "desc64_pc_lo20";
+  case VK_LoongArch_TLS_DESC64_PC_HI12:
+    return "desc64_pc_hi12";
+  case VK_LoongArch_TLS_DESC_HI20:
+    return "desc_hi20";
+  case VK_LoongArch_TLS_DESC_LO12:
+    return "desc_lo12";
+  case VK_LoongArch_TLS_DESC64_LO20:
+    return "desc64_lo20";
+  case VK_LoongArch_TLS_DESC64_HI12:
+    return "desc64_hi12";
+  case VK_LoongArch_TLS_DESC_LD:
+    return "desc_ld";
+  case VK_LoongArch_TLS_DESC_CALL:
+    return "desc_call";
   }
 }
 
@@ -183,6 +203,16 @@ LoongArchMCExpr::getVariantKindForName(StringRef name) {
       .Case("gd_pc_hi20", VK_LoongArch_TLS_GD_PC_HI20)
       .Case("gd_hi20", VK_LoongArch_TLS_GD_HI20)
       .Case("call36", VK_LoongArch_CALL36)
+      .Case("desc_pc_hi20", VK_LoongArch_TLS_DESC_PC_HI20)
+      .Case("desc_pc_lo12", VK_LoongArch_TLS_DESC_PC_LO12)
+      .Case("desc64_pc_lo20", VK_LoongArch_TLS_DESC64_PC_LO20)
+      .Case("desc64_pc_hi12", VK_LoongArch_TLS_DESC64_PC_HI12)
+      .Case("desc_hi20", VK_LoongArch_TLS_DESC_HI20)
+      .Case("desc_lo12", VK_LoongArch_TLS_DESC_LO12)
+      .Case("desc64_lo20", VK_LoongArch_TLS_DESC64_LO20)
+      .Case("desc64_hi12", VK_LoongArch_TLS_DESC64_HI12)
+      .Case("desc_ld", VK_LoongArch_TLS_DESC_LD)
+      .Case("desc_call", VK_LoongArch_TLS_DESC_CALL)
       .Default(VK_LoongArch_Invalid);
 }
 
@@ -223,6 +253,8 @@ void LoongArchMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
   case VK_LoongArch_TLS_LD_HI20:
   case VK_LoongArch_TLS_GD_PC_HI20:
   case VK_LoongArch_TLS_GD_HI20:
+  case VK_LoongArch_TLS_DESC_PC_HI20:
+  case VK_LoongArch_TLS_DESC_HI20:
     break;
   }
   fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
index bd828116d7fa..71dd5bd14e4e 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
@@ -62,6 +62,16 @@ public:
     VK_LoongArch_TLS_GD_PC_HI20,
     VK_LoongArch_TLS_GD_HI20,
     VK_LoongArch_CALL36,
+    VK_LoongArch_TLS_DESC_PC_HI20,
+    VK_LoongArch_TLS_DESC_PC_LO12,
+    VK_LoongArch_TLS_DESC64_PC_LO20,
+    VK_LoongArch_TLS_DESC64_PC_HI12,
+    VK_LoongArch_TLS_DESC_HI20,
+    VK_LoongArch_TLS_DESC_LO12,
+    VK_LoongArch_TLS_DESC64_LO20,
+    VK_LoongArch_TLS_DESC64_HI12,
+    VK_LoongArch_TLS_DESC_LD,
+    VK_LoongArch_TLS_DESC_CALL,
     VK_LoongArch_Invalid // Must be the last item.
   };
 
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
index a4e6a09863e6..e40981f5b5cd 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
@@ -193,10 +193,9 @@ namespace {
 MCStreamer *createLoongArchELFStreamer(const Triple &T, MCContext &Context,
                                        std::unique_ptr<MCAsmBackend> &&MAB,
                                        std::unique_ptr<MCObjectWriter> &&MOW,
-                                       std::unique_ptr<MCCodeEmitter> &&MCE,
-                                       bool RelaxAll) {
+                                       std::unique_ptr<MCCodeEmitter> &&MCE) {
   return createLoongArchELFStreamer(Context, std::move(MAB), std::move(MOW),
-                                    std::move(MCE), RelaxAll);
+                                    std::move(MCE));
 }
 } // end namespace
 
diff --git a/llvm/lib/Target/M68k/M68kExpandPseudo.cpp b/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
index 7fcc65beaa65..c7fdd7d7c350 100644
--- a/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
+++ b/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
@@ -80,6 +80,13 @@ bool M68kExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
   default:
     return false;
 
+  case M68k::MOVI8di:
+    return TII->ExpandMOVI(MIB, MVT::i8);
+  case M68k::MOVI16ri:
+    return TII->ExpandMOVI(MIB, MVT::i16);
+  case M68k::MOVI32ri:
+    return TII->ExpandMOVI(MIB, MVT::i32);
+
   case M68k::MOVXd16d8:
     return TII->ExpandMOVX_RR(MIB, MVT::i16, MVT::i8);
   case M68k::MOVXd32d8:
diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp
index 63b32a619f35..62e4b36b5c9a 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp
@@ -3208,7 +3208,7 @@ M68kTargetLowering::EmitLoweredSelect(MachineInstr &MI,
   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
 
   MachineInstr *LastCCRSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
-  if (!LastCCRSUser->killsRegister(M68k::CCR) &&
+  if (!LastCCRSUser->killsRegister(M68k::CCR, /*TRI=*/nullptr) &&
       !checkAndUpdateCCRKill(LastCCRSUser, MBB, TRI)) {
     Copy0MBB->addLiveIn(M68k::CCR);
     SinkMBB->addLiveIn(M68k::CCR);
diff --git a/llvm/lib/Target/M68k/M68kInstrData.td b/llvm/lib/Target/M68k/M68kInstrData.td
index fa7e7aa0ed46..dc777a933e27 100644
--- a/llvm/lib/Target/M68k/M68kInstrData.td
+++ b/llvm/lib/Target/M68k/M68kInstrData.td
@@ -19,7 +19,7 @@
 ///
 ///  Pseudo:
 ///
-///     MOVSX [x]     MOVZX [x]     MOVX   [x]
+///     MOVI  [x]     MOVSX [x]     MOVZX [x]     MOVX   [x]
 ///
 ///  Map:
 ///
@@ -165,11 +165,12 @@ foreach AM = MxMoveSupportedAMs in {
 } // foreach AM
 
 // R <- I
+// No pattern, as all immediate -> register moves are matched to the MOVI pseudo
 class MxMove_RI<MxType TYPE, string DST_REG, MxMoveEncoding ENC,
                 MxImmOpBundle SRC = !cast<MxImmOpBundle>("MxOp"#TYPE.Size#"AddrMode_i"),
                 MxOpBundle DST = !cast<MxOpBundle>("MxOp"#TYPE.Size#"AddrMode_"#DST_REG)>
     : MxMove<TYPE.Prefix, (outs DST.Op:$dst), (ins SRC.Op:$src),
-              [(set TYPE.VT:$dst, SRC.ImmPat:$src)], ENC>;
+              [(null_frag)], ENC>;
 
 foreach REG = ["r", "a", "d"] in {
   foreach TYPE = !if(!eq(REG, "d"), [MxType8, MxType16, MxType32], [MxType16, MxType32]) in
@@ -243,6 +244,24 @@ def : Pat<(store   MxType32.BPat :$src, MxType32.JPat :$dst),
           (MOV32ji MxType32.JOp  :$dst, MxType32.IOp  :$src)>;
 
 //===----------------------------------------------------------------------===//
+// MOVEQ
+//===----------------------------------------------------------------------===//
+
+/// ------------+---------+---+-----------------------
+///  F  E  D  C | B  A  9 | 8 | 7  6  5  4  3  2  1  0
+/// ------------+---------+---+-----------------------
+///  0  1  1  1 |   REG   | 0 |          DATA
+/// ------------+---------+---+-----------------------
+
+// No pattern, as all immediate -> register moves are matched to the MOVI pseudo
+let Defs = [CCR] in
+def MOVQ : MxInst<(outs MxDRD32:$dst), (ins Mxi8imm:$imm),
+                  "moveq\t$imm, $dst",
+                  [(null_frag)]> {
+  let Inst = (descend 0b0111, (operand "$dst", 3), 0b0, (operand "$imm", 8));
+}
+
+//===----------------------------------------------------------------------===//
 // MOVEM
 //
 // The mask is already pre-processed by the save/restore spill hook
@@ -496,7 +515,23 @@ class MxPseudoMove_RR<MxType DST, MxType SRC, list<dag> PAT = []>
 
 class MxPseudoMove_RM<MxType DST, MxOperand SRCOpd, list<dag> PAT = []>
     : MxPseudo<(outs DST.ROp:$dst), (ins SRCOpd:$src), PAT>;
-}
+
+
+// These Pseudos handle loading immediates to registers.
+// They are expanded post-RA into either move or moveq instructions,
+// depending on size, destination register class, and immediate value.
+// This is done with pseudoinstructions in order to not constrain RA to
+// data registers if moveq matches.
+class MxPseudoMove_DI<MxType TYPE>
+    : MxPseudo<(outs TYPE.ROp:$dst), (ins TYPE.IOp:$src),
+               [(set TYPE.ROp:$dst, imm:$src)]>;
+
+// i8 imm -> reg can always be converted to moveq,
+// but we still emit a pseudo for consistency.
+def MOVI8di  : MxPseudoMove_DI<MxType8d>;
+def MOVI16ri : MxPseudoMove_DI<MxType16r>;
+def MOVI32ri : MxPseudoMove_DI<MxType32r>;
+} // let Defs = [CCR]
 
 /// This group of Pseudos is analogues to the real x86 extending moves, but
 /// since M68k does not have those we need to emulate. These instructions
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
index d56fef9e9029..338db45782c9 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
@@ -346,6 +346,40 @@ void M68kInstrInfo::AddZExt(MachineBasicBlock &MBB,
   BuildMI(MBB, I, DL, get(And), Reg).addReg(Reg).addImm(Mask);
 }
 
+// Convert MOVI to MOVQ if the target is a data register and the immediate
+// fits in a sign-extended i8, otherwise emit a plain MOV.
+bool M68kInstrInfo::ExpandMOVI(MachineInstrBuilder &MIB, MVT MVTSize) const {
+  Register Reg = MIB->getOperand(0).getReg();
+  int64_t Imm = MIB->getOperand(1).getImm();
+  bool IsAddressReg = false;
+
+  const auto *DR32 = RI.getRegClass(M68k::DR32RegClassID);
+  const auto *AR32 = RI.getRegClass(M68k::AR32RegClassID);
+  const auto *AR16 = RI.getRegClass(M68k::AR16RegClassID);
+
+  if (AR16->contains(Reg) || AR32->contains(Reg))
+    IsAddressReg = true;
+
+  LLVM_DEBUG(dbgs() << "Expand " << *MIB.getInstr() << " to ");
+
+  if (MVTSize == MVT::i8 || (!IsAddressReg && Imm >= -128 && Imm <= 127)) {
+    LLVM_DEBUG(dbgs() << "MOVEQ\n");
+
+    // We need to assign to the full register to make IV happy
+    Register SReg =
+        MVTSize == MVT::i32 ? Reg : Register(RI.getMatchingMegaReg(Reg, DR32));
+    assert(SReg && "No viable MEGA register available");
+
+    MIB->setDesc(get(M68k::MOVQ));
+    MIB->getOperand(0).setReg(SReg);
+  } else {
+    LLVM_DEBUG(dbgs() << "MOVE\n");
+    MIB->setDesc(get(MVTSize == MVT::i16 ? M68k::MOV16ri : M68k::MOV32ri));
+  }
+
+  return true;
+}
+
 bool M68kInstrInfo::ExpandMOVX_RR(MachineInstrBuilder &MIB, MVT MVTDst,
                                   MVT MVTSrc) const {
   unsigned Move = MVTDst == MVT::i16 ? M68k::MOV16rr : M68k::MOV32rr;
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.h b/llvm/lib/Target/M68k/M68kInstrInfo.h
index 577967f2fdfc..d1e1e1cd9998 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.h
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.h
@@ -302,6 +302,9 @@ public:
   void AddZExt(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                DebugLoc DL, unsigned Reg, MVT From, MVT To) const;
 
+  /// Move immediate to register
+  bool ExpandMOVI(MachineInstrBuilder &MIB, MVT MVTSize) const;
+
   /// Move across register classes without extension
   bool ExpandMOVX_RR(MachineInstrBuilder &MIB, MVT MVTDst, MVT MVTSrc) const;
 
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index 9843b6144343..e907e8d8a700 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -112,10 +112,11 @@ void MipsELFStreamer::EmitMipsOptionRecords() {
     I->EmitMipsOptionRecord();
 }
 
-MCELFStreamer *llvm::createMipsELFStreamer(
-    MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
-    std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
-    bool RelaxAll) {
+MCELFStreamer *
+llvm::createMipsELFStreamer(MCContext &Context,
+                            std::unique_ptr<MCAsmBackend> MAB,
+                            std::unique_ptr<MCObjectWriter> OW,
+                            std::unique_ptr<MCCodeEmitter> Emitter) {
   return new MipsELFStreamer(Context, std::move(MAB), std::move(OW),
                              std::move(Emitter));
 }
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index ac70e40d4dfe..051806d2cfe8 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -75,8 +75,7 @@ public:
 MCELFStreamer *createMipsELFStreamer(MCContext &Context,
                                      std::unique_ptr<MCAsmBackend> MAB,
                                      std::unique_ptr<MCObjectWriter> OW,
-                                     std::unique_ptr<MCCodeEmitter> Emitter,
-                                     bool RelaxAll);
+                                     std::unique_ptr<MCCodeEmitter> Emitter);
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSELFSTREAMER_H
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
index a84ca8ccfb2d..2722e34b3f62 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
@@ -22,11 +22,10 @@ bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
 bool baseRegNeedsLoadStoreMask(unsigned Reg);
 
 // This function creates an MCELFStreamer for Mips NaCl.
-MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context,
-                                         std::unique_ptr<MCAsmBackend> TAB,
-                                         std::unique_ptr<MCObjectWriter> OW,
-                                         std::unique_ptr<MCCodeEmitter> Emitter,
-                                         bool RelaxAll);
+MCELFStreamer *
+createMipsNaClELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
+                          std::unique_ptr<MCObjectWriter> OW,
+                          std::unique_ptr<MCCodeEmitter> Emitter);
 }
 
 #endif
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index d38b89f9a1f2..499cbd873e29 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -104,15 +104,14 @@ static MCInstPrinter *createMipsMCInstPrinter(const Triple &T,
 static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
                                     std::unique_ptr<MCAsmBackend> &&MAB,
                                     std::unique_ptr<MCObjectWriter> &&OW,
-                                    std::unique_ptr<MCCodeEmitter> &&Emitter,
-                                    bool RelaxAll) {
+                                    std::unique_ptr<MCCodeEmitter> &&Emitter) {
   MCStreamer *S;
   if (!T.isOSNaCl())
     S = createMipsELFStreamer(Context, std::move(MAB), std::move(OW),
-                              std::move(Emitter), RelaxAll);
+                              std::move(Emitter));
   else
     S = createMipsNaClELFStreamer(Context, std::move(MAB), std::move(OW),
-                                  std::move(Emitter), RelaxAll);
+                                  std::move(Emitter));
   return S;
 }
 
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
index 4ba0ae91e2f6..86194a9ebb61 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -259,15 +259,12 @@ bool baseRegNeedsLoadStoreMask(unsigned Reg) {
   return Reg != Mips::SP && Reg != Mips::T8;
 }
 
-MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context,
-                                         std::unique_ptr<MCAsmBackend> TAB,
-                                         std::unique_ptr<MCObjectWriter> OW,
-                                         std::unique_ptr<MCCodeEmitter> Emitter,
-                                         bool RelaxAll) {
+MCELFStreamer *
+createMipsNaClELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
+                          std::unique_ptr<MCObjectWriter> OW,
+                          std::unique_ptr<MCCodeEmitter> Emitter) {
   MipsNaClELFStreamer *S = new MipsNaClELFStreamer(
       Context, std::move(TAB), std::move(OW), std::move(Emitter));
-  if (RelaxAll)
-    S->getAssembler().setRelaxAll(true);
 
   // Set bundle-alignment as required by the NaCl ABI for the target.
   S->emitBundleAlignMode(MIPS_NACL_BUNDLE_ALIGN);
diff --git a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
index bef7607118ce..f609305bfee4 100644
--- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -1117,6 +1117,22 @@ def : MipsPat<(select i32:$cond, immz, i32:$f),
               ISA_MIPS32R6;
 }
 
+// llvm.fmin/fmax operations.
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsPat<(fmaxnum f32:$lhs, f32:$rhs),
+                (MAX_S   f32:$lhs, f32:$rhs)>,
+                ISA_MIPS32R6;
+  def : MipsPat<(fmaxnum f64:$lhs, f64:$rhs),
+                (MAX_D   f64:$lhs, f64:$rhs)>,
+                ISA_MIPS32R6;
+  def : MipsPat<(fminnum f32:$lhs, f32:$rhs),
+                (MIN_S   f32:$lhs, f32:$rhs)>,
+                ISA_MIPS32R6;
+  def : MipsPat<(fminnum f64:$lhs, f64:$rhs),
+                (MIN_D   f64:$lhs, f64:$rhs)>,
+                ISA_MIPS32R6;
+}
+
 // Pseudo instructions
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
     hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT], hasPostISelHook = 1 in {
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 7bc66b2d9f4b..8f7c47370ee5 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -358,6 +358,15 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
   setOperationAction(ISD::FCOPYSIGN,          MVT::f64,   Custom);
   setOperationAction(ISD::FP_TO_SINT,         MVT::i32,   Custom);
 
+  // Lower fmin and fmax operations for MIPS R6.
+  // Instructions are defined but never used.
+  if (Subtarget.hasMips32r6() || Subtarget.hasMips64r6()) {
+    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+    setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+  }
+
   if (Subtarget.isGP64bit()) {
     setOperationAction(ISD::GlobalAddress,      MVT::i64,   Custom);
     setOperationAction(ISD::BlockAddress,       MVT::i64,   Custom);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 897ee89323f0..142dd64ddea9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1752,8 +1752,9 @@ def ROTL64reg_sw :
             ".reg .b64 %lhs;\n\t"
             ".reg .b64 %rhs;\n\t"
             ".reg .u32 %amt2;\n\t"
-            "shl.b64 \t%lhs, $src, $amt;\n\t"
-            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "and.b32 \t%amt2, $amt, 63;\n\t"
+            "shl.b64 \t%lhs, $src, %amt2;\n\t"
+            "sub.u32 \t%amt2, 64, %amt2;\n\t"
             "shr.b64 \t%rhs, $src, %amt2;\n\t"
             "add.u64 \t$dst, %lhs, %rhs;\n\t"
             "}}",
@@ -1765,8 +1766,9 @@ def ROTR64reg_sw :
             ".reg .b64 %lhs;\n\t"
             ".reg .b64 %rhs;\n\t"
             ".reg .u32 %amt2;\n\t"
-            "shr.b64 \t%lhs, $src, $amt;\n\t"
-            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "and.b32 \t%amt2, $amt, 63;\n\t"
+            "shr.b64 \t%lhs, $src, %amt2;\n\t"
+            "sub.u32 \t%amt2, 64, %amt2;\n\t"
             "shl.b64 \t%rhs, $src, %amt2;\n\t"
             "add.u64 \t$dst, %lhs, %rhs;\n\t"
             "}}",
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index b849b7be7b7b..241078b03873 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -203,15 +203,16 @@ static MCStreamer *
 createPPCELFStreamer(const Triple &T, MCContext &Context,
                      std::unique_ptr<MCAsmBackend> &&MAB,
                      std::unique_ptr<MCObjectWriter> &&OW,
-                     std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll) {
+                     std::unique_ptr<MCCodeEmitter> &&Emitter) {
   return createPPCELFStreamer(Context, std::move(MAB), std::move(OW),
                               std::move(Emitter));
 }
 
-static MCStreamer *createPPCXCOFFStreamer(
-    const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> &&MAB,
-    std::unique_ptr<MCObjectWriter> &&OW,
-    std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll) {
+static MCStreamer *
+createPPCXCOFFStreamer(const Triple &T, MCContext &Context,
+                       std::unique_ptr<MCAsmBackend> &&MAB,
+                       std::unique_ptr<MCObjectWriter> &&OW,
+                       std::unique_ptr<MCCodeEmitter> &&Emitter) {
   return createPPCXCOFFStreamer(Context, std::move(MAB), std::move(OW),
                                 std::move(Emitter));
 }
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 6e1002c45d81..51b79dc2b04b 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -27,7 +27,7 @@
 #include "PPCTargetStreamer.h"
 #include "TargetInfo/PowerPCTargetInfo.h"
 #include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
@@ -242,7 +242,7 @@ class PPCAIXAsmPrinter : public PPCAsmPrinter {
 private:
   /// Symbols lowered from ExternalSymbolSDNodes, we will need to emit extern
   /// linkage for them in AIX.
-  SmallPtrSet<MCSymbol *, 8> ExtSymSDNodeSymbols;
+  SmallSetVector<MCSymbol *, 8> ExtSymSDNodeSymbols;
 
   /// A format indicator and unique trailing identifier to form part of the
   /// sinit/sterm function names.
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index 494e4b52a5b5..c6db8a7bbeb8 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -45,6 +45,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
 
 using namespace llvm;
 
@@ -95,6 +96,13 @@ static cl::opt<bool>
                            cl::desc("enable optimization of conditional traps"),
                            cl::init(false), cl::Hidden);
 
+DEBUG_COUNTER(
+    PeepholeXToICounter, "ppc-xtoi-peephole",
+    "Controls whether PPC reg+reg to reg+imm peephole is performed on a MI");
+
+DEBUG_COUNTER(PeepholePerOpCounter, "ppc-per-op-peephole",
+              "Controls whether PPC per opcode peephole is performed on a MI");
+
 namespace {
 
 struct PPCMIPeephole : public MachineFunctionPass {
@@ -469,6 +477,9 @@ bool PPCMIPeephole::simplifyCode() {
           if (MI.isDebugInstr())
             continue;
 
+          if (!DebugCounter::shouldExecute(PeepholeXToICounter))
+            continue;
+
           SmallSet<Register, 4> RRToRIRegsToUpdate;
           if (!TII->convertToImmediateForm(MI, RRToRIRegsToUpdate))
             continue;
@@ -538,6 +549,9 @@ bool PPCMIPeephole::simplifyCode() {
       if (MI.isDebugInstr())
         continue;
 
+      if (!DebugCounter::shouldExecute(PeepholePerOpCounter))
+        continue;
+
       // Per-opcode peepholes.
       switch (MI.getOpcode()) {
 
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 998b9181efe6..b9e8e1f33d3a 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -44,6 +44,13 @@ public:
 
 private:
   void addSPOperands(MCInst &MI) const;
+
+  DecodeStatus getInstruction32(MCInst &Instr, uint64_t &Size,
+                                ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                raw_ostream &CStream) const;
+  DecodeStatus getInstruction16(MCInst &Instr, uint64_t &Size,
+                                ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                raw_ostream &CStream) const;
 };
 } // end anonymous namespace
 
@@ -182,7 +189,7 @@ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, uint32_t RegNo,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeSR07RegisterClass(MCInst &Inst, uint64_t RegNo,
+static DecodeStatus DecodeSR07RegisterClass(MCInst &Inst, uint32_t RegNo,
                                             uint64_t Address,
                                             const void *Decoder) {
   if (RegNo >= 8)
@@ -255,12 +262,12 @@ static DecodeStatus DecodeVRM8RegisterClass(MCInst &Inst, uint32_t RegNo,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeVMaskReg(MCInst &Inst, uint64_t RegNo,
+static DecodeStatus decodeVMaskReg(MCInst &Inst, uint32_t RegNo,
                                    uint64_t Address,
                                    const MCDisassembler *Decoder) {
-  if (RegNo > 2) {
+  if (RegNo >= 2)
     return MCDisassembler::Fail;
-  }
+
   MCRegister Reg = (RegNo == 0) ? RISCV::V0 : RISCV::NoRegister;
 
   Inst.addOperand(MCOperand::createReg(Reg));
@@ -361,13 +368,13 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder);
 
-static DecodeStatus decodeZcmpRlist(MCInst &Inst, unsigned Imm,
+static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm,
                                     uint64_t Address, const void *Decoder);
 
 static DecodeStatus decodeRegReg(MCInst &Inst, uint32_t Insn, uint64_t Address,
                                  const MCDisassembler *Decoder);
 
-static DecodeStatus decodeZcmpSpimm(MCInst &Inst, unsigned Imm,
+static DecodeStatus decodeZcmpSpimm(MCInst &Inst, uint32_t Imm,
                                     uint64_t Address, const void *Decoder);
 
 static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
@@ -470,7 +477,7 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeZcmpRlist(MCInst &Inst, unsigned Imm,
+static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm,
                                     uint64_t Address, const void *Decoder) {
   if (Imm <= 3)
     return MCDisassembler::Fail;
@@ -487,7 +494,7 @@ static DecodeStatus decodeRegReg(MCInst &Inst, uint32_t Insn, uint64_t Address,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus decodeZcmpSpimm(MCInst &Inst, unsigned Imm,
+static DecodeStatus decodeZcmpSpimm(MCInst &Inst, uint32_t Imm,
                                     uint64_t Address, const void *Decoder) {
   Inst.addOperand(MCOperand::createImm(Imm));
   return MCDisassembler::Success;
@@ -502,21 +509,13 @@ void RISCVDisassembler::addSPOperands(MCInst &MI) const {
       MI.insert(MI.begin() + i, MCOperand::createReg(RISCV::X2));
 }
 
-DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
-                                               ArrayRef<uint8_t> Bytes,
-                                               uint64_t Address,
-                                               raw_ostream &CS) const {
-  // TODO: This will need modification when supporting instruction set
-  // extensions with instructions > 32-bits (up to 176 bits wide).
-  uint32_t Insn;
-  DecodeStatus Result;
-
 #define TRY_TO_DECODE_WITH_ADDITIONAL_OPERATION(FEATURE_CHECKS, DECODER_TABLE, \
                                                 DESC, ADDITIONAL_OPERATION)    \
   do {                                                                         \
     if (FEATURE_CHECKS) {                                                      \
       LLVM_DEBUG(dbgs() << "Trying " DESC ":\n");                              \
-      Result = decodeInstruction(DECODER_TABLE, MI, Insn, Address, this, STI); \
+      DecodeStatus Result =                                                    \
+          decodeInstruction(DECODER_TABLE, MI, Insn, Address, this, STI);      \
       if (Result != MCDisassembler::Fail) {                                    \
         ADDITIONAL_OPERATION;                                                  \
         return Result;                                                         \
@@ -532,104 +531,111 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
 #define TRY_TO_DECODE_FEATURE(FEATURE, DECODER_TABLE, DESC)                    \
   TRY_TO_DECODE(STI.hasFeature(FEATURE), DECODER_TABLE, DESC)
 
-  // It's a 32 bit instruction if bit 0 and 1 are 1.
-  if ((Bytes[0] & 0x3) == 0x3) {
-    if (Bytes.size() < 4) {
-      Size = 0;
-      return MCDisassembler::Fail;
-    }
-    Size = 4;
-
-    Insn = support::endian::read32le(Bytes.data());
-
-    TRY_TO_DECODE(STI.hasFeature(RISCV::FeatureStdExtZdinx) &&
-                      !STI.hasFeature(RISCV::Feature64Bit),
-                  DecoderTableRV32Zdinx32,
-                  "RV32Zdinx table (Double in Integer and rv32)");
-    TRY_TO_DECODE(STI.hasFeature(RISCV::FeatureStdExtZacas) &&
-                      !STI.hasFeature(RISCV::Feature64Bit),
-                  DecoderTableRV32Zacas32,
-                  "RV32Zacas table (Compare-And-Swap and rv32)");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureStdExtZfinx, DecoderTableRVZfinx32,
-                          "RVZfinx table (Float in Integer)");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXVentanaCondOps,
-                          DecoderTableXVentana32, "Ventana custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadBa, DecoderTableXTHeadBa32,
-                          "XTHeadBa custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadBb, DecoderTableXTHeadBb32,
-                          "XTHeadBb custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadBs, DecoderTableXTHeadBs32,
-                          "XTHeadBs custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadCondMov,
-                          DecoderTableXTHeadCondMov32,
-                          "XTHeadCondMov custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadCmo, DecoderTableXTHeadCmo32,
-                          "XTHeadCmo custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadFMemIdx,
-                          DecoderTableXTHeadFMemIdx32,
-                          "XTHeadFMemIdx custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadMac, DecoderTableXTHeadMac32,
-                          "XTHeadMac custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadMemIdx,
-                          DecoderTableXTHeadMemIdx32,
-                          "XTHeadMemIdx custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadMemPair,
-                          DecoderTableXTHeadMemPair32,
-                          "XTHeadMemPair custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadSync,
-                          DecoderTableXTHeadSync32,
-                          "XTHeadSync custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadVdot, DecoderTableXTHeadVdot32,
-                          "XTHeadVdot custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSfvcp, DecoderTableXSfvcp32,
-                          "SiFive VCIX custom opcode table");
-    TRY_TO_DECODE_FEATURE(
-        RISCV::FeatureVendorXSfvqmaccdod, DecoderTableXSfvqmaccdod32,
-        "SiFive Matrix Multiplication (2x8 and 8x2) Instruction opcode table");
-    TRY_TO_DECODE_FEATURE(
-        RISCV::FeatureVendorXSfvqmaccqoq, DecoderTableXSfvqmaccqoq32,
-        "SiFive Matrix Multiplication (4x8 and 8x4) Instruction opcode table");
-    TRY_TO_DECODE_FEATURE(
-        RISCV::FeatureVendorXSfvfwmaccqqq, DecoderTableXSfvfwmaccqqq32,
-        "SiFive Matrix Multiplication Instruction opcode table");
-    TRY_TO_DECODE_FEATURE(
-        RISCV::FeatureVendorXSfvfnrclipxfqf, DecoderTableXSfvfnrclipxfqf32,
-        "SiFive FP32-to-int8 Ranged Clip Instructions opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSiFivecdiscarddlone,
-                          DecoderTableXSiFivecdiscarddlone32,
-                          "SiFive sf.cdiscard.d.l1 custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSiFivecflushdlone,
-                          DecoderTableXSiFivecflushdlone32,
-                          "SiFive sf.cflush.d.l1 custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSfcease, DecoderTableXSfcease32,
-                          "SiFive sf.cease custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVbitmanip,
-                          DecoderTableXCVbitmanip32,
-                          "CORE-V Bit Manipulation custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVelw, DecoderTableXCVelw32,
-                          "CORE-V Event load custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVmac, DecoderTableXCVmac32,
-                          "CORE-V MAC custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVmem, DecoderTableXCVmem32,
-                          "CORE-V MEM custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCValu, DecoderTableXCValu32,
-                          "CORE-V ALU custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVsimd, DecoderTableXCVsimd32,
-                          "CORE-V SIMD extensions custom opcode table");
-    TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVbi, DecoderTableXCVbi32,
-                          "CORE-V Immediate Branching custom opcode table");
-    TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table");
-
+DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
+                                                 ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address,
+                                                 raw_ostream &CS) const {
+  if (Bytes.size() < 4) {
+    Size = 0;
     return MCDisassembler::Fail;
   }
+  Size = 4;
+
+  uint32_t Insn = support::endian::read32le(Bytes.data());
+
+  TRY_TO_DECODE(STI.hasFeature(RISCV::FeatureStdExtZdinx) &&
+                    !STI.hasFeature(RISCV::Feature64Bit),
+                DecoderTableRV32Zdinx32,
+                "RV32Zdinx table (Double in Integer and rv32)");
+  TRY_TO_DECODE(STI.hasFeature(RISCV::FeatureStdExtZacas) &&
+                    !STI.hasFeature(RISCV::Feature64Bit),
+                DecoderTableRV32Zacas32,
+                "RV32Zacas table (Compare-And-Swap and rv32)");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureStdExtZfinx, DecoderTableRVZfinx32,
+                        "RVZfinx table (Float in Integer)");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXVentanaCondOps,
+                        DecoderTableXVentana32, "Ventana custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadBa, DecoderTableXTHeadBa32,
+                        "XTHeadBa custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadBb, DecoderTableXTHeadBb32,
+                        "XTHeadBb custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadBs, DecoderTableXTHeadBs32,
+                        "XTHeadBs custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadCondMov,
+                        DecoderTableXTHeadCondMov32,
+                        "XTHeadCondMov custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadCmo, DecoderTableXTHeadCmo32,
+                        "XTHeadCmo custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadFMemIdx,
+                        DecoderTableXTHeadFMemIdx32,
+                        "XTHeadFMemIdx custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadMac, DecoderTableXTHeadMac32,
+                        "XTHeadMac custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadMemIdx,
+                        DecoderTableXTHeadMemIdx32,
+                        "XTHeadMemIdx custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadMemPair,
+                        DecoderTableXTHeadMemPair32,
+                        "XTHeadMemPair custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadSync,
+                        DecoderTableXTHeadSync32,
+                        "XTHeadSync custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadVdot,
+                        DecoderTableXTHeadVdot32,
+                        "XTHeadVdot custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSfvcp, DecoderTableXSfvcp32,
+                        "SiFive VCIX custom opcode table");
+  TRY_TO_DECODE_FEATURE(
+      RISCV::FeatureVendorXSfvqmaccdod, DecoderTableXSfvqmaccdod32,
+      "SiFive Matrix Multiplication (2x8 and 8x2) Instruction opcode table");
+  TRY_TO_DECODE_FEATURE(
+      RISCV::FeatureVendorXSfvqmaccqoq, DecoderTableXSfvqmaccqoq32,
+      "SiFive Matrix Multiplication (4x8 and 8x4) Instruction opcode table");
+  TRY_TO_DECODE_FEATURE(
+      RISCV::FeatureVendorXSfvfwmaccqqq, DecoderTableXSfvfwmaccqqq32,
+      "SiFive Matrix Multiplication Instruction opcode table");
+  TRY_TO_DECODE_FEATURE(
+      RISCV::FeatureVendorXSfvfnrclipxfqf, DecoderTableXSfvfnrclipxfqf32,
+      "SiFive FP32-to-int8 Ranged Clip Instructions opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSiFivecdiscarddlone,
+                        DecoderTableXSiFivecdiscarddlone32,
+                        "SiFive sf.cdiscard.d.l1 custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSiFivecflushdlone,
+                        DecoderTableXSiFivecflushdlone32,
+                        "SiFive sf.cflush.d.l1 custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSfcease, DecoderTableXSfcease32,
+                        "SiFive sf.cease custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVbitmanip,
+                        DecoderTableXCVbitmanip32,
+                        "CORE-V Bit Manipulation custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVelw, DecoderTableXCVelw32,
+                        "CORE-V Event load custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVmac, DecoderTableXCVmac32,
+                        "CORE-V MAC custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVmem, DecoderTableXCVmem32,
+                        "CORE-V MEM custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCValu, DecoderTableXCValu32,
+                        "CORE-V ALU custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVsimd, DecoderTableXCVsimd32,
+                        "CORE-V SIMD extensions custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVbi, DecoderTableXCVbi32,
+                        "CORE-V Immediate Branching custom opcode table");
+  TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table");
 
+  return MCDisassembler::Fail;
+}
+
+DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
+                                                 ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address,
+                                                 raw_ostream &CS) const {
   if (Bytes.size() < 2) {
     Size = 0;
     return MCDisassembler::Fail;
   }
   Size = 2;
 
-  Insn = support::endian::read16le(Bytes.data());
+  uint32_t Insn = support::endian::read16le(Bytes.data());
   TRY_TO_DECODE_AND_ADD_SP(!STI.hasFeature(RISCV::Feature64Bit),
                            DecoderTableRISCV32Only_16,
                            "RISCV32Only_16 table (16-bit Instruction)");
@@ -645,3 +651,49 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
 
   return MCDisassembler::Fail;
 }
+
+DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                               ArrayRef<uint8_t> Bytes,
+                                               uint64_t Address,
+                                               raw_ostream &CS) const {
+  // It's a 16 bit instruction if bit 0 and 1 are not 0b11.
+  if ((Bytes[0] & 0b11) != 0b11)
+    return getInstruction16(MI, Size, Bytes, Address, CS);
+
+  // It's a 32 bit instruction if bit 1:0 are 0b11(checked above) and bits 4:2
+  // are not 0b111.
+  if ((Bytes[0] & 0b1'1100) != 0b1'1100)
+    return getInstruction32(MI, Size, Bytes, Address, CS);
+
+  // 48-bit instructions are encoded as 0bxx011111.
+  if ((Bytes[0] & 0b11'1111) == 0b01'1111) {
+    Size = Bytes.size() >= 6 ? 6 : 0;
+    return MCDisassembler::Fail;
+  }
+
+  // 64-bit instructions are encoded as 0x0111111.
+  if ((Bytes[0] & 0b111'1111) == 0b011'1111) {
+    Size = Bytes.size() >= 8 ? 8 : 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Remaining cases need to check a second byte.
+  if (Bytes.size() < 2) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // 80-bit through 176-bit instructions are encoded as 0bxnnnxxxx_x1111111.
+  // Where the number of bits is (80 + (nnn * 16)) for nnn != 0b111.
+  unsigned nnn = (Bytes[1] >> 4) & 0b111;
+  if (nnn != 0b111) {
+    Size = 10 + (nnn * 2);
+    if (Bytes.size() < Size)
+      Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Remaining encodings are reserved for > 176-bit instructions.
+  Size = 0;
+  return MCDisassembler::Fail;
+}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index cdf7c048a4bf..ae7ce476fff2 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -197,11 +197,9 @@ namespace llvm {
 MCELFStreamer *createRISCVELFStreamer(MCContext &C,
                                       std::unique_ptr<MCAsmBackend> MAB,
                                       std::unique_ptr<MCObjectWriter> MOW,
-                                      std::unique_ptr<MCCodeEmitter> MCE,
-                                      bool RelaxAll) {
+                                      std::unique_ptr<MCCodeEmitter> MCE) {
   RISCVELFStreamer *S =
       new RISCVELFStreamer(C, std::move(MAB), std::move(MOW), std::move(MCE));
-  S->getAssembler().setRelaxAll(RelaxAll);
   return S;
 }
 } // namespace llvm
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
index e8f29cd8449b..212d731889f1 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -75,7 +75,6 @@ public:
 MCELFStreamer *createRISCVELFStreamer(MCContext &C,
                                       std::unique_ptr<MCAsmBackend> MAB,
                                       std::unique_ptr<MCObjectWriter> MOW,
-                                      std::unique_ptr<MCCodeEmitter> MCE,
-                                      bool RelaxAll);
+                                      std::unique_ptr<MCCodeEmitter> MCE);
 }
 #endif
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 12a69842ab4c..691a5892ae82 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -332,10 +332,9 @@ namespace {
 MCStreamer *createRISCVELFStreamer(const Triple &T, MCContext &Context,
                                    std::unique_ptr<MCAsmBackend> &&MAB,
                                    std::unique_ptr<MCObjectWriter> &&MOW,
-                                   std::unique_ptr<MCCodeEmitter> &&MCE,
-                                   bool RelaxAll) {
+                                   std::unique_ptr<MCCodeEmitter> &&MCE) {
   return createRISCVELFStreamer(Context, std::move(MAB), std::move(MOW),
-                                std::move(MCE), RelaxAll);
+                                std::move(MCE));
 }
 } // end anonymous namespace
 
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index 9fb84efd5b6f..09f496574d64 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -15,6 +15,12 @@ include "llvm/Target/Target.td"
 include "RISCVFeatures.td"
 
 //===----------------------------------------------------------------------===//
+// RISC-V profiles supported.
+//===----------------------------------------------------------------------===//
+
+include "RISCVProfiles.td"
+
+//===----------------------------------------------------------------------===//
 // Named operands for CSR instructions.
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index b0568297a470..dc3ad5ac5908 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2099,8 +2099,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     MVT SubVecContainerVT = SubVecVT;
     // Establish the correct scalable-vector types for any fixed-length type.
     if (SubVecVT.isFixedLengthVector()) {
-      assert(Idx == 0 && V.isUndef());
       SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT);
+      TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
+      [[maybe_unused]] bool ExactlyVecRegSized =
+          Subtarget->expandVScale(SubVecVT.getSizeInBits())
+              .isKnownMultipleOf(Subtarget->expandVScale(VecRegSize));
+      assert(isPowerOf2_64(Subtarget->expandVScale(SubVecVT.getSizeInBits())
+                               .getKnownMinValue()));
+      assert(Idx == 0 && (ExactlyVecRegSized || V.isUndef()));
     }
     MVT ContainerVT = VT;
     if (VT.isFixedLengthVector())
@@ -3668,7 +3674,6 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
   }
 
   // Skip if True has side effect.
-  // TODO: Support vleff and vlsegff.
   if (TII->get(TrueOpc).hasUnmodeledSideEffects())
     return false;
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6529ab7a84a1..19ef1f2f18ec 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -524,8 +524,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     if (Subtarget.is64Bit())
       setOperationAction(ISD::FPOWI, MVT::i32, Custom);
 
-    if (!Subtarget.hasStdExtZfa())
-      setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
+    setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16,
+                       Subtarget.hasStdExtZfa() ? Legal : Custom);
   }
 
   if (Subtarget.hasStdExtFOrZfinx()) {
@@ -548,10 +548,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Custom);
 
-    if (Subtarget.hasStdExtZfa())
+    if (Subtarget.hasStdExtZfa()) {
       setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
-    else
+      setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
+    } else {
       setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Custom);
+    }
   }
 
   if (Subtarget.hasStdExtFOrZfinx() && Subtarget.is64Bit())
@@ -566,6 +568,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     if (Subtarget.hasStdExtZfa()) {
       setOperationAction(FPRndMode, MVT::f64, Legal);
       setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+      setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f64, Legal);
     } else {
       if (Subtarget.is64Bit())
         setOperationAction(FPRndMode, MVT::f64, Custom);
@@ -695,7 +698,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         ISD::VP_SMAX,        ISD::VP_UMIN,        ISD::VP_UMAX,
         ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE,
         ISD::VP_SADDSAT,     ISD::VP_UADDSAT,     ISD::VP_SSUBSAT,
-        ISD::VP_USUBSAT};
+        ISD::VP_USUBSAT,     ISD::VP_CTTZ_ELTS,   ISD::VP_CTTZ_ELTS_ZERO_UNDEF};
 
     static const unsigned FloatingPointVPOps[] = {
         ISD::VP_FADD,        ISD::VP_FSUB,        ISD::VP_FMUL,
@@ -756,6 +759,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
           {ISD::SELECT_CC, ISD::VSELECT, ISD::VP_MERGE, ISD::VP_SELECT}, VT,
           Expand);
 
+      setOperationAction({ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF}, VT,
+                         Custom);
+
       setOperationAction({ISD::VP_AND, ISD::VP_OR, ISD::VP_XOR}, VT, Custom);
 
       setOperationAction(
@@ -5338,6 +5344,44 @@ RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op,
   return Res;
 }
 
+SDValue RISCVTargetLowering::lowerVPCttzElements(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT XLenVT = Subtarget.getXLenVT();
+  SDValue Source = Op->getOperand(0);
+  MVT SrcVT = Source.getSimpleValueType();
+  SDValue Mask = Op->getOperand(1);
+  SDValue EVL = Op->getOperand(2);
+
+  if (SrcVT.isFixedLengthVector()) {
+    MVT ContainerVT = getContainerForFixedLengthVector(SrcVT);
+    Source = convertToScalableVector(ContainerVT, Source, DAG, Subtarget);
+    Mask = convertToScalableVector(getMaskTypeFor(ContainerVT), Mask, DAG,
+                                   Subtarget);
+    SrcVT = ContainerVT;
+  }
+
+  // Convert to boolean vector.
+  if (SrcVT.getScalarType() != MVT::i1) {
+    SDValue AllZero = DAG.getConstant(0, DL, SrcVT);
+    SrcVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorElementCount());
+    Source = DAG.getNode(RISCVISD::SETCC_VL, DL, SrcVT,
+                         {Source, AllZero, DAG.getCondCode(ISD::SETNE),
+                          DAG.getUNDEF(SrcVT), Mask, EVL});
+  }
+
+  SDValue Res = DAG.getNode(RISCVISD::VFIRST_VL, DL, XLenVT, Source, Mask, EVL);
+  if (Op->getOpcode() == ISD::VP_CTTZ_ELTS_ZERO_UNDEF)
+    // In this case, we can interpret poison as -1, so nothing to do further.
+    return Res;
+
+  // Convert -1 to VL.
+  SDValue SetCC =
+      DAG.getSetCC(DL, XLenVT, Res, DAG.getConstant(0, DL, XLenVT), ISD::SETLT);
+  Res = DAG.getSelect(DL, XLenVT, SetCC, EVL, Res);
+  return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Res);
+}
+
 // While RVV has alignment restrictions, we should always be able to load as a
 // legal equivalently-sized byte-typed vector instead. This method is
 // responsible for re-expressing a ISD::LOAD via a correctly-aligned type. If
@@ -6592,6 +6636,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     if (Op.getOperand(1).getValueType().getVectorElementType() == MVT::i1)
       return lowerVectorMaskVecReduction(Op, DAG, /*IsVP*/ true);
     return lowerVPREDUCE(Op, DAG);
+  case ISD::VP_CTTZ_ELTS:
+  case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
+    return lowerVPCttzElements(Op, DAG);
   case ISD::UNDEF: {
     MVT ContainerVT = getContainerForFixedLengthVector(Op.getSimpleValueType());
     return convertFromScalableVector(Op.getSimpleValueType(),
@@ -9769,12 +9816,13 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     }
   }
 
-  // If the subvector vector is a fixed-length type, we cannot use subregister
-  // manipulation to simplify the codegen; we don't know which register of a
-  // LMUL group contains the specific subvector as we only know the minimum
-  // register size. Therefore we must slide the vector group up the full
-  // amount.
-  if (SubVecVT.isFixedLengthVector()) {
+  // If the subvector vector is a fixed-length type and we don't know VLEN
+  // exactly, we cannot use subregister manipulation to simplify the codegen; we
+  // don't know which register of a LMUL group contains the specific subvector
+  // as we only know the minimum register size. Therefore we must slide the
+  // vector group up the full amount.
+  const auto VLen = Subtarget.getRealVLen();
+  if (SubVecVT.isFixedLengthVector() && !VLen) {
     if (OrigIdx == 0 && Vec.isUndef() && !VecVT.isFixedLengthVector())
       return Op;
     MVT ContainerVT = VecVT;
@@ -9822,41 +9870,90 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     return DAG.getBitcast(Op.getValueType(), SubVec);
   }
 
-  unsigned SubRegIdx, RemIdx;
-  std::tie(SubRegIdx, RemIdx) =
-      RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
-          VecVT, SubVecVT, OrigIdx, TRI);
+  MVT ContainerVecVT = VecVT;
+  if (VecVT.isFixedLengthVector()) {
+    ContainerVecVT = getContainerForFixedLengthVector(VecVT);
+    Vec = convertToScalableVector(ContainerVecVT, Vec, DAG, Subtarget);
+  }
+
+  MVT ContainerSubVecVT = SubVecVT;
+  if (SubVecVT.isFixedLengthVector()) {
+    ContainerSubVecVT = getContainerForFixedLengthVector(SubVecVT);
+    SubVec = convertToScalableVector(ContainerSubVecVT, SubVec, DAG, Subtarget);
+  }
+
+  unsigned SubRegIdx;
+  ElementCount RemIdx;
+  // insert_subvector scales the index by vscale if the subvector is scalable,
+  // and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if
+  // we have a fixed length subvector, we need to adjust the index by 1/vscale.
+  if (SubVecVT.isFixedLengthVector()) {
+    assert(VLen);
+    unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock;
+    auto Decompose =
+        RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+            ContainerVecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI);
+    SubRegIdx = Decompose.first;
+    RemIdx = ElementCount::getFixed((Decompose.second * Vscale) +
+                                    (OrigIdx % Vscale));
+  } else {
+    auto Decompose =
+        RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+            ContainerVecVT, ContainerSubVecVT, OrigIdx, TRI);
+    SubRegIdx = Decompose.first;
+    RemIdx = ElementCount::getScalable(Decompose.second);
+  }
 
-  RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
-  bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 ||
-                         SubVecLMUL == RISCVII::VLMUL::LMUL_F4 ||
-                         SubVecLMUL == RISCVII::VLMUL::LMUL_F8;
+  TypeSize VecRegSize = TypeSize::getScalable(RISCV::RVVBitsPerBlock);
+  assert(isPowerOf2_64(
+      Subtarget.expandVScale(SubVecVT.getSizeInBits()).getKnownMinValue()));
+  bool ExactlyVecRegSized =
+      Subtarget.expandVScale(SubVecVT.getSizeInBits())
+          .isKnownMultipleOf(Subtarget.expandVScale(VecRegSize));
 
   // 1. If the Idx has been completely eliminated and this subvector's size is
   // a vector register or a multiple thereof, or the surrounding elements are
   // undef, then this is a subvector insert which naturally aligns to a vector
   // register. These can easily be handled using subregister manipulation.
-  // 2. If the subvector is smaller than a vector register, then the insertion
-  // must preserve the undisturbed elements of the register. We do this by
-  // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
-  // (which resolves to a subregister copy), performing a VSLIDEUP to place the
-  // subvector within the vector register, and an INSERT_SUBVECTOR of that
-  // LMUL=1 type back into the larger vector (resolving to another subregister
-  // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
-  // to avoid allocating a large register group to hold our subvector.
-  if (RemIdx == 0 && (!IsSubVecPartReg || Vec.isUndef()))
+  // 2. If the subvector isn't an exact multiple of a valid register group size,
+  // then the insertion must preserve the undisturbed elements of the register.
+  // We do this by lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1
+  // vector type (which resolves to a subregister copy), performing a VSLIDEUP
+  // to place the subvector within the vector register, and an INSERT_SUBVECTOR
+  // of that LMUL=1 type back into the larger vector (resolving to another
+  // subregister operation). See below for how our VSLIDEUP works. We go via a
+  // LMUL=1 type to avoid allocating a large register group to hold our
+  // subvector.
+  if (RemIdx.isZero() && (ExactlyVecRegSized || Vec.isUndef())) {
+    if (SubVecVT.isFixedLengthVector()) {
+      // We may get NoSubRegister if inserting at index 0 and the subvec
+      // container is the same as the vector, e.g. vec=v4i32,subvec=v4i32,idx=0
+      if (SubRegIdx == RISCV::NoSubRegister) {
+        assert(OrigIdx == 0);
+        return Op;
+      }
+
+      SDValue Insert =
+          DAG.getTargetInsertSubreg(SubRegIdx, DL, ContainerVecVT, Vec, SubVec);
+      if (VecVT.isFixedLengthVector())
+        Insert = convertFromScalableVector(VecVT, Insert, DAG, Subtarget);
+      return Insert;
+    }
     return Op;
+  }
 
   // VSLIDEUP works by leaving elements 0<i<OFFSET undisturbed, elements
   // OFFSET<=i<VL set to the "subvector" and vl<=i<VLMAX set to the tail policy
   // (in our case undisturbed). This means we can set up a subvector insertion
   // where OFFSET is the insertion offset, and the VL is the OFFSET plus the
   // size of the subvector.
-  MVT InterSubVT = VecVT;
+  MVT InterSubVT = ContainerVecVT;
   SDValue AlignedExtract = Vec;
-  unsigned AlignedIdx = OrigIdx - RemIdx;
-  if (VecVT.bitsGT(getLMUL1VT(VecVT))) {
-    InterSubVT = getLMUL1VT(VecVT);
+  unsigned AlignedIdx = OrigIdx - RemIdx.getKnownMinValue();
+  if (SubVecVT.isFixedLengthVector())
+    AlignedIdx /= *VLen / RISCV::RVVBitsPerBlock;
+  if (ContainerVecVT.bitsGT(getLMUL1VT(ContainerVecVT))) {
+    InterSubVT = getLMUL1VT(ContainerVecVT);
     // Extract a subvector equal to the nearest full vector register type. This
     // should resolve to a EXTRACT_SUBREG instruction.
     AlignedExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec,
@@ -9867,25 +9964,24 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                        DAG.getUNDEF(InterSubVT), SubVec,
                        DAG.getVectorIdxConstant(0, DL));
 
-  auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);
+  auto [Mask, VL] = getDefaultVLOps(VecVT, ContainerVecVT, DL, DAG, Subtarget);
 
-  ElementCount EndIndex =
-      ElementCount::getScalable(RemIdx) + SubVecVT.getVectorElementCount();
-  VL = computeVLMax(SubVecVT, DL, DAG);
+  ElementCount EndIndex = RemIdx + SubVecVT.getVectorElementCount();
+  VL = DAG.getElementCount(DL, XLenVT, SubVecVT.getVectorElementCount());
 
   // Use tail agnostic policy if we're inserting over InterSubVT's tail.
   unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
-  if (EndIndex == InterSubVT.getVectorElementCount())
+  if (Subtarget.expandVScale(EndIndex) ==
+      Subtarget.expandVScale(InterSubVT.getVectorElementCount()))
     Policy = RISCVII::TAIL_AGNOSTIC;
 
   // If we're inserting into the lowest elements, use a tail undisturbed
   // vmv.v.v.
-  if (RemIdx == 0) {
+  if (RemIdx.isZero()) {
     SubVec = DAG.getNode(RISCVISD::VMV_V_V_VL, DL, InterSubVT, AlignedExtract,
                          SubVec, VL);
   } else {
-    SDValue SlideupAmt =
-        DAG.getVScale(DL, XLenVT, APInt(XLenVT.getSizeInBits(), RemIdx));
+    SDValue SlideupAmt = DAG.getElementCount(DL, XLenVT, RemIdx);
 
     // Construct the vector length corresponding to RemIdx + length(SubVecVT).
     VL = DAG.getNode(ISD::ADD, DL, XLenVT, SlideupAmt, VL);
@@ -9896,10 +9992,13 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
 
   // If required, insert this subvector back into the correct vector register.
   // This should resolve to an INSERT_SUBREG instruction.
-  if (VecVT.bitsGT(InterSubVT))
-    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, Vec, SubVec,
+  if (ContainerVecVT.bitsGT(InterSubVT))
+    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVecVT, Vec, SubVec,
                          DAG.getVectorIdxConstant(AlignedIdx, DL));
 
+  if (VecVT.isFixedLengthVector())
+    SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget);
+
   // We might have bitcast from a mask type: cast back to the original type if
   // required.
   return DAG.getBitcast(Op.getSimpleValueType(), SubVec);
@@ -13416,6 +13515,12 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
     return SDValue();
   uint64_t MulAmt = CNode->getZExtValue();
 
+  // WARNING: The code below is knowingly incorrect with regards to undef semantics.
+  // We're adding additional uses of X here, and in principle, we should be freezing
+  // X before doing so.  However, adding freeze here causes real regressions, and no
+  // other target properly freezes X in these cases either.
+  SDValue X = N->getOperand(0);
+
   for (uint64_t Divisor : {3, 5, 9}) {
     if (MulAmt % Divisor != 0)
       continue;
@@ -13428,7 +13533,6 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
     // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
     if (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9) {
       SDLoc DL(N);
-      SDValue X = DAG.getFreeze(N->getOperand(0));
       SDValue Mul359 =
           DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
                       DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
@@ -13446,7 +13550,6 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
     if (ScaleShift >= 1 && ScaleShift < 4) {
       unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
       SDLoc DL(N);
-      SDValue X = DAG.getFreeze(N->getOperand(0));
       SDValue Shift1 =
           DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
       return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
@@ -13466,7 +13569,6 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
     unsigned TZ = llvm::countr_zero(C);
     if ((C >> TZ) == Divisor && (TZ == 1 || TZ == 2 || TZ == 3)) {
       SDLoc DL(N);
-      SDValue X = DAG.getFreeze(N->getOperand(0));
       SDValue Mul359 =
           DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
                       DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
@@ -13481,7 +13583,6 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
     if (ScaleShift >= 1 && ScaleShift < 4) {
       unsigned ShiftAmt = Log2_64(((MulAmt - 1) & (MulAmt - 2)));
       SDLoc DL(N);
-      SDValue X = DAG.getFreeze(N->getOperand(0));
       SDValue Shift1 =
           DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
       return DAG.getNode(ISD::ADD, DL, VT, Shift1,
@@ -13495,11 +13596,11 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
     if (isPowerOf2_64(MulAmt + Offset)) {
       SDLoc DL(N);
       SDValue Shift1 =
-          DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+          DAG.getNode(ISD::SHL, DL, VT, X,
                       DAG.getConstant(Log2_64(MulAmt + Offset), DL, VT));
-      SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, N->getOperand(0),
+      SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
                                    DAG.getConstant(Log2_64(Offset - 1), DL, VT),
-                                   N->getOperand(0));
+                                   X);
       return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359);
     }
   }
@@ -15320,7 +15421,7 @@ static SDValue tryFoldSelectIntoOp(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
   SDValue OtherOp = TrueVal.getOperand(1 - OpToFold);
-  EVT OtherOpVT = OtherOp->getValueType(0);
+  EVT OtherOpVT = OtherOp.getValueType();
   SDValue IdentityOperand =
       DAG.getNeutralElement(Opc, DL, OtherOpVT, N->getFlags());
   if (!Commutative)
@@ -16162,23 +16263,39 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SELECT:
     return performSELECTCombine(N, DAG, Subtarget);
   case RISCVISD::CZERO_EQZ:
-  case RISCVISD::CZERO_NEZ:
-    // czero_eq X, (xor Y, 1) -> czero_ne X, Y if Y is 0 or 1.
-    // czero_ne X, (xor Y, 1) -> czero_eq X, Y if Y is 0 or 1.
-    if (N->getOperand(1).getOpcode() == ISD::XOR &&
-        isOneConstant(N->getOperand(1).getOperand(1))) {
-      SDValue Cond = N->getOperand(1).getOperand(0);
-      APInt Mask = APInt::getBitsSetFrom(Cond.getValueSizeInBits(), 1);
-      if (DAG.MaskedValueIsZero(Cond, Mask)) {
-        unsigned NewOpc = N->getOpcode() == RISCVISD::CZERO_EQZ
-                              ? RISCVISD::CZERO_NEZ
-                              : RISCVISD::CZERO_EQZ;
-        return DAG.getNode(NewOpc, SDLoc(N), N->getValueType(0),
-                           N->getOperand(0), Cond);
-      }
+  case RISCVISD::CZERO_NEZ: {
+    SDValue Val = N->getOperand(0);
+    SDValue Cond = N->getOperand(1);
+
+    unsigned Opc = N->getOpcode();
+
+    // czero_eqz x, x -> x
+    if (Opc == RISCVISD::CZERO_EQZ && Val == Cond)
+      return Val;
+
+    unsigned InvOpc =
+        Opc == RISCVISD::CZERO_EQZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ;
+
+    // czero_eqz X, (xor Y, 1) -> czero_nez X, Y if Y is 0 or 1.
+    // czero_nez X, (xor Y, 1) -> czero_eqz X, Y if Y is 0 or 1.
+    if (Cond.getOpcode() == ISD::XOR && isOneConstant(Cond.getOperand(1))) {
+      SDValue NewCond = Cond.getOperand(0);
+      APInt Mask = APInt::getBitsSetFrom(NewCond.getValueSizeInBits(), 1);
+      if (DAG.MaskedValueIsZero(NewCond, Mask))
+        return DAG.getNode(InvOpc, SDLoc(N), N->getValueType(0), Val, NewCond);
+    }
+    // czero_eqz x, (setcc y, 0, ne) -> czero_eqz x, y
+    // czero_nez x, (setcc y, 0, ne) -> czero_nez x, y
+    // czero_eqz x, (setcc y, 0, eq) -> czero_nez x, y
+    // czero_nez x, (setcc y, 0, eq) -> czero_eqz x, y
+    if (Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
+      ISD::CondCode CCVal = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+      if (ISD::isIntEqualitySetCC(CCVal))
+        return DAG.getNode(CCVal == ISD::SETNE ? Opc : InvOpc, SDLoc(N),
+                           N->getValueType(0), Val, Cond.getOperand(0));
     }
     return SDValue();
-
+  }
   case RISCVISD::SELECT_CC: {
     // Transform
     SDValue LHS = N->getOperand(0);
@@ -16770,6 +16887,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     SDValue Scalar = N->getOperand(1);
     SDValue VL = N->getOperand(2);
 
+    if (Scalar.getOpcode() == RISCVISD::VMV_X_S && Passthru.isUndef() &&
+        Scalar.getOperand(0).getValueType() == N->getValueType(0))
+      return Scalar.getOperand(0);
+
     // Use M1 or smaller to avoid over constraining register allocation
     const MVT M1VT = getLMUL1VT(VT);
     if (M1VT.bitsLT(VT)) {
@@ -21002,7 +21123,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
   IRBuilder<> Builder(LI);
 
   // Only deinterleave2 supported at present.
-  if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+  if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
     return false;
 
   unsigned Factor = 2;
@@ -21052,7 +21173,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
   IRBuilder<> Builder(SI);
 
   // Only interleave2 supported at present.
-  if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+  if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
     return false;
 
   unsigned Factor = 2;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index ed14fd453943..78f99e70c083 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -961,6 +961,7 @@ private:
   SDValue lowerVPFPIntConvOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVPStridedLoad(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVPStridedStore(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVPCttzElements(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorExtendToRVV(SDValue Op, SelectionDAG &DAG,
                                             unsigned ExtendOpc) const;
   SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index ec1a9f4c135c..216dc7808520 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -45,10 +45,6 @@ static cl::opt<bool> DisableInsertVSETVLPHIOpt(
     "riscv-disable-insert-vsetvl-phi-opt", cl::init(false), cl::Hidden,
     cl::desc("Disable looking through phis when inserting vsetvlis."));
 
-static cl::opt<bool> UseStrictAsserts(
-    "riscv-insert-vsetvl-strict-asserts", cl::init(true), cl::Hidden,
-    cl::desc("Enable strict assertion checking for the dataflow algorithm"));
-
 namespace {
 
 static unsigned getVLOpNum(const MachineInstr &MI) {
@@ -156,7 +152,7 @@ static std::optional<unsigned> getEEWForLoadStore(const MachineInstr &MI) {
   }
 }
 
-static bool isNonZeroLoadImmediate(MachineInstr &MI) {
+static bool isNonZeroLoadImmediate(const MachineInstr &MI) {
   return MI.getOpcode() == RISCV::ADDI &&
     MI.getOperand(1).isReg() && MI.getOperand(2).isImm() &&
     MI.getOperand(1).getReg() == RISCV::X0 &&
@@ -262,6 +258,17 @@ struct DemandedFields {
     VLZeroness = true;
   }
 
+  // Make this the result of demanding both the fields in this and B.
+  void doUnion(const DemandedFields &B) {
+    VLAny |= B.VLAny;
+    VLZeroness |= B.VLZeroness;
+    SEW = std::max(SEW, B.SEW);
+    LMUL |= B.LMUL;
+    SEWLMULRatio |= B.SEWLMULRatio;
+    TailPolicy |= B.TailPolicy;
+    MaskPolicy |= B.MaskPolicy;
+  }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Support for debugging, callable in GDB: V->dump()
   LLVM_DUMP_METHOD void dump() const {
@@ -443,8 +450,12 @@ DemandedFields getDemanded(const MachineInstr &MI,
 /// Defines the abstract state with which the forward dataflow models the
 /// values of the VL and VTYPE registers after insertion.
 class VSETVLIInfo {
+  struct AVLDef {
+    const MachineInstr *DefMI;
+    Register DefReg;
+  };
   union {
-    Register AVLReg;
+    AVLDef AVLRegDef;
     unsigned AVLImm;
   };
 
@@ -452,6 +463,8 @@ class VSETVLIInfo {
     Uninitialized,
     AVLIsReg,
     AVLIsImm,
+    AVLIsVLMAX,
+    AVLIsIgnored,
     Unknown,
   } State = Uninitialized;
 
@@ -477,9 +490,10 @@ public:
   void setUnknown() { State = Unknown; }
   bool isUnknown() const { return State == Unknown; }
 
-  void setAVLReg(Register Reg) {
-    assert(Reg.isVirtual() || Reg == RISCV::X0 || Reg == RISCV::NoRegister);
-    AVLReg = Reg;
+  void setAVLRegDef(const MachineInstr *DefMI, Register AVLReg) {
+    assert(DefMI && AVLReg.isVirtual());
+    AVLRegDef.DefMI = DefMI;
+    AVLRegDef.DefReg = AVLReg;
     State = AVLIsReg;
   }
 
@@ -488,23 +502,37 @@ public:
     State = AVLIsImm;
   }
 
+  void setAVLVLMAX() { State = AVLIsVLMAX; }
+
+  void setAVLIgnored() { State = AVLIsIgnored; }
+
   bool hasAVLImm() const { return State == AVLIsImm; }
   bool hasAVLReg() const { return State == AVLIsReg; }
+  bool hasAVLVLMAX() const { return State == AVLIsVLMAX; }
+  bool hasAVLIgnored() const { return State == AVLIsIgnored; }
   Register getAVLReg() const {
-    assert(hasAVLReg());
-    return AVLReg;
+    assert(hasAVLReg() && AVLRegDef.DefReg.isVirtual());
+    return AVLRegDef.DefReg;
   }
   unsigned getAVLImm() const {
     assert(hasAVLImm());
     return AVLImm;
   }
+  const MachineInstr &getAVLDefMI() const {
+    assert(hasAVLReg() && AVLRegDef.DefMI);
+    return *AVLRegDef.DefMI;
+  }
 
   void setAVL(VSETVLIInfo Info) {
     assert(Info.isValid());
     if (Info.isUnknown())
       setUnknown();
     else if (Info.hasAVLReg())
-      setAVLReg(Info.getAVLReg());
+      setAVLRegDef(&Info.getAVLDefMI(), Info.getAVLReg());
+    else if (Info.hasAVLVLMAX())
+      setAVLVLMAX();
+    else if (Info.hasAVLIgnored())
+      setAVLIgnored();
     else {
       assert(Info.hasAVLImm());
       setAVLImm(Info.getAVLImm());
@@ -516,34 +544,38 @@ public:
   bool getTailAgnostic() const { return TailAgnostic; }
   bool getMaskAgnostic() const { return MaskAgnostic; }
 
-  bool hasNonZeroAVL(const MachineRegisterInfo &MRI) const {
+  bool hasNonZeroAVL() const {
     if (hasAVLImm())
       return getAVLImm() > 0;
-    if (hasAVLReg()) {
-      if (getAVLReg() == RISCV::X0)
-        return true;
-      if (MachineInstr *MI = MRI.getVRegDef(getAVLReg());
-          MI && isNonZeroLoadImmediate(*MI))
-        return true;
+    if (hasAVLReg())
+      return isNonZeroLoadImmediate(getAVLDefMI());
+    if (hasAVLVLMAX())
+      return true;
+    if (hasAVLIgnored())
       return false;
-    }
     return false;
   }
 
-  bool hasEquallyZeroAVL(const VSETVLIInfo &Other,
-                         const MachineRegisterInfo &MRI) const {
+  bool hasEquallyZeroAVL(const VSETVLIInfo &Other) const {
     if (hasSameAVL(Other))
       return true;
-    return (hasNonZeroAVL(MRI) && Other.hasNonZeroAVL(MRI));
+    return (hasNonZeroAVL() && Other.hasNonZeroAVL());
   }
 
   bool hasSameAVL(const VSETVLIInfo &Other) const {
     if (hasAVLReg() && Other.hasAVLReg())
-      return getAVLReg() == Other.getAVLReg();
+      return AVLRegDef.DefMI == Other.AVLRegDef.DefMI &&
+             AVLRegDef.DefReg == Other.AVLRegDef.DefReg;
 
     if (hasAVLImm() && Other.hasAVLImm())
       return getAVLImm() == Other.getAVLImm();
 
+    if (hasAVLVLMAX())
+      return Other.hasAVLVLMAX() && hasSameVLMAX(Other);
+
+    if (hasAVLIgnored())
+      return Other.hasAVLIgnored();
+
     return false;
   }
 
@@ -629,7 +661,7 @@ public:
     if (Used.VLAny && !(hasSameAVL(Require) && hasSameVLMAX(Require)))
       return false;
 
-    if (Used.VLZeroness && !hasEquallyZeroAVL(Require, MRI))
+    if (Used.VLZeroness && !hasEquallyZeroAVL(Require))
       return false;
 
     return hasCompatibleVTYPE(Used, Require);
@@ -714,9 +746,13 @@ public:
     if (isUnknown())
       OS << "unknown";
     if (hasAVLReg())
-      OS << "AVLReg=" << (unsigned)AVLReg;
+      OS << "AVLReg=" << (unsigned)getAVLReg();
     if (hasAVLImm())
       OS << "AVLImm=" << (unsigned)AVLImm;
+    if (hasAVLVLMAX())
+      OS << "AVLVLMAX";
+    if (hasAVLIgnored())
+      OS << "AVLIgnored";
     OS << ", "
        << "VLMul=" << (unsigned)VLMul << ", "
        << "SEW=" << (unsigned)SEW << ", "
@@ -836,7 +872,8 @@ INITIALIZE_PASS(RISCVCoalesceVSETVLI, "riscv-coalesce-vsetvli",
 
 // Return a VSETVLIInfo representing the changes made by this VSETVLI or
 // VSETIVLI instruction.
-static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) {
+static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI,
+                                     const MachineRegisterInfo &MRI) {
   VSETVLIInfo NewInfo;
   if (MI.getOpcode() == RISCV::PseudoVSETIVLI) {
     NewInfo.setAVLImm(MI.getOperand(1).getImm());
@@ -846,7 +883,10 @@ static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) {
     Register AVLReg = MI.getOperand(1).getReg();
     assert((AVLReg != RISCV::X0 || MI.getOperand(0).getReg() != RISCV::X0) &&
            "Can't handle X0, X0 vsetvli yet");
-    NewInfo.setAVLReg(AVLReg);
+    if (AVLReg == RISCV::X0)
+      NewInfo.setAVLVLMAX();
+    else
+      NewInfo.setAVLRegDef(MRI.getVRegDef(AVLReg), AVLReg);
   }
   NewInfo.setVTYPE(MI.getOperand(2).getImm());
 
@@ -913,16 +953,19 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
         if (ST.getRealMinVLen() == ST.getRealMaxVLen() && VLMAX <= 31)
           InstrInfo.setAVLImm(VLMAX);
         else
-          InstrInfo.setAVLReg(RISCV::X0);
+          InstrInfo.setAVLVLMAX();
       }
       else
         InstrInfo.setAVLImm(Imm);
     } else {
-      InstrInfo.setAVLReg(VLOp.getReg());
+      InstrInfo.setAVLRegDef(MRI->getVRegDef(VLOp.getReg()), VLOp.getReg());
     }
   } else {
     assert(isScalarExtractInstr(MI));
-    InstrInfo.setAVLReg(RISCV::NoRegister);
+    // TODO: If we are more clever about x0,x0 insertion then we should be able
+    // to deduce that the VL is ignored based off of DemandedFields, and remove
+    // the AVLIsIgnored state. Then we can just use an arbitrary immediate AVL.
+    InstrInfo.setAVLIgnored();
   }
 #ifndef NDEBUG
   if (std::optional<unsigned> EEW = getEEWForLoadStore(MI)) {
@@ -935,14 +978,13 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
   // AVL operand with the AVL of the defining vsetvli.  We avoid general
   // register AVLs to avoid extending live ranges without being sure we can
   // kill the original source reg entirely.
-  if (InstrInfo.hasAVLReg() && InstrInfo.getAVLReg().isVirtual()) {
-    MachineInstr *DefMI = MRI->getVRegDef(InstrInfo.getAVLReg());
-    if (DefMI && isVectorConfigInstr(*DefMI)) {
-      VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(*DefMI);
+  if (InstrInfo.hasAVLReg()) {
+    const MachineInstr &DefMI = InstrInfo.getAVLDefMI();
+    if (isVectorConfigInstr(DefMI)) {
+      VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(DefMI, *MRI);
       if (DefInstrInfo.hasSameVLMAX(InstrInfo) &&
-          (DefInstrInfo.hasAVLImm() || DefInstrInfo.getAVLReg() == RISCV::X0)) {
+          (DefInstrInfo.hasAVLImm() || DefInstrInfo.hasAVLVLMAX()))
         InstrInfo.setAVL(DefInstrInfo);
-      }
     }
   }
 
@@ -976,19 +1018,17 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
     // If our AVL is a virtual register, it might be defined by a VSET(I)VLI. If
     // it has the same VLMAX we want and the last VL/VTYPE we observed is the
     // same, we can use the X0, X0 form.
-    if (Info.hasSameVLMAX(PrevInfo) && Info.hasAVLReg() &&
-        Info.getAVLReg().isVirtual()) {
-      if (MachineInstr *DefMI = MRI->getVRegDef(Info.getAVLReg())) {
-        if (isVectorConfigInstr(*DefMI)) {
-          VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
-          if (DefInfo.hasSameAVL(PrevInfo) && DefInfo.hasSameVLMAX(PrevInfo)) {
-            BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
-                .addReg(RISCV::X0, RegState::Define | RegState::Dead)
-                .addReg(RISCV::X0, RegState::Kill)
-                .addImm(Info.encodeVTYPE())
-                .addReg(RISCV::VL, RegState::Implicit);
-            return;
-          }
+    if (Info.hasSameVLMAX(PrevInfo) && Info.hasAVLReg()) {
+      const MachineInstr &DefMI = Info.getAVLDefMI();
+      if (isVectorConfigInstr(DefMI)) {
+        VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI);
+        if (DefInfo.hasSameAVL(PrevInfo) && DefInfo.hasSameVLMAX(PrevInfo)) {
+          BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
+              .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+              .addReg(RISCV::X0, RegState::Kill)
+              .addImm(Info.encodeVTYPE())
+              .addReg(RISCV::VL, RegState::Implicit);
+          return;
         }
       }
     }
@@ -1002,8 +1042,7 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
     return;
   }
 
-  Register AVLReg = Info.getAVLReg();
-  if (AVLReg == RISCV::NoRegister) {
+  if (Info.hasAVLIgnored()) {
     // We can only use x0, x0 if there's no chance of the vtype change causing
     // the previous vl to become invalid.
     if (PrevInfo.isValid() && !PrevInfo.isUnknown() &&
@@ -1023,20 +1062,19 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
     return;
   }
 
-  if (AVLReg.isVirtual())
-    MRI->constrainRegClass(AVLReg, &RISCV::GPRNoX0RegClass);
-
-  // Use X0 as the DestReg unless AVLReg is X0. We also need to change the
-  // opcode if the AVLReg is X0 as they have different register classes for
-  // the AVL operand.
-  Register DestReg = RISCV::X0;
-  unsigned Opcode = RISCV::PseudoVSETVLI;
-  if (AVLReg == RISCV::X0) {
-    DestReg = MRI->createVirtualRegister(&RISCV::GPRRegClass);
-    Opcode = RISCV::PseudoVSETVLIX0;
+  if (Info.hasAVLVLMAX()) {
+    Register DestReg = MRI->createVirtualRegister(&RISCV::GPRRegClass);
+    BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
+        .addReg(DestReg, RegState::Define | RegState::Dead)
+        .addReg(RISCV::X0, RegState::Kill)
+        .addImm(Info.encodeVTYPE());
+    return;
   }
-  BuildMI(MBB, InsertPt, DL, TII->get(Opcode))
-      .addReg(DestReg, RegState::Define | RegState::Dead)
+
+  Register AVLReg = Info.getAVLReg();
+  MRI->constrainRegClass(AVLReg, &RISCV::GPRNoX0RegClass);
+  BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLI))
+      .addReg(RISCV::X0, RegState::Define | RegState::Dead)
       .addReg(AVLReg)
       .addImm(Info.encodeVTYPE());
 }
@@ -1098,14 +1136,12 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
   // it might be defined by a VSET(I)VLI. If it has the same VLMAX we need
   // and the last VL/VTYPE we observed is the same, we don't need a
   // VSETVLI here.
-  if (Require.hasAVLReg() && Require.getAVLReg().isVirtual() &&
-      CurInfo.hasCompatibleVTYPE(Used, Require)) {
-    if (MachineInstr *DefMI = MRI->getVRegDef(Require.getAVLReg())) {
-      if (isVectorConfigInstr(*DefMI)) {
-        VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
-        if (DefInfo.hasSameAVL(CurInfo) && DefInfo.hasSameVLMAX(CurInfo))
-          return false;
-      }
+  if (Require.hasAVLReg() && CurInfo.hasCompatibleVTYPE(Used, Require)) {
+    const MachineInstr &DefMI = Require.getAVLDefMI();
+    if (isVectorConfigInstr(DefMI)) {
+      VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI);
+      if (DefInfo.hasSameAVL(CurInfo) && DefInfo.hasSameVLMAX(CurInfo))
+        return false;
     }
   }
 
@@ -1158,7 +1194,7 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
   // variant, so we avoid the transform to prevent extending live range of an
   // avl register operand.
   // TODO: We can probably relax this for immediates.
-  bool EquallyZero = IncomingInfo.hasEquallyZeroAVL(PrevInfo, *MRI) &&
+  bool EquallyZero = IncomingInfo.hasEquallyZeroAVL(PrevInfo) &&
                      IncomingInfo.hasSameVLMAX(PrevInfo);
   if (Demanded.VLAny || (Demanded.VLZeroness && !EquallyZero))
     Info.setAVL(IncomingInfo);
@@ -1189,13 +1225,14 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
 void RISCVInsertVSETVLI::transferAfter(VSETVLIInfo &Info,
                                        const MachineInstr &MI) const {
   if (isVectorConfigInstr(MI)) {
-    Info = getInfoForVSETVLI(MI);
+    Info = getInfoForVSETVLI(MI, *MRI);
     return;
   }
 
   if (RISCV::isFaultFirstLoad(MI)) {
     // Update AVL to vl-output of the fault first load.
-    Info.setAVLReg(MI.getOperand(1).getReg());
+    Info.setAVLRegDef(MRI->getVRegDef(MI.getOperand(1).getReg()),
+                      MI.getOperand(1).getReg());
     return;
   }
 
@@ -1289,24 +1326,16 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
   if (!Require.hasAVLReg())
     return true;
 
-  Register AVLReg = Require.getAVLReg();
-  if (!AVLReg.isVirtual())
-    return true;
-
   // We need the AVL to be produce by a PHI node in this basic block.
-  MachineInstr *PHI = MRI->getVRegDef(AVLReg);
-  if (!PHI || PHI->getOpcode() != RISCV::PHI || PHI->getParent() != &MBB)
+  const MachineInstr *PHI = &Require.getAVLDefMI();
+  if (PHI->getOpcode() != RISCV::PHI || PHI->getParent() != &MBB)
     return true;
 
   for (unsigned PHIOp = 1, NumOps = PHI->getNumOperands(); PHIOp != NumOps;
        PHIOp += 2) {
     Register InReg = PHI->getOperand(PHIOp).getReg();
     MachineBasicBlock *PBB = PHI->getOperand(PHIOp + 1).getMBB();
-    const BlockData &PBBInfo = BlockInfo[PBB->getNumber()];
-    // If the exit from the predecessor has the VTYPE we are looking for
-    // we might be able to avoid a VSETVLI.
-    if (PBBInfo.Exit.isUnknown() || !PBBInfo.Exit.hasSameVTYPE(Require))
-      return true;
+    const VSETVLIInfo &PBBExit = BlockInfo[PBB->getNumber()].Exit;
 
     // We need the PHI input to the be the output of a VSET(I)VLI.
     MachineInstr *DefMI = MRI->getVRegDef(InReg);
@@ -1315,9 +1344,14 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
 
     // We found a VSET(I)VLI make sure it matches the output of the
     // predecessor block.
-    VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
-    if (!DefInfo.hasSameAVL(PBBInfo.Exit) ||
-        !DefInfo.hasSameVTYPE(PBBInfo.Exit))
+    VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI, *MRI);
+    if (DefInfo != PBBExit)
+      return true;
+
+    // Require has the same VL as PBBExit, so if the exit from the
+    // predecessor has the VTYPE we are looking for we might be able
+    // to avoid a VSETVLI.
+    if (PBBExit.isUnknown() || !PBBExit.hasSameVTYPE(Require))
       return true;
   }
 
@@ -1394,32 +1428,14 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
     transferAfter(CurInfo, MI);
   }
 
-  // If we reach the end of the block and our current info doesn't match the
-  // expected info, insert a vsetvli to correct.
-  if (!UseStrictAsserts) {
-    const VSETVLIInfo &ExitInfo = BlockInfo[MBB.getNumber()].Exit;
-    if (CurInfo.isValid() && ExitInfo.isValid() && !ExitInfo.isUnknown() &&
-        CurInfo != ExitInfo) {
-      // Note there's an implicit assumption here that terminators never use
-      // or modify VL or VTYPE.  Also, fallthrough will return end().
-      auto InsertPt = MBB.getFirstInstrTerminator();
-      insertVSETVLI(MBB, InsertPt, MBB.findDebugLoc(InsertPt), ExitInfo,
-                    CurInfo);
-      CurInfo = ExitInfo;
-    }
-  }
-
-  if (UseStrictAsserts && CurInfo.isValid()) {
-    const auto &Info = BlockInfo[MBB.getNumber()];
-    if (CurInfo != Info.Exit) {
-      LLVM_DEBUG(dbgs() << "in block " << printMBBReference(MBB) << "\n");
-      LLVM_DEBUG(dbgs() << "  begin        state: " << Info.Pred << "\n");
-      LLVM_DEBUG(dbgs() << "  expected end state: " << Info.Exit << "\n");
-      LLVM_DEBUG(dbgs() << "  actual   end state: " << CurInfo << "\n");
-    }
-    assert(CurInfo == Info.Exit &&
-           "InsertVSETVLI dataflow invariant violated");
+  const auto &Info = BlockInfo[MBB.getNumber()];
+  if (CurInfo != Info.Exit) {
+    LLVM_DEBUG(dbgs() << "in block " << printMBBReference(MBB) << "\n");
+    LLVM_DEBUG(dbgs() << "  begin        state: " << Info.Pred << "\n");
+    LLVM_DEBUG(dbgs() << "  expected end state: " << Info.Exit << "\n");
+    LLVM_DEBUG(dbgs() << "  actual   end state: " << CurInfo << "\n");
   }
+  assert(CurInfo == Info.Exit && "InsertVSETVLI dataflow invariant violated");
 }
 
 /// Perform simple partial redundancy elimination of the VSETVLI instructions
@@ -1463,10 +1479,8 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
   // If the AVL value is a register (other than our VLMAX sentinel),
   // we need to prove the value is available at the point we're going
   // to insert the vsetvli at.
-  if (AvailableInfo.hasAVLReg() && RISCV::X0 != AvailableInfo.getAVLReg()) {
-    MachineInstr *AVLDefMI = MRI->getVRegDef(AvailableInfo.getAVLReg());
-    if (!AVLDefMI)
-      return;
+  if (AvailableInfo.hasAVLReg()) {
+    const MachineInstr *AVLDefMI = &AvailableInfo.getAVLDefMI();
     // This is an inline dominance check which covers the case of
     // UnavailablePred being the preheader of a loop.
     if (AVLDefMI->getParent() != UnavailablePred)
@@ -1476,6 +1490,11 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
         return;
   }
 
+  // If the AVL isn't used in its predecessors then bail, since we have no AVL
+  // to insert a vsetvli with.
+  if (AvailableInfo.hasAVLIgnored())
+    return;
+
   // Model the effect of changing the input state of the block MBB to
   // AvailableInfo.  We're looking for two issues here; one legality,
   // one profitability.
@@ -1524,16 +1543,6 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
                 AvailableInfo, OldExit);
 }
 
-static void doUnion(DemandedFields &A, DemandedFields B) {
-  A.VLAny |= B.VLAny;
-  A.VLZeroness |= B.VLZeroness;
-  A.SEW = std::max(A.SEW, B.SEW);
-  A.LMUL |= B.LMUL;
-  A.SEWLMULRatio |= B.SEWLMULRatio;
-  A.TailPolicy |= B.TailPolicy;
-  A.MaskPolicy |= B.MaskPolicy;
-}
-
 // Return true if we can mutate PrevMI to match MI without changing any the
 // fields which would be observed.
 static bool canMutatePriorConfig(const MachineInstr &PrevMI,
@@ -1550,8 +1559,8 @@ static bool canMutatePriorConfig(const MachineInstr &PrevMI,
     if (Used.VLZeroness) {
       if (isVLPreservingConfig(PrevMI))
         return false;
-      if (!getInfoForVSETVLI(PrevMI).hasEquallyZeroAVL(getInfoForVSETVLI(MI),
-                                                       MRI))
+      if (!getInfoForVSETVLI(PrevMI, MRI)
+               .hasEquallyZeroAVL(getInfoForVSETVLI(MI, MRI)))
         return false;
     }
 
@@ -1583,7 +1592,7 @@ bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
   for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
 
     if (!isVectorConfigInstr(MI)) {
-      doUnion(Used, getDemanded(MI, MRI, ST));
+      Used.doUnion(getDemanded(MI, MRI, ST));
       if (MI.isCall() || MI.isInlineAsm() ||
           MI.modifiesRegister(RISCV::VL, /*TRI=*/nullptr) ||
           MI.modifiesRegister(RISCV::VTYPE, /*TRI=*/nullptr))
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index dac47d6f4154..8cb9a40a98bc 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1633,8 +1633,230 @@ static bool isFMUL(unsigned Opc) {
   }
 }
 
+bool RISCVInstrInfo::isVectorAssociativeAndCommutative(const MachineInstr &Inst,
+                                                       bool Invert) const {
+#define OPCODE_LMUL_CASE(OPC)                                                  \
+  case RISCV::OPC##_M1:                                                        \
+  case RISCV::OPC##_M2:                                                        \
+  case RISCV::OPC##_M4:                                                        \
+  case RISCV::OPC##_M8:                                                        \
+  case RISCV::OPC##_MF2:                                                       \
+  case RISCV::OPC##_MF4:                                                       \
+  case RISCV::OPC##_MF8
+
+#define OPCODE_LMUL_MASK_CASE(OPC)                                             \
+  case RISCV::OPC##_M1_MASK:                                                   \
+  case RISCV::OPC##_M2_MASK:                                                   \
+  case RISCV::OPC##_M4_MASK:                                                   \
+  case RISCV::OPC##_M8_MASK:                                                   \
+  case RISCV::OPC##_MF2_MASK:                                                  \
+  case RISCV::OPC##_MF4_MASK:                                                  \
+  case RISCV::OPC##_MF8_MASK
+
+  unsigned Opcode = Inst.getOpcode();
+  if (Invert) {
+    if (auto InvOpcode = getInverseOpcode(Opcode))
+      Opcode = *InvOpcode;
+    else
+      return false;
+  }
+
+  // clang-format off
+  switch (Opcode) {
+  default:
+    return false;
+  OPCODE_LMUL_CASE(PseudoVADD_VV):
+  OPCODE_LMUL_MASK_CASE(PseudoVADD_VV):
+  OPCODE_LMUL_CASE(PseudoVMUL_VV):
+  OPCODE_LMUL_MASK_CASE(PseudoVMUL_VV):
+    return true;
+  }
+  // clang-format on
+
+#undef OPCODE_LMUL_MASK_CASE
+#undef OPCODE_LMUL_CASE
+}
+
+bool RISCVInstrInfo::areRVVInstsReassociable(const MachineInstr &Root,
+                                             const MachineInstr &Prev) const {
+  if (!areOpcodesEqualOrInverse(Root.getOpcode(), Prev.getOpcode()))
+    return false;
+
+  assert(Root.getMF() == Prev.getMF());
+  const MachineRegisterInfo *MRI = &Root.getMF()->getRegInfo();
+  const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+
+  // Make sure vtype operands are also the same.
+  const MCInstrDesc &Desc = get(Root.getOpcode());
+  const uint64_t TSFlags = Desc.TSFlags;
+
+  auto checkImmOperand = [&](unsigned OpIdx) {
+    return Root.getOperand(OpIdx).getImm() == Prev.getOperand(OpIdx).getImm();
+  };
+
+  auto checkRegOperand = [&](unsigned OpIdx) {
+    return Root.getOperand(OpIdx).getReg() == Prev.getOperand(OpIdx).getReg();
+  };
+
+  // PassThru
+  // TODO: Potentially we can loosen the condition to consider Root to be
+  // associable with Prev if Root has NoReg as passthru. In which case we
+  // also need to loosen the condition on vector policies between these.
+  if (!checkRegOperand(1))
+    return false;
+
+  // SEW
+  if (RISCVII::hasSEWOp(TSFlags) &&
+      !checkImmOperand(RISCVII::getSEWOpNum(Desc)))
+    return false;
+
+  // Mask
+  if (RISCVII::usesMaskPolicy(TSFlags)) {
+    const MachineBasicBlock *MBB = Root.getParent();
+    const MachineBasicBlock::const_reverse_iterator It1(&Root);
+    const MachineBasicBlock::const_reverse_iterator It2(&Prev);
+    Register MI1VReg;
+
+    bool SeenMI2 = false;
+    for (auto End = MBB->rend(), It = It1; It != End; ++It) {
+      if (It == It2) {
+        SeenMI2 = true;
+        if (!MI1VReg.isValid())
+          // There is no V0 def between Root and Prev; they're sharing the
+          // same V0.
+          break;
+      }
+
+      if (It->modifiesRegister(RISCV::V0, TRI)) {
+        Register SrcReg = It->getOperand(1).getReg();
+        // If it's not VReg it'll be more difficult to track its defs, so
+        // bailing out here just to be safe.
+        if (!SrcReg.isVirtual())
+          return false;
+
+        if (!MI1VReg.isValid()) {
+          // This is the V0 def for Root.
+          MI1VReg = SrcReg;
+          continue;
+        }
+
+        // Some random mask updates.
+        if (!SeenMI2)
+          continue;
+
+        // This is the V0 def for Prev; check if it's the same as that of
+        // Root.
+        if (MI1VReg != SrcReg)
+          return false;
+        else
+          break;
+      }
+    }
+
+    // If we haven't encountered Prev, it's likely that this function was
+    // called in a wrong way (e.g. Root is before Prev).
+    assert(SeenMI2 && "Prev is expected to appear before Root");
+  }
+
+  // Tail / Mask policies
+  if (RISCVII::hasVecPolicyOp(TSFlags) &&
+      !checkImmOperand(RISCVII::getVecPolicyOpNum(Desc)))
+    return false;
+
+  // VL
+  if (RISCVII::hasVLOp(TSFlags)) {
+    unsigned OpIdx = RISCVII::getVLOpNum(Desc);
+    const MachineOperand &Op1 = Root.getOperand(OpIdx);
+    const MachineOperand &Op2 = Prev.getOperand(OpIdx);
+    if (Op1.getType() != Op2.getType())
+      return false;
+    switch (Op1.getType()) {
+    case MachineOperand::MO_Register:
+      if (Op1.getReg() != Op2.getReg())
+        return false;
+      break;
+    case MachineOperand::MO_Immediate:
+      if (Op1.getImm() != Op2.getImm())
+        return false;
+      break;
+    default:
+      llvm_unreachable("Unrecognized VL operand type");
+    }
+  }
+
+  // Rounding modes
+  if (RISCVII::hasRoundModeOp(TSFlags) &&
+      !checkImmOperand(RISCVII::getVLOpNum(Desc) - 1))
+    return false;
+
+  return true;
+}
+
+// Most of our RVV pseudos have passthru operand, so the real operands
+// start from index = 2.
+bool RISCVInstrInfo::hasReassociableVectorSibling(const MachineInstr &Inst,
+                                                  bool &Commuted) const {
+  const MachineBasicBlock *MBB = Inst.getParent();
+  const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  assert(RISCVII::isFirstDefTiedToFirstUse(get(Inst.getOpcode())) &&
+         "Expect the present of passthrough operand.");
+  MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg());
+  MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(3).getReg());
+
+  // If only one operand has the same or inverse opcode and it's the second
+  // source operand, the operands must be commuted.
+  Commuted = !areRVVInstsReassociable(Inst, *MI1) &&
+             areRVVInstsReassociable(Inst, *MI2);
+  if (Commuted)
+    std::swap(MI1, MI2);
+
+  return areRVVInstsReassociable(Inst, *MI1) &&
+         (isVectorAssociativeAndCommutative(*MI1) ||
+          isVectorAssociativeAndCommutative(*MI1, /* Invert */ true)) &&
+         hasReassociableOperands(*MI1, MBB) &&
+         MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg());
+}
+
+bool RISCVInstrInfo::hasReassociableOperands(
+    const MachineInstr &Inst, const MachineBasicBlock *MBB) const {
+  if (!isVectorAssociativeAndCommutative(Inst) &&
+      !isVectorAssociativeAndCommutative(Inst, /*Invert=*/true))
+    return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
+
+  const MachineOperand &Op1 = Inst.getOperand(2);
+  const MachineOperand &Op2 = Inst.getOperand(3);
+  const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+  // We need virtual register definitions for the operands that we will
+  // reassociate.
+  MachineInstr *MI1 = nullptr;
+  MachineInstr *MI2 = nullptr;
+  if (Op1.isReg() && Op1.getReg().isVirtual())
+    MI1 = MRI.getUniqueVRegDef(Op1.getReg());
+  if (Op2.isReg() && Op2.getReg().isVirtual())
+    MI2 = MRI.getUniqueVRegDef(Op2.getReg());
+
+  // And at least one operand must be defined in MBB.
+  return MI1 && MI2 && (MI1->getParent() == MBB || MI2->getParent() == MBB);
+}
+
+void RISCVInstrInfo::getReassociateOperandIndices(
+    const MachineInstr &Root, unsigned Pattern,
+    std::array<unsigned, 5> &OperandIndices) const {
+  TargetInstrInfo::getReassociateOperandIndices(Root, Pattern, OperandIndices);
+  if (RISCV::getRVVMCOpcode(Root.getOpcode())) {
+    // Skip the passthrough operand, so increment all indices by one.
+    for (unsigned I = 0; I < 5; ++I)
+      ++OperandIndices[I];
+  }
+}
+
 bool RISCVInstrInfo::hasReassociableSibling(const MachineInstr &Inst,
                                             bool &Commuted) const {
+  if (isVectorAssociativeAndCommutative(Inst) ||
+      isVectorAssociativeAndCommutative(Inst, /*Invert=*/true))
+    return hasReassociableVectorSibling(Inst, Commuted);
+
   if (!TargetInstrInfo::hasReassociableSibling(Inst, Commuted))
     return false;
 
@@ -1654,6 +1876,9 @@ bool RISCVInstrInfo::hasReassociableSibling(const MachineInstr &Inst,
 
 bool RISCVInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
                                                  bool Invert) const {
+  if (isVectorAssociativeAndCommutative(Inst, Invert))
+    return true;
+
   unsigned Opc = Inst.getOpcode();
   if (Invert) {
     auto InverseOpcode = getInverseOpcode(Opc);
@@ -1706,6 +1931,38 @@ bool RISCVInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
 
 std::optional<unsigned>
 RISCVInstrInfo::getInverseOpcode(unsigned Opcode) const {
+#define RVV_OPC_LMUL_CASE(OPC, INV)                                            \
+  case RISCV::OPC##_M1:                                                        \
+    return RISCV::INV##_M1;                                                    \
+  case RISCV::OPC##_M2:                                                        \
+    return RISCV::INV##_M2;                                                    \
+  case RISCV::OPC##_M4:                                                        \
+    return RISCV::INV##_M4;                                                    \
+  case RISCV::OPC##_M8:                                                        \
+    return RISCV::INV##_M8;                                                    \
+  case RISCV::OPC##_MF2:                                                       \
+    return RISCV::INV##_MF2;                                                   \
+  case RISCV::OPC##_MF4:                                                       \
+    return RISCV::INV##_MF4;                                                   \
+  case RISCV::OPC##_MF8:                                                       \
+    return RISCV::INV##_MF8
+
+#define RVV_OPC_LMUL_MASK_CASE(OPC, INV)                                       \
+  case RISCV::OPC##_M1_MASK:                                                   \
+    return RISCV::INV##_M1_MASK;                                               \
+  case RISCV::OPC##_M2_MASK:                                                   \
+    return RISCV::INV##_M2_MASK;                                               \
+  case RISCV::OPC##_M4_MASK:                                                   \
+    return RISCV::INV##_M4_MASK;                                               \
+  case RISCV::OPC##_M8_MASK:                                                   \
+    return RISCV::INV##_M8_MASK;                                               \
+  case RISCV::OPC##_MF2_MASK:                                                  \
+    return RISCV::INV##_MF2_MASK;                                              \
+  case RISCV::OPC##_MF4_MASK:                                                  \
+    return RISCV::INV##_MF4_MASK;                                              \
+  case RISCV::OPC##_MF8_MASK:                                                  \
+    return RISCV::INV##_MF8_MASK
+
   switch (Opcode) {
   default:
     return std::nullopt;
@@ -1729,7 +1986,16 @@ RISCVInstrInfo::getInverseOpcode(unsigned Opcode) const {
     return RISCV::SUBW;
   case RISCV::SUBW:
     return RISCV::ADDW;
+    // clang-format off
+  RVV_OPC_LMUL_CASE(PseudoVADD_VV, PseudoVSUB_VV);
+  RVV_OPC_LMUL_MASK_CASE(PseudoVADD_VV, PseudoVSUB_VV);
+  RVV_OPC_LMUL_CASE(PseudoVSUB_VV, PseudoVADD_VV);
+  RVV_OPC_LMUL_MASK_CASE(PseudoVSUB_VV, PseudoVADD_VV);
+    // clang-format on
   }
+
+#undef RVV_OPC_LMUL_MASK_CASE
+#undef RVV_OPC_LMUL_CASE
 }
 
 static bool canCombineFPFusedMultiply(const MachineInstr &Root,
@@ -2866,6 +3132,11 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
   case CASE_RVV_OPCODE_WIDEN(VWMACC_VV):
   case CASE_RVV_OPCODE_WIDEN(VWMACCU_VV):
   case CASE_RVV_OPCODE_UNMASK(VADC_VVM):
+  case CASE_RVV_OPCODE(VSADD_VV):
+  case CASE_RVV_OPCODE(VSADDU_VV):
+  case CASE_RVV_OPCODE(VAADD_VV):
+  case CASE_RVV_OPCODE(VAADDU_VV):
+  case CASE_RVV_OPCODE(VSMUL_VV):
     // Operands 2 and 3 are commutable.
     return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3);
   case CASE_VFMA_SPLATS(FMADD):
@@ -3152,6 +3423,16 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
   return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
 }
 
+#undef CASE_RVV_OPCODE_UNMASK_LMUL
+#undef CASE_RVV_OPCODE_MASK_LMUL
+#undef CASE_RVV_OPCODE_LMUL
+#undef CASE_RVV_OPCODE_UNMASK_WIDEN
+#undef CASE_RVV_OPCODE_UNMASK
+#undef CASE_RVV_OPCODE_MASK_WIDEN
+#undef CASE_RVV_OPCODE_MASK
+#undef CASE_RVV_OPCODE_WIDEN
+#undef CASE_RVV_OPCODE
+
 #undef CASE_VMA_OPCODE_COMMON
 #undef CASE_VMA_OPCODE_LMULS_M1
 #undef CASE_VMA_OPCODE_LMULS_MF2
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 3b03d5efde6e..170f813eb10d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -266,6 +266,9 @@ public:
       SmallVectorImpl<MachineInstr *> &DelInstrs,
       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
 
+  bool hasReassociableOperands(const MachineInstr &Inst,
+                               const MachineBasicBlock *MBB) const override;
+
   bool hasReassociableSibling(const MachineInstr &Inst,
                               bool &Commuted) const override;
 
@@ -274,6 +277,10 @@ public:
 
   std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
 
+  void getReassociateOperandIndices(
+      const MachineInstr &Root, unsigned Pattern,
+      std::array<unsigned, 5> &OperandIndices) const override;
+
   ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
   getSerializableMachineMemOperandTargetFlags() const override;
 
@@ -297,6 +304,13 @@ protected:
 
 private:
   unsigned getInstBundleLength(const MachineInstr &MI) const;
+
+  bool isVectorAssociativeAndCommutative(const MachineInstr &MI,
+                                         bool Invert = false) const;
+  bool areRVVInstsReassociable(const MachineInstr &MI1,
+                               const MachineInstr &MI2) const;
+  bool hasReassociableVectorSibling(const MachineInstr &Inst,
+                                    bool &Commuted) const;
 };
 
 namespace RISCV {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index f9dadc6c0d48..da4020758eb6 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1813,6 +1813,8 @@ def : Pat<(binop_allwusers<srl> (sext_inreg GPR:$rs1, i32), uimm5:$shamt),
 
 // Use binop_allwusers to recover immediates that may have been broken by
 // SimplifyDemandedBits.
+def : Pat<(binop_allwusers<and> GPR:$rs1, 0xffffffff),
+          (COPY GPR:$rs1)>;
 def : Pat<(binop_allwusers<and> GPR:$rs1, u32simm12:$imm),
           (ANDI GPR:$rs1, u32simm12:$imm)>;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index 18d38348f721..f4e50d7aa45c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -841,7 +841,7 @@ def : InstAlias<".insn_cj $opcode, $funct3, $imm11",
 //===----------------------------------------------------------------------===//
 
 // Patterns are defined in the same order the compressed instructions appear
-// on page 82 of the ISA manual.
+// under the "RVC Instruction Set Listings" section of the ISA manual.
 
 // Quadrant 0
 let Predicates = [HasStdExtCOrZca] in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index e9715b40adc0..22e548861784 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -2146,8 +2146,9 @@ multiclass VPseudoBinaryRoundingMode<VReg RetClass,
                                      string Constraint = "",
                                      int sew = 0,
                                      int UsesVXRM = 1,
-                                     int TargetConstraintType = 1> {
-  let VLMul = MInfo.value, SEW=sew in {
+                                     int TargetConstraintType = 1,
+                                     bit Commutable = 0> {
+  let VLMul = MInfo.value, SEW=sew, isCommutable = Commutable in {
     defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX);
     def suffix : VPseudoBinaryNoMaskRoundingMode<RetClass, Op1Class, Op2Class,
                                                  Constraint, UsesVXRM,
@@ -2232,8 +2233,9 @@ multiclass VPseudoBinaryV_VV<LMULInfo m, string Constraint = "", int sew = 0, bi
   defm _VV : VPseudoBinary<m.vrclass, m.vrclass, m.vrclass, m, Constraint, sew, Commutable=Commutable>;
 }
 
-multiclass VPseudoBinaryV_VV_RM<LMULInfo m, string Constraint = ""> {
-  defm _VV : VPseudoBinaryRoundingMode<m.vrclass, m.vrclass, m.vrclass, m, Constraint>;
+multiclass VPseudoBinaryV_VV_RM<LMULInfo m, string Constraint = "", bit Commutable = 0> {
+  defm _VV : VPseudoBinaryRoundingMode<m.vrclass, m.vrclass, m.vrclass, m, Constraint,
+                                       Commutable=Commutable>;
 }
 
 // Similar to VPseudoBinaryV_VV, but uses MxListF.
@@ -2715,10 +2717,11 @@ multiclass VPseudoVGTR_VV_VX_VI<Operand ImmType = simm5, string Constraint = "">
   }
 }
 
-multiclass VPseudoVSALU_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> {
+multiclass VPseudoVSALU_VV_VX_VI<Operand ImmType = simm5, string Constraint = "",
+                                 bit Commutable = 0> {
   foreach m = MxList in {
     defvar mx = m.MX;
-    defm "" : VPseudoBinaryV_VV<m, Constraint>,
+    defm "" : VPseudoBinaryV_VV<m, Constraint, Commutable=Commutable>,
               SchedBinary<"WriteVSALUV", "ReadVSALUV", "ReadVSALUX", mx,
                           forceMergeOpRead=true>;
     defm "" : VPseudoBinaryV_VX<m, Constraint>,
@@ -2788,7 +2791,7 @@ multiclass VPseudoVSALU_VV_VX {
 multiclass VPseudoVSMUL_VV_VX_RM {
   foreach m = MxList in {
     defvar mx = m.MX;
-    defm "" : VPseudoBinaryV_VV_RM<m>,
+    defm "" : VPseudoBinaryV_VV_RM<m, Commutable=1>,
               SchedBinary<"WriteVSMulV", "ReadVSMulV", "ReadVSMulV", mx,
                           forceMergeOpRead=true>;
     defm "" : VPseudoBinaryV_VX_RM<m>,
@@ -2797,10 +2800,10 @@ multiclass VPseudoVSMUL_VV_VX_RM {
   }
 }
 
-multiclass VPseudoVAALU_VV_VX_RM {
+multiclass VPseudoVAALU_VV_VX_RM<bit Commutable = 0> {
   foreach m = MxList in {
     defvar mx = m.MX;
-    defm "" : VPseudoBinaryV_VV_RM<m>,
+    defm "" : VPseudoBinaryV_VV_RM<m, Commutable=Commutable>,
               SchedBinary<"WriteVAALUV", "ReadVAALUV", "ReadVAALUV", mx,
                           forceMergeOpRead=true>;
     defm "" : VPseudoBinaryV_VX_RM<m>,
@@ -6229,7 +6232,7 @@ defm PseudoVSUX : VPseudoIStore<Ordered=false>;
 //===----------------------------------------------------------------------===//
 
 // vleff may update VL register
-let hasSideEffects = 1, Defs = [VL] in
+let Defs = [VL] in
 defm PseudoVL : VPseudoFFLoad;
 
 //===----------------------------------------------------------------------===//
@@ -6245,7 +6248,7 @@ defm PseudoVSOXSEG : VPseudoISegStore<Ordered=true>;
 defm PseudoVSUXSEG : VPseudoISegStore<Ordered=false>;
 
 // vlseg<nf>e<eew>ff.v may update VL register
-let hasSideEffects = 1, Defs = [VL] in {
+let Defs = [VL] in {
 defm PseudoVLSEG : VPseudoUSSegLoadFF;
 }
 
@@ -6447,9 +6450,9 @@ defm PseudoVMV_V : VPseudoUnaryVMV_V_X_I;
 //===----------------------------------------------------------------------===//
 // 12.1. Vector Single-Width Saturating Add and Subtract
 //===----------------------------------------------------------------------===//
-let Defs = [VXSAT], hasSideEffects = 1 in {
-  defm PseudoVSADDU : VPseudoVSALU_VV_VX_VI;
-  defm PseudoVSADD  : VPseudoVSALU_VV_VX_VI;
+let Defs = [VXSAT] in {
+  defm PseudoVSADDU : VPseudoVSALU_VV_VX_VI<Commutable=1>;
+  defm PseudoVSADD  : VPseudoVSALU_VV_VX_VI<Commutable=1>;
   defm PseudoVSSUBU : VPseudoVSALU_VV_VX;
   defm PseudoVSSUB  : VPseudoVSALU_VV_VX;
 }
@@ -6457,15 +6460,15 @@ let Defs = [VXSAT], hasSideEffects = 1 in {
 //===----------------------------------------------------------------------===//
 // 12.2. Vector Single-Width Averaging Add and Subtract
 //===----------------------------------------------------------------------===//
-defm PseudoVAADDU : VPseudoVAALU_VV_VX_RM;
-defm PseudoVAADD  : VPseudoVAALU_VV_VX_RM;
+defm PseudoVAADDU : VPseudoVAALU_VV_VX_RM<Commutable=1>;
+defm PseudoVAADD  : VPseudoVAALU_VV_VX_RM<Commutable=1>;
 defm PseudoVASUBU : VPseudoVAALU_VV_VX_RM;
 defm PseudoVASUB  : VPseudoVAALU_VV_VX_RM;
 
 //===----------------------------------------------------------------------===//
 // 12.3. Vector Single-Width Fractional Multiply with Rounding and Saturation
 //===----------------------------------------------------------------------===//
-let Defs = [VXSAT], hasSideEffects = 1 in {
+let Defs = [VXSAT] in {
   defm PseudoVSMUL : VPseudoVSMUL_VV_VX_RM;
 }
 
@@ -6478,7 +6481,7 @@ defm PseudoVSSRA : VPseudoVSSHT_VV_VX_VI_RM<uimm5>;
 //===----------------------------------------------------------------------===//
 // 12.5. Vector Narrowing Fixed-Point Clip Instructions
 //===----------------------------------------------------------------------===//
-let Defs = [VXSAT], hasSideEffects = 1 in {
+let Defs = [VXSAT] in {
   defm PseudoVNCLIP  : VPseudoVNCLP_WV_WX_WI_RM;
   defm PseudoVNCLIPU : VPseudoVNCLP_WV_WX_WI_RM;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index aac7dc444a2d..aaf9c019aedf 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -1,4 +1,4 @@
-//===-- RISCVInstrInfoZvk.td - RISC-V 'Zvk' instructions -------*- tablegen -*-===//
+//===-- RISCVInstrInfoZvk.td - RISC-V 'Zvk' instructions ---*- tablegen -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/RISCV/RISCVProfiles.td b/llvm/lib/Target/RISCV/RISCVProfiles.td
new file mode 100644
index 000000000000..5c13710faf65
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVProfiles.td
@@ -0,0 +1,204 @@
+//===------ RISCVProfiles.td - RISC-V Profiles -------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+class RISCVProfile<string name, list<SubtargetFeature> features>
+    : SubtargetFeature<name, "Is" # NAME, "true",
+                       "RISC-V " # name # " profile", features>;
+
+defvar RVI20U32Features = [Feature32Bit, FeatureStdExtI];
+defvar RVI20U64Features = [Feature64Bit, FeatureStdExtI];
+
+defvar RVA20U64Features = [Feature64Bit,
+                           FeatureStdExtI,
+                           FeatureStdExtM,
+                           FeatureStdExtA,
+                           FeatureStdExtF,
+                           FeatureStdExtD,
+                           FeatureStdExtC,
+                           FeatureStdExtZicntr,
+                           FeatureStdExtZiccif,
+                           FeatureStdExtZiccrse,
+                           FeatureStdExtZiccamoa,
+                           FeatureStdExtZa128rs,
+                           FeatureStdExtZicclsm];
+
+defvar RVA20S64Features = !listconcat(RVA20U64Features,
+                                      [FeatureStdExtZifencei,
+                                       FeatureStdExtSvbare,
+                                       FeatureStdExtSvade,
+                                       FeatureStdExtSsccptr,
+                                       FeatureStdExtSstvecd,
+                                       FeatureStdExtSstvala]);
+
+defvar RVA22U64Features = [Feature64Bit,
+                           FeatureStdExtI,
+                           FeatureStdExtM,
+                           FeatureStdExtA,
+                           FeatureStdExtF,
+                           FeatureStdExtD,
+                           FeatureStdExtC,
+                           FeatureStdExtZicntr,
+                           FeatureStdExtZiccif,
+                           FeatureStdExtZiccrse,
+                           FeatureStdExtZiccamoa,
+                           FeatureStdExtZicclsm,
+                           FeatureStdExtZa64rs,
+                           FeatureStdExtZihpm,
+                           FeatureStdExtZihintpause,
+                           FeatureStdExtZba,
+                           FeatureStdExtZbb,
+                           FeatureStdExtZbs,
+                           FeatureStdExtZic64b,
+                           FeatureStdExtZicbom,
+                           FeatureStdExtZicbop,
+                           FeatureStdExtZicboz,
+                           FeatureStdExtZfhmin,
+                           FeatureStdExtZkt];
+
+defvar RVA22S64Features = !listconcat(RVA22U64Features,
+                                      [FeatureStdExtZifencei,
+                                       FeatureStdExtSvbare,
+                                       FeatureStdExtSvade,
+                                       FeatureStdExtSsccptr,
+                                       FeatureStdExtSstvecd,
+                                       FeatureStdExtSstvala,
+                                       FeatureStdExtSscounterenw,
+                                       FeatureStdExtSvpbmt,
+                                       FeatureStdExtSvinval]);
+
+defvar RVA23U64Features = [Feature64Bit,
+                           FeatureStdExtI,
+                           FeatureStdExtM,
+                           FeatureStdExtA,
+                           FeatureStdExtF,
+                           FeatureStdExtD,
+                           FeatureStdExtC,
+                           FeatureStdExtZicntr,
+                           FeatureStdExtZihpm,
+                           FeatureStdExtZiccif,
+                           FeatureStdExtZiccrse,
+                           FeatureStdExtZiccamoa,
+                           FeatureStdExtZicclsm,
+                           FeatureStdExtZa64rs,
+                           FeatureStdExtZihintpause,
+                           FeatureStdExtZba,
+                           FeatureStdExtZbb,
+                           FeatureStdExtZbs,
+                           FeatureStdExtZic64b,
+                           FeatureStdExtZicbom,
+                           FeatureStdExtZicbop,
+                           FeatureStdExtZicboz,
+                           FeatureStdExtZfhmin,
+                           FeatureStdExtZkt,
+                           FeatureStdExtV,
+                           FeatureStdExtZvfhmin,
+                           FeatureStdExtZvbb,
+                           FeatureStdExtZvkt,
+                           FeatureStdExtZihintntl,
+                           FeatureStdExtZicond,
+                           FeatureStdExtZimop,
+                           FeatureStdExtZcmop,
+                           FeatureStdExtZcb,
+                           FeatureStdExtZfa,
+                           FeatureStdExtZawrs];
+
+defvar RVA23S64Features = !listconcat(RVA23U64Features,
+                                      [FeatureStdExtZifencei,
+                                       FeatureStdExtSvbare,
+                                       FeatureStdExtSvade,
+                                       FeatureStdExtSsccptr,
+                                       FeatureStdExtSstvecd,
+                                       FeatureStdExtSstvala,
+                                       FeatureStdExtSscounterenw,
+                                       FeatureStdExtSvpbmt,
+                                       FeatureStdExtSvinval,
+                                       FeatureStdExtSvnapot,
+                                       FeatureStdExtSstc,
+                                       FeatureStdExtSscofpmf,
+                                       FeatureStdExtSsnpm,
+                                       FeatureStdExtSsu64xl,
+                                       FeatureStdExtH,
+                                       FeatureStdExtSsstateen,
+                                       FeatureStdExtShcounterenw,
+                                       FeatureStdExtShvstvala,
+                                       FeatureStdExtShtvala,
+                                       FeatureStdExtShvstvecd,
+                                       FeatureStdExtShvsatpa,
+                                       FeatureStdExtShgatpa]);
+
+defvar RVB23U64Features = [Feature64Bit,
+                           FeatureStdExtI,
+                           FeatureStdExtM,
+                           FeatureStdExtA,
+                           FeatureStdExtF,
+                           FeatureStdExtD,
+                           FeatureStdExtC,
+                           FeatureStdExtZicntr,
+                           FeatureStdExtZihpm,
+                           FeatureStdExtZiccif,
+                           FeatureStdExtZiccrse,
+                           FeatureStdExtZiccamoa,
+                           FeatureStdExtZicclsm,
+                           FeatureStdExtZa64rs,
+                           FeatureStdExtZihintpause,
+                           FeatureStdExtZba,
+                           FeatureStdExtZbb,
+                           FeatureStdExtZbs,
+                           FeatureStdExtZic64b,
+                           FeatureStdExtZicbom,
+                           FeatureStdExtZicbop,
+                           FeatureStdExtZicboz,
+                           FeatureStdExtZkt,
+                           FeatureStdExtZihintntl,
+                           FeatureStdExtZicond,
+                           FeatureStdExtZimop,
+                           FeatureStdExtZcmop,
+                           FeatureStdExtZcb,
+                           FeatureStdExtZfa,
+                           FeatureStdExtZawrs];
+
+defvar RVB23S64Features = !listconcat(RVB23U64Features,
+                                      [FeatureStdExtZifencei,
+                                       FeatureStdExtSvnapot,
+                                       FeatureStdExtSvbare,
+                                       FeatureStdExtSvade,
+                                       FeatureStdExtSsccptr,
+                                       FeatureStdExtSstvecd,
+                                       FeatureStdExtSstvala,
+                                       FeatureStdExtSscounterenw,
+                                       FeatureStdExtSvpbmt,
+                                       FeatureStdExtSvinval,
+                                       FeatureStdExtSstc,
+                                       FeatureStdExtSscofpmf,
+                                       FeatureStdExtSsu64xl]);
+
+defvar RVM23U32Features = [Feature32Bit,
+                           FeatureStdExtI,
+                           FeatureStdExtM,
+                           FeatureStdExtZba,
+                           FeatureStdExtZbb,
+                           FeatureStdExtZbs,
+                           FeatureStdExtZicond,
+                           FeatureStdExtZihintpause,
+                           FeatureStdExtZihintntl,
+                           FeatureStdExtZce,
+                           FeatureStdExtZicbop,
+                           FeatureStdExtZimop,
+                           FeatureStdExtZcmop];
+
+def RVI20U32 : RISCVProfile<"rvi20u32", RVI20U32Features>;
+def RVI20U64 : RISCVProfile<"rvi20u64", RVI20U64Features>;
+def RVA20U64 : RISCVProfile<"rva20u64", RVA20U64Features>;
+def RVA20S64 : RISCVProfile<"rva20s64", RVA20S64Features>;
+def RVA22U64 : RISCVProfile<"rva22u64", RVA22U64Features>;
+def RVA22S64 : RISCVProfile<"rva22s64", RVA22S64Features>;
+def RVA23U64 : RISCVProfile<"rva23u64", RVA23U64Features>;
+def RVA23S64 : RISCVProfile<"rva23s64", RVA23S64Features>;
+def RVB23U64 : RISCVProfile<"rvb23u64", RVB23U64Features>;
+def RVB23S64 : RISCVProfile<"rvb23s64", RVB23S64Features>;
+def RVM23U32 : RISCVProfile<"rvm23u32", RVM23U32Features>;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 85f8f5f654fe..c880c9e921e0 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -200,6 +200,17 @@ public:
     return Min;
   }
 
+  /// If the ElementCount or TypeSize \p X is scalable and VScale (VLEN) is
+  /// exactly known, returns \p X converted to a fixed quantity. Otherwise
+  /// returns \p X unmodified.
+  template <typename Quantity> Quantity expandVScale(Quantity X) const {
+    if (auto VLen = getRealVLen(); VLen && X.isScalable()) {
+      const unsigned VScale = *VLen / RISCV::RVVBitsPerBlock;
+      X = Quantity::getFixed(X.getKnownMinValue() * VScale);
+    }
+    return X;
+  }
+
   RISCVABI::ABI getTargetABI() const { return TargetABI; }
   bool isSoftFPABI() const {
     return TargetABI == RISCVABI::ABI_LP64 ||
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
index 78dfbf4ec932..74ebaa9d0c00 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
@@ -53,9 +53,9 @@ static MCStreamer *
 createSPIRVMCStreamer(const Triple &T, MCContext &Ctx,
                       std::unique_ptr<MCAsmBackend> &&MAB,
                       std::unique_ptr<MCObjectWriter> &&OW,
-                      std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll) {
+                      std::unique_ptr<MCCodeEmitter> &&Emitter) {
   return createSPIRVStreamer(Ctx, std::move(MAB), std::move(OW),
-                             std::move(Emitter), RelaxAll);
+                             std::move(Emitter));
 }
 
 static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 4b07d7e61fa1..7439d0fefa98 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -189,6 +189,10 @@ lookupBuiltin(StringRef DemangledCall,
   std::string BuiltinName =
       DemangledCall.substr(0, DemangledCall.find('(')).str();
 
+  // Account for possible "__spirv_ocl_" prefix in SPIR-V friendly LLVM IR
+  if (BuiltinName.rfind("__spirv_ocl_", 0) == 0)
+    BuiltinName = BuiltinName.substr(12);
+
   // Check if the extracted name contains type information between angle
   // brackets. If so, the builtin is an instantiated template - needs to have
   // the information after angle brackets and return type removed.
@@ -2008,6 +2012,13 @@ static bool generateAsyncCopy(const SPIRV::IncomingCall *Call,
   const SPIRV::DemangledBuiltin *Builtin = Call->Builtin;
   unsigned Opcode =
       SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode;
+
+  bool IsSet = Opcode == SPIRV::OpGroupAsyncCopy;
+  Register TypeReg = GR->getSPIRVTypeID(Call->ReturnType);
+  if (Call->isSpirvOp())
+    return buildOpFromWrapper(MIRBuilder, Opcode, Call,
+                              IsSet ? TypeReg : Register(0));
+
   auto Scope = buildConstantIntReg(SPIRV::Scope::Workgroup, MIRBuilder, GR);
 
   switch (Opcode) {
@@ -2306,7 +2317,7 @@ Type *parseBuiltinCallArgumentBaseType(const StringRef DemangledCall,
     // parseBuiltinCallArgumentBaseType(...) as this function only retrieves the
     // base types.
     if (TypeStr.ends_with("*"))
-      TypeStr = TypeStr.slice(0, TypeStr.find_first_of(" "));
+      TypeStr = TypeStr.slice(0, TypeStr.find_first_of(" *"));
 
     return parseBuiltinTypeNameToTargetExtType("opencl." + TypeStr.str() + "_t",
                                                Ctx);
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 660000fb548d..564028547821 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -585,9 +585,9 @@ defm : DemangledNativeBuiltin<"__spirv_SpecConstantComposite", OpenCL_std, SpecC
 
 // Async Copy and Prefetch builtin records:
 defm : DemangledNativeBuiltin<"async_work_group_copy", OpenCL_std, AsyncCopy, 4, 4, OpGroupAsyncCopy>;
-defm : DemangledNativeBuiltin<"__spirv_GroupAsyncCopy", OpenCL_std, AsyncCopy, 4, 4, OpGroupAsyncCopy>;
+defm : DemangledNativeBuiltin<"__spirv_GroupAsyncCopy", OpenCL_std, AsyncCopy, 6, 6, OpGroupAsyncCopy>;
 defm : DemangledNativeBuiltin<"wait_group_events", OpenCL_std, AsyncCopy, 2, 2, OpGroupWaitEvents>;
-defm : DemangledNativeBuiltin<"__spirv_GroupWaitEvents", OpenCL_std, AsyncCopy, 2, 2, OpGroupWaitEvents>;
+defm : DemangledNativeBuiltin<"__spirv_GroupWaitEvents", OpenCL_std, AsyncCopy, 3, 3, OpGroupWaitEvents>;
 
 // Load and store builtin records:
 defm : DemangledNativeBuiltin<"__spirv_Load", OpenCL_std, LoadStore, 1, 3, OpLoad>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 472bc8638c9a..0d539b1ed9a8 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -98,6 +98,8 @@ class SPIRVEmitIntrinsics
     return B.CreateIntrinsic(IntrID, {Types}, Args);
   }
 
+  void buildAssignPtr(IRBuilder<> &B, Type *ElemTy, Value *Arg);
+
   void replaceMemInstrUses(Instruction *Old, Instruction *New, IRBuilder<> &B);
   void processInstrAfterVisit(Instruction *I, IRBuilder<> &B);
   void insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B);
@@ -111,6 +113,7 @@ class SPIRVEmitIntrinsics
   void insertPtrCastOrAssignTypeInstr(Instruction *I, IRBuilder<> &B);
   void processGlobalValue(GlobalVariable &GV, IRBuilder<> &B);
   void processParamTypes(Function *F, IRBuilder<> &B);
+  void processParamTypesByFunHeader(Function *F, IRBuilder<> &B);
   Type *deduceFunParamElementType(Function *F, unsigned OpIdx);
   Type *deduceFunParamElementType(Function *F, unsigned OpIdx,
                                   std::unordered_set<Function *> &FVisited);
@@ -194,6 +197,17 @@ static inline void reportFatalOnTokenType(const Instruction *I) {
                        false);
 }
 
+void SPIRVEmitIntrinsics::buildAssignPtr(IRBuilder<> &B, Type *ElemTy,
+                                         Value *Arg) {
+  CallInst *AssignPtrTyCI =
+      buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {Arg->getType()},
+                      Constant::getNullValue(ElemTy), Arg,
+                      {B.getInt32(getPointerAddressSpace(Arg->getType()))}, B);
+  GR->addDeducedElementType(AssignPtrTyCI, ElemTy);
+  GR->addDeducedElementType(Arg, ElemTy);
+  AssignPtrTypeInstr[Arg] = AssignPtrTyCI;
+}
+
 // Set element pointer type to the given value of ValueTy and tries to
 // specify this type further (recursively) by Operand value, if needed.
 Type *SPIRVEmitIntrinsics::deduceElementTypeByValueDeep(
@@ -232,6 +246,19 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeByUsersDeep(
   return nullptr;
 }
 
+// Implements what we know in advance about intrinsics and builtin calls
+// TODO: consider feasibility of this particular case to be generalized by
+// encoding knowledge about intrinsics and builtin calls by corresponding
+// specification rules
+static Type *getPointeeTypeByCallInst(StringRef DemangledName,
+                                      Function *CalledF, unsigned OpIdx) {
+  if ((DemangledName.starts_with("__spirv_ocl_printf(") ||
+       DemangledName.starts_with("printf(")) &&
+      OpIdx == 0)
+    return IntegerType::getInt8Ty(CalledF->getContext());
+  return nullptr;
+}
+
 // Deduce and return a successfully deduced Type of the Instruction,
 // or nullptr otherwise.
 Type *SPIRVEmitIntrinsics::deduceElementTypeHelper(Value *I) {
@@ -795,6 +822,8 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
     return;
 
   // collect information about formal parameter types
+  std::string DemangledName =
+      getOclOrSpirvBuiltinDemangledName(CI->getCalledFunction()->getName());
   Function *CalledF = CI->getCalledFunction();
   SmallVector<Type *, 4> CalledArgTys;
   bool HaveTypes = false;
@@ -811,10 +840,15 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
       if (!ElemTy && hasPointeeTypeAttr(CalledArg))
         ElemTy = getPointeeTypeByAttr(CalledArg);
       if (!ElemTy) {
-        for (User *U : CalledArg->users()) {
-          if (Instruction *Inst = dyn_cast<Instruction>(U)) {
-            if ((ElemTy = deduceElementTypeHelper(Inst)) != nullptr)
-              break;
+        ElemTy = getPointeeTypeByCallInst(DemangledName, CalledF, OpIdx);
+        if (ElemTy) {
+          GR->addDeducedElementType(CalledArg, ElemTy);
+        } else {
+          for (User *U : CalledArg->users()) {
+            if (Instruction *Inst = dyn_cast<Instruction>(U)) {
+              if ((ElemTy = deduceElementTypeHelper(Inst)) != nullptr)
+                break;
+            }
           }
         }
       }
@@ -823,8 +857,6 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
     }
   }
 
-  std::string DemangledName =
-      getOclOrSpirvBuiltinDemangledName(CI->getCalledFunction()->getName());
   if (DemangledName.empty() && !HaveTypes)
     return;
 
@@ -835,8 +867,14 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
       continue;
 
     // Constants (nulls/undefs) are handled in insertAssignPtrTypeIntrs()
-    if (!isa<Instruction>(ArgOperand) && !isa<Argument>(ArgOperand))
-      continue;
+    if (!isa<Instruction>(ArgOperand) && !isa<Argument>(ArgOperand)) {
+      // However, we may have assumptions about the formal argument's type and
+      // may have a need to insert a ptr cast for the actual parameter of this
+      // call.
+      Argument *CalledArg = CalledF->getArg(OpIdx);
+      if (!GR->findDeducedElementType(CalledArg))
+        continue;
+    }
 
     Type *ExpectedType =
         OpIdx < CalledArgTys.size() ? CalledArgTys[OpIdx] : nullptr;
@@ -1102,9 +1140,13 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
                  (II->paramHasAttr(OpNo, Attribute::ImmArg))))
         continue;
       B.SetInsertPoint(I);
-      auto *NewOp =
-          buildIntrWithMD(Intrinsic::spv_track_constant,
-                          {Op->getType(), Op->getType()}, Op, Op, {}, B);
+      Value *OpTyVal = Op;
+      if (Op->getType()->isTargetExtTy())
+        OpTyVal = Constant::getNullValue(
+            IntegerType::get(I->getContext(), GR->getPointerSize()));
+      auto *NewOp = buildIntrWithMD(Intrinsic::spv_track_constant,
+                                    {Op->getType(), OpTyVal->getType()}, Op,
+                                    OpTyVal, {}, B);
       I->setOperand(OpNo, NewOp);
     }
   }
@@ -1179,28 +1221,29 @@ Type *SPIRVEmitIntrinsics::deduceFunParamElementType(
   return nullptr;
 }
 
-void SPIRVEmitIntrinsics::processParamTypes(Function *F, IRBuilder<> &B) {
+void SPIRVEmitIntrinsics::processParamTypesByFunHeader(Function *F,
+                                                       IRBuilder<> &B) {
   B.SetInsertPointPastAllocas(F);
   for (unsigned OpIdx = 0; OpIdx < F->arg_size(); ++OpIdx) {
     Argument *Arg = F->getArg(OpIdx);
     if (!isUntypedPointerTy(Arg->getType()))
       continue;
+    Type *ElemTy = GR->findDeducedElementType(Arg);
+    if (!ElemTy && hasPointeeTypeAttr(Arg) &&
+        (ElemTy = getPointeeTypeByAttr(Arg)) != nullptr)
+      buildAssignPtr(B, ElemTy, Arg);
+  }
+}
 
+void SPIRVEmitIntrinsics::processParamTypes(Function *F, IRBuilder<> &B) {
+  B.SetInsertPointPastAllocas(F);
+  for (unsigned OpIdx = 0; OpIdx < F->arg_size(); ++OpIdx) {
+    Argument *Arg = F->getArg(OpIdx);
+    if (!isUntypedPointerTy(Arg->getType()))
+      continue;
     Type *ElemTy = GR->findDeducedElementType(Arg);
-    if (!ElemTy) {
-      if (hasPointeeTypeAttr(Arg) &&
-          (ElemTy = getPointeeTypeByAttr(Arg)) != nullptr) {
-        GR->addDeducedElementType(Arg, ElemTy);
-      } else if ((ElemTy = deduceFunParamElementType(F, OpIdx)) != nullptr) {
-        CallInst *AssignPtrTyCI = buildIntrWithMD(
-            Intrinsic::spv_assign_ptr_type, {Arg->getType()},
-            Constant::getNullValue(ElemTy), Arg,
-            {B.getInt32(getPointerAddressSpace(Arg->getType()))}, B);
-        GR->addDeducedElementType(AssignPtrTyCI, ElemTy);
-        GR->addDeducedElementType(Arg, ElemTy);
-        AssignPtrTypeInstr[Arg] = AssignPtrTyCI;
-      }
-    }
+    if (!ElemTy && (ElemTy = deduceFunParamElementType(F, OpIdx)) != nullptr)
+      buildAssignPtr(B, ElemTy, Arg);
   }
 }
 
@@ -1217,6 +1260,8 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
   AggrConstTypes.clear();
   AggrStores.clear();
 
+  processParamTypesByFunHeader(F, B);
+
   // StoreInst's operand type can be changed during the next transformations,
   // so we need to store it in the set. Also store already transformed types.
   for (auto &I : instructions(Func)) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
index b8296c3f6eea..96b4a570a26b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
@@ -314,6 +314,16 @@ void SPIRVTargetLowering::finalizeLowering(MachineFunction &MF) const {
                                       SPIRV::OpTypeBool))
           MI.setDesc(STI.getInstrInfo()->get(SPIRV::OpLogicalNotEqual));
         break;
+      case SPIRV::OpConstantI: {
+        SPIRVType *Type = GR.getSPIRVTypeForVReg(MI.getOperand(1).getReg());
+        if (Type->getOpcode() != SPIRV::OpTypeInt && MI.getOperand(2).isImm() &&
+            MI.getOperand(2).getImm() == 0) {
+          // Validate the null constant of a target extension type
+          MI.setDesc(STI.getInstrInfo()->get(SPIRV::OpConstantNull));
+          for (unsigned i = MI.getNumOperands() - 1; i > 1; --i)
+            MI.removeOperand(i);
+        }
+      } break;
       }
     }
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 9ee0b38d2233..84508fb5fe09 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -38,7 +38,9 @@ public:
 };
 } // namespace
 
-static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR) {
+static void
+addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR,
+                    DenseMap<MachineInstr *, Type *> &TargetExtConstTypes) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   DenseMap<MachineInstr *, Register> RegsAlreadyAddedToDT;
   SmallVector<MachineInstr *, 10> ToErase, ToEraseComposites;
@@ -47,6 +49,7 @@ static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR) {
       if (!isSpvIntrinsic(MI, Intrinsic::spv_track_constant))
         continue;
       ToErase.push_back(&MI);
+      Register SrcReg = MI.getOperand(2).getReg();
       auto *Const =
           cast<Constant>(cast<ConstantAsMetadata>(
                              MI.getOperand(3).getMetadata()->getOperand(0))
@@ -54,14 +57,14 @@ static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR) {
       if (auto *GV = dyn_cast<GlobalValue>(Const)) {
         Register Reg = GR->find(GV, &MF);
         if (!Reg.isValid())
-          GR->add(GV, &MF, MI.getOperand(2).getReg());
+          GR->add(GV, &MF, SrcReg);
         else
           RegsAlreadyAddedToDT[&MI] = Reg;
       } else {
         Register Reg = GR->find(Const, &MF);
         if (!Reg.isValid()) {
           if (auto *ConstVec = dyn_cast<ConstantDataVector>(Const)) {
-            auto *BuildVec = MRI.getVRegDef(MI.getOperand(2).getReg());
+            auto *BuildVec = MRI.getVRegDef(SrcReg);
             assert(BuildVec &&
                    BuildVec->getOpcode() == TargetOpcode::G_BUILD_VECTOR);
             for (unsigned i = 0; i < ConstVec->getNumElements(); ++i) {
@@ -75,7 +78,13 @@ static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR) {
                 BuildVec->getOperand(1 + i).setReg(ElemReg);
             }
           }
-          GR->add(Const, &MF, MI.getOperand(2).getReg());
+          GR->add(Const, &MF, SrcReg);
+          if (Const->getType()->isTargetExtTy()) {
+            // remember association so that we can restore it when assign types
+            MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+            if (SrcMI && SrcMI->getOpcode() == TargetOpcode::G_CONSTANT)
+              TargetExtConstTypes[SrcMI] = Const->getType();
+          }
         } else {
           RegsAlreadyAddedToDT[&MI] = Reg;
           // This MI is unused and will be removed. If the MI uses
@@ -364,8 +373,10 @@ void processInstr(MachineInstr &MI, MachineIRBuilder &MIB,
 }
 } // namespace llvm
 
-static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
-                                 MachineIRBuilder MIB) {
+static void
+generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
+                     MachineIRBuilder MIB,
+                     DenseMap<MachineInstr *, Type *> &TargetExtConstTypes) {
   // Get access to information about available extensions
   const SPIRVSubtarget *ST =
       static_cast<const SPIRVSubtarget *>(&MIB.getMF().getSubtarget());
@@ -422,11 +433,14 @@ static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
             continue;
         }
         Type *Ty = nullptr;
-        if (MI.getOpcode() == TargetOpcode::G_CONSTANT)
-          Ty = MI.getOperand(1).getCImm()->getType();
-        else if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
+        if (MI.getOpcode() == TargetOpcode::G_CONSTANT) {
+          auto TargetExtIt = TargetExtConstTypes.find(&MI);
+          Ty = TargetExtIt == TargetExtConstTypes.end()
+                   ? MI.getOperand(1).getCImm()->getType()
+                   : TargetExtIt->second;
+        } else if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) {
           Ty = MI.getOperand(1).getFPImm()->getType();
-        else {
+        } else {
           assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
           Type *ElemTy = nullptr;
           MachineInstr *ElemMI = MRI.getVRegDef(MI.getOperand(1).getReg());
@@ -616,10 +630,12 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
   SPIRVGlobalRegistry *GR = ST.getSPIRVGlobalRegistry();
   GR->setCurrentFunc(MF);
   MachineIRBuilder MIB(MF);
-  addConstantsToTrack(MF, GR);
+  // a registry of target extension constants
+  DenseMap<MachineInstr *, Type *> TargetExtConstTypes;
+  addConstantsToTrack(MF, GR, TargetExtConstTypes);
   foldConstantsIntoIntrinsics(MF);
   insertBitcasts(MF, GR, MIB);
-  generateAssignInstrs(MF, GR, MIB);
+  generateAssignInstrs(MF, GR, MIB, TargetExtConstTypes);
   processSwitches(MF, GR, MIB);
   processInstrsWithTypeFolding(MF, GR, MIB);
   removeImplicitFallthroughs(MF, MIB);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index deaf3dcaeb92..1bf23c2e8e41 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -350,6 +350,11 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
   // Try to expand a boolean SELECT_CCMASK using an IPM sequence.
   SDValue expandSelectBoolean(SDNode *Node);
 
+  // Return true if the flags of N and the subtarget allows for
+  // reassociation, in which case a reg/reg opcode is needed as input to the
+  // MachineCombiner.
+  bool shouldSelectForReassoc(SDNode *N) const;
+
 public:
   static char ID;
 
@@ -2044,6 +2049,15 @@ SDValue SystemZDAGToDAGISel::expandSelectBoolean(SDNode *Node) {
   return Result;
 }
 
+bool SystemZDAGToDAGISel::shouldSelectForReassoc(SDNode *N) const {
+  EVT VT = N->getValueType(0);
+  assert(VT.isFloatingPoint() && "Expected FP SDNode");
+  return N->getFlags().hasAllowReassociation() &&
+         N->getFlags().hasNoSignedZeros() && Subtarget->hasVector() &&
+         (VT != MVT::f32 || Subtarget->hasVectorEnhancements1()) &&
+         !N->isStrictFPOpcode();
+}
+
 void SystemZDAGToDAGISel::PreprocessISelDAG() {
   // If we have conditional immediate loads, we always prefer
   // using those over an IPM sequence.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 115f34fa7751..2da4431cf077 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -9631,7 +9631,7 @@ SDValue SystemZTargetLowering::lowerVECREDUCE_ADD(SDValue Op,
   case 8:
   case 16:
     Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Zero);
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   case 32:
   case 64:
     Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::i128, Op,
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index f4b5aeaebef9..aad04a2b4159 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -430,8 +430,10 @@ let Uses = [FPC], mayRaiseFPException = 1,
     def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64,  FP64>;
     def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>;
   }
-  defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, z_load, 4>;
-  defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, z_load, 8>;
+  defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, z_any_fadd_noreassoc, FP32,
+                                z_load, 4>;
+  defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, z_any_fadd_noreassoc, FP64,
+                                z_load, 8>;
 }
 
 // Subtraction.
@@ -441,8 +443,10 @@ let Uses = [FPC], mayRaiseFPException = 1,
   def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64,  FP64>;
   def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>;
 
-  defm SEB : BinaryRXEAndPseudo<"seb",  0xED0B, any_fsub, FP32, z_load, 4>;
-  defm SDB : BinaryRXEAndPseudo<"sdb",  0xED1B, any_fsub, FP64, z_load, 8>;
+  defm SEB : BinaryRXEAndPseudo<"seb",  0xED0B, z_any_fsub_noreassoc, FP32,
+                                z_load, 4>;
+  defm SDB : BinaryRXEAndPseudo<"sdb",  0xED1B, z_any_fsub_noreassoc, FP64,
+                                z_load, 8>;
 }
 
 // Multiplication.
@@ -452,8 +456,10 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
     def MDBR  : BinaryRRE<"mdbr",  0xB31C, any_fmul, FP64,  FP64>;
     def MXBR  : BinaryRRE<"mxbr",  0xB34C, any_fmul, FP128, FP128>;
   }
-  defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, z_load, 4>;
-  defm MDB  : BinaryRXEAndPseudo<"mdb",  0xED1C, any_fmul, FP64, z_load, 8>;
+  defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, z_any_fmul_noreassoc, FP32,
+                                  z_load, 4>;
+  defm MDB  : BinaryRXEAndPseudo<"mdb",  0xED1C, z_any_fmul_noreassoc, FP64,
+                                  z_load, 8>;
 }
 
 // f64 multiplication of two FP32 registers.
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 6b75c30943b4..b3517fb0ea77 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -610,6 +610,32 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
     .addImm(CCValid).addImm(CCMask);
 }
 
+MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
+                                                  const MachineRegisterInfo *MRI,
+                                                  Register &FoldAsLoadDefReg,
+                                                  MachineInstr *&DefMI) const {
+  // Check whether we can move the DefMI load, and that it only has one use.
+  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+  assert(DefMI);
+  bool SawStore = false;
+  if (!DefMI->isSafeToMove(nullptr, SawStore) ||
+      !MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
+    return nullptr;
+
+  int UseOpIdx =
+      MI.findRegisterUseOperandIdx(FoldAsLoadDefReg, /*TRI=*/nullptr);
+  assert(UseOpIdx != -1 && "Expected FoldAsLoadDefReg to be used by MI.");
+
+  // Check whether we can fold the load.
+  if (MachineInstr *FoldMI =
+          foldMemoryOperand(MI, {((unsigned)UseOpIdx)}, *DefMI)) {
+    FoldAsLoadDefReg = 0;
+    return FoldMI;
+  }
+
+  return nullptr;
+}
+
 bool SystemZInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                      Register Reg,
                                      MachineRegisterInfo *MRI) const {
@@ -840,6 +866,31 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     return;
   }
 
+  if (SystemZ::GR128BitRegClass.contains(DestReg) &&
+      SystemZ::VR128BitRegClass.contains(SrcReg)) {
+    MCRegister DestH64 = RI.getSubReg(DestReg, SystemZ::subreg_h64);
+    MCRegister DestL64 = RI.getSubReg(DestReg, SystemZ::subreg_l64);
+
+    BuildMI(MBB, MBBI, DL, get(SystemZ::VLGVG), DestH64)
+        .addReg(SrcReg)
+        .addReg(SystemZ::NoRegister)
+        .addImm(0)
+        .addDef(DestReg, RegState::Implicit);
+    BuildMI(MBB, MBBI, DL, get(SystemZ::VLGVG), DestL64)
+        .addReg(SrcReg, getKillRegState(KillSrc))
+        .addReg(SystemZ::NoRegister)
+        .addImm(1);
+    return;
+  }
+
+  if (SystemZ::VR128BitRegClass.contains(DestReg) &&
+      SystemZ::GR128BitRegClass.contains(SrcReg)) {
+    BuildMI(MBB, MBBI, DL, get(SystemZ::VLVGP), DestReg)
+        .addReg(RI.getSubReg(SrcReg, SystemZ::subreg_h64))
+        .addReg(RI.getSubReg(SrcReg, SystemZ::subreg_l64));
+    return;
+  }
+
   // Everything else needs only one instruction.
   unsigned Opcode;
   if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
@@ -1004,6 +1055,67 @@ SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
   return nullptr;
 }
 
+bool SystemZInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
+                                                   bool Invert) const {
+  unsigned Opc = Inst.getOpcode();
+  if (Invert) {
+    auto InverseOpcode = getInverseOpcode(Opc);
+    if (!InverseOpcode)
+      return false;
+    Opc = *InverseOpcode;
+  }
+
+  switch (Opc) {
+  default:
+    break;
+  // Adds and multiplications.
+  case SystemZ::WFADB:
+  case SystemZ::WFASB:
+  case SystemZ::WFAXB:
+  case SystemZ::VFADB:
+  case SystemZ::VFASB:
+  case SystemZ::WFMDB:
+  case SystemZ::WFMSB:
+  case SystemZ::WFMXB:
+  case SystemZ::VFMDB:
+  case SystemZ::VFMSB:
+    return (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+            Inst.getFlag(MachineInstr::MIFlag::FmNsz));
+  }
+
+  return false;
+}
+
+std::optional<unsigned>
+SystemZInstrInfo::getInverseOpcode(unsigned Opcode) const {
+  // fadd => fsub
+  switch (Opcode) {
+  case SystemZ::WFADB:
+    return SystemZ::WFSDB;
+  case SystemZ::WFASB:
+    return SystemZ::WFSSB;
+  case SystemZ::WFAXB:
+    return SystemZ::WFSXB;
+  case SystemZ::VFADB:
+    return SystemZ::VFSDB;
+  case SystemZ::VFASB:
+    return SystemZ::VFSSB;
+  // fsub => fadd
+  case SystemZ::WFSDB:
+    return SystemZ::WFADB;
+  case SystemZ::WFSSB:
+    return SystemZ::WFASB;
+  case SystemZ::WFSXB:
+    return SystemZ::WFAXB;
+  case SystemZ::VFSDB:
+    return SystemZ::VFADB;
+  case SystemZ::VFSSB:
+    return SystemZ::VFASB;
+  default:
+    return std::nullopt;
+  }
+}
+
 MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, int FrameIndex,
@@ -1338,7 +1450,83 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
     LiveIntervals *LIS) const {
-  return nullptr;
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  MachineBasicBlock *MBB = MI.getParent();
+
+  // For reassociable FP operations, any loads have been purposefully left
+  // unfolded so that MachineCombiner can do its work on reg/reg
+  // opcodes. After that, as many loads as possible are now folded.
+  // TODO: This may be beneficial with other opcodes as well as machine-sink
+  // can move loads close to their user in a different MBB, which the isel
+  // matcher did not see.
+  unsigned LoadOpc = 0;
+  unsigned RegMemOpcode = 0;
+  const TargetRegisterClass *FPRC = nullptr;
+  RegMemOpcode = MI.getOpcode() == SystemZ::WFADB   ? SystemZ::ADB
+                 : MI.getOpcode() == SystemZ::WFSDB ? SystemZ::SDB
+                 : MI.getOpcode() == SystemZ::WFMDB ? SystemZ::MDB
+                                                    : 0;
+  if (RegMemOpcode) {
+    LoadOpc = SystemZ::VL64;
+    FPRC = &SystemZ::FP64BitRegClass;
+  } else {
+    RegMemOpcode = MI.getOpcode() == SystemZ::WFASB   ? SystemZ::AEB
+                   : MI.getOpcode() == SystemZ::WFSSB ? SystemZ::SEB
+                   : MI.getOpcode() == SystemZ::WFMSB ? SystemZ::MEEB
+                                                      : 0;
+    if (RegMemOpcode) {
+      LoadOpc = SystemZ::VL32;
+      FPRC = &SystemZ::FP32BitRegClass;
+    }
+  }
+  if (!RegMemOpcode || LoadMI.getOpcode() != LoadOpc)
+    return nullptr;
+
+  // If RegMemOpcode clobbers CC, first make sure CC is not live at this point.
+  if (get(RegMemOpcode).hasImplicitDefOfPhysReg(SystemZ::CC)) {
+    assert(LoadMI.getParent() == MI.getParent() && "Assuming a local fold.");
+    assert(LoadMI != InsertPt && "Assuming InsertPt not to be first in MBB.");
+    for (MachineBasicBlock::iterator MII = std::prev(InsertPt);;
+         --MII) {
+      if (MII->definesRegister(SystemZ::CC, /*TRI=*/nullptr)) {
+        if (!MII->registerDefIsDead(SystemZ::CC, /*TRI=*/nullptr))
+          return nullptr;
+        break;
+      }
+      if (MII == MBB->begin()) {
+        if (MBB->isLiveIn(SystemZ::CC))
+          return nullptr;
+        break;
+      }
+    }
+  }
+
+  Register FoldAsLoadDefReg = LoadMI.getOperand(0).getReg();
+  if (Ops.size() != 1 || FoldAsLoadDefReg != MI.getOperand(Ops[0]).getReg())
+    return nullptr;
+  Register DstReg = MI.getOperand(0).getReg();
+  MachineOperand LHS = MI.getOperand(1);
+  MachineOperand RHS = MI.getOperand(2);
+  MachineOperand &RegMO = RHS.getReg() == FoldAsLoadDefReg ? LHS : RHS;
+  if ((RegMemOpcode == SystemZ::SDB || RegMemOpcode == SystemZ::SEB) &&
+      FoldAsLoadDefReg != RHS.getReg())
+    return nullptr;
+
+  MachineOperand &Base = LoadMI.getOperand(1);
+  MachineOperand &Disp = LoadMI.getOperand(2);
+  MachineOperand &Indx = LoadMI.getOperand(3);
+  MachineInstrBuilder MIB =
+      BuildMI(*MI.getParent(), InsertPt, MI.getDebugLoc(), get(RegMemOpcode), DstReg)
+          .add(RegMO)
+          .add(Base)
+          .add(Disp)
+          .add(Indx);
+  MIB->addRegisterDead(SystemZ::CC, &RI);
+  MRI->setRegClass(DstReg, FPRC);
+  MRI->setRegClass(RegMO.getReg(), FPRC);
+  transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
+
+  return MIB;
 }
 
 bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index cdf07310108a..aa10fb564962 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -254,8 +254,13 @@ public:
                     const DebugLoc &DL, Register DstReg,
                     ArrayRef<MachineOperand> Cond, Register TrueReg,
                     Register FalseReg) const override;
+  MachineInstr *optimizeLoadInstr(MachineInstr &MI,
+                                  const MachineRegisterInfo *MRI,
+                                  Register &FoldAsLoadDefReg,
+                                  MachineInstr *&DefMI) const override;
   bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
                      MachineRegisterInfo *MRI) const override;
+
   bool isPredicable(const MachineInstr &MI) const override;
   bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
                            unsigned ExtraPredCycles,
@@ -285,6 +290,12 @@ public:
                             Register VReg) const override;
   MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
                                       LiveIntervals *LIS) const override;
+
+  bool useMachineCombiner() const override { return true; }
+  bool isAssociativeAndCommutative(const MachineInstr &Inst,
+                                   bool Invert) const override;
+  std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
+
   MachineInstr *
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 245e3c3399a9..c29c54a6cb79 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -139,7 +139,7 @@ let Predicates = [FeatureVector] in {
   // LEY and LDY offer full 20-bit displacement fields.  It's often better
   // to use those instructions rather than force a 20-bit displacement
   // into a GPR temporary.
-  let mayLoad = 1 in {
+  let mayLoad = 1, canFoldAsLoad = 1 in {
     def VL32 : UnaryAliasVRX<z_load, v32sb, bdxaddr12pair>;
     def VL64 : UnaryAliasVRX<z_load, v64db, bdxaddr12pair>;
   }
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 1611436b01b7..6cb89ccff85e 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -829,6 +829,18 @@ def any_fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
 // Floating-point negative absolute.
 def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
 
+// Floating-point operations which will not participate in reassociation, and
+// therefore are candidates for reg/mem folding during isel.
+def z_any_fadd_noreassoc : PatFrag<(ops node:$src1, node:$src2),
+                                   (any_fadd node:$src1, node:$src2),
+                                   [{ return !shouldSelectForReassoc(N); }]>;
+def z_any_fsub_noreassoc : PatFrag<(ops node:$src1, node:$src2),
+                                   (any_fsub node:$src1, node:$src2),
+                                   [{ return !shouldSelectForReassoc(N); }]>;
+def z_any_fmul_noreassoc : PatFrag<(ops node:$src1, node:$src2),
+                                   (any_fmul node:$src1, node:$src2),
+                                   [{ return !shouldSelectForReassoc(N); }]>;
+
 // Strict floating-point fragments.
 def z_any_fcmp    : PatFrags<(ops node:$lhs, node:$rhs),
                              [(z_strict_fcmp node:$lhs, node:$rhs),
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 2491bd2ee2c1..dced64d6b21a 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -30,6 +30,11 @@
 
 using namespace llvm;
 
+static cl::opt<bool> EnableMachineCombinerPass(
+    "systemz-machine-combiner",
+    cl::desc("Enable the machine combiner pass"),
+    cl::init(true), cl::Hidden);
+
 // NOLINTNEXTLINE(readability-identifier-naming)
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
   // Register the target.
@@ -245,6 +250,10 @@ bool SystemZPassConfig::addInstSelector() {
 
 bool SystemZPassConfig::addILPOpts() {
   addPass(&EarlyIfConverterID);
+
+  if (EnableMachineCombinerPass)
+    addPass(&MachineCombinerID);
+
   return true;
 }
 
diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt
index f430be2653b4..1e83cbeac50d 100644
--- a/llvm/lib/Target/WebAssembly/CMakeLists.txt
+++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt
@@ -19,6 +19,7 @@ add_llvm_target(WebAssemblyCodeGen
   WebAssemblyArgumentMove.cpp
   WebAssemblyAsmPrinter.cpp
   WebAssemblyCFGStackify.cpp
+  WebAssemblyCleanCodeAfterTrap.cpp
   WebAssemblyCFGSort.cpp
   WebAssemblyDebugFixup.cpp
   WebAssemblyDebugValueManager.cpp
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
index b7b5b2a97c59..8ea02bd2ad1f 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
@@ -18,24 +18,16 @@
 using namespace llvm;
 
 std::optional<wasm::ValType> WebAssembly::parseType(StringRef Type) {
-  // FIXME: can't use StringSwitch because wasm::ValType doesn't have a
-  // "invalid" value.
-  if (Type == "i32")
-    return wasm::ValType::I32;
-  if (Type == "i64")
-    return wasm::ValType::I64;
-  if (Type == "f32")
-    return wasm::ValType::F32;
-  if (Type == "f64")
-    return wasm::ValType::F64;
-  if (Type == "v128" || Type == "i8x16" || Type == "i16x8" || Type == "i32x4" ||
-      Type == "i64x2" || Type == "f32x4" || Type == "f64x2")
-    return wasm::ValType::V128;
-  if (Type == "funcref")
-    return wasm::ValType::FUNCREF;
-  if (Type == "externref")
-    return wasm::ValType::EXTERNREF;
-  return std::nullopt;
+  return llvm::StringSwitch<std::optional<wasm::ValType>>{Type}
+      .Case("i32", wasm::ValType::I32)
+      .Case("i64", wasm::ValType::I64)
+      .Case("f32", wasm::ValType::F32)
+      .Case("f64", wasm::ValType::F64)
+      .Cases("v128", "i8x16", "i16x8", "i32x4", "i64x2", "f32x4", "f64x2",
+             wasm::ValType::V128)
+      .Case("funcref", wasm::ValType::FUNCREF)
+      .Case("externref", wasm::ValType::EXTERNREF)
+      .Default(std::nullopt);
 }
 
 WebAssembly::BlockType WebAssembly::parseBlockType(StringRef Type) {
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.h b/llvm/lib/Target/WebAssembly/WebAssembly.h
index 1c40addb6d6f..7fc8546248f1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.h
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.h
@@ -37,6 +37,7 @@ FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
                                        CodeGenOptLevel OptLevel);
 FunctionPass *createWebAssemblyArgumentMove();
 FunctionPass *createWebAssemblySetP2AlignOperands();
+FunctionPass *createWebAssemblyCleanCodeAfterTrap();
 
 // Late passes.
 FunctionPass *createWebAssemblyReplacePhysRegs();
@@ -63,6 +64,7 @@ void initializeOptimizeReturnedPass(PassRegistry &);
 void initializeWebAssemblyRefTypeMem2LocalPass(PassRegistry &);
 void initializeWebAssemblyAddMissingPrototypesPass(PassRegistry &);
 void initializeWebAssemblyArgumentMovePass(PassRegistry &);
+void initializeWebAssemblyCleanCodeAfterTrapPass(PassRegistry &);
 void initializeWebAssemblyCFGSortPass(PassRegistry &);
 void initializeWebAssemblyCFGStackifyPass(PassRegistry &);
 void initializeWebAssemblyDAGToDAGISelPass(PassRegistry &);
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td
index d538197450b6..f00974531209 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.td
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -28,6 +28,9 @@ def FeatureSIMD128 : SubtargetFeature<"simd128", "SIMDLevel", "SIMD128",
 def FeatureRelaxedSIMD : SubtargetFeature<"relaxed-simd", "SIMDLevel", "RelaxedSIMD",
                                       "Enable relaxed-simd instructions">;
 
+def FeatureHalfPrecision : SubtargetFeature<"half-precision", "HasHalfPrecision", "true",
+                                            "Enable half precision instructions">;
+
 def FeatureAtomics : SubtargetFeature<"atomics", "HasAtomics", "true",
                                       "Enable Atomics">;
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCleanCodeAfterTrap.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCleanCodeAfterTrap.cpp
new file mode 100644
index 000000000000..e5cba3c48547
--- /dev/null
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCleanCodeAfterTrap.cpp
@@ -0,0 +1,80 @@
+//===-- WebAssemblyCleanCodeAfterTrap.cpp - Clean Code After Trap ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file remove instruction after trap.
+/// ``llvm.trap`` will be convert as ``unreachable`` which is terminator.
+/// Instruction after terminator will cause validation failed.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-clean-code-after-trap"
+
+namespace {
+class WebAssemblyCleanCodeAfterTrap final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyCleanCodeAfterTrap() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "WebAssembly Clean Code After Trap";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyCleanCodeAfterTrap::ID = 0;
+INITIALIZE_PASS(WebAssemblyCleanCodeAfterTrap, DEBUG_TYPE,
+                "WebAssembly Clean Code After Trap", false, false)
+
+FunctionPass *llvm::createWebAssemblyCleanCodeAfterTrap() {
+  return new WebAssemblyCleanCodeAfterTrap();
+}
+
+bool WebAssemblyCleanCodeAfterTrap::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG({
+    dbgs() << "********** CleanCodeAfterTrap **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  bool Changed = false;
+
+  for (MachineBasicBlock &BB : MF) {
+    bool HasTerminator = false;
+    llvm::SmallVector<MachineInstr *> RemoveMI{};
+    for (MachineInstr &MI : BB) {
+      if (HasTerminator)
+        RemoveMI.push_back(&MI);
+      if (MI.hasProperty(MCID::Trap) && MI.isTerminator())
+        HasTerminator = true;
+    }
+    if (!RemoveMI.empty()) {
+      Changed = true;
+      LLVM_DEBUG({
+        for (MachineInstr *MI : RemoveMI) {
+          llvm::dbgs() << "* remove ";
+          MI->print(llvm::dbgs());
+        }
+      });
+      for (MachineInstr *MI : RemoveMI)
+        MI->eraseFromParent();
+    }
+  }
+  return Changed;
+}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 59ea9247bd86..fb2ca532d252 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -30,6 +30,10 @@ def HasRelaxedSIMD :
     Predicate<"Subtarget->hasRelaxedSIMD()">,
     AssemblerPredicate<(all_of FeatureRelaxedSIMD), "relaxed-simd">;
 
+def HasHalfPrecision :
+    Predicate<"Subtarget->hasHalfPrecision()">,
+    AssemblerPredicate<(all_of FeatureHalfPrecision), "half-precision">;
+
 def HasAtomics :
     Predicate<"Subtarget->hasAtomics()">,
     AssemblerPredicate<(all_of FeatureAtomics), "atomics">;
@@ -74,6 +78,10 @@ def HasMultiMemory :
     Predicate<"Subtarget->hasMultiMemory()">,
     AssemblerPredicate<(all_of FeatureMultiMemory), "multimemory">;
 
+def HasMutableGlobals:
+    Predicate<"Subtarget->hasMutableGlobals()">,
+    AssemblerPredicate<(all_of FeatureMutableGlobals), "mutable-globals">;
+
 //===----------------------------------------------------------------------===//
 // WebAssembly-specific DAG Node Types.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
index 85d02b087c78..cb4589961867 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -50,6 +50,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
   bool HasReferenceTypes = false;
   bool HasExtendedConst = false;
   bool HasMultiMemory = false;
+  bool HasHalfPrecision = false;
 
   /// What processor and OS we're targeting.
   Triple TargetTriple;
@@ -93,6 +94,7 @@ public:
   bool hasAddr64() const { return TargetTriple.isArch64Bit(); }
   bool hasSIMD128() const { return SIMDLevel >= SIMD128; }
   bool hasRelaxedSIMD() const { return SIMDLevel >= RelaxedSIMD; }
+  bool hasHalfPrecision() const { return HasHalfPrecision; }
   bool hasAtomics() const { return HasAtomics; }
   bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; }
   bool hasSignExt() const { return HasSignExt; }
@@ -103,6 +105,7 @@ public:
   bool hasTailCall() const { return HasTailCall; }
   bool hasReferenceTypes() const { return HasReferenceTypes; }
   bool hasMultiMemory() const { return HasMultiMemory; }
+  bool hasExtendedConst() const { return HasExtendedConst; }
 
   /// Parses features string setting specified subtarget options. Definition of
   /// function is auto generated by tblgen.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index cdd39eeb6bbb..de342e896573 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -512,6 +512,10 @@ bool WebAssemblyPassConfig::addInstSelector() {
   // Eliminate range checks and add default targets to br_table instructions.
   addPass(createWebAssemblyFixBrTableDefaults());
 
+  // unreachable is terminator, non-terminator instruction after it is not
+  // allowed.
+  addPass(createWebAssemblyCleanCodeAfterTrap());
+
   return false;
 }
 
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index c0a75e215a40..8e4015783641 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -1168,33 +1168,34 @@ inline int getMemoryOperandNo(uint64_t TSFlags) {
 
 /// \returns true if the register is a XMM.
 inline bool isXMMReg(unsigned RegNo) {
-  assert(X86::XMM15 - X86::XMM0 == 15 &&
-         "XMM0-15 registers are not continuous");
-  assert(X86::XMM31 - X86::XMM16 == 15 &&
-         "XMM16-31 registers are not continuous");
+  static_assert(X86::XMM15 - X86::XMM0 == 15,
+                "XMM0-15 registers are not continuous");
+  static_assert(X86::XMM31 - X86::XMM16 == 15,
+                "XMM16-31 registers are not continuous");
   return (RegNo >= X86::XMM0 && RegNo <= X86::XMM15) ||
          (RegNo >= X86::XMM16 && RegNo <= X86::XMM31);
 }
 
 /// \returns true if the register is a YMM.
 inline bool isYMMReg(unsigned RegNo) {
-  assert(X86::YMM15 - X86::YMM0 == 15 &&
-         "YMM0-15 registers are not continuous");
-  assert(X86::YMM31 - X86::YMM16 == 15 &&
-         "YMM16-31 registers are not continuous");
+  static_assert(X86::YMM15 - X86::YMM0 == 15,
+                "YMM0-15 registers are not continuous");
+  static_assert(X86::YMM31 - X86::YMM16 == 15,
+                "YMM16-31 registers are not continuous");
   return (RegNo >= X86::YMM0 && RegNo <= X86::YMM15) ||
          (RegNo >= X86::YMM16 && RegNo <= X86::YMM31);
 }
 
 /// \returns true if the register is a ZMM.
 inline bool isZMMReg(unsigned RegNo) {
-  assert(X86::ZMM31 - X86::ZMM0 == 31 && "ZMM registers are not continuous");
+  static_assert(X86::ZMM31 - X86::ZMM0 == 31,
+                "ZMM registers are not continuous");
   return RegNo >= X86::ZMM0 && RegNo <= X86::ZMM31;
 }
 
 /// \returns true if \p RegNo is an apx extended register.
 inline bool isApxExtendedReg(unsigned RegNo) {
-  assert(X86::R31WH - X86::R16 == 95 && "EGPRs are not continuous");
+  static_assert(X86::R31WH - X86::R16 == 95, "EGPRs are not continuous");
   return RegNo >= X86::R16 && RegNo <= X86::R31WH;
 }
 
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 437a7bd6ff6c..18ecca34943f 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -121,7 +121,6 @@ MCStreamer *createX86WinCOFFStreamer(MCContext &C,
                                      std::unique_ptr<MCAsmBackend> &&AB,
                                      std::unique_ptr<MCObjectWriter> &&OW,
                                      std::unique_ptr<MCCodeEmitter> &&CE,
-                                     bool RelaxAll,
                                      bool IncrementalLinkerCompatible);
 
 /// Construct an X86 Mach-O object writer.
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index 36945d1f6746..dac8bc1fb1be 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -70,11 +70,9 @@ MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C,
                                            std::unique_ptr<MCAsmBackend> &&AB,
                                            std::unique_ptr<MCObjectWriter> &&OW,
                                            std::unique_ptr<MCCodeEmitter> &&CE,
-                                           bool RelaxAll,
                                            bool IncrementalLinkerCompatible) {
   X86WinCOFFStreamer *S =
       new X86WinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW));
-  S->getAssembler().setRelaxAll(RelaxAll);
   S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
   return S;
 }
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 0e89371f2f1d..14c62893766a 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2353,7 +2353,7 @@ SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
     SDValue Src = N.getOperand(0);
     unsigned SrcOpc = Src.getOpcode();
     if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
-         CurDAG->isADDLike(Src)) &&
+         CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
         Src.hasOneUse()) {
       if (CurDAG->isBaseWithConstantOffset(Src)) {
         SDValue AddSrc = Src.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bb43cbe15f52..6a5fc3c53146 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4441,10 +4441,8 @@ static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
          "Expected a 128/256/512-bit vector type");
-
-  APInt Ones = APInt::getAllOnes(32);
   unsigned NumElts = VT.getSizeInBits() / 32;
-  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
+  SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
   return DAG.getBitcast(VT, Vec);
 }
 
@@ -4641,6 +4639,7 @@ static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
   return dyn_cast<ConstantPoolSDNode>(Ptr);
 }
 
+// TODO: Add support for non-zero offsets.
 static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
   ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
   if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
@@ -20394,14 +20393,16 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
   EVT SrcVT = In.getValueType();
   EVT DstSVT = DstVT.getVectorElementType();
   EVT SrcSVT = SrcVT.getVectorElementType();
+  unsigned NumDstEltBits = DstSVT.getSizeInBits();
+  unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
 
   // Check we have a truncation suited for PACKSS/PACKUS.
   if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
         (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
     return SDValue();
 
-  assert(SrcSVT.getSizeInBits() > DstSVT.getSizeInBits() && "Bad truncation");
-  unsigned NumStages = Log2_32(SrcSVT.getSizeInBits() / DstSVT.getSizeInBits());
+  assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
+  unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
 
   // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
   // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
@@ -20422,8 +20423,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
   if (Subtarget.hasAVX512() && NumStages > 1)
     return SDValue();
 
-  unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
-  unsigned NumPackedSignBits = std::min<unsigned>(DstSVT.getSizeInBits(), 16);
+  unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
 
   // Truncate with PACKUS if we are truncating a vector with leading zero
@@ -20445,7 +20445,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
   // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
   // see through BITCASTs later on and combines/simplifications can't then use
   // it.
-  if (DstSVT == MVT::i32 && NumSignBits != SrcSVT.getSizeInBits() &&
+  if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
       !Subtarget.hasAVX512())
     return SDValue();
 
@@ -24140,8 +24140,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
               DAG.getConstant(1, DL, VT));
         else
           Neg = CmpOp0;
-        SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
-                                   Neg); // -(and (x, 0x1))
+        SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1))
         SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
         return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
       }
@@ -27896,7 +27895,7 @@ static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
   SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
 
   // Begin by bitcasting the input to byte vector, then split those bytes
-  // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
+  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
   // If the hi input nibble is zero then we add both results together, otherwise
   // we just take the hi result (by masking the lo result to zero before the
   // add).
@@ -28150,9 +28149,8 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
   // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
   if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
     SDValue Src = Op.getOperand(0);
-    SDValue Sub =
-        DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
-    return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
+    SDValue Neg = DAG.getNegative(Src, DL, VT);
+    return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
   }
 
   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
@@ -29373,10 +29371,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
   // +ve/-ve Amt = shift left/right.
   if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
                              VT == MVT::v8i16 || VT == MVT::v16i8)) {
-    if (Opc == ISD::SRL || Opc == ISD::SRA) {
-      SDValue Zero = DAG.getConstant(0, dl, VT);
-      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
-    }
+    if (Opc == ISD::SRL || Opc == ISD::SRA)
+      Amt = DAG.getNegative(Amt, dl, VT);
     if (Opc == ISD::SHL || Opc == ISD::SRL)
       return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
     if (Opc == ISD::SRA)
@@ -31539,11 +31535,10 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
     // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
     // can use LXADD as opposed to cmpxchg.
     if (Opc == ISD::ATOMIC_LOAD_SUB ||
-        (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS))) {
-      RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
-      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
-                           AN->getMemOperand());
-    }
+        (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
+      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
+                           DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
+
     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
            "Used AtomicRMW ops other than Add should have been expanded!");
     return N;
@@ -43911,8 +43906,7 @@ static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
   EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
   SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
   SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
-  SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
-  return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
+  return DAG.getNegative(Zext, DL, ExtractVT);
 }
 
 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
@@ -45272,7 +45266,7 @@ static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG,
       ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
                            Cond.getOperand(0).getValueType());
   Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
-		                        Cond.getOperand(1), NewCC);
+                      Cond.getOperand(1), NewCC);
   return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
 }
 
@@ -46882,7 +46876,7 @@ static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
 /// generate pmullw+pmulhuw for it (MULU16 mode).
-static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
+static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
   // Check for legality
   // pmullw/pmulhw are not supported by SSE.
@@ -46901,7 +46895,6 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
   if (!canReduceVMulWidth(N, DAG, Mode))
     return SDValue();
 
-  SDLoc DL(N);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getOperand(0).getValueType();
@@ -47041,7 +47034,8 @@ static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
 // If the upper 17 bits of either element are zero and the other element are
 // zero/sign bits then we can use PMADDWD, which is always at least as quick as
 // PMULLD, except on KNL.
-static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
+static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
+                                   SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
   if (!Subtarget.hasSSE2())
     return SDValue();
@@ -47103,19 +47097,18 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
       return Op;
     // Mask off upper 16-bits of sign-extended constants.
     if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
-      return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
-                         DAG.getConstant(0xFFFF, SDLoc(N), VT));
+      return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
     if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
       SDValue Src = Op.getOperand(0);
       // Convert sext(vXi16) to zext(vXi16).
       if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
-        return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
+        return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
       // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
       // which will expand the extension.
       if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
         EVT ExtVT = VT.changeVectorElementType(MVT::i16);
-        Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
-        return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
+        Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
+        return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
       }
     }
     // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
@@ -47123,12 +47116,12 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
         N->isOnlyUserOf(Op.getNode())) {
       SDValue Src = Op.getOperand(0);
       if (Src.getScalarValueSizeInBits() == 16)
-        return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
+        return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
     }
     // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
     if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
         N->isOnlyUserOf(Op.getNode())) {
-      return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
+      return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
                          Op.getOperand(1));
     }
     return SDValue();
@@ -47149,11 +47142,10 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
                        DAG.getBitcast(OpVT, Ops[0]),
                        DAG.getBitcast(OpVT, Ops[1]));
   };
-  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
-                          PMADDWDBuilder);
+  return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
 }
 
-static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
+static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
   if (!Subtarget.hasSSE2())
     return SDValue();
@@ -47177,8 +47169,8 @@ static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
                             ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
     };
-    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
-                            PMULDQBuilder, /*CheckBWI*/false);
+    return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
+                            /*CheckBWI*/ false);
   }
 
   // If the upper bits are zero we can use a single pmuludq.
@@ -47188,8 +47180,8 @@ static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
                              ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
     };
-    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
-                            PMULUDQBuilder, /*CheckBWI*/false);
+    return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
+                            /*CheckBWI*/ false);
   }
 
   return SDValue();
@@ -47199,15 +47191,16 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
+  SDLoc DL(N);
 
-  if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
+  if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
     return V;
 
-  if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
+  if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
     return V;
 
   if (DCI.isBeforeLegalize() && VT.isVector())
-    return reduceVMULWidth(N, DAG, Subtarget);
+    return reduceVMULWidth(N, DL, DAG, Subtarget);
 
   // Optimize a single multiply with constant into two operations in order to
   // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
@@ -47247,15 +47240,13 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
   assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
   uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
 
-  SDLoc DL(N);
   SDValue NewMul = SDValue();
   if (VT == MVT::i64 || VT == MVT::i32) {
     if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                            DAG.getConstant(AbsMulAmt, DL, VT));
       if (SignMulAmt < 0)
-        NewMul =
-            DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
+        NewMul = DAG.getNegative(NewMul, DL, VT);
 
       return NewMul;
     }
@@ -47302,8 +47293,7 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
 
       // Negate the result.
       if (SignMulAmt < 0)
-        NewMul =
-            DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
+        NewMul = DAG.getNegative(NewMul, DL, VT);
     } else if (!Subtarget.slowLEA())
       NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
   }
@@ -47319,10 +47309,8 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
           ISD::ADD, DL, VT, N->getOperand(0),
           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                       DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
-      // To negate, subtract the number from zero
       if (SignMulAmt < 0)
-        NewMul =
-            DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
+        NewMul = DAG.getNegative(NewMul, DL, VT);
     } else if (isPowerOf2_64(AbsMulAmt + 1)) {
       // (mul x, 2^N - 1) => (sub (shl x, N), x)
       NewMul =
@@ -49070,8 +49058,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
       APInt MulCLowBit = MulC & (-MulC);
       if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
           (MulCLowBit + MulC).isPowerOf2()) {
-        SDValue Neg = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT),
-                                  N0.getOperand(0));
+        SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
         int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
         assert(MulCLowBitLog != -1 &&
                "Isolated lowbit is somehow not a power of 2!");
@@ -49771,8 +49758,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
   // Commute and try again (negate the result for subtracts).
   if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
     if (IsSub)
-      ADCOrSBB =
-          DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
+      ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
     return ADCOrSBB;
   }
 
@@ -54534,7 +54520,7 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
     if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
       SDValue Op(N, 0);
       if (Negate)
-        Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
+        Op = DAG.getNegative(Op, DL, VT);
       DCI.CombineTo(GenericAddSub, Op);
     }
   };
diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
index e7afc49240e5..fd05e16ac1ce 100644
--- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp
+++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
@@ -20,6 +20,7 @@
 #include "X86InstrBuilder.h"
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -72,10 +73,28 @@ FunctionPass *llvm::createX86LowerTileCopyPass() {
 bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
   const X86InstrInfo *TII = ST.getInstrInfo();
+  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+  BitVector GR64Regs =
+      TRI->getAllocatableSet(MF, TRI->getRegClass(X86::GR64RegClassID));
+  BitVector TILERegs =
+      TRI->getAllocatableSet(MF, TRI->getRegClass(X86::TILERegClassID));
   bool Changed = false;
 
   for (MachineBasicBlock &MBB : MF) {
-    for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+    // There won't be a tile copy if no tile register live in.
+    bool HasTileCopy = false;
+    for (const auto &LI : MBB.liveins()) {
+      if (TILERegs.test(LI.PhysReg)) {
+        HasTileCopy = true;
+        break;
+      }
+    }
+    if (!HasTileCopy)
+      continue;
+    LiveRegUnits UsedRegs(*TRI);
+    UsedRegs.addLiveOuts(MBB);
+    for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) {
+      UsedRegs.stepBackward(MI);
       if (!MI.isCopy())
         continue;
       MachineOperand &DstMO = MI.getOperand(0);
@@ -85,27 +104,41 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
       if (!X86::TILERegClass.contains(DstReg, SrcReg))
         continue;
 
-      const TargetRegisterInfo *TRI = ST.getRegisterInfo();
       // Allocate stack slot for tile register
       unsigned Size = TRI->getSpillSize(X86::TILERegClass);
       Align Alignment = TRI->getSpillAlign(X86::TILERegClass);
       int TileSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
-      // Allocate stack slot for stride register
-      Size = TRI->getSpillSize(X86::GR64RegClass);
-      Alignment = TRI->getSpillAlign(X86::GR64RegClass);
-      int StrideSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
 
-      // TODO: Pick a killed regiter to avoid save/reload. There is problem
-      // to get live interval in this stage.
-      Register GR64Cand = X86::RAX;
+      int StrideSS = 0;
+
+      // Pick a killed register to avoid a save/reload.
+      Register GR64Cand = X86::NoRegister;
+      for (auto RegT : GR64Regs.set_bits()) {
+        if (UsedRegs.available(RegT)) {
+          GR64Cand = RegT;
+          break;
+        }
+      }
 
       const DebugLoc &DL = MI.getDebugLoc();
-      // mov %rax (%sp)
-      BuildMI(MBB, MI, DL, TII->get(X86::IMPLICIT_DEF), GR64Cand);
-      addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64mr)), StrideSS)
-          .addReg(GR64Cand);
-      // mov 64 %rax
-      BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
+      if (GR64Cand) {
+        // mov 64 %reg
+        BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
+      } else {
+        // No available register? Save RAX and reload it after use.
+
+        // Allocate stack slot for stride register
+        Size = TRI->getSpillSize(X86::GR64RegClass);
+        Alignment = TRI->getSpillAlign(X86::GR64RegClass);
+        StrideSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
+
+        // mov %reg (%sp)
+        addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64mr)),
+                          StrideSS)
+            .addReg(X86::RAX);
+        // mov 64 %reg
+        BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), X86::RAX).addImm(64);
+      }
       // tilestored %tmm, (%sp, %idx)
 #define GET_EGPR_IF_ENABLED(OPC) (ST.hasEGPR() ? OPC##_EVEX : OPC)
       unsigned Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
@@ -120,10 +153,12 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
 #undef GET_EGPR_IF_ENABLED
       NewMI = addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc), DstReg),
                                 TileSS);
-      // restore %rax
-      // mov (%sp) %rax
-      addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand),
-                        StrideSS);
+      if (!GR64Cand) {
+        // restore %rax
+        // mov (%sp) %rax
+        addFrameReference(
+            BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand), StrideSS);
+      }
       MI.eraseFromParent();
       Changed = true;
     }
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index be0cf1596d0d..555ede9e9540 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -649,10 +649,11 @@ unsigned X86RegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const {
   // APX registers (R16-R31)
   //
   // and try to return the minimum number of registers supported by the target.
-  assert((X86::R15WH + 1 == X86 ::YMM0) && (X86::YMM15 + 1 == X86::K0) &&
-         (X86::K6_K7 + 1 == X86::TMMCFG) && (X86::TMM7 + 1 == X86::R16) &&
-         (X86::R31WH + 1 == X86::NUM_TARGET_REGS) &&
-         "Register number may be incorrect");
+  static_assert((X86::R15WH + 1 == X86::YMM0) && (X86::YMM15 + 1 == X86::K0) &&
+                    (X86::K6_K7 + 1 == X86::TMMCFG) &&
+                    (X86::TMM7 + 1 == X86::R16) &&
+                    (X86::R31WH + 1 == X86::NUM_TARGET_REGS),
+                "Register number may be incorrect");
 
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
   if (ST.hasEGPR())
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index a458b5f9ec8f..4d55a084b730 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -244,7 +244,8 @@ public:
   // TODO: Currently we're always allowing widening on CPUs without VLX,
   // because for many cases we don't have a better option.
   bool canExtendTo512DQ() const {
-    return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
+    return hasAVX512() && hasEVEX512() &&
+           (!hasVLX() || getPreferVectorWidth() >= 512);
   }
   bool canExtendTo512BW() const  {
     return hasBWI() && canExtendTo512DQ();
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index d2c9bae97364..834f4536f93a 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -214,6 +214,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
         .Case("0xc18", "cortex-r8")
         .Case("0xd13", "cortex-r52")
         .Case("0xd15", "cortex-r82")
+        .Case("0xd14", "cortex-r82ae")
         .Case("0xd02", "cortex-a34")
         .Case("0xd04", "cortex-a35")
         .Case("0xd03", "cortex-a53")
@@ -245,8 +246,11 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
         .Case("0xd4a", "neoverse-e1")
         .Case("0xd0c", "neoverse-n1")
         .Case("0xd49", "neoverse-n2")
+        .Case("0xd8e", "neoverse-n3")
         .Case("0xd40", "neoverse-v1")
         .Case("0xd4f", "neoverse-v2")
+        .Case("0xd84", "neoverse-v3")
+        .Case("0xd83", "neoverse-v3ae")
         .Default("generic");
   }
 
@@ -1284,8 +1288,10 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
     setFeature(X86::FEATURE_AVX2);
   if (HasLeaf7 && ((EBX >> 8) & 1))
     setFeature(X86::FEATURE_BMI2);
-  if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save)
+  if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save) {
     setFeature(X86::FEATURE_AVX512F);
+    setFeature(X86::FEATURE_EVEX512);
+  }
   if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save)
     setFeature(X86::FEATURE_AVX512DQ);
   if (HasLeaf7 && ((EBX >> 19) & 1))
@@ -1796,6 +1802,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
   Features["rtm"]        = HasLeaf7 && ((EBX >> 11) & 1);
   // AVX512 is only supported if the OS supports the context save for it.
   Features["avx512f"]    = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save;
+  Features["evex512"]    = Features["avx512f"];
   Features["avx512dq"]   = HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save;
   Features["rdseed"]     = HasLeaf7 && ((EBX >> 18) & 1);
   Features["adx"]        = HasLeaf7 && ((EBX >> 19) & 1);
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index 39cb3f2c2fe1..e8172ebb2597 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -47,240 +47,11 @@ static const char *RISCVGImplications[] = {
   "i", "m", "a", "f", "d", "zicsr", "zifencei"
 };
 
-// NOTE: This table should be sorted alphabetically by extension name.
-static const RISCVSupportedExtension SupportedExtensions[] = {
-    {"a", {2, 1}},
-    {"c", {2, 0}},
-    {"d", {2, 2}},
-    {"e", {2, 0}},
-    {"f", {2, 2}},
-    {"h", {1, 0}},
-    {"i", {2, 1}},
-    {"m", {2, 0}},
-
-    {"shcounterenw", {1, 0}},
-    {"shgatpa", {1, 0}},
-    {"shtvala", {1, 0}},
-    {"shvsatpa", {1, 0}},
-    {"shvstvala", {1, 0}},
-    {"shvstvecd", {1, 0}},
-    {"smaia", {1, 0}},
-    {"smepmp", {1, 0}},
-    {"ssaia", {1, 0}},
-    {"ssccptr", {1, 0}},
-    {"sscofpmf", {1, 0}},
-    {"sscounterenw", {1, 0}},
-    {"ssstateen", {1, 0}},
-    {"ssstrict", {1, 0}},
-    {"sstc", {1, 0}},
-    {"sstvala", {1, 0}},
-    {"sstvecd", {1, 0}},
-    {"ssu64xl", {1, 0}},
-    {"svade", {1, 0}},
-    {"svadu", {1, 0}},
-    {"svbare", {1, 0}},
-    {"svinval", {1, 0}},
-    {"svnapot", {1, 0}},
-    {"svpbmt", {1, 0}},
-
-    {"v", {1, 0}},
-
-    // vendor-defined ('X') extensions
-    {"xcvalu", {1, 0}},
-    {"xcvbi", {1, 0}},
-    {"xcvbitmanip", {1, 0}},
-    {"xcvelw", {1, 0}},
-    {"xcvmac", {1, 0}},
-    {"xcvmem", {1, 0}},
-    {"xcvsimd", {1, 0}},
-    {"xsfcease", {1, 0}},
-    {"xsfvcp", {1, 0}},
-    {"xsfvfnrclipxfqf", {1, 0}},
-    {"xsfvfwmaccqqq", {1, 0}},
-    {"xsfvqmaccdod", {1, 0}},
-    {"xsfvqmaccqoq", {1, 0}},
-    {"xsifivecdiscarddlone", {1, 0}},
-    {"xsifivecflushdlone", {1, 0}},
-    {"xtheadba", {1, 0}},
-    {"xtheadbb", {1, 0}},
-    {"xtheadbs", {1, 0}},
-    {"xtheadcmo", {1, 0}},
-    {"xtheadcondmov", {1, 0}},
-    {"xtheadfmemidx", {1, 0}},
-    {"xtheadmac", {1, 0}},
-    {"xtheadmemidx", {1, 0}},
-    {"xtheadmempair", {1, 0}},
-    {"xtheadsync", {1, 0}},
-    {"xtheadvdot", {1, 0}},
-    {"xventanacondops", {1, 0}},
-
-    {"za128rs", {1, 0}},
-    {"za64rs", {1, 0}},
-    {"zacas", {1, 0}},
-    {"zama16b", {1, 0}},
-    {"zawrs", {1, 0}},
-
-    {"zba", {1, 0}},
-    {"zbb", {1, 0}},
-    {"zbc", {1, 0}},
-    {"zbkb", {1, 0}},
-    {"zbkc", {1, 0}},
-    {"zbkx", {1, 0}},
-    {"zbs", {1, 0}},
-
-    {"zca", {1, 0}},
-    {"zcb", {1, 0}},
-    {"zcd", {1, 0}},
-    {"zce", {1, 0}},
-    {"zcf", {1, 0}},
-    {"zcmop", {1, 0}},
-    {"zcmp", {1, 0}},
-    {"zcmt", {1, 0}},
-
-    {"zdinx", {1, 0}},
-
-    {"zfa", {1, 0}},
-    {"zfh", {1, 0}},
-    {"zfhmin", {1, 0}},
-    {"zfinx", {1, 0}},
-
-    {"zhinx", {1, 0}},
-    {"zhinxmin", {1, 0}},
-
-    {"zic64b", {1, 0}},
-    {"zicbom", {1, 0}},
-    {"zicbop", {1, 0}},
-    {"zicboz", {1, 0}},
-    {"ziccamoa", {1, 0}},
-    {"ziccif", {1, 0}},
-    {"zicclsm", {1, 0}},
-    {"ziccrse", {1, 0}},
-    {"zicntr", {2, 0}},
-    {"zicond", {1, 0}},
-    {"zicsr", {2, 0}},
-    {"zifencei", {2, 0}},
-    {"zihintntl", {1, 0}},
-    {"zihintpause", {2, 0}},
-    {"zihpm", {2, 0}},
-    {"zimop", {1, 0}},
-
-    {"zk", {1, 0}},
-    {"zkn", {1, 0}},
-    {"zknd", {1, 0}},
-    {"zkne", {1, 0}},
-    {"zknh", {1, 0}},
-    {"zkr", {1, 0}},
-    {"zks", {1, 0}},
-    {"zksed", {1, 0}},
-    {"zksh", {1, 0}},
-    {"zkt", {1, 0}},
-
-    {"zmmul", {1, 0}},
-
-    {"zvbb", {1, 0}},
-    {"zvbc", {1, 0}},
-
-    {"zve32f", {1, 0}},
-    {"zve32x", {1, 0}},
-    {"zve64d", {1, 0}},
-    {"zve64f", {1, 0}},
-    {"zve64x", {1, 0}},
-
-    {"zvfh", {1, 0}},
-    {"zvfhmin", {1, 0}},
-
-    // vector crypto
-    {"zvkb", {1, 0}},
-    {"zvkg", {1, 0}},
-    {"zvkn", {1, 0}},
-    {"zvknc", {1, 0}},
-    {"zvkned", {1, 0}},
-    {"zvkng", {1, 0}},
-    {"zvknha", {1, 0}},
-    {"zvknhb", {1, 0}},
-    {"zvks", {1, 0}},
-    {"zvksc", {1, 0}},
-    {"zvksed", {1, 0}},
-    {"zvksg", {1, 0}},
-    {"zvksh", {1, 0}},
-    {"zvkt", {1, 0}},
-
-    {"zvl1024b", {1, 0}},
-    {"zvl128b", {1, 0}},
-    {"zvl16384b", {1, 0}},
-    {"zvl2048b", {1, 0}},
-    {"zvl256b", {1, 0}},
-    {"zvl32768b", {1, 0}},
-    {"zvl32b", {1, 0}},
-    {"zvl4096b", {1, 0}},
-    {"zvl512b", {1, 0}},
-    {"zvl64b", {1, 0}},
-    {"zvl65536b", {1, 0}},
-    {"zvl8192b", {1, 0}},
-};
-
-// NOTE: This table should be sorted alphabetically by extension name.
-// clang-format off
-static const RISCVSupportedExtension SupportedExperimentalExtensions[] = {
-    {"smmpm", {0, 8}},
-    {"smnpm", {0, 8}},
-    {"ssnpm", {0, 8}},
-    {"sspm", {0, 8}},
-    {"ssqosid", {1, 0}},
-    {"supm", {0, 8}},
-
-    {"zaamo", {0, 2}},
-    {"zabha", {1, 0}},
-    {"zalasr", {0, 1}},
-    {"zalrsc", {0, 2}},
-
-    {"zfbfmin", {1, 0}},
+#define GET_SUPPORTED_EXTENSIONS
+#include "llvm/TargetParser/RISCVTargetParserDef.inc"
 
-    {"zicfilp", {0, 4}},
-    {"zicfiss", {0, 4}},
-
-    {"ztso", {0, 1}},
-
-    {"zvfbfmin", {1, 0}},
-    {"zvfbfwma", {1, 0}},
-};
-// clang-format on
-
-static constexpr RISCVProfile SupportedProfiles[] = {
-    {"rvi20u32", "rv32i"},
-    {"rvi20u64", "rv64i"},
-    {"rva20u64", "rv64imafdc_ziccamoa_ziccif_zicclsm_ziccrse_zicntr_za128rs"},
-    {"rva20s64", "rv64imafdc_ziccamoa_ziccif_zicclsm_ziccrse_zicntr_zifencei_"
-                 "za128rs_ssccptr_sstvala_sstvecd_svade_svbare"},
-    {"rva22u64",
-     "rv64imafdc_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_zicclsm_ziccrse_"
-     "zicntr_zihintpause_zihpm_za64rs_zfhmin_zba_zbb_zbs_zkt"},
-    {"rva22s64",
-     "rv64imafdc_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_zicclsm_ziccrse_"
-     "zicntr_zifencei_zihintpause_zihpm_za64rs_zfhmin_zba_zbb_zbs_zkt_ssccptr_"
-     "sscounterenw_sstvala_sstvecd_svade_svbare_svinval_svpbmt"},
-    {"rva23u64",
-     "rv64imafdcv_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_zicclsm_ziccrse_"
-     "zicntr_zicond_zihintntl_zihintpause_zihpm_zimop_za64rs_zawrs_zfa_zfhmin_"
-     "zcb_zcmop_zba_zbb_zbs_zkt_zvbb_zvfhmin_zvkt"},
-    {"rva23s64",
-     "rv64imafdcvh_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_zicclsm_ziccrse_"
-     "zicntr_zicond_zifencei_zihintntl_zihintpause_zihpm_zimop_za64rs_zawrs_"
-     "zfa_zfhmin_zcb_zcmop_zba_zbb_zbs_zkt_zvbb_zvfhmin_zvkt_shcounterenw_"
-     "shgatpa_shtvala_shvsatpa_shvstvala_shvstvecd_ssccptr_sscofpmf_"
-     "sscounterenw_ssnpm0p8_ssstateen_sstc_sstvala_sstvecd_ssu64xl_svade_"
-     "svbare_svinval_svnapot_svpbmt"},
-    {"rvb23u64", "rv64imafdc_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_"
-                 "zicclsm_ziccrse_zicntr_zicond_zihintntl_zihintpause_zihpm_"
-                 "zimop_za64rs_zawrs_zfa_zcb_zcmop_zba_zbb_zbs_zkt"},
-    {"rvb23s64",
-     "rv64imafdc_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_zicclsm_ziccrse_"
-     "zicntr_zicond_zifencei_zihintntl_zihintpause_zihpm_zimop_za64rs_zawrs_"
-     "zfa_zcb_zcmop_zba_zbb_zbs_zkt_ssccptr_sscofpmf_sscounterenw_sstc_sstvala_"
-     "sstvecd_ssu64xl_svade_svbare_svinval_svnapot_svpbmt"},
-    {"rvm23u32", "rv32im_zicbop_zicond_zicsr_zihintntl_zihintpause_zimop_zca_"
-                 "zcb_zce_zcmop_zcmp_zcmt_zba_zbb_zbs"},
-};
+#define GET_SUPPORTED_PROFILES
+#include "llvm/TargetParser/RISCVTargetParserDef.inc"
 
 static void verifyTables() {
 #ifndef NDEBUG
@@ -308,7 +79,7 @@ void llvm::riscvExtensionsHelp(StringMap<StringRef> DescMap) {
   outs() << "All available -march extensions for RISC-V\n\n";
   PrintExtension("Name", "Version", (DescMap.empty() ? "" : "Description"));
 
-  RISCVISAInfo::OrderedExtensionMap ExtMap;
+  RISCVISAUtils::OrderedExtensionMap ExtMap;
   for (const auto &E : SupportedExtensions)
     ExtMap[E.Name] = {E.Version.Major, E.Version.Minor};
   for (const auto &E : ExtMap) {
@@ -627,11 +398,10 @@ RISCVISAInfo::parseFeatures(unsigned XLen,
 
   for (auto &Feature : Features) {
     StringRef ExtName = Feature;
-    bool Experimental = false;
     assert(ExtName.size() > 1 && (ExtName[0] == '+' || ExtName[0] == '-'));
     bool Add = ExtName[0] == '+';
     ExtName = ExtName.drop_front(1); // Drop '+' or '-'
-    Experimental = stripExperimentalPrefix(ExtName);
+    bool Experimental = stripExperimentalPrefix(ExtName);
     auto ExtensionInfos = Experimental
                               ? ArrayRef(SupportedExperimentalExtensions)
                               : ArrayRef(SupportedExtensions);
@@ -655,22 +425,22 @@ RISCVISAInfo::parseFeatures(unsigned XLen,
 
 llvm::Expected<std::unique_ptr<RISCVISAInfo>>
 RISCVISAInfo::parseNormalizedArchString(StringRef Arch) {
-  if (llvm::any_of(Arch, isupper)) {
+  if (llvm::any_of(Arch, isupper))
     return createStringError(errc::invalid_argument,
                              "string must be lowercase");
-  }
+
   // Must start with a valid base ISA name.
-  unsigned XLen;
-  if (Arch.starts_with("rv32i") || Arch.starts_with("rv32e"))
+  unsigned XLen = 0;
+  if (Arch.consume_front("rv32"))
     XLen = 32;
-  else if (Arch.starts_with("rv64i") || Arch.starts_with("rv64e"))
+  else if (Arch.consume_front("rv64"))
     XLen = 64;
-  else
+
+  if (XLen == 0 || Arch.empty() || (Arch[0] != 'i' && Arch[0] != 'e'))
     return createStringError(errc::invalid_argument,
                              "arch string must begin with valid base ISA");
+
   std::unique_ptr<RISCVISAInfo> ISAInfo(new RISCVISAInfo(XLen));
-  // Discard rv32/rv64 prefix.
-  Arch = Arch.substr(4);
 
   // Each extension is of the form ${name}${major_version}p${minor_version}
   // and separated by _. Split by _ and then extract the name and version
@@ -690,27 +460,24 @@ RISCVISAInfo::parseNormalizedArchString(StringRef Arch) {
 
     // Split Prefix into the extension name and the major version number
     // (the trailing digits of Prefix).
-    int TrailingDigits = 0;
-    StringRef ExtName = Prefix;
-    while (!ExtName.empty()) {
-      if (!isDigit(ExtName.back()))
+    size_t VersionStart = Prefix.size();
+    while (VersionStart != 0) {
+      if (!isDigit(Prefix[VersionStart - 1]))
         break;
-      ExtName = ExtName.drop_back(1);
-      TrailingDigits++;
+      --VersionStart;
     }
-    if (!TrailingDigits)
+    if (VersionStart == Prefix.size())
       return createStringError(errc::invalid_argument,
                                "extension lacks version in expected format");
 
-    StringRef MajorVersionStr = Prefix.take_back(TrailingDigits);
+    StringRef ExtName = Prefix.slice(0, VersionStart);
+    StringRef MajorVersionStr = Prefix.slice(VersionStart, StringRef::npos);
     if (MajorVersionStr.getAsInteger(10, MajorVersion))
       return createStringError(errc::invalid_argument,
                                "failed to parse major version number");
     ISAInfo->addExtension(ExtName, {MajorVersion, MinorVersion});
   }
-  ISAInfo->updateFLen();
-  ISAInfo->updateMinVLen();
-  ISAInfo->updateMaxELen();
+  ISAInfo->updateImpliedLengths();
   return std::move(ISAInfo);
 }
 
@@ -818,43 +585,44 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
                               bool ExperimentalExtensionVersionCheck,
                               bool IgnoreUnknown) {
   // RISC-V ISA strings must be lowercase.
-  if (llvm::any_of(Arch, isupper)) {
+  if (llvm::any_of(Arch, isupper))
     return createStringError(errc::invalid_argument,
                              "string must be lowercase");
-  }
 
-  if (Arch.starts_with("rvi") || Arch.starts_with("rva") ||
-      Arch.starts_with("rvb") || Arch.starts_with("rvm")) {
-    const auto *FoundProfile =
-        llvm::find_if(SupportedProfiles, [Arch](const RISCVProfile &Profile) {
-          return Arch.starts_with(Profile.Name);
-        });
-
-    if (FoundProfile == std::end(SupportedProfiles))
-      return createStringError(errc::invalid_argument, "unsupported profile");
-
-    std::string NewArch = FoundProfile->MArch.str();
-    StringRef ArchWithoutProfile = Arch.substr(FoundProfile->Name.size());
-    if (!ArchWithoutProfile.empty()) {
-      if (!ArchWithoutProfile.starts_with("_"))
-        return createStringError(
-            errc::invalid_argument,
-            "additional extensions must be after separator '_'");
-      NewArch += ArchWithoutProfile.str();
+  // ISA string must begin with rv32, rv64, or a profile.
+  unsigned XLen = 0;
+  if (Arch.consume_front("rv32")) {
+    XLen = 32;
+  } else if (Arch.consume_front("rv64")) {
+    XLen = 64;
+  } else {
+    // Try parsing as a profile.
+    auto I = llvm::upper_bound(SupportedProfiles, Arch,
+                               [](StringRef Arch, const RISCVProfile &Profile) {
+                                 return Arch < Profile.Name;
+                               });
+
+    if (I != std::begin(SupportedProfiles) && Arch.starts_with((--I)->Name)) {
+      std::string NewArch = I->MArch.str();
+      StringRef ArchWithoutProfile = Arch.drop_front(I->Name.size());
+      if (!ArchWithoutProfile.empty()) {
+        if (ArchWithoutProfile.front() != '_')
+          return createStringError(
+              errc::invalid_argument,
+              "additional extensions must be after separator '_'");
+        NewArch += ArchWithoutProfile.str();
+      }
+      return parseArchString(NewArch, EnableExperimentalExtension,
+                             ExperimentalExtensionVersionCheck, IgnoreUnknown);
     }
-    return parseArchString(NewArch, EnableExperimentalExtension,
-                           ExperimentalExtensionVersionCheck, IgnoreUnknown);
   }
 
-  bool HasRV64 = Arch.starts_with("rv64");
-  // ISA string must begin with rv32 or rv64.
-  if (!(Arch.starts_with("rv32") || HasRV64) || (Arch.size() < 5)) {
+  if (XLen == 0 || Arch.empty())
     return createStringError(
         errc::invalid_argument,
-        "string must begin with rv32{i,e,g} or rv64{i,e,g}");
-  }
+        "string must begin with rv32{i,e,g}, rv64{i,e,g}, or a supported "
+        "profile name");
 
-  unsigned XLen = HasRV64 ? 64 : 32;
   std::unique_ptr<RISCVISAInfo> ISAInfo(new RISCVISAInfo(XLen));
   MapVector<std::string, RISCVISAUtils::ExtensionVersion,
             std::map<std::string, unsigned>>
@@ -862,19 +630,20 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
 
   // The canonical order specified in ISA manual.
   // Ref: Table 22.1 in RISC-V User-Level ISA V2.2
-  char Baseline = Arch[4];
+  char Baseline = Arch.front();
 
   // First letter should be 'e', 'i' or 'g'.
   switch (Baseline) {
   default:
     return createStringError(errc::invalid_argument,
-                             "first letter should be 'e', 'i' or 'g'");
+                             "first letter after \'rv" + Twine(XLen) +
+                                 "\' should be 'e', 'i' or 'g'");
   case 'e':
   case 'i':
     break;
   case 'g':
     // g expands to extensions in RISCVGImplications.
-    if (Arch.size() > 5 && isDigit(Arch[5]))
+    if (Arch.size() > 1 && isDigit(Arch[1]))
       return createStringError(errc::invalid_argument,
                                "version not supported for 'g'");
     break;
@@ -884,8 +653,8 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
     return createStringError(errc::invalid_argument,
                              "extension name missing after separator '_'");
 
-  // Skip rvxxx
-  StringRef Exts = Arch.substr(5);
+  // Skip baseline.
+  StringRef Exts = Arch.drop_front(1);
 
   unsigned Major, Minor, ConsumeLength;
   if (Baseline == 'g') {
@@ -896,11 +665,10 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
     // version since the we don't have clear version scheme for that on
     // ISA spec.
     for (const auto *Ext : RISCVGImplications) {
-      if (auto Version = findDefaultVersion(Ext)) {
-        // Postpone AddExtension until end of this function
-        SeenExtMap[Ext] = {Version->Major, Version->Minor};
-      } else
-        llvm_unreachable("Default extension version not found?");
+      auto Version = findDefaultVersion(Ext);
+      assert(Version && "Default extension version not found?");
+      // Postpone AddExtension until end of this function
+      SeenExtMap[Ext] = {Version->Major, Version->Minor};
     }
   } else {
     // Baseline is `i` or `e`
@@ -1041,138 +809,25 @@ Error RISCVISAInfo::checkDependency() {
   return Error::success();
 }
 
-static const char *ImpliedExtsD[] = {"f"};
-static const char *ImpliedExtsF[] = {"zicsr"};
-static const char *ImpliedExtsV[] = {"zvl128b", "zve64d"};
-static const char *ImpliedExtsXSfvcp[] = {"zve32x"};
-static const char *ImpliedExtsXSfvfnrclipxfqf[] = {"zve32f"};
-static const char *ImpliedExtsXSfvfwmaccqqq[] = {"zvfbfmin"};
-static const char *ImpliedExtsXSfvqmaccdod[] = {"zve32x"};
-static const char *ImpliedExtsXSfvqmaccqoq[] = {"zve32x"};
-static const char *ImpliedExtsXTHeadVdot[] = {"v"};
-static const char *ImpliedExtsZcb[] = {"zca"};
-static const char *ImpliedExtsZcd[] = {"d", "zca"};
-static const char *ImpliedExtsZce[] = {"zcb", "zcmp", "zcmt"};
-static const char *ImpliedExtsZcf[] = {"f", "zca"};
-static const char *ImpliedExtsZcmop[] = {"zca"};
-static const char *ImpliedExtsZcmp[] = {"zca"};
-static const char *ImpliedExtsZcmt[] = {"zca", "zicsr"};
-static const char *ImpliedExtsZdinx[] = {"zfinx"};
-static const char *ImpliedExtsZfa[] = {"f"};
-static const char *ImpliedExtsZfbfmin[] = {"f"};
-static const char *ImpliedExtsZfh[] = {"zfhmin"};
-static const char *ImpliedExtsZfhmin[] = {"f"};
-static const char *ImpliedExtsZfinx[] = {"zicsr"};
-static const char *ImpliedExtsZhinx[] = {"zhinxmin"};
-static const char *ImpliedExtsZhinxmin[] = {"zfinx"};
-static const char *ImpliedExtsZicfiss[] = {"zicsr", "zimop"};
-static const char *ImpliedExtsZicntr[] = {"zicsr"};
-static const char *ImpliedExtsZihpm[] = {"zicsr"};
-static const char *ImpliedExtsZk[] = {"zkn", "zkt", "zkr"};
-static const char *ImpliedExtsZkn[] = {"zbkb", "zbkc", "zbkx",
-                                       "zkne", "zknd", "zknh"};
-static const char *ImpliedExtsZks[] = {"zbkb", "zbkc", "zbkx", "zksed", "zksh"};
-static const char *ImpliedExtsZvbb[] = {"zvkb"};
-static const char *ImpliedExtsZve32f[] = {"zve32x", "f"};
-static const char *ImpliedExtsZve32x[] = {"zvl32b", "zicsr"};
-static const char *ImpliedExtsZve64d[] = {"zve64f", "d"};
-static const char *ImpliedExtsZve64f[] = {"zve64x", "zve32f"};
-static const char *ImpliedExtsZve64x[] = {"zve32x", "zvl64b"};
-static const char *ImpliedExtsZvfbfmin[] = {"zve32f"};
-static const char *ImpliedExtsZvfbfwma[] = {"zvfbfmin", "zfbfmin"};
-static const char *ImpliedExtsZvfh[] = {"zvfhmin", "zfhmin"};
-static const char *ImpliedExtsZvfhmin[] = {"zve32f"};
-static const char *ImpliedExtsZvkn[] = {"zvkb", "zvkned", "zvknhb", "zvkt"};
-static const char *ImpliedExtsZvknc[] = {"zvbc", "zvkn"};
-static const char *ImpliedExtsZvkng[] = {"zvkg", "zvkn"};
-static const char *ImpliedExtsZvknhb[] = {"zve64x"};
-static const char *ImpliedExtsZvks[] = {"zvkb", "zvksed", "zvksh", "zvkt"};
-static const char *ImpliedExtsZvksc[] = {"zvbc", "zvks"};
-static const char *ImpliedExtsZvksg[] = {"zvkg", "zvks"};
-static const char *ImpliedExtsZvl1024b[] = {"zvl512b"};
-static const char *ImpliedExtsZvl128b[] = {"zvl64b"};
-static const char *ImpliedExtsZvl16384b[] = {"zvl8192b"};
-static const char *ImpliedExtsZvl2048b[] = {"zvl1024b"};
-static const char *ImpliedExtsZvl256b[] = {"zvl128b"};
-static const char *ImpliedExtsZvl32768b[] = {"zvl16384b"};
-static const char *ImpliedExtsZvl4096b[] = {"zvl2048b"};
-static const char *ImpliedExtsZvl512b[] = {"zvl256b"};
-static const char *ImpliedExtsZvl64b[] = {"zvl32b"};
-static const char *ImpliedExtsZvl65536b[] = {"zvl32768b"};
-static const char *ImpliedExtsZvl8192b[] = {"zvl4096b"};
-
 struct ImpliedExtsEntry {
   StringLiteral Name;
-  ArrayRef<const char *> Exts;
+  const char *ImpliedExt;
 
   bool operator<(const ImpliedExtsEntry &Other) const {
     return Name < Other.Name;
   }
-
-  bool operator<(StringRef Other) const { return Name < Other; }
 };
 
-// Note: The table needs to be sorted by name.
-static constexpr ImpliedExtsEntry ImpliedExts[] = {
-    {{"d"}, {ImpliedExtsD}},
-    {{"f"}, {ImpliedExtsF}},
-    {{"v"}, {ImpliedExtsV}},
-    {{"xsfvcp"}, {ImpliedExtsXSfvcp}},
-    {{"xsfvfnrclipxfqf"}, {ImpliedExtsXSfvfnrclipxfqf}},
-    {{"xsfvfwmaccqqq"}, {ImpliedExtsXSfvfwmaccqqq}},
-    {{"xsfvqmaccdod"}, {ImpliedExtsXSfvqmaccdod}},
-    {{"xsfvqmaccqoq"}, {ImpliedExtsXSfvqmaccqoq}},
-    {{"xtheadvdot"}, {ImpliedExtsXTHeadVdot}},
-    {{"zcb"}, {ImpliedExtsZcb}},
-    {{"zcd"}, {ImpliedExtsZcd}},
-    {{"zce"}, {ImpliedExtsZce}},
-    {{"zcf"}, {ImpliedExtsZcf}},
-    {{"zcmop"}, {ImpliedExtsZcmop}},
-    {{"zcmp"}, {ImpliedExtsZcmp}},
-    {{"zcmt"}, {ImpliedExtsZcmt}},
-    {{"zdinx"}, {ImpliedExtsZdinx}},
-    {{"zfa"}, {ImpliedExtsZfa}},
-    {{"zfbfmin"}, {ImpliedExtsZfbfmin}},
-    {{"zfh"}, {ImpliedExtsZfh}},
-    {{"zfhmin"}, {ImpliedExtsZfhmin}},
-    {{"zfinx"}, {ImpliedExtsZfinx}},
-    {{"zhinx"}, {ImpliedExtsZhinx}},
-    {{"zhinxmin"}, {ImpliedExtsZhinxmin}},
-    {{"zicfiss"}, {ImpliedExtsZicfiss}},
-    {{"zicntr"}, {ImpliedExtsZicntr}},
-    {{"zihpm"}, {ImpliedExtsZihpm}},
-    {{"zk"}, {ImpliedExtsZk}},
-    {{"zkn"}, {ImpliedExtsZkn}},
-    {{"zks"}, {ImpliedExtsZks}},
-    {{"zvbb"}, {ImpliedExtsZvbb}},
-    {{"zve32f"}, {ImpliedExtsZve32f}},
-    {{"zve32x"}, {ImpliedExtsZve32x}},
-    {{"zve64d"}, {ImpliedExtsZve64d}},
-    {{"zve64f"}, {ImpliedExtsZve64f}},
-    {{"zve64x"}, {ImpliedExtsZve64x}},
-    {{"zvfbfmin"}, {ImpliedExtsZvfbfmin}},
-    {{"zvfbfwma"}, {ImpliedExtsZvfbfwma}},
-    {{"zvfh"}, {ImpliedExtsZvfh}},
-    {{"zvfhmin"}, {ImpliedExtsZvfhmin}},
-    {{"zvkn"}, {ImpliedExtsZvkn}},
-    {{"zvknc"}, {ImpliedExtsZvknc}},
-    {{"zvkng"}, {ImpliedExtsZvkng}},
-    {{"zvknhb"}, {ImpliedExtsZvknhb}},
-    {{"zvks"}, {ImpliedExtsZvks}},
-    {{"zvksc"}, {ImpliedExtsZvksc}},
-    {{"zvksg"}, {ImpliedExtsZvksg}},
-    {{"zvl1024b"}, {ImpliedExtsZvl1024b}},
-    {{"zvl128b"}, {ImpliedExtsZvl128b}},
-    {{"zvl16384b"}, {ImpliedExtsZvl16384b}},
-    {{"zvl2048b"}, {ImpliedExtsZvl2048b}},
-    {{"zvl256b"}, {ImpliedExtsZvl256b}},
-    {{"zvl32768b"}, {ImpliedExtsZvl32768b}},
-    {{"zvl4096b"}, {ImpliedExtsZvl4096b}},
-    {{"zvl512b"}, {ImpliedExtsZvl512b}},
-    {{"zvl64b"}, {ImpliedExtsZvl64b}},
-    {{"zvl65536b"}, {ImpliedExtsZvl65536b}},
-    {{"zvl8192b"}, {ImpliedExtsZvl8192b}},
-};
+static bool operator<(const ImpliedExtsEntry &LHS, StringRef RHS) {
+  return LHS.Name < RHS;
+}
+
+static bool operator<(StringRef LHS, const ImpliedExtsEntry &RHS) {
+  return LHS < RHS.Name;
+}
+
+#define GET_IMPLIED_EXTENSIONS
+#include "llvm/TargetParser/RISCVTargetParserDef.inc"
 
 void RISCVISAInfo::updateImplication() {
   bool HasE = Exts.count("e") != 0;
@@ -1195,18 +850,19 @@ void RISCVISAInfo::updateImplication() {
 
   while (!WorkList.empty()) {
     StringRef ExtName = WorkList.pop_back_val();
-    auto I = llvm::lower_bound(ImpliedExts, ExtName);
-    if (I != std::end(ImpliedExts) && I->Name == ExtName) {
-      for (const char *ImpliedExt : I->Exts) {
-        if (WorkList.count(ImpliedExt))
-          continue;
-        if (Exts.count(ImpliedExt))
-          continue;
-        auto Version = findDefaultVersion(ImpliedExt);
-        addExtension(ImpliedExt, Version.value());
-        WorkList.insert(ImpliedExt);
-      }
-    }
+    auto Range = std::equal_range(std::begin(ImpliedExts),
+                                  std::end(ImpliedExts), ExtName);
+    std::for_each(Range.first, Range.second,
+                  [&](const ImpliedExtsEntry &Implied) {
+                    const char *ImpliedExt = Implied.ImpliedExt;
+                    if (WorkList.count(ImpliedExt))
+                      return;
+                    if (Exts.count(ImpliedExt))
+                      return;
+                    auto Version = findDefaultVersion(ImpliedExt);
+                    addExtension(ImpliedExt, Version.value());
+                    WorkList.insert(ImpliedExt);
+                  });
   }
 
   // Add Zcf if Zce and F are enabled on RV32.
@@ -1217,79 +873,81 @@ void RISCVISAInfo::updateImplication() {
   }
 }
 
-struct CombinedExtsEntry {
-  StringLiteral CombineExt;
-  ArrayRef<const char *> RequiredExts;
-};
-
-static constexpr CombinedExtsEntry CombineIntoExts[] = {
-    {{"zk"}, {ImpliedExtsZk}},
-    {{"zkn"}, {ImpliedExtsZkn}},
-    {{"zks"}, {ImpliedExtsZks}},
-    {{"zvkn"}, {ImpliedExtsZvkn}},
-    {{"zvknc"}, {ImpliedExtsZvknc}},
-    {{"zvkng"}, {ImpliedExtsZvkng}},
-    {{"zvks"}, {ImpliedExtsZvks}},
-    {{"zvksc"}, {ImpliedExtsZvksc}},
-    {{"zvksg"}, {ImpliedExtsZvksg}},
+static constexpr StringLiteral CombineIntoExts[] = {
+    {"zk"},    {"zkn"},  {"zks"},   {"zvkn"},  {"zvknc"},
+    {"zvkng"}, {"zvks"}, {"zvksc"}, {"zvksg"},
 };
 
 void RISCVISAInfo::updateCombination() {
-  bool IsNewCombine = false;
+  bool MadeChange = false;
   do {
-    IsNewCombine = false;
-    for (CombinedExtsEntry CombineIntoExt : CombineIntoExts) {
-      auto CombineExt = CombineIntoExt.CombineExt;
-      auto RequiredExts = CombineIntoExt.RequiredExts;
+    MadeChange = false;
+    for (StringRef CombineExt : CombineIntoExts) {
       if (hasExtension(CombineExt))
         continue;
-      bool IsAllRequiredFeatureExist = true;
-      for (const char *Ext : RequiredExts)
-        IsAllRequiredFeatureExist &= hasExtension(Ext);
-      if (IsAllRequiredFeatureExist) {
+
+      // Look up the extension in the ImpliesExt table to find everything it
+      // depends on.
+      auto Range = std::equal_range(std::begin(ImpliedExts),
+                                    std::end(ImpliedExts), CombineExt);
+      bool HasAllRequiredFeatures = std::all_of(
+          Range.first, Range.second, [&](const ImpliedExtsEntry &Implied) {
+            return hasExtension(Implied.ImpliedExt);
+          });
+      if (HasAllRequiredFeatures) {
         auto Version = findDefaultVersion(CombineExt);
         addExtension(CombineExt, Version.value());
-        IsNewCombine = true;
+        MadeChange = true;
       }
     }
-  } while (IsNewCombine);
+  } while (MadeChange);
 }
 
-void RISCVISAInfo::updateFLen() {
-  FLen = 0;
+void RISCVISAInfo::updateImpliedLengths() {
+  assert(FLen == 0 && MaxELenFp == 0 && MaxELen == 0 && MinVLen == 0 &&
+         "Expected lengths to be initialied to zero");
+
   // TODO: Handle q extension.
   if (Exts.count("d"))
     FLen = 64;
   else if (Exts.count("f"))
     FLen = 32;
-}
 
-void RISCVISAInfo::updateMinVLen() {
-  for (auto const &Ext : Exts) {
-    StringRef ExtName = Ext.first;
-    bool IsZvlExt = ExtName.consume_front("zvl") && ExtName.consume_back("b");
-    if (IsZvlExt) {
-      unsigned ZvlLen;
-      if (!ExtName.getAsInteger(10, ZvlLen))
-        MinVLen = std::max(MinVLen, ZvlLen);
-    }
+  if (Exts.count("v")) {
+    MaxELenFp = std::max(MaxELenFp, 64u);
+    MaxELen = std::max(MaxELen, 64u);
   }
-}
 
-void RISCVISAInfo::updateMaxELen() {
-  // handles EEW restriction by sub-extension zve
   for (auto const &Ext : Exts) {
     StringRef ExtName = Ext.first;
-    bool IsZveExt = ExtName.consume_front("zve");
-    if (IsZveExt) {
-      if (ExtName.back() == 'f')
+    // Infer MaxELen and MaxELenFp from Zve(32/64)(x/f/d)
+    if (ExtName.consume_front("zve")) {
+      unsigned ZveELen;
+      if (ExtName.consumeInteger(10, ZveELen))
+        continue;
+
+      if (ExtName == "f")
         MaxELenFp = std::max(MaxELenFp, 32u);
-      if (ExtName.back() == 'd')
+      else if (ExtName == "d")
         MaxELenFp = std::max(MaxELenFp, 64u);
-      ExtName = ExtName.drop_back();
-      unsigned ZveELen;
-      ExtName.getAsInteger(10, ZveELen);
+      else if (ExtName != "x")
+        continue;
+
       MaxELen = std::max(MaxELen, ZveELen);
+      continue;
+    }
+
+    // Infer MinVLen from zvl*b.
+    if (ExtName.consume_front("zvl")) {
+      unsigned ZvlLen;
+      if (ExtName.consumeInteger(10, ZvlLen))
+        continue;
+
+      if (ExtName != "b")
+        continue;
+
+      MinVLen = std::max(MinVLen, ZvlLen);
+      continue;
     }
   }
 }
@@ -1315,9 +973,7 @@ llvm::Expected<std::unique_ptr<RISCVISAInfo>>
 RISCVISAInfo::postProcessAndChecking(std::unique_ptr<RISCVISAInfo> &&ISAInfo) {
   ISAInfo->updateImplication();
   ISAInfo->updateCombination();
-  ISAInfo->updateFLen();
-  ISAInfo->updateMinVLen();
-  ISAInfo->updateMaxELen();
+  ISAInfo->updateImpliedLengths();
 
   if (Error Result = ISAInfo->checkDependency())
     return std::move(Result);
diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
index c3015d895230..40ee59c014b0 100644
--- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
+++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
@@ -95,7 +95,8 @@ static std::vector<StringRef> getSearchPaths(opt::InputArgList *Args,
 
 // Opens a file. Path has to be resolved already. (used for def file)
 std::unique_ptr<MemoryBuffer> openFile(const Twine &Path) {
-  ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> MB = MemoryBuffer::getFile(Path);
+  ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> MB =
+      MemoryBuffer::getFile(Path, /*IsText=*/true);
 
   if (std::error_code EC = MB.getError()) {
     llvm::errs() << "cannot open file " << Path << ": " << EC.message() << "\n";
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index da714c9a7570..c8c835115a99 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -87,6 +87,7 @@ STATISTIC(NumNestRemoved   , "Number of nest attributes removed");
 STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
 STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
 STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
+STATISTIC(NumAtExitRemoved, "Number of atexit handlers removed");
 STATISTIC(NumInternalFunc, "Number of internal functions");
 STATISTIC(NumColdCC, "Number of functions marked coldcc");
 STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs");
@@ -306,6 +307,10 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
       APInt Offset(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
       PtrOp = PtrOp->stripAndAccumulateConstantOffsets(
           DL, Offset, /* AllowNonInbounds */ true);
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(PtrOp)) {
+        if (II->getIntrinsicID() == Intrinsic::threadlocal_address)
+          PtrOp = II->getArgOperand(0);
+      }
       if (PtrOp == GV) {
         if (auto *Value = ConstantFoldLoadFromConst(Init, Ty, Offset, DL)) {
           LI->replaceAllUsesWith(Value);
@@ -318,6 +323,9 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
     } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
       if (getUnderlyingObject(MI->getRawDest()) == GV)
         EraseFromParent(MI);
+    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+      if (II->getIntrinsicID() == Intrinsic::threadlocal_address)
+        append_range(WorkList, II->users());
     }
   }
 
@@ -2321,18 +2329,19 @@ OptimizeGlobalAliases(Module &M,
 }
 
 static Function *
-FindCXAAtExit(Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+FindAtExitLibFunc(Module &M,
+                  function_ref<TargetLibraryInfo &(Function &)> GetTLI,
+                  LibFunc Func) {
   // Hack to get a default TLI before we have actual Function.
   auto FuncIter = M.begin();
   if (FuncIter == M.end())
     return nullptr;
   auto *TLI = &GetTLI(*FuncIter);
 
-  LibFunc F = LibFunc_cxa_atexit;
-  if (!TLI->has(F))
+  if (!TLI->has(Func))
     return nullptr;
 
-  Function *Fn = M.getFunction(TLI->getName(F));
+  Function *Fn = M.getFunction(TLI->getName(Func));
   if (!Fn)
     return nullptr;
 
@@ -2340,17 +2349,18 @@ FindCXAAtExit(Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
   TLI = &GetTLI(*Fn);
 
   // Make sure that the function has the correct prototype.
-  if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit)
+  LibFunc F;
+  if (!TLI->getLibFunc(*Fn, F) || F != Func)
     return nullptr;
 
   return Fn;
 }
 
-/// Returns whether the given function is an empty C++ destructor and can
-/// therefore be eliminated.
-/// Note that we assume that other optimization passes have already simplified
-/// the code so we simply check for 'ret'.
-static bool cxxDtorIsEmpty(const Function &Fn) {
+/// Returns whether the given function is an empty C++ destructor or atexit
+/// handler and can therefore be eliminated. Note that we assume that other
+/// optimization passes have already simplified the code so we simply check for
+/// 'ret'.
+static bool IsEmptyAtExitFunction(const Function &Fn) {
   // FIXME: We could eliminate C++ destructors if they're readonly/readnone and
   // nounwind, but that doesn't seem worth doing.
   if (Fn.isDeclaration())
@@ -2366,7 +2376,7 @@ static bool cxxDtorIsEmpty(const Function &Fn) {
   return false;
 }
 
-static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
+static bool OptimizeEmptyGlobalAtExitDtors(Function *CXAAtExitFn, bool isCXX) {
   /// Itanium C++ ABI p3.3.5:
   ///
   ///   After constructing a global (or local static) object, that will require
@@ -2379,8 +2389,8 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
   ///   registered before this one. It returns zero if registration is
   ///   successful, nonzero on failure.
 
-  // This pass will look for calls to __cxa_atexit where the function is trivial
-  // and remove them.
+  // This pass will look for calls to __cxa_atexit or atexit where the function
+  // is trivial and remove them.
   bool Changed = false;
 
   for (User *U : llvm::make_early_inc_range(CXAAtExitFn->users())) {
@@ -2393,14 +2403,17 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
 
     Function *DtorFn =
       dyn_cast<Function>(CI->getArgOperand(0)->stripPointerCasts());
-    if (!DtorFn || !cxxDtorIsEmpty(*DtorFn))
+    if (!DtorFn || !IsEmptyAtExitFunction(*DtorFn))
       continue;
 
     // Just remove the call.
     CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
     CI->eraseFromParent();
 
-    ++NumCXXDtorsRemoved;
+    if (isCXX)
+      ++NumCXXDtorsRemoved;
+    else
+      ++NumAtExitRemoved;
 
     Changed |= true;
   }
@@ -2518,9 +2531,12 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL,
 
     // Try to remove trivial global destructors if they are not removed
     // already.
-    Function *CXAAtExitFn = FindCXAAtExit(M, GetTLI);
-    if (CXAAtExitFn)
-      LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn);
+    if (Function *CXAAtExitFn =
+            FindAtExitLibFunc(M, GetTLI, LibFunc_cxa_atexit))
+      LocalChange |= OptimizeEmptyGlobalAtExitDtors(CXAAtExitFn, true);
+
+    if (Function *AtExitFn = FindAtExitLibFunc(M, GetTLI, LibFunc_atexit))
+      LocalChange |= OptimizeEmptyGlobalAtExitDtors(AtExitFn, false);
 
     // Optimize IFuncs whose callee's are statically known.
     LocalChange |= OptimizeStaticIFuncs(M);
diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp
index f8920541e6fd..e591a8e73b1c 100644
--- a/llvm/lib/Transforms/IPO/SCCP.cpp
+++ b/llvm/lib/Transforms/IPO/SCCP.cpp
@@ -281,32 +281,21 @@ static bool runIPSCCP(
     Function *F = I.first;
     const ValueLatticeElement &ReturnValue = I.second;
 
-    // If there is a known constant range for the return value, add !range
-    // metadata to the function's call sites.
+    // If there is a known constant range for the return value, add range
+    // attribute to the return value.
     if (ReturnValue.isConstantRange() &&
         !ReturnValue.getConstantRange().isSingleElement()) {
       // Do not add range metadata if the return value may include undef.
       if (ReturnValue.isConstantRangeIncludingUndef())
         continue;
 
+      // Do not touch existing attribute for now.
+      // TODO: We should be able to take the intersection of the existing
+      // attribute and the inferred range.
+      if (F->hasRetAttribute(Attribute::Range))
+        continue;
       auto &CR = ReturnValue.getConstantRange();
-      for (User *User : F->users()) {
-        auto *CB = dyn_cast<CallBase>(User);
-        if (!CB || CB->getCalledFunction() != F)
-          continue;
-
-        // Do not touch existing metadata for now.
-        // TODO: We should be able to take the intersection of the existing
-        // metadata and the inferred range.
-        if (CB->getMetadata(LLVMContext::MD_range))
-          continue;
-
-        LLVMContext &Context = CB->getParent()->getContext();
-        Metadata *RangeMD[] = {
-            ConstantAsMetadata::get(ConstantInt::get(Context, CR.getLower())),
-            ConstantAsMetadata::get(ConstantInt::get(Context, CR.getUpper()))};
-        CB->setMetadata(LLVMContext::MD_range, MDNode::get(Context, RangeMD));
-      }
+      F->addRangeRetAttr(CR);
       continue;
     }
     if (F->getReturnType()->isVoidTy())
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 0b3a6931e779..6cbd138842c8 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -252,20 +252,21 @@ static cl::opt<unsigned> PrecentMismatchForStalenessError(
 
 static cl::opt<bool> CallsitePrioritizedInline(
     "sample-profile-prioritized-inline", cl::Hidden,
-
     cl::desc("Use call site prioritized inlining for sample profile loader."
              "Currently only CSSPGO is supported."));
 
 static cl::opt<bool> UsePreInlinerDecision(
     "sample-profile-use-preinliner", cl::Hidden,
-
     cl::desc("Use the preinliner decisions stored in profile context."));
 
 static cl::opt<bool> AllowRecursiveInline(
     "sample-profile-recursive-inline", cl::Hidden,
-
     cl::desc("Allow sample loader inliner to inline recursive calls."));
 
+static cl::opt<bool> RemoveProbeAfterProfileAnnotation(
+    "sample-profile-remove-probe", cl::Hidden, cl::init(false),
+    cl::desc("Remove pseudo-probe after sample profile annotation."));
+
 static cl::opt<std::string> ProfileInlineReplayFile(
     "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
     cl::desc(
@@ -518,6 +519,7 @@ protected:
   void generateMDProfMetadata(Function &F);
   bool rejectHighStalenessProfile(Module &M, ProfileSummaryInfo *PSI,
                                   const SampleProfileMap &Profiles);
+  void removePseudoProbeInsts(Module &M);
 
   /// Map from function name to Function *. Used to find the function from
   /// the function name. If the function name contains suffix, additional
@@ -2127,6 +2129,20 @@ bool SampleProfileLoader::rejectHighStalenessProfile(
   return false;
 }
 
+void SampleProfileLoader::removePseudoProbeInsts(Module &M) {
+  for (auto &F : M) {
+    std::vector<Instruction *> InstsToDel;
+    for (auto &BB : F) {
+      for (auto &I : BB) {
+        if (isa<PseudoProbeInst>(&I))
+          InstsToDel.push_back(&I);
+      }
+    }
+    for (auto *I : InstsToDel)
+      I->eraseFromParent();
+  }
+}
+
 bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
                                       ProfileSummaryInfo *_PSI,
                                       LazyCallGraph &CG) {
@@ -2196,6 +2212,9 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
          notInlinedCallInfo)
       updateProfileCallee(pair.first, pair.second.entryCount);
 
+  if (RemoveProbeAfterProfileAnnotation && FunctionSamples::ProfileIsProbeBased)
+    removePseudoProbeInsts(M);
+
   return retval;
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 88b7e496897e..51ac77348ed9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2001,43 +2001,30 @@ Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
   if (!GEP1)
     return nullptr;
 
-  if (GEP2) {
-    // (gep X, ...) - (gep X, ...)
-    //
-    // Avoid duplicating the arithmetic if there are more than one non-constant
-    // indices between the two GEPs and either GEP has a non-constant index and
-    // multiple users. If zero non-constant index, the result is a constant and
-    // there is no duplication. If one non-constant index, the result is an add
-    // or sub with a constant, which is no larger than the original code, and
-    // there's no duplicated arithmetic, even if either GEP has multiple
-    // users. If more than one non-constant indices combined, as long as the GEP
-    // with at least one non-constant index doesn't have multiple users, there
-    // is no duplication.
-    unsigned NumNonConstantIndices1 = GEP1->countNonConstantIndices();
-    unsigned NumNonConstantIndices2 = GEP2->countNonConstantIndices();
-    if (NumNonConstantIndices1 + NumNonConstantIndices2 > 1 &&
-        ((NumNonConstantIndices1 > 0 && !GEP1->hasOneUse()) ||
-         (NumNonConstantIndices2 > 0 && !GEP2->hasOneUse()))) {
-      return nullptr;
-    }
-  }
+  // To avoid duplicating the offset arithmetic, rewrite the GEP to use the
+  // computed offset. This may erase the original GEP, so be sure to cache the
+  // inbounds flag before emitting the offset.
+  // TODO: We should probably do this even if there is only one GEP.
+  bool RewriteGEPs = GEP2 != nullptr;
 
   // Emit the offset of the GEP and an intptr_t.
-  Value *Result = EmitGEPOffset(GEP1);
+  bool GEP1IsInBounds = GEP1->isInBounds();
+  Value *Result = EmitGEPOffset(GEP1, RewriteGEPs);
 
   // If this is a single inbounds GEP and the original sub was nuw,
   // then the final multiplication is also nuw.
   if (auto *I = dyn_cast<Instruction>(Result))
-    if (IsNUW && !GEP2 && !Swapped && GEP1->isInBounds() &&
+    if (IsNUW && !GEP2 && !Swapped && GEP1IsInBounds &&
         I->getOpcode() == Instruction::Mul)
       I->setHasNoUnsignedWrap();
 
   // If we have a 2nd GEP of the same base pointer, subtract the offsets.
   // If both GEPs are inbounds, then the subtract does not have signed overflow.
   if (GEP2) {
-    Value *Offset = EmitGEPOffset(GEP2);
+    bool GEP2IsInBounds = GEP2->isInBounds();
+    Value *Offset = EmitGEPOffset(GEP2, RewriteGEPs);
     Result = Builder.CreateSub(Result, Offset, "gepdiff", /* NUW */ false,
-                               GEP1->isInBounds() && GEP2->isInBounds());
+                               GEP1IsInBounds && GEP2IsInBounds);
   }
 
   // If we have p - gep(p, ...)  then we have to negate the result.
@@ -2781,6 +2768,16 @@ Instruction *InstCombinerImpl::visitFNeg(UnaryOperator &I) {
       propagateSelectFMF(NewSel, P == X);
       return NewSel;
     }
+
+    // -(Cond ? X : C) --> Cond ? -X : -C
+    // -(Cond ? C : Y) --> Cond ? -C : -Y
+    if (match(X, m_ImmConstant()) || match(Y, m_ImmConstant())) {
+      Value *NegX = Builder.CreateFNegFMF(X, &I, X->getName() + ".neg");
+      Value *NegY = Builder.CreateFNegFMF(Y, &I, Y->getName() + ".neg");
+      SelectInst *NewSel = SelectInst::Create(Cond, NegX, NegY);
+      propagateSelectFMF(NewSel, /*CommonOperand=*/true);
+      return NewSel;
+    }
   }
 
   // fneg (copysign x, y) -> copysign x, (fneg y)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 8ec1ed7529c1..ed9a89b14efc 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3141,20 +3141,20 @@ Value *InstCombinerImpl::getSelectCondition(Value *A, Value *B,
   return nullptr;
 }
 
-/// We have an expression of the form (A & C) | (B & D). Try to simplify this
-/// to "A' ? C : D", where A' is a boolean or vector of booleans.
+/// We have an expression of the form (A & B) | (C & D). Try to simplify this
+/// to "A' ? B : D", where A' is a boolean or vector of booleans.
 /// When InvertFalseVal is set to true, we try to match the pattern
-/// where we have peeked through a 'not' op and A and B are the same:
-/// (A & C) | ~(A | D) --> (A & C) | (~A & ~D) --> A' ? C : ~D
-Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B,
+/// where we have peeked through a 'not' op and A and C are the same:
+/// (A & B) | ~(A | D) --> (A & B) | (~A & ~D) --> A' ? B : ~D
+Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *B, Value *C,
                                               Value *D, bool InvertFalseVal) {
   // The potential condition of the select may be bitcasted. In that case, look
   // through its bitcast and the corresponding bitcast of the 'not' condition.
   Type *OrigType = A->getType();
   A = peekThroughBitcast(A, true);
-  B = peekThroughBitcast(B, true);
-  if (Value *Cond = getSelectCondition(A, B, InvertFalseVal)) {
-    // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
+  C = peekThroughBitcast(C, true);
+  if (Value *Cond = getSelectCondition(A, C, InvertFalseVal)) {
+    // ((bc Cond) & B) | ((bc ~Cond) & D) --> bc (select Cond, (bc B), (bc D))
     // If this is a vector, we may need to cast to match the condition's length.
     // The bitcasts will either all exist or all not exist. The builder will
     // not create unnecessary casts if the types already match.
@@ -3168,11 +3168,11 @@ Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B,
       Type *EltTy = Builder.getIntNTy(SelEltSize / Elts);
       SelTy = VectorType::get(EltTy, VecTy->getElementCount());
     }
-    Value *BitcastC = Builder.CreateBitCast(C, SelTy);
+    Value *BitcastB = Builder.CreateBitCast(B, SelTy);
     if (InvertFalseVal)
       D = Builder.CreateNot(D);
     Value *BitcastD = Builder.CreateBitCast(D, SelTy);
-    Value *Select = Builder.CreateSelect(Cond, BitcastC, BitcastD);
+    Value *Select = Builder.CreateSelect(Cond, BitcastB, BitcastD);
     return Builder.CreateBitCast(Select, OrigType);
   }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index e5652458f150..1913ef92c16c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3168,7 +3168,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
-  case Intrinsic::experimental_vector_reverse: {
+  case Intrinsic::vector_reverse: {
     Value *BO0, *BO1, *X, *Y;
     Value *Vec = II->getArgOperand(0);
     if (match(Vec, m_OneUse(m_BinOp(m_Value(BO0), m_Value(BO1))))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 6ce39be59bda..11e31877de38 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -770,6 +770,12 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
         return new ICmpInst(ICmpInst::Predicate::ICMP_EQ, X, Zero);
       }
     }
+
+    if (Trunc.hasNoUnsignedWrap() || Trunc.hasNoSignedWrap()) {
+      Value *X, *Y;
+      if (match(Src, m_Xor(m_Value(X), m_Value(Y))))
+        return new ICmpInst(ICmpInst::ICMP_NE, X, Y);
+    }
   }
 
   Value *A, *B;
@@ -2050,9 +2056,9 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
     // the GEP otherwise.
     if (GEP->hasOneUse() &&
         isa<ConstantPointerNull>(GEP->getPointerOperand())) {
-      return replaceInstUsesWith(CI,
-                                 Builder.CreateIntCast(EmitGEPOffset(GEP), Ty,
-                                                       /*isSigned=*/false));
+      return replaceInstUsesWith(
+          CI, Builder.CreateIntCast(EmitGEPOffset(cast<GEPOperator>(GEP)), Ty,
+                                    /*isSigned=*/false));
     }
   }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c2062adcd5b7..f66883de8dd5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -816,29 +816,9 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
     }
 
     if (GEPsInBounds || CmpInst::isEquality(Cond)) {
-      auto EmitGEPOffsetAndRewrite = [&](GEPOperator *GEP) {
-        IRBuilderBase::InsertPointGuard Guard(Builder);
-        auto *Inst = dyn_cast<Instruction>(GEP);
-        if (Inst)
-          Builder.SetInsertPoint(Inst);
-
-        Value *Offset = EmitGEPOffset(GEP);
-        // If a non-trivial GEP has other uses, rewrite it to avoid duplicating
-        // the offset arithmetic.
-        if (Inst && !GEP->hasOneUse() && !GEP->hasAllConstantIndices() &&
-            !GEP->getSourceElementType()->isIntegerTy(8)) {
-          replaceInstUsesWith(*Inst,
-                              Builder.CreateGEP(Builder.getInt8Ty(),
-                                                GEP->getPointerOperand(),
-                                                Offset, "", GEPsInBounds));
-          eraseInstFromFunction(*Inst);
-        }
-        return Offset;
-      };
-
       // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)  --->  (OFFSET1 cmp OFFSET2)
-      Value *L = EmitGEPOffsetAndRewrite(GEPLHS);
-      Value *R = EmitGEPOffsetAndRewrite(GEPRHS);
+      Value *L = EmitGEPOffset(GEPLHS, /*RewriteGEP=*/true);
+      Value *R = EmitGEPOffset(GEPRHS, /*RewriteGEP=*/true);
       return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
     }
   }
@@ -6909,8 +6889,8 @@ static Instruction *foldVectorCmp(CmpInst &Cmp,
     if (auto *I = dyn_cast<Instruction>(V))
       I->copyIRFlags(&Cmp);
     Module *M = Cmp.getModule();
-    Function *F = Intrinsic::getDeclaration(
-        M, Intrinsic::experimental_vector_reverse, V->getType());
+    Function *F =
+        Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType());
     return CallInst::Create(F, V);
   };
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 4479afbd09af..db7838bbe3c2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -354,8 +354,9 @@ private:
   }
 
   bool willNotOverflowUnsignedMul(const Value *LHS, const Value *RHS,
-                                  const Instruction &CxtI) const {
-    return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
+                                  const Instruction &CxtI,
+                                  bool IsNSW = false) const {
+    return computeOverflowForUnsignedMul(LHS, RHS, &CxtI, IsNSW) ==
            OverflowResult::NeverOverflows;
   }
 
@@ -376,7 +377,7 @@ private:
     }
   }
 
-  Value *EmitGEPOffset(User *GEP);
+  Value *EmitGEPOffset(GEPOperator *GEP, bool RewriteGEP = false);
   Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
   Instruction *foldBitcastExtElt(ExtractElementInst &ExtElt);
   Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 4ed4c36e21e0..ca1b1921404d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -530,7 +530,7 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
     I.setHasNoSignedWrap(true);
   }
 
-  if (!HasNUW && willNotOverflowUnsignedMul(Op0, Op1, I)) {
+  if (!HasNUW && willNotOverflowUnsignedMul(Op0, Op1, I, I.hasNoSignedWrap())) {
     Changed = true;
     I.setHasNoUnsignedWrap(true);
   }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 117eb7a1dcc9..8818369e7945 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -2537,8 +2537,8 @@ Instruction *InstCombinerImpl::foldVectorSelect(SelectInst &Sel) {
       if (auto *I = dyn_cast<Instruction>(V))
         I->copyIRFlags(&Sel);
       Module *M = Sel.getModule();
-      Function *F = Intrinsic::getDeclaration(
-          M, Intrinsic::experimental_vector_reverse, V->getType());
+      Function *F =
+          Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType());
       return CallInst::Create(F, V);
     };
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 244f03a1bc2b..1cb21a1d81af 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1120,14 +1120,6 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
       return BinaryOperator::CreateAnd(Trunc, ConstantInt::get(Ty, Mask));
     }
 
-    if (match(Op0, m_Shl(m_Value(X), m_APInt(C1))) && C1->ult(BitWidth)) {
-      unsigned AmtSum = ShAmtC + C1->getZExtValue();
-      // Oversized shifts are simplified to zero in InstSimplify.
-      if (AmtSum < BitWidth)
-        // (X << C1) << C2 --> X << (C1 + C2)
-        return BinaryOperator::CreateShl(X, ConstantInt::get(Ty, AmtSum));
-    }
-
     // If we have an opposite shift by the same amount, we may be able to
     // reorder binops and shifts to eliminate math/logic.
     auto isSuitableBinOpcode = [](Instruction::BinaryOps BinOpcode) {
@@ -1394,14 +1386,6 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
       }
     }
 
-    // (X >>u C1) >>u C --> X >>u (C1 + C)
-    if (match(Op0, m_LShr(m_Value(X), m_APInt(C1)))) {
-      // Oversized shifts are simplified to zero in InstSimplify.
-      unsigned AmtSum = ShAmtC + C1->getZExtValue();
-      if (AmtSum < BitWidth)
-        return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
-    }
-
     Instruction *TruncSrc;
     if (match(Op0, m_OneUse(m_Trunc(m_Instruction(TruncSrc)))) &&
         match(TruncSrc, m_LShr(m_Value(X), m_APInt(C1)))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index f6684834a772..b6f8b24f43b8 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -190,8 +190,26 @@ bool InstCombiner::isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
   return TTI.isValidAddrSpaceCast(FromAS, ToAS);
 }
 
-Value *InstCombinerImpl::EmitGEPOffset(User *GEP) {
-  return llvm::emitGEPOffset(&Builder, DL, GEP);
+Value *InstCombinerImpl::EmitGEPOffset(GEPOperator *GEP, bool RewriteGEP) {
+  if (!RewriteGEP)
+    return llvm::emitGEPOffset(&Builder, DL, GEP);
+
+  IRBuilderBase::InsertPointGuard Guard(Builder);
+  auto *Inst = dyn_cast<Instruction>(GEP);
+  if (Inst)
+    Builder.SetInsertPoint(Inst);
+
+  Value *Offset = EmitGEPOffset(GEP);
+  // If a non-trivial GEP has other uses, rewrite it to avoid duplicating
+  // the offset arithmetic.
+  if (Inst && !GEP->hasOneUse() && !GEP->hasAllConstantIndices() &&
+      !GEP->getSourceElementType()->isIntegerTy(8)) {
+    replaceInstUsesWith(
+        *Inst, Builder.CreateGEP(Builder.getInt8Ty(), GEP->getPointerOperand(),
+                                 Offset, "", GEP->isInBounds()));
+    eraseInstFromFunction(*Inst);
+  }
+  return Offset;
 }
 
 /// Legal integers and common types are considered desirable. This is used to
@@ -2025,8 +2043,8 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
     if (auto *BO = dyn_cast<BinaryOperator>(V))
       BO->copyIRFlags(&Inst);
     Module *M = Inst.getModule();
-    Function *F = Intrinsic::getDeclaration(
-        M, Intrinsic::experimental_vector_reverse, V->getType());
+    Function *F =
+        Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType());
     return CallInst::Create(F, V);
   };
 
@@ -2321,6 +2339,43 @@ static Instruction *foldSelectGEP(GetElementPtrInst &GEP,
   return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel);
 }
 
+// Canonicalization:
+// gep T, (gep i8, base, C1), (Index + C2) into
+// gep T, (gep i8, base, C1 + C2 * sizeof(T)), Index
+static Instruction *canonicalizeGEPOfConstGEPI8(GetElementPtrInst &GEP,
+                                                GEPOperator *Src,
+                                                InstCombinerImpl &IC) {
+  if (GEP.getNumIndices() != 1)
+    return nullptr;
+  auto &DL = IC.getDataLayout();
+  Value *Base;
+  const APInt *C1;
+  if (!match(Src, m_PtrAdd(m_Value(Base), m_APInt(C1))))
+    return nullptr;
+  Value *VarIndex;
+  const APInt *C2;
+  Type *PtrTy = Src->getType()->getScalarType();
+  unsigned IndexSizeInBits = DL.getIndexTypeSizeInBits(PtrTy);
+  if (!match(GEP.getOperand(1), m_AddLike(m_Value(VarIndex), m_APInt(C2))))
+    return nullptr;
+  if (C1->getBitWidth() != IndexSizeInBits ||
+      C2->getBitWidth() != IndexSizeInBits)
+    return nullptr;
+  Type *BaseType = GEP.getSourceElementType();
+  if (isa<ScalableVectorType>(BaseType))
+    return nullptr;
+  APInt TypeSize(IndexSizeInBits, DL.getTypeAllocSize(BaseType));
+  APInt NewOffset = TypeSize * *C2 + *C1;
+  if (NewOffset.isZero() ||
+      (Src->hasOneUse() && GEP.getOperand(1)->hasOneUse())) {
+    Value *GEPConst =
+        IC.Builder.CreatePtrAdd(Base, IC.Builder.getInt(NewOffset));
+    return GetElementPtrInst::Create(BaseType, GEPConst, VarIndex);
+  }
+
+  return nullptr;
+}
+
 Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
                                              GEPOperator *Src) {
   // Combine Indices - If the source pointer to this getelementptr instruction
@@ -2329,6 +2384,9 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
   if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
     return nullptr;
 
+  if (auto *I = canonicalizeGEPOfConstGEPI8(GEP, Src, *this))
+    return I;
+
   // For constant GEPs, use a more general offset-based folding approach.
   Type *PtrTy = Src->getType()->getScalarType();
   if (GEP.hasAllConstantIndices() &&
@@ -2653,7 +2711,6 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
   SmallVector<Value *, 8> Indices(GEP.indices());
   Type *GEPType = GEP.getType();
   Type *GEPEltType = GEP.getSourceElementType();
-  bool IsGEPSrcEleScalable = GEPEltType->isScalableTy();
   if (Value *V = simplifyGEPInst(GEPEltType, PtrOp, Indices, GEP.isInBounds(),
                                  SQ.getWithInstruction(&GEP)))
     return replaceInstUsesWith(GEP, V);
@@ -2729,6 +2786,14 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
                                     GEP.isInBounds()));
   }
 
+  // Canonicalize scalable GEPs to an explicit offset using the llvm.vscale
+  // intrinsic. This has better support in BasicAA.
+  if (GEPEltType->isScalableTy()) {
+    Value *Offset = EmitGEPOffset(cast<GEPOperator>(&GEP));
+    return replaceInstUsesWith(
+        GEP, Builder.CreatePtrAdd(PtrOp, Offset, "", GEP.isInBounds()));
+  }
+
   // Check to see if the inputs to the PHI node are getelementptr instructions.
   if (auto *PN = dyn_cast<PHINode>(PtrOp)) {
     auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
@@ -2838,9 +2903,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
     if (Instruction *I = visitGEPOfGEP(GEP, Src))
       return I;
 
-  // Skip if GEP source element type is scalable. The type alloc size is unknown
-  // at compile-time.
-  if (GEP.getNumIndices() == 1 && !IsGEPSrcEleScalable) {
+  if (GEP.getNumIndices() == 1) {
     unsigned AS = GEP.getPointerAddressSpace();
     if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
         DL.getIndexSizeInBits(AS)) {
@@ -2890,6 +2953,14 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
     return nullptr;
 
   if (GEP.getNumIndices() == 1) {
+    // We can only preserve inbounds if the original gep is inbounds, the add
+    // is nsw, and the add operands are non-negative.
+    auto CanPreserveInBounds = [&](bool AddIsNSW, Value *Idx1, Value *Idx2) {
+      SimplifyQuery Q = SQ.getWithInstruction(&GEP);
+      return GEP.isInBounds() && AddIsNSW && isKnownNonNegative(Idx1, Q) &&
+             isKnownNonNegative(Idx2, Q);
+    };
+
     // Try to replace ADD + GEP with GEP + GEP.
     Value *Idx1, *Idx2;
     if (match(GEP.getOperand(1),
@@ -2899,10 +2970,15 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
       // as:
       //   %newptr = getelementptr i32, ptr %ptr, i64 %idx1
       //   %newgep = getelementptr i32, ptr %newptr, i64 %idx2
-      auto *NewPtr = Builder.CreateGEP(GEP.getResultElementType(),
-                                       GEP.getPointerOperand(), Idx1);
-      return GetElementPtrInst::Create(GEP.getResultElementType(), NewPtr,
-                                       Idx2);
+      bool IsInBounds = CanPreserveInBounds(
+          cast<OverflowingBinaryOperator>(GEP.getOperand(1))->hasNoSignedWrap(),
+          Idx1, Idx2);
+      auto *NewPtr =
+          Builder.CreateGEP(GEP.getResultElementType(), GEP.getPointerOperand(),
+                            Idx1, "", IsInBounds);
+      return replaceInstUsesWith(
+          GEP, Builder.CreateGEP(GEP.getResultElementType(), NewPtr, Idx2, "",
+                                 IsInBounds));
     }
     ConstantInt *C;
     if (match(GEP.getOperand(1), m_OneUse(m_SExtLike(m_OneUse(m_NSWAdd(
@@ -2913,12 +2989,17 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
       // as:
       // %newptr = getelementptr i32, ptr %ptr, i32 %idx1
       // %newgep = getelementptr i32, ptr %newptr, i32 idx2
+      bool IsInBounds = CanPreserveInBounds(
+          /*IsNSW=*/true, Idx1, C);
       auto *NewPtr = Builder.CreateGEP(
           GEP.getResultElementType(), GEP.getPointerOperand(),
-          Builder.CreateSExt(Idx1, GEP.getOperand(1)->getType()));
-      return GetElementPtrInst::Create(
-          GEP.getResultElementType(), NewPtr,
-          Builder.CreateSExt(C, GEP.getOperand(1)->getType()));
+          Builder.CreateSExt(Idx1, GEP.getOperand(1)->getType()), "",
+          IsInBounds);
+      return replaceInstUsesWith(
+          GEP,
+          Builder.CreateGEP(GEP.getResultElementType(), NewPtr,
+                            Builder.CreateSExt(C, GEP.getOperand(1)->getType()),
+                            "", IsInBounds));
     }
   }
 
diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
index 981405329389..8d345d394b51 100644
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -14,6 +14,7 @@ add_llvm_component_library(LLVMInstrumentation
   InstrProfiling.cpp
   KCFI.cpp
   LowerAllowCheckPass.cpp
+  PGOCtxProfLowering.cpp
   PGOForceFunctionAttrs.cpp
   PGOInstrumentation.cpp
   PGOMemOPSizeOpt.cpp
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 88b852340340..fa661b17c13a 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1385,14 +1385,6 @@ bool HWAddressSanitizer::instrumentLandingPads(
   return true;
 }
 
-static DbgAssignIntrinsic *DynCastToDbgAssign(DbgVariableIntrinsic *DVI) {
-  return dyn_cast<DbgAssignIntrinsic>(DVI);
-}
-
-static DbgVariableRecord *DynCastToDbgAssign(DbgVariableRecord *DVR) {
-  return DVR->isDbgAssign() ? DVR : nullptr;
-}
-
 bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
                                          Value *StackTag, Value *UARTag,
                                          const DominatorTree &DT,
@@ -1448,28 +1440,7 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
              !memtag::isLifetimeIntrinsic(User);
     });
 
-    // Helper utility for adding DW_OP_LLVM_tag_offset to debug-info records,
-    // abstracted over whether they're intrinsic-stored or DbgVariableRecord
-    // stored.
-    auto AnnotateDbgRecord = [&](auto *DPtr) {
-      // Prepend "tag_offset, N" to the dwarf expression.
-      // Tag offset logically applies to the alloca pointer, and it makes sense
-      // to put it at the beginning of the expression.
-      SmallVector<uint64_t, 8> NewOps = {dwarf::DW_OP_LLVM_tag_offset,
-                                         retagMask(N)};
-      for (size_t LocNo = 0; LocNo < DPtr->getNumVariableLocationOps(); ++LocNo)
-        if (DPtr->getVariableLocationOp(LocNo) == AI)
-          DPtr->setExpression(DIExpression::appendOpsToArg(
-              DPtr->getExpression(), NewOps, LocNo));
-      if (auto *DAI = DynCastToDbgAssign(DPtr)) {
-        if (DAI->getAddress() == AI)
-          DAI->setAddressExpression(DIExpression::prependOpcodes(
-              DAI->getAddressExpression(), NewOps));
-      }
-    };
-
-    llvm::for_each(Info.DbgVariableIntrinsics, AnnotateDbgRecord);
-    llvm::for_each(Info.DbgVariableRecords, AnnotateDbgRecord);
+    memtag::annotateDebugRecords(Info, retagMask(N));
 
     auto TagEnd = [&](Instruction *Node) {
       IRB.SetInsertPoint(Node);
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index e5ef0333696d..2b504b893ddb 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -1135,6 +1135,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   std::unique_ptr<VarArgHelper> VAHelper;
   const TargetLibraryInfo *TLI;
   Instruction *FnPrologueEnd;
+  SmallVector<Instruction *, 16> Instructions;
 
   // The following flags disable parts of MSan instrumentation based on
   // exclusion list contents and command-line options.
@@ -1520,6 +1521,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     for (BasicBlock *BB : depth_first(FnPrologueEnd->getParent()))
       visit(*BB);
 
+    // `visit` above only collects instructions. Process them after iterating
+    // CFG to avoid requirement on CFG transformations.
+    for (Instruction *I : Instructions)
+      InstVisitor<MemorySanitizerVisitor>::visit(*I);
+
     // Finalize PHI nodes.
     for (PHINode *PN : ShadowPHINodes) {
       PHINode *PNS = cast<PHINode>(getShadow(PN));
@@ -1955,8 +1961,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       unsigned ArgOffset = 0;
       const DataLayout &DL = F->getParent()->getDataLayout();
       for (auto &FArg : F->args()) {
-        if (!FArg.getType()->isSized()) {
-          LLVM_DEBUG(dbgs() << "Arg is not sized\n");
+        if (!FArg.getType()->isSized() || FArg.getType()->isScalableTy()) {
+          LLVM_DEBUG(dbgs() << (FArg.getType()->isScalableTy()
+                                    ? "vscale not fully supported\n"
+                                    : "Arg is not sized\n"));
+          if (A == &FArg) {
+            ShadowPtr = getCleanShadow(V);
+            setOrigin(A, getCleanOrigin());
+            break;
+          }
           continue;
         }
 
@@ -2189,7 +2202,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       setOrigin(&I, getCleanOrigin());
       return;
     }
-    InstVisitor<MemorySanitizerVisitor>::visit(I);
+
+    Instructions.push_back(&I);
   }
 
   /// Instrument LoadInst
@@ -2506,6 +2520,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
                           bool Signed = false) {
     Type *srcTy = V->getType();
+    if (srcTy == dstTy)
+      return V;
     size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
     size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
     if (srcSizeInBits > 1 && dstSizeInBits == 1)
@@ -4196,6 +4212,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
         LLVM_DEBUG(dbgs() << "Arg " << i << " is not sized: " << CB << "\n");
         continue;
       }
+
+      if (A->getType()->isScalableTy()) {
+        LLVM_DEBUG(dbgs() << "Arg  " << i << " is vscale: " << CB << "\n");
+        // Handle as noundef, but don't reserve tls slots.
+        insertShadowCheck(A, &CB);
+        continue;
+      }
+
       unsigned Size = 0;
       const DataLayout &DL = F.getParent()->getDataLayout();
 
diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
new file mode 100644
index 000000000000..9d6dd5ccb38b
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
@@ -0,0 +1,24 @@
+//===- PGOCtxProfLowering.cpp - Contextual PGO Instr. Lowering ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+static cl::list<std::string> ContextRoots(
+    "profile-context-root", cl::Hidden,
+    cl::desc(
+        "A function name, assumed to be global, which will be treated as the "
+        "root of an interesting graph, which will be profiled independently "
+        "from other similar graphs."));
+
+bool PGOCtxProfLoweringPass::isContextualIRPGOEnabled() {
+  return !ContextRoots.empty();
+}
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index a7b7556685e4..b333b1582e80 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -110,6 +110,7 @@
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Instrumentation/BlockCoverageInference.h"
 #include "llvm/Transforms/Instrumentation/CFGMST.h"
+#include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/MisExpect.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -333,6 +334,20 @@ extern cl::opt<bool> EnableVTableValueProfiling;
 extern cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate;
 } // namespace llvm
 
+bool shouldInstrumentEntryBB() {
+  return PGOInstrumentEntry ||
+         PGOCtxProfLoweringPass::isContextualIRPGOEnabled();
+}
+
+// FIXME(mtrofin): re-enable this for ctx profiling, for non-indirect calls. Ctx
+// profiling implicitly captures indirect call cases, but not other values.
+// Supporting other values is relatively straight-forward - just another counter
+// range within the context.
+bool isValueProfilingDisabled() {
+  return DisableValueProfiling ||
+         PGOCtxProfLoweringPass::isContextualIRPGOEnabled();
+}
+
 // Return a string describing the branch condition that can be
 // used in static branch probability heuristics:
 static std::string getBranchCondString(Instruction *TI) {
@@ -379,7 +394,7 @@ static GlobalVariable *createIRLevelProfileFlagVar(Module &M, bool IsCS) {
   uint64_t ProfileVersion = (INSTR_PROF_RAW_VERSION | VARIANT_MASK_IR_PROF);
   if (IsCS)
     ProfileVersion |= VARIANT_MASK_CSIR_PROF;
-  if (PGOInstrumentEntry)
+  if (shouldInstrumentEntryBB())
     ProfileVersion |= VARIANT_MASK_INSTR_ENTRY;
   if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
     ProfileVersion |= VARIANT_MASK_DBG_CORRELATE;
@@ -861,7 +876,7 @@ static void instrumentOneFunc(
   }
 
   FuncPGOInstrumentation<PGOEdge, PGOBBInfo> FuncInfo(
-      F, TLI, ComdatMembers, true, BPI, BFI, IsCS, PGOInstrumentEntry,
+      F, TLI, ComdatMembers, true, BPI, BFI, IsCS, shouldInstrumentEntryBB(),
       PGOBlockCoverage);
 
   auto Name = FuncInfo.FuncNameVar;
@@ -883,6 +898,43 @@ static void instrumentOneFunc(
   unsigned NumCounters =
       InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts();
 
+  if (PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) {
+    auto *CSIntrinsic =
+        Intrinsic::getDeclaration(M, Intrinsic::instrprof_callsite);
+    // We want to count the instrumentable callsites, then instrument them. This
+    // is because the llvm.instrprof.callsite intrinsic has an argument (like
+    // the other instrprof intrinsics) capturing the total number of
+    // instrumented objects (counters, or callsites, in this case). In this
+    // case, we want that value so we can readily pass it to the compiler-rt
+    // APIs that may have to allocate memory based on the nr of callsites.
+    // The traversal logic is the same for both counting and instrumentation,
+    // just needs to be done in succession.
+    auto Visit = [&](llvm::function_ref<void(CallBase * CB)> Visitor) {
+      for (auto &BB : F)
+        for (auto &Instr : BB)
+          if (auto *CS = dyn_cast<CallBase>(&Instr)) {
+            if ((CS->getCalledFunction() &&
+                 CS->getCalledFunction()->isIntrinsic()) ||
+                dyn_cast<InlineAsm>(CS->getCalledOperand()))
+              continue;
+            Visitor(CS);
+          }
+    };
+    // First, count callsites.
+    uint32_t TotalNrCallsites = 0;
+    Visit([&TotalNrCallsites](auto *) { ++TotalNrCallsites; });
+
+    // Now instrument.
+    uint32_t CallsiteIndex = 0;
+    Visit([&](auto *CB) {
+      IRBuilder<> Builder(CB);
+      Builder.CreateCall(CSIntrinsic,
+                         {Name, CFGHash, Builder.getInt32(TotalNrCallsites),
+                          Builder.getInt32(CallsiteIndex++),
+                          CB->getCalledOperand()});
+    });
+  }
+
   uint32_t I = 0;
   if (PGOTemporalInstrumentation) {
     NumCounters += PGOBlockCoverage ? 8 : 1;
@@ -914,7 +966,7 @@ static void instrumentOneFunc(
                                        FuncInfo.FunctionHash);
   assert(I == NumCounters);
 
-  if (DisableValueProfiling)
+  if (isValueProfilingDisabled())
     return;
 
   NumOfPGOICall += FuncInfo.ValueSites[IPVK_IndirectCallTarget].size();
@@ -1676,7 +1728,7 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
 
 // Traverse all valuesites and annotate the instructions for all value kind.
 void PGOUseFunc::annotateValueSites() {
-  if (DisableValueProfiling)
+  if (isValueProfilingDisabled())
     return;
 
   // Create the PGOFuncName meta data.
@@ -1779,7 +1831,7 @@ static bool InstrumentAllFunctions(
     function_ref<BlockFrequencyInfo *(Function &)> LookupBFI, bool IsCS) {
   // For the context-sensitve instrumentation, we should have a separated pass
   // (before LTO/ThinLTO linking) to create these variables.
-  if (!IsCS)
+  if (!IsCS && !PGOCtxProfLoweringPass::isContextualIRPGOEnabled())
     createIRLevelProfileFlagVar(M, /*IsCS=*/false);
 
   Triple TT(M.getTargetTriple());
@@ -2018,6 +2070,8 @@ static bool annotateAllFunctions(
   bool InstrumentFuncEntry = PGOReader->instrEntryBBEnabled();
   if (PGOInstrumentEntry.getNumOccurrences() > 0)
     InstrumentFuncEntry = PGOInstrumentEntry;
+  InstrumentFuncEntry |= PGOCtxProfLoweringPass::isContextualIRPGOEnabled();
+
   bool HasSingleByteCoverage = PGOReader->hasSingleByteCoverage();
   for (auto &F : M) {
     if (skipPGOUse(F))
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 1caed93b1b66..ba2546b8db0e 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -131,7 +131,7 @@ public:
   explicit operator bool() const { return SI && SIUse; }
 };
 
-void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold,
+void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
             std::vector<SelectInstToUnfold> *NewSIsToUnfold,
             std::vector<BasicBlock *> *NewBBs);
 
@@ -142,6 +142,7 @@ public:
       : AC(AC), DT(DT), LI(LI), TTI(TTI), ORE(ORE) {}
 
   bool run(Function &F);
+  bool LoopInfoBroken;
 
 private:
   void
@@ -157,7 +158,7 @@ private:
 
       std::vector<SelectInstToUnfold> NewSIsToUnfold;
       std::vector<BasicBlock *> NewBBs;
-      unfold(&DTU, SIToUnfold, &NewSIsToUnfold, &NewBBs);
+      unfold(&DTU, LI, SIToUnfold, &NewSIsToUnfold, &NewBBs);
 
       // Put newly discovered select instructions into the work list.
       for (const SelectInstToUnfold &NewSIToUnfold : NewSIsToUnfold)
@@ -201,7 +202,7 @@ void createBasicBlockAndSinkSelectInst(
 /// created basic blocks into \p NewBBs.
 ///
 /// TODO: merge it with CodeGenPrepare::optimizeSelectInst() if possible.
-void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold,
+void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
             std::vector<SelectInstToUnfold> *NewSIsToUnfold,
             std::vector<BasicBlock *> *NewBBs) {
   SelectInst *SI = SIToUnfold.getInst();
@@ -307,6 +308,12 @@ void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold,
   DTU->applyUpdates({{DominatorTree::Insert, StartBlock, TT},
                      {DominatorTree::Insert, StartBlock, FT}});
 
+  // Preserve loop info
+  if (Loop *L = LI->getLoopFor(SI->getParent())) {
+    for (BasicBlock *NewBB : *NewBBs)
+      L->addBasicBlockToLoop(NewBB, *LI);
+  }
+
   // The select is now dead.
   assert(SI->use_empty() && "Select must be dead now");
   SI->eraseFromParent();
@@ -522,9 +529,10 @@ private:
 };
 
 struct AllSwitchPaths {
-  AllSwitchPaths(const MainSwitch *MSwitch, OptimizationRemarkEmitter *ORE)
-      : Switch(MSwitch->getInstr()), SwitchBlock(Switch->getParent()),
-        ORE(ORE) {}
+  AllSwitchPaths(const MainSwitch *MSwitch, OptimizationRemarkEmitter *ORE,
+                 LoopInfo *LI)
+      : Switch(MSwitch->getInstr()), SwitchBlock(Switch->getParent()), ORE(ORE),
+        LI(LI) {}
 
   std::vector<ThreadingPath> &getThreadingPaths() { return TPaths; }
   unsigned getNumThreadingPaths() { return TPaths.size(); }
@@ -545,7 +553,7 @@ struct AllSwitchPaths {
       return;
     }
 
-    for (PathType Path : LoopPaths) {
+    for (const PathType &Path : LoopPaths) {
       ThreadingPath TPath;
 
       const BasicBlock *PrevBB = Path.back();
@@ -596,6 +604,12 @@ private:
 
     Visited.insert(BB);
 
+    // Stop if we have reached the BB out of loop, since its successors have no
+    // impact on the DFA.
+    // TODO: Do we need to stop exploring if BB is the outer loop of the switch?
+    if (!LI->getLoopFor(BB))
+      return Res;
+
     // Some blocks have multiple edges to the same successor, and this set
     // is used to prevent a duplicate path from being generated
     SmallSet<BasicBlock *, 4> Successors;
@@ -737,6 +751,7 @@ private:
   BasicBlock *SwitchBlock;
   OptimizationRemarkEmitter *ORE;
   std::vector<ThreadingPath> TPaths;
+  LoopInfo *LI;
 };
 
 struct TransformDFA {
@@ -1283,6 +1298,7 @@ bool DFAJumpThreading::run(Function &F) {
 
   SmallVector<AllSwitchPaths, 2> ThreadableLoops;
   bool MadeChanges = false;
+  LoopInfoBroken = false;
 
   for (BasicBlock &BB : F) {
     auto *SI = dyn_cast<SwitchInst>(BB.getTerminator());
@@ -1304,7 +1320,7 @@ bool DFAJumpThreading::run(Function &F) {
     if (!Switch.getSelectInsts().empty())
       MadeChanges = true;
 
-    AllSwitchPaths SwitchPaths(&Switch, ORE);
+    AllSwitchPaths SwitchPaths(&Switch, ORE, LI);
     SwitchPaths.run();
 
     if (SwitchPaths.getNumThreadingPaths() > 0) {
@@ -1315,10 +1331,15 @@ bool DFAJumpThreading::run(Function &F) {
       // strict requirement but it can cause buggy behavior if there is an
       // overlap of blocks in different opportunities. There is a lot of room to
       // experiment with catching more opportunities here.
+      // NOTE: To release this contraint, we must handle LoopInfo invalidation
       break;
     }
   }
 
+#ifdef NDEBUG
+  LI->verify(*DT);
+#endif
+
   SmallPtrSet<const Value *, 32> EphValues;
   if (ThreadableLoops.size() > 0)
     CodeMetrics::collectEphemeralValues(&F, AC, EphValues);
@@ -1327,6 +1348,7 @@ bool DFAJumpThreading::run(Function &F) {
     TransformDFA Transform(&SwitchPaths, DT, AC, TTI, ORE, EphValues);
     Transform.run();
     MadeChanges = true;
+    LoopInfoBroken = true;
   }
 
 #ifdef EXPENSIVE_CHECKS
@@ -1347,11 +1369,13 @@ PreservedAnalyses DFAJumpThreadingPass::run(Function &F,
   LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
   TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
   OptimizationRemarkEmitter ORE(&F);
-
-  if (!DFAJumpThreading(&AC, &DT, &LI, &TTI, &ORE).run(F))
+  DFAJumpThreading ThreadImpl(&AC, &DT, &LI, &TTI, &ORE);
+  if (!ThreadImpl.run(F))
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
   PA.preserve<DominatorTreeAnalysis>();
+  if (!ThreadImpl.LoopInfoBroken)
+    PA.preserve<LoopAnalysis>();
   return PA;
 }
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index d829e92b2444..b5be8ac24941 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -726,6 +726,69 @@ void GVNPass::ValueTable::verifyRemoved(const Value *V) const {
 }
 
 //===----------------------------------------------------------------------===//
+//                     LeaderMap External Functions
+//===----------------------------------------------------------------------===//
+
+/// Push a new Value to the LeaderTable onto the list for its value number.
+void GVNPass::LeaderMap::insert(uint32_t N, Value *V, const BasicBlock *BB) {
+  LeaderListNode &Curr = NumToLeaders[N];
+  if (!Curr.Entry.Val) {
+    Curr.Entry.Val = V;
+    Curr.Entry.BB = BB;
+    return;
+  }
+
+  LeaderListNode *Node = TableAllocator.Allocate<LeaderListNode>();
+  Node->Entry.Val = V;
+  Node->Entry.BB = BB;
+  Node->Next = Curr.Next;
+  Curr.Next = Node;
+}
+
+/// Scan the list of values corresponding to a given
+/// value number, and remove the given instruction if encountered.
+void GVNPass::LeaderMap::erase(uint32_t N, Instruction *I,
+                               const BasicBlock *BB) {
+  LeaderListNode *Prev = nullptr;
+  LeaderListNode *Curr = &NumToLeaders[N];
+
+  while (Curr && (Curr->Entry.Val != I || Curr->Entry.BB != BB)) {
+    Prev = Curr;
+    Curr = Curr->Next;
+  }
+
+  if (!Curr)
+    return;
+
+  if (Prev) {
+    Prev->Next = Curr->Next;
+  } else {
+    if (!Curr->Next) {
+      Curr->Entry.Val = nullptr;
+      Curr->Entry.BB = nullptr;
+    } else {
+      LeaderListNode *Next = Curr->Next;
+      Curr->Entry.Val = Next->Entry.Val;
+      Curr->Entry.BB = Next->Entry.BB;
+      Curr->Next = Next->Next;
+    }
+  }
+}
+
+void GVNPass::LeaderMap::verifyRemoved(const Value *V) const {
+  // Walk through the value number scope to make sure the instruction isn't
+  // ferreted away in it.
+  for (const auto &I : NumToLeaders) {
+    (void)I;
+    assert(I.second.Entry.Val != V && "Inst still in value numbering scope!");
+    assert(
+        std::none_of(leader_iterator(&I.second), leader_iterator(nullptr),
+                     [=](const LeaderTableEntry &E) { return E.Val == V; }) &&
+        "Inst still in value numbering scope!");
+  }
+}
+
+//===----------------------------------------------------------------------===//
 //                                GVN Pass
 //===----------------------------------------------------------------------===//
 
@@ -1467,7 +1530,7 @@ void GVNPass::eliminatePartiallyRedundantLoad(
         OldLoad->replaceAllUsesWith(NewLoad);
         replaceValuesPerBlockEntry(ValuesPerBlock, OldLoad, NewLoad);
         if (uint32_t ValNo = VN.lookup(OldLoad, false))
-          removeFromLeaderTable(ValNo, OldLoad, OldLoad->getParent());
+          LeaderTable.erase(ValNo, OldLoad, OldLoad->getParent());
         VN.erase(OldLoad);
         removeInstruction(OldLoad);
       }
@@ -2204,10 +2267,9 @@ GVNPass::ValueTable::assignExpNewValueNum(Expression &Exp) {
 /// defined in \p BB.
 bool GVNPass::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB,
                                          GVNPass &Gvn) {
-  LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
-  while (Vals && Vals->BB == BB)
-    Vals = Vals->Next;
-  return !Vals;
+  return all_of(
+      Gvn.LeaderTable.getLeaders(Num),
+      [=](const LeaderMap::LeaderTableEntry &L) { return L.BB == BB; });
 }
 
 /// Wrap phiTranslateImpl to provide caching functionality.
@@ -2229,12 +2291,11 @@ bool GVNPass::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum,
                                            const BasicBlock *PhiBlock,
                                            GVNPass &Gvn) {
   CallInst *Call = nullptr;
-  LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
-  while (Vals) {
-    Call = dyn_cast<CallInst>(Vals->Val);
+  auto Leaders = Gvn.LeaderTable.getLeaders(Num);
+  for (const auto &Entry : Leaders) {
+    Call = dyn_cast<CallInst>(Entry.Val);
     if (Call && Call->getParent() == PhiBlock)
       break;
-    Vals = Vals->Next;
   }
 
   if (AA->doesNotAccessMemory(Call))
@@ -2327,23 +2388,17 @@ void GVNPass::ValueTable::eraseTranslateCacheEntry(
 // question.  This is fast because dominator tree queries consist of only
 // a few comparisons of DFS numbers.
 Value *GVNPass::findLeader(const BasicBlock *BB, uint32_t num) {
-  LeaderTableEntry Vals = LeaderTable[num];
-  if (!Vals.Val) return nullptr;
+  auto Leaders = LeaderTable.getLeaders(num);
+  if (Leaders.empty())
+    return nullptr;
 
   Value *Val = nullptr;
-  if (DT->dominates(Vals.BB, BB)) {
-    Val = Vals.Val;
-    if (isa<Constant>(Val)) return Val;
-  }
-
-  LeaderTableEntry* Next = Vals.Next;
-  while (Next) {
-    if (DT->dominates(Next->BB, BB)) {
-      if (isa<Constant>(Next->Val)) return Next->Val;
-      if (!Val) Val = Next->Val;
+  for (const auto &Entry : Leaders) {
+    if (DT->dominates(Entry.BB, BB)) {
+      Val = Entry.Val;
+      if (isa<Constant>(Val))
+        return Val;
     }
-
-    Next = Next->Next;
   }
 
   return Val;
@@ -2452,7 +2507,7 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS,
     // have the simple case where the edge dominates the end.
     if (RootDominatesEnd && !isa<Instruction>(RHS) &&
         canReplacePointersIfEqual(LHS, RHS, DL))
-      addToLeaderTable(LVN, RHS, Root.getEnd());
+      LeaderTable.insert(LVN, RHS, Root.getEnd());
 
     // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope.  As
     // LHS always has at least one use that is not dominated by Root, this will
@@ -2546,7 +2601,7 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS,
       // The leader table only tracks basic blocks, not edges. Only add to if we
       // have the simple case where the edge dominates the end.
       if (RootDominatesEnd)
-        addToLeaderTable(Num, NotVal, Root.getEnd());
+        LeaderTable.insert(Num, NotVal, Root.getEnd());
 
       continue;
     }
@@ -2596,7 +2651,7 @@ bool GVNPass::processInstruction(Instruction *I) {
       return true;
 
     unsigned Num = VN.lookupOrAdd(Load);
-    addToLeaderTable(Num, Load, Load->getParent());
+    LeaderTable.insert(Num, Load, Load->getParent());
     return false;
   }
 
@@ -2664,7 +2719,7 @@ bool GVNPass::processInstruction(Instruction *I) {
   // Allocations are always uniquely numbered, so we can save time and memory
   // by fast failing them.
   if (isa<AllocaInst>(I) || I->isTerminator() || isa<PHINode>(I)) {
-    addToLeaderTable(Num, I, I->getParent());
+    LeaderTable.insert(Num, I, I->getParent());
     return false;
   }
 
@@ -2672,7 +2727,7 @@ bool GVNPass::processInstruction(Instruction *I) {
   // need to do a lookup to see if the number already exists
   // somewhere in the domtree: it can't!
   if (Num >= NextNum) {
-    addToLeaderTable(Num, I, I->getParent());
+    LeaderTable.insert(Num, I, I->getParent());
     return false;
   }
 
@@ -2681,7 +2736,7 @@ bool GVNPass::processInstruction(Instruction *I) {
   Value *Repl = findLeader(I->getParent(), Num);
   if (!Repl) {
     // Failure, just remember this instance for future use.
-    addToLeaderTable(Num, I, I->getParent());
+    LeaderTable.insert(Num, I, I->getParent());
     return false;
   }
 
@@ -2876,7 +2931,7 @@ bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
   VN.add(Instr, Num);
 
   // Update the availability map to include the new instruction.
-  addToLeaderTable(Num, Instr, Pred);
+  LeaderTable.insert(Num, Instr, Pred);
   return true;
 }
 
@@ -3027,13 +3082,13 @@ bool GVNPass::performScalarPRE(Instruction *CurInst) {
   // After creating a new PHI for ValNo, the phi translate result for ValNo will
   // be changed, so erase the related stale entries in phi translate cache.
   VN.eraseTranslateCacheEntry(ValNo, *CurrentBlock);
-  addToLeaderTable(ValNo, Phi, CurrentBlock);
+  LeaderTable.insert(ValNo, Phi, CurrentBlock);
   Phi->setDebugLoc(CurInst->getDebugLoc());
   CurInst->replaceAllUsesWith(Phi);
   if (MD && Phi->getType()->isPtrOrPtrVectorTy())
     MD->invalidateCachedPointerInfo(Phi);
   VN.erase(CurInst);
-  removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+  LeaderTable.erase(ValNo, CurInst, CurrentBlock);
 
   LLVM_DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
   removeInstruction(CurInst);
@@ -3127,7 +3182,6 @@ void GVNPass::cleanupGlobalSets() {
   VN.clear();
   LeaderTable.clear();
   BlockRPONumber.clear();
-  TableAllocator.Reset();
   ICF->clear();
   InvalidBlockRPONumbers = true;
 }
@@ -3147,18 +3201,7 @@ void GVNPass::removeInstruction(Instruction *I) {
 /// internal data structures.
 void GVNPass::verifyRemoved(const Instruction *Inst) const {
   VN.verifyRemoved(Inst);
-
-  // Walk through the value number scope to make sure the instruction isn't
-  // ferreted away in it.
-  for (const auto &I : LeaderTable) {
-    const LeaderTableEntry *Node = &I.second;
-    assert(Node->Val != Inst && "Inst still in value numbering scope!");
-
-    while (Node->Next) {
-      Node = Node->Next;
-      assert(Node->Val != Inst && "Inst still in value numbering scope!");
-    }
-  }
+  LeaderTable.verifyRemoved(Inst);
 }
 
 /// BB is declared dead, which implied other blocks become dead as well. This
@@ -3285,7 +3328,7 @@ void GVNPass::assignValNumForDeadCode() {
   for (BasicBlock *BB : DeadBlocks) {
     for (Instruction &Inst : *BB) {
       unsigned ValNum = VN.lookupOrAdd(&Inst);
-      addToLeaderTable(ValNum, &Inst, BB);
+      LeaderTable.insert(ValNum, &Inst, BB);
     }
   }
 }
diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 9df28747570c..104e8ceb7967 100644
--- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -279,6 +279,9 @@ bool InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
   Value *LHS = ICI->getOperand(0);
   Value *RHS = ICI->getOperand(1);
 
+  if (!LHS->getType()->isIntegerTy())
+    return false;
+
   // Canonicalize to the `Index Pred Invariant` comparison
   if (IsLoopInvariant(LHS)) {
     std::swap(LHS, RHS);
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index ffcb511e6a83..08d82fa66da3 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1876,7 +1876,7 @@ bool JumpThreadingPass::processBranchOnXOR(BinaryOperator *BO) {
 static void addPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
                                             BasicBlock *OldPred,
                                             BasicBlock *NewPred,
-                                     DenseMap<Instruction*, Value*> &ValueMap) {
+                                            ValueToValueMapTy &ValueMap) {
   for (PHINode &PN : PHIBB->phis()) {
     // Ok, we have a PHI node.  Figure out what the incoming value was for the
     // DestBlock.
@@ -1884,7 +1884,7 @@ static void addPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
 
     // Remap the value if necessary.
     if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
-      DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst);
+      ValueToValueMapTy::iterator I = ValueMap.find(Inst);
       if (I != ValueMap.end())
         IV = I->second;
     }
@@ -1945,9 +1945,8 @@ bool JumpThreadingPass::maybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) {
 
 /// Update the SSA form.  NewBB contains instructions that are copied from BB.
 /// ValueMapping maps old values in BB to new ones in NewBB.
-void JumpThreadingPass::updateSSA(
-    BasicBlock *BB, BasicBlock *NewBB,
-    DenseMap<Instruction *, Value *> &ValueMapping) {
+void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB,
+                                  ValueToValueMapTy &ValueMapping) {
   // If there were values defined in BB that are used outside the block, then we
   // now have to update all uses of the value to use either the original value,
   // the cloned value, or some PHI derived value.  This can require arbitrary
@@ -2008,14 +2007,15 @@ void JumpThreadingPass::updateSSA(
 /// Clone instructions in range [BI, BE) to NewBB.  For PHI nodes, we only clone
 /// arguments that come from PredBB.  Return the map from the variables in the
 /// source basic block to the variables in the newly created basic block.
-DenseMap<Instruction *, Value *>
-JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
-                                     BasicBlock::iterator BE, BasicBlock *NewBB,
-                                     BasicBlock *PredBB) {
+
+void JumpThreadingPass::cloneInstructions(ValueToValueMapTy &ValueMapping,
+                                          BasicBlock::iterator BI,
+                                          BasicBlock::iterator BE,
+                                          BasicBlock *NewBB,
+                                          BasicBlock *PredBB) {
   // We are going to have to map operands from the source basic block to the new
   // copy of the block 'NewBB'.  If there are PHI nodes in the source basic
   // block, evaluate them to account for entry from PredBB.
-  DenseMap<Instruction *, Value *> ValueMapping;
 
   // Retargets llvm.dbg.value to any renamed variables.
   auto RetargetDbgValueIfPossible = [&](Instruction *NewInst) -> bool {
@@ -2103,7 +2103,7 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
     // Remap operands to patch up intra-block references.
     for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
       if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
-        DenseMap<Instruction *, Value *>::iterator I = ValueMapping.find(Inst);
+        ValueToValueMapTy::iterator I = ValueMapping.find(Inst);
         if (I != ValueMapping.end())
           New->setOperand(i, I->second);
       }
@@ -2120,7 +2120,7 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
       RetargetDbgVariableRecordIfPossible(&DVR);
   }
 
-  return ValueMapping;
+  return;
 }
 
 /// Attempt to thread through two successive basic blocks.
@@ -2295,8 +2295,9 @@ void JumpThreadingPass::threadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
   // We are going to have to map operands from the original BB block to the new
   // copy of the block 'NewBB'.  If there are PHI nodes in PredBB, evaluate them
   // to account for entry from PredPredBB.
-  DenseMap<Instruction *, Value *> ValueMapping =
-      cloneInstructions(PredBB->begin(), PredBB->end(), NewBB, PredPredBB);
+  ValueToValueMapTy ValueMapping;
+  cloneInstructions(ValueMapping, PredBB->begin(), PredBB->end(), NewBB,
+                    PredPredBB);
 
   // Copy the edge probabilities from PredBB to NewBB.
   if (BPI)
@@ -2419,8 +2420,9 @@ void JumpThreadingPass::threadEdge(BasicBlock *BB,
   }
 
   // Copy all the instructions from BB to NewBB except the terminator.
-  DenseMap<Instruction *, Value *> ValueMapping =
-      cloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB);
+  ValueToValueMapTy ValueMapping;
+  cloneInstructions(ValueMapping, BB->begin(), std::prev(BB->end()), NewBB,
+                    PredBB);
 
   // We didn't copy the terminator from BB over to NewBB, because there is now
   // an unconditional jump to SuccBB.  Insert the unconditional jump.
@@ -2675,7 +2677,7 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
 
   // We are going to have to map operands from the original BB block into the
   // PredBB block.  Evaluate PHI nodes in BB.
-  DenseMap<Instruction*, Value*> ValueMapping;
+  ValueToValueMapTy ValueMapping;
 
   BasicBlock::iterator BI = BB->begin();
   for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
@@ -2689,11 +2691,14 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
     // Remap operands to patch up intra-block references.
     for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
       if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
-        DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+        ValueToValueMapTy::iterator I = ValueMapping.find(Inst);
         if (I != ValueMapping.end())
           New->setOperand(i, I->second);
       }
 
+    // Remap debug variable operands.
+    remapDebugVariable(ValueMapping, New);
+
     // If this instruction can be simplified after the operands are updated,
     // just use the simplified value instead.  This frequently happens due to
     // phi translation.
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index edddfb1b9240..059900f357e6 100644
--- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -126,8 +126,10 @@ struct StoreToLoadForwardingCandidate {
 
     // We don't need to check non-wrapping here because forward/backward
     // dependence wouldn't be valid if these weren't monotonic accesses.
-    auto *Dist = cast<SCEVConstant>(
+    auto *Dist = dyn_cast<SCEVConstant>(
         PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
+    if (!Dist)
+      return false;
     const APInt &Val = Dist->getAPInt();
     return Val == TypeByteSize * StrideLoad;
   }
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 1036b8ae963a..7ef5dceffec0 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -99,7 +99,7 @@ struct MemsetRange {
   MaybeAlign Alignment;
 
   /// TheStores - The actual stores that make up this range.
-  SmallVector<Instruction*, 16> TheStores;
+  SmallVector<Instruction *, 16> TheStores;
 
   bool isProfitableToUseMemset(const DataLayout &DL) const;
 };
@@ -108,10 +108,12 @@ struct MemsetRange {
 
 bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
   // If we found more than 4 stores to merge or 16 bytes, use memset.
-  if (TheStores.size() >= 4 || End-Start >= 16) return true;
+  if (TheStores.size() >= 4 || End - Start >= 16)
+    return true;
 
   // If there is nothing to merge, don't do anything.
-  if (TheStores.size() < 2) return false;
+  if (TheStores.size() < 2)
+    return false;
 
   // If any of the stores are a memset, then it is always good to extend the
   // memset.
@@ -121,7 +123,8 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
 
   // Assume that the code generator is capable of merging pairs of stores
   // together if it wants to.
-  if (TheStores.size() == 2) return false;
+  if (TheStores.size() == 2)
+    return false;
 
   // If we have fewer than 8 stores, it can still be worthwhile to do this.
   // For example, merging 4 i8 stores into an i32 store is useful almost always.
@@ -133,7 +136,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
   // the maximum GPR width is the same size as the largest legal integer
   // size. If so, check to see whether we will end up actually reducing the
   // number of stores used.
-  unsigned Bytes = unsigned(End-Start);
+  unsigned Bytes = unsigned(End - Start);
   unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
   if (MaxIntSize == 0)
     MaxIntSize = 1;
@@ -145,7 +148,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
   // If we will reduce the # stores (according to this heuristic), do the
   // transformation.  This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
   // etc.
-  return TheStores.size() > NumPointerStores+NumByteStores;
+  return TheStores.size() > NumPointerStores + NumByteStores;
 }
 
 namespace {
@@ -197,7 +200,7 @@ public:
 /// existing ranges as appropriate.
 void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
                             MaybeAlign Alignment, Instruction *Inst) {
-  int64_t End = Start+Size;
+  int64_t End = Start + Size;
 
   range_iterator I = partition_point(
       Ranges, [=](const MemsetRange &O) { return O.End < Start; });
@@ -207,10 +210,10 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
   // to insert a new range.  Handle this now.
   if (I == Ranges.end() || End < I->Start) {
     MemsetRange &R = *Ranges.insert(I, MemsetRange());
-    R.Start        = Start;
-    R.End          = End;
-    R.StartPtr     = Ptr;
-    R.Alignment    = Alignment;
+    R.Start = Start;
+    R.End = End;
+    R.StartPtr = Ptr;
+    R.Alignment = Alignment;
     R.TheStores.push_back(Inst);
     return;
   }
@@ -397,7 +400,8 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
 
     if (auto *NextStore = dyn_cast<StoreInst>(BI)) {
       // If this is a store, see if we can merge it in.
-      if (!NextStore->isSimple()) break;
+      if (!NextStore->isSimple())
+        break;
 
       Value *StoredVal = NextStore->getValueOperand();
 
@@ -460,7 +464,8 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
   // emit memset's for anything big enough to be worthwhile.
   Instruction *AMemSet = nullptr;
   for (const MemsetRange &Range : Ranges) {
-    if (Range.TheStores.size() == 1) continue;
+    if (Range.TheStores.size() == 1)
+      continue;
 
     // If it is profitable to lower this range to memset, do so now.
     if (!Range.isProfitableToUseMemset(DL))
@@ -481,12 +486,10 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
     if (!Range.TheStores.empty())
       AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
 
-    auto *NewDef =
-        cast<MemoryDef>(MemInsertPoint->getMemoryInst() == &*BI
-                            ? MSSAU->createMemoryAccessBefore(
-                                  AMemSet, nullptr, MemInsertPoint)
-                            : MSSAU->createMemoryAccessAfter(
-                                  AMemSet, nullptr, MemInsertPoint));
+    auto *NewDef = cast<MemoryDef>(
+        MemInsertPoint->getMemoryInst() == &*BI
+            ? MSSAU->createMemoryAccessBefore(AMemSet, nullptr, MemInsertPoint)
+            : MSSAU->createMemoryAccessAfter(AMemSet, nullptr, MemInsertPoint));
     MSSAU->insertDef(NewDef, /*RenameUses=*/true);
     MemInsertPoint = NewDef;
 
@@ -512,12 +515,13 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
 
   // Keep track of the arguments of all instruction we plan to lift
   // so we can make sure to lift them as well if appropriate.
-  DenseSet<Instruction*> Args;
+  DenseSet<Instruction *> Args;
   auto AddArg = [&](Value *Arg) {
     auto *I = dyn_cast<Instruction>(Arg);
     if (I && I->getParent() == SI->getParent()) {
       // Cannot hoist user of P above P
-      if (I == P) return false;
+      if (I == P)
+        return false;
       Args.insert(I);
     }
     return true;
@@ -630,8 +634,7 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
 bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
                                        const DataLayout &DL,
                                        BasicBlock::iterator &BBI) {
-  if (!LI->isSimple() || !LI->hasOneUse() ||
-      LI->getParent() != SI->getParent())
+  if (!LI->isSimple() || !LI->hasOneUse() || LI->getParent() != SI->getParent())
     return false;
 
   auto *T = LI->getType();
@@ -678,21 +681,20 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
         UseMemMove = true;
 
       IRBuilder<> Builder(P);
-      Value *Size = Builder.CreateTypeSize(Builder.getInt64Ty(),
-                                           DL.getTypeStoreSize(T));
+      Value *Size =
+          Builder.CreateTypeSize(Builder.getInt64Ty(), DL.getTypeStoreSize(T));
       Instruction *M;
       if (UseMemMove)
-        M = Builder.CreateMemMove(
-            SI->getPointerOperand(), SI->getAlign(),
-            LI->getPointerOperand(), LI->getAlign(), Size);
+        M = Builder.CreateMemMove(SI->getPointerOperand(), SI->getAlign(),
+                                  LI->getPointerOperand(), LI->getAlign(),
+                                  Size);
       else
-        M = Builder.CreateMemCpy(
-            SI->getPointerOperand(), SI->getAlign(),
-            LI->getPointerOperand(), LI->getAlign(), Size);
+        M = Builder.CreateMemCpy(SI->getPointerOperand(), SI->getAlign(),
+                                 LI->getPointerOperand(), LI->getAlign(), Size);
       M->copyMetadata(*SI, LLVMContext::MD_DIAssignID);
 
-      LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
-                        << *M << "\n");
+      LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => " << *M
+                        << "\n");
 
       auto *LastDef =
           cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI));
@@ -755,7 +757,8 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
 }
 
 bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
-  if (!SI->isSimple()) return false;
+  if (!SI->isSimple())
+    return false;
 
   // Avoid merging nontemporal stores since the resulting
   // memcpy/memset would not be able to preserve the nontemporal hint.
@@ -794,8 +797,8 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
   // 0xA0A0A0A0 and 0.0.
   auto *V = SI->getOperand(0);
   if (Value *ByteVal = isBytewiseValue(V, DL)) {
-    if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
-                                              ByteVal)) {
+    if (Instruction *I =
+            tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) {
       BBI = I->getIterator(); // Don't invalidate iterator.
       return true;
     }
@@ -816,8 +819,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
       // The newly inserted memset is immediately overwritten by the original
       // store, so we do not need to rename uses.
       auto *StoreDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI));
-      auto *NewAccess = MSSAU->createMemoryAccessBefore(
-          M, nullptr, StoreDef);
+      auto *NewAccess = MSSAU->createMemoryAccessBefore(M, nullptr, StoreDef);
       MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/false);
 
       eraseInstruction(SI);
@@ -836,8 +838,8 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
   // See if there is another memset or store neighboring this memset which
   // allows us to widen out the memset to do a single larger store.
   if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
-    if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
-                                              MSI->getValue())) {
+    if (Instruction *I =
+            tryMergingIntoMemset(MSI, MSI->getDest(), MSI->getValue())) {
       BBI = I->getIterator(); // Don't invalidate iterator.
       return true;
     }
@@ -850,7 +852,8 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
 bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
                                          Instruction *cpyStore, Value *cpyDest,
                                          Value *cpySrc, TypeSize cpySize,
-                                         Align cpyDestAlign, BatchAAResults &BAA,
+                                         Align cpyDestAlign,
+                                         BatchAAResults &BAA,
                                          std::function<CallInst *()> GetC) {
   // The general transformation to keep in mind is
   //
@@ -898,15 +901,15 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
     if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
       return false;
 
-
   if (C->getParent() != cpyStore->getParent()) {
     LLVM_DEBUG(dbgs() << "Call Slot: block local restriction\n");
     return false;
   }
 
-  MemoryLocation DestLoc = isa<StoreInst>(cpyStore) ?
-    MemoryLocation::get(cpyStore) :
-    MemoryLocation::getForDest(cast<MemCpyInst>(cpyStore));
+  MemoryLocation DestLoc =
+      isa<StoreInst>(cpyStore)
+          ? MemoryLocation::get(cpyStore)
+          : MemoryLocation::getForDest(cast<MemCpyInst>(cpyStore));
 
   // Check that nothing touches the dest of the copy between
   // the call and the store/memcpy.
@@ -1175,7 +1178,8 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
 
   // If all checks passed, then we can transform M.
   LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n"
-                    << *MDep << '\n' << *M << '\n');
+                    << *MDep << '\n'
+                    << *M << '\n');
 
   // TODO: Is this worth it if we're creating a less aligned memcpy? For
   // example we could be moving from movaps -> movq on x86.
@@ -1307,8 +1311,8 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
   // memcpy's defining access is the memset about to be removed.
   auto *LastDef =
       cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
-  auto *NewAccess = MSSAU->createMemoryAccessBefore(
-      NewMemSet, nullptr, LastDef);
+  auto *NewAccess =
+      MSSAU->createMemoryAccessBefore(NewMemSet, nullptr, LastDef);
   MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
 
   eraseInstruction(MemSet);
@@ -1384,7 +1388,7 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
       return false;
 
     // A known memcpy size is also required.
-    auto  *CCopySize = dyn_cast<ConstantInt>(CopySize);
+    auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
     if (!CCopySize)
       return false;
     if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) {
@@ -1655,7 +1659,8 @@ static bool isZeroSize(Value *Size) {
 /// altogether.
 bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
   // We can only optimize non-volatile memcpy's.
-  if (M->isVolatile()) return false;
+  if (M->isVolatile())
+    return false;
 
   // If the source and destination of the memcpy are the same, then zap it.
   if (M->getSource() == M->getDest()) {
@@ -1796,11 +1801,10 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
                     << "\n");
 
   // If not, then we know we can transform this.
-  Type *ArgTys[3] = { M->getRawDest()->getType(),
-                      M->getRawSource()->getType(),
-                      M->getLength()->getType() };
-  M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
-                                                 Intrinsic::memcpy, ArgTys));
+  Type *ArgTys[3] = {M->getRawDest()->getType(), M->getRawSource()->getType(),
+                     M->getLength()->getType()};
+  M->setCalledFunction(
+      Intrinsic::getDeclaration(M->getModule(), Intrinsic::memcpy, ArgTys));
 
   // For MemorySSA nothing really changes (except that memcpy may imply stricter
   // aliasing guarantees).
@@ -1843,7 +1847,8 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
   // Get the alignment of the byval.  If the call doesn't specify the alignment,
   // then it is some target specific value that we can't know.
   MaybeAlign ByValAlign = CB.getParamAlign(ArgNo);
-  if (!ByValAlign) return false;
+  if (!ByValAlign)
+    return false;
 
   // If it is greater than the memcpy, then we check to see if we can force the
   // source of the memcpy to the alignment we need.  If we fail, we bail out.
@@ -1987,7 +1992,7 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
       continue;
 
     for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
-        // Avoid invalidating the iterator.
+      // Avoid invalidating the iterator.
       Instruction *I = &*BI++;
 
       bool RepeatInstruction = false;
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 5396038d8b92..51fc28ef90ef 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -333,6 +333,10 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
   // Finally, erase the old block and update dominator info.
   DeleteDeadBlock(BB, DTU);
 
+  // Remove redundant "llvm.dbg" instrunctions after blocks have been merged.
+  if (PredBB->getParent()->getSubprogram())
+    RemoveRedundantDbgInstrs(PredBB);
+
   return true;
 }
 
@@ -1401,13 +1405,13 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
   if (OldLatch) {
     BasicBlock *NewLatch = L->getLoopLatch();
     if (NewLatch != OldLatch) {
-      MDNode *MD = OldLatch->getTerminator()->getMetadata("llvm.loop");
-      NewLatch->getTerminator()->setMetadata("llvm.loop", MD);
+      MDNode *MD = OldLatch->getTerminator()->getMetadata(LLVMContext::MD_loop);
+      NewLatch->getTerminator()->setMetadata(LLVMContext::MD_loop, MD);
       // It's still possible that OldLatch is the latch of another inner loop,
       // in which case we do not remove the metadata.
       Loop *IL = LI->getLoopFor(OldLatch);
       if (IL && IL->getLoopLatch() != OldLatch)
-        OldLatch->getTerminator()->setMetadata("llvm.loop", nullptr);
+        OldLatch->getTerminator()->setMetadata(LLVMContext::MD_loop, nullptr);
     }
   }
 
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index ed0ed345435c..e97506b4bbd9 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -1255,7 +1255,7 @@ static void setRetExtAttr(Function &F,
 }
 
 // Modeled after X86TargetLowering::markLibCallAttributes.
-static void markRegisterParameterAttributes(Function *F) {
+void llvm::markRegisterParameterAttributes(Function *F) {
   if (!F->arg_size() || F->isVarArg())
     return;
 
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 42e648484416..303a09805a9d 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -14,7 +14,6 @@
 
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -541,13 +540,18 @@ void PruningFunctionCloner::CloneBlock(
       RemapInstruction(NewInst, VMap,
                        ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
 
-      // Eagerly constant fold the newly cloned instruction. If successful, add
-      // a mapping to the new value. Non-constant operands may be incomplete at
-      // this stage, thus instruction simplification is performed after
-      // processing phi-nodes.
-      if (Value *V = ConstantFoldInstruction(
-              NewInst, BB->getModule()->getDataLayout())) {
-        if (isInstructionTriviallyDead(NewInst)) {
+      // If we can simplify this instruction to some other value, simply add
+      // a mapping to that value rather than inserting a new instruction into
+      // the basic block.
+      if (Value *V =
+              simplifyInstruction(NewInst, BB->getModule()->getDataLayout())) {
+        // On the off-chance that this simplifies to an instruction in the old
+        // function, map it back into the new function.
+        if (NewFunc != OldFunc)
+          if (Value *MappedV = VMap.lookup(V))
+            V = MappedV;
+
+        if (!NewInst->mayHaveSideEffects()) {
           VMap[&*II] = V;
           NewInst->eraseFromParent();
           continue;
@@ -819,34 +823,52 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
     }
   }
 
-  // As phi-nodes have been now remapped, allow incremental simplification of
-  // newly-cloned instructions.
+  // Make a second pass over the PHINodes now that all of them have been
+  // remapped into the new function, simplifying the PHINode and performing any
+  // recursive simplifications exposed. This will transparently update the
+  // WeakTrackingVH in the VMap. Notably, we rely on that so that if we coalesce
+  // two PHINodes, the iteration over the old PHIs remains valid, and the
+  // mapping will just map us to the new node (which may not even be a PHI
+  // node).
   const DataLayout &DL = NewFunc->getParent()->getDataLayout();
-  for (const auto &BB : *OldFunc) {
-    for (const auto &I : BB) {
-      auto *NewI = dyn_cast_or_null<Instruction>(VMap.lookup(&I));
-      if (!NewI)
-        continue;
-
-      // Skip over non-intrinsic callsites, we don't want to remove any nodes
-      // from the CGSCC.
-      CallBase *CB = dyn_cast<CallBase>(NewI);
-      if (CB && CB->getCalledFunction() &&
-          !CB->getCalledFunction()->isIntrinsic())
-        continue;
-
-      if (Value *V = simplifyInstruction(NewI, DL)) {
-        NewI->replaceAllUsesWith(V);
-
-        if (isInstructionTriviallyDead(NewI)) {
-          NewI->eraseFromParent();
-        } else {
-          // Did not erase it? Restore the new instruction into VMap previously
-          // dropped by `ValueIsRAUWd`.
-          VMap[&I] = NewI;
-        }
-      }
-    }
+  SmallSetVector<const Value *, 8> Worklist;
+  for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx)
+    if (isa<PHINode>(VMap[PHIToResolve[Idx]]))
+      Worklist.insert(PHIToResolve[Idx]);
+
+  // Note that we must test the size on each iteration, the worklist can grow.
+  for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
+    const Value *OrigV = Worklist[Idx];
+    auto *I = dyn_cast_or_null<Instruction>(VMap.lookup(OrigV));
+    if (!I)
+      continue;
+
+    // Skip over non-intrinsic callsites, we don't want to remove any nodes from
+    // the CGSCC.
+    CallBase *CB = dyn_cast<CallBase>(I);
+    if (CB && CB->getCalledFunction() &&
+        !CB->getCalledFunction()->isIntrinsic())
+      continue;
+
+    // See if this instruction simplifies.
+    Value *SimpleV = simplifyInstruction(I, DL);
+    if (!SimpleV)
+      continue;
+
+    // Stash away all the uses of the old instruction so we can check them for
+    // recursive simplifications after a RAUW. This is cheaper than checking all
+    // uses of To on the recursive step in most cases.
+    for (const User *U : OrigV->users())
+      Worklist.insert(cast<Instruction>(U));
+
+    // Replace the instruction with its simplified value.
+    I->replaceAllUsesWith(SimpleV);
+
+    // If the original instruction had no side effects, remove it.
+    if (isInstructionTriviallyDead(I))
+      I->eraseFromParent();
+    else
+      VMap[OrigV] = I;
   }
 
   // Remap debug intrinsic operands now that all values have been mapped.
@@ -1109,6 +1131,9 @@ BasicBlock *llvm::DuplicateInstructionsInSplitBetween(
         if (I != ValueMapping.end())
           New->setOperand(i, I->second);
       }
+
+    // Remap debug variable operands.
+    remapDebugVariable(ValueMapping, New);
   }
 
   return NewBB;
diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index f4207474e9a6..59a7dd1a00ed 100644
--- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -137,7 +137,8 @@ static bool runOnFunction(Function &F, bool PostInlining) {
 
 PreservedAnalyses
 llvm::EntryExitInstrumenterPass::run(Function &F, FunctionAnalysisManager &AM) {
-  runOnFunction(F, PostInlining);
+  if (!runOnFunction(F, PostInlining))
+    return PreservedAnalyses::all();
   PreservedAnalyses PA;
   PA.preserveSet<CFGAnalyses>();
   return PA;
diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
index c5aded3c45f4..b177e048faae 100644
--- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
@@ -172,9 +172,14 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
           return true;
         GS.StoredType = GlobalStatus::Stored;
       } else if (const auto *CB = dyn_cast<CallBase>(I)) {
-        if (!CB->isCallee(&U))
-          return true;
-        GS.IsLoaded = true;
+        if (CB->getIntrinsicID() == Intrinsic::threadlocal_address) {
+          if (analyzeGlobalAux(I, GS, VisitedUsers))
+            return true;
+        } else {
+          if (!CB->isCallee(&U))
+            return true;
+          GS.IsLoaded = true;
+        }
       } else {
         return true; // Any other non-load instruction might take address!
       }
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 5f456092bf4e..f3cd3104c312 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3685,6 +3685,30 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C,
   return nullptr;
 }
 
+void llvm::remapDebugVariable(ValueToValueMapTy &Mapping, Instruction *Inst) {
+  auto RemapDebugOperands = [&Mapping](auto *DV, auto Set) {
+    for (auto *Op : Set) {
+      auto I = Mapping.find(Op);
+      if (I != Mapping.end())
+        DV->replaceVariableLocationOp(Op, I->second, /*AllowEmpty=*/true);
+    }
+  };
+  auto RemapAssignAddress = [&Mapping](auto *DA) {
+    auto I = Mapping.find(DA->getAddress());
+    if (I != Mapping.end())
+      DA->setAddress(I->second);
+  };
+  if (auto DVI = dyn_cast<DbgVariableIntrinsic>(Inst))
+    RemapDebugOperands(DVI, DVI->location_ops());
+  if (auto DAI = dyn_cast<DbgAssignIntrinsic>(Inst))
+    RemapAssignAddress(DAI);
+  for (DbgVariableRecord &DVR : filterDbgVars(Inst->getDbgRecordRange())) {
+    RemapDebugOperands(&DVR, DVR.location_ops());
+    if (DVR.isDbgAssign())
+      RemapAssignAddress(&DVR);
+  }
+}
+
 namespace {
 
 /// A potential constituent of a bitreverse or bswap expression. See
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 0f55af3b6edd..5cd96412a322 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -287,7 +287,7 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI,
     return;
 
   SmallVector<uint32_t, 2> Weights;
-  extractFromBranchWeightMD(WeightMD, Weights);
+  extractFromBranchWeightMD32(WeightMD, Weights);
   if (Weights.size() != 2)
     return;
   uint32_t OrigLoopExitWeight = Weights[0];
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 73c5d6367822..e3e09d11ba8c 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1930,10 +1930,12 @@ llvm::hasPartialIVCondition(const Loop &L, unsigned MSSAThreshold,
   if (!TI || !TI->isConditional())
     return {};
 
-  auto *CondI = dyn_cast<CmpInst>(TI->getCondition());
+  auto *CondI = dyn_cast<Instruction>(TI->getCondition());
   // The case with the condition outside the loop should already be handled
   // earlier.
-  if (!CondI || !L.contains(CondI))
+  // Allow CmpInst and TruncInsts as they may be users of load instructions
+  // and have potential for partial unswitching
+  if (!CondI || !isa<CmpInst, TruncInst>(CondI) || !L.contains(CondI))
     return {};
 
   SmallVector<Instruction *> InstToDuplicate;
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 7b1eb70168d8..0464ba5e1811 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/StackSafetyAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -283,5 +284,37 @@ Value *getAndroidSlotPtr(IRBuilder<> &IRB, int Slot) {
                                 IRB.CreateCall(ThreadPointerFunc), 8 * Slot);
 }
 
+static DbgAssignIntrinsic *DynCastToDbgAssign(DbgVariableIntrinsic *DVI) {
+  return dyn_cast<DbgAssignIntrinsic>(DVI);
+}
+
+static DbgVariableRecord *DynCastToDbgAssign(DbgVariableRecord *DVR) {
+  return DVR->isDbgAssign() ? DVR : nullptr;
+}
+
+void annotateDebugRecords(AllocaInfo &Info, unsigned int Tag) {
+  // Helper utility for adding DW_OP_LLVM_tag_offset to debug-info records,
+  // abstracted over whether they're intrinsic-stored or DbgVariableRecord
+  // stored.
+  auto AnnotateDbgRecord = [&](auto *DPtr) {
+    // Prepend "tag_offset, N" to the dwarf expression.
+    // Tag offset logically applies to the alloca pointer, and it makes sense
+    // to put it at the beginning of the expression.
+    SmallVector<uint64_t, 8> NewOps = {dwarf::DW_OP_LLVM_tag_offset, Tag};
+    for (size_t LocNo = 0; LocNo < DPtr->getNumVariableLocationOps(); ++LocNo)
+      if (DPtr->getVariableLocationOp(LocNo) == Info.AI)
+        DPtr->setExpression(
+            DIExpression::appendOpsToArg(DPtr->getExpression(), NewOps, LocNo));
+    if (auto *DAI = DynCastToDbgAssign(DPtr)) {
+      if (DAI->getAddress() == Info.AI)
+        DAI->setAddressExpression(
+            DIExpression::prependOpcodes(DAI->getAddressExpression(), NewOps));
+    }
+  };
+
+  llvm::for_each(Info.DbgVariableIntrinsics, AnnotateDbgRecord);
+  llvm::for_each(Info.DbgVariableRecords, AnnotateDbgRecord);
+}
+
 } // namespace memtag
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 3eda669eb8a7..5a44a11ecfd2 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1066,11 +1066,8 @@ static int ConstantIntSortPredicate(ConstantInt *const *P1,
 static void GetBranchWeights(Instruction *TI,
                              SmallVectorImpl<uint64_t> &Weights) {
   MDNode *MD = TI->getMetadata(LLVMContext::MD_prof);
-  assert(MD);
-  for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) {
-    ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(i));
-    Weights.push_back(CI->getValue().getZExtValue());
-  }
+  assert(MD && "Invalid branch-weight metadata");
+  extractFromBranchWeightMD64(MD, Weights);
 
   // If TI is a conditional eq, the default case is the false case,
   // and the corresponding branch-weight data is at index 2. We swap the
@@ -7524,6 +7521,13 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu
                                       SI->getPointerAddressSpace())) &&
                SI->getPointerOperand() == I;
 
+    // llvm.assume(false/undef) always triggers immediate UB.
+    if (auto *Assume = dyn_cast<AssumeInst>(Use)) {
+      // Ignore assume operand bundles.
+      if (I == Assume->getArgOperand(0))
+        return true;
+    }
+
     if (auto *CB = dyn_cast<CallBase>(Use)) {
       if (C->isNullValue() && NullPointerIsDefined(CB->getFunction()))
         return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33c4decd58a6..d1c54b928f9f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -460,9 +460,9 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
   // must use intrinsics to interleave.
   if (VecTy->isScalableTy()) {
     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
-    return Builder.CreateIntrinsic(
-        WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
-        /*FMFSource=*/nullptr, Name);
+    return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
+                                   Vals,
+                                   /*FMFSource=*/nullptr, Name);
   }
 
   // Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2517,9 +2517,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
       SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
       auto *MaskTy =
           VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
-      return Builder.CreateIntrinsic(
-          MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
-          /*FMFSource=*/nullptr, "interleaved.mask");
+      return Builder.CreateIntrinsic(MaskTy, Intrinsic::vector_interleave2, Ops,
+                                     /*FMFSource=*/nullptr, "interleaved.mask");
     }
 
     if (!BlockInMask)
@@ -2571,7 +2570,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
         // Scalable vectors cannot use arbitrary shufflevectors (only splats),
         // so must use intrinsics to deinterleave.
         Value *DI = Builder.CreateIntrinsic(
-            Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
+            Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
             /*FMFSource=*/nullptr, "strided.vec");
         unsigned J = 0;
         for (unsigned I = 0; I < InterleaveFactor; ++I) {
@@ -4167,7 +4166,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
 
   // Worklist containing uniform instructions demanding lane 0.
   SetVector<Instruction *> Worklist;
-  BasicBlock *Latch = TheLoop->getLoopLatch();
 
   // Add uniform instructions demanding lane 0 to the worklist. Instructions
   // that are scalar with predication must not be considered uniform after
@@ -4189,12 +4187,16 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
     Worklist.insert(I);
   };
 
-  // Start with the conditional branch. If the branch condition is an
-  // instruction contained in the loop that is only used by the branch, it is
-  // uniform.
-  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
-  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
-    addToWorklistIfAllowed(Cmp);
+  // Start with the conditional branches exiting the loop. If the branch
+  // condition is an instruction contained in the loop that is only used by the
+  // branch, it is uniform.
+  SmallVector<BasicBlock *> Exiting;
+  TheLoop->getExitingBlocks(Exiting);
+  for (BasicBlock *E : Exiting) {
+    auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
+    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+      addToWorklistIfAllowed(Cmp);
+  }
 
   auto PrevVF = VF.divideCoefficientBy(2);
   // Return true if all lanes perform the same memory operation, and we can
@@ -4335,6 +4337,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   // nodes separately. An induction variable will remain uniform if all users
   // of the induction variable and induction variable update remain uniform.
   // The code below handles both pointer and non-pointer induction variables.
+  BasicBlock *Latch = TheLoop->getLoopLatch();
   for (const auto &Induction : Legal->getInductionVars()) {
     auto *Ind = Induction.first;
     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -6873,11 +6876,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
     // In cases of scalarized and predicated instructions, there will be VF
     // predicated blocks in the vectorized loop. Each branch around these
     // blocks requires also an extract of its vector compare i1 element.
+    // Note that the conditional branch from the loop latch will be replaced by
+    // a single branch controlling the loop, so there is no extra overhead from
+    // scalarization.
     bool ScalarPredicatedBB = false;
     BranchInst *BI = cast<BranchInst>(I);
     if (VF.isVector() && BI->isConditional() &&
         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
-         PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
+         PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
+        BI->getParent() != TheLoop->getLoopLatch())
       ScalarPredicatedBB = true;
 
     if (ScalarPredicatedBB) {
@@ -8265,6 +8272,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
     return nullptr;
 
   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
+  Ops.push_back(Operands.back());
 
   // Is it beneficial to perform intrinsic call compared to lib call?
   bool ShouldUseVectorIntrinsic =
@@ -8275,7 +8283,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                 },
                 Range);
   if (ShouldUseVectorIntrinsic)
-    return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
+    return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID,
                                  CI->getDebugLoc());
 
   Function *Variant = nullptr;
@@ -8328,7 +8336,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
       Ops.insert(Ops.begin() + *MaskPos, Mask);
     }
 
-    return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
+    return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()),
                                  Intrinsic::not_intrinsic, CI->getDebugLoc(),
                                  Variant);
   }
@@ -8817,12 +8825,24 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     // Only handle constant strides for now.
     if (!ScevStride)
       continue;
-    Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
 
-    auto *ConstVPV = Plan->getOrAddLiveIn(CI);
-    // The versioned value may not be used in the loop directly, so just add a
-    // new live-in in those cases.
-    Plan->getOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
+    auto *CI = Plan->getOrAddLiveIn(
+        ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
+    if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
+      StrideVPV->replaceAllUsesWith(CI);
+
+    // The versioned value may not be used in the loop directly but through a
+    // sext/zext. Add new live-ins in those cases.
+    for (Value *U : StrideV->users()) {
+      if (!isa<SExtInst, ZExtInst>(U))
+        continue;
+      VPValue *StrideVPV = Plan->getLiveIn(U);
+      if (!StrideVPV)
+        continue;
+      VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(
+          U->getType(), ScevStride->getAPInt().getSExtValue()));
+      StrideVPV->replaceAllUsesWith(CI);
+    }
   }
 
   VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a1a28076881c..bc553c5009ed 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -216,6 +216,9 @@ static const unsigned MaxMemDepDistance = 160;
 /// regions to be handled.
 static const int MinScheduleRegionSize = 16;
 
+/// Maximum allowed number of operands in the PHI nodes.
+static const unsigned MaxPHINumOperands = 128;
+
 /// Predicate for the element types that the SLP vectorizer supports.
 ///
 /// The most important thing to filter here are types which are invalid in LLVM
@@ -1135,6 +1138,7 @@ public:
     ScalarToTreeEntry.clear();
     MultiNodeScalars.clear();
     MustGather.clear();
+    NonScheduledFirst.clear();
     EntryToLastInstruction.clear();
     ExternalUses.clear();
     ExternalUsesAsGEPs.clear();
@@ -1252,7 +1256,7 @@ public:
   /// effectively impossible for the backend to undo.
   /// TODO: If load combining is allowed in the IR optimizer, this analysis
   ///       may not be necessary.
-  bool isLoadCombineCandidate() const;
+  bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
 
   /// Checks if the given array of loads can be represented as a vectorized,
   /// scatter or just simple gather.
@@ -1391,12 +1395,19 @@ public:
         return LookAheadHeuristics::ScoreSplat;
       }
 
+      auto CheckSameEntryOrFail = [&]() {
+        if (const TreeEntry *TE1 = R.getTreeEntry(V1);
+            TE1 && TE1 == R.getTreeEntry(V2))
+          return LookAheadHeuristics::ScoreSplatLoads;
+        return LookAheadHeuristics::ScoreFail;
+      };
+
       auto *LI1 = dyn_cast<LoadInst>(V1);
       auto *LI2 = dyn_cast<LoadInst>(V2);
       if (LI1 && LI2) {
         if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
             !LI2->isSimple())
-          return LookAheadHeuristics::ScoreFail;
+          return CheckSameEntryOrFail();
 
         std::optional<int> Dist = getPointersDiff(
             LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
@@ -1408,7 +1419,7 @@ public:
                   FixedVectorType::get(LI1->getType(), NumLanes),
                   LI1->getAlign()))
             return LookAheadHeuristics::ScoreMaskedGatherCandidate;
-          return LookAheadHeuristics::ScoreFail;
+          return CheckSameEntryOrFail();
         }
         // The distance is too large - still may be profitable to use masked
         // loads/gathers.
@@ -1465,14 +1476,14 @@ public:
           }
           return LookAheadHeuristics::ScoreAltOpcodes;
         }
-        return LookAheadHeuristics::ScoreFail;
+        return CheckSameEntryOrFail();
       }
 
       auto *I1 = dyn_cast<Instruction>(V1);
       auto *I2 = dyn_cast<Instruction>(V2);
       if (I1 && I2) {
         if (I1->getParent() != I2->getParent())
-          return LookAheadHeuristics::ScoreFail;
+          return CheckSameEntryOrFail();
         SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
         Ops.push_back(I1);
         Ops.push_back(I2);
@@ -1493,7 +1504,7 @@ public:
       if (isa<UndefValue>(V2))
         return LookAheadHeuristics::ScoreUndef;
 
-      return LookAheadHeuristics::ScoreFail;
+      return CheckSameEntryOrFail();
     }
 
     /// Go through the operands of \p LHS and \p RHS recursively until
@@ -1656,6 +1667,7 @@ public:
     const DataLayout &DL;
     ScalarEvolution &SE;
     const BoUpSLP &R;
+    const Loop *L = nullptr;
 
     /// \returns the operand data at \p OpIdx and \p Lane.
     OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -1824,8 +1836,9 @@ public:
       // Track if the operand must be marked as used. If the operand is set to
       // Score 1 explicitly (because of non power-of-2 unique scalars, we may
       // want to reestimate the operands again on the following iterations).
-      bool IsUsed =
-          RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
+      bool IsUsed = RMode == ReorderingMode::Splat ||
+                    RMode == ReorderingMode::Constant ||
+                    RMode == ReorderingMode::Load;
       // Iterate through all unused operands and look for the best.
       for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
         // Get the operand at Idx and Lane.
@@ -1846,23 +1859,44 @@ public:
         // Look for an operand that matches the current mode.
         switch (RMode) {
         case ReorderingMode::Load:
-        case ReorderingMode::Constant:
         case ReorderingMode::Opcode: {
           bool LeftToRight = Lane > LastLane;
           Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
           Value *OpRight = (LeftToRight) ? Op : OpLastLane;
           int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                         OpIdx, Idx, IsUsed);
-          if (Score > static_cast<int>(BestOp.Score)) {
+          if (Score > static_cast<int>(BestOp.Score) ||
+              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
+               Idx == OpIdx)) {
             BestOp.Idx = Idx;
             BestOp.Score = Score;
             BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
           }
           break;
         }
+        case ReorderingMode::Constant:
+          if (isa<Constant>(Op) ||
+              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
+            BestOp.Idx = Idx;
+            if (isa<Constant>(Op)) {
+              BestOp.Score = LookAheadHeuristics::ScoreConstants;
+              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
+                  LookAheadHeuristics::ScoreConstants;
+            }
+            if (isa<UndefValue>(Op) || !isa<Constant>(Op))
+              IsUsed = false;
+          }
+          break;
         case ReorderingMode::Splat:
-          if (Op == OpLastLane)
+          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
+            IsUsed = Op == OpLastLane;
+            if (Op == OpLastLane) {
+              BestOp.Score = LookAheadHeuristics::ScoreSplat;
+              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
+                  LookAheadHeuristics::ScoreSplat;
+            }
             BestOp.Idx = Idx;
+          }
           break;
         case ReorderingMode::Failed:
           llvm_unreachable("Not expected Failed reordering mode.");
@@ -2055,10 +2089,12 @@ public:
     void clear() { OpsVec.clear(); }
 
     /// \Returns true if there are enough operands identical to \p Op to fill
-    /// the whole vector.
+    /// the whole vector (it is mixed with constants or loop invariant values).
     /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
     bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
       bool OpAPO = getData(OpIdx, Lane).APO;
+      bool IsInvariant = L && L->isLoopInvariant(Op);
+      unsigned Cnt = 0;
       for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
         if (Ln == Lane)
           continue;
@@ -2068,22 +2104,51 @@ public:
           OperandData &Data = getData(OpI, Ln);
           if (Data.APO != OpAPO || Data.IsUsed)
             continue;
-          if (Data.V == Op) {
+          Value *OpILane = getValue(OpI, Lane);
+          bool IsConstantOp = isa<Constant>(OpILane);
+          // Consider the broadcast candidate if:
+          // 1. Same value is found in one of the operands.
+          if (Data.V == Op ||
+              // 2. The operand in the given lane is not constant but there is a
+              // constant operand in another lane (which can be moved to the
+              // given lane). In this case we can represent it as a simple
+              // permutation of constant and broadcast.
+              (!IsConstantOp &&
+               ((Lns > 2 && isa<Constant>(Data.V)) ||
+                // 2.1. If we have only 2 lanes, need to check that value in the
+                // next lane does not build same opcode sequence.
+                (Lns == 2 &&
+                 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
+                      .getOpcode() &&
+                 isa<Constant>(Data.V)))) ||
+              // 3. The operand in the current lane is loop invariant (can be
+              // hoisted out) and another operand is also a loop invariant
+              // (though not a constant). In this case the whole vector can be
+              // hoisted out.
+              // FIXME: need to teach the cost model about this case for better
+              // estimation.
+              (IsInvariant && !isa<Constant>(Data.V) &&
+               !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
+               L->isLoopInvariant(Data.V))) {
             FoundCandidate = true;
-            Data.IsUsed = true;
+            Data.IsUsed = Data.V == Op;
+            if (Data.V == Op)
+              ++Cnt;
             break;
           }
         }
         if (!FoundCandidate)
           return false;
       }
-      return true;
+      return getNumLanes() == 2 || Cnt > 1;
     }
 
   public:
     /// Initialize with all the operands of the instruction vector \p RootVL.
     VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
-        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
+        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
+          L(R.LI->getLoopFor(
+              (cast<Instruction>(RootVL.front())->getParent()))) {
       // Append all the operands of RootVL.
       appendOperandsOfVL(RootVL);
     }
@@ -2215,8 +2280,6 @@ public:
                 // getBestOperand().
                 swap(OpIdx, *BestIdx, Lane);
               } else {
-                // We failed to find a best operand, set mode to 'Failed'.
-                ReorderingModes[OpIdx] = ReorderingMode::Failed;
                 // Enable the second pass.
                 StrategyFailed = true;
               }
@@ -2356,6 +2419,14 @@ public:
   bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
     return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
   }
+  /// Checks if the given value is gathered in one of the nodes.
+  bool isGathered(const Value *V) const {
+    return MustGather.contains(V);
+  }
+  /// Checks if the specified value was not schedule.
+  bool isNotScheduled(const Value *V) const {
+    return NonScheduledFirst.contains(V);
+  }
 
   /// Check if the value is vectorized in the tree.
   bool isVectorized(Value *V) const { return getTreeEntry(V); }
@@ -2478,12 +2549,12 @@ private:
   /// which exploits values reused across lanes, and arranges the inserts
   /// for ease of later optimization.
   template <typename BVTy, typename ResTy, typename... Args>
-  ResTy processBuildVector(const TreeEntry *E, Args &...Params);
+  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
 
   /// Create a new vector from a list of scalar values.  Produces a sequence
   /// which exploits values reused across lanes, and arranges the inserts
   /// for ease of later optimization.
-  Value *createBuildVector(const TreeEntry *E);
+  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
 
   /// Returns the instruction in the bundle, which can be used as a base point
   /// for scheduling. Usually it is the last instruction in the bundle, except
@@ -2547,7 +2618,8 @@ private:
   /// this subtree gets vectorized, we may need to extract the values from the
   /// roots. This method calculates the cost of extracting the values.
   /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
-  InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
+  InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
+                                Type *ScalarTy) const;
 
   /// Set the Builder insert point to one after the last instruction in
   /// the bundle
@@ -2555,7 +2627,7 @@ private:
 
   /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
   /// specified, the starting vector value is poison.
-  Value *gather(ArrayRef<Value *> VL, Value *Root);
+  Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
 
   /// \returns whether the VectorizableTree is fully vectorizable and will
   /// be beneficial even the tree height is tiny.
@@ -3071,6 +3143,9 @@ private:
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
+  /// A set of first non-schedulable values.
+  ValueSet NonScheduledFirst;
+
   /// A map between the vectorized entries and the last instructions in the
   /// bundles. The bundles are built in use order, not in the def order of the
   /// instructions. So, we cannot rely directly on the last instruction in the
@@ -4251,6 +4326,11 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
 }
 
+static std::pair<InstructionCost, InstructionCost>
+getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
+            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
+            Type *ScalarTy, VectorType *VecTy);
+
 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
     ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
     SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
@@ -4389,31 +4469,56 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
         if (VectorizedCnt == VL.size() / VF) {
           // Compare masked gather cost and loads + insersubvector costs.
           TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-          InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
-              Instruction::Load, VecTy,
-              cast<LoadInst>(VL0)->getPointerOperand(),
-              /*VariableMask=*/false, CommonAlignment, CostKind);
+          auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
+              TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
+              CostKind, ScalarTy, VecTy);
+          InstructionCost MaskedGatherCost =
+              TTI.getGatherScatterOpCost(
+                  Instruction::Load, VecTy,
+                  cast<LoadInst>(VL0)->getPointerOperand(),
+                  /*VariableMask=*/false, CommonAlignment, CostKind) +
+              VectorGEPCost - ScalarGEPCost;
           InstructionCost VecLdCost = 0;
           auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
           for (auto [I, LS] : enumerate(States)) {
             auto *LI0 = cast<LoadInst>(VL[I * VF]);
             switch (LS) {
-            case LoadsState::Vectorize:
+            case LoadsState::Vectorize: {
+              auto [ScalarGEPCost, VectorGEPCost] =
+                  getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                              LI0->getPointerOperand(), Instruction::Load,
+                              CostKind, ScalarTy, SubVecTy);
               VecLdCost += TTI.getMemoryOpCost(
-                  Instruction::Load, SubVecTy, LI0->getAlign(),
-                  LI0->getPointerAddressSpace(), CostKind,
-                  TTI::OperandValueInfo());
+                               Instruction::Load, SubVecTy, LI0->getAlign(),
+                               LI0->getPointerAddressSpace(), CostKind,
+                               TTI::OperandValueInfo()) +
+                           VectorGEPCost - ScalarGEPCost;
               break;
-            case LoadsState::StridedVectorize:
-              VecLdCost += TTI.getStridedMemoryOpCost(
-                  Instruction::Load, SubVecTy, LI0->getPointerOperand(),
-                  /*VariableMask=*/false, CommonAlignment, CostKind);
+            }
+            case LoadsState::StridedVectorize: {
+              auto [ScalarGEPCost, VectorGEPCost] =
+                  getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                              LI0->getPointerOperand(), Instruction::Load,
+                              CostKind, ScalarTy, SubVecTy);
+              VecLdCost +=
+                  TTI.getStridedMemoryOpCost(
+                      Instruction::Load, SubVecTy, LI0->getPointerOperand(),
+                      /*VariableMask=*/false, CommonAlignment, CostKind) +
+                  VectorGEPCost - ScalarGEPCost;
               break;
-            case LoadsState::ScatterVectorize:
-              VecLdCost += TTI.getGatherScatterOpCost(
-                  Instruction::Load, SubVecTy, LI0->getPointerOperand(),
-                  /*VariableMask=*/false, CommonAlignment, CostKind);
+            }
+            case LoadsState::ScatterVectorize: {
+              auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
+                  TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                  LI0->getPointerOperand(), Instruction::GetElementPtr,
+                  CostKind, ScalarTy, SubVecTy);
+              VecLdCost +=
+                  TTI.getGatherScatterOpCost(
+                      Instruction::Load, SubVecTy, LI0->getPointerOperand(),
+                      /*VariableMask=*/false, CommonAlignment, CostKind) +
+                  VectorGEPCost - ScalarGEPCost;
               break;
+            }
             case LoadsState::Gather:
               llvm_unreachable(
                   "Expected only consecutive, strided or masked gather loads.");
@@ -4422,13 +4527,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
             for (int Idx : seq<int>(0, VL.size()))
               ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
             VecLdCost +=
-                TTI.getShuffleCost(TTI ::SK_InsertSubvector, VecTy,
-                                   ShuffleMask, CostKind, I * VF, SubVecTy);
+                TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
+                                   CostKind, I * VF, SubVecTy);
           }
           // If masked gather cost is higher - better to vectorize, so
           // consider it as a gather node. It will be better estimated
           // later.
-          if (MaskedGatherCost > VecLdCost)
+          if (MaskedGatherCost >= VecLdCost)
             return true;
         }
       }
@@ -5988,6 +6093,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   auto *VL0 = cast<Instruction>(S.OpValue);
   switch (ShuffleOrOp) {
   case Instruction::PHI: {
+    // Too many operands - gather, most probably won't be vectorized.
+    if (VL0->getNumOperands() > MaxPHINumOperands)
+      return TreeEntry::NeedToGather;
     // Check for terminator values (e.g. invoke).
     for (Value *V : VL)
       for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
@@ -6294,6 +6402,85 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   }
 }
 
+namespace {
+/// Allows to correctly handle operands of the phi nodes based on the \p Main
+/// PHINode order of incoming basic blocks/values.
+class PHIHandler {
+  DominatorTree &DT;
+  PHINode *Main = nullptr;
+  SmallVector<Value *> Phis;
+  SmallVector<SmallVector<Value *>> Operands;
+
+public:
+  PHIHandler() = delete;
+  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
+      : DT(DT), Main(Main), Phis(Phis),
+        Operands(Main->getNumIncomingValues(),
+                 SmallVector<Value *>(Phis.size(), nullptr)) {}
+  void buildOperands() {
+    constexpr unsigned FastLimit = 4;
+    if (Main->getNumIncomingValues() <= FastLimit) {
+      for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
+        BasicBlock *InBB = Main->getIncomingBlock(I);
+        if (!DT.isReachableFromEntry(InBB)) {
+          Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
+          continue;
+        }
+        // Prepare the operand vector.
+        for (auto [Idx, V] : enumerate(Phis)) {
+          auto *P = cast<PHINode>(V);
+          if (P->getIncomingBlock(I) == InBB)
+            Operands[I][Idx] = P->getIncomingValue(I);
+          else
+            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
+        }
+      }
+      return;
+    }
+    SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
+    for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
+      BasicBlock *InBB = Main->getIncomingBlock(I);
+      if (!DT.isReachableFromEntry(InBB)) {
+        Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
+        continue;
+      }
+      Blocks.try_emplace(InBB).first->second.push_back(I);
+    }
+    for (auto [Idx, V] : enumerate(Phis)) {
+      auto *P = cast<PHINode>(V);
+      for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
+        BasicBlock *InBB = P->getIncomingBlock(I);
+        if (InBB == Main->getIncomingBlock(I)) {
+          if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
+            continue;
+          Operands[I][Idx] = P->getIncomingValue(I);
+          continue;
+        }
+        auto It = Blocks.find(InBB);
+        if (It == Blocks.end())
+          continue;
+        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
+      }
+    }
+    for (const auto &P : Blocks) {
+      if (P.getSecond().size() <= 1)
+        continue;
+      unsigned BasicI = P.getSecond().front();
+      for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
+        assert(all_of(enumerate(Operands[I]),
+                      [&](const auto &Data) {
+                        return !Data.value() ||
+                               Data.value() == Operands[BasicI][Data.index()];
+                      }) &&
+               "Expected empty operands list.");
+        Operands[I] = Operands[BasicI];
+      }
+    }
+  }
+  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
+};
+} // namespace
+
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             const EdgeInfo &UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -6646,6 +6833,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
            "tryScheduleBundle should cancelScheduling on failure");
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                  ReuseShuffleIndicies);
+    NonScheduledFirst.insert(VL.front());
     return;
   }
   LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -6661,24 +6849,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
 
       // Keeps the reordered operands to avoid code duplication.
-      SmallVector<ValueList, 2> OperandsVec;
-      for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
-        if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {
-          ValueList Operands(VL.size(), PoisonValue::get(PH->getType()));
-          TE->setOperand(I, Operands);
-          OperandsVec.push_back(Operands);
-          continue;
-        }
-        ValueList Operands;
-        // Prepare the operand vector.
-        for (Value *V : VL)
-          Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
-              PH->getIncomingBlock(I)));
-        TE->setOperand(I, Operands);
-        OperandsVec.push_back(Operands);
-      }
-      for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
-        buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
+      PHIHandler Handler(*DT, PH, VL);
+      Handler.buildOperands();
+      for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
+        TE->setOperand(I, Handler.getOperands(I));
+      for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
+        buildTree_rec(Handler.getOperands(I), Depth + 1, {TE, I});
       return;
     }
     case Instruction::ExtractValue:
@@ -7805,7 +7981,13 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 
     ScalarCost =
         TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
-    if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
+    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
+    if (!BaseGEP) {
+      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
+      if (It != Ptrs.end())
+        BaseGEP = cast<GEPOperator>(*It);
+    }
+    if (BaseGEP) {
       SmallVector<const Value *> Indices(BaseGEP->indices());
       VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                                BaseGEP->getPointerOperand(), Indices, VecTy,
@@ -7847,6 +8029,33 @@ void BoUpSLP::transformNodes() {
       }
       break;
     }
+    case Instruction::Store: {
+      Type *ScalarTy =
+          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
+      auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
+      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
+      // Check if profitable to represent consecutive load + reverse as strided
+      // load with stride -1.
+      if (isReverseOrder(E.ReorderIndices) &&
+          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+        SmallVector<int> Mask;
+        inversePermutation(E.ReorderIndices, Mask);
+        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
+        InstructionCost OriginalVecCost =
+            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
+                                 BaseSI->getPointerAddressSpace(), CostKind,
+                                 TTI::OperandValueInfo()) +
+            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
+            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
+        if (StridedCost < OriginalVecCost)
+          // Strided load is more profitable than consecutive load + reverse -
+          // transform the node to strided load.
+          E.State = TreeEntry::StridedVectorize;
+      }
+      break;
+    }
     default:
       break;
     }
@@ -7863,6 +8072,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
   bool IsFinalized = false;
   SmallVector<int> CommonMask;
   SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
+  Type *ScalarTy = nullptr;
   const TargetTransformInfo &TTI;
   InstructionCost Cost = 0;
   SmallDenseSet<Value *> VectorizedVals;
@@ -7892,13 +8102,13 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
   InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
     if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
       return TTI::TCC_Free;
-    auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
+    auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
     InstructionCost GatherCost = 0;
     SmallVector<Value *> Gathers(VL.begin(), VL.end());
     // Improve gather cost for gather of loads, if we can group some of the
     // loads into vector loads.
     InstructionsState S = getSameOpcode(VL, *R.TLI);
-    const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
+    const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
     unsigned MinVF = R.getMinVF(2 * Sz);
     if (VL.size() > 2 &&
         ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
@@ -7912,7 +8122,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                  }))) &&
         !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
         !isSplat(Gathers)) {
-      InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
+      InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
       SetVector<Value *> VectorizedLoads;
       SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
       SmallVector<unsigned> ScatterVectorized;
@@ -8040,7 +8250,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                                              VecTy, Mask, CostKind);
             }
           } else {
-            GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true);
+            GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
+                                          PointerOps.front()->getType());
           }
         }
         if (NeedInsertSubvectorAnalysis) {
@@ -8074,18 +8285,19 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       transform(VL, ShuffleMask.begin(), [](Value *V) {
         return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
       });
-      InstructionCost InsertCost = TTI.getVectorInstrCost(
-          Instruction::InsertElement, VecTy, CostKind, 0,
-          PoisonValue::get(VecTy), *It);
-      return InsertCost +
-             TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
-                                ShuffleMask, CostKind, /*Index=*/0,
-                                /*SubTp=*/nullptr, /*Args=*/*It);
+      InstructionCost InsertCost =
+          TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
+                                 PoisonValue::get(VecTy), *It);
+      return InsertCost + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
+                                             VecTy, ShuffleMask, CostKind,
+                                             /*Index=*/0, /*SubTp=*/nullptr,
+                                             /*Args=*/*It);
     }
     return GatherCost +
            (all_of(Gathers, IsaPred<UndefValue>)
                 ? TTI::TCC_Free
-                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
+                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
+                                  ScalarTy));
   };
 
   /// Compute the cost of creating a vector containing the extracted values from
@@ -8105,8 +8317,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
             return Sz;
           return std::max(Sz, VecTy->getNumElements());
         });
-    unsigned NumSrcRegs = TTI.getNumberOfParts(
-        FixedVectorType::get(VL.front()->getType(), NumElts));
+    unsigned NumSrcRegs =
+        TTI.getNumberOfParts(FixedVectorType::get(ScalarTy, NumElts));
     if (NumSrcRegs == 0)
       NumSrcRegs = 1;
     // FIXME: this must be moved to TTI for better estimation.
@@ -8152,17 +8364,16 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       std::optional<TTI::ShuffleKind> RegShuffleKind =
           CheckPerRegistersShuffle(SubMask);
       if (!RegShuffleKind) {
-        Cost += ::getShuffleCost(
-            TTI, *ShuffleKinds[Part],
-            FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
+        Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
+                                 FixedVectorType::get(ScalarTy, NumElts),
+                                 MaskSlice);
         continue;
       }
       if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
           !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
-        Cost += ::getShuffleCost(
-            TTI, *RegShuffleKind,
-            FixedVectorType::get(VL.front()->getType(), EltsPerVector),
-            SubMask);
+        Cost += ::getShuffleCost(TTI, *RegShuffleKind,
+                                 FixedVectorType::get(ScalarTy, EltsPerVector),
+                                 SubMask);
       }
     }
     return Cost;
@@ -8279,6 +8490,48 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     SmallVector<int> CommonMask(Mask.begin(), Mask.end());
     Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
     unsigned CommonVF = Mask.size();
+    InstructionCost ExtraCost = 0;
+    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
+                                        unsigned VF) -> InstructionCost {
+      if (E.State == TreeEntry::NeedToGather && allConstant(E.Scalars))
+        return TTI::TCC_Free;
+      Type *EScalarTy = E.Scalars.front()->getType();
+      bool IsSigned = true;
+      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
+        EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
+        IsSigned = It->second.second;
+      }
+      if (EScalarTy != ScalarTy) {
+        unsigned CastOpcode = Instruction::Trunc;
+        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
+        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
+        if (DstSz > SrcSz)
+          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
+        return TTI.getCastInstrCost(CastOpcode,
+                                    FixedVectorType::get(ScalarTy, VF),
+                                    FixedVectorType::get(EScalarTy, VF),
+                                    TTI::CastContextHint::None, CostKind);
+      }
+      return TTI::TCC_Free;
+    };
+    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
+      if (isa<Constant>(V))
+        return TTI::TCC_Free;
+      auto *VecTy = cast<VectorType>(V->getType());
+      Type *EScalarTy = VecTy->getElementType();
+      if (EScalarTy != ScalarTy) {
+        bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
+        unsigned CastOpcode = Instruction::Trunc;
+        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
+        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
+        if (DstSz > SrcSz)
+          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
+        return TTI.getCastInstrCost(
+            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
+            VecTy, TTI::CastContextHint::None, CostKind);
+      }
+      return TTI::TCC_Free;
+    };
     if (!V1 && !V2 && !P2.isNull()) {
       // Shuffle 2 entry nodes.
       const TreeEntry *E = P1.get<const TreeEntry *>();
@@ -8305,11 +8558,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
           }
         }
         CommonVF = E->Scalars.size();
+        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
+                     GetNodeMinBWAffectedCost(*E2, CommonVF);
+      } else {
+        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
+                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
       }
-      V1 = Constant::getNullValue(
-          FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
-      V2 = getAllOnesValue(
-          *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+      V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+      V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
     } else if (!V1 && P2.isNull()) {
       // Shuffle single entry node.
       const TreeEntry *E = P1.get<const TreeEntry *>();
@@ -8328,8 +8584,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         }
         CommonVF = E->Scalars.size();
       }
-      V1 = Constant::getNullValue(
-          FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
+      V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
       // Not identity/broadcast? Try to see if the original vector is better.
       if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
           CommonVF == CommonMask.size() &&
@@ -8346,6 +8602,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       }
     } else if (V1 && P2.isNull()) {
       // Shuffle single vector.
+      ExtraCost += GetValueMinBWAffectedCost(V1);
       CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
       assert(
           all_of(Mask,
@@ -8372,11 +8629,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         }
         CommonVF = VF;
       }
-      V1 = Constant::getNullValue(
-          FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
-      V2 = getAllOnesValue(
-          *R.DL,
-          FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
+      ExtraCost += GetValueMinBWAffectedCost(V1);
+      V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+      ExtraCost += GetNodeMinBWAffectedCost(
+          *E2, std::min(CommonVF, E2->getVectorFactor()));
+      V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
     } else if (!V1 && V2) {
       // Shuffle vector and tree node.
       unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
@@ -8400,11 +8657,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         }
         CommonVF = VF;
       }
-      V1 = Constant::getNullValue(
-          FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
-      V2 = getAllOnesValue(
-          *R.DL,
-          FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
+      ExtraCost += GetNodeMinBWAffectedCost(
+          *E1, std::min(CommonVF, E1->getVectorFactor()));
+      V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+      ExtraCost += GetValueMinBWAffectedCost(V2);
+      V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
     } else {
       assert(V1 && V2 && "Expected both vectors.");
       unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
@@ -8415,30 +8672,33 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                       return Idx < 2 * static_cast<int>(CommonVF);
                     }) &&
              "All elements in mask must be less than 2 * CommonVF.");
+      ExtraCost +=
+          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
       if (V1->getType() != V2->getType()) {
-        V1 = Constant::getNullValue(FixedVectorType::get(
-            cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
-        V2 = getAllOnesValue(
-            *R.DL, FixedVectorType::get(
-                       cast<FixedVectorType>(V1->getType())->getElementType(),
-                       CommonVF));
+        V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+        V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
+      } else {
+        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
+          V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
+          V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
       }
     }
-    InVectors.front() = Constant::getNullValue(FixedVectorType::get(
-        cast<FixedVectorType>(V1->getType())->getElementType(),
-        CommonMask.size()));
+    InVectors.front() = Constant::getNullValue(
+        FixedVectorType::get(ScalarTy, CommonMask.size()));
     if (InVectors.size() == 2)
       InVectors.pop_back();
-    return BaseShuffleAnalysis::createShuffle<InstructionCost>(
-        V1, V2, CommonMask, Builder);
+    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
+                           V1, V2, CommonMask, Builder);
   }
 
 public:
-  ShuffleCostEstimator(TargetTransformInfo &TTI,
+  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                        ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                        SmallPtrSetImpl<Value *> &CheckedExtracts)
-      : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
-        R(R), CheckedExtracts(CheckedExtracts) {}
+      : ScalarTy(ScalarTy), TTI(TTI),
+        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
+        CheckedExtracts(CheckedExtracts) {}
   Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                         ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                         unsigned NumParts, bool &UseVecBaseAsInput) {
@@ -8486,6 +8746,12 @@ public:
         const TreeEntry *VE = R.getTreeEntry(V);
         if (!CheckedExtracts.insert(V).second ||
             !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
+            any_of(EE->users(),
+                   [&](User *U) {
+                     return isa<GetElementPtrInst>(U) &&
+                            !R.areAllUsersVectorized(cast<Instruction>(U),
+                                                     &VectorizedVals);
+                   }) ||
             (VE && VE != E))
           continue;
         std::optional<unsigned> EEIdx = getExtractIndex(EE);
@@ -8528,7 +8794,7 @@ public:
     if (NumParts != 1 && UniqueBases.size() != 1) {
       UseVecBaseAsInput = true;
       VecBase = Constant::getNullValue(
-          FixedVectorType::get(VL.front()->getType(), CommonMask.size()));
+          FixedVectorType::get(ScalarTy, CommonMask.size()));
     }
     return VecBase;
   }
@@ -8556,8 +8822,7 @@ public:
       return;
     }
     assert(!CommonMask.empty() && "Expected non-empty common mask.");
-    auto *MaskVecTy =
-        FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
+    auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
     unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
     if (NumParts == 0 || NumParts >= Mask.size())
       NumParts = 1;
@@ -8574,8 +8839,7 @@ public:
       return;
     }
     assert(!CommonMask.empty() && "Expected non-empty common mask.");
-    auto *MaskVecTy =
-        FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
+    auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
     unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
     if (NumParts == 0 || NumParts >= Mask.size())
       NumParts = 1;
@@ -8675,7 +8939,7 @@ public:
     return ConstantVector::getSplat(
         ElementCount::getFixed(
             cast<FixedVectorType>(Root->getType())->getNumElements()),
-        getAllOnesValue(*R.DL, VL.front()->getType()));
+        getAllOnesValue(*R.DL, ScalarTy));
   }
   InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
   /// Finalize emission of the shuffles.
@@ -8821,7 +9085,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     if (isa<InsertElementInst>(VL[0]))
       return InstructionCost::getInvalid();
     return processBuildVector<ShuffleCostEstimator, InstructionCost>(
-        E, *TTI, VectorizedVals, *this, CheckedExtracts);
+        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
   }
   InstructionCost CommonCost = 0;
   SmallVector<int> Mask;
@@ -9256,6 +9520,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                                          Op1Info, Op2Info, Operands, VI);
     };
     auto GetVectorCost = [=](InstructionCost CommonCost) {
+      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
+        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
+          ArrayRef<Value *> Ops = E->getOperand(I);
+          if (all_of(Ops, [&](Value *Op) {
+                auto *CI = dyn_cast<ConstantInt>(Op);
+                return CI && CI->getValue().countr_one() >= It->second.first;
+              }))
+            return CommonCost;
+        }
+      }
       unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
       TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
       TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
@@ -9324,11 +9598,22 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
     auto GetVectorCost = [=](InstructionCost CommonCost) {
       // We know that we can merge the stores. Calculate the cost.
-      TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
-      return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
-                                  BaseSI->getPointerAddressSpace(), CostKind,
-                                  OpInfo) +
-             CommonCost;
+      InstructionCost VecStCost;
+      if (E->State == TreeEntry::StridedVectorize) {
+        Align CommonAlignment =
+            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
+        VecStCost = TTI->getStridedMemoryOpCost(
+            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind);
+      } else {
+        assert(E->State == TreeEntry::Vectorize &&
+               "Expected either strided or consecutive stores.");
+        TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
+        VecStCost = TTI->getMemoryOpCost(
+            Instruction::Store, VecTy, BaseSI->getAlign(),
+            BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
+      }
+      return VecStCost + CommonCost;
     };
     SmallVector<Value *> PointerOps(VL.size());
     for (auto [I, V] : enumerate(VL)) {
@@ -9587,11 +9872,11 @@ bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
                                     /* MatchOr */ false);
 }
 
-bool BoUpSLP::isLoadCombineCandidate() const {
+bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
   // Peek through a final sequence of stores and check if all operations are
   // likely to be load-combined.
-  unsigned NumElts = VectorizableTree[0]->Scalars.size();
-  for (Value *Scalar : VectorizableTree[0]->Scalars) {
+  unsigned NumElts = Stores.size();
+  for (Value *Scalar : Stores) {
     Value *X;
     if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
         !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
@@ -10861,12 +11146,8 @@ BoUpSLP::isGatherShuffledEntry(
   return Res;
 }
 
-InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
-                                       bool ForPoisonSrc) const {
-  // Find the type of the operands in VL.
-  Type *ScalarTy = VL[0]->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
+InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
+                                       Type *ScalarTy) const {
   auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
   bool DuplicateNonConst = false;
   // Find the cost of inserting/extracting values from the vector.
@@ -10877,6 +11158,11 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost Cost;
   auto EstimateInsertCost = [&](unsigned I, Value *V) {
+    if (V->getType() != ScalarTy) {
+      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
+                                    TTI::CastContextHint::None, CostKind);
+      V = nullptr;
+    }
     if (!ForPoisonSrc)
       Cost +=
           TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
@@ -11104,7 +11390,7 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
   Builder.SetCurrentDebugLocation(Front->getDebugLoc());
 }
 
-Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
+Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
   // List of instructions/lanes from current block and/or the blocks which are
   // part of the current loop. These instructions will be inserted at the end to
   // make it possible to optimize loops and hoist invariant instructions out of
@@ -11130,14 +11416,11 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
   auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                       Type *Ty) {
     Value *Scalar = V;
-    if (cast<VectorType>(Vec->getType())->getElementType() != Ty) {
-      assert(V->getType()->isIntegerTy() && Ty->isIntegerTy() &&
+    if (Scalar->getType() != Ty) {
+      assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
              "Expected integer types only.");
-      Vec = Builder.CreateIntCast(
-          Vec,
-          VectorType::get(Ty,
-                          cast<VectorType>(Vec->getType())->getElementCount()),
-          !isKnownNonNegative(Vec, SimplifyQuery(*DL)));
+      Scalar = Builder.CreateIntCast(
+          Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
     }
 
     Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
@@ -11165,10 +11448,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
     }
     return Vec;
   };
-  Value *Val0 =
-      isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
-  Type *ScalarTy = Val0->getType();
-  FixedVectorType *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
   Value *Vec = Root ? Root : PoisonValue::get(VecTy);
   SmallVector<int> NonConsts;
   // Insert constant values at first.
@@ -11247,6 +11527,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
   /// resulting shuffle and the second operand sets to be the newly added
   /// operand. The \p CommonMask is transformed in the proper way after that.
   SmallVector<Value *, 2> InVectors;
+  Type *ScalarTy = nullptr;
   IRBuilderBase &Builder;
   BoUpSLP &R;
 
@@ -11357,9 +11638,20 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
         CommonMask[Idx] = Idx;
   }
 
+  /// Cast value \p V to the vector type with the same number of elements, but
+  /// the base type \p ScalarTy.
+  Value *castToScalarTyElem(Value *V) {
+    auto *VecTy = cast<VectorType>(V->getType());
+    if (VecTy->getElementType() == ScalarTy)
+      return V;
+    return Builder.CreateIntCast(
+        V, VectorType::get(ScalarTy, VecTy->getElementCount()),
+        !isKnownNonNegative(V, SimplifyQuery(*R.DL)));
+  }
+
 public:
-  ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
-      : Builder(Builder), R(R) {}
+  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
+      : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
 
   /// Adjusts extractelements after reusing them.
   Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
@@ -11384,6 +11676,8 @@ public:
           any_of(EI->users(), [&](User *U) {
             const TreeEntry *UTE = R.getTreeEntry(U);
             return !UTE || R.MultiNodeScalars.contains(U) ||
+                   (isa<GetElementPtrInst>(U) &&
+                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                    count_if(R.VectorizableTree,
                             [&](const std::unique_ptr<TreeEntry> &TE) {
                               return any_of(TE->UserTreeIndices,
@@ -11396,8 +11690,10 @@ public:
         continue;
       R.eraseInstruction(EI);
     }
-    if (NumParts == 1 || UniqueBases.size() == 1)
+    if (NumParts == 1 || UniqueBases.size() == 1) {
+      VecBase = castToScalarTyElem(VecBase);
       return VecBase;
+    }
     UseVecBaseAsInput = true;
     auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
       for (auto [I, Idx] : enumerate(Mask))
@@ -11434,6 +11730,7 @@ public:
                "Expected vectors of the same size.");
         PrevSize = Size;
 #endif // NDEBUG
+        VecOp = castToScalarTyElem(VecOp);
         Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
       }
       if (!Bases.front())
@@ -11489,10 +11786,10 @@ public:
       return std::nullopt;
     // Postpone gather emission, will be emitted after the end of the
     // process to keep correct order.
-    auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
-                                       E->getVectorFactor());
+    auto *ResVecTy = FixedVectorType::get(ScalarTy, E->getVectorFactor());
     return Builder.CreateAlignedLoad(
-        VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
+        ResVecTy,
+        PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
         MaybeAlign());
   }
   /// Adds 2 input vectors (in form of tree entries) and the mask for their
@@ -11508,6 +11805,8 @@ public:
   /// Adds 2 input vectors and the mask for their shuffling.
   void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
     assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
+    V1 = castToScalarTyElem(V1);
+    V2 = castToScalarTyElem(V2);
     if (InVectors.empty()) {
       InVectors.push_back(V1);
       InVectors.push_back(V2);
@@ -11535,6 +11834,7 @@ public:
   }
   /// Adds another one input vector and the mask for the shuffling.
   void add(Value *V1, ArrayRef<int> Mask, bool = false) {
+    V1 = castToScalarTyElem(V1);
     if (InVectors.empty()) {
       if (!isa<FixedVectorType>(V1->getType())) {
         V1 = createShuffle(V1, nullptr, CommonMask);
@@ -11598,7 +11898,7 @@ public:
   }
   Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                 Value *Root = nullptr) {
-    return R.gather(VL, Root);
+    return R.gather(VL, Root, ScalarTy);
   }
   Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
   /// Finalize emission of the shuffles.
@@ -11698,7 +11998,8 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
     }
     if (IsSameVE) {
       auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
-        ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+        ShuffleInstructionBuilder ShuffleBuilder(
+            cast<VectorType>(V->getType())->getElementType(), Builder, *this);
         ShuffleBuilder.add(V, Mask);
         return ShuffleBuilder.finalize(std::nullopt);
       };
@@ -11773,7 +12074,8 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
 }
 
 template <typename BVTy, typename ResTy, typename... Args>
-ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
+ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
+                                  Args &...Params) {
   assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
   unsigned VF = E->getVectorFactor();
 
@@ -11821,7 +12123,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
     }
     return true;
   };
-  BVTy ShuffleBuilder(Params...);
+  BVTy ShuffleBuilder(ScalarTy, Params...);
   ResTy Res = ResTy();
   SmallVector<int> Mask;
   SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
@@ -11830,7 +12132,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
   bool UseVecBaseAsInput = false;
   SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
   SmallVector<SmallVector<const TreeEntry *>> Entries;
-  Type *ScalarTy = GatheredScalars.front()->getType();
+  Type *OrigScalarTy = GatheredScalars.front()->getType();
   auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
   unsigned NumParts = TTI->getNumberOfParts(VecTy);
   if (NumParts == 0 || NumParts >= GatheredScalars.size())
@@ -11865,7 +12167,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
               GatheredScalars.size() != VF) {
             Resized = true;
             GatheredScalars.append(VF - GatheredScalars.size(),
-                                   PoisonValue::get(ScalarTy));
+                                   PoisonValue::get(OrigScalarTy));
           }
       }
     }
@@ -11925,12 +12227,12 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
               });
             }))
           GatheredScalars.append(VF - GatheredScalars.size(),
-                                 PoisonValue::get(ScalarTy));
+                                 PoisonValue::get(OrigScalarTy));
       }
       // Remove shuffled elements from list of gathers.
       for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
         if (Mask[I] != PoisonMaskElem)
-          GatheredScalars[I] = PoisonValue::get(ScalarTy);
+          GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
       }
     }
   }
@@ -11941,7 +12243,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
     // such sequences.
     bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                    (Scalars.size() > 2 || Scalars.front() == Scalars.back());
-    Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy));
+    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
     SmallVector<int> UndefPos;
     DenseMap<Value *, unsigned> UniquePositions;
     // Gather unique non-const values and all constant values.
@@ -11963,7 +12265,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
       ++NumNonConsts;
       SinglePos = I;
       Value *OrigV = V;
-      Scalars[I] = PoisonValue::get(ScalarTy);
+      Scalars[I] = PoisonValue::get(OrigScalarTy);
       if (IsSplat) {
         Scalars.front() = OrigV;
         ReuseMask[I] = 0;
@@ -11979,7 +12281,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
         ReuseMask.assign(VF, PoisonMaskElem);
         std::swap(Scalars.front(), Scalars[SinglePos]);
         if (!UndefPos.empty() && UndefPos.front() == 0)
-          Scalars.front() = UndefValue::get(ScalarTy);
+          Scalars.front() = UndefValue::get(OrigScalarTy);
       }
       ReuseMask[SinglePos] = SinglePos;
     } else if (!UndefPos.empty() && IsSplat) {
@@ -12009,7 +12311,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
           // Replace the undef by the poison, in the mask it is replaced by
           // non-poisoned scalar already.
           if (I != Pos)
-            Scalars[I] = PoisonValue::get(ScalarTy);
+            Scalars[I] = PoisonValue::get(OrigScalarTy);
         }
       } else {
         // Replace undefs by the poisons, emit broadcast and then emit
@@ -12017,7 +12319,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
         for (int I : UndefPos) {
           ReuseMask[I] = PoisonMaskElem;
           if (isa<UndefValue>(Scalars[I]))
-            Scalars[I] = PoisonValue::get(ScalarTy);
+            Scalars[I] = PoisonValue::get(OrigScalarTy);
         }
         NeedFreeze = true;
       }
@@ -12072,9 +12374,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
         IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
       } else {
         IsUsedInExpr = false;
-        ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
-                               ScalarTy, GatheredScalars.size())),
-                           ExtractMask, /*ForExtracts=*/true);
+        ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
+                           /*ForExtracts=*/true);
       }
     }
     if (!GatherShuffles.empty()) {
@@ -12155,9 +12456,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
     // contains only constant to build final vector and then shuffle.
     for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
       if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
-        NonConstants[I] = PoisonValue::get(ScalarTy);
+        NonConstants[I] = PoisonValue::get(OrigScalarTy);
       else
-        GatheredScalars[I] = PoisonValue::get(ScalarTy);
+        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
     }
     // Generate constants for final shuffle and build a mask for them.
     if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
@@ -12203,9 +12504,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
   return Res;
 }
 
-Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
-  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
-                                                                *this);
+Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
+  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
+                                                                Builder, *this);
 }
 
 Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
@@ -12218,19 +12519,30 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
     return E->VectorizedValue;
   }
 
+  Value *V = E->Scalars.front();
+  Type *ScalarTy = V->getType();
+  if (auto *Store = dyn_cast<StoreInst>(V))
+    ScalarTy = Store->getValueOperand()->getType();
+  else if (auto *IE = dyn_cast<InsertElementInst>(V))
+    ScalarTy = IE->getOperand(1)->getType();
+  auto It = MinBWs.find(E);
+  if (It != MinBWs.end())
+    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
+  auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
   if (E->State == TreeEntry::NeedToGather) {
     // Set insert point for non-reduction initial nodes.
     if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
       setInsertPointAfterBundle(E);
-    Value *Vec = createBuildVector(E);
+    Value *Vec = createBuildVector(E, ScalarTy);
     E->VectorizedValue = Vec;
     return Vec;
   }
 
   bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
   auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
-    ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
-    if (E->getOpcode() == Instruction::Store) {
+    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
+    if (E->getOpcode() == Instruction::Store &&
+        E->State == TreeEntry::Vectorize) {
       ArrayRef<int> Mask =
           ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                    E->ReorderIndices.size());
@@ -12250,14 +12562,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
   Instruction *VL0 = E->getMainOp();
-  Type *ScalarTy = VL0->getType();
-  if (auto *Store = dyn_cast<StoreInst>(VL0))
-    ScalarTy = Store->getValueOperand()->getType();
-  else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
-    ScalarTy = IE->getOperand(1)->getType();
-  auto It = MinBWs.find(E);
-  if (It != MinBWs.end())
-    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
   auto GetOperandSignedness = [&](unsigned Idx) {
     const TreeEntry *OpE = getOperandEntry(E, Idx);
     bool IsSigned = false;
@@ -12270,7 +12574,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       });
     return IsSigned;
   };
-  auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
   switch (ShuffleOrOp) {
     case Instruction::PHI: {
       assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
@@ -12712,6 +13015,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
         return E->VectorizedValue;
       }
+      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
+        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
+          ArrayRef<Value *> Ops = E->getOperand(I);
+          if (all_of(Ops, [&](Value *Op) {
+                auto *CI = dyn_cast<ConstantInt>(Op);
+                return CI && CI->getValue().countr_one() >= It->second.first;
+              })) {
+            V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
+            E->VectorizedValue = V;
+            ++NumVectorInstructions;
+            return V;
+          }
+        }
+      }
       if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
         assert((It != MinBWs.end() ||
                 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
@@ -12827,8 +13144,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       VecValue = FinalShuffle(VecValue, E, VecTy);
 
       Value *Ptr = SI->getPointerOperand();
-      StoreInst *ST =
-          Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
+      Instruction *ST;
+      if (E->State == TreeEntry::Vectorize) {
+        ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
+      } else {
+        assert(E->State == TreeEntry::StridedVectorize &&
+               "Expected either strided or conseutive stores.");
+        Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
+        Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
+        auto *Inst = Builder.CreateIntrinsic(
+            Intrinsic::experimental_vp_strided_store,
+            {VecTy, Ptr->getType(), StrideTy},
+            {VecValue, Ptr,
+             ConstantInt::get(
+                 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
+             Builder.getAllOnesMask(VecTy->getElementCount()),
+             Builder.getInt32(E->Scalars.size())});
+        Inst->addParamAttr(
+            /*ArgNo=*/1,
+            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
+        ST = Inst;
+      }
 
       Value *V = propagateMetadata(ST, E->Scalars);
 
@@ -13145,7 +13481,8 @@ Value *BoUpSLP::vectorizeTree(
     auto *TE = const_cast<TreeEntry *>(E);
     if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
       if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
-              TE->UserTreeIndices.front().EdgeIdx)))
+              TE->UserTreeIndices.front().EdgeIdx)) &&
+          VecTE->isSame(TE->Scalars))
         // Found gather node which is absolutely the same as one of the
         // vectorized nodes. It may happen after reordering.
         continue;
@@ -13316,8 +13653,11 @@ Value *BoUpSLP::vectorizeTree(
             // Leave the GEPs as is, they are free in most cases and better to
             // keep them as GEPs.
             auto *CloneGEP = GEP->clone();
-            CloneGEP->insertBefore(*Builder.GetInsertBlock(),
-                                   Builder.GetInsertPoint());
+            if (isa<Instruction>(Vec))
+              CloneGEP->insertBefore(*Builder.GetInsertBlock(),
+                                     Builder.GetInsertPoint());
+            else
+              CloneGEP->insertBefore(GEP);
             if (GEP->hasName())
               CloneGEP->takeName(GEP);
             Ex = CloneGEP;
@@ -13520,7 +13860,8 @@ Value *BoUpSLP::vectorizeTree(
       else
         CombinedMask2[I] = Mask[I] - VF;
     }
-    ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+    ShuffleInstructionBuilder ShuffleBuilder(
+        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
     ShuffleBuilder.add(V1, CombinedMask1);
     if (V2)
       ShuffleBuilder.add(V2, CombinedMask2);
@@ -14555,13 +14896,27 @@ bool BoUpSLP::collectValuesToDemote(
       return false;
     bool Res = all_of(
         E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
-    // Gather demoted constant operands.
-    if (Res && E.State == TreeEntry::NeedToGather &&
-        all_of(E.Scalars, IsaPred<Constant>))
-      ToDemote.push_back(E.Idx);
+    // Demote gathers.
+    if (Res && E.State == TreeEntry::NeedToGather) {
+      // Check possible extractelement instructions bases and final vector
+      // length.
+      SmallPtrSet<Value *, 4> UniqueBases;
+      for (Value *V : E.Scalars) {
+        auto *EE = dyn_cast<ExtractElementInst>(V);
+        if (!EE)
+          continue;
+        UniqueBases.insert(EE->getVectorOperand());
+      }
+      const unsigned VF = E.Scalars.size();
+      Type *OrigScalarTy = E.Scalars.front()->getType();
+      if (UniqueBases.size() <= 2 ||
+          TTI->getNumberOfParts(FixedVectorType::get(OrigScalarTy, VF)) ==
+              TTI->getNumberOfParts(FixedVectorType::get(
+                  IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
+        ToDemote.push_back(E.Idx);
+    }
     return Res;
   };
-  // TODO: improve handling of gathered values and others.
   if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second ||
       any_of(E.Scalars, [&](Value *V) {
         return all_of(V->users(), [&](User *U) {
@@ -15072,11 +15427,16 @@ void BoUpSLP::computeMinimumValueSizes() {
       IsSignedCmp =
           NodeIdx < VectorizableTree.size() &&
           any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
-                 [](const EdgeInfo &EI) {
+                 [&](const EdgeInfo &EI) {
                    return EI.UserTE->getOpcode() == Instruction::ICmp &&
-                          any_of(EI.UserTE->Scalars, [](Value *V) {
+                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                             auto *IC = dyn_cast<ICmpInst>(V);
-                            return IC && IC->isSigned();
+                            return IC &&
+                                   (IC->isSigned() ||
+                                    !isKnownNonNegative(IC->getOperand(0),
+                                                        SimplifyQuery(*DL)) ||
+                                    !isKnownNonNegative(IC->getOperand(1),
+                                                        SimplifyQuery(*DL)));
                           });
                  });
     }
@@ -15205,8 +15565,11 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
   return Changed;
 }
 
-bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
-                                            unsigned Idx, unsigned MinVF) {
+std::optional<bool>
+SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
+                                       unsigned Idx, unsigned MinVF,
+                                       unsigned &Size) {
+  Size = 0;
   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
                     << "\n");
   const unsigned Sz = R.getVectorElementSize(Chain[0]);
@@ -15223,11 +15586,42 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                     << "\n");
 
+  SetVector<Value *> ValOps;
+  for (Value *V : Chain)
+    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
+  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
+  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
+  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
+    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
+    bool IsPowerOf2 =
+        isPowerOf2_32(ValOps.size()) ||
+        (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
+    if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
+         (!S.MainOp->isSafeToRemove() ||
+          any_of(ValOps.getArrayRef(),
+                 [&](Value *V) {
+                   return !isa<ExtractElementInst>(V) &&
+                          (V->getNumUses() > Chain.size() ||
+                           any_of(V->users(), [&](User *U) {
+                             return !Stores.contains(U);
+                           }));
+                 }))) ||
+        (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
+      Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
+      return false;
+    }
+  }
+  if (R.isLoadCombineCandidate(Chain))
+    return true;
   R.buildTree(Chain);
-  if (R.isTreeTinyAndNotFullyVectorizable())
-    return false;
-  if (R.isLoadCombineCandidate())
+  // Check if tree tiny and store itself or its value is not vectorized.
+  if (R.isTreeTinyAndNotFullyVectorizable()) {
+    if (R.isGathered(Chain.front()) ||
+        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
+      return std::nullopt;
+    Size = R.getTreeSize();
     return false;
+  }
   R.reorderTopToBottom();
   R.reorderBottomToTop();
   R.buildExternalUses();
@@ -15235,6 +15629,9 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   R.computeMinimumValueSizes();
   R.transformNodes();
 
+  Size = R.getTreeSize();
+  if (S.getOpcode() == Instruction::Load)
+    Size = 2; // cut off masked gather small trees
   InstructionCost Cost = R.getTreeCost();
 
   LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
@@ -15256,17 +15653,45 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   return false;
 }
 
-bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
-                                        BoUpSLP &R) {
+/// Checks if the quadratic mean deviation is less than 90% of the mean size.
+static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
+                           bool First) {
+  unsigned Num = 0;
+  uint64_t Sum = std::accumulate(
+      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
+      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
+        unsigned Size = First ? Val.first : Val.second;
+        if (Size == 1)
+          return V;
+        ++Num;
+        return V + Size;
+      });
+  if (Num == 0)
+    return true;
+  uint64_t Mean = Sum / Num;
+  if (Mean == 0)
+    return true;
+  uint64_t Dev = std::accumulate(
+                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
+                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
+                       unsigned P = First ? Val.first : Val.second;
+                       if (P == 1)
+                         return V;
+                       return V + (P - Mean) * (P - Mean);
+                     }) /
+                 Num;
+  return Dev * 81 / (Mean * Mean) == 0;
+}
+
+bool SLPVectorizerPass::vectorizeStores(
+    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
+    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
+        &Visited) {
   // We may run into multiple chains that merge into a single chain. We mark the
   // stores that we vectorized so that we don't visit the same store twice.
   BoUpSLP::ValueSet VectorizedStores;
   bool Changed = false;
 
-  // Stores the pair of stores (first_store, last_store) in a range, that were
-  // already tried to be vectorized. Allows to skip the store ranges that were
-  // already tried to be vectorized but the attempts were unsuccessful.
-  DenseSet<std::pair<Value *, Value *>> TriedSequences;
   struct StoreDistCompare {
     bool operator()(const std::pair<unsigned, int> &Op1,
                     const std::pair<unsigned, int> &Op2) const {
@@ -15294,7 +15719,14 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
         PrevDist = DataVar.second;
       });
 
-      if (Operands.size() <= 1)
+      if (Operands.size() <= 1 ||
+          !Visited
+               .insert({Operands.front(),
+                        cast<StoreInst>(Operands.front())->getValueOperand(),
+                        Operands.back(),
+                        cast<StoreInst>(Operands.back())->getValueOperand(),
+                        Operands.size()})
+               .second)
         continue;
 
       unsigned MaxVecRegSize = R.getMaxVecRegSize();
@@ -15303,13 +15735,19 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
 
       unsigned MaxVF =
           std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
+      unsigned MaxRegVF = MaxVF;
       auto *Store = cast<StoreInst>(Operands[0]);
       Type *StoreTy = Store->getValueOperand()->getType();
       Type *ValueTy = StoreTy;
       if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
         ValueTy = Trunc->getSrcTy();
-      unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
-          R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));
+      if (ValueTy == StoreTy &&
+          R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
+        MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
+      unsigned MinVF = std::max<unsigned>(
+          2, PowerOf2Ceil(TTI->getStoreMinimumVF(
+                 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
+                 ValueTy)));
 
       if (MaxVF < MinVF) {
         LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
@@ -15324,7 +15762,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
         // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
         // lanes are used.
         unsigned CandVF = Operands.size();
-        if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF)
+        if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
           NonPowerOf2VF = CandVF;
       }
 
@@ -15335,40 +15773,184 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
         VF = Size > MaxVF ? NonPowerOf2VF : Size;
         Size *= 2;
       });
-      unsigned StartIdx = 0;
-      for (unsigned Size : CandidateVFs) {
-        for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
-          ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
-          assert(
-              all_of(
-                  Slice,
-                  [&](Value *V) {
-                    return cast<StoreInst>(V)->getValueOperand()->getType() ==
-                           cast<StoreInst>(Slice.front())
-                               ->getValueOperand()
-                               ->getType();
-                  }) &&
-              "Expected all operands of same type.");
-          if (!VectorizedStores.count(Slice.front()) &&
-              !VectorizedStores.count(Slice.back()) &&
-              TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
-                  .second &&
-              vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
-            // Mark the vectorized stores so that we don't vectorize them again.
-            VectorizedStores.insert(Slice.begin(), Slice.end());
-            Changed = true;
-            // If we vectorized initial block, no need to try to vectorize it
-            // again.
-            if (Cnt == StartIdx)
-              StartIdx += Size;
-            Cnt += Size;
-            continue;
+      unsigned End = Operands.size();
+      unsigned Repeat = 0;
+      constexpr unsigned MaxAttempts = 4;
+      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
+      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
+        P.first = P.second = 1;
+      });
+      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
+      auto IsNotVectorized = [](bool First,
+                                const std::pair<unsigned, unsigned> &P) {
+        return First ? P.first > 0 : P.second > 0;
+      };
+      auto IsVectorized = [](bool First,
+                             const std::pair<unsigned, unsigned> &P) {
+        return First ? P.first == 0 : P.second == 0;
+      };
+      auto VFIsProfitable = [](bool First, unsigned Size,
+                               const std::pair<unsigned, unsigned> &P) {
+        return First ? Size >= P.first : Size >= P.second;
+      };
+      auto FirstSizeSame = [](unsigned Size,
+                              const std::pair<unsigned, unsigned> &P) {
+        return Size == P.first;
+      };
+      while (true) {
+        ++Repeat;
+        bool RepeatChanged = false;
+        bool AnyProfitableGraph;
+        for (unsigned Size : CandidateVFs) {
+          AnyProfitableGraph = false;
+          unsigned StartIdx = std::distance(
+              RangeSizes.begin(),
+              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
+                                            std::placeholders::_1)));
+          while (StartIdx < End) {
+            unsigned EndIdx =
+                std::distance(RangeSizes.begin(),
+                              find_if(RangeSizes.drop_front(StartIdx),
+                                      std::bind(IsVectorized, Size >= MaxRegVF,
+                                                std::placeholders::_1)));
+            unsigned Sz = EndIdx >= End ? End : EndIdx;
+            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
+              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
+                                  Size >= MaxRegVF)) {
+                ++Cnt;
+                continue;
+              }
+              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+              assert(all_of(Slice,
+                            [&](Value *V) {
+                              return cast<StoreInst>(V)
+                                         ->getValueOperand()
+                                         ->getType() ==
+                                     cast<StoreInst>(Slice.front())
+                                         ->getValueOperand()
+                                         ->getType();
+                            }) &&
+                     "Expected all operands of same type.");
+              if (!NonSchedulable.empty()) {
+                auto [NonSchedSizeMax, NonSchedSizeMin] =
+                    NonSchedulable.lookup(Slice.front());
+                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
+                  Cnt += NonSchedSizeMax;
+                  continue;
+                }
+              }
+              unsigned TreeSize;
+              std::optional<bool> Res =
+                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
+              if (!Res) {
+                NonSchedulable
+                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
+                    .first->getSecond()
+                    .second = Size;
+              } else if (*Res) {
+                // Mark the vectorized stores so that we don't vectorize them
+                // again.
+                VectorizedStores.insert(Slice.begin(), Slice.end());
+                // Mark the vectorized stores so that we don't vectorize them
+                // again.
+                AnyProfitableGraph = RepeatChanged = Changed = true;
+                // If we vectorized initial block, no need to try to vectorize
+                // it again.
+                for_each(RangeSizes.slice(Cnt, Size),
+                         [](std::pair<unsigned, unsigned> &P) {
+                           P.first = P.second = 0;
+                         });
+                if (Cnt < StartIdx + MinVF) {
+                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
+                           [](std::pair<unsigned, unsigned> &P) {
+                             P.first = P.second = 0;
+                           });
+                  StartIdx = Cnt + Size;
+                }
+                if (Cnt > Sz - Size - MinVF) {
+                  for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
+                           [](std::pair<unsigned, unsigned> &P) {
+                             P.first = P.second = 0;
+                           });
+                  if (Sz == End)
+                    End = Cnt;
+                  Sz = Cnt;
+                }
+                Cnt += Size;
+                continue;
+              }
+              if (Size > 2 && Res &&
+                  !all_of(RangeSizes.slice(Cnt, Size),
+                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
+                                    std::placeholders::_1))) {
+                Cnt += Size;
+                continue;
+              }
+              // Check for the very big VFs that we're not rebuilding same
+              // trees, just with larger number of elements.
+              if (Size > MaxRegVF && TreeSize > 1 &&
+                  all_of(RangeSizes.slice(Cnt, Size),
+                         std::bind(FirstSizeSame, TreeSize,
+                                   std::placeholders::_1))) {
+                Cnt += Size;
+                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
+                  ++Cnt;
+                continue;
+              }
+              if (TreeSize > 1)
+                for_each(RangeSizes.slice(Cnt, Size),
+                         [&](std::pair<unsigned, unsigned> &P) {
+                           if (Size >= MaxRegVF)
+                             P.second = std::max(P.second, TreeSize);
+                           else
+                             P.first = std::max(P.first, TreeSize);
+                         });
+              ++Cnt;
+              AnyProfitableGraph = true;
+            }
+            if (StartIdx >= End)
+              break;
+            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
+              AnyProfitableGraph = true;
+            StartIdx = std::distance(
+                RangeSizes.begin(),
+                find_if(RangeSizes.drop_front(Sz),
+                        std::bind(IsNotVectorized, Size >= MaxRegVF,
+                                  std::placeholders::_1)));
           }
-          ++Cnt;
+          if (!AnyProfitableGraph && Size >= MaxRegVF)
+            break;
         }
-        // Check if the whole array was vectorized already - exit.
-        if (StartIdx >= Operands.size())
+        // All values vectorized - exit.
+        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
+              return P.first == 0 && P.second == 0;
+            }))
+          break;
+        // Check if tried all attempts or no need for the last attempts at all.
+        if (Repeat >= MaxAttempts ||
+            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
           break;
+        constexpr unsigned StoresLimit = 64;
+        const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
+            Operands.size(),
+            static_cast<unsigned>(
+                End -
+                std::distance(
+                    RangeSizes.begin(),
+                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
+                                                  std::placeholders::_1))) +
+                1)));
+        unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
+        if (VF > MaxTotalNum || VF >= StoresLimit)
+          break;
+        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
+          if (P.first != 0)
+            P.first = std::max(P.second, P.first);
+        });
+        // Last attempt to vectorize max number of elements, if all previous
+        // attempts were unsuccessful because of the cost issues.
+        CandidateVFs.clear();
+        CandidateVFs.push_back(VF);
       }
     }
   };
@@ -17834,8 +18416,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
     // Collect the incoming values from the PHIs.
     Incoming.clear();
     for (Instruction &I : *BB) {
-      PHINode *P = dyn_cast<PHINode>(&I);
-      if (!P)
+      auto *P = dyn_cast<PHINode>(&I);
+      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
         break;
 
       // No need to analyze deleted, vectorized and non-vectorizable
@@ -18186,6 +18768,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
   };
 
   // Attempt to sort and vectorize each of the store-groups.
+  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
   for (auto &Pair : Stores) {
     if (Pair.second.size() < 2)
       continue;
@@ -18203,8 +18786,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
                                             Pair.second.rend());
     Changed |= tryToVectorizeSequence<StoreInst>(
         ReversedStores, StoreSorter, AreCompatibleStores,
-        [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
-          return vectorizeStores(Candidates, R);
+        [&](ArrayRef<StoreInst *> Candidates, bool) {
+          return vectorizeStores(Candidates, R, Attempted);
         },
         /*MaxVFOnly=*/false, R);
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c74329a0bcc4..71594be2b965 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1458,18 +1458,21 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
 
 public:
   template <typename IterT>
-  VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,
+  VPWidenCallRecipe(Value *UV, iterator_range<IterT> CallArguments,
                     Intrinsic::ID VectorIntrinsicID, DebugLoc DL = {},
                     Function *Variant = nullptr)
-      : VPSingleDefRecipe(VPDef::VPWidenCallSC, CallArguments, &I, DL),
-        VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {}
+      : VPSingleDefRecipe(VPDef::VPWidenCallSC, CallArguments, UV, DL),
+        VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {
+    assert(
+        isa<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()) &&
+        "last operand must be the called function");
+  }
 
   ~VPWidenCallRecipe() override = default;
 
   VPWidenCallRecipe *clone() override {
-    return new VPWidenCallRecipe(*cast<CallInst>(getUnderlyingInstr()),
-                                 operands(), VectorIntrinsicID, getDebugLoc(),
-                                 Variant);
+    return new VPWidenCallRecipe(getUnderlyingValue(), operands(),
+                                 VectorIntrinsicID, getDebugLoc(), Variant);
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)
@@ -1477,6 +1480,17 @@ public:
   /// Produce a widened version of the call instruction.
   void execute(VPTransformState &State) override;
 
+  Function *getCalledScalarFunction() const {
+    return cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
+  }
+
+  operand_range arg_operands() {
+    return make_range(op_begin(), op_begin() + getNumOperands() - 1);
+  }
+  const_operand_range arg_operands() const {
+    return make_range(op_begin(), op_begin() + getNumOperands() - 1);
+  }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -3215,6 +3229,9 @@ public:
     return Value2VPValue[V];
   }
 
+  /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise.
+  VPValue *getLiveIn(Value *V) const { return Value2VPValue.lookup(V); }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the live-ins of this VPlan to \p O.
   void printLiveIns(raw_ostream &O) const;
@@ -3278,8 +3295,8 @@ public:
 private:
   /// Add to the given dominator tree the header block and every new basic block
   /// that was created between it and the latch block, inclusive.
-  static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB,
-                                  BasicBlock *LoopPreHeaderBB,
+  static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
+                                  BasicBlock *LoopLatchBB,
                                   BasicBlock *LoopExitBB);
 };
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c..29ed001ccd2c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -51,9 +51,12 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPWidenStoreSC:
     return true;
   case VPReplicateSC:
-  case VPWidenCallSC:
     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
         ->mayWriteToMemory();
+  case VPWidenCallSC:
+    return !cast<VPWidenCallRecipe>(this)
+                ->getCalledScalarFunction()
+                ->onlyReadsMemory();
   case VPBranchOnMaskSC:
   case VPScalarIVStepsSC:
   case VPPredInstPHISC:
@@ -87,9 +90,12 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPWidenLoadSC:
     return true;
   case VPReplicateSC:
-  case VPWidenCallSC:
     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
         ->mayReadFromMemory();
+  case VPWidenCallSC:
+    return !cast<VPWidenCallRecipe>(this)
+                ->getCalledScalarFunction()
+                ->onlyWritesMemory();
   case VPBranchOnMaskSC:
   case VPPredInstPHISC:
   case VPScalarIVStepsSC:
@@ -136,9 +142,10 @@ bool VPRecipeBase::mayHaveSideEffects() const {
     default:
       return true;
     }
-  case VPWidenCallSC:
-    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
-        ->mayHaveSideEffects();
+  case VPWidenCallSC: {
+    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
+    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
+  }
   case VPBlendSC:
   case VPReductionSC:
   case VPScalarIVStepsSC:
@@ -702,8 +709,8 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
 
 void VPWidenCallRecipe::execute(VPTransformState &State) {
   assert(State.VF.isVector() && "not widening");
-  auto &CI = *cast<CallInst>(getUnderlyingInstr());
-  assert(!isa<DbgInfoIntrinsic>(CI) &&
+  Function *CalledScalarFn = getCalledScalarFunction();
+  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
          "DbgInfoIntrinsic should have been dropped during VPlan construction");
   State.setDebugLocFrom(getDebugLoc());
 
@@ -716,10 +723,10 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
     // Add return type if intrinsic is overloaded on it.
     if (UseIntrinsic &&
         isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
-      TysForDecl.push_back(
-          VectorType::get(CI.getType()->getScalarType(), State.VF));
+      TysForDecl.push_back(VectorType::get(
+          CalledScalarFn->getReturnType()->getScalarType(), State.VF));
     SmallVector<Value *, 4> Args;
-    for (const auto &I : enumerate(operands())) {
+    for (const auto &I : enumerate(arg_operands())) {
       // Some intrinsics have a scalar argument - don't replace it with a
       // vector.
       Value *Arg;
@@ -752,16 +759,19 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
       VectorF = Variant;
     }
 
+    auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
     SmallVector<OperandBundleDef, 1> OpBundles;
-    CI.getOperandBundlesAsDefs(OpBundles);
+    if (CI)
+      CI->getOperandBundlesAsDefs(OpBundles);
+
     CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
 
     if (isa<FPMathOperator>(V))
-      V->copyFastMathFlags(&CI);
+      V->copyFastMathFlags(CI);
 
     if (!V->getType()->isVoidTy())
       State.set(this, V, Part);
-    State.addMetadata(V, &CI);
+    State.addMetadata(V, CI);
   }
 }
 
@@ -770,16 +780,18 @@ void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN-CALL ";
 
-  auto *CI = cast<CallInst>(getUnderlyingInstr());
-  if (CI->getType()->isVoidTy())
+  Function *CalledFn = getCalledScalarFunction();
+  if (CalledFn->getReturnType()->isVoidTy())
     O << "void ";
   else {
     printAsOperand(O, SlotTracker);
     O << " = ";
   }
 
-  O << "call @" << CI->getCalledFunction()->getName() << "(";
-  printOperands(O, SlotTracker);
+  O << "call @" << CalledFn->getName() << "(";
+  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
+    Op->printAsOperand(O, SlotTracker);
+  });
   O << ")";
 
   if (VectorIntrinsicID)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d7bc128dcfe6..017b00c042f4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -75,8 +75,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
           NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
         } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
           NewRecipe = new VPWidenCallRecipe(
-              *CI, drop_end(Ingredient.operands()),
-              getVectorIntrinsicIDForCall(CI, &TLI), CI->getDebugLoc());
+              CI, Ingredient.operands(), getVectorIntrinsicIDForCall(CI, &TLI),
+              CI->getDebugLoc());
         } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
           NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
         } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
@@ -506,13 +506,12 @@ static void removeDeadRecipes(VPlan &Plan) {
   }
 }
 
-static VPValue *createScalarIVSteps(VPlan &Plan,
-                                    InductionDescriptor::InductionKind Kind,
-                                    Instruction::BinaryOps InductionOpcode,
-                                    FPMathOperator *FPBinOp,
-                                    ScalarEvolution &SE, Instruction *TruncI,
-                                    VPValue *StartV, VPValue *Step,
-                                    VPBasicBlock::iterator IP) {
+static VPScalarIVStepsRecipe *
+createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
+                    Instruction::BinaryOps InductionOpcode,
+                    FPMathOperator *FPBinOp, ScalarEvolution &SE,
+                    Instruction *TruncI, VPValue *StartV, VPValue *Step,
+                    VPBasicBlock::iterator IP) {
   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
   VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
   VPSingleDefRecipe *BaseIV = CanonicalIV;
@@ -579,16 +578,13 @@ static void legalizeAndOptimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
       VPValue *StartV =
           Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
       VPValue *StepV = PtrIV->getOperand(1);
-      VPRecipeBase *Steps =
-          createScalarIVSteps(Plan, InductionDescriptor::IK_IntInduction,
-                              Instruction::Add, nullptr, SE, nullptr, StartV,
-                              StepV, InsertPt)
-              ->getDefiningRecipe();
+      VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
+          Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
+          SE, nullptr, StartV, StepV, InsertPt);
 
-      auto *Recipe =
-          new VPInstruction(VPInstruction::PtrAdd,
-                            {PtrIV->getStartValue(), Steps->getVPSingleValue()},
-                            PtrIV->getDebugLoc(), "next.gep");
+      auto *Recipe = new VPInstruction(VPInstruction::PtrAdd,
+                                       {PtrIV->getStartValue(), Steps},
+                                       PtrIV->getDebugLoc(), "next.gep");
 
       Recipe->insertAfter(Steps);
       PtrIV->replaceAllUsesWith(Recipe);
@@ -606,7 +602,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
       continue;
 
     const InductionDescriptor &ID = WideIV->getInductionDescriptor();
-    VPValue *Steps = createScalarIVSteps(
+    VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
         Plan, ID.getKind(), ID.getInductionOpcode(),
         dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), SE,
         WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
diff --git a/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll b/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
index 836a028ad6aa..f491b086107a 100644
--- a/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
@@ -7,58 +7,58 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @vector_reverse() #0{
 ; CHECK-LABEL: 'vector_reverse'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %3 = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %4 = call <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %6 = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = call <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %9 = call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %10 = call <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %12 = call <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %14 = call <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %15 = call <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %16 = call <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %3 = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %4 = call <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %6 = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = call <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %9 = call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %10 = call <16 x half> @llvm.vector.reverse.v16f16(<16 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %12 = call <8 x float> @llvm.vector.reverse.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %14 = call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %15 = call <8 x bfloat> @llvm.vector.reverse.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %16 = call <16 x bfloat> @llvm.vector.reverse.v16bf16(<16 x bfloat> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 
-  call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> undef)
-  call <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8> undef)
-  call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> undef)
-  call <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16> undef)
-  call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> undef)
-  call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> undef)
-  call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> undef)
-  call <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64> undef)
-  call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> undef)
-  call <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half> undef)
-  call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> undef)
-  call <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float> undef)
-  call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> undef)
-  call <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double> undef)
-  call <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat> undef)
-  call <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat> undef)
+  call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> undef)
+  call <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8> undef)
+  call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> undef)
+  call <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16> undef)
+  call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> undef)
+  call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> undef)
+  call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> undef)
+  call <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64> undef)
+  call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.vector.reverse.v16f16(<16 x half> undef)
+  call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> undef)
+  call <8 x float> @llvm.vector.reverse.v8f32(<8 x float> undef)
+  call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> undef)
+  call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> undef)
+  call <8 x bfloat> @llvm.vector.reverse.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.vector.reverse.v16bf16(<16 x bfloat> undef)
   ret void
 }
 
 attributes #0 = { "target-features"="+sve,+bf16" }
-declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
-declare <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8>)
-declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
-declare <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16>)
-declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
-declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
-declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
-declare <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64>)
-declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
-declare <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half>)
-declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
-declare <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float>)
-declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
-declare <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double>)
-declare <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat>)
-declare <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat>)
+declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+declare <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8>)
+declare <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16>)
+declare <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16>)
+declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
+declare <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32>)
+declare <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64>)
+declare <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64>)
+declare <8 x half> @llvm.vector.reverse.v8f16(<8 x half>)
+declare <16 x half> @llvm.vector.reverse.v16f16(<16 x half>)
+declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)
+declare <8 x float> @llvm.vector.reverse.v8f32(<8 x float>)
+declare <2 x double> @llvm.vector.reverse.v2f64(<2 x double>)
+declare <4 x double> @llvm.vector.reverse.v4f64(<4 x double>)
+declare <8 x bfloat> @llvm.vector.reverse.v8bf16(<8 x bfloat>)
+declare <16 x bfloat> @llvm.vector.reverse.v16bf16(<16 x bfloat>)
diff --git a/llvm/test/Analysis/CostModel/AArch64/splice.ll b/llvm/test/Analysis/CostModel/AArch64/splice.ll
index f5afdff41b1d..1d76a4838cee 100644
--- a/llvm/test/Analysis/CostModel/AArch64/splice.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/splice.ll
@@ -5,96 +5,96 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @vector_splice() #0 {
 ; CHECK-LABEL: 'vector_splice'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v16i8 = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v32i8 = call <32 x i8> @llvm.experimental.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i16 = call <2 x i16> @llvm.experimental.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i16 = call <4 x i16> @llvm.experimental.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v8i16 = call <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v16i16 = call <16 x i16> @llvm.experimental.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i32 = call <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v8i32 = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i64 = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v4i64 = call <4 x i64> @llvm.experimental.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f16 = call <2 x half> @llvm.experimental.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v4f16 = call <4 x half> @llvm.experimental.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v8f16 = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v16f16 = call <16 x half> @llvm.experimental.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f32 = call <2 x float> @llvm.experimental.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v4f32 = call <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v8f32 = call <8 x float> @llvm.experimental.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f64 = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v4f64 = call <4 x double> @llvm.experimental.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2bf16 = call <2 x bfloat> @llvm.experimental.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v4bf16 = call <4 x bfloat> @llvm.experimental.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v8bf16 = call <8 x bfloat> @llvm.experimental.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v16bf16 = call <16 x bfloat> @llvm.experimental.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v16i1 = call <16 x i1> @llvm.experimental.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v8i1 = call <8 x i1> @llvm.experimental.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i1 = call <4 x i1> @llvm.experimental.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i1 = call <2 x i1> @llvm.experimental.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %splice.v2i128 = call <2 x i128> @llvm.experimental.vector.splice.v2i128(<2 x i128> zeroinitializer, <2 x i128> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v16i8 = call <16 x i8> @llvm.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v32i8 = call <32 x i8> @llvm.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i16 = call <2 x i16> @llvm.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i16 = call <4 x i16> @llvm.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v8i16 = call <8 x i16> @llvm.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v16i16 = call <16 x i16> @llvm.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i32 = call <4 x i32> @llvm.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v8i32 = call <8 x i32> @llvm.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i64 = call <2 x i64> @llvm.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v4i64 = call <4 x i64> @llvm.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f16 = call <2 x half> @llvm.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v4f16 = call <4 x half> @llvm.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v8f16 = call <8 x half> @llvm.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v16f16 = call <16 x half> @llvm.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f32 = call <2 x float> @llvm.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v4f32 = call <4 x float> @llvm.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v8f32 = call <8 x float> @llvm.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f64 = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v4f64 = call <4 x double> @llvm.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2bf16 = call <2 x bfloat> @llvm.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v4bf16 = call <4 x bfloat> @llvm.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v8bf16 = call <8 x bfloat> @llvm.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.v16bf16 = call <16 x bfloat> @llvm.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v16i1 = call <16 x i1> @llvm.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v8i1 = call <8 x i1> @llvm.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i1 = call <4 x i1> @llvm.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i1 = call <2 x i1> @llvm.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %splice.v2i128 = call <2 x i128> @llvm.vector.splice.v2i128(<2 x i128> zeroinitializer, <2 x i128> zeroinitializer, i32 1)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %splice.v16i8 = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1)
-  %splice.v32i8 = call <32 x i8> @llvm.experimental.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1)
-  %splice.v2i16 = call <2 x i16> @llvm.experimental.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1)
-  %splice.v4i16 = call <4 x i16> @llvm.experimental.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1)
-  %splice.v8i16 = call <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1)
-  %splice.v16i16 = call <16 x i16> @llvm.experimental.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1)
-  %splice.v4i32 = call <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
-  %splice.v8i32 = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1)
-  %splice.v2i64 = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1)
-  %splice.v4i64 = call <4 x i64> @llvm.experimental.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1)
-  %splice.v2f16 = call <2 x half> @llvm.experimental.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1)
-  %splice.v4f16 = call <4 x half> @llvm.experimental.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1)
-  %splice.v8f16 = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1)
-  %splice.v16f16 = call <16 x half> @llvm.experimental.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1)
-  %splice.v2f32 = call <2 x float> @llvm.experimental.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1)
-  %splice.v4f32 = call <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1)
-  %splice.v8f32 = call <8 x float> @llvm.experimental.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1)
-  %splice.v2f64 = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1)
-  %splice.v4f64 = call <4 x double> @llvm.experimental.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1)
-  %splice.v2bf16 = call <2 x bfloat> @llvm.experimental.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1)
-  %splice.v4bf16 = call <4 x bfloat> @llvm.experimental.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1)
-  %splice.v8bf16 = call <8 x bfloat> @llvm.experimental.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1)
-  %splice.v16bf16 = call <16 x bfloat> @llvm.experimental.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1)
-  %splice.v16i1 = call <16 x i1> @llvm.experimental.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1)
-  %splice.v8i1 =  call <8 x i1> @llvm.experimental.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1)
-  %splice.v4i1 = call <4 x i1> @llvm.experimental.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1)
-  %splice.v2i1 = call <2 x i1> @llvm.experimental.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1)
-  %splice.v2i128 = call <2 x i128> @llvm.experimental.vector.splice.v2i128(<2 x i128> zeroinitializer, <2 x i128> zeroinitializer, i32 1)
+  %splice.v16i8 = call <16 x i8> @llvm.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1)
+  %splice.v32i8 = call <32 x i8> @llvm.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1)
+  %splice.v2i16 = call <2 x i16> @llvm.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1)
+  %splice.v4i16 = call <4 x i16> @llvm.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1)
+  %splice.v8i16 = call <8 x i16> @llvm.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1)
+  %splice.v16i16 = call <16 x i16> @llvm.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1)
+  %splice.v4i32 = call <4 x i32> @llvm.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
+  %splice.v8i32 = call <8 x i32> @llvm.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1)
+  %splice.v2i64 = call <2 x i64> @llvm.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1)
+  %splice.v4i64 = call <4 x i64> @llvm.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1)
+  %splice.v2f16 = call <2 x half> @llvm.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1)
+  %splice.v4f16 = call <4 x half> @llvm.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1)
+  %splice.v8f16 = call <8 x half> @llvm.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1)
+  %splice.v16f16 = call <16 x half> @llvm.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1)
+  %splice.v2f32 = call <2 x float> @llvm.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1)
+  %splice.v4f32 = call <4 x float> @llvm.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1)
+  %splice.v8f32 = call <8 x float> @llvm.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1)
+  %splice.v2f64 = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1)
+  %splice.v4f64 = call <4 x double> @llvm.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1)
+  %splice.v2bf16 = call <2 x bfloat> @llvm.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1)
+  %splice.v4bf16 = call <4 x bfloat> @llvm.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1)
+  %splice.v8bf16 = call <8 x bfloat> @llvm.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1)
+  %splice.v16bf16 = call <16 x bfloat> @llvm.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1)
+  %splice.v16i1 = call <16 x i1> @llvm.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1)
+  %splice.v8i1 =  call <8 x i1> @llvm.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1)
+  %splice.v4i1 = call <4 x i1> @llvm.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1)
+  %splice.v2i1 = call <2 x i1> @llvm.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1)
+  %splice.v2i128 = call <2 x i128> @llvm.vector.splice.v2i128(<2 x i128> zeroinitializer, <2 x i128> zeroinitializer, i32 1)
   ret void
 }
 
-declare <2 x i1> @llvm.experimental.vector.splice.v2i1(<2 x i1>, <2 x i1>, i32)
-declare <4 x i1> @llvm.experimental.vector.splice.v4i1(<4 x i1>, <4 x i1>, i32)
-declare <8 x i1> @llvm.experimental.vector.splice.v8i1(<8 x i1>, <8 x i1>, i32)
-declare <16 x i1> @llvm.experimental.vector.splice.v16i1(<16 x i1>, <16 x i1>, i32)
-declare <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
-declare <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
-declare <32 x i8> @llvm.experimental.vector.splice.v32i8(<32 x i8>, <32 x i8>, i32)
-declare <2 x i16> @llvm.experimental.vector.splice.v2i16(<2 x i16>, <2 x i16>, i32)
-declare <4 x i16> @llvm.experimental.vector.splice.v4i16(<4 x i16>, <4 x i16>, i32)
-declare <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16>, <8 x i16>, i32)
-declare <16 x i16> @llvm.experimental.vector.splice.v16i16(<16 x i16>, <16 x i16>, i32)
-declare <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32)
-declare <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
-declare <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64>, <2 x i64>, i32)
-declare <4 x i64> @llvm.experimental.vector.splice.v4i64(<4 x i64>, <4 x i64>, i32)
-declare <2 x half> @llvm.experimental.vector.splice.v2f16(<2 x half>, <2 x half>, i32)
-declare <4 x half> @llvm.experimental.vector.splice.v4f16(<4 x half>, <4 x half>, i32)
-declare <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half>, <8 x half>, i32)
-declare <16 x half> @llvm.experimental.vector.splice.v16f16(<16 x half>, <16 x half>, i32)
-declare <2 x bfloat> @llvm.experimental.vector.splice.v2bf16(<2 x bfloat>, <2 x bfloat>, i32)
-declare <4 x bfloat> @llvm.experimental.vector.splice.v4bf16(<4 x bfloat>, <4 x bfloat>, i32)
-declare <8 x bfloat> @llvm.experimental.vector.splice.v8bf16(<8 x bfloat>, <8 x bfloat>, i32)
-declare <16 x bfloat> @llvm.experimental.vector.splice.v16bf16(<16 x bfloat>, <16 x bfloat>, i32)
-declare <2 x float> @llvm.experimental.vector.splice.v2f32(<2 x float>, <2 x float>, i32)
-declare <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float>, <4 x float>, i32)
-declare <8 x float> @llvm.experimental.vector.splice.v8f32(<8 x float>, <8 x float>, i32)
-declare <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
-declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
-declare <4 x double> @llvm.experimental.vector.splice.v4f64(<4 x double>, <4 x double>, i32)
-declare <2 x i128> @llvm.experimental.vector.splice.v2i128(<2 x i128>, <2 x i128>, i32)
+declare <2 x i1> @llvm.vector.splice.v2i1(<2 x i1>, <2 x i1>, i32)
+declare <4 x i1> @llvm.vector.splice.v4i1(<4 x i1>, <4 x i1>, i32)
+declare <8 x i1> @llvm.vector.splice.v8i1(<8 x i1>, <8 x i1>, i32)
+declare <16 x i1> @llvm.vector.splice.v16i1(<16 x i1>, <16 x i1>, i32)
+declare <2 x i8> @llvm.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
+declare <16 x i8> @llvm.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
+declare <32 x i8> @llvm.vector.splice.v32i8(<32 x i8>, <32 x i8>, i32)
+declare <2 x i16> @llvm.vector.splice.v2i16(<2 x i16>, <2 x i16>, i32)
+declare <4 x i16> @llvm.vector.splice.v4i16(<4 x i16>, <4 x i16>, i32)
+declare <8 x i16> @llvm.vector.splice.v8i16(<8 x i16>, <8 x i16>, i32)
+declare <16 x i16> @llvm.vector.splice.v16i16(<16 x i16>, <16 x i16>, i32)
+declare <4 x i32> @llvm.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32)
+declare <8 x i32> @llvm.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
+declare <2 x i64> @llvm.vector.splice.v2i64(<2 x i64>, <2 x i64>, i32)
+declare <4 x i64> @llvm.vector.splice.v4i64(<4 x i64>, <4 x i64>, i32)
+declare <2 x half> @llvm.vector.splice.v2f16(<2 x half>, <2 x half>, i32)
+declare <4 x half> @llvm.vector.splice.v4f16(<4 x half>, <4 x half>, i32)
+declare <8 x half> @llvm.vector.splice.v8f16(<8 x half>, <8 x half>, i32)
+declare <16 x half> @llvm.vector.splice.v16f16(<16 x half>, <16 x half>, i32)
+declare <2 x bfloat> @llvm.vector.splice.v2bf16(<2 x bfloat>, <2 x bfloat>, i32)
+declare <4 x bfloat> @llvm.vector.splice.v4bf16(<4 x bfloat>, <4 x bfloat>, i32)
+declare <8 x bfloat> @llvm.vector.splice.v8bf16(<8 x bfloat>, <8 x bfloat>, i32)
+declare <16 x bfloat> @llvm.vector.splice.v16bf16(<16 x bfloat>, <16 x bfloat>, i32)
+declare <2 x float> @llvm.vector.splice.v2f32(<2 x float>, <2 x float>, i32)
+declare <4 x float> @llvm.vector.splice.v4f32(<4 x float>, <4 x float>, i32)
+declare <8 x float> @llvm.vector.splice.v8f32(<8 x float>, <8 x float>, i32)
+declare <16 x float> @llvm.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
+declare <2 x double> @llvm.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
+declare <4 x double> @llvm.vector.splice.v4f64(<4 x double>, <4 x double>, i32)
+declare <2 x i128> @llvm.vector.splice.v2i128(<2 x i128>, <2 x i128>, i32)
 
 attributes #0 = { "target-features"="+bf16" }
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 7ce3021b0093..15c278b060c9 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -270,122 +270,122 @@ declare <vscale x 4 x i32> @llvm.cttz.nxv4i32(<vscale x 4 x i32>, i1)
 
 define void @vector_reverse() #0 {
 ; CHECK-LABEL: 'vector_reverse'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'vector_reverse'
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 
-  %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-  %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-  %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-  %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-  %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-  %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-  %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-  %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-  %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-  %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-  %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
-  %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
-  %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
-  %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
-  %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
-  %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
-  %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
-  %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
-  %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
-  %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
-  %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
-  %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
-  %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
-  %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-  %reverse_nxv8i1 =  call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-  %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-  %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+  %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+  %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+  %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+  %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+  %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+  %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+  %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+  %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+  %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+  %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+  %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
+  %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
+  %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
+  %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
+  %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
+  %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
+  %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
+  %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
+  %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
+  %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
+  %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
+  %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
+  %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
+  %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+  %reverse_nxv8i1 =  call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+  %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+  %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
   ret void
 }
-declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8>)
-declare <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half>)
-declare <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float>)
-declare <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat>)
-declare <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat>)
-declare <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat>)
-declare <vscale x 16 x bfloat> @llvm.experimental.vector.reverse.nxv16bf16(<vscale x 16 x bfloat>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8>)
+declare <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half>)
+declare <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float>)
+declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat>)
+declare <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat>)
+declare <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1>)
 
 define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) {
 ; CHECK-LABEL: 'unsupported_fp_ops'
@@ -450,236 +450,236 @@ declare <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float>)
 
 define void @vector_splice() #0 {
 ; CHECK-LABEL: 'vector_splice'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.experimental.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'vector_splice'
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.experimental.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Invalid cost for instruction: %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 
-  %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-  %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-  %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-  %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-  %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-  %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-  %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-  %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-  %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-  %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-  %splice_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
-  %splice_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
-  %splice_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
-  %splice_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
-  %splice_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
-  %splice_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
-  %splice_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
-  %splice_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
-  %splice_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
-  %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
-  %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
-  %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
-  %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
-  %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-  %splice_nxv8i1 =  call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-  %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-  %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+  %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+  %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+  %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+  %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+  %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+  %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+  %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+  %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+  %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+  %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+  %splice_nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
+  %splice_nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
+  %splice_nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
+  %splice_nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
+  %splice_nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
+  %splice_nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
+  %splice_nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
+  %splice_nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
+  %splice_nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
+  %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
+  %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
+  %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
+  %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
+  %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+  %splice_nxv8i1 =  call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+  %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+  %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
 ;; negative Index
-  %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-  %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-  %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-  %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-  %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-  %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-  %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-  %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-  %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-  %splice_nxv1i64_neg= call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-  %splice_nxv2i64_neg= call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-  %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-  %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-  %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-  %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-  %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-  %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-  %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-  %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-  %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-  %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-  %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-  %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-  %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-  %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.experimental.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
-  %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
-  %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
-  %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
-  %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
-  %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
-  %splice_nxv8i1_neg =  call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
-  %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
-  %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
-  %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
+  %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+  %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+  %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+  %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+  %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+  %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+  %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+  %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+  %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+  %splice_nxv1i64_neg= call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+  %splice_nxv2i64_neg= call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+  %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+  %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+  %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+  %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+  %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+  %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+  %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+  %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+  %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+  %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+  %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+  %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+  %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+  %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+  %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+  %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+  %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+  %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+  %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
+  %splice_nxv8i1_neg =  call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
+  %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
+  %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
+  %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
   ret void
 }
 
-declare <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32)
-declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
-declare <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
-declare <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
-declare <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
-declare <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
-declare <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
-declare <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
-declare <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
-declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
-declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
-declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
-declare <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
-declare <vscale x 1 x bfloat> @llvm.experimental.vector.splice.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, i32)
-declare <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i32)
-declare <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i32)
-declare <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
-declare <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, i32)
-declare <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
-declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
-declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
-declare <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
-declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
-declare <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
-declare <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
+declare <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32)
+declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
+declare <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
+declare <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+declare <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
+declare <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
+declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
+declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
+declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
+declare <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, i32)
+declare <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i32)
+declare <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i32)
+declare <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, i32)
+declare <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
+declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
+declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
+declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
 
 define void @get_lane_mask() #0 {
 ; CHECK-LABEL: 'get_lane_mask'
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
index 306277e46fa5..1dde88f366a3 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=128 < %s | FileCheck %s
 ; Check that we don't crash querying costs when vectors are not enabled.
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64
 
@@ -252,8 +252,8 @@ define i32 @fdiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = fdiv <2 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F16 = fdiv <4 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F16 = fdiv <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F16 = fdiv <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32F16 = fdiv <32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fdiv <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fdiv <32 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fdiv <vscale x 1 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fdiv <vscale x 2 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fdiv <vscale x 4 x half> undef, undef
@@ -263,8 +263,8 @@ define i32 @fdiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F32 = fdiv <1 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = fdiv <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = fdiv <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = fdiv <8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = fdiv <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fdiv <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fdiv <16 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fdiv <vscale x 1 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fdiv <vscale x 2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fdiv <vscale x 4 x float> undef, undef
@@ -272,8 +272,8 @@ define i32 @fdiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV16F32 = fdiv <vscale x 16 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = fdiv <1 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = fdiv <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = fdiv <4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = fdiv <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fdiv <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fdiv <8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fdiv <vscale x 1 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fdiv <vscale x 2 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %NXV4F64 = fdiv <vscale x 4 x double> undef, undef
@@ -332,8 +332,8 @@ define i32 @frem() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F16 = frem <2 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F16 = frem <4 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F16 = frem <8 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16F16 = frem <16 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32F16 = frem <32 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV1F16 = frem <vscale x 1 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV2F16 = frem <vscale x 2 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV4F16 = frem <vscale x 4 x half> undef, undef
@@ -343,8 +343,8 @@ define i32 @frem() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = frem <1 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = frem <2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = frem <4 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F32 = frem <8 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16F32 = frem <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = frem <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV1F32 = frem <vscale x 1 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV2F32 = frem <vscale x 2 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV4F32 = frem <vscale x 4 x float> undef, undef
@@ -352,8 +352,8 @@ define i32 @frem() {
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV16F32 = frem <vscale x 16 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = frem <1 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = frem <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = frem <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F64 = frem <8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV1F64 = frem <vscale x 1 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV2F64 = frem <vscale x 2 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV4F64 = frem <vscale x 4 x double> undef, undef
@@ -492,8 +492,8 @@ define i32 @fcopysign() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = call <32 x half> @llvm.copysign.v32f16(<32 x half> undef, <32 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32F16 = call <32 x half> @llvm.copysign.v32f16(<32 x half> undef, <32 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef)
@@ -503,8 +503,8 @@ define i32 @fcopysign() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.copysign.v1f32(<1 x float> undef, <1 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = call <vscale x 1 x float> @llvm.copysign.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = call <vscale x 2 x float> @llvm.copysign.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F32 = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef)
@@ -512,8 +512,8 @@ define i32 @fcopysign() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16F32 = call <vscale x 16 x float> @llvm.copysign.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.copysign.v1f64(<1 x double> undef, <1 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = call <vscale x 1 x double> @llvm.copysign.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F64 = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F64 = call <vscale x 4 x double> @llvm.copysign.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef)
@@ -572,8 +572,8 @@ define i32 @fma() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> undef, <32 x half> undef, <32 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> undef, <32 x half> undef, <32 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.fma.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.fma.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
@@ -583,8 +583,8 @@ define i32 @fma() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.fma.v1f32(<1 x float> undef, <1 x float> undef, <1 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = call <vscale x 1 x float> @llvm.fma.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
@@ -592,8 +592,8 @@ define i32 @fma() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16F32 = call <vscale x 16 x float> @llvm.fma.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.fma.v1f64(<1 x double> undef, <1 x double> undef, <1 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = call <vscale x 1 x double> @llvm.fma.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
@@ -651,15 +651,15 @@ define void @fmuladd() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = call <8 x half> @llvm.fmuladd.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %7 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %9 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %10 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %11 = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %10 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %11 = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %12 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %13 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %14 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %15 = call <16 x double> @llvm.fmuladd.v16f64(<16 x double> undef, <16 x double> undef, <16 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %13 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x double> @llvm.fmuladd.v16f64(<16 x double> undef, <16 x double> undef, <16 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 1 x half> @llvm.fmuladd.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 2 x half> @llvm.fmuladd.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-int.ll b/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
index 00f2cd7b63a4..b4afbb513166 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mcpu=sifive-x280 -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s --check-prefix=SIFIVE-X280
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mcpu=sifive-x280 < %s | FileCheck %s --check-prefix=SIFIVE-X280
 ; Check that we don't crash querying costs when vectors are not enabled.
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64
 
@@ -709,8 +709,8 @@ define i32 @udiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = udiv <2 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = udiv <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = udiv <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = udiv <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = udiv <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = udiv <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = udiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = udiv <vscale x 1 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = udiv <vscale x 2 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = udiv <vscale x 4 x i16> undef, undef
@@ -720,8 +720,8 @@ define i32 @udiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = udiv <1 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = udiv <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = udiv <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = udiv <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = udiv <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = udiv <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = udiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = udiv <vscale x 1 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = udiv <vscale x 2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = udiv <vscale x 4 x i32> undef, undef
@@ -729,8 +729,8 @@ define i32 @udiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = udiv <vscale x 16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = udiv <1 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = udiv <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = udiv <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = udiv <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = udiv <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = udiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = udiv <vscale x 1 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = udiv <vscale x 2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = udiv <vscale x 4 x i64> undef, undef
@@ -825,8 +825,8 @@ define i32 @urem() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = urem <2 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = urem <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = urem <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = urem <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = urem <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = urem <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = urem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = urem <vscale x 1 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = urem <vscale x 2 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = urem <vscale x 4 x i16> undef, undef
@@ -836,8 +836,8 @@ define i32 @urem() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = urem <1 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = urem <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = urem <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = urem <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = urem <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = urem <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = urem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = urem <vscale x 1 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = urem <vscale x 2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = urem <vscale x 4 x i32> undef, undef
@@ -845,8 +845,8 @@ define i32 @urem() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = urem <vscale x 16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = urem <1 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = urem <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = urem <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = urem <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = urem <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = urem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = urem <vscale x 1 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = urem <vscale x 2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = urem <vscale x 4 x i64> undef, undef
@@ -941,8 +941,8 @@ define i32 @sdiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = sdiv <2 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = sdiv <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = sdiv <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = sdiv <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = sdiv <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = sdiv <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = sdiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = sdiv <vscale x 1 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = sdiv <vscale x 2 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = sdiv <vscale x 4 x i16> undef, undef
@@ -952,8 +952,8 @@ define i32 @sdiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = sdiv <1 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = sdiv <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = sdiv <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = sdiv <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = sdiv <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = sdiv <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = sdiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = sdiv <vscale x 1 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = sdiv <vscale x 2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = sdiv <vscale x 4 x i32> undef, undef
@@ -961,8 +961,8 @@ define i32 @sdiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = sdiv <vscale x 16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = sdiv <1 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = sdiv <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = sdiv <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = sdiv <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = sdiv <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = sdiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = sdiv <vscale x 1 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = sdiv <vscale x 2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = sdiv <vscale x 4 x i64> undef, undef
@@ -1057,8 +1057,8 @@ define i32 @srem() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = srem <2 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = srem <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = srem <8 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = srem <16 x i16> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = srem <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = srem <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = srem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = srem <vscale x 1 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = srem <vscale x 2 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = srem <vscale x 4 x i16> undef, undef
@@ -1068,8 +1068,8 @@ define i32 @srem() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = srem <1 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = srem <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = srem <4 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = srem <8 x i32> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = srem <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = srem <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = srem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = srem <vscale x 1 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = srem <vscale x 2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = srem <vscale x 4 x i32> undef, undef
@@ -1077,8 +1077,8 @@ define i32 @srem() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = srem <vscale x 16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = srem <1 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = srem <2 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = srem <4 x i64> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = srem <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = srem <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = srem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = srem <vscale x 1 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = srem <vscale x 2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = srem <vscale x 4 x i64> undef, undef
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
index 7cc7cff0e6e8..e068ab638d3a 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
@@ -78,148 +78,148 @@ declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x
 
 define void @vector_reverse() {
 ; CHECK-LABEL: 'vector_reverse'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 332 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 332 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'vector_reverse'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-  %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-  %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-  %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-  %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-  %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-  %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-  %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-  %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-  %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-  %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
-  %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
-  %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
-  %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-  %reverse_nxv8i1 =  call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-  %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-  %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+  %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+  %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+  %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+  %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+  %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+  %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+  %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+  %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+  %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+  %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+  %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
+  %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
+  %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
+  %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+  %reverse_nxv8i1 =  call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+  %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+  %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
   ret void
 }
 
-declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8>)
-declare <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64>)
-declare <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64>)
-declare <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8>)
+declare <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64>)
+declare <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64>)
+declare <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1>)
 
 
 define void @vector_splice() {
 ; CHECK-LABEL: 'vector_splice'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'vector_splice'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-  %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-  %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-  %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-  %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-  %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-  %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-  %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-  %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-  %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-  %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-  %splice_nxv8i1 =  call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-  %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-  %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+  %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+  %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+  %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+  %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+  %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+  %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+  %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+  %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+  %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+  %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+  %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+  %splice_nxv8i1 =  call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+  %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+  %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
   ret void
 }
 
-declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
-declare <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
-declare <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
-declare <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
-declare <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
+declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
+declare <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+declare <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
diff --git a/llvm/test/Analysis/CostModel/RISCV/splice.ll b/llvm/test/Analysis/CostModel/RISCV/splice.ll
index c70c879dba5a..9acccef9c4f6 100644
--- a/llvm/test/Analysis/CostModel/RISCV/splice.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/splice.ll
@@ -4,220 +4,220 @@
 
 define void @vector_splice() {
 ; CHECK-LABEL: 'vector_splice'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.experimental.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.experimental.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.experimental.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.experimental.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32f16 = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64f16 = call <vscale x 64 x half> @llvm.experimental.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f32 = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv16f32 = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv32f32 = call <vscale x 32 x float> @llvm.experimental.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv64f32 = call <vscale x 64 x float> @llvm.experimental.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f64 = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv8f64 = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16f64 = call <vscale x 16 x double> @llvm.experimental.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32f64 = call <vscale x 32 x double> @llvm.experimental.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64f64 = call <vscale x 64 x double> @llvm.experimental.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32f16 = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64f16 = call <vscale x 64 x half> @llvm.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f32 = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv16f32 = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv32f32 = call <vscale x 32 x float> @llvm.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv64f32 = call <vscale x 64 x float> @llvm.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f64 = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv8f64 = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16f64 = call <vscale x 16 x double> @llvm.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32f64 = call <vscale x 32 x double> @llvm.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64f64 = call <vscale x 64 x double> @llvm.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'vector_splice'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.experimental.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.experimental.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.experimental.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.experimental.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32f16 = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64f16 = call <vscale x 64 x half> @llvm.experimental.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f32 = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f32 = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32f32 = call <vscale x 32 x float> @llvm.experimental.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64f32 = call <vscale x 64 x float> @llvm.experimental.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f64 = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f64 = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16f64 = call <vscale x 16 x double> @llvm.experimental.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32f64 = call <vscale x 32 x double> @llvm.experimental.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64f64 = call <vscale x 64 x double> @llvm.experimental.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32f16 = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64f16 = call <vscale x 64 x half> @llvm.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f32 = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f32 = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32f32 = call <vscale x 32 x float> @llvm.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64f32 = call <vscale x 64 x float> @llvm.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f64 = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f64 = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16f64 = call <vscale x 16 x double> @llvm.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32f64 = call <vscale x 32 x double> @llvm.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64f64 = call <vscale x 64 x double> @llvm.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
-  %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
-  %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
-  %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
-  %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-  %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-  %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
+  %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
+  %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
+  %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
+  %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
+  %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+  %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+  %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
 
-  %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-  %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-  %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-  %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-  %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-  %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
-  %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.experimental.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
+  %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+  %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+  %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+  %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+  %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+  %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
+  %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
 
-  %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
-  %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
-  %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-  %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-  %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
-  %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.experimental.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
-  %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.experimental.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
+  %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
+  %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
+  %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+  %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+  %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
+  %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
+  %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
 
-  %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-  %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-  %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-  %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
-  %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
-  %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
-  %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.experimental.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+  %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+  %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+  %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+  %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
+  %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
+  %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
+  %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
 
-  %splice.nxv1f16 = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-  %splice.nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-  %splice.nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-  %splice.nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-  %splice.nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-  %splice.nxv32f16 = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
-  %splice.nxv64f16 = call <vscale x 64 x half> @llvm.experimental.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
+  %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+  %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+  %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+  %splice.nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+  %splice.nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+  %splice.nxv32f16 = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
+  %splice.nxv64f16 = call <vscale x 64 x half> @llvm.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
 
-  %splice.nxv1f32 = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-  %splice.nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-  %splice.nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-  %splice.nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-  %splice.nxv16f32 = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
-  %splice.nxv32f32 = call <vscale x 32 x float> @llvm.experimental.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
-  %splice.nxv64f32 = call <vscale x 64 x float> @llvm.experimental.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
+  %splice.nxv1f32 = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+  %splice.nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+  %splice.nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+  %splice.nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+  %splice.nxv16f32 = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
+  %splice.nxv32f32 = call <vscale x 32 x float> @llvm.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
+  %splice.nxv64f32 = call <vscale x 64 x float> @llvm.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
 
-  %splice.nxv1f64 = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-  %splice.nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-  %splice.nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-  %splice.nxv8f64 = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
-  %splice.nxv16f64 = call <vscale x 16 x double> @llvm.experimental.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
-  %splice.nxv32f64 = call <vscale x 32 x double> @llvm.experimental.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
-  %splice.nxv64f64 = call <vscale x 64 x double> @llvm.experimental.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
+  %splice.nxv1f64 = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+  %splice.nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+  %splice.nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+  %splice.nxv8f64 = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
+  %splice.nxv16f64 = call <vscale x 16 x double> @llvm.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
+  %splice.nxv32f64 = call <vscale x 32 x double> @llvm.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
+  %splice.nxv64f64 = call <vscale x 64 x double> @llvm.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
 
   ret void
 }
 
-declare <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, i32)
-declare <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32)
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
-declare <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
-declare <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, i32)
+declare <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, i32)
+declare <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
+declare <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, i32)
 
-declare <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
-declare <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
-declare <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
-declare <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, i32)
-declare <vscale x 64 x i16> @llvm.experimental.vector.splice.nxv64i16(<vscale x 64 x i16>, <vscale x 64 x i16>, i32)
+declare <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
+declare <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+declare <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
+declare <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, i32)
+declare <vscale x 64 x i16> @llvm.vector.splice.nxv64i16(<vscale x 64 x i16>, <vscale x 64 x i16>, i32)
 
-declare <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, i32)
-declare <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, i32)
-declare <vscale x 32 x i32> @llvm.experimental.vector.splice.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, i32)
-declare <vscale x 64 x i32> @llvm.experimental.vector.splice.nxv64i32(<vscale x 64 x i32>, <vscale x 64 x i32>, i32)
+declare <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, i32)
+declare <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, i32)
+declare <vscale x 32 x i32> @llvm.vector.splice.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, i32)
+declare <vscale x 64 x i32> @llvm.vector.splice.nxv64i32(<vscale x 64 x i32>, <vscale x 64 x i32>, i32)
 
-declare <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
-declare <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
-declare <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, i32)
-declare <vscale x 16 x i64> @llvm.experimental.vector.splice.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, i32)
-declare <vscale x 32 x i64> @llvm.experimental.vector.splice.nxv32i64(<vscale x 32 x i64>, <vscale x 32 x i64>, i32)
-declare <vscale x 64 x i64> @llvm.experimental.vector.splice.nxv64i64(<vscale x 64 x i64>, <vscale x 64 x i64>, i32)
+declare <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
+declare <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, i32)
+declare <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, i32)
+declare <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64>, <vscale x 32 x i64>, i32)
+declare <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64>, <vscale x 64 x i64>, i32)
 
-declare <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
-declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
-declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
-declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
-declare <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
-declare <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half>, <vscale x 32 x half>, i32)
-declare <vscale x 64 x half> @llvm.experimental.vector.splice.nxv64f16(<vscale x 64 x half>, <vscale x 64 x half>, i32)
+declare <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
+declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
+declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
+declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
+declare <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half>, <vscale x 32 x half>, i32)
+declare <vscale x 64 x half> @llvm.vector.splice.nxv64f16(<vscale x 64 x half>, <vscale x 64 x half>, i32)
 
-declare <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
-declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
-declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
-declare <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
-declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
-declare <vscale x 32 x float> @llvm.experimental.vector.splice.nxv32f32(<vscale x 32 x float>, <vscale x 32 x float>, i32)
-declare <vscale x 64 x float> @llvm.experimental.vector.splice.nxv64f32(<vscale x 64 x float>, <vscale x 64 x float>, i32)
+declare <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
+declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
+declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
+declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 32 x float> @llvm.vector.splice.nxv32f32(<vscale x 32 x float>, <vscale x 32 x float>, i32)
+declare <vscale x 64 x float> @llvm.vector.splice.nxv64f32(<vscale x 64 x float>, <vscale x 64 x float>, i32)
 
-declare <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
-declare <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
-declare <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double>, <vscale x 8 x double>, i32)
-declare <vscale x 16 x double> @llvm.experimental.vector.splice.nxv16f64(<vscale x 16 x double>, <vscale x 16 x double>, i32)
-declare <vscale x 32 x double> @llvm.experimental.vector.splice.nxv32f64(<vscale x 32 x double>, <vscale x 32 x double>, i32)
-declare <vscale x 64 x double> @llvm.experimental.vector.splice.nxv64f64(<vscale x 64 x double>, <vscale x 64 x double>, i32)
+declare <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
+declare <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double>, <vscale x 8 x double>, i32)
+declare <vscale x 16 x double> @llvm.vector.splice.nxv16f64(<vscale x 16 x double>, <vscale x 16 x double>, i32)
+declare <vscale x 32 x double> @llvm.vector.splice.nxv32f64(<vscale x 32 x double>, <vscale x 32 x double>, i32)
+declare <vscale x 64 x double> @llvm.vector.splice.nxv64f64(<vscale x 64 x double>, <vscale x 64 x double>, i32)
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll b/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll
new file mode 100644
index 000000000000..8c7df4bdf5a5
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes='print<access-info>' -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+define void @forward_dep_known_safe_due_to_backedge_taken_count(ptr %A) {
+; CHECK-LABEL: 'forward_dep_known_safe_due_to_backedge_taken_count'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %A.511= getelementptr inbounds i32, ptr %A, i64 511
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.mul.2 = shl nuw nsw i64 %iv, 1
+  %gep.mul.2 = getelementptr inbounds i32, ptr %A.511, i64 %iv.mul.2
+  %l = load i32, ptr %gep.mul.2, align 4
+  %add = add nsw i32 %l, 5
+  %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+  store i32 %add, ptr %gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 256
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @forward_dep_not_known_safe_due_to_backedge_taken_count(ptr %A) {
+; CHECK-LABEL: 'forward_dep_not_known_safe_due_to_backedge_taken_count'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        Forward:
+; CHECK-NEXT:            %l = load i32, ptr %gep.mul.2, align 4 ->
+; CHECK-NEXT:            store i32 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %A.510 = getelementptr inbounds i32, ptr %A, i64 510
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.mul.2 = shl nuw nsw i64 %iv, 1
+  %gep.mul.2 = getelementptr inbounds i32, ptr %A.510, i64 %iv.mul.2
+  %l = load i32, ptr %gep.mul.2, align 4
+  %add = add nsw i32 %l, 5
+  %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+  store i32 %add, ptr %gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 256
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @unknown_dep_known_safe_due_to_backedge_taken_count(ptr %A) {
+; CHECK-LABEL: 'unknown_dep_known_safe_due_to_backedge_taken_count'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %A.511 = getelementptr inbounds i32, ptr %A, i64 511
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.mul.2 = shl nuw nsw i64 %iv, 1
+  %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+  %l = load i32, ptr %gep, align 4
+  %add = add nsw i32 %l, 5
+  %gep.mul.2 = getelementptr inbounds i32, ptr %A.511, i64 %iv.mul.2
+  store i32 %add, ptr %gep.mul.2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 256
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @unknown_dep_not_known_safe_due_to_backedge_taken_count(ptr %A) {
+; CHECK-LABEL: 'unknown_dep_not_known_safe_due_to_backedge_taken_count'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:            %l = load i32, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i32 %add, ptr %gep.mul.2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %A.510 = getelementptr inbounds i32, ptr %A, i64 510
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.mul.2 = shl nuw nsw i64 %iv, 1
+  %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+  %l = load i32, ptr %gep, align 4
+  %add = add nsw i32 %l, 5
+  %gep.mul.2 = getelementptr inbounds i32, ptr %A.510, i64 %iv.mul.2
+  store i32 %add, ptr %gep.mul.2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 256
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-forward.ll b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-forward.ll
index 51755314896b..5f4c732dc19d 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-forward.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-forward.ll
@@ -8,10 +8,9 @@ declare void @llvm.assume(i1)
 define void @different_non_constant_strides_known_forward(ptr %A) {
 ; CHECK-LABEL: 'different_non_constant_strides_known_forward'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:        Forward:
 ; CHECK-NEXT:            %l = load i32, ptr %gep.mul.2, align 4 ->
 ; CHECK-NEXT:            store i32 %add, ptr %gep, align 4
 ; CHECK-EMPTY:
@@ -45,10 +44,9 @@ exit:
 define void @different_non_constant_strides_known_forward_min_distance_3(ptr %A) {
 ; CHECK-LABEL: 'different_non_constant_strides_known_forward_min_distance_3'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:        Forward:
 ; CHECK-NEXT:            %l = load i32, ptr %gep.mul.2, align 4 ->
 ; CHECK-NEXT:            store i32 %add, ptr %gep, align 4
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/ValueTracking/phi-known-bits.ll b/llvm/test/Analysis/ValueTracking/phi-known-bits.ll
index e5b8ba151e04..7b5e143fecfd 100644
--- a/llvm/test/Analysis/ValueTracking/phi-known-bits.ll
+++ b/llvm/test/Analysis/ValueTracking/phi-known-bits.ll
@@ -932,9 +932,11 @@ define i1 @recursiveGEP_withPtrSub_scalableGEP(ptr %val1) {
 ; CHECK-NEXT:    br label [[WHILE_COND_I:%.*]]
 ; CHECK:       while.cond.i:
 ; CHECK-NEXT:    [[A_PN_I:%.*]] = phi ptr [ [[TEST_0_I:%.*]], [[WHILE_COND_I]] ], [ [[VAL1:%.*]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TEST_0_I]] = getelementptr <vscale x 16 x i8>, ptr [[A_PN_I]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[TEST_0_I]], align 1
-; CHECK-NEXT:    [[CMP3_NOT_I:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TEST_0_I]] = getelementptr i8, ptr [[A_PN_I]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TEST_0_I]], align 1
+; CHECK-NEXT:    [[CMP3_NOT_I:%.*]] = icmp eq i8 [[TMP2]], 0
 ; CHECK-NEXT:    br i1 [[CMP3_NOT_I]], label [[WHILE_END_I:%.*]], label [[WHILE_COND_I]]
 ; CHECK:       while.end.i:
 ; CHECK-NEXT:    [[BOOL:%.*]] = icmp eq ptr [[TEST_0_I]], [[VAL1]]
@@ -964,9 +966,11 @@ define i1 @recursiveGEP_withPtrSub_scalableGEP_inbounds(ptr %val1) {
 ; CHECK-NEXT:    br label [[WHILE_COND_I:%.*]]
 ; CHECK:       while.cond.i:
 ; CHECK-NEXT:    [[A_PN_I:%.*]] = phi ptr [ [[TEST_0_I:%.*]], [[WHILE_COND_I]] ], [ [[VAL1:%.*]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TEST_0_I]] = getelementptr inbounds <vscale x 16 x i8>, ptr [[A_PN_I]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[TEST_0_I]], align 1
-; CHECK-NEXT:    [[CMP3_NOT_I:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TEST_0_I]] = getelementptr inbounds i8, ptr [[A_PN_I]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TEST_0_I]], align 1
+; CHECK-NEXT:    [[CMP3_NOT_I:%.*]] = icmp eq i8 [[TMP2]], 0
 ; CHECK-NEXT:    br i1 [[CMP3_NOT_I]], label [[WHILE_END_I:%.*]], label [[WHILE_COND_I]]
 ; CHECK:       while.end.i:
 ; CHECK-NEXT:    [[BOOL:%.*]] = icmp eq ptr [[TEST_0_I]], [[VAL1]]
diff --git a/llvm/test/Assembler/thinlto-summary.ll b/llvm/test/Assembler/thinlto-summary.ll
index 05dad2c7acad..e0d866da0d8a 100644
--- a/llvm/test/Assembler/thinlto-summary.ll
+++ b/llvm/test/Assembler/thinlto-summary.ll
@@ -46,28 +46,32 @@
 ^18 = gv: (guid: 17, summaries: (alias: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1), aliasee: ^14)))
 
 ; Test all types of TypeIdInfo on function summaries.
-^19 = gv: (guid: 18, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 4, typeIdInfo: (typeTests: (^25, ^27)))))
-^20 = gv: (guid: 19, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 8, typeIdInfo: (typeTestAssumeVCalls: (vFuncId: (^28, offset: 16))))))
-^21 = gv: (guid: 20, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 5, typeIdInfo: (typeCheckedLoadVCalls: (vFuncId: (^26, offset: 16))))))
-^22 = gv: (guid: 21, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 15, typeIdInfo: (typeTestAssumeConstVCalls: ((vFuncId: (^28, offset: 16), args: (42)), (vFuncId: (^28, offset: 24)))))))
-^23 = gv: (guid: 22, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 5, typeIdInfo: (typeCheckedLoadConstVCalls: ((vFuncId: (^29, offset: 16), args: (42)))))))
+^19 = gv: (guid: 18, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 4, typeIdInfo: (typeTests: (^26, ^28)))))
+^20 = gv: (guid: 19, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 8, typeIdInfo: (typeTestAssumeVCalls: (vFuncId: (^29, offset: 16))))))
+^21 = gv: (guid: 20, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 5, typeIdInfo: (typeCheckedLoadVCalls: (vFuncId: (^27, offset: 16))))))
+^22 = gv: (guid: 21, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 15, typeIdInfo: (typeTestAssumeConstVCalls: ((vFuncId: (^29, offset: 16), args: (42)), (vFuncId: (^29, offset: 24)))))))
+^23 = gv: (guid: 22, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 5, typeIdInfo: (typeCheckedLoadConstVCalls: ((vFuncId: (^30, offset: 16), args: (42)))))))
 
 ; Function summary with an import type of declaration
 ^24 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, importType: declaration), insts: 5)))
 
+; GUID that are 64-bit
+
+^25 = gv: (guid: 9123456789101112131, summaries: (function: (module: ^0, flags: (linkage: internal, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, importType: definition), insts: 1)))
+
 ; Test TypeId summaries:
 
-^25 = typeid: (name: "_ZTS1C", summary: (typeTestRes: (kind: single, sizeM1BitWidth: 0)))
+^26 = typeid: (name: "_ZTS1C", summary: (typeTestRes: (kind: single, sizeM1BitWidth: 0)))
 ; Test TypeId with other optional fields (alignLog2/sizeM1/bitMask/inlineBits)
-^26 = typeid: (name: "_ZTS1B", summary: (typeTestRes: (kind: inline, sizeM1BitWidth: 0, alignLog2: 1, sizeM1: 2, bitMask: 3, inlineBits: 4)))
+^27 = typeid: (name: "_ZTS1B", summary: (typeTestRes: (kind: inline, sizeM1BitWidth: 0, alignLog2: 1, sizeM1: 2, bitMask: 3, inlineBits: 4)))
 ; Test the AllOnes resolution, and all kinds of WholeProgramDevirtResolution
 ; types, including all optional resolution by argument kinds.
-^27 = typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: allOnes, sizeM1BitWidth: 7), wpdResolutions: ((offset: 0, wpdRes: (kind: branchFunnel)), (offset: 8, wpdRes: (kind: singleImpl, singleImplName: "_ZN1A1nEi")), (offset: 16, wpdRes: (kind: indir, resByArg: (args: (1, 2), byArg: (kind: indir, byte: 2, bit: 3), args: (3), byArg: (kind: uniformRetVal, info: 1), args: (4), byArg: (kind: uniqueRetVal, info: 1), args: (5), byArg: (kind: virtualConstProp)))))))
+^28 = typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: allOnes, sizeM1BitWidth: 7), wpdResolutions: ((offset: 0, wpdRes: (kind: branchFunnel)), (offset: 8, wpdRes: (kind: singleImpl, singleImplName: "_ZN1A1nEi")), (offset: 16, wpdRes: (kind: indir, resByArg: (args: (1, 2), byArg: (kind: indir, byte: 2, bit: 3), args: (3), byArg: (kind: uniformRetVal, info: 1), args: (4), byArg: (kind: uniqueRetVal, info: 1), args: (5), byArg: (kind: virtualConstProp)))))))
 ; Test the other kinds of type test resoultions
-^28 = typeid: (name: "_ZTS1D", summary: (typeTestRes: (kind: byteArray, sizeM1BitWidth: 0)))
-^29 = typeid: (name: "_ZTS1E", summary: (typeTestRes: (kind: unsat, sizeM1BitWidth: 0)))
-^30 = flags: 8
-^31 = blockcount: 1888
+^29 = typeid: (name: "_ZTS1D", summary: (typeTestRes: (kind: byteArray, sizeM1BitWidth: 0)))
+^30 = typeid: (name: "_ZTS1E", summary: (typeTestRes: (kind: unsat, sizeM1BitWidth: 0)))
+^31 = flags: 8
+^32 = blockcount: 1888
 
 ; Make sure we get back from llvm-dis essentially what we put in via llvm-as.
 ; CHECK: ^0 = module: (path: "thinlto-summary1.o", hash: (1369602428, 2747878711, 259090915, 2507395659, 1141468049))
@@ -91,19 +95,20 @@
 ; CHECK: ^16 = gv: (guid: 15, summaries: (function: (module: ^1, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 1, funcFlags: (readNone: 1, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 0, alwaysInline: 1, noUnwind: 1, mayThrow: 1, hasUnknownCall: 1, mustBeUnreachable: 0))))
 ; CHECK: ^17 = gv: (guid: 16, summaries: (function: (module: ^1, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 1, funcFlags: (readNone: 0, readOnly: 1, noRecurse: 0, returnDoesNotAlias: 1, noInline: 0, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 1), calls: ((callee: ^15)))))
 ; CHECK: ^18 = gv: (guid: 17, summaries: (alias: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0, importType: definition), aliasee: ^14)))
-; CHECK: ^19 = gv: (guid: 18, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 4, typeIdInfo: (typeTests: (^25, ^27)))))
-; CHECK: ^20 = gv: (guid: 19, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 8, typeIdInfo: (typeTestAssumeVCalls: (vFuncId: (^28, offset: 16))))))
-; CHECK: ^21 = gv: (guid: 20, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 5, typeIdInfo: (typeCheckedLoadVCalls: (vFuncId: (^26, offset: 16))))))
-; CHECK: ^22 = gv: (guid: 21, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 15, typeIdInfo: (typeTestAssumeConstVCalls: ((vFuncId: (^28, offset: 16), args: (42)), (vFuncId: (^28, offset: 24)))))))
-; CHECK: ^23 = gv: (guid: 22, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 5, typeIdInfo: (typeCheckedLoadConstVCalls: ((vFuncId: (^29, offset: 16), args: (42)))))))
+; CHECK: ^19 = gv: (guid: 18, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 4, typeIdInfo: (typeTests: (^26, ^28)))))
+; CHECK: ^20 = gv: (guid: 19, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 8, typeIdInfo: (typeTestAssumeVCalls: (vFuncId: (^29, offset: 16))))))
+; CHECK: ^21 = gv: (guid: 20, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 5, typeIdInfo: (typeCheckedLoadVCalls: (vFuncId: (^27, offset: 16))))))
+; CHECK: ^22 = gv: (guid: 21, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 15, typeIdInfo: (typeTestAssumeConstVCalls: ((vFuncId: (^29, offset: 16), args: (42)), (vFuncId: (^29, offset: 24)))))))
+; CHECK: ^23 = gv: (guid: 22, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: definition), insts: 5, typeIdInfo: (typeCheckedLoadConstVCalls: ((vFuncId: (^30, offset: 16), args: (42)))))))
 ; CHECK: ^24 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0, importType: declaration), insts: 5)))
-; CHECK: ^25 = typeid: (name: "_ZTS1C", summary: (typeTestRes: (kind: single, sizeM1BitWidth: 0))) ; guid = 1884921850105019584
-; CHECK: ^26 = typeid: (name: "_ZTS1B", summary: (typeTestRes: (kind: inline, sizeM1BitWidth: 0, alignLog2: 1, sizeM1: 2, bitMask: 3, inlineBits: 4))) ; guid = 6203814149063363976
-; CHECK: ^27 = typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: allOnes, sizeM1BitWidth: 7), wpdResolutions: ((offset: 0, wpdRes: (kind: branchFunnel)), (offset: 8, wpdRes: (kind: singleImpl, singleImplName: "_ZN1A1nEi")), (offset: 16, wpdRes: (kind: indir, resByArg: (args: (1, 2), byArg: (kind: indir, byte: 2, bit: 3), args: (3), byArg: (kind: uniformRetVal, info: 1), args: (4), byArg: (kind: uniqueRetVal, info: 1), args: (5), byArg: (kind: virtualConstProp))))))) ; guid = 7004155349499253778
-; CHECK: ^28 = typeid: (name: "_ZTS1D", summary: (typeTestRes: (kind: byteArray, sizeM1BitWidth: 0))) ; guid = 9614786172484273522
-; CHECK: ^29 = typeid: (name: "_ZTS1E", summary: (typeTestRes: (kind: unsat, sizeM1BitWidth: 0))) ; guid = 17437243864166745132
-; CHECK: ^30 = flags: 8
-; CHECK: ^31 = blockcount: 1888
+; CHECK: ^25 = gv: (guid: 9123456789101112131, summaries: (function: (module: ^0, flags: (linkage: internal, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0, importType: definition), insts: 1)))
+; CHECK: ^26 = typeid: (name: "_ZTS1C", summary: (typeTestRes: (kind: single, sizeM1BitWidth: 0))) ; guid = 1884921850105019584
+; CHECK: ^27 = typeid: (name: "_ZTS1B", summary: (typeTestRes: (kind: inline, sizeM1BitWidth: 0, alignLog2: 1, sizeM1: 2, bitMask: 3, inlineBits: 4))) ; guid = 6203814149063363976
+; CHECK: ^28 = typeid: (name: "_ZTS1A", summary: (typeTestRes: (kind: allOnes, sizeM1BitWidth: 7), wpdResolutions: ((offset: 0, wpdRes: (kind: branchFunnel)), (offset: 8, wpdRes: (kind: singleImpl, singleImplName: "_ZN1A1nEi")), (offset: 16, wpdRes: (kind: indir, resByArg: (args: (1, 2), byArg: (kind: indir, byte: 2, bit: 3), args: (3), byArg: (kind: uniformRetVal, info: 1), args: (4), byArg: (kind: uniqueRetVal, info: 1), args: (5), byArg: (kind: virtualConstProp))))))) ; guid = 7004155349499253778
+; CHECK: ^29 = typeid: (name: "_ZTS1D", summary: (typeTestRes: (kind: byteArray, sizeM1BitWidth: 0))) ; guid = 9614786172484273522
+; CHECK: ^30 = typeid: (name: "_ZTS1E", summary: (typeTestRes: (kind: unsat, sizeM1BitWidth: 0))) ; guid = 17437243864166745132
+; CHECK: ^31 = flags: 8
+; CHECK: ^32 = blockcount: 1888
 
 ; Make sure parsing of a non-summary entry containing a ":" does not fail
 ; after summary parsing, which handles colons differently.
diff --git a/llvm/test/Bitcode/summary_version.ll b/llvm/test/Bitcode/summary_version.ll
index 98feab6fe2f9..26c64f81a773 100644
--- a/llvm/test/Bitcode/summary_version.ll
+++ b/llvm/test/Bitcode/summary_version.ll
@@ -2,7 +2,7 @@
 ; RUN: opt  -module-summary  %s -o - | llvm-bcanalyzer -dump | FileCheck %s
 
 ; CHECK: <GLOBALVAL_SUMMARY_BLOCK
-; CHECK: <VERSION op0=9/>
+; CHECK: <VERSION op0=10/>
 
 
 
diff --git a/llvm/test/Bitcode/thinlto-alias.ll b/llvm/test/Bitcode/thinlto-alias.ll
index 5dfff0f79619..7deb2d8259e3 100644
--- a/llvm/test/Bitcode/thinlto-alias.ll
+++ b/llvm/test/Bitcode/thinlto-alias.ll
@@ -31,9 +31,9 @@
 ; COMBINED-NEXT:    <VERSION
 ; COMBINED-NEXT:    <FLAGS
 ; See if the call to analias is registered, using the expected value id.
-; COMBINED-NEXT:    <VALUE_GUID op0=[[ALIASID:[0-9]+]] op1=-5751648690987223394/>
+; COMBINED-NEXT:    <VALUE_GUID {{.*}} op0=[[ALIASID:[0-9]+]] op1=2955807229 op2=886945438/>
 ; COMBINED-NEXT:    <VALUE_GUID
-; COMBINED-NEXT:    <VALUE_GUID op0=[[ALIASEEID:[0-9]+]] op1=-1039159065113703048/>
+; COMBINED-NEXT:    <VALUE_GUID {{.*}} op0=[[ALIASEEID:[0-9]+]] op1=4053019222 op2=46484856/>
 ; COMBINED-NEXT:    <COMBINED_PROFILE {{.*}} op9=[[ALIASID]]
 ; COMBINED-NEXT:    <COMBINED_PROFILE {{.*}}
 ; COMBINED-NEXT:    <COMBINED_ALIAS  {{.*}} op3=[[ALIASEEID]]
diff --git a/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll b/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll
index 19e228fd5355..d864cadcff35 100644
--- a/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll
+++ b/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll
@@ -11,21 +11,21 @@
 ; RUN: llvm-dis -o - %t.o | llvm-as -o - | llvm-dis -o - | FileCheck %s --check-prefix=DIS
 
 ; CHECK: <GLOBALVAL_SUMMARY_BLOCK
-; CHECK-NEXT:   <VERSION op0=9/>
+; CHECK-NEXT:   <VERSION op0=10/>
 ; CHECK-NEXT:   <FLAGS op0=0/>
 ; The `VALUE_GUID` below represents the "_ZTV4Base" referenced by the instruction
 ; that loads vtable pointers.
-; CHECK-NEXT: <VALUE_GUID op0=21 op1=1960855528937986108/>
+; CHECK-NEXT: <VALUE_GUID {{.*}} op0=21 op1=456547254 op2=3929380924/>
 ; The `VALUE_GUID` below represents the "_ZN4Base4funcEv" referenced by the
 ; indirect call instruction.
-; CHECK-NEXT:      <VALUE_GUID op0=20 op1=5459407273543877811/>
+; CHECK-NEXT:      <VALUE_GUID {{.*}} op0=20 op1=1271117309 op2=2009351347/>
 ; NOTE vtables and functions from Derived class is dropped because
 ; `-icp-max-num-vtables` and `-icp-max-prom` are both set to one.
 ; <PERMODULE_PROFILE> has the format [valueid, flags, instcount, funcflags,
 ;                                     numrefs, rorefcnt, worefcnt,
 ;                                     m x valueid,
 ;                                     n x (valueid, hotness+tailcall)]
-; CHECK-NEXT:   <PERMODULE_PROFILE abbrevid=4 op0=0 op1=0 op2=4 op3=256 op4=1 op5=1 op6=0 op7=21 op8=20 op9=3/>
+; CHECK-NEXT:   <PERMODULE_PROFILE {{.*}} op0=0 op1=0 op2=4 op3=256 op4=1 op5=1 op6=0 op7=21 op8=20 op9=3/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph-partial-sample-profile-summary.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph-partial-sample-profile-summary.ll
index d44ee24694be..0c3ab9b20893 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph-partial-sample-profile-summary.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph-partial-sample-profile-summary.ll
@@ -30,7 +30,7 @@
 ; CHECK-LABEL:       <GLOBALVAL_SUMMARY_BLOCK
 ; CHECK-NEXT:    <VERSION
 ; CHECK-NEXT:    <FLAGS
-; CHECK-NEXT:    <VALUE_GUID op0=27 op1=123/>
+; CHECK-NEXT:    <VALUE_GUID {{.*}} op0=27 op1=0 op2=123/>
 ; op4=none1 op6=hot1 op8=cold1 op10=none2 op12=hot2 op14=cold2 op16=none3 op18=hot3 op20=cold3 op22=123
 ; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op7=7 op8=0 op9=1 op10=3 op11=4 op12=1 op13=8 op14=0 op15=2 op16=3 op17=5 op18=1 op19=9 op20=0 op21=3 op22=3 op23=6 op24=1 op25=27 op26=4/>
 ; CHECK-NEXT:    <BLOCK_COUNT op0=4/>
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll
index 2bbab0c6bb0d..ed3c716288d6 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph-pgo.ll
@@ -26,7 +26,7 @@
 ; COMBINED:       <GLOBALVAL_SUMMARY_BLOCK
 ; COMBINED-NEXT:    <VERSION
 ; COMBINED-NEXT:    <FLAGS
-; COMBINED-NEXT:    <VALUE_GUID op0=[[FUNCID:[0-9]+]] op1=7289175272376759421/>
+; COMBINED-NEXT:    <VALUE_GUID {{.*}} op0=[[FUNCID:[0-9]+]] op1=1697143370 op2=1603531901/>
 ; COMBINED-NEXT:    <VALUE_GUID
 ; COMBINED-NEXT:    <COMBINED
 ; See if the call to func is registered, using the expected hotness type.
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
index 563fb18107d3..576261e5392b 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
@@ -47,7 +47,7 @@
 ; CHECK-LABEL:       <GLOBALVAL_SUMMARY_BLOCK
 ; CHECK-NEXT:    <VERSION
 ; CHECK-NEXT:    <FLAGS
-; CHECK-NEXT:    <VALUE_GUID op0=25 op1=123/>
+; CHECK-NEXT:    <VALUE_GUID {{.*}} op0=25 op1=0 op2=123/>
 ; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123
 ; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op7=1 op8=3 op9=5 op10=1 op11=2 op12=3 op13=4 op14=1 op15=6 op16=2 op17=3 op18=3 op19=7 op20=2 op21=8 op22=2 op23=25 op24=4/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
index 601bebd39267..2e9b362d39bb 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
@@ -30,7 +30,7 @@
 ; CHECK-LABEL:       <GLOBALVAL_SUMMARY_BLOCK
 ; CHECK-NEXT:    <VERSION
 ; CHECK-NEXT:    <FLAGS
-; CHECK-NEXT:    <VALUE_GUID op0=26 op1=123/>
+; CHECK-NEXT:    <VALUE_GUID {{.*}} op0=26 op1=0 op2=123/>
 ; op4=none1 op6=hot1 op8=cold1 op10=none2 op12=hot2 op14=cold2 op16=none3 op18=hot3 op20=cold3 op22=123
 ; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op7=7 op8=0 op9=1 op10=3 op11=4 op12=1 op13=8 op14=0 op15=2 op16=3 op17=5 op18=1 op19=9 op20=0 op21=3 op22=3 op23=6 op24=1 op25=26 op26=4/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
diff --git a/llvm/test/Bitcode/thinlto-function-summary-callgraph.ll b/llvm/test/Bitcode/thinlto-function-summary-callgraph.ll
index 542b400f8e33..becbc4a32dd9 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-callgraph.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-callgraph.ll
@@ -30,7 +30,7 @@
 ; COMBINED-NEXT:    <FLAGS
 ; Only 2 VALUE_GUID since reference to undefinedglob should not be included in
 ; combined index.
-; COMBINED-NEXT:    <VALUE_GUID op0=[[FUNCID:[0-9]+]] op1=7289175272376759421/>
+; COMBINED-NEXT:    <VALUE_GUID {{.*}} op0=[[FUNCID:[0-9]+]] op1=1697143370 op2=1603531901/>
 ; COMBINED-NEXT:    <VALUE_GUID
 ; COMBINED-NEXT:    <COMBINED_PROFILE
 ; See if the call to func is registered.
diff --git a/llvm/test/Bitcode/thinlto-function-summary-originalnames.ll b/llvm/test/Bitcode/thinlto-function-summary-originalnames.ll
index 0139f00b4aa3..306eed8ec9ae 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-originalnames.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-originalnames.ll
@@ -6,9 +6,9 @@
 ; COMBINED:       <GLOBALVAL_SUMMARY_BLOCK
 ; COMBINED-NEXT:    <VERSION
 ; COMBINED-NEXT:    <FLAGS
-; COMBINED-NEXT:    <VALUE_GUID {{.*}} op1=686735765308251824/>
-; COMBINED-NEXT:    <VALUE_GUID {{.*}} op1=4507502870619175775/>
-; COMBINED-NEXT:    <VALUE_GUID {{.*}} op1=-8118561185538785069/>
+; COMBINED-NEXT:    <VALUE_GUID {{.*}} op1=159893130 op2=1103175344/>
+; COMBINED-NEXT:    <VALUE_GUID {{.*}} op1=1049484794 op2=2739878751/>
+; COMBINED-NEXT:    <VALUE_GUID {{.*}} op1=2404717469 op2=2695872723/>
 ; COMBINED-DAG:    <COMBINED_PROFILE{{ }}
 ; COMBINED-DAG:    <COMBINED_ORIGINAL_NAME op0=-2012135647395072713/>
 ; COMBINED-DAG:    <COMBINED_GLOBALVAR_INIT_REFS
diff --git a/llvm/test/Bitcode/thinlto-function-summary-paramaccess.ll b/llvm/test/Bitcode/thinlto-function-summary-paramaccess.ll
index f8e6476ea3e8..9efe794580e1 100644
--- a/llvm/test/Bitcode/thinlto-function-summary-paramaccess.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary-paramaccess.ll
@@ -286,55 +286,55 @@ entry:
 
 
 ; COMBINED: <FLAGS op0=0/>
-; COMBINED-NEXT: <VALUE_GUID op0=[[CALLEE1:1]] op1=72710208629861106/>
-; COMBINED-NEXT: <VALUE_GUID op0=[[CALLEE2:2]] op1=900789920918863816/>
-; COMBINED-NEXT: <VALUE_GUID op0=3 op1=1075564720951610524/>
-; COMBINED-NEXT: <VALUE_GUID op0=4 op1=1417835201204712148/>
-; COMBINED-NEXT: <VALUE_GUID op0=5 op1=2949024673554120799/>
-; COMBINED-NEXT: <VALUE_GUID op0=6 op1=4179978066780831873/>
-; COMBINED-NEXT: <VALUE_GUID op0=7 op1=5540766144860458461/>
-; COMBINED-NEXT: <VALUE_GUID op0=8 op1=6187077497926519485/>
-; COMBINED-NEXT: <VALUE_GUID op0=9 op1=6707380319572075172/>
-; COMBINED-NEXT: <VALUE_GUID op0=10 op1=8411925997558855107/>
-; COMBINED-NEXT: <VALUE_GUID op0=11 op1=-8159310605091129913/>
-; COMBINED-NEXT: <VALUE_GUID op0=12 op1=-6599332516747241070/>
-; COMBINED-NEXT: <VALUE_GUID op0=13 op1=-5282029362632487219/>
-; COMBINED-NEXT: <VALUE_GUID op0=14 op1=-3184895716019949174/>
-; COMBINED-NEXT: <VALUE_GUID op0=15 op1=-2750063944951688315/>
-; COMBINED-NEXT: <VALUE_GUID op0=16 op1=-2287148700827644426/>
-; COMBINED-NEXT: <VALUE_GUID op0=17 op1=-1913852605147216470/>
-; COMBINED-NEXT: <VALUE_GUID op0=18 op1=-1792695732907084926/>
-; COMBINED-NEXT: <VALUE_GUID op0=19 op1=-1296325529848142540/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=[[CALLEE1:1]] op1=16929164 op2=2901240562/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=[[CALLEE2:2]] op1=209731497 op2=362741704/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=3 op1=250424426 op2=1162038428/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=4 op1=330115482 op2=2111435476/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=5 op1=686623312 op2=3842916447/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=6 op1=973226983 op2=3211083905/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=7 op1=1290060147 op2=3622505949/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=8 op1=1440541236 op2=767101629/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=9 op1=1561683677 op2=160047780/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=10 op1=1958554144 op2=1633580483/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=11 op1=2395229756 op2=192361927/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=12 op1=2758440458 op2=1889048978/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=13 op1=3065148999 op2=3004927693/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=14 op1=3553425976 op2=2012721546/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=15 op1=3654668137 op2=2609615749/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=16 op1=3762448991 op2=3668708854/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=17 op1=3849363762 op2=364807594/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=18 op1=3877572794 op2=2713121666/>
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0=19 op1=3993142988 op2=2149688628/>
 ; COMBINED-NEXT: <PARAM_ACCESS op0=1 op1=0 op2=0 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=1
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=1
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=2
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=2
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=1 op4=0 op5=[[CALLEE2]] op6=4 op7=6/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=3
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=3
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=24 op2=32 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=4
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=4
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=2 op3=0 op4=2 op5=0 op6=8 op7=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=5
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=6
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=5
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=6
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=2 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=7
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=7
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=2 op3=0 op4=1 op5=0 op6=8 op7=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=8
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=9
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=8
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=9
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=1 op4=0 op5=[[CALLEE2]] op6=0 op7=2/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=10
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=11
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=10
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=11
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=113 op2=97 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=12
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=13
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=12
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=13
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=14
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=15
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=14
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=15
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=1 op2=-2 op3=0/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=16
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=16
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=1 op4=0 op5=[[CALLEE2]] op6=1431 op7=1429/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=17
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=17
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=2 op4=0 op5=[[CALLEE2]] op6=1431 op7=250 op8=1 op9=[[CALLEE1]] op10=67 op11=65/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=18
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=18
 ; COMBINED-NEXT: <PARAM_ACCESS op0=0 op1=0 op2=0 op3=1 op4=0 op5=[[CALLEE2]] op6=1431 op7=250/>
-; COMBINED-NEXT: <COMBINED_PROFILE abbrevid=4 op0=19
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op0=19
diff --git a/llvm/test/Bitcode/upgrade-vector-interleave2-deinterleave2-intrinsics.ll b/llvm/test/Bitcode/upgrade-vector-interleave2-deinterleave2-intrinsics.ll
new file mode 100644
index 000000000000..f06395945297
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-vector-interleave2-deinterleave2-intrinsics.ll
@@ -0,0 +1,46 @@
+; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s
+
+define <8 x i32> @interleave_fixed(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @interleave_fixed
+; CHECK: %res = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+
+  %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+  ret <8 x i32> %res
+}
+
+define { <4 x i32>, <4 x i32> } @deinterleave_fixed(<8 x i32> %a) {
+; CHECK-LABEL: @deinterleave_fixed
+; CHECK: %res = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %a)
+
+  %res = call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %a)
+  ret { <4 x i32>, <4 x i32> } %res
+}
+
+define <vscale x 8 x i32> @interleave_scalable(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: @interleave_scalable
+; CHECK: %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+
+  %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  ret <vscale x 8 x i32> %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_scalable(<vscale x 8 x i32> %a) {
+; CHECK-LABEL: @deinterleave_scalable
+; CHECK: %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+
+  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+; CHECK: <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+
+declare  { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
+; CHECK: declare  { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+
+declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+; CHECK: <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+
+declare  { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+; CHECK: declare  { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
diff --git a/llvm/test/Bitcode/upgrade-vector-reverse-intrinsic.ll b/llvm/test/Bitcode/upgrade-vector-reverse-intrinsic.ll
new file mode 100644
index 000000000000..6b853eaf4175
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-vector-reverse-intrinsic.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s
+
+define <16 x i8> @reverse_fixed(<16 x i8> %a) {
+; CHECK-LABEL: @reverse_fixed
+; CHECK: %res = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> %a)
+
+  %res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
+  ret <16 x i8> %res
+}
+
+define <vscale x 16 x i8> @reverse_scalable(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: @reverse_scalable
+; CHECK: %res = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
+
+  %res = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
+  ret <vscale x 16 x i8> %res
+}
+
+declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
+; CHECK: declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+
+declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+; CHECK: declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
diff --git a/llvm/test/Bitcode/upgrade-vector-splice-intrinsic.ll b/llvm/test/Bitcode/upgrade-vector-splice-intrinsic.ll
new file mode 100644
index 000000000000..1b55da21ecd2
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-vector-splice-intrinsic.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s
+
+define <8 x half> @splice_fixed(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: @splice_fixed
+; CHECK: %res = call <8 x half> @llvm.vector.splice.v8f16(<8 x half> %a, <8 x half> %b, i32 2)
+
+  %res = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> %a, <8 x half> %b, i32 2)
+  ret <8 x half> %res
+}
+
+define <vscale x 8 x half> @splice_scalable(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: @splice_scalable
+; CHECK: %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 2)
+
+  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 2)
+  ret <vscale x 8 x half> %res
+}
+
+declare <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half>, <8 x half>, i32 immarg)
+; CHECK: declare <8 x half> @llvm.vector.splice.v8f16(<8 x half>, <8 x half>, i32 immarg)
+
+declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32 immarg)
+; CHECK: declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32 immarg)
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 6127b76db06b..eb4013511416 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -25,6 +25,7 @@ llvm_canonicalize_cmake_booleans(
   LLVM_INCLUDE_DXIL_TESTS
   LLVM_TOOL_LLVM_DRIVER_BUILD
   LLVM_INCLUDE_SPIRV_TOOLS_TESTS
+  LLVM_APPEND_VC_REV
   )
 
 configure_lit_site_cfg(
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir
index 06fb2ce161c2..0c67a867580c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir
@@ -201,3 +201,113 @@ body:             |
     RET_ReallyLR
 
 ...
+---
+name:            test_idx_undef
+body:             |
+  bb.1:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_idx_undef
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: RET_ReallyLR
+    %3:_(s8) = G_CONSTANT i8 127
+    %2:_(<32 x s8>) = G_BUILD_VECTOR %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8)
+    %4:_(s8) = G_CONSTANT i8 -128
+    %5:_(s64) = G_IMPLICIT_DEF
+    %0:_(p0) = COPY $x0
+    %1:_(<32 x s8>) = G_INSERT_VECTOR_ELT %2, %4(s8), %5(s64)
+    G_STORE %1(<32 x s8>), %0(p0) :: (store (<32 x s8>))
+    RET_ReallyLR
+
+...
+---
+name:            test_elt_undef
+body:             |
+  bb.1:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_elt_undef
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 127
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C]](s8), [[C]](s8), [[DEF]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8)
+    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<32 x s8>), [[COPY]](p0) :: (store (<32 x s8>))
+    ; CHECK-NEXT: RET_ReallyLR
+    %3:_(s8) = G_CONSTANT i8 127
+    %2:_(<32 x s8>) = G_BUILD_VECTOR %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8)
+    %4:_(s8) = G_IMPLICIT_DEF
+    %5:_(s64) = G_CONSTANT i64 3
+    %0:_(p0) = COPY $x0
+    %1:_(<32 x s8>) = G_INSERT_VECTOR_ELT %2, %4(s8), %5(s64)
+    G_STORE %1(<32 x s8>), %0(p0) :: (store (<32 x s8>))
+    RET_ReallyLR
+
+...
+---
+name:            test_elt_undef_with_freeze
+body:             |
+  bb.1:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_elt_undef_with_freeze
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 127
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8)
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<32 x s8>) = G_FREEZE [[BUILD_VECTOR]]
+    ; CHECK-NEXT: G_STORE [[FREEZE]](<32 x s8>), [[COPY]](p0) :: (store (<32 x s8>))
+    ; CHECK-NEXT: RET_ReallyLR
+    %3:_(s8) = G_CONSTANT i8 127
+    %2:_(<32 x s8>) = G_BUILD_VECTOR %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8)
+    %4:_(s8) = G_IMPLICIT_DEF
+    %5:_(s64) = G_CONSTANT i64 3
+    %0:_(p0) = COPY $x0
+    %9:_(<32 x s8>) = G_FREEZE %2
+    %1:_(<32 x s8>) = G_INSERT_VECTOR_ELT %9, %4(s8), %5(s64)
+    G_STORE %1(<32 x s8>), %0(p0) :: (store (<32 x s8>))
+    RET_ReallyLR
+
+...
+---
+name:            test_insert_extract
+body:             |
+  bb.1:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_insert_extract
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 127
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8)
+    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<32 x s8>), [[COPY]](p0) :: (store (<32 x s8>))
+    ; CHECK-NEXT: RET_ReallyLR
+    %3:_(s8) = G_CONSTANT i8 127
+    %2:_(<32 x s8>) = G_BUILD_VECTOR %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8)
+    %5:_(s64) = G_CONSTANT i64 3
+    %4:_(s8) = G_EXTRACT_VECTOR_ELT %2, %5
+    %0:_(p0) = COPY $x0
+    %1:_(<32 x s8>) = G_INSERT_VECTOR_ELT %2, %4(s8), %5(s64)
+    G_STORE %1(<32 x s8>), %0(p0) :: (store (<32 x s8>))
+    RET_ReallyLR
+
+...
+---
+name:            test_idx_oob
+body:             |
+  bb.1:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_idx_oob
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: RET_ReallyLR
+    %3:_(s8) = G_CONSTANT i8 127
+    %2:_(<32 x s8>) = G_BUILD_VECTOR %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8)
+    %4:_(s8) = G_CONSTANT i8 -128
+    %5:_(s64) = G_CONSTANT i64 1024
+    %0:_(p0) = COPY $x0
+    %1:_(<32 x s8>) = G_INSERT_VECTOR_ELT %2, %4(s8), %5(s64)
+    G_STORE %1(<32 x s8>), %0(p0) :: (store (<32 x s8>))
+    RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
index 9a525151ca32..c97a00ccdd45 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -243,3 +243,29 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
   %1 = udiv <8 x i16> %a0, <i16 1, i16 119, i16 73, i16 -111, i16 -3, i16 118, i16 32, i16 31>
   ret <8 x i16> %1
 }
+
+define i32 @udiv_div_by_180(i32 %x)
+; SDAG-LABEL: udiv_div_by_180:
+; SDAG:       // %bb.0:
+; SDAG-NEXT:    mov w8, #5826 // =0x16c2
+; SDAG-NEXT:    and w9, w0, #0xff
+; SDAG-NEXT:    movk w8, #364, lsl #16
+; SDAG-NEXT:    umull x8, w9, w8
+; SDAG-NEXT:    lsr x0, x8, #32
+; SDAG-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: udiv_div_by_180:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    uxtb w8, w0
+; GISEL-NEXT:    mov w9, #5826 // =0x16c2
+; GISEL-NEXT:    movk w9, #364, lsl #16
+; GISEL-NEXT:    umull x8, w8, w9
+; GISEL-NEXT:    lsr x0, x8, #32
+; GISEL-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; GISEL-NEXT:    ret
+{
+  %truncate = and i32 %x, 255
+  %udiv = udiv i32 %truncate, 180
+  ret i32 %udiv
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll
index 10882a06af1b..0b7fae47a65a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll
@@ -11,7 +11,7 @@ define void @vector_deinterleave2_v4i32(<4 x i32> %a) {
   ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[DEF]], shufflemask(0, 2)
   ; CHECK-NEXT:   [[SHUF1:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[DEF]], shufflemask(1, 3)
   ; CHECK-NEXT:   RET_ReallyLR
-    %res = call {<2 x i32>, <2 x i32>} @llvm.experimental.vector.deinterleave2.v4i32(<4 x i32> %a)
+    %res = call {<2 x i32>, <2 x i32>} @llvm.vector.deinterleave2.v4i32(<4 x i32> %a)
     ret void
 }
 
@@ -29,6 +29,6 @@ define void @vector_deinterleave2_v8f32(<8 x float> %a) {
   ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<8 x s32>), [[DEF]], shufflemask(0, 2, 4, 6)
   ; CHECK-NEXT:   [[SHUF1:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<8 x s32>), [[DEF]], shufflemask(1, 3, 5, 7)
   ; CHECK-NEXT:   RET_ReallyLR
-    %res = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %a)
+    %res = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %a)
     ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll
index f51e47a428d1..0d8ac82c1051 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll
@@ -10,7 +10,7 @@ define void @vector_interleave2_v4i32(<2 x i32> %a, <2 x i32> %b) {
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1
   ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s32>), [[COPY1]], shufflemask(0, 2, 1, 3)
   ; CHECK-NEXT:   RET_ReallyLR
-    %res = call <4 x i32> @llvm.experimental.vector.interleave2.v4i32(<2 x i32> %a, <2 x i32> %b)
+    %res = call <4 x i32> @llvm.vector.interleave2.v4i32(<2 x i32> %a, <2 x i32> %b)
     ret void
 }
 
@@ -25,6 +25,6 @@ define void @vector_interleave2_v8f32(<4 x float> %a, <4 x float> %b) {
   ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>)
   ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<8 x s32>) = G_SHUFFLE_VECTOR [[BITCAST]](<4 x s32>), [[BITCAST1]], shufflemask(0, 4, 1, 5, 2, 6, 3, 7)
   ; CHECK-NEXT:   RET_ReallyLR
-    %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
+    %res = call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
     ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
new file mode 100644
index 000000000000..e3a633c9e035
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
@@ -0,0 +1,50 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=1  %s -o - | FileCheck %s
+---
+name:            scalar_to_oversize_vector
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: scalar_to_oversize_vector
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   G_BR %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+  ; CHECK-NEXT:   [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+  ; CHECK-NEXT:   [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+  ; CHECK-NEXT:   [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+  ; CHECK-NEXT:   [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+  ; CHECK-NEXT:   [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+  ; CHECK-NEXT:   [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+  ; CHECK-NEXT:   [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+  ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32)
+  ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV4]](s32), [[UV5]](s32), [[UV6]](s32), [[UV7]](s32)
+  ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV8]](s32), [[UV9]](s32), [[UV10]](s32), [[UV11]](s32)
+  ; CHECK-NEXT:   [[BUILD_VECTOR3:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV12]](s32), [[UV13]](s32), [[UV14]](s32), [[UV15]](s32)
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR]](<4 x s32>), [[C1]](p0) :: (store (<4 x s32>), align 64)
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 16
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR1]](<4 x s32>), [[C2]](p0) :: (store (<4 x s32>) into unknown-address + 16)
+  ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(p0) = G_CONSTANT i64 32
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR2]](<4 x s32>), [[C3]](p0) :: (store (<4 x s32>) into unknown-address + 32, align 32)
+  ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(p0) = G_CONSTANT i64 48
+  ; CHECK-NEXT:   G_STORE [[BUILD_VECTOR3]](<4 x s32>), [[C4]](p0) :: (store (<4 x s32>) into unknown-address + 48)
+  ; CHECK-NEXT:   G_BR %bb.1
+  bb.1:
+    %0:_(s512) = G_CONSTANT i512 0
+    %2:_(p0) = G_CONSTANT i64 0
+    G_BR %bb.2
+
+  bb.2:
+    %4:_(s512) = G_CONSTANT i512 0
+    %1:_(<16 x s32>) = G_BITCAST %4(s512)
+    %3:_(p0) = G_CONSTANT i64 0
+    G_STORE %1(<16 x s32>), %3(p0) :: (store (<16 x s32>))
+    G_BR %bb.2
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll
index 07744dada4f1..b1166e683ec7 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll
@@ -323,3 +323,22 @@ define i32 @test_alias_3xs16(ptr %ptr, ptr %ptr2, ptr %ptr3, ptr noalias %safe_p
   store i32 14, ptr %addr4
   ret i32 %safeld
 }
+
+@G = external global [10 x i32]
+
+define void @invalid_zero_offset_no_merge(i64 %0) {
+; CHECK-LABEL: invalid_zero_offset_no_merge:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:  Lloh0:
+; CHECK-NEXT:    adrp x8, _G@GOTPAGE
+; CHECK-NEXT:  Lloh1:
+; CHECK-NEXT:    ldr x8, [x8, _G@GOTPAGEOFF]
+; CHECK-NEXT:    str wzr, [x8, x0, lsl #2]
+; CHECK-NEXT:    str wzr, [x8, #4]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdrGot Lloh0, Lloh1
+  %2 = getelementptr [10 x i32], ptr @G, i64 0, i64 %0
+  store i32 0, ptr %2, align 4
+  store i32 0, ptr getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 1), align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.mir b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.mir
index b0fc9b650187..1de548da9cbd 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.mir
@@ -162,6 +162,13 @@
     ret void
   }
 
+  @G = external global [10 x i32]
+  define void @invalid_zero_offset_no_merge(i64 %0) {
+    %2 = getelementptr [10 x i32], ptr @G, i64 0, i64 %0
+    store i32 0, ptr %2, align 4
+    store i32 0, ptr getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 1), align 4
+    ret void
+  }
 ...
 ---
 name:            test_simple_2xs8
@@ -582,13 +589,11 @@ liveins:
 frameInfo:
   maxAlignment:    1
 machineFunctionInfo: {}
+# The store to ptr2 prevents merging into a single store.
+# We can still merge the stores into addr1 and addr2.
 body:             |
   bb.1 (%ir-block.0):
     liveins: $x0, $x1
-
-    ; The store to ptr2 prevents merging into a single store.
-    ; We can still merge the stores into addr1 and addr2.
-
     ; CHECK-LABEL: name: test_alias_4xs16
     ; CHECK: liveins: $x0, $x1
     ; CHECK-NEXT: {{  $}}
@@ -639,10 +644,10 @@ liveins:
 frameInfo:
   maxAlignment:    1
 machineFunctionInfo: {}
+# Here store of 5 and 9 can be merged, others have aliasing barriers.
 body:             |
   bb.1 (%ir-block.0):
     liveins: $x0, $x1, $x2
-    ; Here store of 5 and 9 can be merged, others have aliasing barriers.
     ; CHECK-LABEL: name: test_alias2_4xs16
     ; CHECK: liveins: $x0, $x1, $x2
     ; CHECK-NEXT: {{  $}}
@@ -698,12 +703,11 @@ liveins:
 frameInfo:
   maxAlignment:    1
 machineFunctionInfo: {}
+# No merging can be done here.
 body:             |
   bb.1 (%ir-block.0):
     liveins: $x0, $x1, $x2, $x3
 
-    ; No merging can be done here.
-
     ; CHECK-LABEL: name: test_alias3_4xs16
     ; CHECK: liveins: $x0, $x1, $x2, $x3
     ; CHECK-NEXT: {{  $}}
@@ -767,12 +771,10 @@ stack:
   - { id: 0, name: a1, size: 24, alignment: 4 }
   - { id: 1, name: a2, size: 4, alignment: 4 }
 machineFunctionInfo: {}
+# Can merge because the load is from a different alloca and can't alias.
 body:             |
   bb.1 (%ir-block.0):
     liveins: $x0
-
-    ; Can merge because the load is from a different alloca and can't alias.
-
     ; CHECK-LABEL: name: test_alias_allocas_2xs32
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
@@ -826,3 +828,43 @@ body:             |
     RET_ReallyLR
 
 ...
+---
+name:            invalid_zero_offset_no_merge
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x0' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.1 (%ir-block.1):
+    liveins: $x0
+
+    ; CHECK-LABEL: name: invalid_zero_offset_no_merge
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64)
+    ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @G
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[GV]], [[SHL]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: G_STORE [[C1]](s32), [[PTR_ADD]](p0) :: (store (s32) into %ir.2)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw G_PTR_ADD [[GV]], [[C2]](s64)
+    ; CHECK-NEXT: G_STORE [[C1]](s32), [[PTR_ADD1]](p0) :: (store (s32) into `ptr getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 1)`)
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(s64) = COPY $x0
+    %9:_(s64) = G_CONSTANT i64 2
+    %3:_(s64) = G_SHL %0, %9(s64)
+    %1:_(p0) = G_GLOBAL_VALUE @G
+    %4:_(p0) = G_PTR_ADD %1, %3(s64)
+    %6:_(s32) = G_CONSTANT i32 0
+    G_STORE %6(s32), %4(p0) :: (store (s32) into %ir.2)
+    %8:_(s64) = G_CONSTANT i64 4
+    %7:_(p0) = nuw G_PTR_ADD %1, %8(s64)
+    G_STORE %6(s32), %7(p0) :: (store (s32) into `ptr getelementptr inbounds ([10 x i32], ptr @G, i64 0, i64 1)`)
+    RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index 43122c8c953f..bd5d076d1ba8 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -353,9 +353,9 @@ define <16 x i1> @lane_mask_v16i1_i32(i32 %index, i32 %TC) {
 define <8 x i1> @lane_mask_v8i1_i32(i32 %index, i32 %TC) {
 ; CHECK-LABEL: lane_mask_v8i1_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    whilelo p0.h, w0, w1
-; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    whilelo p0.b, w0, w1
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC)
   ret <8 x i1> %active.lane.mask
@@ -364,9 +364,9 @@ define <8 x i1> @lane_mask_v8i1_i32(i32 %index, i32 %TC) {
 define <4 x i1> @lane_mask_v4i1_i32(i32 %index, i32 %TC) {
 ; CHECK-LABEL: lane_mask_v4i1_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    whilelo p0.s, w0, w1
-; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    whilelo p0.h, w0, w1
+; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %TC)
   ret <4 x i1> %active.lane.mask
@@ -375,9 +375,9 @@ define <4 x i1> @lane_mask_v4i1_i32(i32 %index, i32 %TC) {
 define <2 x i1> @lane_mask_v2i1_i32(i32 %index, i32 %TC) {
 ; CHECK-LABEL: lane_mask_v2i1_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    whilelo p0.d, w0, w1
-; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    whilelo p0.s, w0, w1
+; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %TC)
   ret <2 x i1> %active.lane.mask
@@ -397,9 +397,9 @@ define <16 x i1> @lane_mask_v16i1_i64(i64 %index, i64 %TC) {
 define <8 x i1> @lane_mask_v8i1_i64(i64 %index, i64 %TC) {
 ; CHECK-LABEL: lane_mask_v8i1_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    whilelo p0.h, x0, x1
-; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    whilelo p0.b, x0, x1
+; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 %index, i64 %TC)
   ret <8 x i1> %active.lane.mask
@@ -408,9 +408,9 @@ define <8 x i1> @lane_mask_v8i1_i64(i64 %index, i64 %TC) {
 define <4 x i1> @lane_mask_v4i1_i64(i64 %index, i64 %TC) {
 ; CHECK-LABEL: lane_mask_v4i1_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    whilelo p0.s, x0, x1
-; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    whilelo p0.h, x0, x1
+; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %TC)
   ret <4 x i1> %active.lane.mask
@@ -419,9 +419,9 @@ define <4 x i1> @lane_mask_v4i1_i64(i64 %index, i64 %TC) {
 define <2 x i1> @lane_mask_v2i1_i64(i64 %index, i64 %TC) {
 ; CHECK-LABEL: lane_mask_v2i1_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    whilelo p0.d, x0, x1
-; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    whilelo p0.s, x0, x1
+; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 %index, i64 %TC)
   ret <2 x i1> %active.lane.mask
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
index bb9ba05f7a27..c00c9bfe127e 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
@@ -223,8 +223,8 @@ define i8 @matches_has_sret() nounwind {
 
 %TSRet = type { i64, i64 }
 define void @has_aligned_sret(ptr align 32 sret(%TSRet)) nounwind {
-; CHECK-LABEL:    .def    $ientry_thunk$cdecl$m16a32$v;
-; CHECK:          .section        .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$m16a32$v
+; CHECK-LABEL:    .def    $ientry_thunk$cdecl$m16$v;
+; CHECK:          .section        .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$m16$v
 ; CHECK:          // %bb.0:
 ; CHECK-NEXT:     stp     q6, q7, [sp, #-176]!            // 32-byte Folded Spill
 ; CHECK-NEXT:     .seh_save_any_reg_px    q6, 176
@@ -457,7 +457,7 @@ define %T2 @simple_struct(%T1 %0, %T2 %1, %T3, %T4) nounwind {
 ; CHECK-NEXT:     .symidx $ientry_thunk$cdecl$i8$v
 ; CHECK-NEXT:     .word   1
 ; CHECK-NEXT:     .symidx "#has_aligned_sret"
-; CHECK-NEXT:     .symidx $ientry_thunk$cdecl$m16a32$v
+; CHECK-NEXT:     .symidx $ientry_thunk$cdecl$m16$v
 ; CHECK-NEXT:     .word   1
 ; CHECK-NEXT:     .symidx "#small_array"
 ; CHECK-NEXT:     .symidx $ientry_thunk$cdecl$m2$m2F8
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
index 3b911e78aff2..7a40fcd85ac5 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
@@ -236,8 +236,8 @@ declare void @has_sret(ptr sret([100 x i8])) nounwind;
 
 %TSRet = type { i64, i64 }
 declare void @has_aligned_sret(ptr align 32 sret(%TSRet)) nounwind;
-; CHECK-LABEL:    .def    $iexit_thunk$cdecl$m16a32$v;
-; CHECK:          .section        .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m16a32$v
+; CHECK-LABEL:    .def    $iexit_thunk$cdecl$m16$v;
+; CHECK:          .section        .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m16$v
 ; CHECK:          // %bb.0:
 ; CHECK-NEXT:     sub     sp, sp, #48
 ; CHECK-NEXT:     .seh_stackalloc 48
@@ -271,8 +271,8 @@ declare void @has_aligned_sret(ptr align 32 sret(%TSRet)) nounwind;
 ; CHECK:          adrp    x11, has_aligned_sret
 ; CHECK:          add     x11, x11, :lo12:has_aligned_sret
 ; CHECK:          ldr     x9, [x9, :lo12:__os_arm64x_check_icall]
-; CHECK:          adrp    x10, ($iexit_thunk$cdecl$m16a32$v)
-; CHECK:          add     x10, x10, :lo12:($iexit_thunk$cdecl$m16a32$v)
+; CHECK:          adrp    x10, ($iexit_thunk$cdecl$m16$v)
+; CHECK:          add     x10, x10, :lo12:($iexit_thunk$cdecl$m16$v)
 ; CHECK:          blr     x9
 ; CHECK:          .seh_startepilogue
 ; CHECK:          ldr     x30, [sp], #16                  // 8-byte Folded Reload
@@ -492,7 +492,7 @@ declare %T2 @simple_struct(%T1, %T2, %T3, %T4) nounwind;
 ; CHECK-NEXT:     .symidx has_sret
 ; CHECK-NEXT:     .word   0
 ; CHECK-NEXT:     .symidx has_aligned_sret
-; CHECK-NEXT:     .symidx $iexit_thunk$cdecl$m16a32$v
+; CHECK-NEXT:     .symidx $iexit_thunk$cdecl$m16$v
 ; CHECK-NEXT:     .word   4
 ; CHECK-NEXT:     .symidx "#has_aligned_sret$exit_thunk"
 ; CHECK-NEXT:     .symidx has_aligned_sret
diff --git a/llvm/test/CodeGen/AArch64/combine-mul.ll b/llvm/test/CodeGen/AArch64/combine-mul.ll
index a2b042530809..c49e5ae6620a 100644
--- a/llvm/test/CodeGen/AArch64/combine-mul.ll
+++ b/llvm/test/CodeGen/AArch64/combine-mul.ll
@@ -44,8 +44,7 @@ define <4 x i1> @PR48683_vec_undef(<4 x i32> %x) {
 define i64 @combine_mul_self_demandedbits(i64 %x) {
 ; CHECK-LABEL: combine_mul_self_demandedbits:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul x8, x0, x0
-; CHECK-NEXT:    and x0, x8, #0xfffffffffffffffd
+; CHECK-NEXT:    mul x0, x0, x0
 ; CHECK-NEXT:    ret
   %1 = mul i64 %x, %x
   %2 = and i64 %1, -3
@@ -77,7 +76,7 @@ define i8 @one_demanded_bit(i8 %x) {
 define <2 x i64> @one_demanded_bit_splat(<2 x i64> %x) {
 ; CHECK-LABEL: one_demanded_bit_splat:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32
+; CHECK-NEXT:    mov w8, #32 // =0x20
 ; CHECK-NEXT:    shl v0.2d, v0.2d, #5
 ; CHECK-NEXT:    dup v1.2d, x8
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
@@ -131,7 +130,7 @@ define i32 @squared_demanded_2_low_bits(i32 %x) {
 define <2 x i64> @squared_demanded_2_low_bits_splat(<2 x i64> %x) {
 ; CHECK-LABEL: squared_demanded_2_low_bits_splat:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-2
+; CHECK-NEXT:    mov x8, #-2 // =0xfffffffffffffffe
 ; CHECK-NEXT:    dup v1.2d, x8
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
index 86b1d5d195ff..0485d530fd06 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
@@ -25,10 +25,10 @@ define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x d
 ; CHECK-NEXT:    zip2 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 0
   %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 1
   %4 = fmul contract <vscale x 2 x double> %0, %3
@@ -37,12 +37,12 @@ entry:
   %7 = fmul contract <vscale x 2 x double> %0, %2
   %8 = fmul contract <vscale x 2 x double> %1, %3
   %9 = fsub contract <vscale x 2 x double> %7, %8
-  %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
   %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 0
   %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 1
   %12 = fadd contract <vscale x 2 x double> %10, %9
   %13 = fadd contract <vscale x 2 x double> %6, %11
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
@@ -67,10 +67,10 @@ define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4
 ; CHECK-NEXT:    fadd z0.d, z25.d, z27.d
 ; CHECK-NEXT:    ret
 entry:
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
   %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
   %4 = fmul contract <vscale x 2 x double> %0, %3
@@ -79,10 +79,10 @@ entry:
   %7 = fmul contract <vscale x 2 x double> %0, %2
   %8 = fmul contract <vscale x 2 x double> %1, %3
   %9 = fsub contract <vscale x 2 x double> %7, %8
-  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
   %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
   %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
-  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
   %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
   %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
   %14 = fmul contract <vscale x 2 x double> %10, %13
@@ -93,7 +93,7 @@ entry:
   %19 = fsub contract <vscale x 2 x double> %17, %18
   %20 = fadd contract <vscale x 2 x double> %9, %19
   %21 = fadd contract <vscale x 2 x double> %6, %16
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
@@ -118,10 +118,10 @@ define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4
 ; CHECK-NEXT:    fsub z0.d, z25.d, z27.d
 ; CHECK-NEXT:    ret
 entry:
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
   %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
   %4 = fmul contract <vscale x 2 x double> %0, %3
@@ -130,10 +130,10 @@ entry:
   %7 = fmul contract <vscale x 2 x double> %0, %2
   %8 = fmul contract <vscale x 2 x double> %1, %3
   %9 = fsub contract <vscale x 2 x double> %7, %8
-  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
   %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
   %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
-  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
   %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
   %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
   %14 = fmul contract <vscale x 2 x double> %10, %13
@@ -144,7 +144,7 @@ entry:
   %19 = fsub contract <vscale x 2 x double> %17, %18
   %20 = fsub contract <vscale x 2 x double> %9, %19
   %21 = fsub contract <vscale x 2 x double> %6, %16
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
@@ -169,10 +169,10 @@ define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x
 ; CHECK-NEXT:    fadd z0.d, z25.d, z27.d
 ; CHECK-NEXT:    ret
 entry:
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 0
   %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 1
   %4 = fmul contract <vscale x 2 x double> %0, %3
@@ -181,10 +181,10 @@ entry:
   %7 = fmul contract <vscale x 2 x double> %0, %2
   %8 = fmul contract <vscale x 2 x double> %1, %3
   %9 = fsub contract <vscale x 2 x double> %7, %8
-  %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
   %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 0
   %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 1
-  %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
   %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 0
   %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 1
   %14 = fmul contract <vscale x 2 x double> %10, %13
@@ -195,7 +195,7 @@ entry:
   %19 = fadd contract <vscale x 2 x double> %17, %18
   %20 = fadd contract <vscale x 2 x double> %9, %19
   %21 = fadd contract <vscale x 2 x double> %6, %16
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
@@ -238,10 +238,10 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
 ; CHECK-NEXT:    zip2 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %strided.vec78 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %strided.vec78 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec78, 0
   %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec78, 1
   %4 = fmul contract <vscale x 2 x double> %0, %3
@@ -250,14 +250,14 @@ entry:
   %7 = fmul contract <vscale x 2 x double> %0, %2
   %8 = fmul contract <vscale x 2 x double> %1, %3
   %9 = fsub contract <vscale x 2 x double> %7, %8
-  %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
   %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 0
   %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 1
   %12 = tail call contract <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %11)
   %13 = fadd contract <vscale x 2 x double> %10, %12
   %14 = tail call contract <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %10)
   %15 = fsub contract <vscale x 2 x double> %14, %11
-  %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
   %16 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 0
   %17 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 1
   %18 = fmul contract <vscale x 2 x double> %15, %17
@@ -268,10 +268,10 @@ entry:
   %23 = fsub contract <vscale x 2 x double> %21, %22
   %24 = fadd contract <vscale x 2 x double> %9, %23
   %25 = fadd contract <vscale x 2 x double> %6, %20
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %24, <vscale x 2 x double> %25)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %24, <vscale x 2 x double> %25)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
 declare <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
index edf580e334e8..c643ae9265c0 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
@@ -16,24 +16,24 @@ define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x d
 ; CHECK-NEXT:    mov z1.d, z5.d
 ; CHECK-NEXT:    ret
 entry:
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 0
   %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 1
   %4 = fmul fast <vscale x 2 x double> %3, %0
   %5 = fmul fast <vscale x 2 x double> %2, %1
   %6 = fadd fast <vscale x 2 x double> %4, %5
   %7 = fmul fast <vscale x 2 x double> %2, %0
-  %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
   %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 0
   %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 1
   %10 = fadd fast <vscale x 2 x double> %8, %7
   %11 = fmul fast <vscale x 2 x double> %3, %1
   %12 = fsub fast <vscale x 2 x double> %10, %11
   %13 = fadd fast <vscale x 2 x double> %6, %9
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
@@ -56,20 +56,20 @@ define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4
 ; CHECK-NEXT:    mov z0.d, z25.d
 ; CHECK-NEXT:    ret
 entry:
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
   %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
   %4 = fmul fast <vscale x 2 x double> %3, %0
   %5 = fmul fast <vscale x 2 x double> %2, %1
   %6 = fmul fast <vscale x 2 x double> %2, %0
   %7 = fmul fast <vscale x 2 x double> %3, %1
-  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
   %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
   %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
-  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
   %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
   %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
   %12 = fmul fast <vscale x 2 x double> %11, %8
@@ -82,7 +82,7 @@ entry:
   %19 = fadd fast <vscale x 2 x double> %4, %5
   %20 = fadd fast <vscale x 2 x double> %19, %13
   %21 = fadd fast <vscale x 2 x double> %20, %12
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %18, <vscale x 2 x double> %21)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %18, <vscale x 2 x double> %21)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
@@ -105,20 +105,20 @@ define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4
 ; CHECK-NEXT:    mov z0.d, z25.d
 ; CHECK-NEXT:    ret
 entry:
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
   %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
   %4 = fmul fast <vscale x 2 x double> %3, %0
   %5 = fmul fast <vscale x 2 x double> %2, %1
   %6 = fmul fast <vscale x 2 x double> %2, %0
   %7 = fmul fast <vscale x 2 x double> %3, %1
-  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
   %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
   %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
-  %strided.vec58 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %strided.vec58 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
   %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec58, 0
   %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec58, 1
   %12 = fmul fast <vscale x 2 x double> %11, %9
@@ -131,7 +131,7 @@ entry:
   %19 = fadd fast <vscale x 2 x double> %18, %17
   %20 = fadd fast <vscale x 2 x double> %4, %5
   %21 = fsub fast <vscale x 2 x double> %20, %19
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %16, <vscale x 2 x double> %21)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %16, <vscale x 2 x double> %21)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
@@ -154,19 +154,19 @@ define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x
 ; CHECK-NEXT:    mov z0.d, z25.d
 ; CHECK-NEXT:    ret
 entry:
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 0
   %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 1
   %4 = fmul fast <vscale x 2 x double> %3, %0
   %5 = fmul fast <vscale x 2 x double> %2, %1
   %6 = fmul fast <vscale x 2 x double> %2, %0
-  %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
   %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 0
   %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 1
-  %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
   %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 0
   %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 1
   %11 = fmul fast <vscale x 2 x double> %10, %7
@@ -180,7 +180,7 @@ entry:
   %19 = fmul fast <vscale x 2 x double> %9, %8
   %20 = fsub fast <vscale x 2 x double> %18, %19
   %21 = fadd fast <vscale x 2 x double> %20, %11
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
@@ -211,20 +211,20 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
 ; CHECK-NEXT:    zip2 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 0
   %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 1
   %4 = fmul fast <vscale x 2 x double> %3, %0
   %5 = fmul fast <vscale x 2 x double> %2, %1
   %6 = fmul fast <vscale x 2 x double> %2, %0
   %7 = fmul fast <vscale x 2 x double> %3, %1
-  %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+  %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
   %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 0
   %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 1
-  %strided.vec84 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+  %strided.vec84 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
   %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec84, 0
   %11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec84, 1
   %12 = fmul fast <vscale x 2 x double> %10, %8
@@ -237,9 +237,9 @@ entry:
   %19 = fadd fast <vscale x 2 x double> %18, %12
   %20 = fmul fast <vscale x 2 x double> %11, %9
   %21 = fsub fast <vscale x 2 x double> %19, %20
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
index 48b5756b01fb..dae8d9f89e99 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
@@ -24,15 +24,15 @@ define <vscale x 4 x half> @complex_add_v4f16(<vscale x 4 x half> %a, <vscale x
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
+  %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
   %a.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
+  %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
   %b.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 1
   %0 = fsub fast <vscale x 2 x half> %b.real, %a.imag
   %1 = fadd fast <vscale x 2 x half> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1)
+  %interleaved.vec = tail call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1)
   ret <vscale x 4 x half> %interleaved.vec
 }
 
@@ -45,15 +45,15 @@ define <vscale x 8 x half> @complex_add_v8f16(<vscale x 8 x half> %a, <vscale x
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
   %a.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
   %b.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 1
   %0 = fsub fast <vscale x 4 x half> %b.real, %a.imag
   %1 = fadd fast <vscale x 4 x half> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1)
+  %interleaved.vec = tail call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1)
   ret <vscale x 8 x half> %interleaved.vec
 }
 
@@ -68,15 +68,15 @@ define <vscale x 16 x half> @complex_add_v16f16(<vscale x 16 x half> %a, <vscale
 ; CHECK-NEXT:    mov z1.d, z3.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
+  %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
   %a.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
+  %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
   %b.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 1
   %0 = fsub fast <vscale x 8 x half> %b.real, %a.imag
   %1 = fadd fast <vscale x 8 x half> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1)
+  %interleaved.vec = tail call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1)
   ret <vscale x 16 x half> %interleaved.vec
 }
 
@@ -95,26 +95,26 @@ define <vscale x 32 x half> @complex_add_v32f16(<vscale x 32 x half> %a, <vscale
 ; CHECK-NEXT:    mov z3.d, z7.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
+  %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
   %a.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
+  %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
   %b.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 1
   %0 = fsub fast <vscale x 16 x half> %b.real, %a.imag
   %1 = fadd fast <vscale x 16 x half> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1)
+  %interleaved.vec = tail call <vscale x 32 x half> @llvm.vector.interleave2.nxv32f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1)
   ret <vscale x 32 x half> %interleaved.vec
 }
 
-declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
 
-declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
 
-declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
 
-declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
-declare <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
+declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
+declare <vscale x 32 x half> @llvm.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
index 7cdb10e7159f..a5c64c0982d0 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
@@ -127,15 +127,15 @@ define <4 x half> @complex_add_v4f16_with_intrinsic(<4 x half> %a, <4 x half> %b
 ; CHECK-NEXT:    fcadd v0.4h, v1.4h, v0.4h, #90
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %a)
+  %a.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.vector.deinterleave2.v4f16(<4 x half> %a)
   %a.real = extractvalue { <2 x half>, <2 x half> } %a.deinterleaved, 0
   %a.imag = extractvalue { <2 x half>, <2 x half> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %b)
+  %b.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.vector.deinterleave2.v4f16(<4 x half> %b)
   %b.real = extractvalue { <2 x half>, <2 x half> } %b.deinterleaved, 0
   %b.imag = extractvalue { <2 x half>, <2 x half> } %b.deinterleaved, 1
   %0 = fsub fast <2 x half> %b.real, %a.imag
   %1 = fadd fast <2 x half> %b.imag, %a.real
-  %interleaved.vec = tail call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %0, <2 x half> %1)
+  %interleaved.vec = tail call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %0, <2 x half> %1)
   ret <4 x half> %interleaved.vec
 }
 
@@ -146,15 +146,15 @@ define <8 x half> @complex_add_v8f16_with_intrinsic(<8 x half> %a, <8 x half> %b
 ; CHECK-NEXT:    fcadd v0.8h, v1.8h, v0.8h, #90
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %a)
+  %a.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.vector.deinterleave2.v8f16(<8 x half> %a)
   %a.real = extractvalue { <4 x half>, <4 x half> } %a.deinterleaved, 0
   %a.imag = extractvalue { <4 x half>, <4 x half> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %b)
+  %b.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.vector.deinterleave2.v8f16(<8 x half> %b)
   %b.real = extractvalue { <4 x half>, <4 x half> } %b.deinterleaved, 0
   %b.imag = extractvalue { <4 x half>, <4 x half> } %b.deinterleaved, 1
   %0 = fsub fast <4 x half> %b.real, %a.imag
   %1 = fadd fast <4 x half> %b.imag, %a.real
-  %interleaved.vec = tail call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %0, <4 x half> %1)
+  %interleaved.vec = tail call <8 x half> @llvm.vector.interleave2.v8f16(<4 x half> %0, <4 x half> %1)
   ret <8 x half> %interleaved.vec
 }
 
@@ -166,15 +166,15 @@ define <16 x half> @complex_add_v16f16_with_intrinsic(<16 x half> %a, <16 x half
 ; CHECK-NEXT:    fcadd v0.8h, v2.8h, v0.8h, #90
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %a)
+  %a.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.vector.deinterleave2.v16f16(<16 x half> %a)
   %a.real = extractvalue { <8 x half>, <8 x half> } %a.deinterleaved, 0
   %a.imag = extractvalue { <8 x half>, <8 x half> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %b)
+  %b.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.vector.deinterleave2.v16f16(<16 x half> %b)
   %b.real = extractvalue { <8 x half>, <8 x half> } %b.deinterleaved, 0
   %b.imag = extractvalue { <8 x half>, <8 x half> } %b.deinterleaved, 1
   %0 = fsub fast <8 x half> %b.real, %a.imag
   %1 = fadd fast <8 x half> %b.imag, %a.real
-  %interleaved.vec = tail call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %0, <8 x half> %1)
+  %interleaved.vec = tail call <16 x half> @llvm.vector.interleave2.v16f16(<8 x half> %0, <8 x half> %1)
   ret <16 x half> %interleaved.vec
 }
 
@@ -216,11 +216,11 @@ entry:
 }
 
 
-declare { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
-declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+declare { <2 x half>, <2 x half> } @llvm.vector.deinterleave2.v4f16(<4 x half>)
+declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>)
 
-declare { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
-declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+declare { <4 x half>, <4 x half> } @llvm.vector.deinterleave2.v8f16(<8 x half>)
+declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>)
 
-declare { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
-declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
+declare { <8 x half>, <8 x half> } @llvm.vector.deinterleave2.v16f16(<16 x half>)
+declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
index cb285c05b2e8..c09ec616b015 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
@@ -27,10 +27,10 @@ define <vscale x 4 x half> @complex_mul_v4f16(<vscale x 4 x half> %a, <vscale x
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
+  %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
   %a.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
+  %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
   %b.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 1
   %0 = fmul fast <vscale x 2 x half> %b.imag, %a.real
@@ -39,7 +39,7 @@ entry:
   %3 = fmul fast <vscale x 2 x half> %b.real, %a.real
   %4 = fmul fast <vscale x 2 x half> %a.imag, %b.imag
   %5 = fsub fast <vscale x 2 x half> %3, %4
-  %interleaved.vec = tail call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %5, <vscale x 2 x half> %2)
+  %interleaved.vec = tail call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %5, <vscale x 2 x half> %2)
   ret <vscale x 4 x half> %interleaved.vec
 }
 
@@ -54,10 +54,10 @@ define <vscale x 8 x half> @complex_mul_v8f16(<vscale x 8 x half> %a, <vscale x
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
   %a.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
   %b.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 1
   %0 = fmul fast <vscale x 4 x half> %b.imag, %a.real
@@ -66,7 +66,7 @@ entry:
   %3 = fmul fast <vscale x 4 x half> %b.real, %a.real
   %4 = fmul fast <vscale x 4 x half> %a.imag, %b.imag
   %5 = fsub fast <vscale x 4 x half> %3, %4
-  %interleaved.vec = tail call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %5, <vscale x 4 x half> %2)
+  %interleaved.vec = tail call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %5, <vscale x 4 x half> %2)
   ret <vscale x 8 x half> %interleaved.vec
 }
 ; Expected to transform
@@ -84,10 +84,10 @@ define <vscale x 16 x half> @complex_mul_v16f16(<vscale x 16 x half> %a, <vscale
 ; CHECK-NEXT:    mov z0.d, z5.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
+  %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
   %a.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
+  %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
   %b.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 1
   %0 = fmul fast <vscale x 8 x half> %b.imag, %a.real
@@ -96,7 +96,7 @@ entry:
   %3 = fmul fast <vscale x 8 x half> %b.real, %a.real
   %4 = fmul fast <vscale x 8 x half> %a.imag, %b.imag
   %5 = fsub fast <vscale x 8 x half> %3, %4
-  %interleaved.vec = tail call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %5, <vscale x 8 x half> %2)
+  %interleaved.vec = tail call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %5, <vscale x 8 x half> %2)
   ret <vscale x 16 x half> %interleaved.vec
 }
 
@@ -123,10 +123,10 @@ define <vscale x 32 x half> @complex_mul_v32f16(<vscale x 32 x half> %a, <vscale
 ; CHECK-NEXT:    mov z2.d, z27.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
+  %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
   %a.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
+  %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
   %b.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 1
   %0 = fmul fast <vscale x 16 x half> %b.imag, %a.real
@@ -135,20 +135,20 @@ entry:
   %3 = fmul fast <vscale x 16 x half> %b.real, %a.real
   %4 = fmul fast <vscale x 16 x half> %a.imag, %b.imag
   %5 = fsub fast <vscale x 16 x half> %3, %4
-  %interleaved.vec = tail call <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half> %5, <vscale x 16 x half> %2)
+  %interleaved.vec = tail call <vscale x 32 x half> @llvm.vector.interleave2.nxv32f16(<vscale x 16 x half> %5, <vscale x 16 x half> %2)
   ret <vscale x 32 x half> %interleaved.vec
 }
 
-declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
 
-declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
 
-declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
 
-declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
-declare <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
+declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
+declare <vscale x 32 x half> @llvm.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
 
 
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
index ab764a58a770..47ad9ea2451a 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
@@ -12,15 +12,15 @@ define <vscale x 4 x float> @complex_add_v4f32(<vscale x 4 x float> %a, <vscale
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
+  %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
   %a.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
+  %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
   %b.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 1
   %0 = fsub fast <vscale x 2 x float> %b.real, %a.imag
   %1 = fadd fast <vscale x 2 x float> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1)
+  %interleaved.vec = tail call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1)
   ret <vscale x 4 x float> %interleaved.vec
 }
 
@@ -35,15 +35,15 @@ define <vscale x 8 x float> @complex_add_v8f32(<vscale x 8 x float> %a, <vscale
 ; CHECK-NEXT:    mov z1.d, z3.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
   %a.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
   %b.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 1
   %0 = fsub fast <vscale x 4 x float> %b.real, %a.imag
   %1 = fadd fast <vscale x 4 x float> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1)
+  %interleaved.vec = tail call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1)
   ret <vscale x 8 x float> %interleaved.vec
 }
 ; Expected to transform
@@ -61,23 +61,23 @@ define <vscale x 16 x float> @complex_add_v16f32(<vscale x 16 x float> %a, <vsca
 ; CHECK-NEXT:    mov z3.d, z7.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
+  %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
   %a.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
+  %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
   %b.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 1
   %0 = fsub fast <vscale x 8 x float> %b.real, %a.imag
   %1 = fadd fast <vscale x 8 x float> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1)
+  %interleaved.vec = tail call <vscale x 16 x float> @llvm.vector.interleave2.nxv16f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1)
   ret <vscale x 16 x float> %interleaved.vec
 }
 
-declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
 
-declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
 
-declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
-declare <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
+declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 16 x float> @llvm.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
index 1e2afb78de1b..bcd46aa182b5 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
@@ -14,10 +14,10 @@ define <vscale x 4 x float> @complex_mul_v4f32(<vscale x 4 x float> %a, <vscale
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
+  %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
   %a.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
+  %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
   %b.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 1
   %0 = fmul fast <vscale x 2 x float> %b.imag, %a.real
@@ -26,7 +26,7 @@ entry:
   %3 = fmul fast <vscale x 2 x float> %b.real, %a.real
   %4 = fmul fast <vscale x 2 x float> %a.imag, %b.imag
   %5 = fsub fast <vscale x 2 x float> %3, %4
-  %interleaved.vec = tail call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %5, <vscale x 2 x float> %2)
+  %interleaved.vec = tail call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %5, <vscale x 2 x float> %2)
   ret <vscale x 4 x float> %interleaved.vec
 }
 
@@ -45,10 +45,10 @@ define <vscale x 8 x float> @complex_mul_v8f32(<vscale x 8 x float> %a, <vscale
 ; CHECK-NEXT:    mov z0.d, z5.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
   %a.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
   %b.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 1
   %0 = fmul fast <vscale x 4 x float> %b.imag, %a.real
@@ -57,7 +57,7 @@ entry:
   %3 = fmul fast <vscale x 4 x float> %b.real, %a.real
   %4 = fmul fast <vscale x 4 x float> %a.imag, %b.imag
   %5 = fsub fast <vscale x 4 x float> %3, %4
-  %interleaved.vec = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %5, <vscale x 4 x float> %2)
+  %interleaved.vec = tail call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %5, <vscale x 4 x float> %2)
   ret <vscale x 8 x float> %interleaved.vec
 }
 
@@ -84,10 +84,10 @@ define <vscale x 16 x float> @complex_mul_v16f32(<vscale x 16 x float> %a, <vsca
 ; CHECK-NEXT:    mov z2.d, z27.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
+  %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
   %a.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
+  %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
   %b.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 1
   %0 = fmul fast <vscale x 8 x float> %b.imag, %a.real
@@ -96,16 +96,16 @@ entry:
   %3 = fmul fast <vscale x 8 x float> %b.real, %a.real
   %4 = fmul fast <vscale x 8 x float> %a.imag, %b.imag
   %5 = fsub fast <vscale x 8 x float> %3, %4
-  %interleaved.vec = tail call <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float> %5, <vscale x 8 x float> %2)
+  %interleaved.vec = tail call <vscale x 16 x float> @llvm.vector.interleave2.nxv16f32(<vscale x 8 x float> %5, <vscale x 8 x float> %2)
   ret <vscale x 16 x float> %interleaved.vec
 }
 
-declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
 
-declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
 
-declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
-declare <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
+declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 16 x float> @llvm.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
 
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
index 46a15f489d2b..c992d63ca283 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
@@ -12,15 +12,15 @@ define <vscale x 2 x double> @complex_add_v2f64(<vscale x 2 x double> %a, <vscal
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
+  %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
   %a.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
+  %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
   %b.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 1
   %0 = fsub fast <vscale x 1 x double> %b.real, %a.imag
   %1 = fadd fast <vscale x 1 x double> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1)
+  %interleaved.vec = tail call <vscale x 2 x double> @llvm.vector.interleave2.nxv2f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1)
   ret <vscale x 2 x double> %interleaved.vec
 }
 
@@ -35,15 +35,15 @@ define <vscale x 4 x double> @complex_add_v4f64(<vscale x 4 x double> %a, <vscal
 ; CHECK-NEXT:    mov z1.d, z3.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %a.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %b.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 1
   %0 = fsub fast <vscale x 2 x double> %b.real, %a.imag
   %1 = fadd fast <vscale x 2 x double> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
@@ -62,23 +62,23 @@ define <vscale x 8 x double> @complex_add_v8f64(<vscale x 8 x double> %a, <vscal
 ; CHECK-NEXT:    mov z3.d, z7.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
   %a.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
   %b.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 1
   %0 = fsub fast <vscale x 4 x double> %b.real, %a.imag
   %1 = fadd fast <vscale x 4 x double> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1)
+  %interleaved.vec = tail call <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1)
   ret <vscale x 8 x double> %interleaved.vec
 }
 
-declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
-declare <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
+declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
 
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
 
-declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
-declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
index 17a239a09a03..db28fa3997cb 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
@@ -14,10 +14,10 @@ define <vscale x 2 x double> @complex_mul_v2f64(<vscale x 2 x double> %a, <vscal
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
+  %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
   %a.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
+  %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
   %b.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 1
   %0 = fmul fast <vscale x 1 x double> %b.imag, %a.real
@@ -26,7 +26,7 @@ entry:
   %3 = fmul fast <vscale x 1 x double> %b.real, %a.real
   %4 = fmul fast <vscale x 1 x double> %a.imag, %b.imag
   %5 = fsub fast <vscale x 1 x double> %3, %4
-  %interleaved.vec = tail call <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double> %5, <vscale x 1 x double> %2)
+  %interleaved.vec = tail call <vscale x 2 x double> @llvm.vector.interleave2.nxv2f64(<vscale x 1 x double> %5, <vscale x 1 x double> %2)
   ret <vscale x 2 x double> %interleaved.vec
 }
 
@@ -45,10 +45,10 @@ define <vscale x 4 x double> @complex_mul_v4f64(<vscale x 4 x double> %a, <vscal
 ; CHECK-NEXT:    mov z0.d, z5.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %a.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %b.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 1
   %0 = fmul fast <vscale x 2 x double> %b.imag, %a.real
@@ -57,7 +57,7 @@ entry:
   %3 = fmul fast <vscale x 2 x double> %b.real, %a.real
   %4 = fmul fast <vscale x 2 x double> %a.imag, %b.imag
   %5 = fsub fast <vscale x 2 x double> %3, %4
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %5, <vscale x 2 x double> %2)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %5, <vscale x 2 x double> %2)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
@@ -84,10 +84,10 @@ define <vscale x 8 x double> @complex_mul_v8f64(<vscale x 8 x double> %a, <vscal
 ; CHECK-NEXT:    mov z2.d, z27.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
   %a.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
   %b.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 1
   %0 = fmul fast <vscale x 4 x double> %b.imag, %a.real
@@ -96,15 +96,15 @@ entry:
   %3 = fmul fast <vscale x 4 x double> %b.real, %a.real
   %4 = fmul fast <vscale x 4 x double> %a.imag, %b.imag
   %5 = fsub fast <vscale x 4 x double> %3, %4
-  %interleaved.vec = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> %5, <vscale x 4 x double> %2)
+  %interleaved.vec = tail call <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double> %5, <vscale x 4 x double> %2)
   ret <vscale x 8 x double> %interleaved.vec
 }
 
-declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
-declare <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
+declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
 
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
 
-declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
-declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
index 001046f8f397..f0569674c651 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
@@ -22,15 +22,15 @@ define <vscale x 4 x i16> @complex_add_v4i16(<vscale x 4 x i16> %a, <vscale x 4
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
+  %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
   %a.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
+  %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
   %b.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 1
   %0 = sub <vscale x 2 x i16> %b.real, %a.imag
   %1 = add <vscale x 2 x i16> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1)
+  %interleaved.vec = tail call <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1)
   ret <vscale x 4 x i16> %interleaved.vec
 }
 
@@ -42,15 +42,15 @@ define <vscale x 8 x i16> @complex_add_v8i16(<vscale x 8 x i16> %a, <vscale x 8
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
   %a.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
   %b.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 1
   %0 = sub <vscale x 4 x i16> %b.real, %a.imag
   %1 = add <vscale x 4 x i16> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1)
+  %interleaved.vec = tail call <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1)
   ret <vscale x 8 x i16> %interleaved.vec
 }
 
@@ -64,15 +64,15 @@ define <vscale x 16 x i16> @complex_add_v16i16(<vscale x 16 x i16> %a, <vscale x
 ; CHECK-NEXT:    mov z1.d, z3.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
+  %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
   %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
+  %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
   %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
   %0 = sub <vscale x 8 x i16> %b.real, %a.imag
   %1 = add <vscale x 8 x i16> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1)
+  %interleaved.vec = tail call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1)
   ret <vscale x 16 x i16> %interleaved.vec
 }
 
@@ -90,26 +90,26 @@ define <vscale x 32 x i16> @complex_add_v32i16(<vscale x 32 x i16> %a, <vscale x
 ; CHECK-NEXT:    mov z3.d, z7.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
+  %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
   %a.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
+  %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
   %b.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 1
   %0 = sub <vscale x 16 x i16> %b.real, %a.imag
   %1 = add <vscale x 16 x i16> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1)
+  %interleaved.vec = tail call <vscale x 32 x i16> @llvm.vector.interleave2.nxv32i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1)
   ret <vscale x 32 x i16> %interleaved.vec
 }
 
-declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
+declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
 
-declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
+declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
 
-declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
 
-declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
-declare <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
+declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
+declare <vscale x 32 x i16> @llvm.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
index 07488b623b98..b4cb548f6308 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
@@ -26,10 +26,10 @@ define <vscale x 4 x i16> @complex_mul_v4i16(<vscale x 4 x i16> %a, <vscale x 4
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
+  %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
   %a.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
+  %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
   %b.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 1
   %0 = mul <vscale x 2 x i16> %b.imag, %a.real
@@ -38,7 +38,7 @@ entry:
   %3 = mul <vscale x 2 x i16> %b.real, %a.real
   %4 = mul <vscale x 2 x i16> %a.imag, %b.imag
   %5 = sub <vscale x 2 x i16> %3, %4
-  %interleaved.vec = tail call <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16> %5, <vscale x 2 x i16> %2)
+  %interleaved.vec = tail call <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16> %5, <vscale x 2 x i16> %2)
   ret <vscale x 4 x i16> %interleaved.vec
 }
 
@@ -52,10 +52,10 @@ define <vscale x 8 x i16> @complex_mul_v8i16(<vscale x 8 x i16> %a, <vscale x 8
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
   %a.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
   %b.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 1
   %0 = mul <vscale x 4 x i16> %b.imag, %a.real
@@ -64,7 +64,7 @@ entry:
   %3 = mul <vscale x 4 x i16> %b.real, %a.real
   %4 = mul <vscale x 4 x i16> %a.imag, %b.imag
   %5 = sub <vscale x 4 x i16> %3, %4
-  %interleaved.vec = tail call <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16> %5, <vscale x 4 x i16> %2)
+  %interleaved.vec = tail call <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16> %5, <vscale x 4 x i16> %2)
   ret <vscale x 8 x i16> %interleaved.vec
 }
 ; Expected to transform
@@ -81,10 +81,10 @@ define <vscale x 16 x i16> @complex_mul_v16i16(<vscale x 16 x i16> %a, <vscale x
 ; CHECK-NEXT:    mov z0.d, z5.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
+  %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
   %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
+  %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
   %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
   %0 = mul <vscale x 8 x i16> %b.imag, %a.real
@@ -93,7 +93,7 @@ entry:
   %3 = mul <vscale x 8 x i16> %b.real, %a.real
   %4 = mul <vscale x 8 x i16> %a.imag, %b.imag
   %5 = sub <vscale x 8 x i16> %3, %4
-  %interleaved.vec = tail call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %5, <vscale x 8 x i16> %2)
+  %interleaved.vec = tail call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %5, <vscale x 8 x i16> %2)
   ret <vscale x 16 x i16> %interleaved.vec
 }
 
@@ -119,10 +119,10 @@ define <vscale x 32 x i16> @complex_mul_v32i16(<vscale x 32 x i16> %a, <vscale x
 ; CHECK-NEXT:    mov z2.d, z27.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
+  %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
   %a.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
+  %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
   %b.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 1
   %0 = mul <vscale x 16 x i16> %b.imag, %a.real
@@ -131,20 +131,20 @@ entry:
   %3 = mul <vscale x 16 x i16> %b.real, %a.real
   %4 = mul <vscale x 16 x i16> %a.imag, %b.imag
   %5 = sub <vscale x 16 x i16> %3, %4
-  %interleaved.vec = tail call <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16> %5, <vscale x 16 x i16> %2)
+  %interleaved.vec = tail call <vscale x 32 x i16> @llvm.vector.interleave2.nxv32i16(<vscale x 16 x i16> %5, <vscale x 16 x i16> %2)
   ret <vscale x 32 x i16> %interleaved.vec
 }
 
-declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
+declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
 
-declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
+declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
 
-declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
 
-declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
-declare <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
+declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
+declare <vscale x 32 x i16> @llvm.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
 
 
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
index 1ce480bbf3d8..458cd62269f8 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
@@ -11,15 +11,15 @@ define <vscale x 4 x i32> @complex_add_v4i32(<vscale x 4 x i32> %a, <vscale x 4
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
+  %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
   %a.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
+  %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
   %b.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 1
   %0 = sub <vscale x 2 x i32> %b.real, %a.imag
   %1 = add <vscale x 2 x i32> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1)
+  %interleaved.vec = tail call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1)
   ret <vscale x 4 x i32> %interleaved.vec
 }
 
@@ -33,15 +33,15 @@ define <vscale x 8 x i32> @complex_add_v8i32(<vscale x 8 x i32> %a, <vscale x 8
 ; CHECK-NEXT:    mov z1.d, z3.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
   %a.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
   %b.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 1
   %0 = sub <vscale x 4 x i32> %b.real, %a.imag
   %1 = add <vscale x 4 x i32> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1)
+  %interleaved.vec = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1)
   ret <vscale x 8 x i32> %interleaved.vec
 }
 
@@ -59,23 +59,23 @@ define <vscale x 16 x i32> @complex_add_v16i32(<vscale x 16 x i32> %a, <vscale x
 ; CHECK-NEXT:    mov z3.d, z7.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
+  %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
   %a.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
+  %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
   %b.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 1
   %0 = sub <vscale x 8 x i32> %b.real, %a.imag
   %1 = add <vscale x 8 x i32> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1)
+  %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1)
   ret <vscale x 16 x i32> %interleaved.vec
 }
 
-declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
+declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
 
-declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
 
-declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
-declare <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
+declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
+declare <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
index d88eef9800d7..4cfe4707b9a9 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
@@ -13,10 +13,10 @@ define <vscale x 4 x i32> @complex_mul_v4i32(<vscale x 4 x i32> %a, <vscale x 4
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
+  %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
   %a.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
+  %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
   %b.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 1
   %0 = mul <vscale x 2 x i32> %b.imag, %a.real
@@ -25,7 +25,7 @@ entry:
   %3 = mul <vscale x 2 x i32> %b.real, %a.real
   %4 = mul <vscale x 2 x i32> %a.imag, %b.imag
   %5 = sub <vscale x 2 x i32> %3, %4
-  %interleaved.vec = tail call <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32> %5, <vscale x 2 x i32> %2)
+  %interleaved.vec = tail call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %5, <vscale x 2 x i32> %2)
   ret <vscale x 4 x i32> %interleaved.vec
 }
 
@@ -43,10 +43,10 @@ define <vscale x 8 x i32> @complex_mul_v8i32(<vscale x 8 x i32> %a, <vscale x 8
 ; CHECK-NEXT:    mov z0.d, z5.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
   %a.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
   %b.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 1
   %0 = mul <vscale x 4 x i32> %b.imag, %a.real
@@ -55,7 +55,7 @@ entry:
   %3 = mul <vscale x 4 x i32> %b.real, %a.real
   %4 = mul <vscale x 4 x i32> %a.imag, %b.imag
   %5 = sub <vscale x 4 x i32> %3, %4
-  %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %5, <vscale x 4 x i32> %2)
+  %interleaved.vec = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %5, <vscale x 4 x i32> %2)
   ret <vscale x 8 x i32> %interleaved.vec
 }
 
@@ -81,10 +81,10 @@ define <vscale x 16 x i32> @complex_mul_v16i32(<vscale x 16 x i32> %a, <vscale x
 ; CHECK-NEXT:    mov z2.d, z27.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
+  %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
   %a.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
+  %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
   %b.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 1
   %0 = mul <vscale x 8 x i32> %b.imag, %a.real
@@ -93,16 +93,16 @@ entry:
   %3 = mul <vscale x 8 x i32> %b.real, %a.real
   %4 = mul <vscale x 8 x i32> %a.imag, %b.imag
   %5 = sub <vscale x 8 x i32> %3, %4
-  %interleaved.vec = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %5, <vscale x 8 x i32> %2)
+  %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %5, <vscale x 8 x i32> %2)
   ret <vscale x 16 x i32> %interleaved.vec
 }
 
-declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
+declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
 
-declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
 
-declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
-declare <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
+declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
+declare <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
 
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
index 0b59be9414fa..f06b55c68b7e 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
@@ -11,15 +11,15 @@ define <vscale x 2 x i64> @complex_add_v2i64(<vscale x 2 x i64> %a, <vscale x 2
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
+  %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
   %a.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
+  %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
   %b.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 1
   %0 = sub <vscale x 1 x i64> %b.real, %a.imag
   %1 = add <vscale x 1 x i64> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1)
+  %interleaved.vec = tail call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1)
   ret <vscale x 2 x i64> %interleaved.vec
 }
 
@@ -33,15 +33,15 @@ define <vscale x 4 x i64> @complex_add_v4i64(<vscale x 4 x i64> %a, <vscale x 4
 ; CHECK-NEXT:    mov z1.d, z3.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
+  %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
   %a.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
+  %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
   %b.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 1
   %0 = sub <vscale x 2 x i64> %b.real, %a.imag
   %1 = add <vscale x 2 x i64> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1)
+  %interleaved.vec = tail call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1)
   ret <vscale x 4 x i64> %interleaved.vec
 }
 
@@ -59,23 +59,23 @@ define <vscale x 8 x i64> @complex_add_v8i64(<vscale x 8 x i64> %a, <vscale x 8
 ; CHECK-NEXT:    mov z3.d, z7.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
   %a.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
   %b.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 1
   %0 = sub <vscale x 4 x i64> %b.real, %a.imag
   %1 = add <vscale x 4 x i64> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1)
+  %interleaved.vec = tail call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1)
   ret <vscale x 8 x i64> %interleaved.vec
 }
 
-declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
+declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
 
-declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
 
-declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
+declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
index 16e1f3e63dce..5975f3b491d4 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
@@ -13,10 +13,10 @@ define <vscale x 2 x i64> @complex_mul_v2i64(<vscale x 2 x i64> %a, <vscale x 2
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
+  %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
   %a.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
+  %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
   %b.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 1
   %0 = mul <vscale x 1 x i64> %b.imag, %a.real
@@ -25,7 +25,7 @@ entry:
   %3 = mul <vscale x 1 x i64> %b.real, %a.real
   %4 = mul <vscale x 1 x i64> %a.imag, %b.imag
   %5 = sub <vscale x 1 x i64> %3, %4
-  %interleaved.vec = tail call <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64> %5, <vscale x 1 x i64> %2)
+  %interleaved.vec = tail call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> %5, <vscale x 1 x i64> %2)
   ret <vscale x 2 x i64> %interleaved.vec
 }
 
@@ -43,10 +43,10 @@ define <vscale x 4 x i64> @complex_mul_v4i64(<vscale x 4 x i64> %a, <vscale x 4
 ; CHECK-NEXT:    mov z0.d, z5.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
+  %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
   %a.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
+  %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
   %b.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 1
   %0 = mul <vscale x 2 x i64> %b.imag, %a.real
@@ -55,7 +55,7 @@ entry:
   %3 = mul <vscale x 2 x i64> %b.real, %a.real
   %4 = mul <vscale x 2 x i64> %a.imag, %b.imag
   %5 = sub <vscale x 2 x i64> %3, %4
-  %interleaved.vec = tail call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %5, <vscale x 2 x i64> %2)
+  %interleaved.vec = tail call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %5, <vscale x 2 x i64> %2)
   ret <vscale x 4 x i64> %interleaved.vec
 }
 
@@ -81,10 +81,10 @@ define <vscale x 8 x i64> @complex_mul_v8i64(<vscale x 8 x i64> %a, <vscale x 8
 ; CHECK-NEXT:    mov z2.d, z27.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
   %a.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
   %b.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 1
   %0 = mul <vscale x 4 x i64> %b.imag, %a.real
@@ -93,7 +93,7 @@ entry:
   %3 = mul <vscale x 4 x i64> %b.real, %a.real
   %4 = mul <vscale x 4 x i64> %a.imag, %b.imag
   %5 = sub <vscale x 4 x i64> %3, %4
-  %interleaved.vec = tail call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %5, <vscale x 4 x i64> %2)
+  %interleaved.vec = tail call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %5, <vscale x 4 x i64> %2)
   ret <vscale x 8 x i64> %interleaved.vec
 }
 
@@ -119,11 +119,11 @@ define <vscale x 8 x i64> @complex_minus_mul_v8i64(<vscale x 8 x i64> %a, <vscal
 ; CHECK-NEXT:    mov z2.d, z27.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
   %a.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 1
   %0 = sub <vscale x 4 x i64> zeroinitializer, %a.real
-  %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
   %b.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 1
   %1 = mul <vscale x 4 x i64> %b.real, %0
@@ -132,15 +132,15 @@ entry:
   %4 = mul <vscale x 4 x i64> %b.real, %a.imag
   %5 = mul <vscale x 4 x i64> %b.imag, %0
   %6 = sub <vscale x 4 x i64> %5, %4
-  %interleaved.vec = tail call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %3, <vscale x 4 x i64> %6)
+  %interleaved.vec = tail call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %3, <vscale x 4 x i64> %6)
   ret <vscale x 8 x i64> %interleaved.vec
 }
 
-declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
+declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
 
-declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
 
-declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
+declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
index b631486137e6..81872c1723f2 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
@@ -22,15 +22,15 @@ define <vscale x 8 x i8> @complex_add_v8i8(<vscale x 8 x i8> %a, <vscale x 8 x i
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.experimental.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %a)
+  %a.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %a)
   %a.real = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.experimental.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %b)
+  %b.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %b)
   %b.real = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %b.deinterleaved, 1
   %0 = sub <vscale x 4 x i8> %b.real, %a.imag
   %1 = add <vscale x 4 x i8> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 8 x i8> @llvm.experimental.vector.interleave2.nxv8i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1)
+  %interleaved.vec = tail call <vscale x 8 x i8> @llvm.vector.interleave2.nxv8i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1)
   ret <vscale x 8 x i8> %interleaved.vec
 }
 
@@ -42,15 +42,15 @@ define <vscale x 16 x i8> @complex_add_v16i8(<vscale x 16 x i8> %a, <vscale x 16
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %a)
+  %a.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %a)
   %a.real = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %b)
+  %b.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %b)
   %b.real = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %b.deinterleaved, 1
   %0 = sub <vscale x 8 x i8> %b.real, %a.imag
   %1 = add <vscale x 8 x i8> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1)
+  %interleaved.vec = tail call <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1)
   ret <vscale x 16 x i8> %interleaved.vec
 }
 
@@ -64,23 +64,23 @@ define <vscale x 32 x i8> @complex_add_v32i8(<vscale x 32 x i8> %a, <vscale x 32
 ; CHECK-NEXT:    mov z1.d, z3.d
 ; CHECK-NEXT:    ret
 entry:
-  %a.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a)
+  %a.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a)
   %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
   %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
-  %b.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b)
+  %b.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b)
   %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
   %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
   %0 = sub <vscale x 16 x i8> %b.real, %a.imag
   %1 = add <vscale x 16 x i8> %b.imag, %a.real
-  %interleaved.vec = tail call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1)
+  %interleaved.vec = tail call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1)
   ret <vscale x 32 x i8> %interleaved.vec
 }
 
-declare { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.experimental.vector.deinterleave2.nxv8i8(<vscale x 8 x i8>)
-declare <vscale x 8 x i8> @llvm.experimental.vector.interleave2.nxv8i8(<vscale x 4 x i8>, <vscale x 4 x i8>)
+declare { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8>)
+declare <vscale x 8 x i8> @llvm.vector.interleave2.nxv8i8(<vscale x 4 x i8>, <vscale x 4 x i8>)
 
-declare { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8>, <vscale x 8 x i8>)
+declare { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8>, <vscale x 8 x i8>)
 
-declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index 19318fdeeca7..ac2b21af29ab 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -69,14 +69,14 @@ vector.body:                                      ; preds = %vector.body, %entry
   %vec.phi27 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %16, %vector.body ]
   %scevgep = getelementptr i8, ptr %a, i64 %lsr.iv
   %scevgep34 = getelementptr i8, ptr %b, i64 %lsr.iv
-  %interleaved.mask = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %active.lane.mask)
+  %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %active.lane.mask)
   %wide.masked.vec = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
   %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %interleaved.mask28 = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %active.lane.mask)
+  %interleaved.mask28 = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %active.lane.mask)
   %wide.masked.vec29 = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep34, i32 8, <vscale x 4 x i1> %interleaved.mask28, <vscale x 4 x double> poison)
-  %strided.vec30 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec29)
+  %strided.vec30 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec29)
   %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec30, 0
   %6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec30, 1
   %7 = fmul fast <vscale x 2 x double> %6, %3
@@ -175,13 +175,13 @@ vector.body:                                      ; preds = %vector.body, %entry
   %4 = icmp ne <vscale x 2 x i32> %wide.load, zeroinitializer
   %scevgep49 = getelementptr i8, ptr %a, i64 %lsr.iv48
   %scevgep50 = getelementptr i8, ptr %b, i64 %lsr.iv48
-  %interleaved.mask = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %4, <vscale x 2 x i1> %4)
+  %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %4, <vscale x 2 x i1> %4)
   %wide.masked.vec = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep49, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
   %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
   %wide.masked.vec32 = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep50, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
-  %strided.vec33 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec32)
+  %strided.vec33 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec32)
   %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec33, 0
   %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec33, 1
   %9 = fmul fast <vscale x 2 x double> %8, %5
@@ -279,14 +279,14 @@ vector.body:                                      ; preds = %vector.body, %entry
   %scevgep38 = getelementptr i8, ptr %a, i64 %lsr.iv
   %scevgep39 = getelementptr i8, ptr %b, i64 %lsr.iv
   %5 = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %4, <vscale x 2 x i1> zeroinitializer
-  %interleaved.mask = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %5, <vscale x 2 x i1> %5)
+  %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %5, <vscale x 2 x i1> %5)
   %wide.masked.vec = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep38, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
   %6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %interleaved.mask31 = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %5, <vscale x 2 x i1> %5)
+  %interleaved.mask31 = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %5, <vscale x 2 x i1> %5)
   %wide.masked.vec32 = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep39, i32 8, <vscale x 4 x i1> %interleaved.mask31, <vscale x 4 x double> poison)
-  %strided.vec33 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec32)
+  %strided.vec33 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec32)
   %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec33, 0
   %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec33, 1
   %10 = fmul fast <vscale x 2 x double> %9, %6
@@ -320,6 +320,6 @@ declare i64 @llvm.vscale.i64()
 declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64)
 declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i32>)
 declare <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x double>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
 declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 5bef95910d90..af07519ad53d 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -64,11 +64,11 @@ vector.body:                                      ; preds = %vector.body, %entry
   %scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
   %scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
   %wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
-  %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
+  %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
   %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
   %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
   %wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
-  %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
+  %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
   %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
   %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
   %9 = fmul fast <vscale x 2 x double> %8, %4
@@ -156,11 +156,11 @@ vector.body:                                      ; preds = %vector.body, %entry
   %scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
   %scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
   %wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
-  %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
+  %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
   %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
   %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
   %wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
-  %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
+  %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
   %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
   %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
   %9 = fmul fast <vscale x 2 x double> %8, %4
@@ -266,16 +266,16 @@ vector.body:                                      ; preds = %vector.body, %entry
   %scevgep62 = getelementptr i8, ptr %scevgep61, i64 %lsr.iv34
   %wide.vec = load <vscale x 4 x double>, ptr %scevgep57, align 8
   %wide.vec32 = load <vscale x 4 x double>, ptr %scevgep64, align 8
-  %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
-  %5 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec32)
+  %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
+  %5 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec32)
   %6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 0
   %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 0
   %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 1
   %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 1
   %wide.vec34 = load <vscale x 4 x double>, ptr %scevgep58, align 8
   %wide.vec35 = load <vscale x 4 x double>, ptr %scevgep62, align 8
-  %10 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec34)
-  %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec35)
+  %10 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec34)
+  %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec35)
   %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 0
   %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %11, 0
   %14 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 1
@@ -375,7 +375,7 @@ vector.body:                                      ; preds = %vector.body, %entry
   %5 = add <vscale x 2 x i32> %wide.load, %vec.phi
   %6 = getelementptr inbounds %"class.std::complex", ptr %a, i64 %index
   %wide.vec = load <vscale x 4 x double>, ptr %6, align 8
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
   %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
   %9 = fadd fast <vscale x 2 x double> %7, %vec.phi13
@@ -396,6 +396,6 @@ middle.block:                                     ; preds = %vector.body
 
 
 declare i64 @llvm.vscale.i64()
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
 declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
 declare i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
index 17bf5ba6eb48..b4425c0c01e1 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
@@ -28,10 +28,10 @@ define <vscale x 4 x double> @complex_mul_const(<vscale x 4 x double> %a, <vscal
 ; CHECK-NEXT:    mov z1.d, z4.d
 ; CHECK-NEXT:    ret
 entry:
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 0
   %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 1
   %4 = fmul fast <vscale x 2 x double> %3, %0
@@ -46,7 +46,7 @@ entry:
   %13 = fmul fast <vscale x 2 x double> %9, splat (double 1.100000e+01)
   %14 = fmul fast <vscale x 2 x double> %6, splat (double 3.000000e+00)
   %15 = fsub fast <vscale x 2 x double> %13, %14
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
@@ -83,10 +83,10 @@ entry:
   %broadcast.splat = shufflevector <vscale x 2 x double> %broadcast.splatinsert, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
   %broadcast.splatinsert49 = insertelement <vscale x 2 x double> poison, double %c.coerce.fca.0.extract, i64 0
   %broadcast.splat50 = shufflevector <vscale x 2 x double> %broadcast.splatinsert49, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
-  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+  %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
   %0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
   %1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
-  %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+  %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
   %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 0
   %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 1
   %4 = fmul fast <vscale x 2 x double> %3, %0
@@ -101,9 +101,9 @@ entry:
   %13 = fmul fast <vscale x 2 x double> %9, %broadcast.splat50
   %14 = fmul fast <vscale x 2 x double> %6, %broadcast.splat
   %15 = fsub fast <vscale x 2 x double> %13, %14
-  %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
+  %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
   ret <vscale x 4 x double> %interleaved.vec
 }
 
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/concatbinop.ll b/llvm/test/CodeGen/AArch64/concatbinop.ll
index a13e62e0612c..828182d18b38 100644
--- a/llvm/test/CodeGen/AArch64/concatbinop.ll
+++ b/llvm/test/CodeGen/AArch64/concatbinop.ll
@@ -5,9 +5,13 @@
 define <8 x i16> @concat_add(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
 ; CHECK-LABEL: concat_add:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v2.4h, v2.4h, v3.4h
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %x = add <4 x i16> %a, %b
   %y = add <4 x i16> %c, %d
@@ -33,13 +37,9 @@ define <8 x i16> @concat_addtunc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 define <8 x i16> @concat_addtunc2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
 ; CHECK-LABEL: concat_addtunc2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    xtn v2.4h, v2.4s
-; CHECK-NEXT:    xtn v3.4h, v3.4s
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    add v1.4h, v2.4h, v3.4h
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %at = trunc <4 x i32> %a to <4 x i16>
   %bt = trunc <4 x i32> %b to <4 x i16>
@@ -54,9 +54,13 @@ define <8 x i16> @concat_addtunc2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 define <8 x i16> @concat_sub(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
 ; CHECK-LABEL: concat_sub:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v2.4h, v2.4h, v3.4h
-; CHECK-NEXT:    sub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    sub v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %x = sub <4 x i16> %a, %b
   %y = sub <4 x i16> %c, %d
@@ -67,9 +71,13 @@ define <8 x i16> @concat_sub(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16>
 define <8 x i16> @concat_mul(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
 ; CHECK-LABEL: concat_mul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul v2.4h, v2.4h, v3.4h
-; CHECK-NEXT:    mul v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %x = mul <4 x i16> %a, %b
   %y = mul <4 x i16> %c, %d
@@ -80,9 +88,13 @@ define <8 x i16> @concat_mul(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16>
 define <8 x i16> @concat_xor(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
 ; CHECK-LABEL: concat_xor:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %x = xor <4 x i16> %a, %b
   %y = xor <4 x i16> %c, %d
@@ -93,9 +105,13 @@ define <8 x i16> @concat_xor(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16>
 define <8 x half> @concat_fadd(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) {
 ; CHECK-LABEL: concat_fadd:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fadd v2.4h, v2.4h, v3.4h
-; CHECK-NEXT:    fadd v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    fadd v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %x = fadd <4 x half> %a, %b
   %y = fadd <4 x half> %c, %d
@@ -106,9 +122,13 @@ define <8 x half> @concat_fadd(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x
 define <8 x half> @concat_fmul(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) {
 ; CHECK-LABEL: concat_fmul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmul v2.4h, v2.4h, v3.4h
-; CHECK-NEXT:    fmul v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    fmul v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %x = fmul <4 x half> %a, %b
   %y = fmul <4 x half> %c, %d
@@ -119,9 +139,13 @@ define <8 x half> @concat_fmul(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x
 define <8 x half> @concat_min(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) {
 ; CHECK-LABEL: concat_min:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fminnm v2.4h, v2.4h, v3.4h
-; CHECK-NEXT:    fminnm v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    fminnm v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %x = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
   %y = call <4 x half> @llvm.minnum.v4f16(<4 x half> %c, <4 x half> %d)
@@ -146,21 +170,16 @@ define <16 x i8> @signOf_neon(ptr nocapture noundef readonly %a, ptr nocapture n
 ; CHECK-LABEL: signOf_neon:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    movi v0.8b, #1
+; CHECK-NEXT:    movi v0.16b, #1
 ; CHECK-NEXT:    ldp q3, q4, [x1]
 ; CHECK-NEXT:    cmhi v5.8h, v1.8h, v3.8h
 ; CHECK-NEXT:    cmhi v6.8h, v2.8h, v4.8h
 ; CHECK-NEXT:    cmhi v1.8h, v3.8h, v1.8h
 ; CHECK-NEXT:    cmhi v2.8h, v4.8h, v2.8h
-; CHECK-NEXT:    xtn v3.8b, v5.8h
-; CHECK-NEXT:    xtn v4.8b, v6.8h
-; CHECK-NEXT:    xtn v1.8b, v1.8h
-; CHECK-NEXT:    xtn v2.8b, v2.8h
-; CHECK-NEXT:    and v3.8b, v3.8b, v0.8b
-; CHECK-NEXT:    and v4.8b, v4.8b, v0.8b
-; CHECK-NEXT:    orr v0.8b, v3.8b, v1.8b
-; CHECK-NEXT:    orr v1.8b, v4.8b, v2.8b
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    uzp1 v3.16b, v5.16b, v6.16b
+; CHECK-NEXT:    uzp1 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <8 x i16>, ptr %a, align 2
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index 2ad5623b6551..c58db8290c87 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -25,7 +25,7 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-GI-NEXT:    fmov d0, d2
 ; CHECK-GI-NEXT:    ret
-  %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
+  %retval = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec)
   ret {<2 x half>, <2 x half>}   %retval
 }
 
@@ -45,7 +45,7 @@ define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-GI-NEXT:    fmov d0, d2
 ; CHECK-GI-NEXT:    ret
-  %retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec)
+  %retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec)
   ret {<4 x half>, <4 x half>}   %retval
 }
 
@@ -56,7 +56,7 @@ define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %v
 ; CHECK-NEXT:    uzp2 v1.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec)
+  %retval = call {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half> %vec)
   ret {<8 x half>, <8 x half>}   %retval
 }
 
@@ -76,7 +76,7 @@ define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-GI-NEXT:    fmov d0, d2
 ; CHECK-GI-NEXT:    ret
-  %retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec)
+  %retval = call {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float> %vec)
   ret {<2 x float>, <2 x float>}   %retval
 }
 
@@ -87,7 +87,7 @@ define {<4 x float>, <4 x float>} @vector_deinterleave_v4f32_v8f32(<8 x float> %
 ; CHECK-NEXT:    uzp2 v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec)
+  %retval = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %vec)
 ret  {<4 x float>, <4 x float>}   %retval
 }
 
@@ -98,7 +98,7 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double
 ; CHECK-NEXT:    zip2 v1.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
+  %retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec)
   ret {<2 x double>, <2 x double>}   %retval
 }
 
@@ -111,7 +111,7 @@ define {<16 x i8>, <16 x i8>} @vector_deinterleave_v16i8_v32i8(<32 x i8> %vec) {
 ; CHECK-NEXT:    uzp2 v1.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec)
+  %retval = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec)
   ret {<16 x i8>, <16 x i8>}   %retval
 }
 
@@ -122,7 +122,7 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_v8i16_v16i16(<16 x i16> %vec)
 ; CHECK-NEXT:    uzp2 v1.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec)
+  %retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec)
   ret {<8 x i16>, <8 x i16>}   %retval
 }
 
@@ -133,7 +133,7 @@ define {<4 x i32>, <4 x i32>} @vector_deinterleave_v4i32_v8i32(<8 x i32> %vec) {
 ; CHECK-NEXT:    uzp2 v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec)
+  %retval = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32> %vec)
   ret {<4 x i32>, <4 x i32>}   %retval
 }
 
@@ -144,22 +144,22 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
 ; CHECK-NEXT:    zip2 v1.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
+  %retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec)
   ret {<2 x i64>, <2 x i64>}   %retval
 }
 
 
 ; Floating declarations
-declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
-declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
-declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>)
-declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
-declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
-declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
+declare {<2 x half>,<2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half>)
+declare {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half>)
+declare {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float>)
+declare {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half>)
+declare {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float>)
+declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double>)
 
 ; Integer declarations
-declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
-declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
-declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
-declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
+declare {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8>)
+declare {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16>)
+declare {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+declare {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64>)
 
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index eb81aff33e49..2e992964f598 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -7,7 +7,7 @@ define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    zip1 v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    ret
-  %retval = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
+  %retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
   ret <4 x half> %retval
 }
 
@@ -28,7 +28,7 @@ define <8 x half> @interleave2_v8f16(<4 x half> %vec0, <4 x half> %vec1) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    zip1 v0.8h, v0.8h, v1.8h
 ; CHECK-GI-NEXT:    ret
-  %retval = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %vec0, <4 x half> %vec1)
+  %retval = call <8 x half> @llvm.vector.interleave2.v8f16(<4 x half> %vec0, <4 x half> %vec1)
   ret <8 x half> %retval
 }
 
@@ -39,7 +39,7 @@ define <16 x half> @interleave2_v16f16(<8 x half> %vec0, <8 x half> %vec1) {
 ; CHECK-NEXT:    zip2 v1.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %retval = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %vec0, <8 x half> %vec1)
+  %retval = call <16 x half> @llvm.vector.interleave2.v16f16(<8 x half> %vec0, <8 x half> %vec1)
   ret <16 x half> %retval
 }
 
@@ -59,7 +59,7 @@ define <4 x float> @interleave2_v4f32(<2 x float> %vec0, <2 x float> %vec1) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    zip1 v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NEXT:    ret
-  %retval = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %vec0, <2 x float> %vec1)
+  %retval = call <4 x float> @llvm.vector.interleave2.v4f32(<2 x float> %vec0, <2 x float> %vec1)
   ret <4 x float> %retval
 }
 
@@ -70,7 +70,7 @@ define <8 x float> @interleave2_v8f32(<4 x float> %vec0, <4 x float> %vec1) {
 ; CHECK-NEXT:    zip2 v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %retval = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %vec0, <4 x float> %vec1)
+  %retval = call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %vec0, <4 x float> %vec1)
   ret <8 x float> %retval
 }
 
@@ -81,7 +81,7 @@ define <4 x double> @interleave2_v4f64(<2 x double> %vec0, <2 x double> %vec1) {
 ; CHECK-NEXT:    zip2 v1.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %retval = call <4 x double>@llvm.experimental.vector.interleave2.v4f64(<2 x double> %vec0, <2 x double> %vec1)
+  %retval = call <4 x double>@llvm.vector.interleave2.v4f64(<2 x double> %vec0, <2 x double> %vec1)
   ret <4 x double> %retval
 }
 
@@ -94,7 +94,7 @@ define <32 x i8> @interleave2_v32i8(<16 x i8> %vec0, <16 x i8> %vec1) {
 ; CHECK-NEXT:    zip2 v1.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
- %retval = call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> %vec0, <16 x i8> %vec1)
+ %retval = call <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8> %vec0, <16 x i8> %vec1)
   ret <32 x i8> %retval
 }
 
@@ -105,7 +105,7 @@ define <16 x i16> @interleave2_v16i16(<8 x i16> %vec0, <8 x i16> %vec1) {
 ; CHECK-NEXT:    zip2 v1.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %retval = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %vec0, <8 x i16> %vec1)
+  %retval = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %vec0, <8 x i16> %vec1)
   ret <16 x i16> %retval
 }
 
@@ -116,7 +116,7 @@ define <8 x i32> @interleave2_v8i32(<4 x i32> %vec0, <4 x i32> %vec1) {
 ; CHECK-NEXT:    zip2 v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %retval = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %vec0, <4 x i32> %vec1)
+  %retval = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %vec0, <4 x i32> %vec1)
   ret <8 x i32> %retval
 }
 
@@ -127,22 +127,22 @@ define <4 x i64> @interleave2_v4i64(<2 x i64> %vec0, <2 x i64> %vec1) {
 ; CHECK-NEXT:    zip2 v1.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %retval = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %vec0, <2 x i64> %vec1)
+  %retval = call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> %vec0, <2 x i64> %vec1)
   ret <4 x i64> %retval
 }
 
 
 ; Float declarations
-declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
-declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
-declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
-declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>)
-declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
-declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>)
+declare <4 x float> @llvm.vector.interleave2.v4f32(<2 x float>, <2 x float>)
+declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>)
 
 ; Integer declarations
-declare <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
-declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
-declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
-declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+declare <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
+declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
 
diff --git a/llvm/test/CodeGen/AArch64/fpmode.ll b/llvm/test/CodeGen/AArch64/fpmode.ll
index ebfb0696a95a..b185d9e04941 100644
--- a/llvm/test/CodeGen/AArch64/fpmode.ll
+++ b/llvm/test/CodeGen/AArch64/fpmode.ll
@@ -6,17 +6,14 @@ declare i32 @llvm.get.fpmode.i32()
 declare void @llvm.set.fpmode.i32(i32 %fpmode)
 declare void @llvm.reset.fpmode()
 
-define i32 @func_get_fpmode_soft() #0 {
-; DAG-LABEL: func_get_fpmode_soft:
+define i32 @func_get_fpmode() #0 {
+; DAG-LABEL: func_get_fpmode:
 ; DAG:       // %bb.0: // %entry
-; DAG-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; DAG-NEXT:    add x0, sp, #12
-; DAG-NEXT:    bl fegetmode
-; DAG-NEXT:    ldr w0, [sp, #12]
-; DAG-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; DAG-NEXT:    mrs x0, FPCR
+; DAG-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; DAG-NEXT:    ret
 ;
-; GIS-LABEL: func_get_fpmode_soft:
+; GIS-LABEL: func_get_fpmode:
 ; GIS:       // %bb.0: // %entry
 ; GIS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GIS-NEXT:    add x0, sp, #12
@@ -29,17 +26,14 @@ entry:
   ret i32 %fpmode
 }
 
-define void @func_set_fpmode_soft(i32 %fpmode) #0 {
-; DAG-LABEL: func_set_fpmode_soft:
+define void @func_set_fpmode(i32 %fpmode) #0 {
+; DAG-LABEL: func_set_fpmode:
 ; DAG:       // %bb.0: // %entry
-; DAG-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; DAG-NEXT:    str w0, [sp, #12]
-; DAG-NEXT:    add x0, sp, #12
-; DAG-NEXT:    bl fesetmode
-; DAG-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; DAG-NEXT:    mov w8, w0
+; DAG-NEXT:    msr FPCR, x8
 ; DAG-NEXT:    ret
 ;
-; GIS-LABEL: func_set_fpmode_soft:
+; GIS-LABEL: func_set_fpmode:
 ; GIS:       // %bb.0: // %entry
 ; GIS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GIS-NEXT:    str w0, [sp, #12]
@@ -52,16 +46,17 @@ entry:
   ret void
 }
 
-define void @func_reset_fpmode_soft() #0 {
-; DAG-LABEL: func_reset_fpmode_soft:
+define void @func_reset_fpmode() #0 {
+; DAG-LABEL: func_reset_fpmode:
 ; DAG:       // %bb.0: // %entry
-; DAG-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; DAG-NEXT:    mov x0, #-1 // =0xffffffffffffffff
-; DAG-NEXT:    bl fesetmode
-; DAG-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; DAG-NEXT:    mov x9, #-48904 // =0xffffffffffff40f8
+; DAG-NEXT:    mrs x8, FPCR
+; DAG-NEXT:    movk x9, #63488, lsl #16
+; DAG-NEXT:    and x8, x8, x9
+; DAG-NEXT:    msr FPCR, x8
 ; DAG-NEXT:    ret
 ;
-; GIS-LABEL: func_reset_fpmode_soft:
+; GIS-LABEL: func_reset_fpmode:
 ; GIS:       // %bb.0: // %entry
 ; GIS-NEXT:    mov x0, #-1 // =0xffffffffffffffff
 ; GIS-NEXT:    b fesetmode
@@ -70,4 +65,4 @@ entry:
   ret void
 }
 
-attributes #0 = { nounwind "use-soft-float"="true" }
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir b/llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir
index d1770bb25fae..9da0808345a0 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=aarch64-linux-gnu -run-pass machine-combiner -o - %s | FileCheck %s
+# RUN: llc -mtriple=aarch64-linux-gnu -run-pass machine-combiner -verify-machineinstrs -o - %s | FileCheck %s
 
 # The test cases in this file check following transformation if the right form
 # can reduce latency.
@@ -237,3 +237,30 @@ body:              |
     RET_ReallyLR implicit $w0
 
 ...
+---
+# Drop nowrap flags in SUB
+
+# CHECK-LABEL: name: test8
+# CHECK:       [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr %1, %0
+# CHECK-NEXT:  %4:gpr64common = SUBXrr killed [[SUBXrr]], killed %2
+
+name:            test8
+registers:
+  - { id: 0, class: gpr64 }
+  - { id: 1, class: gpr64 }
+  - { id: 2, class: gpr64common }
+  - { id: 3, class: gpr64 }
+  - { id: 4, class: gpr64common }
+  - { id: 5, class: gpr64 }
+body:              |
+  bb.0:
+    %1:gpr64 = COPY $x1
+    %0:gpr64 = COPY $x0
+    %2:gpr64common = ORRXri %0:gpr64, 4096
+    %3:gpr64 = ADDXrr killed %2:gpr64common, %0:gpr64
+    %4:gpr64common = nsw SUBSXrr %1:gpr64, killed %3:gpr64, implicit-def dead $nzcv
+    %5:gpr64 = SUBSXri %4:gpr64common, 0, 0, implicit-def $nzcv
+    $x0 = COPY %5:gpr64
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/mul_pow2.ll b/llvm/test/CodeGen/AArch64/mul_pow2.ll
index 90e560af4465..0c9ea51ba367 100644
--- a/llvm/test/CodeGen/AArch64/mul_pow2.ll
+++ b/llvm/test/CodeGen/AArch64/mul_pow2.ll
@@ -410,6 +410,23 @@ define i32 @test11(i32 %x) {
   ret i32 %mul
 }
 
+define i32 @test11_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
+; CHECK-LABEL: test11_fast_shift:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w0, w0, lsl #2
+; CHECK-NEXT:    add w0, w0, w8, lsl #1
+; CHECK-NEXT:    ret
+;
+; GISEL-LABEL: test11_fast_shift:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    mov w8, #11 // =0xb
+; GISEL-NEXT:    mul w0, w0, w8
+; GISEL-NEXT:    ret
+
+  %mul = mul nsw i32 %x, 11 ; 11 = (((1<<2) + 1) << 1) + 1
+  ret i32 %mul
+}
+
 define i32 @test12(i32 %x) {
 ; CHECK-LABEL: test12:
 ; CHECK:       // %bb.0:
@@ -545,12 +562,29 @@ define i32 @test45(i32 %x) {
   ret i32 %mul
 }
 
-; Negative test: The shift amount 4 larger than 3
+; Negative test: The shift number 5 is out of bound
+define i32 @test67_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
+; CHECK-LABEL: test67_fast_shift:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #67 // =0x43
+; CHECK-NEXT:    mul w0, w0, w8
+; CHECK-NEXT:    ret
+;
+; GISEL-LABEL: test67_fast_shift:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    mov w8, #67 // =0x43
+; GISEL-NEXT:    mul w0, w0, w8
+; GISEL-NEXT:    ret
+
+  %mul = mul nsw i32 %x, 67 ; 67 = (((1<<5) + 1) << 1) + 1
+  ret i32 %mul
+}
+
 define i32 @test85_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 ; CHECK-LABEL: test85_fast_shift:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #85 // =0x55
-; CHECK-NEXT:    mul w0, w0, w8
+; CHECK-NEXT:    add w8, w0, w0, lsl #2
+; CHECK-NEXT:    add w0, w8, w8, lsl #4
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test85_fast_shift:
@@ -563,7 +597,25 @@ define i32 @test85_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
   ret i32 %mul
 }
 
-; Negative test: The shift amount 5 larger than 3
+; Negative test: The shift number 5 is out of bound
+define i32 @test97_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
+; CHECK-LABEL: test97_fast_shift:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #97 // =0x61
+; CHECK-NEXT:    mul w0, w0, w8
+; CHECK-NEXT:    ret
+;
+; GISEL-LABEL: test97_fast_shift:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    mov w8, #97 // =0x61
+; GISEL-NEXT:    mul w0, w0, w8
+; GISEL-NEXT:    ret
+
+  %mul = mul nsw i32 %x, 97 ; 97 = ((2 + 1) << 5) + 1
+  ret i32 %mul
+}
+
+; Negative test: The shift amount 5 larger than 4
 define i32 @test297_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 ; CHECK-LABEL: test297_fast_shift:
 ; CHECK:       // %bb.0:
@@ -858,9 +910,9 @@ define <4 x i32> @muladd_demand_commute(<4 x i32> %x, <4 x i32> %y) {
 ;
 ; GISEL-LABEL: muladd_demand_commute:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    adrp x8, .LCPI49_0
+; GISEL-NEXT:    adrp x8, .LCPI52_0
 ; GISEL-NEXT:    movi v3.4s, #1, msl #16
-; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI49_0]
+; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI52_0]
 ; GISEL-NEXT:    mla v1.4s, v0.4s, v2.4s
 ; GISEL-NEXT:    and v0.16b, v1.16b, v3.16b
 ; GISEL-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll
index 0eee19ad2adb..cff7759c72c9 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll
@@ -15,7 +15,7 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) #0 {
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ret
 
-  %res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
+  %res = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> %a)
   ret <16 x i8> %res
 }
 
@@ -26,7 +26,7 @@ define <8 x i16> @reverse_v8i16(<8 x i16> %a) #0 {
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ret
 
-  %res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)
+  %res = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> %a)
   ret <8 x i16> %res
 }
 
@@ -35,7 +35,7 @@ define <2 x i16> @reverse_v2i16(<2 x i16> %a) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rev64 v0.2s, v0.2s
 ; CHECK-NEXT:    ret
-  %res = call <2 x i16> @llvm.experimental.vector.reverse.v2i16(<2 x i16> %a)
+  %res = call <2 x i16> @llvm.vector.reverse.v2i16(<2 x i16> %a)
   ret <2 x i16> %res
 }
 
@@ -44,7 +44,7 @@ define <2 x i32> @reverse_v2i32(<2 x i32> %a) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rev64 v0.2s, v0.2s
 ; CHECK-NEXT:    ret
-  %res = call <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32> %a)
+  %res = call <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32> %a)
   ret <2 x i32> %res
 }
 
@@ -55,7 +55,7 @@ define <4 x i32> @reverse_v4i32(<4 x i32> %a) #0 {
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ret
 
-  %res = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %a)
+  %res = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %a)
   ret <4 x i32> %res
 }
 
@@ -65,7 +65,7 @@ define <2 x i64> @reverse_v2i64(<2 x i64> %a) #0 {
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ret
 
-  %res = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> %a)
+  %res = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> %a)
   ret <2 x i64> %res
 }
 
@@ -76,7 +76,7 @@ define <8 x half> @reverse_v8f16(<8 x half> %a) #0 {
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ret
 
-  %res = call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> %a)
+  %res = call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> %a)
   ret <8 x half> %res
 }
 
@@ -85,7 +85,7 @@ define <2 x float> @reverse_v2f32(<2 x float> %a) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rev64 v0.2s, v0.2s
 ; CHECK-NEXT:    ret
-  %res = call <2 x float> @llvm.experimental.vector.reverse.v2f32(<2 x float> %a)
+  %res = call <2 x float> @llvm.vector.reverse.v2f32(<2 x float> %a)
   ret <2 x float> %res
 }
 
@@ -96,7 +96,7 @@ define <4 x float> @reverse_v4f32(<4 x float> %a) #0 {
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ret
 
-  %res = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> %a)
+  %res = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %a)
   ret <4 x float> %res
 }
 
@@ -106,7 +106,7 @@ define <2 x double> @reverse_v2f64(<2 x double> %a) #0 {
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ret
 
-  %res = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> %a)
+  %res = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> %a)
   ret <2 x double> %res
 }
 
@@ -117,7 +117,7 @@ define <2 x i8> @reverse_v2i8(<2 x i8> %a) #0 {
 ; CHECK-NEXT:    rev64 v0.2s, v0.2s
 ; CHECK-NEXT:    ret
 
-  %res = call <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
+  %res = call <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8> %a)
   ret <2 x i8> %res
 }
 
@@ -144,7 +144,7 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) #0 {
 ; CHECK-FASTISEL-NEXT:    add sp, sp, #16
 ; CHECK-FASTISEL-NEXT:    ret
 
-  %res = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> %a)
+  %res = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> %a)
   ret <8 x i32> %res
 }
 
@@ -182,23 +182,23 @@ define <16 x float> @reverse_v16f32(<16 x float> %a) #0 {
 ; CHECK-FASTISEL-NEXT:    add sp, sp, #32
 ; CHECK-FASTISEL-NEXT:    ret
 
-  %res = call <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float> %a)
+  %res = call <16 x float> @llvm.vector.reverse.v16f32(<16 x float> %a)
   ret <16 x float> %res
 }
 
 
-declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8>)
-declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
-declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
-declare <2 x i16> @llvm.experimental.vector.reverse.v2i16(<2 x i16>)
-declare <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32>)
-declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
-declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
-declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
-declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
-declare <2 x float> @llvm.experimental.vector.reverse.v2f32(<2 x float>)
-declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
-declare <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float>)
-declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
+declare <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8>)
+declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+declare <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16>)
+declare <2 x i16> @llvm.vector.reverse.v2i16(<2 x i16>)
+declare <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32>)
+declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
+declare <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32>)
+declare <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64>)
+declare <8 x half> @llvm.vector.reverse.v8f16(<8 x half>)
+declare <2 x float> @llvm.vector.reverse.v2f32(<2 x float>)
+declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)
+declare <16 x float> @llvm.vector.reverse.v16f32(<16 x float>)
+declare <2 x double> @llvm.vector.reverse.v2f64(<2 x double>)
 
 attributes #0 = { nounwind "target-features"="+neon" }
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll
index 4d5045feca08..a84e6e7bcae8 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll
@@ -14,7 +14,7 @@ define <vscale x 2 x i1> @reverse_nxv2i1(<vscale x 2 x i1> %a) #0 {
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %a)
+  %res = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %a)
   ret <vscale x 2 x i1> %res
 }
 
@@ -24,7 +24,7 @@ define <vscale x 4 x i1> @reverse_nxv4i1(<vscale x 4 x i1> %a) #0 {
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %res = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
   ret <vscale x 4 x i1> %res
 }
 
@@ -34,7 +34,7 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) #0 {
 ; CHECK-NEXT:    rev p0.h, p0.h
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %a)
+  %res = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %a)
   ret <vscale x 8 x i1> %res
 }
 
@@ -44,7 +44,7 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) #0 {
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
+  %res = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
   ret <vscale x 16 x i1> %res
 }
 
@@ -70,7 +70,7 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) #0 {
 ; CHECK-FASTISEL-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-FASTISEL-NEXT:    ret
 
-  %res = call <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
+  %res = call <vscale x 32 x i1> @llvm.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
   ret <vscale x 32 x i1> %res
 }
 
@@ -84,7 +84,7 @@ define <vscale x 16 x i8> @reverse_nxv16i8(<vscale x 16 x i8> %a) #0 {
 ; CHECK-NEXT:    rev z0.b, z0.b
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
+  %res = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
   ret <vscale x 16 x i8> %res
 }
 
@@ -94,7 +94,7 @@ define <vscale x 8 x i16> @reverse_nxv8i16(<vscale x 8 x i16> %a) #0 {
 ; CHECK-NEXT:    rev z0.h, z0.h
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> %a)
+  %res = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> %a)
   ret <vscale x 8 x i16> %res
 }
 
@@ -104,7 +104,7 @@ define <vscale x 4 x i32> @reverse_nxv4i32(<vscale x 4 x i32> %a) #0 {
 ; CHECK-NEXT:    rev z0.s, z0.s
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %res = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
   ret <vscale x 4 x i32> %res
 }
 
@@ -114,7 +114,7 @@ define <vscale x 2 x i64> @reverse_nxv2i64(<vscale x 2 x i64> %a) #0 {
 ; CHECK-NEXT:    rev z0.d, z0.d
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
+  %res = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
   ret <vscale x 2 x i64> %res
 }
 
@@ -124,7 +124,7 @@ define <vscale x 2 x half> @reverse_nxv2f16(<vscale x 2 x half> %a) #0 {
 ; CHECK-NEXT:    rev z0.d, z0.d
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> %a)
+  %res = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> %a)
   ret <vscale x 2 x half> %res
 }
 
@@ -134,7 +134,7 @@ define <vscale x 4 x half> @reverse_nxv4f16(<vscale x 4 x half> %a) #0 {
 ; CHECK-NEXT:    rev z0.s, z0.s
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> %a)
+  %res = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> %a)
   ret <vscale x 4 x half> %res
 }
 
@@ -144,7 +144,7 @@ define <vscale x 8 x half> @reverse_nxv8f16(<vscale x 8 x half> %a) #0 {
 ; CHECK-NEXT:    rev z0.h, z0.h
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> %a)
+  %res = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> %a)
   ret <vscale x 8 x half> %res
 }
 
@@ -154,7 +154,7 @@ define <vscale x 2 x bfloat> @reverse_nxv2bf16(<vscale x 2 x bfloat> %a) #1 {
 ; CHECK-NEXT:    rev z0.d, z0.d
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> %a)
+  %res = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> %a)
   ret <vscale x 2 x bfloat> %res
 }
 
@@ -164,7 +164,7 @@ define <vscale x 4 x bfloat> @reverse_nxv4bf16(<vscale x 4 x bfloat> %a) #1 {
 ; CHECK-NEXT:    rev z0.s, z0.s
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> %a)
+  %res = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> %a)
   ret <vscale x 4 x bfloat> %res
 }
 
@@ -174,7 +174,7 @@ define <vscale x 8 x bfloat> @reverse_nxv8bf16(<vscale x 8 x bfloat> %a) #1 {
 ; CHECK-NEXT:    rev z0.h, z0.h
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> %a)
+  %res = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> %a)
   ret <vscale x 8 x bfloat> %res
 }
 
@@ -184,7 +184,7 @@ define <vscale x 2 x float> @reverse_nxv2f32(<vscale x 2 x float> %a) #0 {
 ; CHECK-NEXT:    rev z0.d, z0.d
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> %a)  ret <vscale x 2 x float> %res
+  %res = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> %a)  ret <vscale x 2 x float> %res
 }
 
 define <vscale x 4 x float> @reverse_nxv4f32(<vscale x 4 x float> %a) #0 {
@@ -193,7 +193,7 @@ define <vscale x 4 x float> @reverse_nxv4f32(<vscale x 4 x float> %a) #0 {
 ; CHECK-NEXT:    rev z0.s, z0.s
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)  ret <vscale x 4 x float> %res
+  %res = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)  ret <vscale x 4 x float> %res
 }
 
 define <vscale x 2 x double> @reverse_nxv2f64(<vscale x 2 x double> %a) #0 {
@@ -202,7 +202,7 @@ define <vscale x 2 x double> @reverse_nxv2f64(<vscale x 2 x double> %a) #0 {
 ; CHECK-NEXT:    rev z0.d, z0.d
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
+  %res = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
   ret <vscale x 2 x double> %res
 }
 
@@ -213,7 +213,7 @@ define <vscale x 2 x i8> @reverse_nxv2i8(<vscale x 2 x i8> %a) #0 {
 ; CHECK-NEXT:    rev z0.d, z0.d
 ; CHECK-NEXT:    ret
 
-  %res = call <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8> %a)
+  %res = call <vscale x 2 x i8> @llvm.vector.reverse.nxv2i8(<vscale x 2 x i8> %a)
   ret <vscale x 2 x i8> %res
 }
 
@@ -239,7 +239,7 @@ define <vscale x 8 x i32> @reverse_nxv8i32(<vscale x 8 x i32> %a) #0 {
 ; CHECK-FASTISEL-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-FASTISEL-NEXT:    ret
 
-  %res = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
+  %res = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
   ret <vscale x 8 x i32> %res
 }
 
@@ -273,32 +273,32 @@ define <vscale x 16 x float> @reverse_nxv16f32(<vscale x 16 x float> %a) #0 {
 ; CHECK-FASTISEL-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-FASTISEL-NEXT:    ret
 
-  %res = call <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
+  %res = call <vscale x 16 x float> @llvm.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
   ret <vscale x 16 x float> %res
 }
 
 
-declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
-declare <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1>)
-declare <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8>)
-declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat>)
-declare <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat>)
-declare <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat>)
-declare <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float>)
-declare <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1>)
+declare <vscale x 32 x i1> @llvm.vector.reverse.nxv32i1(<vscale x 32 x i1>)
+declare <vscale x 2 x i8> @llvm.vector.reverse.nxv2i8(<vscale x 2 x i8>)
+declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half>)
+declare <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat>)
+declare <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float>)
+declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 16 x float> @llvm.vector.reverse.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double>)
 
 
 attributes #0 = { nounwind "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
index 9210a5ec1c8b..f2e62bc4f3c8 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
@@ -12,7 +12,7 @@ define <16 x i8> @splice_v16i8_idx(<16 x i8> %a, <16 x i8> %b) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #1
 ; CHECK-NEXT:    ret
-  %res = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
+  %res = call <16 x i8> @llvm.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
   ret <16 x i8> %res
 }
 
@@ -21,7 +21,7 @@ define <2 x double> @splice_v2f64_idx(<2 x double> %a, <2 x double> %b) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
 ; CHECK-NEXT:    ret
-  %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 1)
+  %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 1)
   ret <2 x double> %res
 }
 
@@ -31,7 +31,7 @@ define <2 x i8> @splice_v2i8_idx(<2 x i8> %a, <2 x i8> %b) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
 ; CHECK-NEXT:    ret
-  %res = call <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 1)
+  %res = call <2 x i8> @llvm.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 1)
   ret <2 x i8> %res
 }
 
@@ -42,7 +42,7 @@ define <8 x i32> @splice_v8i32_idx(<8 x i32> %a, <8 x i32> %b) #0 {
 ; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #4
 ; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #4
 ; CHECK-NEXT:    ret
-  %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 5)
+  %res = call <8 x i32> @llvm.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 5)
   ret <8 x i32> %res
 }
 
@@ -56,7 +56,7 @@ define <16 x float> @splice_v16f32_idx(<16 x float> %a, <16 x float> %b) #0 {
 ; CHECK-NEXT:    ext v3.16b, v4.16b, v5.16b, #12
 ; CHECK-NEXT:    mov v2.16b, v6.16b
 ; CHECK-NEXT:    ret
-  %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 7)
+  %res = call <16 x float> @llvm.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 7)
   ret <16 x float> %res
 }
 
@@ -69,7 +69,7 @@ define <16 x i8> @splice_v16i8(<16 x i8> %a, <16 x i8> %b) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #1
 ; CHECK-NEXT:    ret
-  %res = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 -15)
+  %res = call <16 x i8> @llvm.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 -15)
   ret <16 x i8> %res
 }
 
@@ -78,7 +78,7 @@ define <2 x double> @splice_v2f64(<2 x double> %a, <2 x double> %b) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
 ; CHECK-NEXT:    ret
-  %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -1)
+  %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -1)
   ret <2 x double> %res
 }
 
@@ -88,7 +88,7 @@ define <2 x i8> @splice_v2i8(<2 x i8> %a, <2 x i8> %b) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
 ; CHECK-NEXT:    ret
-  %res = call <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 -1)
+  %res = call <2 x i8> @llvm.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 -1)
   ret <2 x i8> %res
 }
 
@@ -99,7 +99,7 @@ define <8 x i32> @splice_v8i32(<8 x i32> %a, <8 x i32> %b) #0 {
 ; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #4
 ; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #4
 ; CHECK-NEXT:    ret
-  %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 -3)
+  %res = call <8 x i32> @llvm.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 -3)
   ret <8 x i32> %res
 }
 
@@ -113,14 +113,14 @@ define <16 x float> @splice_v16f32(<16 x float> %a, <16 x float> %b) #0 {
 ; CHECK-NEXT:    ext v3.16b, v4.16b, v5.16b, #12
 ; CHECK-NEXT:    mov v2.16b, v6.16b
 ; CHECK-NEXT:    ret
-  %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 -9)
+  %res = call <16 x float> @llvm.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 -9)
   ret <16 x float> %res
 }
 
-declare <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
-declare <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
-declare <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
-declare <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
-declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
+declare <2 x i8> @llvm.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
+declare <16 x i8> @llvm.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
+declare <8 x i32> @llvm.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
+declare <16 x float> @llvm.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
+declare <2 x double> @llvm.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
 
 attributes #0 = { nounwind "target-features"="+neon" }
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index fac96e07de54..f5763cd61033 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -11,7 +11,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_zero_idx(<vscale x 16 x i8> %a, <vscal
 ; CHECK-LABEL: splice_nxv16i8_zero_idx:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
   ret <vscale x 16 x i8> %res
 }
 
@@ -20,7 +20,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_first_idx(<vscale x 16 x i8> %a, <vsca
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 1)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 1)
   ret <vscale x 16 x i8> %res
 }
 
@@ -29,7 +29,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_last_idx(<vscale x 16 x i8> %a, <vscal
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #255
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 255)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 255)
   ret <vscale x 16 x i8> %res
 }
 
@@ -38,7 +38,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_first_idx(<vscale x 8 x i16> %a, <vsca
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #2
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 1)
+  %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 1)
   ret <vscale x 8 x i16> %res
 }
 
@@ -47,7 +47,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_first_idx(<vscale x 4 x i32> %a, <vsca
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #4
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 1)
+  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 1)
   ret <vscale x 4 x i32> %res
 }
 
@@ -56,7 +56,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_last_idx(<vscale x 4 x i32> %a, <vscal
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #252
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 63)
+  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 63)
   ret <vscale x 4 x i32> %res
 }
 
@@ -65,7 +65,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_first_idx(<vscale x 2 x i64> %a, <vsca
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 1)
+  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 1)
   ret <vscale x 2 x i64> %res
 }
 
@@ -74,7 +74,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_last_idx(<vscale x 2 x i64> %a, <vscal
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #248
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 31)
+  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 31)
   ret <vscale x 2 x i64> %res
 }
 
@@ -85,7 +85,7 @@ define <vscale x 2 x half> @splice_nxv2f16_neg_idx(<vscale x 2 x half> %a, <vsca
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
+  %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
   ret <vscale x 2 x half> %res
 }
 
@@ -96,7 +96,7 @@ define <vscale x 2 x half> @splice_nxv2f16_neg2_idx(<vscale x 2 x half> %a, <vsc
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -2)
+  %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -2)
   ret <vscale x 2 x half> %res
 }
 
@@ -105,7 +105,7 @@ define <vscale x 2 x half> @splice_nxv2f16_first_idx(<vscale x 2 x half> %a, <vs
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 1)
+  %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 1)
   ret <vscale x 2 x half> %res
 }
 
@@ -114,7 +114,7 @@ define <vscale x 2 x half> @splice_nxv2f16_last_idx(<vscale x 2 x half> %a, <vsc
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #248
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 31)
+  %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 31)
   ret <vscale x 2 x half> %res
 }
 
@@ -125,7 +125,7 @@ define <vscale x 4 x half> @splice_nxv4f16_neg_idx(<vscale x 4 x half> %a, <vsca
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
+  %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
   ret <vscale x 4 x half> %res
 }
 
@@ -136,7 +136,7 @@ define <vscale x 4 x half> @splice_nxv4f16_neg3_idx(<vscale x 4 x half> %a, <vsc
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -3)
+  %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -3)
   ret <vscale x 4 x half> %res
 }
 
@@ -145,7 +145,7 @@ define <vscale x 4 x half> @splice_nxv4f16_first_idx(<vscale x 4 x half> %a, <vs
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #4
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 1)
+  %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 1)
   ret <vscale x 4 x half> %res
 }
 
@@ -154,7 +154,7 @@ define <vscale x 4 x half> @splice_nxv4f16_last_idx(<vscale x 4 x half> %a, <vsc
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #252
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 63)
+  %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 63)
   ret <vscale x 4 x half> %res
 }
 
@@ -163,7 +163,7 @@ define <vscale x 8 x half> @splice_nxv8f16_first_idx(<vscale x 8 x half> %a, <vs
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #2
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 1)
+  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 1)
   ret <vscale x 8 x half> %res
 }
 
@@ -172,7 +172,7 @@ define <vscale x 8 x half> @splice_nxv8f16_last_idx(<vscale x 8 x half> %a, <vsc
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #254
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 127)
+  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 127)
   ret <vscale x 8 x half> %res
 }
 
@@ -183,7 +183,7 @@ define <vscale x 2 x float> @splice_nxv2f32_neg_idx(<vscale x 2 x float> %a, <vs
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
+  %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
   ret <vscale x 2 x float> %res
 }
 
@@ -194,7 +194,7 @@ define <vscale x 2 x float> @splice_nxv2f32_neg2_idx(<vscale x 2 x float> %a, <v
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -2)
+  %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -2)
   ret <vscale x 2 x float> %res
 }
 
@@ -203,7 +203,7 @@ define <vscale x 2 x float> @splice_nxv2f32_first_idx(<vscale x 2 x float> %a, <
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 1)
+  %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 1)
   ret <vscale x 2 x float> %res
 }
 
@@ -212,7 +212,7 @@ define <vscale x 2 x float> @splice_nxv2f32_last_idx(<vscale x 2 x float> %a, <v
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #248
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 31)
+  %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 31)
   ret <vscale x 2 x float> %res
 }
 
@@ -221,7 +221,7 @@ define <vscale x 4 x float> @splice_nxv4f32_first_idx(<vscale x 4 x float> %a, <
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #4
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 1)
+  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 1)
   ret <vscale x 4 x float> %res
 }
 
@@ -230,7 +230,7 @@ define <vscale x 4 x float> @splice_nxv4f32_last_idx(<vscale x 4 x float> %a, <v
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #252
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 63)
+  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 63)
   ret <vscale x 4 x float> %res
 }
 
@@ -239,7 +239,7 @@ define <vscale x 2 x double> @splice_nxv2f64_first_idx(<vscale x 2 x double> %a,
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 1)
+  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 1)
   ret <vscale x 2 x double> %res
 }
 
@@ -248,7 +248,7 @@ define <vscale x 2 x double> @splice_nxv2f64_last_idx(<vscale x 2 x double> %a,
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #248
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 31)
+  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 31)
   ret <vscale x 2 x double> %res
 }
 
@@ -263,7 +263,7 @@ define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x
 ; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 1)
+  %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 1)
   ret <vscale x 2 x i1> %res
 }
 
@@ -278,7 +278,7 @@ define <vscale x 4 x i1> @splice_nxv4i1_idx(<vscale x 4 x i1> %a, <vscale x 4 x
 ; CHECK-NEXT:    and z1.s, z1.s, #0x1
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 2)
+  %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 2)
   ret <vscale x 4 x i1> %res
 }
 
@@ -293,7 +293,7 @@ define <vscale x 8 x i1> @splice_nxv8i1_idx(<vscale x 8 x i1> %a, <vscale x 8 x
 ; CHECK-NEXT:    and z1.h, z1.h, #0x1
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 4)
+  %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 4)
   ret <vscale x 8 x i1> %res
 }
 
@@ -308,7 +308,7 @@ define <vscale x 16 x i1> @splice_nxv16i1_idx(<vscale x 16 x i1> %a, <vscale x 1
 ; CHECK-NEXT:    and z1.b, z1.b, #0x1
 ; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 8)
+  %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 8)
   ret <vscale x 16 x i1> %res
 }
 
@@ -318,7 +318,7 @@ define <vscale x 2 x i8> @splice_nxv2i8_idx(<vscale x 2 x i8> %a, <vscale x 2 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 1)
+  %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 1)
   ret <vscale x 2 x i8> %res
 }
 
@@ -340,7 +340,7 @@ define <vscale x 8 x i32> @splice_nxv8i32_idx(<vscale x 8 x i32> %a, <vscale x 8
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 2)
+  %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 2)
   ret <vscale x 8 x i32> %res
 }
 
@@ -373,7 +373,7 @@ define <vscale x 16 x float> @splice_nxv16f32_16(<vscale x 16 x float> %a, <vsca
 ; CHECK-NEXT:    addvl sp, sp, #8
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 16)
+  %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 16)
   ret <vscale x 16 x float> %res
 }
 
@@ -388,7 +388,7 @@ define <vscale x 16 x i8> @splice_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -16)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -16)
   ret <vscale x 16 x i8> %res
 }
 
@@ -399,7 +399,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg32(<vscale x 16 x i8> %a, <vscale x
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
   ret <vscale x 16 x i8> %res
 }
 
@@ -410,7 +410,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg64(<vscale x 16 x i8> %a, <vscale x
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -64)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -64)
   ret <vscale x 16 x i8> %res
 }
 
@@ -421,7 +421,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg128(<vscale x 16 x i8> %a, <vscale
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -128)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -128)
   ret <vscale x 16 x i8> %res
 }
 
@@ -432,7 +432,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg256(<vscale x 16 x i8> %a, <vscale
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -256)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -256)
   ret <vscale x 16 x i8> %res
 }
 
@@ -443,7 +443,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_1(<vscale x 16 x i8> %a, <vscale x 16
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
   ret <vscale x 16 x i8> %res
 }
 
@@ -466,7 +466,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg17(<vscale x 16 x i8> %a, <vscale x
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -17)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -17)
   ret <vscale x 16 x i8> %res
 }
 
@@ -477,7 +477,7 @@ define <vscale x 8 x i16> @splice_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i
 ; CHECK-NEXT:    rev p0.h, p0.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -8)
+  %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -8)
   ret <vscale x 8 x i16> %res
 }
 
@@ -488,7 +488,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_1(<vscale x 8 x i16> %a, <vscale x 8 x
 ; CHECK-NEXT:    rev p0.h, p0.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
+  %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
   ret <vscale x 8 x i16> %res
 }
 
@@ -511,7 +511,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_neg9(<vscale x 8 x i16> %a, <vscale x
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -9)
+  %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -9)
   ret <vscale x 8 x i16> %res
 }
 
@@ -522,7 +522,7 @@ define <vscale x 4 x i32> @splice_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -4)
+  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -4)
   ret <vscale x 4 x i32> %res
 }
 
@@ -533,7 +533,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_1(<vscale x 4 x i32> %a, <vscale x 4 x
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
+  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
   ret <vscale x 4 x i32> %res
 }
 
@@ -544,7 +544,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_neg5(<vscale x 4 x i32> %a, <vscale x
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -5)
+  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -5)
   ret <vscale x 4 x i32> %res
 }
 
@@ -555,7 +555,7 @@ define <vscale x 2 x i64> @splice_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -2)
+  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -2)
   ret <vscale x 2 x i64> %res
 }
 
@@ -566,7 +566,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_1(<vscale x 2 x i64> %a, <vscale x 2 x
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
+  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
   ret <vscale x 2 x i64> %res
 }
 
@@ -577,7 +577,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_neg3(<vscale x 2 x i64> %a, <vscale x
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -3)
+  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -3)
   ret <vscale x 2 x i64> %res
 }
 
@@ -588,7 +588,7 @@ define <vscale x 8 x half> @splice_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x
 ; CHECK-NEXT:    rev p0.h, p0.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -8)
+  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -8)
   ret <vscale x 8 x half> %res
 }
 
@@ -599,7 +599,7 @@ define <vscale x 8 x half> @splice_nxv8f16_1(<vscale x 8 x half> %a, <vscale x 8
 ; CHECK-NEXT:    rev p0.h, p0.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
+  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
   ret <vscale x 8 x half> %res
 }
 
@@ -622,7 +622,7 @@ define <vscale x 8 x half> @splice_nxv8f16_neg9(<vscale x 8 x half> %a, <vscale
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -9)
+  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -9)
   ret <vscale x 8 x half> %res
 }
 
@@ -633,7 +633,7 @@ define <vscale x 4 x float> @splice_nxv4f32(<vscale x 4 x float> %a, <vscale x 4
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -4)
+  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -4)
   ret <vscale x 4 x float> %res
 }
 
@@ -644,7 +644,7 @@ define <vscale x 4 x float> @splice_nxv4f32_1(<vscale x 4 x float> %a, <vscale x
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
+  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
   ret <vscale x 4 x float> %res
 }
 
@@ -655,7 +655,7 @@ define <vscale x 4 x float> @splice_nxv4f32_neg5(<vscale x 4 x float> %a, <vscal
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -5)
+  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -5)
   ret <vscale x 4 x float> %res
 }
 
@@ -666,7 +666,7 @@ define <vscale x 2 x double> @splice_nxv2f64(<vscale x 2 x double> %a, <vscale x
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -2)
+  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -2)
   ret <vscale x 2 x double> %res
 }
 
@@ -677,7 +677,7 @@ define <vscale x 2 x double> @splice_nxv2f64_1(<vscale x 2 x double> %a, <vscale
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
+  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
   ret <vscale x 2 x double> %res
 }
 
@@ -688,7 +688,7 @@ define <vscale x 2 x double> @splice_nxv2f64_neg3(<vscale x 2 x double> %a, <vsc
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
+  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
   ret <vscale x 2 x double> %res
 }
 
@@ -705,7 +705,7 @@ define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1>
 ; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
+  %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
   ret <vscale x 2 x i1> %res
 }
 
@@ -722,7 +722,7 @@ define <vscale x 4 x i1> @splice_nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1>
 ; CHECK-NEXT:    and z1.s, z1.s, #0x1
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
+  %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
   ret <vscale x 4 x i1> %res
 }
 
@@ -739,7 +739,7 @@ define <vscale x 8 x i1> @splice_nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1>
 ; CHECK-NEXT:    and z1.h, z1.h, #0x1
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
+  %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
   ret <vscale x 8 x i1> %res
 }
 
@@ -756,7 +756,7 @@ define <vscale x 16 x i1> @splice_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x
 ; CHECK-NEXT:    and z1.b, z1.b, #0x1
 ; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
+  %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
   ret <vscale x 16 x i1> %res
 }
 
@@ -768,7 +768,7 @@ define <vscale x 2 x i8> @splice_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8>
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -2)
+  %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -2)
   ret <vscale x 2 x i8> %res
 }
 
@@ -793,7 +793,7 @@ define <vscale x 8 x i32> @splice_nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -8)
+  %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -8)
   ret <vscale x 8 x i32> %res
 }
 
@@ -826,26 +826,26 @@ define <vscale x 16 x float> @splice_nxv16f32_neg17(<vscale x 16 x float> %a, <v
 ; CHECK-NEXT:    addvl sp, sp, #8
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -17)
+  %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -17)
   ret <vscale x 16 x float> %res
 }
 
-declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
-declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
-declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
-declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
-declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
-declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
-declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
+declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
+declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
+declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
 
 attributes #0 = { nounwind "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-reassocadd.ll b/llvm/test/CodeGen/AArch64/sve-reassocadd.ll
index 47ddab8e2964..c7261200a567 100644
--- a/llvm/test/CodeGen/AArch64/sve-reassocadd.ll
+++ b/llvm/test/CodeGen/AArch64/sve-reassocadd.ll
@@ -181,4 +181,149 @@ entry:
   ret <vscale x 2 x i64> %2
 }
 
+
+define <vscale x 16 x i8> @i8_m2v_4s(ptr %b) {
+; CHECK-LABEL: i8_m2v_4s:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cnth x8, all, mul #4
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov w9, #4 // =0x4
+; CHECK-NEXT:    sub x8, x0, x8
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, x9]
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = mul i64 %0, -32
+  %add.ptr = getelementptr inbounds i8, ptr %b, i64 %1
+  %add.ptr1 = getelementptr inbounds i8, ptr %add.ptr, i64 4
+  %2 = load <vscale x 16 x i8>, ptr %add.ptr1, align 16
+  ret <vscale x 16 x i8> %2
+}
+
+define <vscale x 16 x i8> @i8_4s_m2v(ptr %b) {
+; CHECK-LABEL: i8_4s_m2v:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cnth x8, all, mul #4
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov w9, #4 // =0x4
+; CHECK-NEXT:    sub x8, x0, x8
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8, x9]
+; CHECK-NEXT:    ret
+entry:
+  %add.ptr = getelementptr inbounds i8, ptr %b, i64 4
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = mul i64 %0, -32
+  %add.ptr1 = getelementptr inbounds i8, ptr %add.ptr, i64 %1
+  %2 = load <vscale x 16 x i8>, ptr %add.ptr1, align 16
+  ret <vscale x 16 x i8> %2
+}
+
+define <vscale x 8 x i16> @i16_m2v_8s(ptr %b) {
+; CHECK-LABEL: i16_m2v_8s:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cnth x8, all, mul #4
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x9, #4 // =0x4
+; CHECK-NEXT:    sub x8, x0, x8
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = mul i64 %0, -32
+  %add.ptr = getelementptr inbounds i8, ptr %b, i64 %1
+  %add.ptr1 = getelementptr inbounds i8, ptr %add.ptr, i64 8
+  %2 = load <vscale x 8 x i16>, ptr %add.ptr1, align 16
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 8 x i16> @i16_8s_m2v(ptr %b) {
+; CHECK-LABEL: i16_8s_m2v:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cnth x8, all, mul #4
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x9, #4 // =0x4
+; CHECK-NEXT:    sub x8, x0, x8
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
+; CHECK-NEXT:    ret
+entry:
+  %add.ptr = getelementptr inbounds i8, ptr %b, i64 8
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = mul i64 %0, -32
+  %add.ptr1 = getelementptr inbounds i8, ptr %add.ptr, i64 %1
+  %2 = load <vscale x 8 x i16>, ptr %add.ptr1, align 16
+  ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @i32_m2v_16s(ptr %b) {
+; CHECK-LABEL: i32_m2v_16s:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cnth x8, all, mul #4
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x9, #4 // =0x4
+; CHECK-NEXT:    sub x8, x0, x8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = mul i64 %0, -32
+  %add.ptr = getelementptr inbounds i8, ptr %b, i64 %1
+  %add.ptr1 = getelementptr inbounds i8, ptr %add.ptr, i64 16
+  %2 = load <vscale x 4 x i32>, ptr %add.ptr1, align 16
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @i32_16s_m2v(ptr %b) {
+; CHECK-LABEL: i32_16s_m2v:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cnth x8, all, mul #4
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x9, #4 // =0x4
+; CHECK-NEXT:    sub x8, x0, x8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
+; CHECK-NEXT:    ret
+entry:
+  %add.ptr = getelementptr inbounds i8, ptr %b, i64 16
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = mul i64 %0, -32
+  %add.ptr1 = getelementptr inbounds i8, ptr %add.ptr, i64 %1
+  %2 = load <vscale x 4 x i32>, ptr %add.ptr1, align 16
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @i64_m2v_32s(ptr %b) {
+; CHECK-LABEL: i64_m2v_32s:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cnth x8, all, mul #4
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x9, #4 // =0x4
+; CHECK-NEXT:    sub x8, x0, x8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = mul i64 %0, -32
+  %add.ptr = getelementptr inbounds i8, ptr %b, i64 %1
+  %add.ptr1 = getelementptr inbounds i8, ptr %add.ptr, i64 32
+  %2 = load <vscale x 2 x i64>, ptr %add.ptr1, align 16
+  ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 2 x i64> @i64_32s_m2v(ptr %b) {
+; CHECK-LABEL: i64_32s_m2v:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cnth x8, all, mul #4
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x9, #4 // =0x4
+; CHECK-NEXT:    sub x8, x0, x8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
+; CHECK-NEXT:    ret
+entry:
+  %add.ptr = getelementptr inbounds i8, ptr %b, i64 32
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = mul i64 %0, -32
+  %add.ptr1 = getelementptr inbounds i8, ptr %add.ptr, i64 %1
+  %2 = load <vscale x 2 x i64>, ptr %add.ptr1, align 16
+  ret <vscale x 2 x i64> %2
+}
+
 declare i64 @llvm.vscale.i64()
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
index d81f725eaefc..fd9259048df5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -14,6 +15,12 @@ define <4 x i8> @vls_sve_and_4xi8(<4 x i8> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_4xi8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d1, #0xff000000ff0000
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i8> %b, <i8 0, i8 255, i8 0, i8 255>
  ret <4 x i8> %c
 }
@@ -27,6 +34,12 @@ define <8 x i8> @vls_sve_and_8xi8(<8 x i8> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_8xi8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff00
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i8> %b, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
  ret <8 x i8> %c
 }
@@ -40,6 +53,12 @@ define <16 x i8> @vls_sve_and_16xi8(<16 x i8> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_16xi8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v1.2d, #0xff00ff00ff00ff00
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
  %c = and <16 x i8> %b, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
  ret <16 x i8> %c
 }
@@ -56,6 +75,13 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_32xi8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v2.2d, #0xff00ff00ff00ff00
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
  %b = and <32 x i8> %ap, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255,
                          i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
  ret <32 x i8> %b
@@ -73,6 +99,13 @@ define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_2xi16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov v0.s[0], wzr
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i16> %b, <i16 0, i16 65535>
  ret <2 x i16> %c
 }
@@ -86,6 +119,12 @@ define <4 x i16> @vls_sve_and_4xi16(<4 x i16> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_4xi16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d1, #0xffff0000ffff0000
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535>
  ret <4 x i16> %c
 }
@@ -99,6 +138,12 @@ define <8 x i16> @vls_sve_and_8xi16(<8 x i16> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_8xi16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v1.2d, #0xffff0000ffff0000
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535>
  ret <8 x i16> %c
 }
@@ -115,6 +160,13 @@ define <16 x i16> @vls_sve_and_16xi16(<16 x i16> %b) nounwind {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_16xi16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v2.2d, #0xffff0000ffff0000
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
  %c = and <16 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535>
  ret <16 x i16> %c
 }
@@ -128,6 +180,13 @@ define <2 x i32> @vls_sve_and_2xi32(<2 x i32> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_2xi32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov v0.s[0], wzr
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i32> %b, <i32 0, i32 4294967295>
  ret <2 x i32> %c
 }
@@ -141,6 +200,12 @@ define <4 x i32> @vls_sve_and_4xi32(<4 x i32> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_4xi32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v1.2d, #0xffffffff00000000
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i32> %b, <i32 0, i32 4294967295, i32 0, i32 4294967295>
  ret <4 x i32> %c
 }
@@ -157,6 +222,13 @@ define <8 x i32> @vls_sve_and_8xi32(<8 x i32> %b) nounwind {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_8xi32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v2.2d, #0xffffffff00000000
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i32> %b, <i32 0, i32 4294967295, i32 0, i32 4294967295, i32 0, i32 4294967295, i32 0, i32 4294967295>
  ret <8 x i32> %c
 }
@@ -170,6 +242,11 @@ define <2 x i64> @vls_sve_and_2xi64(<2 x i64> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_2xi64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov v0.d[0], xzr
+; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i64> %b, <i64 0, i64 18446744073709551615>
  ret <2 x i64> %c
 }
@@ -185,6 +262,12 @@ define <4 x i64> @vls_sve_and_4xi64(<4 x i64> %b) nounwind {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_4xi64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov v0.d[0], xzr
+; NONEON-NOSVE-NEXT:    mov v1.d[0], xzr
+; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i64> %b, <i64 0, i64 18446744073709551615, i64 0, i64 18446744073709551615>
  ret <4 x i64> %c
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
index d547f99a0230..8f0378252a54 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,6 +19,16 @@ define <4 x i8> @ctlz_v4i8(<4 x i8> %op) {
 ; CHECK-NEXT:    sub z0.h, z0.h, #8 // =0x8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff
+; NONEON-NOSVE-NEXT:    mov w8, #8 // =0x8
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    dup v1.4h, w8
+; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
+; NONEON-NOSVE-NEXT:    sub v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
 }
@@ -30,6 +41,11 @@ define <8 x i8> @ctlz_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    clz v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
@@ -42,6 +58,11 @@ define <16 x i8> @ctlz_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    clz v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
@@ -55,6 +76,14 @@ define void @ctlz_v32i8(ptr %a) {
 ; CHECK-NEXT:    clz z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    clz v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    clz v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op)
   store <32 x i8> %res, ptr %a
@@ -71,6 +100,16 @@ define <2 x i16> @ctlz_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    sub z0.s, z0.s, #16 // =0x10
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
+; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    dup v1.2s, w8
+; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    sub v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -83,6 +122,11 @@ define <4 x i16> @ctlz_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -95,6 +139,11 @@ define <8 x i16> @ctlz_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -108,6 +157,14 @@ define void @ctlz_v16i16(ptr %a) {
 ; CHECK-NEXT:    clz z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    clz v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -122,6 +179,11 @@ define <2 x i32> @ctlz_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -134,6 +196,11 @@ define <4 x i32> @ctlz_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -147,6 +214,14 @@ define void @ctlz_v8i32(ptr %a) {
 ; CHECK-NEXT:    clz z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    clz v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -161,6 +236,27 @@ define <1 x i64> @ctlz_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushr d1, d0, #1
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ushr d1, d0, #2
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ushr d1, d0, #4
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ushr d1, d0, #8
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ushr d1, d0, #16
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ushr d1, d0, #32
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    mvn v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
+; NONEON-NOSVE-NEXT:    uaddlp v0.1d, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -173,6 +269,27 @@ define <2 x i64> @ctlz_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #1
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #2
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #4
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #8
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #16
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ushr v1.2d, v0.2d, #32
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -186,6 +303,46 @@ define void @ctlz_v4i64(ptr %a) {
 ; CHECK-NEXT:    clz z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #1
+; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #1
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #2
+; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #2
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #4
+; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #4
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #8
+; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #8
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #16
+; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #16
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ushr v2.2d, v0.2d, #32
+; NONEON-NOSVE-NEXT:    ushr v3.2d, v1.2d, #32
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v3.16b
+; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    mvn v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
+; NONEON-NOSVE-NEXT:    uaddlp v1.2d, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
@@ -205,6 +362,14 @@ define <4 x i8> @ctpop_v4i8(<4 x i8> %op) {
 ; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
 }
@@ -217,6 +382,11 @@ define <8 x i8> @ctpop_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
@@ -229,6 +399,11 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
@@ -242,6 +417,14 @@ define void @ctpop_v32i8(ptr %a) {
 ; CHECK-NEXT:    cnt z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op)
   store <32 x i8> %res, ptr %a
@@ -257,6 +440,15 @@ define <2 x i16> @ctpop_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -269,6 +461,12 @@ define <4 x i16> @ctpop_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -281,6 +479,12 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -294,6 +498,16 @@ define void @ctpop_v16i16(ptr %a) {
 ; CHECK-NEXT:    cnt z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -308,6 +522,13 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -320,6 +541,13 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -333,6 +561,18 @@ define void @ctpop_v8i32(ptr %a) {
 ; CHECK-NEXT:    cnt z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -347,6 +587,14 @@ define <1 x i64> @ctpop_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
+; NONEON-NOSVE-NEXT:    uaddlp v0.1d, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -359,6 +607,14 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -372,6 +628,20 @@ define void @ctpop_v4i64(ptr %a) {
 ; CHECK-NEXT:    cnt z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
+; NONEON-NOSVE-NEXT:    uaddlp v1.2d, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
@@ -392,6 +662,21 @@ define <4 x i8> @cttz_v4i8(<4 x i8> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #256 // =0x100
+; NONEON-NOSVE-NEXT:    dup v1.4h, w8
+; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
+; NONEON-NOSVE-NEXT:    dup v2.4h, w8
+; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub v1.4h, v0.4h, v2.4h
+; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
+; NONEON-NOSVE-NEXT:    dup v1.4h, w8
+; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
+; NONEON-NOSVE-NEXT:    sub v0.4h, v1.4h, v0.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
 }
@@ -405,6 +690,14 @@ define <8 x i8> @cttz_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v1.8b, #1
+; NONEON-NOSVE-NEXT:    sub v1.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
@@ -418,6 +711,14 @@ define <16 x i8> @cttz_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v1.16b, #1
+; NONEON-NOSVE-NEXT:    sub v1.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
@@ -433,6 +734,19 @@ define void @cttz_v32i8(ptr %a) {
 ; CHECK-NEXT:    clz z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.16b, #1
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub v3.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    sub v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
+; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op)
   store <32 x i8> %res, ptr %a
@@ -449,6 +763,21 @@ define <2 x i16> @cttz_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #65536 // =0x10000
+; NONEON-NOSVE-NEXT:    dup v1.2s, w8
+; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
+; NONEON-NOSVE-NEXT:    dup v2.2s, w8
+; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sub v1.2s, v0.2s, v2.2s
+; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
+; NONEON-NOSVE-NEXT:    dup v1.2s, w8
+; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    sub v0.2s, v1.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -462,6 +791,18 @@ define <4 x i16> @cttz_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
+; NONEON-NOSVE-NEXT:    dup v1.4h, w8
+; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
+; NONEON-NOSVE-NEXT:    sub v1.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
+; NONEON-NOSVE-NEXT:    dup v1.4h, w8
+; NONEON-NOSVE-NEXT:    clz v0.4h, v0.4h
+; NONEON-NOSVE-NEXT:    sub v0.4h, v1.4h, v0.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -475,6 +816,18 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
+; NONEON-NOSVE-NEXT:    dup v1.8h, w8
+; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
+; NONEON-NOSVE-NEXT:    sub v1.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    dup v1.8h, w8
+; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    sub v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -490,6 +843,24 @@ define void @cttz_v16i16(ptr %a) {
 ; CHECK-NEXT:    clz z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    mov w8, #16 // =0x10
+; NONEON-NOSVE-NEXT:    sub v3.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    sub v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
+; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    dup v2.8h, w8
+; NONEON-NOSVE-NEXT:    clz v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    clz v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    sub v1.8h, v2.8h, v1.8h
+; NONEON-NOSVE-NEXT:    sub v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -505,6 +876,18 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
+; NONEON-NOSVE-NEXT:    dup v1.2s, w8
+; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
+; NONEON-NOSVE-NEXT:    sub v1.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
+; NONEON-NOSVE-NEXT:    dup v1.2s, w8
+; NONEON-NOSVE-NEXT:    clz v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    sub v0.2s, v1.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -518,6 +901,18 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
+; NONEON-NOSVE-NEXT:    dup v1.4s, w8
+; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
+; NONEON-NOSVE-NEXT:    sub v1.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    dup v1.4s, w8
+; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sub v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -533,6 +928,24 @@ define void @cttz_v8i32(ptr %a) {
 ; CHECK-NEXT:    clz z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    mov w8, #32 // =0x20
+; NONEON-NOSVE-NEXT:    sub v3.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sub v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
+; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    dup v2.4s, w8
+; NONEON-NOSVE-NEXT:    clz v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    clz v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sub v1.4s, v2.4s, v1.4s
+; NONEON-NOSVE-NEXT:    sub v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -548,6 +961,18 @@ define <1 x i64> @cttz_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
+; NONEON-NOSVE-NEXT:    fmov d1, x8
+; NONEON-NOSVE-NEXT:    sub d1, d0, d1
+; NONEON-NOSVE-NEXT:    bic v0.8b, v1.8b, v0.8b
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4h, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlp v0.2s, v0.4h
+; NONEON-NOSVE-NEXT:    uaddlp v0.1d, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -561,6 +986,18 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
+; NONEON-NOSVE-NEXT:    dup v1.2d, x8
+; NONEON-NOSVE-NEXT:    sub v1.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    bic v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -576,6 +1013,26 @@ define void @cttz_v4i64(ptr %a) {
 ; CHECK-NEXT:    clz z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #1 // =0x1
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    sub v3.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub v0.2d, v2.2d, v0.2d
+; NONEON-NOSVE-NEXT:    bic v1.16b, v3.16b, v1.16b
+; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    cnt v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    cnt v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v1.8h, v1.16b
+; NONEON-NOSVE-NEXT:    uaddlp v0.8h, v0.16b
+; NONEON-NOSVE-NEXT:    uaddlp v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    uaddlp v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    uaddlp v1.2d, v1.4s
+; NONEON-NOSVE-NEXT:    uaddlp v0.2d, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index e3cc74f766ee..64dc7ae117d3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -11,6 +12,12 @@ define void @bitcast_v4i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i8>, ptr %a
   %cast = bitcast <4 x i8> %load to <4 x i8>
   store volatile <4 x i8> %cast, ptr %b
@@ -23,6 +30,12 @@ define void @bitcast_v8i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <8 x i8>, ptr %a
   %cast = bitcast <8 x i8> %load to <8 x i8>
   store volatile <8 x i8> %cast, ptr %b
@@ -35,6 +48,12 @@ define void @bitcast_v16i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <16 x i8>, ptr %a
   %cast = bitcast <16 x i8> %load to <16 x i8>
   store volatile <16 x i8> %cast, ptr %b
@@ -49,6 +68,14 @@ define void @bitcast_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x1, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <32 x i8>, ptr %a
   %cast = bitcast <32 x i8> %load to <32 x i8>
   store volatile <32 x i8> %cast, ptr %b
@@ -72,6 +99,16 @@ define void @bitcast_v2i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    add x8, x0, #2
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
+; NONEON-NOSVE-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str s0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <2 x i16>, ptr %a
   %cast = bitcast <2 x i16> %load to <2 x half>
   store volatile <2 x half> %cast, ptr %b
@@ -84,6 +121,12 @@ define void @bitcast_v4i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i16>, ptr %a
   %cast = bitcast <4 x i16> %load to <4 x half>
   store volatile <4 x half> %cast, ptr %b
@@ -96,6 +139,12 @@ define void @bitcast_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <8 x i16>, ptr %a
   %cast = bitcast <8 x i16> %load to <8 x half>
   store volatile <8 x half> %cast, ptr %b
@@ -110,6 +159,14 @@ define void @bitcast_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x1, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <16 x i16>, ptr %a
   %cast = bitcast <16 x i16> %load to <16 x half>
   store volatile <16 x half> %cast, ptr %b
@@ -122,6 +179,12 @@ define void @bitcast_v2i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <2 x i32>, ptr %a
   %cast = bitcast <2 x i32> %load to <2 x float>
   store volatile <2 x float> %cast, ptr %b
@@ -134,6 +197,12 @@ define void @bitcast_v4i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i32>, ptr %a
   %cast = bitcast <4 x i32> %load to <4 x float>
   store volatile <4 x float> %cast, ptr %b
@@ -148,6 +217,14 @@ define void @bitcast_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x1, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <8 x i32>, ptr %a
   %cast = bitcast <8 x i32> %load to <8 x float>
   store volatile <8 x float> %cast, ptr %b
@@ -160,6 +237,12 @@ define void @bitcast_v1i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <1 x i64>, ptr %a
   %cast = bitcast <1 x i64> %load to <1 x double>
   store volatile <1 x double> %cast, ptr %b
@@ -172,6 +255,12 @@ define void @bitcast_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <2 x i64>, ptr %a
   %cast = bitcast <2 x i64> %load to <2 x double>
   store volatile <2 x double> %cast, ptr %b
@@ -186,6 +275,14 @@ define void @bitcast_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x1, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i64>, ptr %a
   %cast = bitcast <4 x i64> %load to <4 x double>
   store volatile <4 x double> %cast, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
index 74a4aab15597..5e06cd62118d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64"
 
@@ -30,6 +31,17 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fixed_bitselect_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x2]
+; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    neg v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v5.16b
+; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    ret
   %pre_cond = load <8 x i32>, ptr %pre_cond_ptr
   %left = load <8 x i32>, ptr %left_ptr
   %right = load <8 x i32>, ptr %right_ptr
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
index 0c490a662a79..7a24430a3385 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -10,6 +11,12 @@ define void @build_vector_7_inc1_v4i1(ptr %a) {
 ; CHECK-NEXT:    mov w8, #5 // =0x5
 ; CHECK-NEXT:    strb w8, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_7_inc1_v4i1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    strb w8, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i1> <i1 true, i1 false, i1 true, i1 false>, ptr %a, align 1
   ret void
 }
@@ -23,6 +30,15 @@ define void @build_vector_7_inc1_v32i8(ptr %a) {
 ; CHECK-NEXT:    add z1.b, z1.b, #23 // =0x17
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_7_inc1_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI1_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI1_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI1_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <32 x i8> <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38>, ptr %a, align 1
   ret void
 }
@@ -35,6 +51,15 @@ define void @build_vector_0_inc2_v16i16(ptr %a) {
 ; CHECK-NEXT:    add z0.h, z0.h, #16 // =0x10
 ; CHECK-NEXT:    str q0, [x0, #16]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_0_inc2_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI2_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI2_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <16 x i16> <i16 0, i16 2, i16 4, i16 6, i16 8, i16 10, i16 12, i16 14, i16 16, i16 18, i16 20, i16 22, i16 24, i16 26, i16 28, i16 30>, ptr %a, align 2
   ret void
 }
@@ -48,6 +73,15 @@ define void @build_vector_0_dec3_v8i32(ptr %a) {
 ; CHECK-NEXT:    add z1.s, z0.s, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_0_dec3_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI3_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI3_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x i32> <i32 0, i32 -3, i32 -6, i32 -9, i32 -12, i32 -15, i32 -18, i32 -21>, ptr %a, align 4
   ret void
 }
@@ -64,6 +98,15 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) {
 ; CHECK-NEXT:    add z0.d, z0.d, z2.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_minus2_dec32_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI4_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI4_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI4_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI4_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i64> <i64 -2, i64 -34, i64 -66, i64 -98>, ptr %a, align 8
   ret void
 }
@@ -76,6 +119,15 @@ define void @build_vector_no_stride_v4i64(ptr %a) {
 ; CHECK-NEXT:    index z1.d, #0, #4
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_no_stride_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI5_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI5_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI5_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i64> <i64 0, i64 4, i64 1, i64 8>, ptr %a, align 8
   ret void
 }
@@ -89,6 +141,15 @@ define void @build_vector_0_inc2_v16f16(ptr %a) {
 ; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI6_1]
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_0_inc2_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI6_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI6_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI6_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <16 x half> <half 0.0, half 2.0, half 4.0, half 6.0, half 8.0, half 10.0, half 12.0, half 14.0, half 16.0, half 18.0, half 20.0, half 22.0, half 24.0, half 26.0, half 28.0, half 30.0>, ptr %a, align 2
   ret void
 }
@@ -103,6 +164,15 @@ define void @build_vector_0_dec3_v8f32(ptr %a) {
 ; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI7_1]
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_0_dec3_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI7_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI7_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI7_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x float> <float 0.0, float -3.0, float -6.0, float -9.0, float -12.0, float -15.0, float -18.0, float -21.0>, ptr %a, align 4
   ret void
 }
@@ -117,6 +187,15 @@ define void @build_vector_minus2_dec32_v4f64(ptr %a) {
 ; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI8_1]
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_minus2_dec32_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI8_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI8_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x double> <double -2.0, double -34.0, double -66.0, double -98.0>, ptr %a, align 8
   ret void
 }
@@ -131,6 +210,15 @@ define void @build_vector_no_stride_v4f64(ptr %a) {
 ; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI9_1]
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_no_stride_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI9_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI9_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI9_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x double> <double 0.0, double 4.0, double 1.0, double 8.0>, ptr %a, align 8
   ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 86494c4be501..ee997228e453 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -40,6 +41,11 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2)  {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i8> %op1, <4 x i8> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i8> %res
 }
@@ -53,6 +59,13 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2)  {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i8> %res
@@ -65,6 +78,13 @@ define void @concat_v32i8(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i8>, ptr %a
   %op2 = load <16 x i8>, ptr %b
   %res = shufflevector <16 x i8> %op1, <16 x i8> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -83,6 +103,14 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v64i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = shufflevector <32 x i8> %op1, <32 x i8> %op2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -121,6 +149,11 @@ define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2)  {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x i16> %op1, <2 x i16> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i16> %res
 }
@@ -135,6 +168,13 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2)  {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
 }
@@ -146,6 +186,13 @@ define void @concat_v16i16(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
   %res = shufflevector <8 x i16> %op1, <8 x i16> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -162,6 +209,14 @@ define void @concat_v32i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = shufflevector <16 x i16> %op1, <16 x i16> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -185,6 +240,11 @@ define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2)  {
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x i32> %op1, <1 x i32> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x i32> %res
 }
@@ -199,6 +259,13 @@ define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2)  {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %res
 }
@@ -210,6 +277,13 @@ define void @concat_v8i32(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %op2 = load <4 x i32>, ptr %b
   %res = shufflevector <4 x i32> %op1, <4 x i32> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -225,6 +299,14 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = shufflevector <8 x i32> %op1, <8 x i32> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -247,6 +329,13 @@ define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2)  {
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x i64> %op1, <1 x i64> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x i64> %res
 }
@@ -258,6 +347,13 @@ define void @concat_v4i64(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %res = shufflevector <2 x i64> %op1, <2 x i64> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -273,6 +369,14 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = shufflevector <4 x i64> %op1, <4 x i64> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -300,6 +404,11 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2)  {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x half> %res
 }
@@ -313,6 +422,13 @@ define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2)  {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x half> %res
 }
@@ -324,6 +440,13 @@ define void @concat_v16f16(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %op2 = load <8 x half>, ptr %b
   %res = shufflevector <8 x half> %op1, <8 x half> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -340,6 +463,14 @@ define void @concat_v32f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v32f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = shufflevector <16 x half> %op1, <16 x half> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -363,6 +494,11 @@ define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2)  {
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x float> %op1, <1 x float> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %res
 }
@@ -377,6 +513,13 @@ define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2)  {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %res
 }
@@ -388,6 +531,13 @@ define void @concat_v8f32(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %op2 = load <4 x float>, ptr %b
   %res = shufflevector <4 x float> %op1, <4 x float> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -403,6 +553,14 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = shufflevector <8 x float> %op1, <8 x float> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -425,6 +583,13 @@ define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2)  {
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x double> %op1, <1 x double> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %res
 }
@@ -436,6 +601,13 @@ define void @concat_v4f64(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %op2 = load <2 x double>, ptr %b
   %res = shufflevector <2 x double> %op1, <2 x double> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -451,6 +623,14 @@ define void @concat_v8f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = shufflevector <4 x double> %op1, <4 x double> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -468,6 +648,12 @@ define void @concat_v32i8_undef(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v32i8_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i8>, ptr %a
   %res = shufflevector <16 x i8> %op1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
@@ -483,6 +669,12 @@ define void @concat_v16i16_undef(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16i16_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = shufflevector <8 x i16> %op1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -496,6 +688,12 @@ define void @concat_v8i32_undef(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8i32_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = shufflevector <4 x i32> %op1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i32> %res, ptr %b
@@ -508,6 +706,12 @@ define void @concat_v4i64_undef(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4i64_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %res = shufflevector <2 x i64> %op1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i64> %res, ptr %b
@@ -524,6 +728,12 @@ define void @concat_v32i8_4op(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v32i8_4op:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %shuffle = shufflevector <8 x i8> %op1, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -541,6 +751,12 @@ define void @concat_v16i16_4op(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16i16_4op:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %shuffle = shufflevector <4 x i16> %op1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %res = shufflevector <8 x i16> %shuffle, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -555,6 +771,12 @@ define void @concat_v8i32_4op(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8i32_4op:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
   %shuffle = shufflevector <2 x i32> %op1, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res = shufflevector <4 x i32> %shuffle, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -568,6 +790,12 @@ define void @concat_v4i64_4op(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4i64_4op:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x i64>, ptr %a
   %shuffle = shufflevector <1 x i64> %op1, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
   %res = shufflevector <2 x i64> %shuffle, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 0aefba2d4c6a..42aa67fb2ab8 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -11,6 +12,12 @@ define <8 x i16> @load_zext_v8i8i16(ptr %ap)  {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_zext_v8i8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i8>, ptr %ap
   %val = zext <8 x i8> %a to <8 x i16>
   ret <8 x i16> %val
@@ -23,6 +30,12 @@ define <4 x i32> @load_zext_v4i16i32(ptr %ap)  {
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_zext_v4i16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i16>, ptr %ap
   %val = zext <4 x i16> %a to <4 x i32>
   ret <4 x i32> %val
@@ -35,6 +48,12 @@ define <2 x i64> @load_zext_v2i32i64(ptr %ap) {
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_zext_v2i32i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i32>, ptr %ap
   %val = zext <2 x i32> %a to <2 x i64>
   ret <2 x i64> %val
@@ -54,6 +73,19 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) {
 ; CHECK-NEXT:    mov x7, xzr
 ; CHECK-NEXT:    fmov x4, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_zext_v2i64i256:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    mov x1, xzr
+; NONEON-NOSVE-NEXT:    mov x2, xzr
+; NONEON-NOSVE-NEXT:    mov x3, xzr
+; NONEON-NOSVE-NEXT:    mov x5, xzr
+; NONEON-NOSVE-NEXT:    mov x6, xzr
+; NONEON-NOSVE-NEXT:    mov x4, v0.d[1]
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    mov x7, xzr
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = zext <2 x i64> %a to <2 x i256>
   ret <2 x i256> %val
@@ -75,6 +107,24 @@ define <16 x i32> @load_sext_v16i8i32(ptr %ap)  {
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $z2
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $z3
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_sext_v16i8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    sshll v1.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v2.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v3.4s, v4.4h, #0
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i8>, ptr %ap
   %val = sext <16 x i8> %a to <16 x i32>
   ret <16 x i32> %val
@@ -90,6 +140,17 @@ define <8 x i32> @load_sext_v8i16i32(ptr %ap)  {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_sext_v8i16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i16>, ptr %ap
   %val = sext <8 x i16> %a to <8 x i32>
   ret <8 x i32> %val
@@ -121,6 +182,39 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
 ; CHECK-NEXT:    stp x12, x12, [x8, #112]
 ; CHECK-NEXT:    stp x11, x12, [x8, #96]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_sext_v4i32i256:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    add x10, x8, #32
+; NONEON-NOSVE-NEXT:    add x11, x8, #96
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    mov x9, v0.d[1]
+; NONEON-NOSVE-NEXT:    st1 { v0.d }[1], [x10]
+; NONEON-NOSVE-NEXT:    fmov x10, d0
+; NONEON-NOSVE-NEXT:    st1 { v1.d }[1], [x11]
+; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
+; NONEON-NOSVE-NEXT:    asr x10, x10, #63
+; NONEON-NOSVE-NEXT:    str d0, [x8]
+; NONEON-NOSVE-NEXT:    asr x9, x9, #63
+; NONEON-NOSVE-NEXT:    str d1, [x8, #64]
+; NONEON-NOSVE-NEXT:    stp x10, x10, [x8, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x9, [x8, #48]
+; NONEON-NOSVE-NEXT:    str x9, [x8, #40]
+; NONEON-NOSVE-NEXT:    fmov x9, d1
+; NONEON-NOSVE-NEXT:    str x10, [x8, #8]
+; NONEON-NOSVE-NEXT:    asr x10, x11, #63
+; NONEON-NOSVE-NEXT:    asr x9, x9, #63
+; NONEON-NOSVE-NEXT:    stp x10, x10, [x8, #112]
+; NONEON-NOSVE-NEXT:    str x10, [x8, #104]
+; NONEON-NOSVE-NEXT:    stp x9, x9, [x8, #80]
+; NONEON-NOSVE-NEXT:    str x9, [x8, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = sext <4 x i32> %a to <4 x i256>
   ret <4 x i256> %val
@@ -154,6 +248,22 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) {
 ; CHECK-NEXT:    fmov x1, d6
 ; CHECK-NEXT:    fmov x5, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_sext_v2i64i256:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
+; NONEON-NOSVE-NEXT:    dup v1.2d, v0.d[1]
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    asr x1, x0, #63
+; NONEON-NOSVE-NEXT:    asr x5, x8, #63
+; NONEON-NOSVE-NEXT:    mov x2, x1
+; NONEON-NOSVE-NEXT:    mov x3, x1
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x5
+; NONEON-NOSVE-NEXT:    mov x6, x5
+; NONEON-NOSVE-NEXT:    mov x7, x5
+; NONEON-NOSVE-NEXT:    fmov x4, d1
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = sext <2 x i64> %a to <2 x i256>
   ret <2 x i256> %val
@@ -187,6 +297,34 @@ define <16 x i64> @load_zext_v16i16i64(ptr %ap)  {
 ; CHECK-NEXT:    // kill: def $q6 killed $q6 killed $z6
 ; CHECK-NEXT:    // kill: def $q7 killed $q7 killed $z7
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_zext_v16i16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ushll v2.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v3.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ushll v4.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v5.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ushll v2.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d6, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp q5, q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d16, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr d17, [sp, #72]
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v6.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v3.2d, v7.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v6.2d, v5.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v5.2d, v16.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v7.2d, v17.2s, #0
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %ap
   %val = zext <16 x i16> %a to <16 x i64>
   ret <16 x i64> %val
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index 25ecd7a8d7e3..d050ddc77640 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -27,6 +28,11 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v8i1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> %op, i64 4)
   ret <4 x i1> %ret
 }
@@ -54,6 +60,11 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %op, i64 4)
   ret <4 x i8> %ret
 }
@@ -65,6 +76,14 @@ define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %op, i64 8)
   ret <8 x i8> %ret
 }
@@ -75,6 +94,12 @@ define void @extract_subvector_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %ret = call <16 x i8> @llvm.vector.extract.v16i8.v32i8(<32 x i8> %op, i64 16)
   store <16 x i8> %ret, ptr %b
@@ -91,6 +116,15 @@ define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2)
   ret <2 x i16> %ret
 }
@@ -102,6 +136,14 @@ define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> %op, i64 4)
   ret <4 x i16> %ret
 }
@@ -112,6 +154,12 @@ define void @extract_subvector_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %ret = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> %op, i64 8)
   store <8 x i16> %ret, ptr %b
@@ -127,6 +175,12 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[1]
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1)
   ret <1 x i32> %ret
 }
@@ -138,6 +192,14 @@ define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %op, i64 2)
   ret <2 x i32> %ret
 }
@@ -148,6 +210,12 @@ define void @extract_subvector_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %ret = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %op, i64 4)
   store <4 x i32> %ret, ptr %b
@@ -163,6 +231,14 @@ define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x i64> @llvm.vector.extract.v1i64.v2i64(<2 x i64> %op, i64 1)
   ret <1 x i64> %ret
 }
@@ -173,6 +249,12 @@ define void @extract_subvector_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %ret = call <2 x i64> @llvm.vector.extract.v2i64.v4i64(<4 x i64> %op, i64 2)
   store <2 x i64> %ret, ptr %b
@@ -190,6 +272,12 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[1]
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2)
   ret <2 x half> %ret
 }
@@ -201,6 +289,14 @@ define <4 x half> @extract_subvector_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x half> @llvm.vector.extract.v4f16.v8f16(<8 x half> %op, i64 4)
   ret <4 x half> %ret
 }
@@ -211,6 +307,12 @@ define void @extract_subvector_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %ret = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> %op, i64 8)
   store <8 x half> %ret, ptr %b
@@ -226,6 +328,12 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[1]
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1)
   ret <1 x float> %ret
 }
@@ -237,6 +345,14 @@ define <2 x float> @extract_subvector_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> %op, i64 2)
   ret <2 x float> %ret
 }
@@ -247,6 +363,12 @@ define void @extract_subvector_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %ret = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> %op, i64 4)
   store <4 x float> %ret, ptr %b
@@ -262,6 +384,14 @@ define <1 x double> @extract_subvector_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x double> @llvm.vector.extract.v1f64.v2f64(<2 x double> %op, i64 1)
   ret <1 x double> %ret
 }
@@ -272,6 +402,12 @@ define void @extract_subvector_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %ret = call <2 x double> @llvm.vector.extract.v2f64.v4f64(<4 x double> %op, i64 2)
   store <2 x double> %ret, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
index a752e119b2fb..b2cf818e6e3c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -15,6 +16,12 @@ define half @extractelement_v2f16(<2 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[1]
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x half> %op1, i64 1
   ret half %r
 }
@@ -26,6 +33,12 @@ define half @extractelement_v4f16(<4 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, z0.h[3]
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <4 x half> %op1, i64 3
   ret half %r
 }
@@ -37,6 +50,11 @@ define half @extractelement_v8f16(<8 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, z0.h[7]
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <8 x half> %op1, i64 7
   ret half %r
 }
@@ -48,6 +66,11 @@ define half @extractelement_v16f16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, z0.h[7]
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr h0, [x0, #30]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %r = extractelement <16 x half> %op1, i64 15
   ret half %r
@@ -60,6 +83,12 @@ define float @extractelement_v2f32(<2 x float> %op1) {
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov s0, v0.s[1]
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x float> %op1, i64 1
   ret float %r
 }
@@ -71,6 +100,11 @@ define float @extractelement_v4f32(<4 x float> %op1) {
 ; CHECK-NEXT:    mov z0.s, z0.s[3]
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov s0, v0.s[3]
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <4 x float> %op1, i64 3
   ret float %r
 }
@@ -82,6 +116,11 @@ define float @extractelement_v8f32(ptr %a) {
 ; CHECK-NEXT:    mov z0.s, z0.s[3]
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr s0, [x0, #28]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %r = extractelement <8 x float> %op1, i64 7
   ret float %r
@@ -91,6 +130,10 @@ define double @extractelement_v1f64(<1 x double> %op1) {
 ; CHECK-LABEL: extractelement_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <1 x double> %op1, i64 0
   ret double %r
 }
@@ -101,6 +144,11 @@ define double @extractelement_v2f64(<2 x double> %op1) {
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov d0, v0.d[1]
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x double> %op1, i64 1
   ret double %r
 }
@@ -112,6 +160,11 @@ define double @extractelement_v4f64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0, #24]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %r = extractelement <4 x double> %op1, i64 3
   ret double %r
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index 0d6675def8b5..bed5dd53c519 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
@@ -28,6 +29,16 @@ define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str d1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    ldr d2, [x1]
+; NONEON-NOSVE-NEXT:    dup v0.4h, w8
+; NONEON-NOSVE-NEXT:    bsl v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x half>, ptr %bp
   %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b)
@@ -54,6 +65,16 @@ define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x half>, ptr %ap
   %b = load <8 x half>, ptr %bp
   %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b)
@@ -84,6 +105,17 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
 ; SVE2-NEXT:    stp q2, q3, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v16f16_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
+; NONEON-NOSVE-NEXT:    ldp q1, q4, [x1]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x half>, ptr %ap
   %b = load <16 x half>, ptr %bp
   %r = call <16 x half> @llvm.copysign.v16f16(<16 x half> %a, <16 x half> %b)
@@ -112,6 +144,16 @@ define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str d1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d0, #0xffffffffffffffff
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    ldr d2, [x1]
+; NONEON-NOSVE-NEXT:    fneg v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    bsl v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x float>, ptr %ap
   %b = load <2 x float>, ptr %bp
   %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b)
@@ -138,6 +180,16 @@ define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x float>, ptr %ap
   %b = load <4 x float>, ptr %bp
   %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b)
@@ -168,6 +220,17 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
 ; SVE2-NEXT:    stp q2, q3, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v8f32_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
+; NONEON-NOSVE-NEXT:    ldp q1, q4, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x float>, ptr %ap
   %b = load <8 x float>, ptr %bp
   %r = call <8 x float> @llvm.copysign.v8f32(<8 x float> %a, <8 x float> %b)
@@ -196,6 +259,16 @@ define void @test_copysign_v2f64_v2f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x double>, ptr %ap
   %b = load <2 x double>, ptr %bp
   %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b)
@@ -226,6 +299,17 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
 ; SVE2-NEXT:    stp q2, q3, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
+; NONEON-NOSVE-NEXT:    ldp q1, q4, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x double>, ptr %ap
   %b = load <4 x double>, ptr %bp
   %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b)
@@ -260,6 +344,17 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str d2, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d0, #0xffffffffffffffff
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    ldr d2, [x0]
+; NONEON-NOSVE-NEXT:    fcvtn v1.2s, v1.2d
+; NONEON-NOSVE-NEXT:    fneg v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    bsl v0.8b, v2.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x float>, ptr %ap
   %b = load <2 x double>, ptr %bp
   %tmp0 = fptrunc <2 x double> %b to <2 x float>
@@ -304,6 +399,18 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str q2, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
+; NONEON-NOSVE-NEXT:    fcvtn v1.2s, v1.2d
+; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.4s, v2.2d
+; NONEON-NOSVE-NEXT:    ldr q2, [x0]
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x float>, ptr %ap
   %b = load <4 x double>, ptr %bp
   %tmp0 = fptrunc <4 x double> %b to <4 x float>
@@ -337,6 +444,17 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str q2, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
+; NONEON-NOSVE-NEXT:    ldr d1, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
+; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x double>, ptr %ap
   %b = load < 2 x float>, ptr %bp
   %tmp0 = fpext <2 x float> %b to <2 x double>
@@ -381,6 +499,23 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z4.d, z4.d, z1.d, z2.d
 ; SVE2-NEXT:    stp q3, q4, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0xffffffffffffffff
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
+; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtl v4.2d, v4.2s
+; NONEON-NOSVE-NEXT:    bit v1.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x double>, ptr %ap
   %b = load <4 x float>, ptr %bp
   %tmp0 = fpext <4 x float> %b to <4 x double>
@@ -416,6 +551,17 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str d2, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
+; NONEON-NOSVE-NEXT:    ldr d2, [x0]
+; NONEON-NOSVE-NEXT:    dup v1.4h, w8
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    bit v0.8b, v2.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x float>, ptr %bp
   %tmp0 = fptrunc <4 x float> %b to <4 x half>
@@ -457,6 +603,19 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str d2, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
+; NONEON-NOSVE-NEXT:    ldr d2, [x0]
+; NONEON-NOSVE-NEXT:    fcvtxn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtxn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    dup v1.4h, w8
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    bit v0.8b, v2.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x double>, ptr %bp
   %tmp0 = fptrunc <4 x double> %b to <4 x half>
@@ -500,6 +659,18 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str q2, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    mov w8, #32767 // =0x7fff
+; NONEON-NOSVE-NEXT:    ldr q2, [x0]
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    dup v1.8h, w8
+; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x half>, ptr %ap
   %b = load <8 x float>, ptr %bp
   %tmp0 = fptrunc <8 x float> %b to <8 x half>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
index c2d6ed4e9ccf..662a8f2b55fd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,14 @@ define <2 x half> @fadd_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x half> %op1, %op2
   ret <2 x half> %res
 }
@@ -30,6 +39,14 @@ define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fadd <4 x half> %op1, %op2
   ret <4 x half> %res
 }
@@ -43,6 +60,18 @@ define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fadd v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fadd v1.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fadd <8 x half> %op1, %op2
   ret <8 x half> %res
 }
@@ -58,6 +87,29 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
+; NONEON-NOSVE-NEXT:    fadd v4.4s, v5.4s, v4.4s
+; NONEON-NOSVE-NEXT:    fadd v5.4s, v7.4s, v6.4s
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fadd v2.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
+; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fadd <16 x half> %op1, %op2
@@ -74,6 +126,11 @@ define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fadd v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x float> %op1, %op2
   ret <2 x float> %res
 }
@@ -87,6 +144,11 @@ define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fadd <4 x float> %op1, %op2
   ret <4 x float> %res
 }
@@ -102,6 +164,15 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fadd v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fadd <8 x float> %op1, %op2
@@ -118,6 +189,11 @@ define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fadd v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x double> %op1, %op2
   ret <2 x double> %res
 }
@@ -133,6 +209,15 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fadd v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fadd <4 x double> %op1, %op2
@@ -153,6 +238,14 @@ define <2 x half> @fdiv_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x half> %op1, %op2
   ret <2 x half> %res
 }
@@ -166,6 +259,14 @@ define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <4 x half> %op1, %op2
   ret <4 x half> %res
 }
@@ -179,6 +280,18 @@ define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fdiv v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fdiv v1.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <8 x half> %op1, %op2
   ret <8 x half> %res
 }
@@ -194,6 +307,30 @@ define void @fdiv_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fdiv z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q4, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v5.4s, v4.8h
+; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v4.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fdiv v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    ldr q3, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl2 v6.4s, v3.8h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
+; NONEON-NOSVE-NEXT:    fdiv v3.4s, v3.4s, v4.4s
+; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fdiv v5.4s, v6.4s, v5.4s
+; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v5.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fdiv <16 x half> %op1, %op2
@@ -210,6 +347,11 @@ define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fdiv v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x float> %op1, %op2
   ret <2 x float> %res
 }
@@ -223,6 +365,11 @@ define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <4 x float> %op1, %op2
   ret <4 x float> %res
 }
@@ -238,6 +385,15 @@ define void @fdiv_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fdiv z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fdiv v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fdiv v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fdiv <8 x float> %op1, %op2
@@ -254,6 +410,11 @@ define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fdiv v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x double> %op1, %op2
   ret <2 x double> %res
 }
@@ -269,6 +430,15 @@ define void @fdiv_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fdiv z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fdiv v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fdiv v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fdiv <4 x double> %op1, %op2
@@ -290,6 +460,46 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d2 killed $d2 def $q2
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    fcvt s16, h0
+; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h18, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fmadd s6, s16, s7, s6
+; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s7, h19
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmadd s3, s5, s4, s3
+; NONEON-NOSVE-NEXT:    fcvt s4, h17
+; NONEON-NOSVE-NEXT:    fcvt s5, h18
+; NONEON-NOSVE-NEXT:    fcvt h0, s6
+; NONEON-NOSVE-NEXT:    fmadd s4, s7, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h16
+; NONEON-NOSVE-NEXT:    mov v0.h[1], v3.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h3, s4
+; NONEON-NOSVE-NEXT:    fmadd s1, s5, s1, s2
+; NONEON-NOSVE-NEXT:    mov v0.h[2], v3.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
   ret <2 x half> %res
 }
@@ -304,6 +514,46 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d2 killed $d2 def $q2
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    fcvt s16, h0
+; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h18, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fmadd s6, s16, s7, s6
+; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s7, h19
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmadd s3, s5, s4, s3
+; NONEON-NOSVE-NEXT:    fcvt s4, h17
+; NONEON-NOSVE-NEXT:    fcvt s5, h18
+; NONEON-NOSVE-NEXT:    fcvt h0, s6
+; NONEON-NOSVE-NEXT:    fmadd s4, s7, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h16
+; NONEON-NOSVE-NEXT:    mov v0.h[1], v3.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h3, s4
+; NONEON-NOSVE-NEXT:    fmadd s1, s5, s1, s2
+; NONEON-NOSVE-NEXT:    mov v0.h[2], v3.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
   ret <4 x half> %res
 }
@@ -318,6 +568,79 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    fcvt s16, h0
+; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h18, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fmadd s6, s16, s7, s6
+; NONEON-NOSVE-NEXT:    fcvt s7, h17
+; NONEON-NOSVE-NEXT:    fcvt s16, h18
+; NONEON-NOSVE-NEXT:    fcvt s17, h19
+; NONEON-NOSVE-NEXT:    mov h18, v1.h[3]
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
+; NONEON-NOSVE-NEXT:    fmadd s4, s5, s4, s3
+; NONEON-NOSVE-NEXT:    mov h5, v2.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h3, s6
+; NONEON-NOSVE-NEXT:    fmadd s6, s17, s16, s7
+; NONEON-NOSVE-NEXT:    mov h17, v2.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s7, h18
+; NONEON-NOSVE-NEXT:    fcvt s16, h19
+; NONEON-NOSVE-NEXT:    mov h18, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    mov v3.h[1], v4.h[0]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    fmadd s5, s16, s7, s5
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s19, h19
+; NONEON-NOSVE-NEXT:    mov v3.h[2], v6.h[0]
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    fmadd s17, s19, s18, s17
+; NONEON-NOSVE-NEXT:    mov h18, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fmadd s4, s16, s7, s4
+; NONEON-NOSVE-NEXT:    mov v3.h[3], v5.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h6
+; NONEON-NOSVE-NEXT:    fcvt s6, h18
+; NONEON-NOSVE-NEXT:    fcvt s7, h19
+; NONEON-NOSVE-NEXT:    fcvt h16, s17
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fmadd s5, s7, s6, s5
+; NONEON-NOSVE-NEXT:    mov v3.h[4], v16.h[0]
+; NONEON-NOSVE-NEXT:    fmadd s0, s0, s1, s2
+; NONEON-NOSVE-NEXT:    mov v3.h[5], v4.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h4, s5
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    mov v3.h[6], v4.h[0]
+; NONEON-NOSVE-NEXT:    mov v3.h[7], v0.h[0]
+; NONEON-NOSVE-NEXT:    mov v0.16b, v3.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
   ret <8 x half> %res
 }
@@ -334,6 +657,150 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q4, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [x2]
+; NONEON-NOSVE-NEXT:    mov h25, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s19, h0
+; NONEON-NOSVE-NEXT:    mov h24, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h17, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s18, h1
+; NONEON-NOSVE-NEXT:    mov h22, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h16, v2.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    mov h20, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h26, v5.h[1]
+; NONEON-NOSVE-NEXT:    mov h27, v4.h[1]
+; NONEON-NOSVE-NEXT:    mov h28, v3.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s25, h25
+; NONEON-NOSVE-NEXT:    mov h7, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h29, v4.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s23, h17
+; NONEON-NOSVE-NEXT:    mov h17, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov h30, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s21, h16
+; NONEON-NOSVE-NEXT:    fmadd s6, s19, s18, s6
+; NONEON-NOSVE-NEXT:    fcvt s18, h20
+; NONEON-NOSVE-NEXT:    fcvt s19, h22
+; NONEON-NOSVE-NEXT:    fcvt s20, h24
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s22, h5
+; NONEON-NOSVE-NEXT:    fcvt s24, h4
+; NONEON-NOSVE-NEXT:    fcvt s26, h26
+; NONEON-NOSVE-NEXT:    fcvt s27, h27
+; NONEON-NOSVE-NEXT:    fcvt s28, h28
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fmadd s21, s25, s23, s21
+; NONEON-NOSVE-NEXT:    fcvt s23, h3
+; NONEON-NOSVE-NEXT:    mov h25, v5.h[2]
+; NONEON-NOSVE-NEXT:    fmadd s18, s20, s19, s18
+; NONEON-NOSVE-NEXT:    mov h19, v3.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    mov h31, v0.h[4]
+; NONEON-NOSVE-NEXT:    fmadd s26, s28, s27, s26
+; NONEON-NOSVE-NEXT:    mov h27, v4.h[3]
+; NONEON-NOSVE-NEXT:    mov h28, v3.h[3]
+; NONEON-NOSVE-NEXT:    fmadd s22, s23, s24, s22
+; NONEON-NOSVE-NEXT:    fcvt h20, s21
+; NONEON-NOSVE-NEXT:    mov h21, v2.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s23, h25
+; NONEON-NOSVE-NEXT:    fcvt s24, h29
+; NONEON-NOSVE-NEXT:    fcvt s19, h19
+; NONEON-NOSVE-NEXT:    fmadd s16, s17, s16, s7
+; NONEON-NOSVE-NEXT:    mov h25, v5.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    fcvt h26, s26
+; NONEON-NOSVE-NEXT:    mov h29, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov v6.h[1], v20.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s17, h21
+; NONEON-NOSVE-NEXT:    fcvt s20, h30
+; NONEON-NOSVE-NEXT:    fmadd s19, s19, s24, s23
+; NONEON-NOSVE-NEXT:    fcvt s21, h31
+; NONEON-NOSVE-NEXT:    fcvt h7, s22
+; NONEON-NOSVE-NEXT:    fcvt s22, h25
+; NONEON-NOSVE-NEXT:    fcvt s23, h27
+; NONEON-NOSVE-NEXT:    fcvt s24, h28
+; NONEON-NOSVE-NEXT:    mov h25, v5.h[4]
+; NONEON-NOSVE-NEXT:    mov h27, v4.h[4]
+; NONEON-NOSVE-NEXT:    mov h28, v3.h[4]
+; NONEON-NOSVE-NEXT:    mov h30, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h31, v0.h[5]
+; NONEON-NOSVE-NEXT:    mov v6.h[2], v18.h[0]
+; NONEON-NOSVE-NEXT:    fmadd s17, s21, s20, s17
+; NONEON-NOSVE-NEXT:    mov v7.h[1], v26.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h18, s19
+; NONEON-NOSVE-NEXT:    fmadd s19, s24, s23, s22
+; NONEON-NOSVE-NEXT:    mov h26, v5.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h16, s16
+; NONEON-NOSVE-NEXT:    fcvt s20, h25
+; NONEON-NOSVE-NEXT:    fcvt s21, h27
+; NONEON-NOSVE-NEXT:    fcvt s22, h28
+; NONEON-NOSVE-NEXT:    mov h27, v4.h[5]
+; NONEON-NOSVE-NEXT:    mov h28, v3.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s23, h29
+; NONEON-NOSVE-NEXT:    fcvt s24, h30
+; NONEON-NOSVE-NEXT:    fcvt s25, h31
+; NONEON-NOSVE-NEXT:    mov h29, v2.h[6]
+; NONEON-NOSVE-NEXT:    mov h30, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov h31, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov v7.h[2], v18.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h18, s19
+; NONEON-NOSVE-NEXT:    fmadd s19, s22, s21, s20
+; NONEON-NOSVE-NEXT:    mov h20, v5.h[6]
+; NONEON-NOSVE-NEXT:    mov h21, v4.h[6]
+; NONEON-NOSVE-NEXT:    mov h22, v3.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s26, h26
+; NONEON-NOSVE-NEXT:    fmadd s23, s25, s24, s23
+; NONEON-NOSVE-NEXT:    fcvt s27, h27
+; NONEON-NOSVE-NEXT:    fcvt s28, h28
+; NONEON-NOSVE-NEXT:    mov v6.h[3], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h16, s17
+; NONEON-NOSVE-NEXT:    fcvt s17, h29
+; NONEON-NOSVE-NEXT:    fcvt s24, h30
+; NONEON-NOSVE-NEXT:    fcvt s25, h31
+; NONEON-NOSVE-NEXT:    fcvt s20, h20
+; NONEON-NOSVE-NEXT:    fcvt s21, h21
+; NONEON-NOSVE-NEXT:    fcvt s22, h22
+; NONEON-NOSVE-NEXT:    mov v7.h[3], v18.h[0]
+; NONEON-NOSVE-NEXT:    fmadd s26, s28, s27, s26
+; NONEON-NOSVE-NEXT:    fcvt h18, s19
+; NONEON-NOSVE-NEXT:    mov h5, v5.h[7]
+; NONEON-NOSVE-NEXT:    mov h4, v4.h[7]
+; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    fmadd s17, s25, s24, s17
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fmadd s19, s22, s21, s20
+; NONEON-NOSVE-NEXT:    mov v6.h[4], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h16, s23
+; NONEON-NOSVE-NEXT:    mov v7.h[4], v18.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h18, s26
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v6.h[5], v16.h[0]
+; NONEON-NOSVE-NEXT:    mov v7.h[5], v18.h[0]
+; NONEON-NOSVE-NEXT:    fmadd s3, s3, s4, s5
+; NONEON-NOSVE-NEXT:    fcvt h4, s19
+; NONEON-NOSVE-NEXT:    fcvt h5, s17
+; NONEON-NOSVE-NEXT:    fmadd s0, s0, s1, s2
+; NONEON-NOSVE-NEXT:    mov v7.h[6], v4.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s3
+; NONEON-NOSVE-NEXT:    mov v6.h[6], v5.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], v1.h[0]
+; NONEON-NOSVE-NEXT:    mov v6.h[7], v0.h[0]
+; NONEON-NOSVE-NEXT:    stp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %op3 = load <16 x half>, ptr %c
@@ -352,6 +819,12 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmla v2.2s, v1.2s, v0.2s
+; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3)
   ret <2 x float> %res
 }
@@ -366,6 +839,12 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmla v2.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3)
   ret <4 x float> %res
 }
@@ -382,6 +861,16 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    fmla v1.4s, v0.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fmla v5.4s, v4.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %op3 = load <8 x float>, ptr %c
@@ -400,6 +889,12 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
 ; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmla v2.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3)
   ret <2 x double> %res
 }
@@ -416,6 +911,16 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    fmla v1.2d, v0.2d, v2.2d
+; NONEON-NOSVE-NEXT:    fmla v5.2d, v4.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %op3 = load <4 x double>, ptr %c
@@ -437,6 +942,14 @@ define <2 x half> @fmul_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x half> %op1, %op2
   ret <2 x half> %res
 }
@@ -450,6 +963,14 @@ define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fmul <4 x half> %op1, %op2
   ret <4 x half> %res
 }
@@ -463,6 +984,18 @@ define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fmul v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fmul v1.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fmul <8 x half> %op1, %op2
   ret <8 x half> %res
 }
@@ -478,6 +1011,29 @@ define void @fmul_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
+; NONEON-NOSVE-NEXT:    fmul v4.4s, v5.4s, v4.4s
+; NONEON-NOSVE-NEXT:    fmul v5.4s, v7.4s, v6.4s
+; NONEON-NOSVE-NEXT:    fmul v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fmul v2.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
+; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fmul <16 x half> %op1, %op2
@@ -494,6 +1050,11 @@ define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmul v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x float> %op1, %op2
   ret <2 x float> %res
 }
@@ -507,6 +1068,11 @@ define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fmul <4 x float> %op1, %op2
   ret <4 x float> %res
 }
@@ -522,6 +1088,15 @@ define void @fmul_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fmul v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fmul v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fmul <8 x float> %op1, %op2
@@ -538,6 +1113,11 @@ define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmul v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x double> %op1, %op2
   ret <2 x double> %res
 }
@@ -553,6 +1133,15 @@ define void @fmul_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmul z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fmul v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fmul v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fmul <4 x double> %op1, %op2
@@ -572,6 +1161,12 @@ define <2 x half> @fneg_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v1.4h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x half> %op
   ret <2 x half> %res
 }
@@ -584,6 +1179,12 @@ define <4 x half> @fneg_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v1.4h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = fneg <4 x half> %op
   ret <4 x half> %res
 }
@@ -596,6 +1197,12 @@ define <8 x half> @fneg_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v1.8h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = fneg <8 x half> %op
   ret <8 x half> %res
 }
@@ -609,6 +1216,15 @@ define void @fneg_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fneg z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.8h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = fneg <16 x half> %op
   store <16 x half> %res, ptr %a
@@ -623,6 +1239,11 @@ define <2 x float> @fneg_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fneg v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x float> %op
   ret <2 x float> %res
 }
@@ -635,6 +1256,11 @@ define <4 x float> @fneg_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fneg <4 x float> %op
   ret <4 x float> %res
 }
@@ -648,6 +1274,14 @@ define void @fneg_v8f32(ptr %a) {
 ; CHECK-NEXT:    fneg z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fneg v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fneg v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = fneg <8 x float> %op
   store <8 x float> %res, ptr %a
@@ -662,6 +1296,11 @@ define <2 x double> @fneg_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    fneg z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x double> %op
   ret <2 x double> %res
 }
@@ -675,6 +1314,14 @@ define void @fneg_v4f64(ptr %a) {
 ; CHECK-NEXT:    fneg z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fneg v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fneg v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = fneg <4 x double> %op
   store <4 x double> %res, ptr %a
@@ -693,6 +1340,30 @@ define <2 x half> @fsqrt_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsqrt s2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s1, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fsqrt s3, s3
+; NONEON-NOSVE-NEXT:    fsqrt s4, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s2
+; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s3
+; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s4
+; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -705,6 +1376,30 @@ define <4 x half> @fsqrt_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsqrt s2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s1, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fsqrt s3, s3
+; NONEON-NOSVE-NEXT:    fsqrt s4, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s2
+; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s3
+; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s4
+; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -717,6 +1412,48 @@ define <8 x half> @fsqrt_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsqrt s2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s16, h0
+; NONEON-NOSVE-NEXT:    fcvt h0, s2
+; NONEON-NOSVE-NEXT:    fsqrt s1, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    mov v0.h[1], v1.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s3, s3
+; NONEON-NOSVE-NEXT:    fcvt h1, s3
+; NONEON-NOSVE-NEXT:    mov v0.h[2], v1.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s4, s4
+; NONEON-NOSVE-NEXT:    fcvt h1, s4
+; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s5, s5
+; NONEON-NOSVE-NEXT:    fcvt h1, s5
+; NONEON-NOSVE-NEXT:    mov v0.h[4], v1.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s6, s6
+; NONEON-NOSVE-NEXT:    fcvt h1, s6
+; NONEON-NOSVE-NEXT:    mov v0.h[5], v1.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s7, s7
+; NONEON-NOSVE-NEXT:    fcvt h1, s7
+; NONEON-NOSVE-NEXT:    mov v0.h[6], v1.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s2, s16
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    mov v0.h[7], v1.h[0]
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -730,6 +1467,89 @@ define void @fsqrt_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fsqrt z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q16, [x0]
+; NONEON-NOSVE-NEXT:    mov h0, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h17, v16.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s18, h16
+; NONEON-NOSVE-NEXT:    mov h19, v16.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
+; NONEON-NOSVE-NEXT:    mov h20, v16.h[3]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h21, v16.h[4]
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h22, v16.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fsqrt s2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s19, h19
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s20, h20
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s21, h21
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s22, h22
+; NONEON-NOSVE-NEXT:    mov h23, v16.h[6]
+; NONEON-NOSVE-NEXT:    mov h16, v16.h[7]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s23, h23
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    mov v2.h[1], v0.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s17, s17
+; NONEON-NOSVE-NEXT:    fcvt h17, s17
+; NONEON-NOSVE-NEXT:    fsqrt s18, s18
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    mov v18.h[1], v17.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s3, s3
+; NONEON-NOSVE-NEXT:    fcvt h0, s3
+; NONEON-NOSVE-NEXT:    mov v2.h[2], v0.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s19, s19
+; NONEON-NOSVE-NEXT:    fcvt h17, s19
+; NONEON-NOSVE-NEXT:    mov v18.h[2], v17.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s4, s4
+; NONEON-NOSVE-NEXT:    fcvt h0, s4
+; NONEON-NOSVE-NEXT:    mov v2.h[3], v0.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s20, s20
+; NONEON-NOSVE-NEXT:    fcvt h3, s20
+; NONEON-NOSVE-NEXT:    mov v18.h[3], v3.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s5, s5
+; NONEON-NOSVE-NEXT:    fcvt h0, s5
+; NONEON-NOSVE-NEXT:    mov v2.h[4], v0.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s21, s21
+; NONEON-NOSVE-NEXT:    fcvt h3, s21
+; NONEON-NOSVE-NEXT:    mov v18.h[4], v3.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s6, s6
+; NONEON-NOSVE-NEXT:    fcvt h0, s6
+; NONEON-NOSVE-NEXT:    mov v2.h[5], v0.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s22, s22
+; NONEON-NOSVE-NEXT:    fcvt h3, s22
+; NONEON-NOSVE-NEXT:    mov v18.h[5], v3.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s7, s7
+; NONEON-NOSVE-NEXT:    fcvt h0, s7
+; NONEON-NOSVE-NEXT:    mov v2.h[6], v0.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s23, s23
+; NONEON-NOSVE-NEXT:    fcvt h3, s23
+; NONEON-NOSVE-NEXT:    mov v18.h[6], v3.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s16, s16
+; NONEON-NOSVE-NEXT:    fcvt h3, s16
+; NONEON-NOSVE-NEXT:    mov v18.h[7], v3.h[0]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
+; NONEON-NOSVE-NEXT:    stp q18, q2, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -744,6 +1564,11 @@ define <2 x float> @fsqrt_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fsqrt v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -756,6 +1581,11 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fsqrt v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -769,6 +1599,14 @@ define void @fsqrt_v8f32(ptr %a) {
 ; CHECK-NEXT:    fsqrt z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fsqrt v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fsqrt v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -783,6 +1621,11 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    fsqrt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fsqrt v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -796,6 +1639,14 @@ define void @fsqrt_v4f64(ptr %a) {
 ; CHECK-NEXT:    fsqrt z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fsqrt v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fsqrt v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -815,6 +1666,14 @@ define <2 x half> @fsub_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x half> %op1, %op2
   ret <2 x half> %res
 }
@@ -828,6 +1687,14 @@ define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fsub <4 x half> %op1, %op2
   ret <4 x half> %res
 }
@@ -841,6 +1708,18 @@ define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fsub v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fsub v1.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fsub <8 x half> %op1, %op2
   ret <8 x half> %res
 }
@@ -856,6 +1735,29 @@ define void @fsub_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fsub z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
+; NONEON-NOSVE-NEXT:    fsub v4.4s, v5.4s, v4.4s
+; NONEON-NOSVE-NEXT:    fsub v5.4s, v7.4s, v6.4s
+; NONEON-NOSVE-NEXT:    fsub v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fsub v2.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
+; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fsub <16 x half> %op1, %op2
@@ -872,6 +1774,11 @@ define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fsub v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x float> %op1, %op2
   ret <2 x float> %res
 }
@@ -885,6 +1792,11 @@ define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fsub <4 x float> %op1, %op2
   ret <4 x float> %res
 }
@@ -900,6 +1812,15 @@ define void @fsub_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fsub z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fsub v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fsub v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fsub <8 x float> %op1, %op2
@@ -916,6 +1837,11 @@ define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fsub v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x double> %op1, %op2
   ret <2 x double> %res
 }
@@ -931,6 +1857,15 @@ define void @fsub_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fsub z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fsub v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fsub v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fsub <4 x double> %op1, %op2
@@ -950,6 +1885,11 @@ define <2 x half> @fabs_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    bic v0.4h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.fabs.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -962,6 +1902,11 @@ define <4 x half> @fabs_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    bic v0.4h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -974,6 +1919,11 @@ define <8 x half> @fabs_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    bic v0.8h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -987,6 +1937,14 @@ define void @fabs_v16f16(ptr %a) {
 ; CHECK-NEXT:    fabs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    bic v0.8h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    bic v1.8h, #128, lsl #8
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -1001,6 +1959,11 @@ define <2 x float> @fabs_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fabs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -1013,6 +1976,11 @@ define <4 x float> @fabs_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fabs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -1026,6 +1994,14 @@ define void @fabs_v8f32(ptr %a) {
 ; CHECK-NEXT:    fabs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fabs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fabs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -1040,6 +2016,11 @@ define <2 x double> @fabs_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fabs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -1053,6 +2034,14 @@ define void @fabs_v4f64(ptr %a) {
 ; CHECK-NEXT:    fabs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fabs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fabs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
index 465cc179a3b9..d4810c78cb53 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -19,6 +20,14 @@ define <2 x i16> @fcmp_oeq_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x half> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i16>
   ret <2 x i16> %sext
@@ -34,6 +43,14 @@ define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <4 x half> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i16>
   ret <4 x i16> %sext
@@ -49,6 +66,65 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    fcvt s5, h6
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[4]
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s2, s5
+; NONEON-NOSVE-NEXT:    fmov s2, w9
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    fcvt s3, h5
+; NONEON-NOSVE-NEXT:    fcvt s4, h6
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <8 x half> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i16>
   ret <8 x i16> %sext
@@ -66,6 +142,123 @@ define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, eq
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, eq
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, eq
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, eq
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, eq
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, eq
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp oeq <16 x half> %op1, %op2
@@ -84,6 +277,11 @@ define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcmeq v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x float> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i32>
   ret <2 x i32> %sext
@@ -99,6 +297,11 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <4 x float> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i32>
   ret <4 x i32> %sext
@@ -116,6 +319,15 @@ define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %cmp = fcmp oeq <8 x float> %op1, %op2
@@ -132,6 +344,11 @@ define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK-NEXT:    mov z0.d, x8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcmeq d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <1 x double> %op1, %op2
   %sext = sext <1 x i1> %cmp to <1 x i64>
   ret <1 x i64> %sext
@@ -147,6 +364,11 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcmeq v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x double> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i64>
   ret <2 x i64> %sext
@@ -164,6 +386,15 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fcmeq v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcmeq v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %cmp = fcmp oeq <4 x double> %op1, %op2
@@ -192,6 +423,139 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ueq_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h2
+; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h1
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s6, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    csinv w12, w9, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s7, s5
+; NONEON-NOSVE-NEXT:    mov h5, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    csinv w10, w9, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    csinv w11, w9, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s6, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s6, h16
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    csinv w9, w9, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s7, s5
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w13, eq
+; NONEON-NOSVE-NEXT:    csinv w13, w13, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s6, s3
+; NONEON-NOSVE-NEXT:    fcvt s3, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h7
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
+; NONEON-NOSVE-NEXT:    csetm w14, eq
+; NONEON-NOSVE-NEXT:    csinv w14, w14, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s4, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w15, eq
+; NONEON-NOSVE-NEXT:    csinv w15, w15, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s5, s3
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w16, eq
+; NONEON-NOSVE-NEXT:    csinv w16, w16, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s4, s2
+; NONEON-NOSVE-NEXT:    fcvt s4, h3
+; NONEON-NOSVE-NEXT:    fmov s2, w12
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w17, eq
+; NONEON-NOSVE-NEXT:    csinv w17, w17, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[4]
+; NONEON-NOSVE-NEXT:    fmov s3, w17
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    mov v3.h[1], w16
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
+; NONEON-NOSVE-NEXT:    mov v3.h[2], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov v3.h[3], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    fcvt s4, h6
+; NONEON-NOSVE-NEXT:    fcvt s5, h7
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w9
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v3.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w13
+; NONEON-NOSVE-NEXT:    mov v3.h[5], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w14
+; NONEON-NOSVE-NEXT:    mov v3.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w15
+; NONEON-NOSVE-NEXT:    mov v3.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ueq <16 x half> %op1, %op2
@@ -220,6 +584,139 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_one_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h2
+; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h1
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s6, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w9, mi
+; NONEON-NOSVE-NEXT:    csinv w12, w9, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s7, s5
+; NONEON-NOSVE-NEXT:    mov h5, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w9, mi
+; NONEON-NOSVE-NEXT:    csinv w10, w9, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    csetm w9, mi
+; NONEON-NOSVE-NEXT:    csinv w11, w9, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s6, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s6, h16
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w9, mi
+; NONEON-NOSVE-NEXT:    csinv w9, w9, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s7, s5
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w13, mi
+; NONEON-NOSVE-NEXT:    csinv w13, w13, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s6, s3
+; NONEON-NOSVE-NEXT:    fcvt s3, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h7
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[2]
+; NONEON-NOSVE-NEXT:    csetm w14, mi
+; NONEON-NOSVE-NEXT:    csinv w14, w14, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s4, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w15, mi
+; NONEON-NOSVE-NEXT:    csinv w15, w15, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s5, s3
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w16, mi
+; NONEON-NOSVE-NEXT:    csinv w16, w16, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s4, s2
+; NONEON-NOSVE-NEXT:    fcvt s4, h3
+; NONEON-NOSVE-NEXT:    fmov s2, w12
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w17, mi
+; NONEON-NOSVE-NEXT:    csinv w17, w17, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[4]
+; NONEON-NOSVE-NEXT:    fmov s3, w17
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    mov v3.h[1], w16
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
+; NONEON-NOSVE-NEXT:    mov v3.h[2], w8
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov v3.h[3], w8
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    fcvt s4, h6
+; NONEON-NOSVE-NEXT:    fcvt s5, h7
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w9
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v3.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w13
+; NONEON-NOSVE-NEXT:    mov v3.h[5], w8
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w14
+; NONEON-NOSVE-NEXT:    mov v3.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w15
+; NONEON-NOSVE-NEXT:    mov v3.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp one <16 x half> %op1, %op2
@@ -244,6 +741,123 @@ define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_une_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, ne
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, ne
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, ne
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, ne
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, ne
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, ne
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, ne
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, ne
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp une <16 x half> %op1, %op2
@@ -268,6 +882,123 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ogt_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, gt
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, gt
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, gt
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, gt
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, gt
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, gt
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, gt
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, gt
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ogt <16 x half> %op1, %op2
@@ -295,6 +1026,123 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ugt_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, hi
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, hi
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, hi
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, hi
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, hi
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, hi
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, hi
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, hi
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ugt <16 x half> %op1, %op2
@@ -319,6 +1167,123 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_olt_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, mi
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, mi
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, mi
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, mi
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, mi
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, mi
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, mi
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, mi
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, mi
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp olt <16 x half> %op1, %op2
@@ -346,6 +1311,123 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ult_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, lt
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, lt
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, lt
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, lt
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, lt
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, lt
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, lt
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, lt
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ult <16 x half> %op1, %op2
@@ -370,6 +1452,123 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oge_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, ge
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, ge
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, ge
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, ge
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, ge
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, ge
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, ge
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, ge
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, ge
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp oge <16 x half> %op1, %op2
@@ -397,6 +1596,123 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_uge_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, pl
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, pl
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, pl
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, pl
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, pl
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, pl
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, pl
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, pl
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, pl
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp uge <16 x half> %op1, %op2
@@ -421,6 +1737,123 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ole_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, ls
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, ls
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, ls
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, ls
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, ls
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, ls
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, ls
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, ls
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, ls
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ole <16 x half> %op1, %op2
@@ -448,6 +1881,123 @@ define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ule_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, le
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, le
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, le
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, le
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, le
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, le
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, le
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, le
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, le
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ule <16 x half> %op1, %op2
@@ -472,6 +2022,123 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_uno_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, vs
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, vs
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, vs
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, vs
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, vs
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, vs
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, vs
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, vs
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, vs
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp uno <16 x half> %op1, %op2
@@ -499,6 +2166,123 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ord_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, vc
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, vc
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, vc
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, vc
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, vc
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, vc
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, vc
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, vc
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, vc
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ord <16 x half> %op1, %op2
@@ -523,6 +2307,123 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_eq_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, eq
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, eq
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, eq
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, eq
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, eq
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, eq
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast oeq <16 x half> %op1, %op2
@@ -547,6 +2448,123 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ne_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, ne
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, ne
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, ne
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, ne
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, ne
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, ne
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, ne
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, ne
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast one <16 x half> %op1, %op2
@@ -571,6 +2589,123 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_gt_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, gt
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, gt
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, gt
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, gt
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, gt
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, gt
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, gt
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, gt
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast ogt <16 x half> %op1, %op2
@@ -595,6 +2730,123 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_lt_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, lt
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, lt
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, lt
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, lt
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, lt
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, lt
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, lt
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, lt
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast olt <16 x half> %op1, %op2
@@ -619,6 +2871,123 @@ define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ge_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, ge
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, ge
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, ge
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, ge
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, ge
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, ge
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, ge
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, ge
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, ge
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast oge <16 x half> %op1, %op2
@@ -643,6 +3012,123 @@ define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_le_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h2
+; NONEON-NOSVE-NEXT:    fcvt s7, h1
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h0, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w12, le
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w11, le
+; NONEON-NOSVE-NEXT:    fcmp s3, s0
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w9, le
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    csetm w10, le
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    csetm w13, le
+; NONEON-NOSVE-NEXT:    fcmp s7, s3
+; NONEON-NOSVE-NEXT:    fmov s7, w12
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    csetm w14, le
+; NONEON-NOSVE-NEXT:    fcmp s6, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov v7.h[1], w8
+; NONEON-NOSVE-NEXT:    csetm w15, le
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    mov v7.h[2], w11
+; NONEON-NOSVE-NEXT:    csetm w16, le
+; NONEON-NOSVE-NEXT:    fcmp s5, s2
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    csetm w17, le
+; NONEON-NOSVE-NEXT:    mov v7.h[3], w9
+; NONEON-NOSVE-NEXT:    fmov s2, w17
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w16
+; NONEON-NOSVE-NEXT:    mov v7.h[4], w10
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    mov v7.h[5], w13
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v7.h[6], w14
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    fcmp s6, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v7.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    fcmp s4, s3
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    fcmp s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    stp q2, q7, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast ole <16 x half> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
index 9bdde14e8d83..ac0b6c0e0440 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,17 @@ define void @fp_convert_combine_crash(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fp_convert_combine_crash:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov v0.4s, #8.00000000
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fmul v1.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fmul v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %f = load <8 x float>, ptr %a
   %mul.i = fmul <8 x float> %f, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00,
                                  float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
index 244a40510173..16f30adbd14e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,12 @@ define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f16_to_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %res = fpext <2 x half> %a to <2 x float>
   store <2 x float> %res, ptr %b
   ret void
@@ -31,6 +38,12 @@ define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f16_to_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %res = fpext <4 x half> %a to <4 x float>
   store <4 x float> %res, ptr %b
   ret void
@@ -48,6 +61,17 @@ define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v8f16_to_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fpext <8 x half> %a to <8 x float>
   store <8 x float> %res, ptr %b
   ret void
@@ -72,6 +96,21 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) {
 ; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    stp q2, q1, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v16f16_to_v16f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fpext <16 x half> %a to <16 x float>
   store <16 x float> %res, ptr %b
   ret void
@@ -90,6 +129,13 @@ define void @fcvt_v2f16_v2f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %res = fpext <2 x half> %op1 to <2 x float>
   store <2 x float> %res, ptr %b
@@ -104,6 +150,13 @@ define void @fcvt_v4f16_v4f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fpext <4 x half> %op1 to <4 x float>
   store <4 x float> %res, ptr %b
@@ -121,6 +174,18 @@ define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z1.s, p0/m, z1.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fpext <8 x half> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -145,6 +210,22 @@ define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fpext <16 x half> %op1 to <16 x float>
   store <16 x float> %res, ptr %b
@@ -162,6 +243,13 @@ define void @fcvt_v1f16_v1f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt d0, h0
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v1f16_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x half>, ptr %a
   %res = fpext <1 x half> %op1 to <1 x double>
   store <1 x double> %res, ptr %b
@@ -176,6 +264,14 @@ define void @fcvt_v2f16_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %res = fpext <2 x half> %op1 to <2 x double>
   store <2 x double> %res, ptr %b
@@ -193,6 +289,19 @@ define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fpext <4 x half> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -217,6 +326,26 @@ define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
+; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fpext <8 x half> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -258,6 +387,38 @@ define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q4, q0, [x1, #32]
 ; CHECK-NEXT:    stp q1, q2, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr d6, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtl v5.2d, v5.2s
+; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
+; NONEON-NOSVE-NEXT:    fcvtl v4.2d, v4.2s
+; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v7.2s
+; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
+; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v6.2s
+; NONEON-NOSVE-NEXT:    stp q2, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fpext <16 x half> %op1 to <16 x double>
   store <16 x double> %res, ptr %b
@@ -275,6 +436,13 @@ define void @fcvt_v1f32_v1f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt d0, s0
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v1f32_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x float>, ptr %a
   %res = fpext <1 x float> %op1 to <1 x double>
   store <1 x double> %res, ptr %b
@@ -289,6 +457,13 @@ define void @fcvt_v2f32_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %res = fpext <2 x float> %op1 to <2 x double>
   store <2 x double> %res, ptr %b
@@ -306,6 +481,18 @@ define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z1.d, p0/m, z1.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fpext <4 x float> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -330,6 +517,22 @@ define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
+; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fpext <8 x float> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -348,6 +551,13 @@ define void @fcvt_v2f32_v2f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str s0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %res = fptrunc <2 x float> %op1 to <2 x half>
   store <2 x half> %res, ptr %b
@@ -362,6 +572,13 @@ define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptrunc <4 x float> %op1 to <4 x half>
   store <4 x half> %res, ptr %b
@@ -379,6 +596,14 @@ define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1, x8, lsl #1]
 ; CHECK-NEXT:    st1h { z1.s }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptrunc <8 x float> %op1 to <8 x half>
   store <8 x half> %res, ptr %b
@@ -397,6 +622,13 @@ define void @fcvt_v1f64_v1f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
 ; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
+; NONEON-NOSVE-NEXT:    str h0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x double>, ptr %a
   %res = fptrunc <1 x double> %op1 to <1 x half>
   store <1 x half> %res, ptr %b
@@ -411,6 +643,14 @@ define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
 ; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    fcvtxn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str s0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %res = fptrunc <2 x double> %op1 to <2 x half>
   store <2 x half> %res, ptr %b
@@ -428,6 +668,15 @@ define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    st1h { z0.d }, p0, [x1, x8, lsl #1]
 ; CHECK-NEXT:    st1h { z1.d }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtxn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtxn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptrunc <4 x double> %op1 to <4 x half>
   store <4 x half> %res, ptr %b
@@ -446,6 +695,13 @@ define void @fcvt_v1f64_v1f32(<1 x double> %op1, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str s0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %res = fptrunc <1 x double> %op1 to <1 x float>
   store <1 x float> %res, ptr %b
   ret void
@@ -459,6 +715,12 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %res = fptrunc <2 x double> %op1 to <2 x float>
   store <2 x float> %res, ptr %b
   ret void
@@ -475,6 +737,14 @@ define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x1, x8, lsl #2]
 ; CHECK-NEXT:    st1w { z1.d }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptrunc <4 x double> %op1 to <4 x float>
   store <4 x float> %res, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
index cbe71d715a8f..44d7116e5f87 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,18 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <4 x half> %op1, %op2
   %res = fadd contract <4 x half> %mul, %op3
   ret <4 x half> %res
@@ -32,6 +45,26 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fmul v3.4s, v4.4s, v3.4s
+; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v3.4s
+; NONEON-NOSVE-NEXT:    fadd v1.4s, v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <8 x half> %op1, %op2
   %res = fadd contract <8 x half> %mul, %op3
   ret <8 x half> %res
@@ -49,6 +82,46 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
+; NONEON-NOSVE-NEXT:    fmul v4.4s, v5.4s, v4.4s
+; NONEON-NOSVE-NEXT:    fmul v5.4s, v7.4s, v6.4s
+; NONEON-NOSVE-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fmul v2.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
+; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x2]
+; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v3.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
+; NONEON-NOSVE-NEXT:    fadd v4.4s, v5.4s, v4.4s
+; NONEON-NOSVE-NEXT:    fadd v5.4s, v7.4s, v6.4s
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fadd v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
+; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %op3 = load <16 x half>, ptr %c
@@ -68,6 +141,12 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmla v2.2s, v1.2s, v0.2s
+; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <2 x float> %op1, %op2
   %res = fadd contract <2 x float> %mul, %op3
   ret <2 x float> %res
@@ -83,6 +162,12 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmla v2.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <4 x float> %op1, %op2
   %res = fadd contract <4 x float> %mul, %op3
   ret <4 x float> %res
@@ -100,6 +185,16 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    fmla v1.4s, v0.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fmla v5.4s, v4.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %op3 = load <8 x float>, ptr %c
@@ -114,6 +209,11 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmadd d0, d0, d1, d2
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmadd d0, d0, d1, d2
+; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <1 x double> %op1, %op2
   %res = fadd contract <1 x double> %mul, %op3
   ret <1 x double> %res
@@ -129,6 +229,12 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
 ; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmla v2.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <2 x double> %op1, %op2
   %res = fadd contract <2 x double> %mul, %op3
   ret <2 x double> %res
@@ -146,6 +252,16 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q4, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q5, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    fmla v1.2d, v0.2d, v2.2d
+; NONEON-NOSVE-NEXT:    fmla v5.2d, v4.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q1, q5, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %op3 = load <4 x double>, ptr %c
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
index 94a74763aa0e..bc7659c06ad0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,38 @@ define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h1
+; NONEON-NOSVE-NEXT:    fcvt s7, h0
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fmaxnm s5, s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s4, s3
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s5
+; NONEON-NOSVE-NEXT:    fcvt s4, h6
+; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s4, s1
+; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
@@ -30,6 +63,64 @@ define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s4
+; NONEON-NOSVE-NEXT:    fmaxnm s4, s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fmaxnm s5, s5, s16
+; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s3, h6
+; NONEON-NOSVE-NEXT:    fcvt s6, h7
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s6, s3
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h6
+; NONEON-NOSVE-NEXT:    fmaxnm s6, s16, s7
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
+; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h3, s6
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h3, s4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
+; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
@@ -45,6 +136,119 @@ define void @fmaxnm_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmaxnm z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
+; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s19, h0
+; NONEON-NOSVE-NEXT:    fcvt s20, h3
+; NONEON-NOSVE-NEXT:    fcvt s21, h2
+; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fmaxnm s4, s19, s4
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
+; NONEON-NOSVE-NEXT:    fmaxnm s20, s21, s20
+; NONEON-NOSVE-NEXT:    fcvt s21, h22
+; NONEON-NOSVE-NEXT:    fcvt s22, h23
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
+; NONEON-NOSVE-NEXT:    fmaxnm s5, s7, s5
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
+; NONEON-NOSVE-NEXT:    fmaxnm s6, s16, s6
+; NONEON-NOSVE-NEXT:    fmaxnm s16, s18, s17
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fcvt s18, h19
+; NONEON-NOSVE-NEXT:    fcvt s19, h24
+; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h17, s5
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt h5, s20
+; NONEON-NOSVE-NEXT:    fmaxnm s20, s22, s21
+; NONEON-NOSVE-NEXT:    fcvt h16, s16
+; NONEON-NOSVE-NEXT:    fcvt s21, h23
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
+; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
+; NONEON-NOSVE-NEXT:    fmaxnm s7, s18, s7
+; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
+; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h16, s20
+; NONEON-NOSVE-NEXT:    fmaxnm s19, s21, s19
+; NONEON-NOSVE-NEXT:    fcvt s20, h23
+; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s6, h17
+; NONEON-NOSVE-NEXT:    fcvt s17, h22
+; NONEON-NOSVE-NEXT:    fcvt h7, s7
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
+; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h16, s19
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmaxnm s6, s17, s6
+; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fmaxnm s18, s20, s18
+; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
+; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s7, h22
+; NONEON-NOSVE-NEXT:    fcvt s22, h23
+; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s16, h21
+; NONEON-NOSVE-NEXT:    fcvt s21, h24
+; NONEON-NOSVE-NEXT:    fcvt s19, h19
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fcvt s23, h25
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    fcvt s20, h20
+; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
+; NONEON-NOSVE-NEXT:    fmaxnm s7, s22, s7
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s16, s21, s16
+; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
+; NONEON-NOSVE-NEXT:    fmaxnm s6, s19, s17
+; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
+; NONEON-NOSVE-NEXT:    fmaxnm s17, s23, s20
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h7, s7
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h16, s16
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h3, s17
+; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
+; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
+; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
+; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
+; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = call <16 x half> @llvm.maxnum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -61,6 +265,11 @@ define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmaxnm v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
@@ -74,6 +283,11 @@ define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
@@ -89,6 +303,15 @@ define void @fmaxnm_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmaxnm z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fmaxnm v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fmaxnm v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -101,6 +324,11 @@ define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmaxnm d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.maxnum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
@@ -114,6 +342,11 @@ define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmaxnm v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
@@ -129,6 +362,15 @@ define void @fmaxnm_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmaxnm z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fmaxnm v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fmaxnm v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -149,6 +391,38 @@ define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h1
+; NONEON-NOSVE-NEXT:    fcvt s7, h0
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fminnm s5, s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    fminnm s3, s4, s3
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s5
+; NONEON-NOSVE-NEXT:    fcvt s4, h6
+; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    fminnm s1, s4, s1
+; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.minnum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
@@ -162,6 +436,64 @@ define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fminnm s3, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s4
+; NONEON-NOSVE-NEXT:    fminnm s4, s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fminnm s5, s5, s16
+; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s3, h6
+; NONEON-NOSVE-NEXT:    fcvt s6, h7
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
+; NONEON-NOSVE-NEXT:    fminnm s3, s6, s3
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h6
+; NONEON-NOSVE-NEXT:    fminnm s6, s16, s7
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
+; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h3, s6
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h3, s4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
+; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.minnum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
@@ -177,6 +509,119 @@ define void @fminnm_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
+; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s19, h0
+; NONEON-NOSVE-NEXT:    fcvt s20, h3
+; NONEON-NOSVE-NEXT:    fcvt s21, h2
+; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fminnm s4, s19, s4
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
+; NONEON-NOSVE-NEXT:    fminnm s20, s21, s20
+; NONEON-NOSVE-NEXT:    fcvt s21, h22
+; NONEON-NOSVE-NEXT:    fcvt s22, h23
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
+; NONEON-NOSVE-NEXT:    fminnm s5, s7, s5
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
+; NONEON-NOSVE-NEXT:    fminnm s6, s16, s6
+; NONEON-NOSVE-NEXT:    fminnm s16, s18, s17
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fcvt s18, h19
+; NONEON-NOSVE-NEXT:    fcvt s19, h24
+; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h17, s5
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt h5, s20
+; NONEON-NOSVE-NEXT:    fminnm s20, s22, s21
+; NONEON-NOSVE-NEXT:    fcvt h16, s16
+; NONEON-NOSVE-NEXT:    fcvt s21, h23
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
+; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
+; NONEON-NOSVE-NEXT:    fminnm s7, s18, s7
+; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
+; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h16, s20
+; NONEON-NOSVE-NEXT:    fminnm s19, s21, s19
+; NONEON-NOSVE-NEXT:    fcvt s20, h23
+; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s6, h17
+; NONEON-NOSVE-NEXT:    fcvt s17, h22
+; NONEON-NOSVE-NEXT:    fcvt h7, s7
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
+; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h16, s19
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fminnm s6, s17, s6
+; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fminnm s18, s20, s18
+; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
+; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s7, h22
+; NONEON-NOSVE-NEXT:    fcvt s22, h23
+; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s16, h21
+; NONEON-NOSVE-NEXT:    fcvt s21, h24
+; NONEON-NOSVE-NEXT:    fcvt s19, h19
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fcvt s23, h25
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    fcvt s20, h20
+; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
+; NONEON-NOSVE-NEXT:    fminnm s7, s22, s7
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s16, s21, s16
+; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
+; NONEON-NOSVE-NEXT:    fminnm s6, s19, s17
+; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
+; NONEON-NOSVE-NEXT:    fminnm s17, s23, s20
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h7, s7
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h16, s16
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h3, s17
+; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
+; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
+; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
+; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
+; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = call <16 x half> @llvm.minnum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -193,6 +638,11 @@ define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fminnm v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.minnum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
@@ -206,6 +656,11 @@ define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fminnm v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.minnum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
@@ -221,6 +676,15 @@ define void @fminnm_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fminnm v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fminnm v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = call <8 x float> @llvm.minnum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -233,6 +697,11 @@ define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fminnm d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.minnum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
@@ -246,6 +715,11 @@ define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fminnm v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.minnum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
@@ -261,6 +735,15 @@ define void @fminnm_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fminnm v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fminnm v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = call <4 x double> @llvm.minnum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -281,6 +764,38 @@ define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h1
+; NONEON-NOSVE-NEXT:    fcvt s7, h0
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fmax s5, s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    fmax s3, s4, s3
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s5
+; NONEON-NOSVE-NEXT:    fcvt s4, h6
+; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    fmax s1, s4, s1
+; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
@@ -294,6 +809,64 @@ define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fmax s3, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s4
+; NONEON-NOSVE-NEXT:    fmax s4, s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fmax s5, s5, s16
+; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s3, h6
+; NONEON-NOSVE-NEXT:    fcvt s6, h7
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
+; NONEON-NOSVE-NEXT:    fmax s3, s6, s3
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h6
+; NONEON-NOSVE-NEXT:    fmax s6, s16, s7
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
+; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h3, s6
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h3, s4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
+; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.maximum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
@@ -309,6 +882,119 @@ define void @fmax_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmax z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
+; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s19, h0
+; NONEON-NOSVE-NEXT:    fcvt s20, h3
+; NONEON-NOSVE-NEXT:    fcvt s21, h2
+; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fmax s4, s19, s4
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
+; NONEON-NOSVE-NEXT:    fmax s20, s21, s20
+; NONEON-NOSVE-NEXT:    fcvt s21, h22
+; NONEON-NOSVE-NEXT:    fcvt s22, h23
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
+; NONEON-NOSVE-NEXT:    fmax s5, s7, s5
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
+; NONEON-NOSVE-NEXT:    fmax s6, s16, s6
+; NONEON-NOSVE-NEXT:    fmax s16, s18, s17
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fcvt s18, h19
+; NONEON-NOSVE-NEXT:    fcvt s19, h24
+; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h17, s5
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt h5, s20
+; NONEON-NOSVE-NEXT:    fmax s20, s22, s21
+; NONEON-NOSVE-NEXT:    fcvt h16, s16
+; NONEON-NOSVE-NEXT:    fcvt s21, h23
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
+; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
+; NONEON-NOSVE-NEXT:    fmax s7, s18, s7
+; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
+; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h16, s20
+; NONEON-NOSVE-NEXT:    fmax s19, s21, s19
+; NONEON-NOSVE-NEXT:    fcvt s20, h23
+; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s6, h17
+; NONEON-NOSVE-NEXT:    fcvt s17, h22
+; NONEON-NOSVE-NEXT:    fcvt h7, s7
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
+; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h16, s19
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmax s6, s17, s6
+; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fmax s18, s20, s18
+; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
+; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s7, h22
+; NONEON-NOSVE-NEXT:    fcvt s22, h23
+; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s16, h21
+; NONEON-NOSVE-NEXT:    fcvt s21, h24
+; NONEON-NOSVE-NEXT:    fcvt s19, h19
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fcvt s23, h25
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    fcvt s20, h20
+; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
+; NONEON-NOSVE-NEXT:    fmax s7, s22, s7
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s16, s21, s16
+; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
+; NONEON-NOSVE-NEXT:    fmax s6, s19, s17
+; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
+; NONEON-NOSVE-NEXT:    fmax s17, s23, s20
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h7, s7
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h16, s16
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h3, s17
+; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
+; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
+; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
+; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
+; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = call <16 x half> @llvm.maximum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -325,6 +1011,11 @@ define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmax v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
@@ -338,6 +1029,11 @@ define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmax v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.maximum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
@@ -353,6 +1049,15 @@ define void @fmax_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmax z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fmax v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fmax v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = call <8 x float> @llvm.maximum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -365,6 +1070,11 @@ define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmax d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmax d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.maximum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
@@ -378,6 +1088,11 @@ define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmax v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.maximum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
@@ -393,6 +1108,15 @@ define void @fmax_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmax z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fmax v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fmax v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = call <4 x double> @llvm.maximum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -413,6 +1137,38 @@ define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h1
+; NONEON-NOSVE-NEXT:    fcvt s7, h0
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h4
+; NONEON-NOSVE-NEXT:    fcvt s4, h5
+; NONEON-NOSVE-NEXT:    fmin s5, s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[3]
+; NONEON-NOSVE-NEXT:    fmin s3, s4, s3
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s5
+; NONEON-NOSVE-NEXT:    fcvt s4, h6
+; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    fmin s1, s4, s1
+; NONEON-NOSVE-NEXT:    mov v0.h[2], v2.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    mov v0.h[3], v1.h[0]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
@@ -426,6 +1182,64 @@ define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fmin s3, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s4
+; NONEON-NOSVE-NEXT:    fmin s4, s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fmin s5, s5, s16
+; NONEON-NOSVE-NEXT:    mov h16, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    mov v2.h[1], v3.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s3, h6
+; NONEON-NOSVE-NEXT:    fcvt s6, h7
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    mov v2.h[2], v4.h[0]
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
+; NONEON-NOSVE-NEXT:    fmin s3, s6, s3
+; NONEON-NOSVE-NEXT:    mov h6, v0.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], v5.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h6
+; NONEON-NOSVE-NEXT:    fmin s6, s16, s7
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov v2.h[4], v3.h[0]
+; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h3, s6
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    mov v2.h[5], v3.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h3, s4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    mov v2.h[6], v3.h[0]
+; NONEON-NOSVE-NEXT:    mov v2.h[7], v0.h[0]
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.minimum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
@@ -441,6 +1255,119 @@ define void @fmin_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmin z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[1]
+; NONEON-NOSVE-NEXT:    mov h16, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h18, v2.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h17, v3.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s19, h0
+; NONEON-NOSVE-NEXT:    fcvt s20, h3
+; NONEON-NOSVE-NEXT:    fcvt s21, h2
+; NONEON-NOSVE-NEXT:    mov h22, v3.h[2]
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fmin s4, s19, s4
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov h24, v3.h[3]
+; NONEON-NOSVE-NEXT:    fmin s20, s21, s20
+; NONEON-NOSVE-NEXT:    fcvt s21, h22
+; NONEON-NOSVE-NEXT:    fcvt s22, h23
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h25, v2.h[6]
+; NONEON-NOSVE-NEXT:    fmin s5, s7, s5
+; NONEON-NOSVE-NEXT:    mov h7, v1.h[3]
+; NONEON-NOSVE-NEXT:    fmin s6, s16, s6
+; NONEON-NOSVE-NEXT:    fmin s16, s18, s17
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fcvt s18, h19
+; NONEON-NOSVE-NEXT:    fcvt s19, h24
+; NONEON-NOSVE-NEXT:    mov h24, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h17, s5
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt h5, s20
+; NONEON-NOSVE-NEXT:    fmin s20, s22, s21
+; NONEON-NOSVE-NEXT:    fcvt h16, s16
+; NONEON-NOSVE-NEXT:    fcvt s21, h23
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    mov h22, v0.h[4]
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov v4.h[1], v17.h[0]
+; NONEON-NOSVE-NEXT:    mov h17, v1.h[4]
+; NONEON-NOSVE-NEXT:    fmin s7, s18, s7
+; NONEON-NOSVE-NEXT:    mov h18, v3.h[4]
+; NONEON-NOSVE-NEXT:    mov v5.h[1], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h16, s20
+; NONEON-NOSVE-NEXT:    fmin s19, s21, s19
+; NONEON-NOSVE-NEXT:    fcvt s20, h23
+; NONEON-NOSVE-NEXT:    mov h21, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h23, v2.h[5]
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    mov v4.h[2], v6.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s6, h17
+; NONEON-NOSVE-NEXT:    fcvt s17, h22
+; NONEON-NOSVE-NEXT:    fcvt h7, s7
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    mov h22, v3.h[5]
+; NONEON-NOSVE-NEXT:    mov v5.h[2], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h16, s19
+; NONEON-NOSVE-NEXT:    mov h19, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmin s6, s17, s6
+; NONEON-NOSVE-NEXT:    mov h17, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fmin s18, s20, s18
+; NONEON-NOSVE-NEXT:    mov h20, v3.h[6]
+; NONEON-NOSVE-NEXT:    mov v4.h[3], v7.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s7, h22
+; NONEON-NOSVE-NEXT:    fcvt s22, h23
+; NONEON-NOSVE-NEXT:    mov v5.h[3], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt s16, h21
+; NONEON-NOSVE-NEXT:    fcvt s21, h24
+; NONEON-NOSVE-NEXT:    fcvt s19, h19
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fcvt s23, h25
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    fcvt s20, h20
+; NONEON-NOSVE-NEXT:    mov h3, v3.h[7]
+; NONEON-NOSVE-NEXT:    fmin s7, s22, s7
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s16, s21, s16
+; NONEON-NOSVE-NEXT:    mov v4.h[4], v6.h[0]
+; NONEON-NOSVE-NEXT:    fmin s6, s19, s17
+; NONEON-NOSVE-NEXT:    mov v5.h[4], v18.h[0]
+; NONEON-NOSVE-NEXT:    fmin s17, s23, s20
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h7, s7
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h16, s16
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h3, s17
+; NONEON-NOSVE-NEXT:    mov v5.h[5], v7.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    mov v4.h[5], v16.h[0]
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    mov v5.h[6], v3.h[0]
+; NONEON-NOSVE-NEXT:    mov v4.h[6], v6.h[0]
+; NONEON-NOSVE-NEXT:    mov v5.h[7], v1.h[0]
+; NONEON-NOSVE-NEXT:    mov v4.h[7], v0.h[0]
+; NONEON-NOSVE-NEXT:    stp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = call <16 x half> @llvm.minimum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -457,6 +1384,11 @@ define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmin v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
@@ -470,6 +1402,11 @@ define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.minimum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
@@ -485,6 +1422,15 @@ define void @fmin_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmin z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fmin v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fmin v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = call <8 x float> @llvm.minimum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -497,6 +1443,11 @@ define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmin d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmin d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.minimum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
@@ -510,6 +1461,11 @@ define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmin v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.minimum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
@@ -525,6 +1481,15 @@ define void @fmin_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmin z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fmin v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fmin v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = call <4 x double> @llvm.minimum.v4f64(<4 x double> %op1, <4 x double> %op2)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
index b56e67d95ba0..fdb81b8e5fe1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=FA64
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=NO-FA64
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -26,6 +27,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) {
 ; NO-FA64-NEXT:    fadd h0, h0, h2
 ; NO-FA64-NEXT:    fadd h0, h0, h1
 ; NO-FA64-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
index df9613a30e40..74a5db4b38e0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -19,6 +20,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) {
 ; CHECK-NEXT:    fadd h0, h0, h2
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
 }
@@ -43,6 +68,49 @@ define half @fadda_v8f16(half %start, <8 x half> %a) {
 ; CHECK-NEXT:    fadd h0, h0, h2
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
   ret half %res
 }
@@ -83,6 +151,90 @@ define half @fadda_v16f16(half %start, ptr %a) {
 ; CHECK-NEXT:    fadd h0, h0, h2
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    fcvt s2, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
   ret half %res
@@ -96,6 +248,14 @@ define float @fadda_v2f32(float %start, <2 x float> %a) {
 ; CHECK-NEXT:    mov z1.s, z1.s[1]
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
   ret float %res
 }
@@ -112,6 +272,17 @@ define float @fadda_v4f32(float %start, <4 x float> %a) {
 ; CHECK-NEXT:    fadd s0, s0, s2
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    mov s3, v1.s[2]
+; NONEON-NOSVE-NEXT:    mov s1, v1.s[3]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s3
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
   ret float %res
 }
@@ -136,6 +307,26 @@ define float @fadda_v8f32(float %start, ptr %a) {
 ; CHECK-NEXT:    fadd s0, s0, s2
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    mov s3, v1.s[2]
+; NONEON-NOSVE-NEXT:    mov s1, v1.s[3]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s3
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    mov s2, v1.s[1]
+; NONEON-NOSVE-NEXT:    mov s3, v1.s[2]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    mov s1, v1.s[3]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s3
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
   ret float %res
@@ -146,6 +337,11 @@ define double @fadda_v1f64(double %start, <1 x double> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
   ret double %res
 }
@@ -158,6 +354,13 @@ define double @fadda_v2f64(double %start, <2 x double> %a) {
 ; CHECK-NEXT:    mov z1.d, z1.d[1]
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov d2, v1.d[1]
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
   ret double %res
 }
@@ -174,6 +377,17 @@ define double @fadda_v4f64(double %start, ptr %a) {
 ; CHECK-NEXT:    mov z1.d, z1.d[1]
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov d2, v3.d[1]
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d3
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
+; NONEON-NOSVE-NEXT:    mov d2, v1.d[1]
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
   ret double %res
@@ -191,6 +405,30 @@ define half @faddv_v4f16(half %start, <4 x half> %a) {
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s3, s2
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
 }
@@ -203,6 +441,49 @@ define half @faddv_v8f16(half %start, <8 x half> %a) {
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s3, s2
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
   ret half %res
 }
@@ -216,6 +497,58 @@ define half @faddv_v16f16(half %start, ptr %a) {
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fadd v3.4s, v4.4s, v3.4s
+; NONEON-NOSVE-NEXT:    fadd v1.4s, v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v1.4s
+; NONEON-NOSVE-NEXT:    mov h1, v2.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s3, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s3, s1
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
+; NONEON-NOSVE-NEXT:    mov h3, v2.h[6]
+; NONEON-NOSVE-NEXT:    mov h2, v2.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
   ret half %res
@@ -229,6 +562,12 @@ define float @faddv_v2f32(float %start, <2 x float> %a) {
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    faddp s1, v1.2s
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ret
   %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
   ret float %res
 }
@@ -241,6 +580,13 @@ define float @faddv_v4f32(float %start, <4 x float> %a) {
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    faddp v1.4s, v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    faddp s1, v1.2s
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ret
   %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
   ret float %res
 }
@@ -254,6 +600,15 @@ define float @faddv_v8f32(float %start, ptr %a) {
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    fadd v1.4s, v2.4s, v1.4s
+; NONEON-NOSVE-NEXT:    faddp v1.4s, v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    faddp s1, v1.2s
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
   ret float %res
@@ -264,6 +619,11 @@ define double @faddv_v1f64(double %start, <1 x double> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
   ret double %res
 }
@@ -276,6 +636,12 @@ define double @faddv_v2f64(double %start, <2 x double> %a) {
 ; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    faddp d1, v1.2d
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
   ret double %res
 }
@@ -289,6 +655,14 @@ define double @faddv_v4f64(double %start, ptr %a) {
 ; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    fadd v1.2d, v2.2d, v1.2d
+; NONEON-NOSVE-NEXT:    faddp d1, v1.2d
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
   ret double %res
@@ -306,6 +680,26 @@ define half @fmaxv_v4f16(<4 x half> %a) {
 ; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s2, s1
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
   ret half %res
 }
@@ -318,6 +712,45 @@ define half @fmaxv_v8f16(<8 x half> %a) {
 ; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s2, s1
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
   ret half %res
 }
@@ -331,6 +764,85 @@ define half @fmaxv_v16f16(ptr %a) {
 ; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s5, s3
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s4, s2
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s5, s3
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmaxnm s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h3, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
   ret half %res
@@ -344,6 +856,11 @@ define float @fmaxv_v2f32(<2 x float> %a) {
 ; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmaxnmp s0, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
   ret float %res
 }
@@ -356,6 +873,11 @@ define float @fmaxv_v4f32(<4 x float> %a) {
 ; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmaxnmv s0, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
   ret float %res
 }
@@ -369,6 +891,13 @@ define float @fmaxv_v8f32(ptr %a) {
 ; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    fmaxnm v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fmaxnmv s0, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
   ret float %res
@@ -378,6 +907,10 @@ define double @fmaxv_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: fmaxv_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
   ret double %res
 }
@@ -390,6 +923,11 @@ define double @fmaxv_v2f64(<2 x double> %a) {
 ; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmaxnmp d0, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
   ret double %res
 }
@@ -403,6 +941,13 @@ define double @fmaxv_v4f64(ptr %a) {
 ; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    fmaxnm v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fmaxnmp d0, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
   ret double %res
@@ -420,6 +965,26 @@ define half @fminv_v4f16(<4 x half> %a) {
 ; CHECK-NEXT:    fminnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s1, s2, s1
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
   ret half %res
 }
@@ -432,6 +997,45 @@ define half @fminv_v8f16(<8 x half> %a) {
 ; CHECK-NEXT:    fminnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s1, s2, s1
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
   ret half %res
 }
@@ -445,6 +1049,85 @@ define half @fminv_v16f16(ptr %a) {
 ; CHECK-NEXT:    fminnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fminnm s3, s5, s3
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
+; NONEON-NOSVE-NEXT:    fminnm s2, s4, s2
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
+; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fminnm s3, s5, s3
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
+; NONEON-NOSVE-NEXT:    fminnm s2, s2, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fminnm s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h3, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fminnm s3, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
   ret half %res
@@ -458,6 +1141,11 @@ define float @fminv_v2f32(<2 x float> %a) {
 ; CHECK-NEXT:    fminnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fminnmp s0, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
   ret float %res
 }
@@ -470,6 +1158,11 @@ define float @fminv_v4f32(<4 x float> %a) {
 ; CHECK-NEXT:    fminnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fminnmv s0, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
   ret float %res
 }
@@ -483,6 +1176,13 @@ define float @fminv_v8f32(ptr %a) {
 ; CHECK-NEXT:    fminnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    fminnm v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fminnmv s0, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
   ret float %res
@@ -492,6 +1192,10 @@ define double @fminv_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: fminv_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
   ret double %res
 }
@@ -504,6 +1208,11 @@ define double @fminv_v2f64(<2 x double> %a) {
 ; CHECK-NEXT:    fminnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fminnmp d0, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
   ret double %res
 }
@@ -517,6 +1226,13 @@ define double @fminv_v4f64(ptr %a) {
 ; CHECK-NEXT:    fminnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    fminnm v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fminnmp d0, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
   ret double %res
@@ -534,6 +1250,26 @@ define half @fmaximumv_v4f16(<4 x half> %a) {
 ; CHECK-NEXT:    fmaxv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s1, s2, s1
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a)
   ret half %res
 }
@@ -546,6 +1282,45 @@ define half @fmaximumv_v8f16(<8 x half> %a) {
 ; CHECK-NEXT:    fmaxv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s1, s2, s1
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> %a)
   ret half %res
 }
@@ -559,6 +1334,85 @@ define half @fmaximumv_v16f16(ptr %a) {
 ; CHECK-NEXT:    fmaxv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmax s3, s5, s3
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
+; NONEON-NOSVE-NEXT:    fmax s2, s4, s2
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
+; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmax s3, s5, s3
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
+; NONEON-NOSVE-NEXT:    fmax s2, s2, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmax s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h3, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
+; NONEON-NOSVE-NEXT:    fmax s3, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> %op)
   ret half %res
@@ -572,6 +1426,11 @@ define float @fmaximumv_v2f32(<2 x float> %a) {
 ; CHECK-NEXT:    fmaxv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmaxp s0, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a)
   ret float %res
 }
@@ -584,6 +1443,11 @@ define float @fmaximumv_v4f32(<4 x float> %a) {
 ; CHECK-NEXT:    fmaxv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmaxv s0, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a)
   ret float %res
 }
@@ -597,6 +1461,13 @@ define float @fmaximumv_v8f32(ptr %a) {
 ; CHECK-NEXT:    fmaxv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    fmax v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fmaxv s0, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %op)
   ret float %res
@@ -606,6 +1477,10 @@ define double @fmaximumv_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: fmaximumv_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmaximum.v1f64(<1 x double> %a)
   ret double %res
 }
@@ -618,6 +1493,11 @@ define double @fmaximumv_v2f64(<2 x double> %a) {
 ; CHECK-NEXT:    fmaxv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmaxp d0, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a)
   ret double %res
 }
@@ -631,6 +1511,13 @@ define double @fmaximumv_v4f64(ptr %a) {
 ; CHECK-NEXT:    fmaxv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    fmax v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fmaxp d0, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %op)
   ret double %res
@@ -648,6 +1535,26 @@ define half @fminimumv_v4f16(<4 x half> %a) {
 ; CHECK-NEXT:    fminv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s1, s2, s1
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %a)
   ret half %res
 }
@@ -660,6 +1567,45 @@ define half @fminimumv_v8f16(<8 x half> %a) {
 ; CHECK-NEXT:    fminv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s1, s2, s1
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> %a)
   ret half %res
 }
@@ -673,6 +1619,85 @@ define half @fminimumv_v16f16(ptr %a) {
 ; CHECK-NEXT:    fminv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s4, h1
+; NONEON-NOSVE-NEXT:    fcvt s5, h0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmin s3, s5, s3
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[3]
+; NONEON-NOSVE-NEXT:    fmin s2, s4, s2
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[3]
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[4]
+; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
+; NONEON-NOSVE-NEXT:    mov h3, v1.h[4]
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmin s3, s5, s3
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
+; NONEON-NOSVE-NEXT:    fmin s2, s2, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmin s4, s5, s4
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[6]
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[7]
+; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h3, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[7]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
+; NONEON-NOSVE-NEXT:    fmin s3, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> %op)
   ret half %res
@@ -686,6 +1711,11 @@ define float @fminimumv_v2f32(<2 x float> %a) {
 ; CHECK-NEXT:    fminv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fminp s0, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %a)
   ret float %res
 }
@@ -698,6 +1728,11 @@ define float @fminimumv_v4f32(<4 x float> %a) {
 ; CHECK-NEXT:    fminv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fminv s0, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
   ret float %res
 }
@@ -711,6 +1746,13 @@ define float @fminimumv_v8f32(ptr %a) {
 ; CHECK-NEXT:    fminv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    fmin v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fminv s0, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %op)
   ret float %res
@@ -720,6 +1762,10 @@ define double @fminimumv_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: fminimumv_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fminimum.v1f64(<1 x double> %a)
   ret double %res
 }
@@ -732,6 +1778,11 @@ define double @fminimumv_v2f64(<2 x double> %a) {
 ; CHECK-NEXT:    fminv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fminp d0, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a)
   ret double %res
 }
@@ -745,6 +1796,13 @@ define double @fminimumv_v4f64(ptr %a) {
 ; CHECK-NEXT:    fminv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    fmin v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fminp d0, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %op)
   ret double %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
index 7ddc641f366c..454683865eb9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -16,6 +17,13 @@ define <2 x half> @frintp_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.ceil.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -28,6 +36,13 @@ define <4 x half> @frintp_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.ceil.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -40,6 +55,16 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
+; NONEON-NOSVE-NEXT:    frintp v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
+; NONEON-NOSVE-NEXT:    frintp v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.ceil.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -53,6 +78,24 @@ define void @frintp_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintp z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    frintp v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    frintp v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frintp v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.ceil.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -67,6 +110,11 @@ define <2 x float> @frintp_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintp z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintp v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.ceil.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -79,6 +127,11 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintp z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -92,6 +145,14 @@ define void @frintp_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintp z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frintp v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frintp v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -103,6 +164,11 @@ define <1 x double> @frintp_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintp d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintp d0, d0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.ceil.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -115,6 +181,11 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintp z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintp v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.ceil.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -128,6 +199,14 @@ define void @frintp_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintp z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frintp v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    frintp v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.ceil.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -146,6 +225,13 @@ define <2 x half> @frintm_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.floor.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -158,6 +244,13 @@ define <4 x half> @frintm_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.floor.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -170,6 +263,16 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
+; NONEON-NOSVE-NEXT:    frintm v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
+; NONEON-NOSVE-NEXT:    frintm v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.floor.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -183,6 +286,24 @@ define void @frintm_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintm z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    frintm v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    frintm v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frintm v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.floor.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -197,6 +318,11 @@ define <2 x float> @frintm_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintm v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.floor.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -209,6 +335,11 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.floor.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -222,6 +353,14 @@ define void @frintm_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintm z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frintm v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frintm v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -233,6 +372,11 @@ define <1 x double> @frintm_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintm d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintm d0, d0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.floor.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -245,6 +389,11 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintm z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintm v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.floor.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -258,6 +407,14 @@ define void @frintm_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintm z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frintm v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    frintm v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.floor.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -276,6 +433,13 @@ define <2 x half> @frinti_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.nearbyint.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -288,6 +452,13 @@ define <4 x half> @frinti_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -300,6 +471,16 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
+; NONEON-NOSVE-NEXT:    frinti v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
+; NONEON-NOSVE-NEXT:    frinti v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -313,6 +494,24 @@ define void @frinti_v16f16(ptr %a) {
 ; CHECK-NEXT:    frinti z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    frinti v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    frinti v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frinti v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -327,6 +526,11 @@ define <2 x float> @frinti_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frinti z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frinti v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -339,6 +543,11 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frinti z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -352,6 +561,14 @@ define void @frinti_v8f32(ptr %a) {
 ; CHECK-NEXT:    frinti z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frinti v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frinti v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -363,6 +580,11 @@ define <1 x double> @frinti_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frinti d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frinti d0, d0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -375,6 +597,11 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frinti z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frinti v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -388,6 +615,14 @@ define void @frinti_v4f64(ptr %a) {
 ; CHECK-NEXT:    frinti z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frinti v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    frinti v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -406,6 +641,13 @@ define <2 x half> @frintx_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.rint.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -418,6 +660,13 @@ define <4 x half> @frintx_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.rint.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -430,6 +679,16 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
+; NONEON-NOSVE-NEXT:    frintx v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
+; NONEON-NOSVE-NEXT:    frintx v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.rint.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -443,6 +702,24 @@ define void @frintx_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintx z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    frintx v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    frintx v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frintx v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.rint.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -457,6 +734,11 @@ define <2 x float> @frintx_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintx v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.rint.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -469,6 +751,11 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.rint.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -482,6 +769,14 @@ define void @frintx_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintx z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frintx v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frintx v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.rint.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -493,6 +788,11 @@ define <1 x double> @frintx_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintx d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintx d0, d0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.rint.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -505,6 +805,11 @@ define <2 x double> @frintx_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintx v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.rint.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -518,6 +823,14 @@ define void @frintx_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frintx v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    frintx v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.rint.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -536,6 +849,13 @@ define <2 x half> @frinta_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.round.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -548,6 +868,13 @@ define <4 x half> @frinta_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.round.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -560,6 +887,16 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
+; NONEON-NOSVE-NEXT:    frinta v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
+; NONEON-NOSVE-NEXT:    frinta v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.round.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -573,6 +910,24 @@ define void @frinta_v16f16(ptr %a) {
 ; CHECK-NEXT:    frinta z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    frinta v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    frinta v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frinta v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.round.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -587,6 +942,11 @@ define <2 x float> @frinta_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frinta z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frinta v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.round.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -599,6 +959,11 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frinta z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.round.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -612,6 +977,14 @@ define void @frinta_v8f32(ptr %a) {
 ; CHECK-NEXT:    frinta z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frinta v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frinta v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.round.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -623,6 +996,11 @@ define <1 x double> @frinta_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frinta d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frinta d0, d0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.round.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -635,6 +1013,11 @@ define <2 x double> @frinta_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frinta z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frinta v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.round.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -648,6 +1031,14 @@ define void @frinta_v4f64(ptr %a) {
 ; CHECK-NEXT:    frinta z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frinta v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    frinta v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.round.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -666,6 +1057,13 @@ define <2 x half> @frintn_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -678,6 +1076,13 @@ define <4 x half> @frintn_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -690,6 +1095,16 @@ define <8 x half> @frintn_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
+; NONEON-NOSVE-NEXT:    frintn v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
+; NONEON-NOSVE-NEXT:    frintn v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.roundeven.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -703,6 +1118,24 @@ define void @frintn_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintn z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    frintn v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    frintn v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frintn v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.roundeven.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -717,6 +1150,11 @@ define <2 x float> @frintn_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintn v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -729,6 +1167,11 @@ define <4 x float> @frintn_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -742,6 +1185,14 @@ define void @frintn_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintn z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frintn v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frintn v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -753,6 +1204,11 @@ define <1 x double> @frintn_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintn d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintn d0, d0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -765,6 +1221,11 @@ define <2 x double> @frintn_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintn z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintn v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -778,6 +1239,14 @@ define void @frintn_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintn z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frintn v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    frintn v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -796,6 +1265,13 @@ define <2 x half> @frintz_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.trunc.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -808,6 +1284,13 @@ define <4 x half> @frintz_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.trunc.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -820,6 +1303,16 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v0.8h
+; NONEON-NOSVE-NEXT:    frintz v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v1.4s
+; NONEON-NOSVE-NEXT:    frintz v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.trunc.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -833,6 +1326,24 @@ define void @frintz_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintz z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    frintz v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    frintz v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frintz v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v1.4s
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.trunc.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -847,6 +1358,11 @@ define <2 x float> @frintz_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintz v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.trunc.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -859,6 +1375,11 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -872,6 +1393,14 @@ define void @frintz_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintz z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frintz v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    frintz v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.trunc.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -883,6 +1412,11 @@ define <1 x double> @frintz_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintz d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintz d0, d0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.trunc.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -895,6 +1429,11 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    frintz v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.trunc.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -908,6 +1447,14 @@ define void @frintz_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintz z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    frintz v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    frintz v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.trunc.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
index 7d36925fdc57..0268dd1b5d31 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -16,6 +17,14 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    dup v2.4h, w8
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x half> %op1, <2 x half> %op2
   ret <2 x half> %sel
 }
@@ -32,6 +41,14 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    dup v2.4h, w8
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x half> %op1, <4 x half> %op2
   ret <4 x half> %sel
 }
@@ -48,6 +65,14 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    dup v2.8h, w8
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x half> %op1, <8 x half> %op2
   ret <8 x half> %sel
 }
@@ -67,6 +92,20 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.h, p0, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <16 x half>, ptr %a
   %op2 = load volatile <16 x half>, ptr %b
   %sel = select i1 %mask, <16 x half> %op1, <16 x half> %op2
@@ -86,6 +125,14 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    dup v2.2s, w8
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x float> %op1, <2 x float> %op2
   ret <2 x float> %sel
 }
@@ -102,6 +149,14 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    dup v2.4s, w8
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x float> %op1, <4 x float> %op2
   ret <4 x float> %sel
 }
@@ -121,6 +176,20 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.s, p0, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <8 x float>, ptr %a
   %op2 = load volatile <8 x float>, ptr %b
   %sel = select i1 %mask, <8 x float> %op1, <8 x float> %op2
@@ -134,6 +203,14 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    fcsel d0, d0, d1, ne
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm x8, ne
+; NONEON-NOSVE-NEXT:    fmov d2, x8
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <1 x double> %op1, <1 x double> %op2
   ret <1 x double> %sel
 }
@@ -151,6 +228,14 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm x8, ne
+; NONEON-NOSVE-NEXT:    dup v2.2d, x8
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x double> %op1, <2 x double> %op2
   ret <2 x double> %sel
 }
@@ -171,6 +256,20 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    csetm x8, ne
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <4 x double>, ptr %a
   %op2 = load volatile <4 x double>, ptr %b
   %sel = select i1 %mask, <4 x double> %op1, <4 x double> %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index bf8a335a8503..1c63a3870d68 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -15,6 +16,13 @@ define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x half> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
@@ -27,6 +35,21 @@ define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i16>
   store <8 x i16> %res, ptr %b
@@ -42,6 +65,27 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
+; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtzu v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtzu v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -61,6 +105,13 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x half> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -74,6 +125,12 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x half> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
@@ -90,6 +147,20 @@ define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -114,6 +185,26 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
+; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtzu v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtzu v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i32>
   store <16 x i32> %res, ptr %b
@@ -130,6 +221,13 @@ define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) {
 ; CHECK-NEXT:    fcvtzu x8, h0
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v1f16_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x half> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -145,6 +243,18 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x half> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -167,6 +277,27 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x10, s2
+; NONEON-NOSVE-NEXT:    fcvtzu x11, s3
+; NONEON-NOSVE-NEXT:    fmov d1, x9
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fptoui <4 x half> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -204,6 +335,47 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1, #32]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h7, v2.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvtzu x13, s2
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h7
+; NONEON-NOSVE-NEXT:    fcvtzu x10, s3
+; NONEON-NOSVE-NEXT:    fcvtzu x11, s4
+; NONEON-NOSVE-NEXT:    fcvtzu x12, s5
+; NONEON-NOSVE-NEXT:    fcvtzu x14, s6
+; NONEON-NOSVE-NEXT:    fmov d3, x13
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s1
+; NONEON-NOSVE-NEXT:    fmov d1, x9
+; NONEON-NOSVE-NEXT:    fmov d2, x12
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    mov v3.d[1], x8
+; NONEON-NOSVE-NEXT:    mov v2.d[1], x14
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i64>
   store <8 x i64> %res, ptr %b
@@ -264,6 +436,80 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q5, q2, [x1, #96]
 ; CHECK-NEXT:    add sp, sp, #128
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s3, h1
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s6, h0
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[1]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s17, h4
+; NONEON-NOSVE-NEXT:    mov h18, v4.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s3
+; NONEON-NOSVE-NEXT:    fcvt s3, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h7
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    mov h16, v4.h[3]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s6
+; NONEON-NOSVE-NEXT:    ldr d6, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    mov h4, v4.h[1]
+; NONEON-NOSVE-NEXT:    fcvtzu x11, s2
+; NONEON-NOSVE-NEXT:    mov h2, v6.h[2]
+; NONEON-NOSVE-NEXT:    fcvtzu x10, s17
+; NONEON-NOSVE-NEXT:    fcvtzu x13, s5
+; NONEON-NOSVE-NEXT:    fcvtzu x12, s3
+; NONEON-NOSVE-NEXT:    mov h3, v6.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    mov h5, v6.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s17, h18
+; NONEON-NOSVE-NEXT:    fcvtzu x14, s7
+; NONEON-NOSVE-NEXT:    fmov d7, x8
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fmov d0, x11
+; NONEON-NOSVE-NEXT:    fcvtzu x11, s1
+; NONEON-NOSVE-NEXT:    fmov d1, x13
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvtzu x13, s16
+; NONEON-NOSVE-NEXT:    fmov d16, x9
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvtzu x15, s17
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x12
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x14
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s2
+; NONEON-NOSVE-NEXT:    mov v16.d[1], x8
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s6
+; NONEON-NOSVE-NEXT:    fcvtzu x14, s4
+; NONEON-NOSVE-NEXT:    fcvtzu x12, s3
+; NONEON-NOSVE-NEXT:    mov v7.d[1], x11
+; NONEON-NOSVE-NEXT:    fmov d3, x10
+; NONEON-NOSVE-NEXT:    fcvtzu x11, s5
+; NONEON-NOSVE-NEXT:    fmov d2, x15
+; NONEON-NOSVE-NEXT:    stp q16, q1, [x1, #64]
+; NONEON-NOSVE-NEXT:    fmov d1, x9
+; NONEON-NOSVE-NEXT:    fmov d4, x8
+; NONEON-NOSVE-NEXT:    stp q7, q0, [x1]
+; NONEON-NOSVE-NEXT:    mov v2.d[1], x13
+; NONEON-NOSVE-NEXT:    mov v3.d[1], x14
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x12
+; NONEON-NOSVE-NEXT:    mov v4.d[1], x11
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    stp q4, q1, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i64>
   store <16 x i64> %res, ptr %b
@@ -282,6 +528,11 @@ define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
@@ -295,6 +546,12 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x float> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
@@ -312,6 +569,14 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i16>
   ret <8 x i16> %res
@@ -336,6 +601,19 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v16f32_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtzu v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtzu v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %res = fptoui <16 x float> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -354,6 +632,11 @@ define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzu v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -366,6 +649,11 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x float> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
@@ -379,6 +667,14 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzu v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtzu v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -398,6 +694,13 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v1f32_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x float> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -411,6 +714,12 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -427,6 +736,20 @@ define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
+; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptoui <4 x float> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -451,6 +774,26 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
+; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
+; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtzu v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    fcvtzu v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i64>
   store <8 x i64> %res, ptr %b
@@ -468,6 +811,12 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i16>
   ret <1 x i16> %res
 }
@@ -481,6 +830,12 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
@@ -509,6 +864,15 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -552,6 +916,23 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
 ; CHECK-NEXT:    strh w8, [sp, #2]
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI26_0
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    xtn v7.2s, v0.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI26_0]
+; NONEON-NOSVE-NEXT:    xtn v6.2s, v1.2d
+; NONEON-NOSVE-NEXT:    xtn v5.2s, v2.2d
+; NONEON-NOSVE-NEXT:    xtn v4.2s, v3.2d
+; NONEON-NOSVE-NEXT:    tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptoui <8 x double> %op1 to <8 x i16>
   ret <8 x i16> %res
@@ -628,6 +1009,35 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v16f64_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI27_0
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v5.2d, v5.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v4.2d, v4.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v6.2d, v6.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v7.2d, v7.2d
+; NONEON-NOSVE-NEXT:    xtn v19.2s, v0.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI27_0]
+; NONEON-NOSVE-NEXT:    xtn v23.2s, v3.2d
+; NONEON-NOSVE-NEXT:    xtn v18.2s, v1.2d
+; NONEON-NOSVE-NEXT:    xtn v22.2s, v2.2d
+; NONEON-NOSVE-NEXT:    xtn v17.2s, v5.2d
+; NONEON-NOSVE-NEXT:    xtn v21.2s, v6.2d
+; NONEON-NOSVE-NEXT:    xtn v16.2s, v4.2d
+; NONEON-NOSVE-NEXT:    xtn v20.2s, v7.2d
+; NONEON-NOSVE-NEXT:    tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
+; NONEON-NOSVE-NEXT:    tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x double>, ptr %a
   %res = fptoui <16 x double> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -647,6 +1057,13 @@ define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i32>
   ret <1 x i32> %res
 }
@@ -660,6 +1077,12 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -677,6 +1100,14 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -701,6 +1132,19 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtzu v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    fcvtzu v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptoui <8 x double> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -719,6 +1163,12 @@ define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -731,6 +1181,11 @@ define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -744,6 +1199,14 @@ define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzu v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtzu v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -762,6 +1225,13 @@ define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x half> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
@@ -774,6 +1244,21 @@ define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i16>
   store <8 x i16> %res, ptr %b
@@ -789,6 +1274,27 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
+; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -808,6 +1314,13 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x half> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -821,6 +1334,12 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x half> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
@@ -837,6 +1356,20 @@ define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -861,6 +1394,26 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v3.4h
+; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i32>
   store <16 x i32> %res, ptr %b
@@ -877,6 +1430,13 @@ define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) {
 ; CHECK-NEXT:    fcvtzs x8, h0
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v1f16_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x half> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -893,6 +1453,18 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x half> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -915,6 +1487,27 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h2, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x10, s2
+; NONEON-NOSVE-NEXT:    fcvtzs x11, s3
+; NONEON-NOSVE-NEXT:    fmov d1, x9
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fptosi <4 x half> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -952,6 +1545,47 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1, #32]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    mov h1, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
+; NONEON-NOSVE-NEXT:    mov h4, v0.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    mov h5, v2.h[2]
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[3]
+; NONEON-NOSVE-NEXT:    mov h7, v2.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvtzs x13, s2
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h7
+; NONEON-NOSVE-NEXT:    fcvtzs x10, s3
+; NONEON-NOSVE-NEXT:    fcvtzs x11, s4
+; NONEON-NOSVE-NEXT:    fcvtzs x12, s5
+; NONEON-NOSVE-NEXT:    fcvtzs x14, s6
+; NONEON-NOSVE-NEXT:    fmov d3, x13
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s1
+; NONEON-NOSVE-NEXT:    fmov d1, x9
+; NONEON-NOSVE-NEXT:    fmov d2, x12
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    mov v3.d[1], x8
+; NONEON-NOSVE-NEXT:    mov v2.d[1], x14
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i64>
   store <8 x i64> %res, ptr %b
@@ -1012,6 +1646,80 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q5, q2, [x1, #96]
 ; CHECK-NEXT:    add sp, sp, #128
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s3, h1
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
+; NONEON-NOSVE-NEXT:    mov h5, v1.h[3]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[2]
+; NONEON-NOSVE-NEXT:    mov h16, v0.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s6, h0
+; NONEON-NOSVE-NEXT:    mov h0, v0.h[1]
+; NONEON-NOSVE-NEXT:    mov h1, v1.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s17, h4
+; NONEON-NOSVE-NEXT:    mov h18, v4.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s3
+; NONEON-NOSVE-NEXT:    fcvt s3, h5
+; NONEON-NOSVE-NEXT:    fcvt s5, h7
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    mov h16, v4.h[3]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s6
+; NONEON-NOSVE-NEXT:    ldr d6, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    mov h4, v4.h[1]
+; NONEON-NOSVE-NEXT:    fcvtzs x11, s2
+; NONEON-NOSVE-NEXT:    mov h2, v6.h[2]
+; NONEON-NOSVE-NEXT:    fcvtzs x10, s17
+; NONEON-NOSVE-NEXT:    fcvtzs x13, s5
+; NONEON-NOSVE-NEXT:    fcvtzs x12, s3
+; NONEON-NOSVE-NEXT:    mov h3, v6.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    mov h5, v6.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s17, h18
+; NONEON-NOSVE-NEXT:    fcvtzs x14, s7
+; NONEON-NOSVE-NEXT:    fmov d7, x8
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fmov d0, x11
+; NONEON-NOSVE-NEXT:    fcvtzs x11, s1
+; NONEON-NOSVE-NEXT:    fmov d1, x13
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvtzs x13, s16
+; NONEON-NOSVE-NEXT:    fmov d16, x9
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvtzs x15, s17
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x12
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x14
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s2
+; NONEON-NOSVE-NEXT:    mov v16.d[1], x8
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s6
+; NONEON-NOSVE-NEXT:    fcvtzs x14, s4
+; NONEON-NOSVE-NEXT:    fcvtzs x12, s3
+; NONEON-NOSVE-NEXT:    mov v7.d[1], x11
+; NONEON-NOSVE-NEXT:    fmov d3, x10
+; NONEON-NOSVE-NEXT:    fcvtzs x11, s5
+; NONEON-NOSVE-NEXT:    fmov d2, x15
+; NONEON-NOSVE-NEXT:    stp q16, q1, [x1, #64]
+; NONEON-NOSVE-NEXT:    fmov d1, x9
+; NONEON-NOSVE-NEXT:    fmov d4, x8
+; NONEON-NOSVE-NEXT:    stp q7, q0, [x1]
+; NONEON-NOSVE-NEXT:    mov v2.d[1], x13
+; NONEON-NOSVE-NEXT:    mov v3.d[1], x14
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x12
+; NONEON-NOSVE-NEXT:    mov v4.d[1], x11
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    stp q4, q1, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i64>
   store <16 x i64> %res, ptr %b
@@ -1030,6 +1738,11 @@ define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
@@ -1043,6 +1756,12 @@ define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x float> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
@@ -1060,6 +1779,14 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i16>
   ret <8 x i16> %res
@@ -1084,6 +1811,19 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v16f32_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %res = fptosi <16 x float> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -1102,6 +1842,11 @@ define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -1114,6 +1859,11 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x float> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
@@ -1127,6 +1877,14 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtzs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -1146,6 +1904,13 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v1f32_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x float> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -1159,6 +1924,12 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -1175,6 +1946,20 @@ define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptosi <4 x float> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -1199,6 +1984,26 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtl v1.2d, v1.2s
+; NONEON-NOSVE-NEXT:    fcvtl v0.2d, v0.2s
+; NONEON-NOSVE-NEXT:    fcvtl v2.2d, v2.2s
+; NONEON-NOSVE-NEXT:    fcvtl v3.2d, v3.2s
+; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i64>
   store <8 x i64> %res, ptr %b
@@ -1218,6 +2023,12 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i16>
   ret <1 x i16> %res
 }
@@ -1231,6 +2042,12 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
@@ -1259,6 +2076,15 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -1302,6 +2128,23 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
 ; CHECK-NEXT:    strh w8, [sp, #2]
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI61_0
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    xtn v7.2s, v0.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI61_0]
+; NONEON-NOSVE-NEXT:    xtn v6.2s, v1.2d
+; NONEON-NOSVE-NEXT:    xtn v5.2s, v2.2d
+; NONEON-NOSVE-NEXT:    xtn v4.2s, v3.2d
+; NONEON-NOSVE-NEXT:    tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptosi <8 x double> %op1 to <8 x i16>
   ret <8 x i16> %res
@@ -1378,6 +2221,35 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v16f64_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI62_0
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v5.2d, v5.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v4.2d, v4.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v6.2d, v6.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v7.2d, v7.2d
+; NONEON-NOSVE-NEXT:    xtn v19.2s, v0.2d
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI62_0]
+; NONEON-NOSVE-NEXT:    xtn v23.2s, v3.2d
+; NONEON-NOSVE-NEXT:    xtn v18.2s, v1.2d
+; NONEON-NOSVE-NEXT:    xtn v22.2s, v2.2d
+; NONEON-NOSVE-NEXT:    xtn v17.2s, v5.2d
+; NONEON-NOSVE-NEXT:    xtn v21.2s, v6.2d
+; NONEON-NOSVE-NEXT:    xtn v16.2s, v4.2d
+; NONEON-NOSVE-NEXT:    xtn v20.2s, v7.2d
+; NONEON-NOSVE-NEXT:    tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
+; NONEON-NOSVE-NEXT:    tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x double>, ptr %a
   %res = fptosi <16 x double> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -1397,6 +2269,13 @@ define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i32>
   ret <1 x i32> %res
 }
@@ -1410,6 +2289,12 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -1427,6 +2312,14 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -1451,6 +2344,19 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptosi <8 x double> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -1469,6 +2375,12 @@ define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -1481,6 +2393,11 @@ define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -1494,6 +2411,14 @@ define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fcvtzs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtzs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index 30a4f04a3d2b..32fe74bbb65f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -27,6 +28,14 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    uzp1 v2.4h, v2.4h, v0.4h
+; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
+; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x half> %op1, <2 x half> %op2
   ret <2 x half> %sel
 }
@@ -45,6 +54,13 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
+; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2
   ret <4 x half> %sel
 }
@@ -64,6 +80,14 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
+; NONEON-NOSVE-NEXT:    shl v2.8h, v2.8h, #15
+; NONEON-NOSVE-NEXT:    cmlt v2.8h, v2.8h, #0
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2
   ret <8 x half> %sel
 }
@@ -80,6 +104,126 @@ define void @select_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[1]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[1]
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[2]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s6, h1
+; NONEON-NOSVE-NEXT:    fcvt s7, h0
+; NONEON-NOSVE-NEXT:    mov h16, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov h17, v0.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    mov h2, v1.h[3]
+; NONEON-NOSVE-NEXT:    mov h3, v0.h[3]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[4]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    csetm w14, eq
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov h5, v0.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w12, eq
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w11, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov h7, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov h18, v3.h[3]
+; NONEON-NOSVE-NEXT:    csetm w13, eq
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    mov h4, v3.h[1]
+; NONEON-NOSVE-NEXT:    mov h5, v2.h[1]
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s17, s16
+; NONEON-NOSVE-NEXT:    mov h16, v3.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    mov h17, v2.h[2]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    fcvt s6, h3
+; NONEON-NOSVE-NEXT:    fcvt s7, h2
+; NONEON-NOSVE-NEXT:    csetm w15, eq
+; NONEON-NOSVE-NEXT:    fcmp s5, s4
+; NONEON-NOSVE-NEXT:    fmov s4, w14
+; NONEON-NOSVE-NEXT:    csetm w16, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v2.h[3]
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    fcvt s16, h17
+; NONEON-NOSVE-NEXT:    mov v4.h[1], w8
+; NONEON-NOSVE-NEXT:    fcvt s17, h18
+; NONEON-NOSVE-NEXT:    csetm w14, eq
+; NONEON-NOSVE-NEXT:    fmov s5, w14
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcmp s16, s7
+; NONEON-NOSVE-NEXT:    mov h7, v3.h[4]
+; NONEON-NOSVE-NEXT:    mov h16, v2.h[4]
+; NONEON-NOSVE-NEXT:    mov v4.h[2], w12
+; NONEON-NOSVE-NEXT:    mov v5.h[1], w16
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s6, s17
+; NONEON-NOSVE-NEXT:    mov h17, v2.h[5]
+; NONEON-NOSVE-NEXT:    fcvt s6, h7
+; NONEON-NOSVE-NEXT:    fcvt s7, h16
+; NONEON-NOSVE-NEXT:    mov h16, v3.h[5]
+; NONEON-NOSVE-NEXT:    mov v4.h[3], w11
+; NONEON-NOSVE-NEXT:    mov v5.h[2], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov h6, v3.h[6]
+; NONEON-NOSVE-NEXT:    mov h7, v2.h[6]
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    mov v4.h[4], w13
+; NONEON-NOSVE-NEXT:    mov v5.h[3], w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcmp s17, s16
+; NONEON-NOSVE-NEXT:    mov h16, v3.h[7]
+; NONEON-NOSVE-NEXT:    mov h17, v2.h[7]
+; NONEON-NOSVE-NEXT:    mov v5.h[4], w8
+; NONEON-NOSVE-NEXT:    mov v4.h[5], w9
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    fcvt s6, h16
+; NONEON-NOSVE-NEXT:    fcvt s7, h17
+; NONEON-NOSVE-NEXT:    mov v5.h[5], w8
+; NONEON-NOSVE-NEXT:    mov v4.h[6], w10
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    mov v5.h[6], w8
+; NONEON-NOSVE-NEXT:    mov v4.h[7], w15
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    mov v5.h[7], w8
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
+; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
+; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %mask = fcmp oeq <16 x half> %op1, %op2
@@ -102,6 +246,13 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %m
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v2.2s, v2.2s, #31
+; NONEON-NOSVE-NEXT:    cmlt v2.2s, v2.2s, #0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2
   ret <2 x float> %sel
 }
@@ -121,6 +272,14 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    shl v2.4s, v2.4s, #31
+; NONEON-NOSVE-NEXT:    cmlt v2.4s, v2.4s, #0
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2
   ret <4 x float> %sel
 }
@@ -137,6 +296,18 @@ define void @select_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    fcmeq v4.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcmeq v5.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
+; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
+; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %mask = fcmp oeq <8 x float> %op1, %op2
@@ -151,6 +322,14 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1>
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    fcsel d0, d0, d1, ne
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm x8, ne
+; NONEON-NOSVE-NEXT:    fmov d2, x8
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2
   ret <1 x double> %sel
 }
@@ -170,6 +349,14 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1>
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    shl v2.2d, v2.2d, #63
+; NONEON-NOSVE-NEXT:    cmlt v2.2d, v2.2d, #0
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2
   ret <2 x double> %sel
 }
@@ -186,6 +373,18 @@ define void @select_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.d, p0, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    fcmeq v4.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcmeq v5.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
+; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
+; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %mask = fcmp oeq <4 x double> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index 4aa965777c74..c85048ab72e0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -21,6 +22,14 @@ define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i8> %op1, i8 5, i64 3
     ret <4 x i8> %r
 }
@@ -38,6 +47,14 @@ define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) {
 ; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    mov v0.b[7], w8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x i8> %op1, i8 5, i64 7
     ret <8 x i8> %r
 }
@@ -55,6 +72,12 @@ define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) {
 ; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    mov v0.b[15], w8
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <16 x i8> %op1, i8 5, i64 15
     ret <16 x i8> %r
 }
@@ -72,6 +95,12 @@ define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) {
 ; CHECK-NEXT:    mov z1.b, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    mov v1.b[15], w8
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <32 x i8> %op1, i8 5, i64 31
     ret <32 x i8> %r
 }
@@ -90,6 +119,14 @@ define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i16> %op1, i16 5, i64 1
     ret <2 x i16> %r
 }
@@ -107,6 +144,14 @@ define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i16> %op1, i16 5, i64 3
     ret <4 x i16> %r
 }
@@ -124,6 +169,12 @@ define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    mov v0.h[7], w8
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x i16> %op1, i16 5, i64 7
     ret <8 x i16> %r
 }
@@ -141,6 +192,12 @@ define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) {
 ; CHECK-NEXT:    mov z1.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    mov v1.h[7], w8
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <16 x i16> %op1, i16 5, i64 15
     ret <16 x i16> %r
 }
@@ -159,6 +216,14 @@ define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i32> %op1, i32 5, i64 1
     ret <2 x i32> %r
 }
@@ -176,6 +241,12 @@ define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i32> %op1, i32 5, i64 3
     ret <4 x i32> %r
 }
@@ -193,6 +264,13 @@ define <8 x i32> @insertelement_v8i32(ptr %a) {
 ; CHECK-NEXT:    mov z1.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    ret
     %op1 = load <8 x i32>, ptr %a
     %r = insertelement <8 x i32> %op1, i32 5, i64 7
     ret <8 x i32> %r
@@ -205,6 +283,12 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) {
 ; CHECK-NEXT:    mov z0.d, #5 // =0x5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <1 x i64> %op1, i64 5, i64 0
     ret <1 x i64> %r
 }
@@ -222,6 +306,12 @@ define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) {
 ; CHECK-NEXT:    mov z0.d, p0/m, x8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i64> %op1, i64 5, i64 1
     ret <2 x i64> %r
 }
@@ -239,6 +329,13 @@ define <4 x i64> @insertelement_v4i64(ptr %a) {
 ; CHECK-NEXT:    mov z1.d, p0/m, x8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x8
+; NONEON-NOSVE-NEXT:    ret
     %op1 = load <4 x i64>, ptr %a
     %r = insertelement <4 x i64> %op1, i64 5, i64 3
     ret <4 x i64> %r
@@ -257,6 +354,16 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI14_0
+; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI14_0
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    ld1r { v1.4h }, [x8]
+; NONEON-NOSVE-NEXT:    mov v1.h[0], v0.h[0]
+; NONEON-NOSVE-NEXT:    fmov d0, d1
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x half> %op1, half 5.0, i64 1
     ret <2 x half> %r
 }
@@ -274,6 +381,15 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI15_0
+; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI15_0
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x8]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x half> %op1, half 5.0, i64 3
     ret <4 x half> %r
 }
@@ -291,6 +407,13 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI16_0
+; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI16_0
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[7], [x8]
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x half> %op1, half 5.0, i64 7
     ret <8 x half> %r
 }
@@ -308,6 +431,14 @@ define <16 x half> @insertelement_v16f16(ptr %a) {
 ; CHECK-NEXT:    mov z1.h, p0/m, h2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI17_0
+; NONEON-NOSVE-NEXT:    add x8, x8, :lo12:.LCPI17_0
+; NONEON-NOSVE-NEXT:    ld1 { v1.h }[7], [x8]
+; NONEON-NOSVE-NEXT:    ret
     %op1 = load <16 x half>, ptr %a
     %r = insertelement <16 x half> %op1, half 5.0, i64 15
     ret <16 x half> %r
@@ -327,6 +458,14 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, s1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov s1, #5.00000000
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    mov v0.s[1], v1.s[0]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x float> %op1, float 5.0, i64 1
     ret <2 x float> %r
 }
@@ -344,6 +483,12 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, s1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov s1, #5.00000000
+; NONEON-NOSVE-NEXT:    mov v0.s[3], v1.s[0]
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x float> %op1, float 5.0, i64 3
     ret <4 x float> %r
 }
@@ -361,6 +506,13 @@ define <8 x float> @insertelement_v8f32(ptr %a) {
 ; CHECK-NEXT:    mov z1.s, p0/m, s2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov s2, #5.00000000
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov v1.s[3], v2.s[0]
+; NONEON-NOSVE-NEXT:    ret
     %op1 = load <8 x float>, ptr %a
     %r = insertelement <8 x float> %op1, float 5.0, i64 7
     ret <8 x float> %r
@@ -372,6 +524,12 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, #5.00000000
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov x8, #4617315517961601024 // =0x4014000000000000
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <1 x double> %op1, double 5.0, i64 0
     ret <1 x double> %r
 }
@@ -389,6 +547,12 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) {
 ; CHECK-NEXT:    mov z0.d, p0/m, d1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov d1, #5.00000000
+; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x double> %op1, double 5.0, i64 1
     ret <2 x double> %r
 }
@@ -406,6 +570,14 @@ define <4 x double> @insertelement_v4f64(ptr %a) {
 ; CHECK-NEXT:    mov z1.d, p0/m, d2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov d0, #5.00000000
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    mov v1.d[1], v0.d[0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
     %op1 = load <4 x double>, ptr %a
     %r = insertelement <4 x double> %op1, double 5.0, i64 3
     ret <4 x double> %r
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
index 8baa87c6d686..da408a11e784 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -16,6 +17,11 @@ define <4 x i8> @add_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    add v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -28,6 +34,11 @@ define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    add z0.b, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    add v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = add <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -40,6 +51,11 @@ define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    add z0.b, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = add <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -53,6 +69,15 @@ define void @add_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.b, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    add v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = add <32 x i8> %op1, %op2
@@ -68,6 +93,11 @@ define <2 x i16> @add_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -80,6 +110,11 @@ define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    add v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -92,6 +127,11 @@ define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = add <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -105,6 +145,15 @@ define void @add_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.h, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = add <16 x i16> %op1, %op2
@@ -120,6 +169,11 @@ define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -132,6 +186,11 @@ define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -145,6 +204,15 @@ define void @add_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.s, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    add v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    add v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = add <8 x i32> %op1, %op2
@@ -160,6 +228,11 @@ define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    add d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = add <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -172,6 +245,11 @@ define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    add v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -185,6 +263,15 @@ define void @add_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    add v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    add v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = add <4 x i64> %op1, %op2
@@ -213,6 +300,11 @@ define <4 x i8> @mul_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; SVE2-NEXT:    mul z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -234,6 +326,11 @@ define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; SVE2-NEXT:    mul z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mul v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -255,6 +352,11 @@ define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; SVE2-NEXT:    mul z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mul v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -279,6 +381,15 @@ define void @mul_v32i8(ptr %a, ptr %b) {
 ; SVE2-NEXT:    mul z1.b, z2.b, z3.b
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    mul v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    mul v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = mul <32 x i8> %op1, %op2
@@ -303,6 +414,11 @@ define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; SVE2-NEXT:    mul z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -324,6 +440,11 @@ define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; SVE2-NEXT:    mul z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -345,6 +466,11 @@ define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; SVE2-NEXT:    mul z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mul v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -369,6 +495,15 @@ define void @mul_v16i16(ptr %a, ptr %b) {
 ; SVE2-NEXT:    mul z1.h, z2.h, z3.h
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    mul v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    mul v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = mul <16 x i16> %op1, %op2
@@ -393,6 +528,11 @@ define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; SVE2-NEXT:    mul z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -414,6 +554,11 @@ define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; SVE2-NEXT:    mul z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mul v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -438,6 +583,15 @@ define void @mul_v8i32(ptr %a, ptr %b) {
 ; SVE2-NEXT:    mul z1.s, z2.s, z3.s
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    mul v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    mul v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = mul <8 x i32> %op1, %op2
@@ -462,6 +616,16 @@ define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; SVE2-NEXT:    mul z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    mul x8, x9, x8
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -483,6 +647,18 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; SVE2-NEXT:    mul z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x10, d1
+; NONEON-NOSVE-NEXT:    fmov x11, d0
+; NONEON-NOSVE-NEXT:    mov x8, v1.d[1]
+; NONEON-NOSVE-NEXT:    mov x9, v0.d[1]
+; NONEON-NOSVE-NEXT:    mul x10, x11, x10
+; NONEON-NOSVE-NEXT:    mul x8, x9, x8
+; NONEON-NOSVE-NEXT:    fmov d0, x10
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -507,6 +683,29 @@ define void @mul_v4i64(ptr %a, ptr %b) {
 ; SVE2-NEXT:    mul z1.d, z2.d, z3.d
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    fmov x12, d2
+; NONEON-NOSVE-NEXT:    mov x11, v2.d[1]
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    mov x10, v3.d[1]
+; NONEON-NOSVE-NEXT:    mov x13, v1.d[1]
+; NONEON-NOSVE-NEXT:    mov x14, v0.d[1]
+; NONEON-NOSVE-NEXT:    mul x8, x9, x8
+; NONEON-NOSVE-NEXT:    fmov x9, d3
+; NONEON-NOSVE-NEXT:    mul x10, x11, x10
+; NONEON-NOSVE-NEXT:    mul x9, x12, x9
+; NONEON-NOSVE-NEXT:    fmov d1, x8
+; NONEON-NOSVE-NEXT:    mul x11, x14, x13
+; NONEON-NOSVE-NEXT:    fmov d0, x9
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x10
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = mul <4 x i64> %op1, %op2
@@ -526,6 +725,11 @@ define <4 x i8> @sub_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    sub z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -538,6 +742,11 @@ define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    sub z0.b, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -550,6 +759,11 @@ define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    sub z0.b, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -563,6 +777,15 @@ define void @sub_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sub z1.b, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    sub v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = sub <32 x i8> %op1, %op2
@@ -578,6 +801,11 @@ define <2 x i16> @sub_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    sub z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -590,6 +818,11 @@ define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    sub z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -602,6 +835,11 @@ define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    sub z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -615,6 +853,15 @@ define void @sub_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sub z1.h, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    sub v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = sub <16 x i16> %op1, %op2
@@ -630,6 +877,11 @@ define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    sub z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -642,6 +894,11 @@ define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    sub z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -655,6 +912,15 @@ define void @sub_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sub z1.s, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sub v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = sub <8 x i32> %op1, %op2
@@ -670,6 +936,11 @@ define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    sub z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -682,6 +953,11 @@ define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    sub z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -695,6 +971,15 @@ define void @sub_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sub z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = sub <4 x i64> %op1, %op2
@@ -715,6 +1000,13 @@ define <4 x i8> @abs_v4i8(<4 x i8> %op1) {
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    abs v0.4h, v0.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %op1, i1 false)
   ret <4 x i8> %res
 }
@@ -727,6 +1019,11 @@ define <8 x i8> @abs_v8i8(<8 x i8> %op1) {
 ; CHECK-NEXT:    abs z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    abs v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false)
   ret <8 x i8> %res
 }
@@ -739,6 +1036,11 @@ define <16 x i8> @abs_v16i8(<16 x i8> %op1) {
 ; CHECK-NEXT:    abs z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    abs v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false)
   ret <16 x i8> %res
 }
@@ -752,6 +1054,14 @@ define void @abs_v32i8(ptr %a) {
 ; CHECK-NEXT:    abs z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    abs v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    abs v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false)
   store <32 x i8> %res, ptr %a
@@ -767,6 +1077,13 @@ define <2 x i16> @abs_v2i16(<2 x i16> %op1) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    abs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %op1, i1 false)
   ret <2 x i16> %res
 }
@@ -779,6 +1096,11 @@ define <4 x i16> @abs_v4i16(<4 x i16> %op1) {
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    abs v0.4h, v0.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false)
   ret <4 x i16> %res
 }
@@ -791,6 +1113,11 @@ define <8 x i16> @abs_v8i16(<8 x i16> %op1) {
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    abs v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false)
   ret <8 x i16> %res
 }
@@ -804,6 +1131,14 @@ define void @abs_v16i16(ptr %a) {
 ; CHECK-NEXT:    abs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    abs v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    abs v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false)
   store <16 x i16> %res, ptr %a
@@ -818,6 +1153,11 @@ define <2 x i32> @abs_v2i32(<2 x i32> %op1) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    abs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
   ret <2 x i32> %res
 }
@@ -830,6 +1170,11 @@ define <4 x i32> @abs_v4i32(<4 x i32> %op1) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
   ret <4 x i32> %res
 }
@@ -843,6 +1188,14 @@ define void @abs_v8i32(ptr %a) {
 ; CHECK-NEXT:    abs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    abs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
   store <8 x i32> %res, ptr %a
@@ -857,6 +1210,11 @@ define <1 x i64> @abs_v1i64(<1 x i64> %op1) {
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    abs d0, d0
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false)
   ret <1 x i64> %res
 }
@@ -869,6 +1227,11 @@ define <2 x i64> @abs_v2i64(<2 x i64> %op1) {
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
   ret <2 x i64> %res
 }
@@ -882,6 +1245,14 @@ define void @abs_v4i64(ptr %a) {
 ; CHECK-NEXT:    abs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    abs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
   store <4 x i64> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
index 73c1eac99dd3..3148d4f1677c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,6 +19,11 @@ define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmeq v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <8 x i8> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i8>
   ret <8 x i8> %sext
@@ -33,6 +39,11 @@ define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmeq v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <16 x i8> %op1, %op2
   %sext = sext <16 x i1> %cmp to <16 x i8>
   ret <16 x i8> %sext
@@ -50,6 +61,15 @@ define void @icmp_eq_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    cmeq v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    cmeq v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %cmp = icmp eq <32 x i8> %op1, %op2
@@ -68,6 +88,11 @@ define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmeq v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <4 x i16> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i16>
   ret <4 x i16> %sext
@@ -83,6 +108,11 @@ define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmeq v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <8 x i16> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i16>
   ret <8 x i16> %sext
@@ -100,6 +130,15 @@ define void @icmp_eq_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    cmeq v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    cmeq v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %cmp = icmp eq <16 x i16> %op1, %op2
@@ -118,6 +157,11 @@ define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmeq v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <2 x i32> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i32>
   ret <2 x i32> %sext
@@ -133,6 +177,11 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmeq v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <4 x i32> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i32>
   ret <4 x i32> %sext
@@ -150,6 +199,15 @@ define void @icmp_eq_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    cmeq v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    cmeq v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %cmp = icmp eq <8 x i32> %op1, %op2
@@ -168,6 +226,11 @@ define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmeq d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <1 x i64> %op1, %op2
   %sext = sext <1 x i1> %cmp to <1 x i64>
   ret <1 x i64> %sext
@@ -183,6 +246,11 @@ define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmeq v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <2 x i64> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i64>
   ret <2 x i64> %sext
@@ -200,6 +268,15 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    cmeq v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    cmeq v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %cmp = icmp eq <4 x i64> %op1, %op2
@@ -224,6 +301,17 @@ define void @icmp_ne_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_ne_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    cmeq v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    cmeq v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    mvn v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %cmp = icmp ne <32 x i8> %op1, %op2
@@ -246,6 +334,14 @@ define void @icmp_sge_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_sge_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    cmge v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
   %cmp = icmp sge <8 x i16> %op1, %op2
@@ -270,6 +366,15 @@ define void @icmp_sgt_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_sgt_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    cmgt v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    cmgt v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %cmp = icmp sgt <16 x i16> %op1, %op2
@@ -292,6 +397,14 @@ define void @icmp_sle_v4i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_sle_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %op2 = load <4 x i32>, ptr %b
   %cmp = icmp sle <4 x i32> %op1, %op2
@@ -316,6 +429,15 @@ define void @icmp_slt_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_slt_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    cmgt v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    cmgt v1.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %cmp = icmp slt <8 x i32> %op1, %op2
@@ -338,6 +460,14 @@ define void @icmp_uge_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_uge_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    cmhs v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %cmp = icmp uge <2 x i64> %op1, %op2
@@ -360,6 +490,14 @@ define void @icmp_ugt_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_ugt_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %cmp = icmp ugt <2 x i64> %op1, %op2
@@ -382,6 +520,14 @@ define void @icmp_ule_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_ule_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    cmhs v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %cmp = icmp ule <2 x i64> %op1, %op2
@@ -404,6 +550,14 @@ define void @icmp_ult_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_ult_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    cmhi v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %cmp = icmp ult <2 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index 5158dda37a8b..27a4924ea367 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -24,6 +25,31 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    shl v1.4h, v1.4h, #8
+; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
+; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
+; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
+; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    smov w10, v1.h[2]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
+; NONEON-NOSVE-NEXT:    fmov s0, w9
+; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w8, w12, w11
+; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
+; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -51,6 +77,45 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
+; NONEON-NOSVE-NEXT:    smov w10, v0.b[0]
+; NONEON-NOSVE-NEXT:    smov w11, v0.b[2]
+; NONEON-NOSVE-NEXT:    smov w12, v0.b[3]
+; NONEON-NOSVE-NEXT:    smov w13, v0.b[4]
+; NONEON-NOSVE-NEXT:    smov w14, v0.b[5]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    smov w9, v1.b[0]
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    smov w10, v1.b[2]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    smov w11, v1.b[3]
+; NONEON-NOSVE-NEXT:    fmov s2, w9
+; NONEON-NOSVE-NEXT:    smov w9, v1.b[6]
+; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    smov w12, v1.b[4]
+; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
+; NONEON-NOSVE-NEXT:    smov w10, v0.b[6]
+; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
+; NONEON-NOSVE-NEXT:    smov w13, v1.b[5]
+; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
+; NONEON-NOSVE-NEXT:    smov w11, v0.b[7]
+; NONEON-NOSVE-NEXT:    sdiv w8, w14, w13
+; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    smov w10, v1.b[7]
+; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
+; NONEON-NOSVE-NEXT:    sdiv w8, w11, w10
+; NONEON-NOSVE-NEXT:    mov v2.b[6], w9
+; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
+; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -98,6 +163,75 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
+; NONEON-NOSVE-NEXT:    smov w10, v0.b[0]
+; NONEON-NOSVE-NEXT:    smov w11, v0.b[2]
+; NONEON-NOSVE-NEXT:    smov w12, v0.b[3]
+; NONEON-NOSVE-NEXT:    smov w13, v0.b[4]
+; NONEON-NOSVE-NEXT:    smov w14, v0.b[5]
+; NONEON-NOSVE-NEXT:    smov w15, v0.b[6]
+; NONEON-NOSVE-NEXT:    smov w16, v0.b[7]
+; NONEON-NOSVE-NEXT:    smov w17, v0.b[8]
+; NONEON-NOSVE-NEXT:    smov w18, v0.b[9]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    smov w9, v1.b[0]
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    smov w10, v1.b[2]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    smov w11, v1.b[3]
+; NONEON-NOSVE-NEXT:    fmov s2, w9
+; NONEON-NOSVE-NEXT:    smov w9, v1.b[10]
+; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    smov w12, v1.b[4]
+; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
+; NONEON-NOSVE-NEXT:    smov w10, v0.b[10]
+; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
+; NONEON-NOSVE-NEXT:    smov w13, v1.b[5]
+; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
+; NONEON-NOSVE-NEXT:    smov w11, v0.b[11]
+; NONEON-NOSVE-NEXT:    sdiv w13, w14, w13
+; NONEON-NOSVE-NEXT:    smov w14, v1.b[6]
+; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
+; NONEON-NOSVE-NEXT:    smov w12, v0.b[12]
+; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
+; NONEON-NOSVE-NEXT:    smov w15, v1.b[7]
+; NONEON-NOSVE-NEXT:    mov v2.b[5], w13
+; NONEON-NOSVE-NEXT:    smov w13, v0.b[13]
+; NONEON-NOSVE-NEXT:    sdiv w15, w16, w15
+; NONEON-NOSVE-NEXT:    smov w16, v1.b[8]
+; NONEON-NOSVE-NEXT:    mov v2.b[6], w14
+; NONEON-NOSVE-NEXT:    sdiv w16, w17, w16
+; NONEON-NOSVE-NEXT:    smov w17, v1.b[9]
+; NONEON-NOSVE-NEXT:    mov v2.b[7], w15
+; NONEON-NOSVE-NEXT:    sdiv w8, w18, w17
+; NONEON-NOSVE-NEXT:    mov v2.b[8], w16
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    smov w10, v1.b[11]
+; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    smov w11, v1.b[12]
+; NONEON-NOSVE-NEXT:    mov v2.b[10], w9
+; NONEON-NOSVE-NEXT:    smov w9, v1.b[14]
+; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    smov w12, v1.b[13]
+; NONEON-NOSVE-NEXT:    mov v2.b[11], w10
+; NONEON-NOSVE-NEXT:    smov w10, v1.b[15]
+; NONEON-NOSVE-NEXT:    sdiv w8, w13, w12
+; NONEON-NOSVE-NEXT:    smov w12, v0.b[14]
+; NONEON-NOSVE-NEXT:    mov v2.b[12], w11
+; NONEON-NOSVE-NEXT:    smov w11, v0.b[15]
+; NONEON-NOSVE-NEXT:    sdiv w9, w12, w9
+; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
+; NONEON-NOSVE-NEXT:    sdiv w8, w11, w10
+; NONEON-NOSVE-NEXT:    mov v2.b[14], w9
+; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -178,6 +312,163 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z3.b, p0, z3.b, z1.b
 ; CHECK-NEXT:    stp q3, q2, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str x27, [sp, #-80]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
+; NONEON-NOSVE-NEXT:    smov w10, v0.b[0]
+; NONEON-NOSVE-NEXT:    smov w11, v0.b[2]
+; NONEON-NOSVE-NEXT:    smov w12, v0.b[3]
+; NONEON-NOSVE-NEXT:    smov w13, v0.b[4]
+; NONEON-NOSVE-NEXT:    smov w14, v0.b[5]
+; NONEON-NOSVE-NEXT:    smov w15, v0.b[6]
+; NONEON-NOSVE-NEXT:    smov w17, v0.b[8]
+; NONEON-NOSVE-NEXT:    smov w2, v0.b[10]
+; NONEON-NOSVE-NEXT:    smov w3, v0.b[11]
+; NONEON-NOSVE-NEXT:    smov w4, v0.b[12]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    smov w9, v1.b[0]
+; NONEON-NOSVE-NEXT:    smov w5, v0.b[13]
+; NONEON-NOSVE-NEXT:    smov w6, v0.b[14]
+; NONEON-NOSVE-NEXT:    smov w1, v3.b[1]
+; NONEON-NOSVE-NEXT:    smov w7, v2.b[0]
+; NONEON-NOSVE-NEXT:    smov w19, v2.b[2]
+; NONEON-NOSVE-NEXT:    smov w20, v2.b[3]
+; NONEON-NOSVE-NEXT:    smov w21, v2.b[4]
+; NONEON-NOSVE-NEXT:    smov w22, v2.b[5]
+; NONEON-NOSVE-NEXT:    smov w23, v2.b[6]
+; NONEON-NOSVE-NEXT:    smov w24, v2.b[7]
+; NONEON-NOSVE-NEXT:    smov w25, v2.b[8]
+; NONEON-NOSVE-NEXT:    smov w26, v2.b[9]
+; NONEON-NOSVE-NEXT:    smov w27, v2.b[10]
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    smov w10, v1.b[2]
+; NONEON-NOSVE-NEXT:    sdiv w11, w11, w10
+; NONEON-NOSVE-NEXT:    smov w10, v1.b[3]
+; NONEON-NOSVE-NEXT:    fmov s5, w9
+; NONEON-NOSVE-NEXT:    smov w9, v3.b[11]
+; NONEON-NOSVE-NEXT:    mov v5.b[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w10, w12, w10
+; NONEON-NOSVE-NEXT:    smov w12, v1.b[4]
+; NONEON-NOSVE-NEXT:    mov v5.b[2], w11
+; NONEON-NOSVE-NEXT:    smov w11, v2.b[11]
+; NONEON-NOSVE-NEXT:    sdiv w13, w13, w12
+; NONEON-NOSVE-NEXT:    smov w12, v1.b[5]
+; NONEON-NOSVE-NEXT:    mov v5.b[3], w10
+; NONEON-NOSVE-NEXT:    smov w10, v3.b[12]
+; NONEON-NOSVE-NEXT:    sdiv w12, w14, w12
+; NONEON-NOSVE-NEXT:    smov w14, v1.b[6]
+; NONEON-NOSVE-NEXT:    mov v5.b[4], w13
+; NONEON-NOSVE-NEXT:    smov w13, v2.b[14]
+; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    smov w14, v1.b[7]
+; NONEON-NOSVE-NEXT:    smov w15, v0.b[7]
+; NONEON-NOSVE-NEXT:    mov v5.b[5], w12
+; NONEON-NOSVE-NEXT:    smov w12, v2.b[13]
+; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
+; NONEON-NOSVE-NEXT:    smov w15, v1.b[8]
+; NONEON-NOSVE-NEXT:    mov v5.b[6], w16
+; NONEON-NOSVE-NEXT:    sdiv w18, w17, w15
+; NONEON-NOSVE-NEXT:    smov w15, v1.b[9]
+; NONEON-NOSVE-NEXT:    smov w17, v0.b[9]
+; NONEON-NOSVE-NEXT:    mov v5.b[7], w14
+; NONEON-NOSVE-NEXT:    sdiv w17, w17, w15
+; NONEON-NOSVE-NEXT:    smov w15, v1.b[10]
+; NONEON-NOSVE-NEXT:    mov v5.b[8], w18
+; NONEON-NOSVE-NEXT:    sdiv w15, w2, w15
+; NONEON-NOSVE-NEXT:    smov w2, v1.b[11]
+; NONEON-NOSVE-NEXT:    mov v5.b[9], w17
+; NONEON-NOSVE-NEXT:    sdiv w2, w3, w2
+; NONEON-NOSVE-NEXT:    smov w3, v1.b[12]
+; NONEON-NOSVE-NEXT:    mov v5.b[10], w15
+; NONEON-NOSVE-NEXT:    sdiv w3, w4, w3
+; NONEON-NOSVE-NEXT:    smov w4, v1.b[13]
+; NONEON-NOSVE-NEXT:    mov v5.b[11], w2
+; NONEON-NOSVE-NEXT:    sdiv w4, w5, w4
+; NONEON-NOSVE-NEXT:    smov w5, v1.b[14]
+; NONEON-NOSVE-NEXT:    mov v5.b[12], w3
+; NONEON-NOSVE-NEXT:    sdiv w5, w6, w5
+; NONEON-NOSVE-NEXT:    smov w6, v2.b[1]
+; NONEON-NOSVE-NEXT:    mov v5.b[13], w4
+; NONEON-NOSVE-NEXT:    sdiv w1, w6, w1
+; NONEON-NOSVE-NEXT:    smov w6, v3.b[0]
+; NONEON-NOSVE-NEXT:    mov v5.b[14], w5
+; NONEON-NOSVE-NEXT:    sdiv w6, w7, w6
+; NONEON-NOSVE-NEXT:    smov w7, v3.b[2]
+; NONEON-NOSVE-NEXT:    sdiv w7, w19, w7
+; NONEON-NOSVE-NEXT:    smov w19, v3.b[3]
+; NONEON-NOSVE-NEXT:    fmov s4, w6
+; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
+; NONEON-NOSVE-NEXT:    sdiv w19, w20, w19
+; NONEON-NOSVE-NEXT:    smov w20, v3.b[4]
+; NONEON-NOSVE-NEXT:    mov v4.b[2], w7
+; NONEON-NOSVE-NEXT:    sdiv w20, w21, w20
+; NONEON-NOSVE-NEXT:    smov w21, v3.b[5]
+; NONEON-NOSVE-NEXT:    mov v4.b[3], w19
+; NONEON-NOSVE-NEXT:    sdiv w21, w22, w21
+; NONEON-NOSVE-NEXT:    smov w22, v3.b[6]
+; NONEON-NOSVE-NEXT:    mov v4.b[4], w20
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sdiv w22, w23, w22
+; NONEON-NOSVE-NEXT:    smov w23, v3.b[7]
+; NONEON-NOSVE-NEXT:    mov v4.b[5], w21
+; NONEON-NOSVE-NEXT:    sdiv w23, w24, w23
+; NONEON-NOSVE-NEXT:    smov w24, v3.b[8]
+; NONEON-NOSVE-NEXT:    mov v4.b[6], w22
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sdiv w24, w25, w24
+; NONEON-NOSVE-NEXT:    smov w25, v3.b[9]
+; NONEON-NOSVE-NEXT:    mov v4.b[7], w23
+; NONEON-NOSVE-NEXT:    sdiv w25, w26, w25
+; NONEON-NOSVE-NEXT:    smov w26, v3.b[10]
+; NONEON-NOSVE-NEXT:    mov v4.b[8], w24
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sdiv w8, w27, w26
+; NONEON-NOSVE-NEXT:    mov v4.b[9], w25
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sdiv w9, w11, w9
+; NONEON-NOSVE-NEXT:    smov w11, v2.b[12]
+; NONEON-NOSVE-NEXT:    mov v4.b[10], w8
+; NONEON-NOSVE-NEXT:    smov w8, v3.b[15]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    smov w11, v3.b[13]
+; NONEON-NOSVE-NEXT:    mov v4.b[11], w9
+; NONEON-NOSVE-NEXT:    smov w9, v1.b[15]
+; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    smov w12, v3.b[14]
+; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
+; NONEON-NOSVE-NEXT:    smov w10, v0.b[15]
+; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
+; NONEON-NOSVE-NEXT:    smov w13, v2.b[15]
+; NONEON-NOSVE-NEXT:    mov v4.b[13], w11
+; NONEON-NOSVE-NEXT:    sdiv w8, w13, w8
+; NONEON-NOSVE-NEXT:    mov v4.b[14], w12
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    mov v4.b[15], w8
+; NONEON-NOSVE-NEXT:    mov v5.b[15], w9
+; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    ldr x27, [sp], #80 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = sdiv <32 x i8> %op1, %op2
@@ -196,6 +487,23 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    shl v1.2s, v1.2s, #16
+; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    sshr v1.2s, v1.2s, #16
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    fmov w9, s0
+; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -212,6 +520,29 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
+; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
+; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    smov w10, v1.h[2]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
+; NONEON-NOSVE-NEXT:    fmov s0, w9
+; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w8, w12, w11
+; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
+; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -238,6 +569,43 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
+; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
+; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
+; NONEON-NOSVE-NEXT:    smov w13, v0.h[4]
+; NONEON-NOSVE-NEXT:    smov w14, v0.h[5]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    smov w10, v1.h[2]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
+; NONEON-NOSVE-NEXT:    fmov s2, w9
+; NONEON-NOSVE-NEXT:    smov w9, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    smov w12, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
+; NONEON-NOSVE-NEXT:    smov w10, v0.h[6]
+; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
+; NONEON-NOSVE-NEXT:    smov w13, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
+; NONEON-NOSVE-NEXT:    smov w11, v0.h[7]
+; NONEON-NOSVE-NEXT:    sdiv w8, w14, w13
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w12
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    smov w10, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    sdiv w8, w11, w10
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w9
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -278,6 +646,79 @@ define void @sdiv_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z3.h, p0, z3.h, z1.h
 ; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    smov w10, v0.h[0]
+; NONEON-NOSVE-NEXT:    smov w11, v0.h[2]
+; NONEON-NOSVE-NEXT:    smov w12, v0.h[3]
+; NONEON-NOSVE-NEXT:    smov w13, v0.h[4]
+; NONEON-NOSVE-NEXT:    smov w14, v0.h[5]
+; NONEON-NOSVE-NEXT:    smov w15, v0.h[6]
+; NONEON-NOSVE-NEXT:    smov w16, v2.h[1]
+; NONEON-NOSVE-NEXT:    smov w17, v2.h[0]
+; NONEON-NOSVE-NEXT:    smov w18, v2.h[2]
+; NONEON-NOSVE-NEXT:    smov w1, v2.h[3]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    smov w9, v1.h[0]
+; NONEON-NOSVE-NEXT:    smov w2, v2.h[4]
+; NONEON-NOSVE-NEXT:    smov w3, v2.h[5]
+; NONEON-NOSVE-NEXT:    smov w4, v2.h[6]
+; NONEON-NOSVE-NEXT:    sdiv w10, w10, w9
+; NONEON-NOSVE-NEXT:    smov w9, v1.h[2]
+; NONEON-NOSVE-NEXT:    sdiv w9, w11, w9
+; NONEON-NOSVE-NEXT:    smov w11, v1.h[3]
+; NONEON-NOSVE-NEXT:    fmov s5, w10
+; NONEON-NOSVE-NEXT:    smov w10, v3.h[7]
+; NONEON-NOSVE-NEXT:    mov v5.h[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    smov w12, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov v5.h[2], w9
+; NONEON-NOSVE-NEXT:    smov w9, v2.h[7]
+; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
+; NONEON-NOSVE-NEXT:    smov w13, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov v5.h[3], w11
+; NONEON-NOSVE-NEXT:    smov w11, v0.h[7]
+; NONEON-NOSVE-NEXT:    sdiv w13, w14, w13
+; NONEON-NOSVE-NEXT:    smov w14, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
+; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
+; NONEON-NOSVE-NEXT:    smov w15, v3.h[1]
+; NONEON-NOSVE-NEXT:    mov v5.h[5], w13
+; NONEON-NOSVE-NEXT:    sdiv w15, w16, w15
+; NONEON-NOSVE-NEXT:    smov w16, v3.h[0]
+; NONEON-NOSVE-NEXT:    mov v5.h[6], w14
+; NONEON-NOSVE-NEXT:    sdiv w16, w17, w16
+; NONEON-NOSVE-NEXT:    smov w17, v3.h[2]
+; NONEON-NOSVE-NEXT:    sdiv w17, w18, w17
+; NONEON-NOSVE-NEXT:    smov w18, v3.h[3]
+; NONEON-NOSVE-NEXT:    fmov s4, w16
+; NONEON-NOSVE-NEXT:    mov v4.h[1], w15
+; NONEON-NOSVE-NEXT:    sdiv w18, w1, w18
+; NONEON-NOSVE-NEXT:    smov w1, v3.h[4]
+; NONEON-NOSVE-NEXT:    mov v4.h[2], w17
+; NONEON-NOSVE-NEXT:    sdiv w1, w2, w1
+; NONEON-NOSVE-NEXT:    smov w2, v3.h[5]
+; NONEON-NOSVE-NEXT:    mov v4.h[3], w18
+; NONEON-NOSVE-NEXT:    sdiv w2, w3, w2
+; NONEON-NOSVE-NEXT:    smov w3, v3.h[6]
+; NONEON-NOSVE-NEXT:    mov v4.h[4], w1
+; NONEON-NOSVE-NEXT:    sdiv w8, w4, w3
+; NONEON-NOSVE-NEXT:    mov v4.h[5], w2
+; NONEON-NOSVE-NEXT:    sdiv w9, w9, w10
+; NONEON-NOSVE-NEXT:    smov w10, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    mov v4.h[7], w9
+; NONEON-NOSVE-NEXT:    mov v5.h[7], w10
+; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = sdiv <16 x i16> %op1, %op2
@@ -294,6 +735,21 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    fmov w9, s0
+; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -307,6 +763,26 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
+; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
+; NONEON-NOSVE-NEXT:    fmov w10, s0
+; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
+; NONEON-NOSVE-NEXT:    mov w12, v0.s[3]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    fmov w9, s1
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    mov w11, v1.s[3]
+; NONEON-NOSVE-NEXT:    fmov s0, w9
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w8, w12, w11
+; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
+; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -322,6 +798,45 @@ define void @sdiv_v8i32(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    sdiv z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
+; NONEON-NOSVE-NEXT:    fmov w10, s0
+; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
+; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
+; NONEON-NOSVE-NEXT:    mov w12, v2.s[1]
+; NONEON-NOSVE-NEXT:    fmov w13, s2
+; NONEON-NOSVE-NEXT:    mov w14, v2.s[2]
+; NONEON-NOSVE-NEXT:    mov w15, v2.s[3]
+; NONEON-NOSVE-NEXT:    mov w16, v0.s[3]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    fmov w9, s1
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    mov w11, v3.s[1]
+; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    fmov w12, s3
+; NONEON-NOSVE-NEXT:    sdiv w12, w13, w12
+; NONEON-NOSVE-NEXT:    mov w13, v3.s[2]
+; NONEON-NOSVE-NEXT:    sdiv w13, w14, w13
+; NONEON-NOSVE-NEXT:    mov w14, v3.s[3]
+; NONEON-NOSVE-NEXT:    fmov s0, w12
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w11
+; NONEON-NOSVE-NEXT:    sdiv w14, w15, w14
+; NONEON-NOSVE-NEXT:    mov w15, v1.s[3]
+; NONEON-NOSVE-NEXT:    fmov s1, w9
+; NONEON-NOSVE-NEXT:    mov v0.s[2], w13
+; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
+; NONEON-NOSVE-NEXT:    mov v1.s[2], w10
+; NONEON-NOSVE-NEXT:    sdiv w8, w16, w15
+; NONEON-NOSVE-NEXT:    mov v0.s[3], w14
+; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = sdiv <8 x i32> %op1, %op2
@@ -338,6 +853,16 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -351,6 +876,18 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    mov x10, v0.d[1]
+; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
+; NONEON-NOSVE-NEXT:    sdiv x9, x10, x9
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -366,6 +903,29 @@ define void @sdiv_v4i64(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    sdiv z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    mov x10, v2.d[1]
+; NONEON-NOSVE-NEXT:    fmov x11, d2
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
+; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    mov x9, v3.d[1]
+; NONEON-NOSVE-NEXT:    sdiv x9, x10, x9
+; NONEON-NOSVE-NEXT:    fmov x10, d3
+; NONEON-NOSVE-NEXT:    sdiv x10, x11, x10
+; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
+; NONEON-NOSVE-NEXT:    fmov d1, x8
+; NONEON-NOSVE-NEXT:    sdiv x11, x12, x11
+; NONEON-NOSVE-NEXT:    fmov d0, x10
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = sdiv <4 x i64> %op1, %op2
@@ -391,6 +951,37 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
+; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
+; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xff
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
+; NONEON-NOSVE-NEXT:    and w11, w11, #0xff
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xff
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    umov w10, v1.h[2]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
+; NONEON-NOSVE-NEXT:    fmov s0, w9
+; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
+; NONEON-NOSVE-NEXT:    and w9, w11, #0xff
+; NONEON-NOSVE-NEXT:    and w11, w12, #0xff
+; NONEON-NOSVE-NEXT:    udiv w8, w11, w9
+; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
+; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -418,6 +1009,45 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
+; NONEON-NOSVE-NEXT:    umov w10, v0.b[0]
+; NONEON-NOSVE-NEXT:    umov w11, v0.b[2]
+; NONEON-NOSVE-NEXT:    umov w12, v0.b[3]
+; NONEON-NOSVE-NEXT:    umov w13, v0.b[4]
+; NONEON-NOSVE-NEXT:    umov w14, v0.b[5]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    umov w9, v1.b[0]
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    umov w10, v1.b[2]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    umov w11, v1.b[3]
+; NONEON-NOSVE-NEXT:    fmov s2, w9
+; NONEON-NOSVE-NEXT:    umov w9, v1.b[6]
+; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
+; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    umov w12, v1.b[4]
+; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
+; NONEON-NOSVE-NEXT:    umov w10, v0.b[6]
+; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
+; NONEON-NOSVE-NEXT:    umov w13, v1.b[5]
+; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
+; NONEON-NOSVE-NEXT:    umov w11, v0.b[7]
+; NONEON-NOSVE-NEXT:    udiv w8, w14, w13
+; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    umov w10, v1.b[7]
+; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
+; NONEON-NOSVE-NEXT:    udiv w8, w11, w10
+; NONEON-NOSVE-NEXT:    mov v2.b[6], w9
+; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
+; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -465,6 +1095,75 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
+; NONEON-NOSVE-NEXT:    umov w10, v0.b[0]
+; NONEON-NOSVE-NEXT:    umov w11, v0.b[2]
+; NONEON-NOSVE-NEXT:    umov w12, v0.b[3]
+; NONEON-NOSVE-NEXT:    umov w13, v0.b[4]
+; NONEON-NOSVE-NEXT:    umov w14, v0.b[5]
+; NONEON-NOSVE-NEXT:    umov w15, v0.b[6]
+; NONEON-NOSVE-NEXT:    umov w16, v0.b[7]
+; NONEON-NOSVE-NEXT:    umov w17, v0.b[8]
+; NONEON-NOSVE-NEXT:    umov w18, v0.b[9]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    umov w9, v1.b[0]
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    umov w10, v1.b[2]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    umov w11, v1.b[3]
+; NONEON-NOSVE-NEXT:    fmov s2, w9
+; NONEON-NOSVE-NEXT:    umov w9, v1.b[10]
+; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
+; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    umov w12, v1.b[4]
+; NONEON-NOSVE-NEXT:    mov v2.b[2], w10
+; NONEON-NOSVE-NEXT:    umov w10, v0.b[10]
+; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
+; NONEON-NOSVE-NEXT:    umov w13, v1.b[5]
+; NONEON-NOSVE-NEXT:    mov v2.b[3], w11
+; NONEON-NOSVE-NEXT:    umov w11, v0.b[11]
+; NONEON-NOSVE-NEXT:    udiv w13, w14, w13
+; NONEON-NOSVE-NEXT:    umov w14, v1.b[6]
+; NONEON-NOSVE-NEXT:    mov v2.b[4], w12
+; NONEON-NOSVE-NEXT:    umov w12, v0.b[12]
+; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
+; NONEON-NOSVE-NEXT:    umov w15, v1.b[7]
+; NONEON-NOSVE-NEXT:    mov v2.b[5], w13
+; NONEON-NOSVE-NEXT:    umov w13, v0.b[13]
+; NONEON-NOSVE-NEXT:    udiv w15, w16, w15
+; NONEON-NOSVE-NEXT:    umov w16, v1.b[8]
+; NONEON-NOSVE-NEXT:    mov v2.b[6], w14
+; NONEON-NOSVE-NEXT:    udiv w16, w17, w16
+; NONEON-NOSVE-NEXT:    umov w17, v1.b[9]
+; NONEON-NOSVE-NEXT:    mov v2.b[7], w15
+; NONEON-NOSVE-NEXT:    udiv w8, w18, w17
+; NONEON-NOSVE-NEXT:    mov v2.b[8], w16
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    umov w10, v1.b[11]
+; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    umov w11, v1.b[12]
+; NONEON-NOSVE-NEXT:    mov v2.b[10], w9
+; NONEON-NOSVE-NEXT:    umov w9, v1.b[14]
+; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    umov w12, v1.b[13]
+; NONEON-NOSVE-NEXT:    mov v2.b[11], w10
+; NONEON-NOSVE-NEXT:    umov w10, v1.b[15]
+; NONEON-NOSVE-NEXT:    udiv w8, w13, w12
+; NONEON-NOSVE-NEXT:    umov w12, v0.b[14]
+; NONEON-NOSVE-NEXT:    mov v2.b[12], w11
+; NONEON-NOSVE-NEXT:    umov w11, v0.b[15]
+; NONEON-NOSVE-NEXT:    udiv w9, w12, w9
+; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
+; NONEON-NOSVE-NEXT:    udiv w8, w11, w10
+; NONEON-NOSVE-NEXT:    mov v2.b[14], w9
+; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -545,6 +1244,163 @@ define void @udiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z3.b, p0, z3.b, z1.b
 ; CHECK-NEXT:    stp q3, q2, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str x27, [sp, #-80]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
+; NONEON-NOSVE-NEXT:    umov w10, v0.b[0]
+; NONEON-NOSVE-NEXT:    umov w11, v0.b[2]
+; NONEON-NOSVE-NEXT:    umov w12, v0.b[3]
+; NONEON-NOSVE-NEXT:    umov w13, v0.b[4]
+; NONEON-NOSVE-NEXT:    umov w14, v0.b[5]
+; NONEON-NOSVE-NEXT:    umov w15, v0.b[6]
+; NONEON-NOSVE-NEXT:    umov w17, v0.b[8]
+; NONEON-NOSVE-NEXT:    umov w2, v0.b[10]
+; NONEON-NOSVE-NEXT:    umov w3, v0.b[11]
+; NONEON-NOSVE-NEXT:    umov w4, v0.b[12]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    umov w9, v1.b[0]
+; NONEON-NOSVE-NEXT:    umov w5, v0.b[13]
+; NONEON-NOSVE-NEXT:    umov w6, v0.b[14]
+; NONEON-NOSVE-NEXT:    umov w1, v3.b[1]
+; NONEON-NOSVE-NEXT:    umov w7, v2.b[0]
+; NONEON-NOSVE-NEXT:    umov w19, v2.b[2]
+; NONEON-NOSVE-NEXT:    umov w20, v2.b[3]
+; NONEON-NOSVE-NEXT:    umov w21, v2.b[4]
+; NONEON-NOSVE-NEXT:    umov w22, v2.b[5]
+; NONEON-NOSVE-NEXT:    umov w23, v2.b[6]
+; NONEON-NOSVE-NEXT:    umov w24, v2.b[7]
+; NONEON-NOSVE-NEXT:    umov w25, v2.b[8]
+; NONEON-NOSVE-NEXT:    umov w26, v2.b[9]
+; NONEON-NOSVE-NEXT:    umov w27, v2.b[10]
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    umov w10, v1.b[2]
+; NONEON-NOSVE-NEXT:    udiv w11, w11, w10
+; NONEON-NOSVE-NEXT:    umov w10, v1.b[3]
+; NONEON-NOSVE-NEXT:    fmov s5, w9
+; NONEON-NOSVE-NEXT:    umov w9, v3.b[11]
+; NONEON-NOSVE-NEXT:    mov v5.b[1], w8
+; NONEON-NOSVE-NEXT:    udiv w10, w12, w10
+; NONEON-NOSVE-NEXT:    umov w12, v1.b[4]
+; NONEON-NOSVE-NEXT:    mov v5.b[2], w11
+; NONEON-NOSVE-NEXT:    umov w11, v2.b[11]
+; NONEON-NOSVE-NEXT:    udiv w13, w13, w12
+; NONEON-NOSVE-NEXT:    umov w12, v1.b[5]
+; NONEON-NOSVE-NEXT:    mov v5.b[3], w10
+; NONEON-NOSVE-NEXT:    umov w10, v3.b[12]
+; NONEON-NOSVE-NEXT:    udiv w12, w14, w12
+; NONEON-NOSVE-NEXT:    umov w14, v1.b[6]
+; NONEON-NOSVE-NEXT:    mov v5.b[4], w13
+; NONEON-NOSVE-NEXT:    umov w13, v2.b[14]
+; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    umov w14, v1.b[7]
+; NONEON-NOSVE-NEXT:    umov w15, v0.b[7]
+; NONEON-NOSVE-NEXT:    mov v5.b[5], w12
+; NONEON-NOSVE-NEXT:    umov w12, v2.b[13]
+; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
+; NONEON-NOSVE-NEXT:    umov w15, v1.b[8]
+; NONEON-NOSVE-NEXT:    mov v5.b[6], w16
+; NONEON-NOSVE-NEXT:    udiv w18, w17, w15
+; NONEON-NOSVE-NEXT:    umov w15, v1.b[9]
+; NONEON-NOSVE-NEXT:    umov w17, v0.b[9]
+; NONEON-NOSVE-NEXT:    mov v5.b[7], w14
+; NONEON-NOSVE-NEXT:    udiv w17, w17, w15
+; NONEON-NOSVE-NEXT:    umov w15, v1.b[10]
+; NONEON-NOSVE-NEXT:    mov v5.b[8], w18
+; NONEON-NOSVE-NEXT:    udiv w15, w2, w15
+; NONEON-NOSVE-NEXT:    umov w2, v1.b[11]
+; NONEON-NOSVE-NEXT:    mov v5.b[9], w17
+; NONEON-NOSVE-NEXT:    udiv w2, w3, w2
+; NONEON-NOSVE-NEXT:    umov w3, v1.b[12]
+; NONEON-NOSVE-NEXT:    mov v5.b[10], w15
+; NONEON-NOSVE-NEXT:    udiv w3, w4, w3
+; NONEON-NOSVE-NEXT:    umov w4, v1.b[13]
+; NONEON-NOSVE-NEXT:    mov v5.b[11], w2
+; NONEON-NOSVE-NEXT:    udiv w4, w5, w4
+; NONEON-NOSVE-NEXT:    umov w5, v1.b[14]
+; NONEON-NOSVE-NEXT:    mov v5.b[12], w3
+; NONEON-NOSVE-NEXT:    udiv w5, w6, w5
+; NONEON-NOSVE-NEXT:    umov w6, v2.b[1]
+; NONEON-NOSVE-NEXT:    mov v5.b[13], w4
+; NONEON-NOSVE-NEXT:    udiv w1, w6, w1
+; NONEON-NOSVE-NEXT:    umov w6, v3.b[0]
+; NONEON-NOSVE-NEXT:    mov v5.b[14], w5
+; NONEON-NOSVE-NEXT:    udiv w6, w7, w6
+; NONEON-NOSVE-NEXT:    umov w7, v3.b[2]
+; NONEON-NOSVE-NEXT:    udiv w7, w19, w7
+; NONEON-NOSVE-NEXT:    umov w19, v3.b[3]
+; NONEON-NOSVE-NEXT:    fmov s4, w6
+; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
+; NONEON-NOSVE-NEXT:    udiv w19, w20, w19
+; NONEON-NOSVE-NEXT:    umov w20, v3.b[4]
+; NONEON-NOSVE-NEXT:    mov v4.b[2], w7
+; NONEON-NOSVE-NEXT:    udiv w20, w21, w20
+; NONEON-NOSVE-NEXT:    umov w21, v3.b[5]
+; NONEON-NOSVE-NEXT:    mov v4.b[3], w19
+; NONEON-NOSVE-NEXT:    udiv w21, w22, w21
+; NONEON-NOSVE-NEXT:    umov w22, v3.b[6]
+; NONEON-NOSVE-NEXT:    mov v4.b[4], w20
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    udiv w22, w23, w22
+; NONEON-NOSVE-NEXT:    umov w23, v3.b[7]
+; NONEON-NOSVE-NEXT:    mov v4.b[5], w21
+; NONEON-NOSVE-NEXT:    udiv w23, w24, w23
+; NONEON-NOSVE-NEXT:    umov w24, v3.b[8]
+; NONEON-NOSVE-NEXT:    mov v4.b[6], w22
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    udiv w24, w25, w24
+; NONEON-NOSVE-NEXT:    umov w25, v3.b[9]
+; NONEON-NOSVE-NEXT:    mov v4.b[7], w23
+; NONEON-NOSVE-NEXT:    udiv w25, w26, w25
+; NONEON-NOSVE-NEXT:    umov w26, v3.b[10]
+; NONEON-NOSVE-NEXT:    mov v4.b[8], w24
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    udiv w8, w27, w26
+; NONEON-NOSVE-NEXT:    mov v4.b[9], w25
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    udiv w9, w11, w9
+; NONEON-NOSVE-NEXT:    umov w11, v2.b[12]
+; NONEON-NOSVE-NEXT:    mov v4.b[10], w8
+; NONEON-NOSVE-NEXT:    umov w8, v3.b[15]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    umov w11, v3.b[13]
+; NONEON-NOSVE-NEXT:    mov v4.b[11], w9
+; NONEON-NOSVE-NEXT:    umov w9, v1.b[15]
+; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    umov w12, v3.b[14]
+; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
+; NONEON-NOSVE-NEXT:    umov w10, v0.b[15]
+; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
+; NONEON-NOSVE-NEXT:    umov w13, v2.b[15]
+; NONEON-NOSVE-NEXT:    mov v4.b[13], w11
+; NONEON-NOSVE-NEXT:    udiv w8, w13, w8
+; NONEON-NOSVE-NEXT:    mov v4.b[14], w12
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    mov v4.b[15], w8
+; NONEON-NOSVE-NEXT:    mov v5.b[15], w9
+; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    ldr x27, [sp], #80 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = udiv <32 x i8> %op1, %op2
@@ -563,6 +1419,22 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
+; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    fmov w9, s0
+; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -579,6 +1451,29 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
+; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
+; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    umov w10, v1.h[2]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
+; NONEON-NOSVE-NEXT:    fmov s0, w9
+; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
+; NONEON-NOSVE-NEXT:    udiv w8, w12, w11
+; NONEON-NOSVE-NEXT:    mov v0.h[2], w10
+; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -605,6 +1500,43 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
+; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
+; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
+; NONEON-NOSVE-NEXT:    umov w13, v0.h[4]
+; NONEON-NOSVE-NEXT:    umov w14, v0.h[5]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    umov w10, v1.h[2]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
+; NONEON-NOSVE-NEXT:    fmov s2, w9
+; NONEON-NOSVE-NEXT:    umov w9, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
+; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    umov w12, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w10
+; NONEON-NOSVE-NEXT:    umov w10, v0.h[6]
+; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
+; NONEON-NOSVE-NEXT:    umov w13, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w11
+; NONEON-NOSVE-NEXT:    umov w11, v0.h[7]
+; NONEON-NOSVE-NEXT:    udiv w8, w14, w13
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w12
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    umov w10, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    udiv w8, w11, w10
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w9
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -645,6 +1577,79 @@ define void @udiv_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z3.h, p0, z3.h, z1.h
 ; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    umov w10, v0.h[0]
+; NONEON-NOSVE-NEXT:    umov w11, v0.h[2]
+; NONEON-NOSVE-NEXT:    umov w12, v0.h[3]
+; NONEON-NOSVE-NEXT:    umov w13, v0.h[4]
+; NONEON-NOSVE-NEXT:    umov w14, v0.h[5]
+; NONEON-NOSVE-NEXT:    umov w15, v0.h[6]
+; NONEON-NOSVE-NEXT:    umov w16, v2.h[1]
+; NONEON-NOSVE-NEXT:    umov w17, v2.h[0]
+; NONEON-NOSVE-NEXT:    umov w18, v2.h[2]
+; NONEON-NOSVE-NEXT:    umov w1, v2.h[3]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    umov w9, v1.h[0]
+; NONEON-NOSVE-NEXT:    umov w2, v2.h[4]
+; NONEON-NOSVE-NEXT:    umov w3, v2.h[5]
+; NONEON-NOSVE-NEXT:    umov w4, v2.h[6]
+; NONEON-NOSVE-NEXT:    udiv w10, w10, w9
+; NONEON-NOSVE-NEXT:    umov w9, v1.h[2]
+; NONEON-NOSVE-NEXT:    udiv w9, w11, w9
+; NONEON-NOSVE-NEXT:    umov w11, v1.h[3]
+; NONEON-NOSVE-NEXT:    fmov s5, w10
+; NONEON-NOSVE-NEXT:    umov w10, v3.h[7]
+; NONEON-NOSVE-NEXT:    mov v5.h[1], w8
+; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    umov w12, v1.h[4]
+; NONEON-NOSVE-NEXT:    mov v5.h[2], w9
+; NONEON-NOSVE-NEXT:    umov w9, v2.h[7]
+; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
+; NONEON-NOSVE-NEXT:    umov w13, v1.h[5]
+; NONEON-NOSVE-NEXT:    mov v5.h[3], w11
+; NONEON-NOSVE-NEXT:    umov w11, v0.h[7]
+; NONEON-NOSVE-NEXT:    udiv w13, w14, w13
+; NONEON-NOSVE-NEXT:    umov w14, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
+; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
+; NONEON-NOSVE-NEXT:    umov w15, v3.h[1]
+; NONEON-NOSVE-NEXT:    mov v5.h[5], w13
+; NONEON-NOSVE-NEXT:    udiv w15, w16, w15
+; NONEON-NOSVE-NEXT:    umov w16, v3.h[0]
+; NONEON-NOSVE-NEXT:    mov v5.h[6], w14
+; NONEON-NOSVE-NEXT:    udiv w16, w17, w16
+; NONEON-NOSVE-NEXT:    umov w17, v3.h[2]
+; NONEON-NOSVE-NEXT:    udiv w17, w18, w17
+; NONEON-NOSVE-NEXT:    umov w18, v3.h[3]
+; NONEON-NOSVE-NEXT:    fmov s4, w16
+; NONEON-NOSVE-NEXT:    mov v4.h[1], w15
+; NONEON-NOSVE-NEXT:    udiv w18, w1, w18
+; NONEON-NOSVE-NEXT:    umov w1, v3.h[4]
+; NONEON-NOSVE-NEXT:    mov v4.h[2], w17
+; NONEON-NOSVE-NEXT:    udiv w1, w2, w1
+; NONEON-NOSVE-NEXT:    umov w2, v3.h[5]
+; NONEON-NOSVE-NEXT:    mov v4.h[3], w18
+; NONEON-NOSVE-NEXT:    udiv w2, w3, w2
+; NONEON-NOSVE-NEXT:    umov w3, v3.h[6]
+; NONEON-NOSVE-NEXT:    mov v4.h[4], w1
+; NONEON-NOSVE-NEXT:    udiv w8, w4, w3
+; NONEON-NOSVE-NEXT:    mov v4.h[5], w2
+; NONEON-NOSVE-NEXT:    udiv w9, w9, w10
+; NONEON-NOSVE-NEXT:    umov w10, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    mov v4.h[7], w9
+; NONEON-NOSVE-NEXT:    mov v5.h[7], w10
+; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = udiv <16 x i16> %op1, %op2
@@ -661,6 +1666,21 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    fmov w9, s0
+; NONEON-NOSVE-NEXT:    mov w10, v0.s[1]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    mov w9, v1.s[1]
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -674,6 +1694,26 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
+; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
+; NONEON-NOSVE-NEXT:    fmov w10, s0
+; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
+; NONEON-NOSVE-NEXT:    mov w12, v0.s[3]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    fmov w9, s1
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    mov w11, v1.s[3]
+; NONEON-NOSVE-NEXT:    fmov s0, w9
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
+; NONEON-NOSVE-NEXT:    udiv w8, w12, w11
+; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
+; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -689,6 +1729,45 @@ define void @udiv_v8i32(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    udiv z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
+; NONEON-NOSVE-NEXT:    fmov w10, s0
+; NONEON-NOSVE-NEXT:    mov w11, v0.s[2]
+; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
+; NONEON-NOSVE-NEXT:    mov w12, v2.s[1]
+; NONEON-NOSVE-NEXT:    fmov w13, s2
+; NONEON-NOSVE-NEXT:    mov w14, v2.s[2]
+; NONEON-NOSVE-NEXT:    mov w15, v2.s[3]
+; NONEON-NOSVE-NEXT:    mov w16, v0.s[3]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    fmov w9, s1
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    mov w10, v1.s[2]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    mov w11, v3.s[1]
+; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    fmov w12, s3
+; NONEON-NOSVE-NEXT:    udiv w12, w13, w12
+; NONEON-NOSVE-NEXT:    mov w13, v3.s[2]
+; NONEON-NOSVE-NEXT:    udiv w13, w14, w13
+; NONEON-NOSVE-NEXT:    mov w14, v3.s[3]
+; NONEON-NOSVE-NEXT:    fmov s0, w12
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w11
+; NONEON-NOSVE-NEXT:    udiv w14, w15, w14
+; NONEON-NOSVE-NEXT:    mov w15, v1.s[3]
+; NONEON-NOSVE-NEXT:    fmov s1, w9
+; NONEON-NOSVE-NEXT:    mov v0.s[2], w13
+; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
+; NONEON-NOSVE-NEXT:    mov v1.s[2], w10
+; NONEON-NOSVE-NEXT:    udiv w8, w16, w15
+; NONEON-NOSVE-NEXT:    mov v0.s[3], w14
+; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = udiv <8 x i32> %op1, %op2
@@ -705,6 +1784,16 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -718,6 +1807,18 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    mov x10, v0.d[1]
+; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
+; NONEON-NOSVE-NEXT:    udiv x9, x10, x9
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -733,6 +1834,29 @@ define void @udiv_v4i64(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    udiv z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    mov x10, v2.d[1]
+; NONEON-NOSVE-NEXT:    fmov x11, d2
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
+; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    mov x9, v3.d[1]
+; NONEON-NOSVE-NEXT:    udiv x9, x10, x9
+; NONEON-NOSVE-NEXT:    fmov x10, d3
+; NONEON-NOSVE-NEXT:    udiv x10, x11, x10
+; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
+; NONEON-NOSVE-NEXT:    fmov d1, x8
+; NONEON-NOSVE-NEXT:    udiv x11, x12, x11
+; NONEON-NOSVE-NEXT:    fmov d0, x10
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = udiv <4 x i64> %op1, %op2
@@ -778,6 +1902,27 @@ define void @udiv_constantsplat_v8i32(ptr %a)  {
 ; SVE2-NEXT:    lsr z0.s, z0.s, #6
 ; SVE2-NEXT:    stp q1, q0, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #8969 // =0x2309
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    movk w8, #22765, lsl #16
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    umull2 v3.2d, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    umull v4.2d, v1.2s, v0.2s
+; NONEON-NOSVE-NEXT:    umull2 v5.2d, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    umull v0.2d, v2.2s, v0.2s
+; NONEON-NOSVE-NEXT:    uzp2 v3.4s, v4.4s, v3.4s
+; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v5.4s
+; NONEON-NOSVE-NEXT:    sub v1.4s, v1.4s, v3.4s
+; NONEON-NOSVE-NEXT:    sub v2.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    usra v3.4s, v1.4s, #1
+; NONEON-NOSVE-NEXT:    usra v0.4s, v2.4s, #1
+; NONEON-NOSVE-NEXT:    ushr v1.4s, v3.4s, #6
+; NONEON-NOSVE-NEXT:    ushr v0.4s, v0.4s, #6
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>
   store <8 x i32> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
index c7a89612d278..e320fed2a498 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -26,6 +27,22 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) {
 ; CHECK-NEXT:    asr z0.s, z0.s, #31
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v8i1_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    shl v0.4s, v0.4s, #31
+; NONEON-NOSVE-NEXT:    shl v1.4s, v1.4s, #31
+; NONEON-NOSVE-NEXT:    cmlt v0.4s, v0.4s, #0
+; NONEON-NOSVE-NEXT:    cmlt v1.4s, v1.4s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i1> %a to <8 x i32>
   store <8 x i32> %b, ptr %out
   ret void
@@ -52,6 +69,22 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) {
 ; CHECK-NEXT:    asr z0.d, z0.d, #61
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v4i3_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    shl v0.2d, v0.2d, #61
+; NONEON-NOSVE-NEXT:    shl v1.2d, v1.2d, #61
+; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #61
+; NONEON-NOSVE-NEXT:    sshr v1.2d, v1.2d, #61
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i3> %a to <4 x i64>
   store <4 x i64> %b, ptr %out
   ret void
@@ -70,6 +103,17 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v16i8_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i16>
   store <16 x i16>%b, ptr %out
   ret void
@@ -91,6 +135,24 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v32i8_v32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
+; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    sshll v2.8h, v2.8b, #0
+; NONEON-NOSVE-NEXT:    sshll v3.8h, v3.8b, #0
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = sext <32 x i8> %b to <32 x i16>
@@ -112,6 +174,18 @@ define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v8i8_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i8> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
   ret void
@@ -133,6 +207,25 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v16i8_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, ptr %out
   ret void
@@ -167,6 +260,40 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v32i8_v32i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
+; NONEON-NOSVE-NEXT:    sshll v2.8h, v2.8b, #0
+; NONEON-NOSVE-NEXT:    sshll v3.8h, v3.8b, #0
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
+; NONEON-NOSVE-NEXT:    sshll v5.4s, v5.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v4.4s, v4.4h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v2.4s, v6.4h, #0
+; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v3.4s, v7.4h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = sext <32 x i8> %b to <32 x i32>
@@ -194,6 +321,22 @@ define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v4i8_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    shl v0.2d, v0.2d, #56
+; NONEON-NOSVE-NEXT:    shl v1.2d, v1.2d, #56
+; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #56
+; NONEON-NOSVE-NEXT:    sshr v1.2d, v1.2d, #56
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i8> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -216,6 +359,26 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v8i8_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i8> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
   ret void
@@ -253,6 +416,41 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q1, q4, [x0, #32]
 ; CHECK-NEXT:    stp q0, q2, [x0, #96]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v16i8_v16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-112]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 112
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    sshll v1.8h, v1.8b, #0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #40]
+; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d5, [sp, #72]
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr d6, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
+; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
+; NONEON-NOSVE-NEXT:    stp q1, q5, [x0, #64]
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v2.2d, v6.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q4, [x0]
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v3.2d, v7.2s, #0
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #112
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, ptr %out
   ret void
@@ -321,6 +519,73 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q0, q2, [x1, #224]
 ; CHECK-NEXT:    stp q3, q1, [x1, #96]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v32i8_v32i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #224
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    sshll v5.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    sshll v6.8h, v1.8b, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v3.8h, v2.8b, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    sshll v4.8h, v2.8b, #0
+; NONEON-NOSVE-NEXT:    stp q3, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    sshll v5.4s, v5.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #64]
+; NONEON-NOSVE-NEXT:    sshll v6.4s, v6.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v4.4s, v4.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
+; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v7.4s, v7.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    stp q2, q5, [sp, #128]
+; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d19, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr d20, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp q1, q4, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr d17, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr d21, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp q7, q6, [sp, #192]
+; NONEON-NOSVE-NEXT:    sshll v6.2d, v6.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v19.2d, v19.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d16, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr d22, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldr d23, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldr d18, [sp, #168]
+; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v16.2d, v16.2s, #0
+; NONEON-NOSVE-NEXT:    stp q5, q19, [x1]
+; NONEON-NOSVE-NEXT:    sshll v5.2d, v7.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v7.2d, v22.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    stp q6, q16, [x1, #128]
+; NONEON-NOSVE-NEXT:    sshll v6.2d, v23.2s, #0
+; NONEON-NOSVE-NEXT:    stp q5, q7, [x1, #160]
+; NONEON-NOSVE-NEXT:    sshll v5.2d, v20.2s, #0
+; NONEON-NOSVE-NEXT:    stp q4, q6, [x1, #192]
+; NONEON-NOSVE-NEXT:    sshll v4.2d, v21.2s, #0
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #32]
+; NONEON-NOSVE-NEXT:    sshll v2.2d, v17.2s, #0
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #64]
+; NONEON-NOSVE-NEXT:    sshll v3.2d, v18.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #224]
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = sext <32 x i8> %b to <32 x i64>
@@ -341,6 +606,17 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v8i16_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i16> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
   ret void
@@ -361,6 +637,24 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v16i16_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
   %c = sext <16 x i16> %b to <16 x i32>
@@ -382,6 +676,18 @@ define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v4i16_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i16> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -403,6 +709,25 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v8i16_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
   ret void
@@ -437,6 +762,40 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v16i16_v16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
+; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v2.2d, v6.2s, #0
+; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v3.2d, v7.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
   %c = sext <16 x i16> %b to <16 x i64>
@@ -457,6 +816,17 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v4i32_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i32> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -477,6 +847,24 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v8i32_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = add <8 x i32> %a, %a
   %c = sext <8 x i32> %b to <8 x i64>
@@ -497,6 +885,17 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v16i8_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i16>
   store <16 x i16>%b, ptr %out
   ret void
@@ -518,6 +917,24 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v32i8_v32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
+; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
+; NONEON-NOSVE-NEXT:    ushll v3.8h, v3.8b, #0
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = zext <32 x i8> %b to <32 x i16>
@@ -539,6 +956,18 @@ define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v8i8_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i8> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
   ret void
@@ -560,6 +989,25 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v16i8_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, ptr %out
   ret void
@@ -594,6 +1042,40 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v32i8_v32i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
+; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
+; NONEON-NOSVE-NEXT:    ushll v3.8h, v3.8b, #0
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
+; NONEON-NOSVE-NEXT:    ushll v5.4s, v5.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v4.4s, v4.4h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v2.4s, v6.4h, #0
+; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v3.4s, v7.4h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = zext <32 x i8> %b to <32 x i32>
@@ -619,6 +1101,20 @@ define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v4i8_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d1, #0xff00ff00ff00ff
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i8> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -641,6 +1137,26 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v8i8_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i8> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
   ret void
@@ -678,6 +1194,41 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q1, q4, [x0, #32]
 ; CHECK-NEXT:    stp q0, q2, [x0, #96]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v16i8_v16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-112]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 112
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    ushll v1.8h, v1.8b, #0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d5, [sp, #72]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr d6, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
+; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
+; NONEON-NOSVE-NEXT:    stp q1, q5, [x0, #64]
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v2.2d, v6.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q4, [x0]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v3.2d, v7.2s, #0
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #112
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, ptr %out
   ret void
@@ -746,6 +1297,73 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q0, q2, [x1, #224]
 ; CHECK-NEXT:    stp q3, q1, [x1, #96]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v32i8_v32i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #224
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ushll v5.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    ushll v6.8h, v1.8b, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v3.8h, v2.8b, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ushll v4.8h, v2.8b, #0
+; NONEON-NOSVE-NEXT:    stp q3, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ushll v5.4s, v5.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #64]
+; NONEON-NOSVE-NEXT:    ushll v6.4s, v6.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v4.4s, v4.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d7, [sp, #88]
+; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v7.4s, v7.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    stp q2, q5, [sp, #128]
+; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d19, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr d20, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp q1, q4, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr d17, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr d21, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp q7, q6, [sp, #192]
+; NONEON-NOSVE-NEXT:    ushll v6.2d, v6.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v19.2d, v19.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d16, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr d22, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldr d23, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldr d18, [sp, #168]
+; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v16.2d, v16.2s, #0
+; NONEON-NOSVE-NEXT:    stp q5, q19, [x1]
+; NONEON-NOSVE-NEXT:    ushll v5.2d, v7.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v7.2d, v22.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    stp q6, q16, [x1, #128]
+; NONEON-NOSVE-NEXT:    ushll v6.2d, v23.2s, #0
+; NONEON-NOSVE-NEXT:    stp q5, q7, [x1, #160]
+; NONEON-NOSVE-NEXT:    ushll v5.2d, v20.2s, #0
+; NONEON-NOSVE-NEXT:    stp q4, q6, [x1, #192]
+; NONEON-NOSVE-NEXT:    ushll v4.2d, v21.2s, #0
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #32]
+; NONEON-NOSVE-NEXT:    ushll v2.2d, v17.2s, #0
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #64]
+; NONEON-NOSVE-NEXT:    ushll v3.2d, v18.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #224]
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = zext <32 x i8> %b to <32 x i64>
@@ -766,6 +1384,17 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v8i16_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i16> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
   ret void
@@ -786,6 +1415,24 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v16i16_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
   %c = zext <16 x i16> %b to <16 x i32>
@@ -807,6 +1454,18 @@ define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v4i16_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i16> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -828,6 +1487,25 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v8i16_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
   ret void
@@ -862,6 +1540,40 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v16i16_v16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d7, [sp, #72]
+; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v2.2d, v6.2s, #0
+; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v3.2d, v7.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
   %c = zext <16 x i16> %b to <16 x i64>
@@ -882,6 +1594,17 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v4i32_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i32> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -902,6 +1625,24 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v8i32_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = add <8 x i32> %a, %a
   %c = zext <8 x i32> %b to <8 x i64>
@@ -928,6 +1669,21 @@ define void @extend_and_mul(i32 %0, <2 x i64> %1, ptr %2) {
 ; SVE2-NEXT:    mul z0.d, z1.d, z0.d
 ; SVE2-NEXT:    str q0, [x1]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extend_and_mul:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v1.2s, w0
+; NONEON-NOSVE-NEXT:    fmov x10, d0
+; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    fmov x11, d1
+; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
+; NONEON-NOSVE-NEXT:    mul x10, x11, x10
+; NONEON-NOSVE-NEXT:    mul x8, x9, x8
+; NONEON-NOSVE-NEXT:    fmov d0, x10
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0
   %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer
   %4 = zext <2 x i32> %broadcast.splat3 to <2 x i64>
@@ -943,6 +1699,13 @@ define void @extend_no_mul(i32 %0, <2 x i64> %1, ptr %2) {
 ; CHECK-NEXT:    mov z0.d, x8
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extend_no_mul:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    dup v0.2s, w0
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0
   %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
index f028b3eeca25..d86cfcbfb4f6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -22,6 +23,15 @@ define void @add_v32i8(ptr %a) {
 ; CHECK-NEXT:    add z1.b, z1.b, #7 // =0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -38,6 +48,16 @@ define void @add_v16i16(ptr %a) {
 ; CHECK-NEXT:    add z1.h, z1.h, #15 // =0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -54,6 +74,16 @@ define void @add_v8i32(ptr %a) {
 ; CHECK-NEXT:    add z1.s, z1.s, #31 // =0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -70,6 +100,16 @@ define void @add_v4i64(ptr %a) {
 ; CHECK-NEXT:    add z1.d, z1.d, #63 // =0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    add v1.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    add v0.2d, v2.2d, v0.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -90,6 +130,15 @@ define void @and_v32i8(ptr %a) {
 ; CHECK-NEXT:    and z1.b, z1.b, #0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -106,6 +155,16 @@ define void @and_v16i16(ptr %a) {
 ; CHECK-NEXT:    and z1.h, z1.h, #0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -122,6 +181,16 @@ define void @and_v8i32(ptr %a) {
 ; CHECK-NEXT:    and z1.s, z1.s, #0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -138,6 +207,16 @@ define void @and_v4i64(ptr %a) {
 ; CHECK-NEXT:    and z1.d, z1.d, #0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    and v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -158,6 +237,14 @@ define void @ashr_v32i8(ptr %a) {
 ; CHECK-NEXT:    asr z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
+; NONEON-NOSVE-NEXT:    cmlt v1.16b, v1.16b, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -174,6 +261,14 @@ define void @ashr_v16i16(ptr %a) {
 ; CHECK-NEXT:    asr z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cmlt v0.8h, v0.8h, #0
+; NONEON-NOSVE-NEXT:    cmlt v1.8h, v1.8h, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -190,6 +285,14 @@ define void @ashr_v8i32(ptr %a) {
 ; CHECK-NEXT:    asr z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cmlt v0.4s, v0.4s, #0
+; NONEON-NOSVE-NEXT:    cmlt v1.4s, v1.4s, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -206,6 +309,14 @@ define void @ashr_v4i64(ptr %a) {
 ; CHECK-NEXT:    asr z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cmlt v0.2d, v0.2d, #0
+; NONEON-NOSVE-NEXT:    cmlt v1.2d, v1.2d, #0
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -229,6 +340,15 @@ define void @icmp_eq_v32i8(ptr %a) {
 ; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    cmeq v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    cmeq v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -249,6 +369,16 @@ define void @icmp_sge_v16i16(ptr %a) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_sge_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    cmge v1.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    cmge v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -269,6 +399,16 @@ define void @icmp_sgt_v8i32(ptr %a) {
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_sgt_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #-8 // =0xfffffff8
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    cmgt v1.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    cmgt v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 -8, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -289,6 +429,16 @@ define void @icmp_ult_v4i64(ptr %a) {
 ; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_ult_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    cmhi v1.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    cmhi v0.2d, v0.2d, v2.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -310,6 +460,14 @@ define void @lshr_v32i8(ptr %a) {
 ; CHECK-NEXT:    lsr z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ushr v0.16b, v0.16b, #7
+; NONEON-NOSVE-NEXT:    ushr v1.16b, v1.16b, #7
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -326,6 +484,14 @@ define void @lshr_v16i16(ptr %a) {
 ; CHECK-NEXT:    lsr z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ushr v0.8h, v0.8h, #15
+; NONEON-NOSVE-NEXT:    ushr v1.8h, v1.8h, #15
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -342,6 +508,14 @@ define void @lshr_v8i32(ptr %a) {
 ; CHECK-NEXT:    lsr z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ushr v0.4s, v0.4s, #31
+; NONEON-NOSVE-NEXT:    ushr v1.4s, v1.4s, #31
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -358,6 +532,14 @@ define void @lshr_v4i64(ptr %a) {
 ; CHECK-NEXT:    lsr z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ushr v0.2d, v0.2d, #63
+; NONEON-NOSVE-NEXT:    ushr v1.2d, v1.2d, #63
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -378,6 +560,15 @@ define void @mul_v32i8(ptr %a) {
 ; CHECK-NEXT:    mul z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    mul v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    mul v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -394,6 +585,16 @@ define void @mul_v16i16(ptr %a) {
 ; CHECK-NEXT:    mul z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    mul v1.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    mul v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -410,6 +611,16 @@ define void @mul_v8i32(ptr %a) {
 ; CHECK-NEXT:    mul z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    mul v1.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    mul v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -426,6 +637,28 @@ define void @mul_v4i64(ptr %a) {
 ; CHECK-NEXT:    mul z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    fmov x10, d0
+; NONEON-NOSVE-NEXT:    fmov x11, d1
+; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
+; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
+; NONEON-NOSVE-NEXT:    lsl x12, x10, #6
+; NONEON-NOSVE-NEXT:    lsl x13, x11, #6
+; NONEON-NOSVE-NEXT:    lsl x14, x8, #6
+; NONEON-NOSVE-NEXT:    sub x10, x12, x10
+; NONEON-NOSVE-NEXT:    sub x11, x13, x11
+; NONEON-NOSVE-NEXT:    lsl x12, x9, #6
+; NONEON-NOSVE-NEXT:    fmov d0, x10
+; NONEON-NOSVE-NEXT:    fmov d1, x11
+; NONEON-NOSVE-NEXT:    sub x8, x14, x8
+; NONEON-NOSVE-NEXT:    sub x9, x12, x9
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x8
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x9
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -446,6 +679,15 @@ define void @or_v32i8(ptr %a) {
 ; CHECK-NEXT:    orr z1.b, z1.b, #0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -462,6 +704,16 @@ define void @or_v16i16(ptr %a) {
 ; CHECK-NEXT:    orr z1.h, z1.h, #0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -478,6 +730,16 @@ define void @or_v8i32(ptr %a) {
 ; CHECK-NEXT:    orr z1.s, z1.s, #0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -494,6 +756,16 @@ define void @or_v4i64(ptr %a) {
 ; CHECK-NEXT:    orr z1.d, z1.d, #0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    orr v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    orr v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -514,6 +786,14 @@ define void @shl_v32i8(ptr %a) {
 ; CHECK-NEXT:    lsl z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
+; NONEON-NOSVE-NEXT:    shl v1.16b, v1.16b, #7
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -530,6 +810,14 @@ define void @shl_v16i16(ptr %a) {
 ; CHECK-NEXT:    lsl z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    shl v0.8h, v0.8h, #15
+; NONEON-NOSVE-NEXT:    shl v1.8h, v1.8h, #15
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -546,6 +834,14 @@ define void @shl_v8i32(ptr %a) {
 ; CHECK-NEXT:    lsl z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    shl v0.4s, v0.4s, #31
+; NONEON-NOSVE-NEXT:    shl v1.4s, v1.4s, #31
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -562,6 +858,14 @@ define void @shl_v4i64(ptr %a) {
 ; CHECK-NEXT:    lsl z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    shl v0.2d, v0.2d, #63
+; NONEON-NOSVE-NEXT:    shl v1.2d, v1.2d, #63
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -582,6 +886,15 @@ define void @smax_v32i8(ptr %a) {
 ; CHECK-NEXT:    smax z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    smax v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    smax v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -598,6 +911,16 @@ define void @smax_v16i16(ptr %a) {
 ; CHECK-NEXT:    smax z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    smax v1.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    smax v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -614,6 +937,16 @@ define void @smax_v8i32(ptr %a) {
 ; CHECK-NEXT:    smax z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    smax v1.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    smax v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -630,6 +963,18 @@ define void @smax_v4i64(ptr %a) {
 ; CHECK-NEXT:    smax z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    cmgt v3.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    cmgt v4.2d, v2.2d, v0.2d
+; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
+; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -650,6 +995,15 @@ define void @smin_v32i8(ptr %a) {
 ; CHECK-NEXT:    smin z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    smin v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    smin v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -666,6 +1020,16 @@ define void @smin_v16i16(ptr %a) {
 ; CHECK-NEXT:    smin z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    smin v1.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    smin v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -682,6 +1046,16 @@ define void @smin_v8i32(ptr %a) {
 ; CHECK-NEXT:    smin z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    smin v1.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    smin v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -698,6 +1072,18 @@ define void @smin_v4i64(ptr %a) {
 ; CHECK-NEXT:    smin z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    cmgt v3.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    cmgt v4.2d, v0.2d, v2.2d
+; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
+; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -718,6 +1104,15 @@ define void @sub_v32i8(ptr %a) {
 ; CHECK-NEXT:    sub z1.b, z1.b, #7 // =0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    sub v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -734,6 +1129,16 @@ define void @sub_v16i16(ptr %a) {
 ; CHECK-NEXT:    sub z1.h, z1.h, #15 // =0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    sub v1.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    sub v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -750,6 +1155,16 @@ define void @sub_v8i32(ptr %a) {
 ; CHECK-NEXT:    sub z1.s, z1.s, #31 // =0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    sub v1.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sub v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -766,6 +1181,16 @@ define void @sub_v4i64(ptr %a) {
 ; CHECK-NEXT:    sub z1.d, z1.d, #63 // =0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    sub v1.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sub v0.2d, v2.2d, v0.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -786,6 +1211,15 @@ define void @umax_v32i8(ptr %a) {
 ; CHECK-NEXT:    umax z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    umax v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    umax v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -802,6 +1236,16 @@ define void @umax_v16i16(ptr %a) {
 ; CHECK-NEXT:    umax z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    umax v1.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    umax v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -818,6 +1262,16 @@ define void @umax_v8i32(ptr %a) {
 ; CHECK-NEXT:    umax z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    umax v1.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    umax v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -834,6 +1288,18 @@ define void @umax_v4i64(ptr %a) {
 ; CHECK-NEXT:    umax z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    cmhi v3.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    cmhi v4.2d, v2.2d, v0.2d
+; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
+; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -854,6 +1320,15 @@ define void @umin_v32i8(ptr %a) {
 ; CHECK-NEXT:    umin z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    umin v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    umin v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -870,6 +1345,16 @@ define void @umin_v16i16(ptr %a) {
 ; CHECK-NEXT:    umin z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    umin v1.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    umin v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -886,6 +1371,16 @@ define void @umin_v8i32(ptr %a) {
 ; CHECK-NEXT:    umin z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    umin v1.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    umin v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -902,6 +1397,18 @@ define void @umin_v4i64(ptr %a) {
 ; CHECK-NEXT:    umin z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    cmhi v3.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    cmhi v4.2d, v0.2d, v2.2d
+; NONEON-NOSVE-NEXT:    bif v1.16b, v0.16b, v3.16b
+; NONEON-NOSVE-NEXT:    bit v0.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -922,6 +1429,15 @@ define void @xor_v32i8(ptr %a) {
 ; CHECK-NEXT:    eor z1.b, z1.b, #0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.16b, #7
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -938,6 +1454,16 @@ define void @xor_v16i16(ptr %a) {
 ; CHECK-NEXT:    eor z1.h, z1.h, #0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -954,6 +1480,16 @@ define void @xor_v8i32(ptr %a) {
 ; CHECK-NEXT:    eor z1.s, z1.s, #0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -970,6 +1506,16 @@ define void @xor_v4i64(ptr %a) {
 ; CHECK-NEXT:    eor z1.d, z1.d, #0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    eor v1.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    eor v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
index 4d70c1dd1c91..f0b39b275614 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -16,6 +17,11 @@ define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = and <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -28,6 +34,11 @@ define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = and <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -41,6 +52,15 @@ define void @and_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = and <32 x i8> %op1, %op2
@@ -56,6 +76,11 @@ define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = and <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -68,6 +93,11 @@ define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = and <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -81,6 +111,15 @@ define void @and_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = and <16 x i16> %op1, %op2
@@ -96,6 +135,11 @@ define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = and <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -108,6 +152,11 @@ define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = and <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -121,6 +170,15 @@ define void @and_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = and <8 x i32> %op1, %op2
@@ -136,6 +194,11 @@ define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = and <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -148,6 +211,11 @@ define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = and <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -161,6 +229,15 @@ define void @and_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    and v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = and <4 x i64> %op1, %op2
@@ -180,6 +257,11 @@ define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = or <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -192,6 +274,11 @@ define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = or <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -205,6 +292,15 @@ define void @or_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = or <32 x i8> %op1, %op2
@@ -220,6 +316,11 @@ define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = or <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -232,6 +333,11 @@ define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = or <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -245,6 +351,15 @@ define void @or_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = or <16 x i16> %op1, %op2
@@ -260,6 +375,11 @@ define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = or <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -272,6 +392,11 @@ define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = or <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -285,6 +410,15 @@ define void @or_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = or <8 x i32> %op1, %op2
@@ -300,6 +434,11 @@ define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = or <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -312,6 +451,11 @@ define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    orr v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = or <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -325,6 +469,15 @@ define void @or_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    orr v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = or <4 x i64> %op1, %op2
@@ -344,6 +497,11 @@ define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -356,6 +514,11 @@ define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -369,6 +532,15 @@ define void @xor_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = xor <32 x i8> %op1, %op2
@@ -384,6 +556,11 @@ define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -396,6 +573,11 @@ define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -409,6 +591,15 @@ define void @xor_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = xor <16 x i16> %op1, %op2
@@ -424,6 +615,11 @@ define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -436,6 +632,11 @@ define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -449,6 +650,15 @@ define void @xor_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = xor <8 x i32> %op1, %op2
@@ -464,6 +674,11 @@ define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -476,6 +691,11 @@ define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    eor v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -489,6 +709,15 @@ define void @xor_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = xor <4 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
index 50cf9b73d9a7..51c404ece6cd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,11 @@ define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smax v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
@@ -30,6 +36,11 @@ define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smax v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
@@ -45,6 +56,15 @@ define void @smax_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smax z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    smax v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    smax v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -61,6 +81,11 @@ define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smax v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
@@ -74,6 +99,11 @@ define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smax v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
@@ -89,6 +119,15 @@ define void @smax_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smax z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    smax v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    smax v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -105,6 +144,11 @@ define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smax v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
@@ -118,6 +162,11 @@ define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smax v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
@@ -133,6 +182,15 @@ define void @smax_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smax z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    smax v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    smax v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -150,6 +208,12 @@ define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmgt d2, d0, d1
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
@@ -164,6 +228,12 @@ define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmgt v2.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
@@ -179,6 +249,18 @@ define void @smax_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smax z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    cmgt v4.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    cmgt v5.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
+; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
+; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -199,6 +281,11 @@ define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smin v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
@@ -212,6 +299,11 @@ define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smin v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
@@ -227,6 +319,15 @@ define void @smin_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smin z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    smin v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    smin v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -243,6 +344,11 @@ define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smin v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
@@ -256,6 +362,11 @@ define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smin v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
@@ -271,6 +382,15 @@ define void @smin_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smin z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    smin v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    smin v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -287,6 +407,11 @@ define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smin v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
@@ -300,6 +425,11 @@ define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smin v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
@@ -315,6 +445,15 @@ define void @smin_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smin z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    smin v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    smin v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -332,6 +471,12 @@ define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmgt d2, d1, d0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
@@ -346,6 +491,12 @@ define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmgt v2.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
@@ -361,6 +512,18 @@ define void @smin_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smin z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    cmgt v4.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    cmgt v5.2d, v3.2d, v2.2d
+; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
+; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
+; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -381,6 +544,11 @@ define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umax v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
@@ -394,6 +562,11 @@ define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umax v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
@@ -409,6 +582,15 @@ define void @umax_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umax z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    umax v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    umax v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -425,6 +607,11 @@ define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umax v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
@@ -438,6 +625,11 @@ define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umax v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
@@ -453,6 +645,15 @@ define void @umax_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umax z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    umax v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    umax v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -469,6 +670,11 @@ define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umax v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
@@ -482,6 +688,11 @@ define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umax v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
@@ -497,6 +708,15 @@ define void @umax_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umax z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    umax v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    umax v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -514,6 +734,12 @@ define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmhi d2, d0, d1
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
@@ -528,6 +754,12 @@ define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmhi v2.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
@@ -543,6 +775,18 @@ define void @umax_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umax z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    cmhi v4.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    cmhi v5.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
+; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
+; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -563,6 +807,11 @@ define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umin v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
@@ -576,6 +825,11 @@ define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umin v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
@@ -591,6 +845,15 @@ define void @umin_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umin z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    umin v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    umin v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -607,6 +870,11 @@ define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umin v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
@@ -620,6 +888,11 @@ define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umin v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
@@ -635,6 +908,15 @@ define void @umin_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umin z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    umin v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    umin v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -651,6 +933,11 @@ define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umin v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
@@ -664,6 +951,11 @@ define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umin v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
@@ -679,6 +971,15 @@ define void @umin_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umin z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    umin v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    umin v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -696,6 +997,12 @@ define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmhi d2, d1, d0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
@@ -710,6 +1017,12 @@ define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmhi v2.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
@@ -725,6 +1038,18 @@ define void @umin_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umin z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    cmhi v4.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    cmhi v5.2d, v3.2d, v2.2d
+; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v4.16b
+; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
+; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
index 149ad6d1e267..83714152c173 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=FA64
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=NO-FA64
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -20,6 +21,12 @@ define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
 ; NO-FA64-NEXT:    mad z0.b, p0/m, z1.b, z2.b
 ; NO-FA64-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; NO-FA64-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mla8xi8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mla v2.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = mul <8 x i8> %A, %B;
   %tmp2 = add <8 x i8> %C, %tmp1;
   ret <8 x i8> %tmp2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
index cb7fa53eac51..6e6d40e2ea04 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 ; This test only tests the legal types for a given vector width, as mulh nodes
 ; do not get generated for non-legal types.
@@ -36,6 +37,16 @@ define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; SVE2-NEXT:    lsr z0.h, z0.h, #4
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    shl v1.4h, v1.4h, #8
+; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
+; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ushr v0.4h, v0.4h, #4
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i16> undef, i16 4, i64 0
   %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer
   %1 = sext <4 x i8> %op1 to <4 x i16>
@@ -63,6 +74,12 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; SVE2-NEXT:    smulh z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smull v0.8h, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    shrn v0.8b, v0.8h, #8
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i16> undef, i16 8, i64 0
   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
   %1 = sext <8 x i8> %op1 to <8 x i16>
@@ -90,6 +107,13 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; SVE2-NEXT:    smulh z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smull2 v2.8h, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    smull v0.8h, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <16 x i8> %op1 to <16 x i16>
   %2 = sext <16 x i8> %op2 to <16 x i16>
   %mul = mul <16 x i16> %1, %2
@@ -118,6 +142,19 @@ define void @smulh_v32i8(ptr %a, ptr %b) {
 ; SVE2-NEXT:    smulh z1.b, z2.b, z3.b
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    smull2 v4.8h, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    smull v0.8h, v1.8b, v0.8b
+; NONEON-NOSVE-NEXT:    smull2 v1.8h, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    smull v2.8h, v2.8b, v3.8b
+; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v4.16b
+; NONEON-NOSVE-NEXT:    uzp2 v1.16b, v2.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %1 = sext <32 x i8> %op1 to <32 x i16>
@@ -153,6 +190,16 @@ define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; SVE2-NEXT:    lsr z0.s, z0.s, #16
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    shl v1.2s, v1.2s, #16
+; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    sshr v1.2s, v1.2s, #16
+; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i16> %op1 to <2 x i32>
   %2 = sext <2 x i16> %op2 to <2 x i32>
   %mul = mul <2 x i32> %1, %2
@@ -178,6 +225,12 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; SVE2-NEXT:    smulh z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smull v0.4s, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    shrn v0.4h, v0.4s, #16
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <4 x i16> %op1 to <4 x i32>
   %2 = sext <4 x i16> %op2 to <4 x i32>
   %mul = mul <4 x i32> %1, %2
@@ -203,6 +256,13 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; SVE2-NEXT:    smulh z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smull2 v2.4s, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    smull v0.4s, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <8 x i16> %op1 to <8 x i32>
   %2 = sext <8 x i16> %op2 to <8 x i32>
   %mul = mul <8 x i32> %1, %2
@@ -231,6 +291,19 @@ define void @smulh_v16i16(ptr %a, ptr %b) {
 ; SVE2-NEXT:    smulh z1.h, z2.h, z3.h
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    smull2 v4.4s, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    smull v0.4s, v1.4h, v0.4h
+; NONEON-NOSVE-NEXT:    smull2 v1.4s, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    smull v2.4s, v2.4h, v3.4h
+; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v4.8h
+; NONEON-NOSVE-NEXT:    uzp2 v1.8h, v2.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %1 = sext <16 x i16> %op1 to <16 x i32>
@@ -259,6 +332,12 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; SVE2-NEXT:    smulh z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smull v0.2d, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    shrn v0.2s, v0.2d, #32
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i32> %op1 to <2 x i64>
   %2 = sext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2
@@ -284,6 +363,13 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; SVE2-NEXT:    smulh z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    smull v0.2d, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <4 x i32> %op1 to <4 x i64>
   %2 = sext <4 x i32> %op2 to <4 x i64>
   %mul = mul <4 x i64> %1, %2
@@ -312,6 +398,19 @@ define void @smulh_v8i32(ptr %a, ptr %b) {
 ; SVE2-NEXT:    smulh z1.s, z2.s, z3.s
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    smull2 v4.2d, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    smull v0.2d, v1.2s, v0.2s
+; NONEON-NOSVE-NEXT:    smull2 v1.2d, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    smull v2.2d, v2.2s, v3.2s
+; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v4.4s
+; NONEON-NOSVE-NEXT:    uzp2 v1.4s, v2.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %1 = sext <8 x i32> %op1 to <8 x i64>
@@ -340,6 +439,16 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; SVE2-NEXT:    smulh z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    fmov x9, d1
+; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x i128> undef, i128 64, i128 0
   %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
   %1 = sext <1 x i64> %op1 to <1 x i128>
@@ -367,6 +476,19 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; SVE2-NEXT:    smulh z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
+; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
+; NONEON-NOSVE-NEXT:    fmov x10, d0
+; NONEON-NOSVE-NEXT:    fmov x11, d1
+; NONEON-NOSVE-NEXT:    smulh x10, x10, x11
+; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
+; NONEON-NOSVE-NEXT:    fmov d0, x10
+; NONEON-NOSVE-NEXT:    fmov d1, x8
+; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i64> %op1 to <2 x i128>
   %2 = sext <2 x i64> %op2 to <2 x i128>
   %mul = mul <2 x i128> %1, %2
@@ -395,6 +517,31 @@ define void @smulh_v4i64(ptr %a, ptr %b) {
 ; SVE2-NEXT:    smulh z1.d, z2.d, z3.d
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    mov x11, v0.d[1]
+; NONEON-NOSVE-NEXT:    mov x14, v3.d[1]
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    mov x10, v1.d[1]
+; NONEON-NOSVE-NEXT:    mov x13, v2.d[1]
+; NONEON-NOSVE-NEXT:    fmov x12, d3
+; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
+; NONEON-NOSVE-NEXT:    fmov x9, d2
+; NONEON-NOSVE-NEXT:    smulh x10, x10, x11
+; NONEON-NOSVE-NEXT:    smulh x9, x9, x12
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    smulh x11, x13, x14
+; NONEON-NOSVE-NEXT:    fmov d1, x10
+; NONEON-NOSVE-NEXT:    fmov d2, x9
+; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    fmov d3, x11
+; NONEON-NOSVE-NEXT:    mov v2.d[1], v3.d[0]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %1 = sext <4 x i64> %op1 to <4 x i128>
@@ -433,6 +580,15 @@ define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; SVE2-NEXT:    lsr z0.h, z0.h, #4
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
+; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    mul v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ushr v0.4h, v0.4h, #4
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i8> %op1 to <4 x i16>
   %2 = zext <4 x i8> %op2 to <4 x i16>
   %mul = mul <4 x i16> %1, %2
@@ -458,6 +614,12 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; SVE2-NEXT:    umulh z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umull v0.8h, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    shrn v0.8b, v0.8h, #8
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <8 x i8> %op1 to <8 x i16>
   %2 = zext <8 x i8> %op2 to <8 x i16>
   %mul = mul <8 x i16> %1, %2
@@ -483,6 +645,13 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; SVE2-NEXT:    umulh z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umull2 v2.8h, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    umull v0.8h, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <16 x i8> %op1 to <16 x i16>
   %2 = zext <16 x i8> %op2 to <16 x i16>
   %mul = mul <16 x i16> %1, %2
@@ -511,6 +680,19 @@ define void @umulh_v32i8(ptr %a, ptr %b) {
 ; SVE2-NEXT:    umulh z1.b, z2.b, z3.b
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    umull2 v4.8h, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    umull v0.8h, v1.8b, v0.8b
+; NONEON-NOSVE-NEXT:    umull2 v1.8h, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    umull v2.8h, v2.8b, v3.8b
+; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v0.16b, v4.16b
+; NONEON-NOSVE-NEXT:    uzp2 v1.16b, v2.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %1 = zext <32 x i8> %op1 to <32 x i16>
@@ -545,6 +727,15 @@ define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; SVE2-NEXT:    lsr z0.s, z0.s, #16
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
+; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    mul v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i16> %op1 to <2 x i32>
   %2 = zext <2 x i16> %op2 to <2 x i32>
   %mul = mul <2 x i32> %1, %2
@@ -570,6 +761,12 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; SVE2-NEXT:    umulh z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umull v0.4s, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    shrn v0.4h, v0.4s, #16
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i16> %op1 to <4 x i32>
   %2 = zext <4 x i16> %op2 to <4 x i32>
   %mul = mul <4 x i32> %1, %2
@@ -595,6 +792,13 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; SVE2-NEXT:    umulh z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    umull v0.4s, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <8 x i16> %op1 to <8 x i32>
   %2 = zext <8 x i16> %op2 to <8 x i32>
   %mul = mul <8 x i32> %1, %2
@@ -623,6 +827,19 @@ define void @umulh_v16i16(ptr %a, ptr %b) {
 ; SVE2-NEXT:    umulh z1.h, z2.h, z3.h
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    umull2 v4.4s, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    umull v0.4s, v1.4h, v0.4h
+; NONEON-NOSVE-NEXT:    umull2 v1.4s, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    umull v2.4s, v2.4h, v3.4h
+; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v4.8h
+; NONEON-NOSVE-NEXT:    uzp2 v1.8h, v2.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %1 = zext <16 x i16> %op1 to <16 x i32>
@@ -651,6 +868,12 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; SVE2-NEXT:    umulh z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umull v0.2d, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    shrn v0.2s, v0.2d, #32
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i32> %op1 to <2 x i64>
   %2 = zext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2
@@ -676,6 +899,13 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; SVE2-NEXT:    umulh z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    umull v0.2d, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i32> %op1 to <4 x i64>
   %2 = zext <4 x i32> %op2 to <4 x i64>
   %mul = mul <4 x i64> %1, %2
@@ -704,6 +934,19 @@ define void @umulh_v8i32(ptr %a, ptr %b) {
 ; SVE2-NEXT:    umulh z1.s, z2.s, z3.s
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    umull2 v4.2d, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    umull v0.2d, v1.2s, v0.2s
+; NONEON-NOSVE-NEXT:    umull2 v1.2d, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    umull v2.2d, v2.2s, v3.2s
+; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v0.4s, v4.4s
+; NONEON-NOSVE-NEXT:    uzp2 v1.4s, v2.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %insert = insertelement <8 x i64> undef, i64 32, i64 0
@@ -734,6 +977,16 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; SVE2-NEXT:    umulh z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    fmov x9, d1
+; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <1 x i64> %op1 to <1 x i128>
   %2 = zext <1 x i64> %op2 to <1 x i128>
   %mul = mul <1 x i128> %1, %2
@@ -759,6 +1012,19 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; SVE2-NEXT:    umulh z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
+; NONEON-NOSVE-NEXT:    mov x9, v1.d[1]
+; NONEON-NOSVE-NEXT:    fmov x10, d0
+; NONEON-NOSVE-NEXT:    fmov x11, d1
+; NONEON-NOSVE-NEXT:    umulh x10, x10, x11
+; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
+; NONEON-NOSVE-NEXT:    fmov d0, x10
+; NONEON-NOSVE-NEXT:    fmov d1, x8
+; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i64> %op1 to <2 x i128>
   %2 = zext <2 x i64> %op2 to <2 x i128>
   %mul = mul <2 x i128> %1, %2
@@ -787,6 +1053,31 @@ define void @umulh_v4i64(ptr %a, ptr %b) {
 ; SVE2-NEXT:    umulh z1.d, z2.d, z3.d
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    mov x11, v0.d[1]
+; NONEON-NOSVE-NEXT:    mov x14, v3.d[1]
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    mov x10, v1.d[1]
+; NONEON-NOSVE-NEXT:    mov x13, v2.d[1]
+; NONEON-NOSVE-NEXT:    fmov x12, d3
+; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
+; NONEON-NOSVE-NEXT:    fmov x9, d2
+; NONEON-NOSVE-NEXT:    umulh x10, x10, x11
+; NONEON-NOSVE-NEXT:    umulh x9, x9, x12
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    umulh x11, x13, x14
+; NONEON-NOSVE-NEXT:    fmov d1, x10
+; NONEON-NOSVE-NEXT:    fmov d2, x9
+; NONEON-NOSVE-NEXT:    mov v0.d[1], v1.d[0]
+; NONEON-NOSVE-NEXT:    fmov d3, x11
+; NONEON-NOSVE-NEXT:    mov v2.d[1], v3.d[0]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %1 = zext <4 x i64> %op1 to <4 x i128>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
index 751f43768a51..50eaa6c12d71 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,12 @@ define i8 @uaddv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    addv b0, v0.8b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -30,6 +37,12 @@ define i8 @uaddv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    addv b0, v0.16b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -44,6 +57,14 @@ define i8 @uaddv_v32i8(ptr %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    addv b0, v0.16b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -58,6 +79,12 @@ define i16 @uaddv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    addv h0, v0.4h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -71,6 +98,12 @@ define i16 @uaddv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    addv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -85,6 +118,14 @@ define i16 @uaddv_v16i16(ptr %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    addv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -99,6 +140,12 @@ define i32 @uaddv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    addp v0.2s, v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -112,6 +159,12 @@ define i32 @uaddv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    addv s0, v0.4s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -126,6 +179,14 @@ define i32 @uaddv_v8i32(ptr %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    addv s0, v0.4s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -139,6 +200,12 @@ define i64 @uaddv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    uaddv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    addp d0, v0.2d
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -152,6 +219,14 @@ define i64 @uaddv_v4i64(ptr %a) {
 ; CHECK-NEXT:    uaddv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    addp d0, v0.2d
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -169,6 +244,12 @@ define i8 @smaxv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    smaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smaxv b0, v0.8b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -181,6 +262,12 @@ define i8 @smaxv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    smaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smaxv b0, v0.16b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -194,6 +281,14 @@ define i8 @smaxv_v32i8(ptr %a) {
 ; CHECK-NEXT:    smaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    smax v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    smaxv b0, v0.16b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -207,6 +302,12 @@ define i16 @smaxv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    smaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smaxv h0, v0.4h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -219,6 +320,12 @@ define i16 @smaxv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    smaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smaxv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -232,6 +339,14 @@ define i16 @smaxv_v16i16(ptr %a) {
 ; CHECK-NEXT:    smaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    smax v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    smaxv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -245,6 +360,12 @@ define i32 @smaxv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    smaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smaxp v0.2s, v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -257,6 +378,12 @@ define i32 @smaxv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    smaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smaxv s0, v0.4s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -270,6 +397,14 @@ define i32 @smaxv_v8i32(ptr %a) {
 ; CHECK-NEXT:    smaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    smax v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    smaxv s0, v0.4s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -284,6 +419,17 @@ define i64 @smaxv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    smaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmgt d2, d0, d1
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -297,6 +443,20 @@ define i64 @smaxv_v4i64(ptr %a) {
 ; CHECK-NEXT:    smaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    cmgt v2.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmgt d2, d0, d1
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -314,6 +474,12 @@ define i8 @sminv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    sminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sminv b0, v0.8b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -326,6 +492,12 @@ define i8 @sminv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    sminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sminv b0, v0.16b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -339,6 +511,14 @@ define i8 @sminv_v32i8(ptr %a) {
 ; CHECK-NEXT:    sminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    smin v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    sminv b0, v0.16b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -352,6 +532,12 @@ define i16 @sminv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    sminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sminv h0, v0.4h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -364,6 +550,12 @@ define i16 @sminv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    sminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sminv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -377,6 +569,14 @@ define i16 @sminv_v16i16(ptr %a) {
 ; CHECK-NEXT:    sminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    smin v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    sminv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -390,6 +590,12 @@ define i32 @sminv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    sminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sminp v0.2s, v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -402,6 +608,12 @@ define i32 @sminv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    sminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sminv s0, v0.4s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -415,6 +627,14 @@ define i32 @sminv_v8i32(ptr %a) {
 ; CHECK-NEXT:    sminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    smin v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sminv s0, v0.4s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -429,6 +649,17 @@ define i64 @sminv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    sminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmgt d2, d1, d0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -442,6 +673,20 @@ define i64 @sminv_v4i64(ptr %a) {
 ; CHECK-NEXT:    sminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cmgt v2.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmgt d2, d1, d0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -459,6 +704,12 @@ define i8 @umaxv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    umaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umaxv b0, v0.8b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -471,6 +722,12 @@ define i8 @umaxv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    umaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -484,6 +741,14 @@ define i8 @umaxv_v32i8(ptr %a) {
 ; CHECK-NEXT:    umaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    umax v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -497,6 +762,12 @@ define i16 @umaxv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    umaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umaxv h0, v0.4h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -509,6 +780,12 @@ define i16 @umaxv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    umaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umaxv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -522,6 +799,14 @@ define i16 @umaxv_v16i16(ptr %a) {
 ; CHECK-NEXT:    umaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    umax v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    umaxv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -535,6 +820,12 @@ define i32 @umaxv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    umaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umaxp v0.2s, v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -547,6 +838,12 @@ define i32 @umaxv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    umaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umaxv s0, v0.4s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -560,6 +857,14 @@ define i32 @umaxv_v8i32(ptr %a) {
 ; CHECK-NEXT:    umaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    umax v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    umaxv s0, v0.4s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -574,6 +879,17 @@ define i64 @umaxv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    umaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmhi d2, d0, d1
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -587,6 +903,20 @@ define i64 @umaxv_v4i64(ptr %a) {
 ; CHECK-NEXT:    umaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    cmhi v2.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    bit v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmhi d2, d0, d1
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -604,6 +934,12 @@ define i8 @uminv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    uminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    uminv b0, v0.8b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -616,6 +952,12 @@ define i8 @uminv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    uminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    uminv b0, v0.16b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -629,6 +971,14 @@ define i8 @uminv_v32i8(ptr %a) {
 ; CHECK-NEXT:    uminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    umin v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uminv b0, v0.16b
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -642,6 +992,12 @@ define i16 @uminv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    uminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    uminv h0, v0.4h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -654,6 +1010,12 @@ define i16 @uminv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    uminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    uminv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -667,6 +1029,14 @@ define i16 @uminv_v16i16(ptr %a) {
 ; CHECK-NEXT:    uminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    umin v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    uminv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -680,6 +1050,12 @@ define i32 @uminv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    uminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    uminp v0.2s, v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -692,6 +1068,12 @@ define i32 @uminv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    uminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    uminv s0, v0.4s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -705,6 +1087,14 @@ define i32 @uminv_v8i32(ptr %a) {
 ; CHECK-NEXT:    uminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    umin v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    uminv s0, v0.4s
+; NONEON-NOSVE-NEXT:    fmov w0, s0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -719,6 +1109,17 @@ define i64 @uminv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    uminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmhi d2, d1, d0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -732,6 +1133,20 @@ define i64 @uminv_v4i64(ptr %a) {
 ; CHECK-NEXT:    uminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cmhi v2.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmhi d2, d1, d0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op)
   ret i64 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index d373a9063f85..97bd76311b61 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -24,6 +25,35 @@ define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    shl v1.4h, v1.4h, #8
+; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
+; NONEON-NOSVE-NEXT:    smov w11, v1.h[0]
+; NONEON-NOSVE-NEXT:    smov w12, v0.h[0]
+; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    smov w14, v1.h[2]
+; NONEON-NOSVE-NEXT:    smov w15, v0.h[2]
+; NONEON-NOSVE-NEXT:    smov w17, v1.h[3]
+; NONEON-NOSVE-NEXT:    smov w18, v0.h[3]
+; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    fmov s0, w11
+; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w9, w18, w17
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
+; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
+; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -53,6 +83,53 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    smov w11, v1.b[0]
+; NONEON-NOSVE-NEXT:    smov w12, v0.b[0]
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
+; NONEON-NOSVE-NEXT:    smov w14, v1.b[2]
+; NONEON-NOSVE-NEXT:    smov w15, v0.b[2]
+; NONEON-NOSVE-NEXT:    smov w17, v1.b[3]
+; NONEON-NOSVE-NEXT:    smov w18, v0.b[3]
+; NONEON-NOSVE-NEXT:    smov w1, v1.b[4]
+; NONEON-NOSVE-NEXT:    smov w2, v0.b[4]
+; NONEON-NOSVE-NEXT:    smov w4, v1.b[5]
+; NONEON-NOSVE-NEXT:    smov w5, v0.b[5]
+; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    smov w13, v1.b[7]
+; NONEON-NOSVE-NEXT:    fmov s2, w11
+; NONEON-NOSVE-NEXT:    smov w11, v0.b[6]
+; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    smov w10, v1.b[6]
+; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    smov w14, v0.b[7]
+; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
+; NONEON-NOSVE-NEXT:    sdiv w3, w2, w1
+; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
+; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
+; NONEON-NOSVE-NEXT:    sdiv w9, w5, w4
+; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
+; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
+; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
+; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
+; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
+; NONEON-NOSVE-NEXT:    sdiv w9, w14, w13
+; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
+; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
+; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
+; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
+; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -102,6 +179,112 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.b, p0/m, z3.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #-80]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    smov w11, v1.b[0]
+; NONEON-NOSVE-NEXT:    smov w12, v0.b[0]
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
+; NONEON-NOSVE-NEXT:    smov w14, v1.b[2]
+; NONEON-NOSVE-NEXT:    smov w15, v0.b[2]
+; NONEON-NOSVE-NEXT:    smov w17, v1.b[3]
+; NONEON-NOSVE-NEXT:    smov w18, v0.b[3]
+; NONEON-NOSVE-NEXT:    smov w1, v1.b[4]
+; NONEON-NOSVE-NEXT:    smov w2, v0.b[4]
+; NONEON-NOSVE-NEXT:    smov w4, v1.b[5]
+; NONEON-NOSVE-NEXT:    smov w5, v0.b[5]
+; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    smov w7, v1.b[6]
+; NONEON-NOSVE-NEXT:    smov w19, v0.b[6]
+; NONEON-NOSVE-NEXT:    smov w21, v1.b[7]
+; NONEON-NOSVE-NEXT:    smov w22, v0.b[7]
+; NONEON-NOSVE-NEXT:    smov w24, v1.b[8]
+; NONEON-NOSVE-NEXT:    smov w25, v0.b[8]
+; NONEON-NOSVE-NEXT:    smov w27, v1.b[9]
+; NONEON-NOSVE-NEXT:    smov w28, v0.b[9]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    smov w13, v1.b[11]
+; NONEON-NOSVE-NEXT:    fmov s2, w11
+; NONEON-NOSVE-NEXT:    smov w11, v0.b[10]
+; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    smov w10, v1.b[10]
+; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    smov w14, v0.b[11]
+; NONEON-NOSVE-NEXT:    smov w16, v1.b[12]
+; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
+; NONEON-NOSVE-NEXT:    sdiv w3, w2, w1
+; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
+; NONEON-NOSVE-NEXT:    smov w17, v0.b[12]
+; NONEON-NOSVE-NEXT:    smov w0, v1.b[13]
+; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
+; NONEON-NOSVE-NEXT:    sdiv w6, w5, w4
+; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
+; NONEON-NOSVE-NEXT:    smov w1, v0.b[13]
+; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
+; NONEON-NOSVE-NEXT:    sdiv w20, w19, w7
+; NONEON-NOSVE-NEXT:    msub w8, w6, w4, w5
+; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
+; NONEON-NOSVE-NEXT:    sdiv w23, w22, w21
+; NONEON-NOSVE-NEXT:    msub w8, w20, w7, w19
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
+; NONEON-NOSVE-NEXT:    sdiv w26, w25, w24
+; NONEON-NOSVE-NEXT:    msub w8, w23, w21, w22
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
+; NONEON-NOSVE-NEXT:    sdiv w9, w28, w27
+; NONEON-NOSVE-NEXT:    msub w8, w26, w24, w25
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v2.b[8], w8
+; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
+; NONEON-NOSVE-NEXT:    msub w8, w9, w27, w28
+; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
+; NONEON-NOSVE-NEXT:    sdiv w15, w14, w13
+; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
+; NONEON-NOSVE-NEXT:    smov w10, v1.b[14]
+; NONEON-NOSVE-NEXT:    smov w11, v0.b[14]
+; NONEON-NOSVE-NEXT:    mov v2.b[10], w8
+; NONEON-NOSVE-NEXT:    sdiv w18, w17, w16
+; NONEON-NOSVE-NEXT:    msub w8, w15, w13, w14
+; NONEON-NOSVE-NEXT:    smov w13, v1.b[15]
+; NONEON-NOSVE-NEXT:    smov w14, v0.b[15]
+; NONEON-NOSVE-NEXT:    mov v2.b[11], w8
+; NONEON-NOSVE-NEXT:    sdiv w9, w1, w0
+; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
+; NONEON-NOSVE-NEXT:    mov v2.b[12], w8
+; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
+; NONEON-NOSVE-NEXT:    msub w8, w9, w0, w1
+; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
+; NONEON-NOSVE-NEXT:    sdiv w9, w14, w13
+; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
+; NONEON-NOSVE-NEXT:    mov v2.b[14], w8
+; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
+; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp], #80 // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -189,6 +372,279 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z2.b, p0/m, z7.b, z4.b
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #320
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #272] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #288] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #304] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 320
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0]
+; NONEON-NOSVE-NEXT:    str x0, [sp, #216] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[1]
+; NONEON-NOSVE-NEXT:    smov w4, v3.b[1]
+; NONEON-NOSVE-NEXT:    smov w1, v2.b[1]
+; NONEON-NOSVE-NEXT:    smov w7, v3.b[7]
+; NONEON-NOSVE-NEXT:    smov w5, v2.b[7]
+; NONEON-NOSVE-NEXT:    smov w6, v3.b[8]
+; NONEON-NOSVE-NEXT:    smov w3, v2.b[8]
+; NONEON-NOSVE-NEXT:    smov w22, v3.b[9]
+; NONEON-NOSVE-NEXT:    smov w20, v2.b[9]
+; NONEON-NOSVE-NEXT:    smov w13, v3.b[0]
+; NONEON-NOSVE-NEXT:    smov w17, v3.b[3]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #100] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[0]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #108] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[0]
+; NONEON-NOSVE-NEXT:    smov w14, v2.b[3]
+; NONEON-NOSVE-NEXT:    smov w15, v3.b[4]
+; NONEON-NOSVE-NEXT:    smov w12, v2.b[4]
+; NONEON-NOSVE-NEXT:    smov w2, v3.b[5]
+; NONEON-NOSVE-NEXT:    smov w18, v2.b[5]
+; NONEON-NOSVE-NEXT:    smov w0, v3.b[6]
+; NONEON-NOSVE-NEXT:    smov w16, v2.b[6]
+; NONEON-NOSVE-NEXT:    smov w21, v3.b[10]
+; NONEON-NOSVE-NEXT:    smov w19, v2.b[10]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str w10, [sp, #116] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[2]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[2]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #44] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[3]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #52] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[3]
+; NONEON-NOSVE-NEXT:    sdiv w26, w14, w17
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w11, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[4]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[4]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #60] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[5]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[5]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #96] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #68] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[6]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[6]
+; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #112] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[7]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[7]
+; NONEON-NOSVE-NEXT:    sdiv w25, w12, w15
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #132] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[8]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #140] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[9]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[9]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #148] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w9, [sp, #156] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w11, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[10]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[10]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #128] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #204] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[11]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[11]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #212] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[12]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[12]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #172] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w9, [sp, #180] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #200] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[13]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[13]
+; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #164] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    smov w11, v3.b[2]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #176] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #188] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.b[14]
+; NONEON-NOSVE-NEXT:    smov w9, v0.b[14]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #144] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w9, [sp, #152] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #184] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    smov w9, v2.b[2]
+; NONEON-NOSVE-NEXT:    sdiv w8, w1, w4
+; NONEON-NOSVE-NEXT:    str w10, [sp, #160] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    smov w10, v2.b[0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w8, w5, w7
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w8, w3, w6
+; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w8, w20, w22
+; NONEON-NOSVE-NEXT:    sdiv w24, w10, w13
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w29, w8, [sp, #40] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w8, w8, w30, w29
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fmov s4, w8
+; NONEON-NOSVE-NEXT:    sdiv w23, w9, w11
+; NONEON-NOSVE-NEXT:    msub w10, w24, w13, w10
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w13, w13, w4, w1
+; NONEON-NOSVE-NEXT:    ldr w1, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fmov s5, w10
+; NONEON-NOSVE-NEXT:    msub w1, w1, w24, w4
+; NONEON-NOSVE-NEXT:    mov v5.b[1], w13
+; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
+; NONEON-NOSVE-NEXT:    ldr w1, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w8, w23, w11, w9
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sdiv w28, w18, w2
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #52] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #272] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[2], w8
+; NONEON-NOSVE-NEXT:    msub w8, w26, w17, w14
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w9, w9, w11, w10
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    smov w10, v3.b[11]
+; NONEON-NOSVE-NEXT:    smov w11, v2.b[11]
+; NONEON-NOSVE-NEXT:    mov v4.b[2], w9
+; NONEON-NOSVE-NEXT:    mov v5.b[3], w8
+; NONEON-NOSVE-NEXT:    msub w8, w25, w15, w12
+; NONEON-NOSVE-NEXT:    ldp w13, w9, [sp, #76] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sdiv w27, w16, w0
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w13
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[4], w8
+; NONEON-NOSVE-NEXT:    msub w8, w28, w2, w18
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #156] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.b[3], w9
+; NONEON-NOSVE-NEXT:    ldp w12, w9, [sp, #64] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[5], w8
+; NONEON-NOSVE-NEXT:    msub w8, w27, w0, w16
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sdiv w4, w19, w21
+; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w12
+; NONEON-NOSVE-NEXT:    smov w12, v3.b[12]
+; NONEON-NOSVE-NEXT:    smov w14, v2.b[12]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[6], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.b[4], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w8, w8, w7, w5
+; NONEON-NOSVE-NEXT:    ldr w5, [sp, #204] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w15
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[7], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sdiv w13, w11, w10
+; NONEON-NOSVE-NEXT:    mov v4.b[5], w9
+; NONEON-NOSVE-NEXT:    ldp w16, w9, [sp, #88] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w8, w8, w6, w3
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #148] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w16
+; NONEON-NOSVE-NEXT:    smov w16, v3.b[13]
+; NONEON-NOSVE-NEXT:    smov w17, v2.b[13]
+; NONEON-NOSVE-NEXT:    mov v5.b[8], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.b[6], w9
+; NONEON-NOSVE-NEXT:    msub w8, w8, w22, w20
+; NONEON-NOSVE-NEXT:    sdiv w15, w14, w12
+; NONEON-NOSVE-NEXT:    ldp w18, w9, [sp, #136] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[9], w8
+; NONEON-NOSVE-NEXT:    msub w8, w4, w21, w19
+; NONEON-NOSVE-NEXT:    msub w9, w9, w0, w18
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #304] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #288] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.b[7], w9
+; NONEON-NOSVE-NEXT:    mov v5.b[10], w8
+; NONEON-NOSVE-NEXT:    msub w8, w13, w10, w11
+; NONEON-NOSVE-NEXT:    ldp w0, w9, [sp, #124] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #196] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #192] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sdiv w18, w17, w16
+; NONEON-NOSVE-NEXT:    msub w9, w9, w1, w0
+; NONEON-NOSVE-NEXT:    mov v5.b[11], w8
+; NONEON-NOSVE-NEXT:    smov w0, v3.b[14]
+; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w11
+; NONEON-NOSVE-NEXT:    smov w1, v2.b[14]
+; NONEON-NOSVE-NEXT:    msub w8, w15, w12, w14
+; NONEON-NOSVE-NEXT:    mov v4.b[8], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #164] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp w15, w13, [sp, #168] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w9, w9, w3, w2
+; NONEON-NOSVE-NEXT:    mov v5.b[12], w8
+; NONEON-NOSVE-NEXT:    ldp w4, w3, [sp, #208] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp w14, w12, [sp, #176] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.b[9], w9
+; NONEON-NOSVE-NEXT:    sdiv w2, w1, w0
+; NONEON-NOSVE-NEXT:    smov w9, v3.b[15]
+; NONEON-NOSVE-NEXT:    msub w3, w3, w5, w4
+; NONEON-NOSVE-NEXT:    smov w4, v2.b[15]
+; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #144] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.b[10], w3
+; NONEON-NOSVE-NEXT:    mov v5.b[13], w8
+; NONEON-NOSVE-NEXT:    mov v4.b[11], w10
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #188] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sdiv w11, w4, w9
+; NONEON-NOSVE-NEXT:    msub w8, w2, w0, w1
+; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w12
+; NONEON-NOSVE-NEXT:    smov w12, v1.b[15]
+; NONEON-NOSVE-NEXT:    smov w13, v0.b[15]
+; NONEON-NOSVE-NEXT:    mov v5.b[14], w8
+; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #184] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w10, w10, w15, w14
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #152] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sdiv w14, w13, w12
+; NONEON-NOSVE-NEXT:    msub w8, w11, w9, w4
+; NONEON-NOSVE-NEXT:    mov v4.b[13], w10
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #160] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[15], w8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #216] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w10, w10, w16, w15
+; NONEON-NOSVE-NEXT:    mov v4.b[14], w10
+; NONEON-NOSVE-NEXT:    msub w9, w14, w12, w13
+; NONEON-NOSVE-NEXT:    mov v4.b[15], w9
+; NONEON-NOSVE-NEXT:    stp q5, q4, [x8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #320
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = srem <32 x i8> %op1, %op2
@@ -210,6 +666,33 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    smov w11, v1.h[0]
+; NONEON-NOSVE-NEXT:    smov w12, v0.h[0]
+; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    smov w14, v1.h[2]
+; NONEON-NOSVE-NEXT:    smov w15, v0.h[2]
+; NONEON-NOSVE-NEXT:    smov w17, v1.h[3]
+; NONEON-NOSVE-NEXT:    smov w18, v0.h[3]
+; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    fmov s0, w11
+; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w9, w18, w17
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
+; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
+; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -238,6 +721,51 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    smov w11, v1.h[0]
+; NONEON-NOSVE-NEXT:    smov w12, v0.h[0]
+; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    smov w14, v1.h[2]
+; NONEON-NOSVE-NEXT:    smov w15, v0.h[2]
+; NONEON-NOSVE-NEXT:    smov w17, v1.h[3]
+; NONEON-NOSVE-NEXT:    smov w18, v0.h[3]
+; NONEON-NOSVE-NEXT:    smov w1, v1.h[4]
+; NONEON-NOSVE-NEXT:    smov w2, v0.h[4]
+; NONEON-NOSVE-NEXT:    smov w4, v1.h[5]
+; NONEON-NOSVE-NEXT:    smov w5, v0.h[5]
+; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    smov w13, v1.h[7]
+; NONEON-NOSVE-NEXT:    fmov s2, w11
+; NONEON-NOSVE-NEXT:    smov w11, v0.h[6]
+; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    smov w10, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    smov w14, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    sdiv w3, w2, w1
+; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    sdiv w9, w5, w4
+; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    sdiv w12, w11, w10
+; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    sdiv w9, w14, w13
+; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -282,6 +810,139 @@ define void @srem_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z7.h, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #96] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #112] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    smov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    smov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    smov w20, v1.h[0]
+; NONEON-NOSVE-NEXT:    smov w21, v0.h[0]
+; NONEON-NOSVE-NEXT:    smov w19, v0.h[3]
+; NONEON-NOSVE-NEXT:    smov w5, v1.h[4]
+; NONEON-NOSVE-NEXT:    smov w2, v0.h[4]
+; NONEON-NOSVE-NEXT:    smov w1, v3.h[1]
+; NONEON-NOSVE-NEXT:    smov w23, v2.h[1]
+; NONEON-NOSVE-NEXT:    smov w25, v3.h[0]
+; NONEON-NOSVE-NEXT:    smov w26, v2.h[0]
+; NONEON-NOSVE-NEXT:    smov w6, v1.h[5]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    smov w8, v1.h[2]
+; NONEON-NOSVE-NEXT:    smov w9, v0.h[2]
+; NONEON-NOSVE-NEXT:    smov w3, v0.h[5]
+; NONEON-NOSVE-NEXT:    smov w4, v1.h[6]
+; NONEON-NOSVE-NEXT:    smov w7, v0.h[6]
+; NONEON-NOSVE-NEXT:    smov w28, v3.h[2]
+; NONEON-NOSVE-NEXT:    smov w29, v2.h[2]
+; NONEON-NOSVE-NEXT:    smov w15, v3.h[3]
+; NONEON-NOSVE-NEXT:    smov w13, v2.h[3]
+; NONEON-NOSVE-NEXT:    smov w12, v3.h[4]
+; NONEON-NOSVE-NEXT:    smov w14, v3.h[5]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w11, w21, w20
+; NONEON-NOSVE-NEXT:    str w10, [sp, #44] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    smov w8, v1.h[3]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    smov w11, v2.h[4]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #4] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w20, w22, w20, w21
+; NONEON-NOSVE-NEXT:    sdiv w9, w19, w8
+; NONEON-NOSVE-NEXT:    str w10, [sp, #32] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    smov w10, v3.h[6]
+; NONEON-NOSVE-NEXT:    fmov s5, w20
+; NONEON-NOSVE-NEXT:    smov w20, v3.h[7]
+; NONEON-NOSVE-NEXT:    sdiv w8, w2, w5
+; NONEON-NOSVE-NEXT:    sdiv w24, w23, w1
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sdiv w27, w26, w25
+; NONEON-NOSVE-NEXT:    msub w1, w24, w1, w23
+; NONEON-NOSVE-NEXT:    ldp w24, w23, [sp, #40] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sdiv w9, w3, w6
+; NONEON-NOSVE-NEXT:    msub w21, w27, w25, w26
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w23, w23, w25, w24
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fmov s4, w21
+; NONEON-NOSVE-NEXT:    mov v5.h[1], w23
+; NONEON-NOSVE-NEXT:    ldp w23, w21, [sp, #28] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.h[1], w1
+; NONEON-NOSVE-NEXT:    sdiv w8, w7, w4
+; NONEON-NOSVE-NEXT:    msub w21, w21, w25, w23
+; NONEON-NOSVE-NEXT:    smov w23, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.h[2], w21
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #112] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    sdiv w30, w29, w28
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    smov w9, v2.h[5]
+; NONEON-NOSVE-NEXT:    smov w8, v2.h[6]
+; NONEON-NOSVE-NEXT:    sdiv w18, w13, w15
+; NONEON-NOSVE-NEXT:    msub w1, w30, w28, w29
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.h[2], w1
+; NONEON-NOSVE-NEXT:    sdiv w16, w11, w12
+; NONEON-NOSVE-NEXT:    msub w13, w18, w15, w13
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w18, [sp] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w15, w15, w18, w19
+; NONEON-NOSVE-NEXT:    mov v4.h[3], w13
+; NONEON-NOSVE-NEXT:    smov w13, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov v5.h[3], w15
+; NONEON-NOSVE-NEXT:    smov w15, v0.h[7]
+; NONEON-NOSVE-NEXT:    sdiv w17, w9, w14
+; NONEON-NOSVE-NEXT:    msub w11, w16, w12, w11
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #16] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w12, w12, w5, w2
+; NONEON-NOSVE-NEXT:    mov v4.h[4], w11
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
+; NONEON-NOSVE-NEXT:    msub w11, w11, w6, w3
+; NONEON-NOSVE-NEXT:    sdiv w24, w8, w10
+; NONEON-NOSVE-NEXT:    msub w9, w17, w14, w9
+; NONEON-NOSVE-NEXT:    mov v5.h[5], w11
+; NONEON-NOSVE-NEXT:    mov v4.h[5], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w9, w9, w4, w7
+; NONEON-NOSVE-NEXT:    sdiv w18, w23, w20
+; NONEON-NOSVE-NEXT:    msub w8, w24, w10, w8
+; NONEON-NOSVE-NEXT:    mov v5.h[6], w9
+; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
+; NONEON-NOSVE-NEXT:    sdiv w12, w15, w13
+; NONEON-NOSVE-NEXT:    msub w8, w18, w20, w23
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #96] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.h[7], w8
+; NONEON-NOSVE-NEXT:    msub w9, w12, w13, w15
+; NONEON-NOSVE-NEXT:    mov v5.h[7], w9
+; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = srem <16 x i16> %op1, %op2
@@ -300,6 +961,23 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    fmov w9, s0
+; NONEON-NOSVE-NEXT:    mov w11, v1.s[1]
+; NONEON-NOSVE-NEXT:    mov w12, v0.s[1]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -315,6 +993,30 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov w11, s1
+; NONEON-NOSVE-NEXT:    fmov w12, s0
+; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
+; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
+; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
+; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
+; NONEON-NOSVE-NEXT:    mov w17, v1.s[3]
+; NONEON-NOSVE-NEXT:    mov w18, v0.s[3]
+; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    fmov s0, w11
+; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w9, w18, w17
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    mov v0.s[2], w8
+; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
+; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -334,6 +1036,65 @@ define void @srem_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z1.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -48
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    fmov w12, s0
+; NONEON-NOSVE-NEXT:    fmov w3, s2
+; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
+; NONEON-NOSVE-NEXT:    fmov w11, s1
+; NONEON-NOSVE-NEXT:    fmov w2, s3
+; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
+; NONEON-NOSVE-NEXT:    mov w17, v3.s[1]
+; NONEON-NOSVE-NEXT:    mov w18, v2.s[1]
+; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
+; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
+; NONEON-NOSVE-NEXT:    mov w5, v3.s[2]
+; NONEON-NOSVE-NEXT:    mov w6, v2.s[2]
+; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    mov w19, v3.s[3]
+; NONEON-NOSVE-NEXT:    mov w20, v2.s[3]
+; NONEON-NOSVE-NEXT:    mov w22, v1.s[3]
+; NONEON-NOSVE-NEXT:    mov w23, v0.s[3]
+; NONEON-NOSVE-NEXT:    sdiv w4, w3, w2
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    fmov s1, w11
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w12, w4, w2, w3
+; NONEON-NOSVE-NEXT:    fmov s0, w12
+; NONEON-NOSVE-NEXT:    sdiv w1, w18, w17
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
+; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w13, w1, w17, w18
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w13
+; NONEON-NOSVE-NEXT:    sdiv w7, w6, w5
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    mov v1.s[2], w8
+; NONEON-NOSVE-NEXT:    sdiv w21, w20, w19
+; NONEON-NOSVE-NEXT:    msub w10, w7, w5, w6
+; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
+; NONEON-NOSVE-NEXT:    sdiv w9, w23, w22
+; NONEON-NOSVE-NEXT:    msub w10, w21, w19, w20
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v0.s[3], w10
+; NONEON-NOSVE-NEXT:    msub w8, w9, w22, w23
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = srem <8 x i32> %op1, %op2
@@ -352,6 +1113,17 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -367,6 +1139,20 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
+; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
+; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    sdiv x13, x12, x11
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -386,6 +1172,33 @@ define void @srem_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z1.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    fmov x15, d2
+; NONEON-NOSVE-NEXT:    mov x12, v2.d[1]
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x14, d3
+; NONEON-NOSVE-NEXT:    mov x11, v3.d[1]
+; NONEON-NOSVE-NEXT:    mov x17, v1.d[1]
+; NONEON-NOSVE-NEXT:    mov x18, v0.d[1]
+; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    sdiv x16, x15, x14
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    fmov d1, x8
+; NONEON-NOSVE-NEXT:    sdiv x13, x12, x11
+; NONEON-NOSVE-NEXT:    msub x10, x16, x14, x15
+; NONEON-NOSVE-NEXT:    fmov d0, x10
+; NONEON-NOSVE-NEXT:    sdiv x1, x18, x17
+; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    msub x11, x1, x17, x18
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = srem <4 x i64> %op1, %op2
@@ -413,6 +1226,41 @@ define <4 x i8> @urem_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    umov w11, v1.h[0]
+; NONEON-NOSVE-NEXT:    umov w12, v0.h[0]
+; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    umov w14, v1.h[2]
+; NONEON-NOSVE-NEXT:    umov w15, v0.h[2]
+; NONEON-NOSVE-NEXT:    umov w17, v1.h[3]
+; NONEON-NOSVE-NEXT:    umov w18, v0.h[3]
+; NONEON-NOSVE-NEXT:    and w11, w11, #0xff
+; NONEON-NOSVE-NEXT:    and w12, w12, #0xff
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
+; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xff
+; NONEON-NOSVE-NEXT:    and w14, w14, #0xff
+; NONEON-NOSVE-NEXT:    and w15, w15, #0xff
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    and w12, w17, #0xff
+; NONEON-NOSVE-NEXT:    and w13, w18, #0xff
+; NONEON-NOSVE-NEXT:    fmov s0, w11
+; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
+; NONEON-NOSVE-NEXT:    udiv w9, w13, w12
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
+; NONEON-NOSVE-NEXT:    msub w8, w9, w12, w13
+; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -442,6 +1290,53 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    umov w11, v1.b[0]
+; NONEON-NOSVE-NEXT:    umov w12, v0.b[0]
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
+; NONEON-NOSVE-NEXT:    umov w14, v1.b[2]
+; NONEON-NOSVE-NEXT:    umov w15, v0.b[2]
+; NONEON-NOSVE-NEXT:    umov w17, v1.b[3]
+; NONEON-NOSVE-NEXT:    umov w18, v0.b[3]
+; NONEON-NOSVE-NEXT:    umov w1, v1.b[4]
+; NONEON-NOSVE-NEXT:    umov w2, v0.b[4]
+; NONEON-NOSVE-NEXT:    umov w4, v1.b[5]
+; NONEON-NOSVE-NEXT:    umov w5, v0.b[5]
+; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    umov w13, v1.b[7]
+; NONEON-NOSVE-NEXT:    fmov s2, w11
+; NONEON-NOSVE-NEXT:    umov w11, v0.b[6]
+; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    umov w10, v1.b[6]
+; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
+; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    umov w14, v0.b[7]
+; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
+; NONEON-NOSVE-NEXT:    udiv w3, w2, w1
+; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
+; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
+; NONEON-NOSVE-NEXT:    udiv w9, w5, w4
+; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
+; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
+; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
+; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
+; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
+; NONEON-NOSVE-NEXT:    udiv w9, w14, w13
+; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
+; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
+; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
+; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
+; NONEON-NOSVE-NEXT:    fmov d0, d2
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -491,6 +1386,112 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.b, p0/m, z3.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #-80]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    umov w11, v1.b[0]
+; NONEON-NOSVE-NEXT:    umov w12, v0.b[0]
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
+; NONEON-NOSVE-NEXT:    umov w14, v1.b[2]
+; NONEON-NOSVE-NEXT:    umov w15, v0.b[2]
+; NONEON-NOSVE-NEXT:    umov w17, v1.b[3]
+; NONEON-NOSVE-NEXT:    umov w18, v0.b[3]
+; NONEON-NOSVE-NEXT:    umov w1, v1.b[4]
+; NONEON-NOSVE-NEXT:    umov w2, v0.b[4]
+; NONEON-NOSVE-NEXT:    umov w4, v1.b[5]
+; NONEON-NOSVE-NEXT:    umov w5, v0.b[5]
+; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    umov w7, v1.b[6]
+; NONEON-NOSVE-NEXT:    umov w19, v0.b[6]
+; NONEON-NOSVE-NEXT:    umov w21, v1.b[7]
+; NONEON-NOSVE-NEXT:    umov w22, v0.b[7]
+; NONEON-NOSVE-NEXT:    umov w24, v1.b[8]
+; NONEON-NOSVE-NEXT:    umov w25, v0.b[8]
+; NONEON-NOSVE-NEXT:    umov w27, v1.b[9]
+; NONEON-NOSVE-NEXT:    umov w28, v0.b[9]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    umov w13, v1.b[11]
+; NONEON-NOSVE-NEXT:    fmov s2, w11
+; NONEON-NOSVE-NEXT:    umov w11, v0.b[10]
+; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    umov w10, v1.b[10]
+; NONEON-NOSVE-NEXT:    mov v2.b[1], w8
+; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    umov w14, v0.b[11]
+; NONEON-NOSVE-NEXT:    umov w16, v1.b[12]
+; NONEON-NOSVE-NEXT:    mov v2.b[2], w8
+; NONEON-NOSVE-NEXT:    udiv w3, w2, w1
+; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
+; NONEON-NOSVE-NEXT:    umov w17, v0.b[12]
+; NONEON-NOSVE-NEXT:    umov w0, v1.b[13]
+; NONEON-NOSVE-NEXT:    mov v2.b[3], w8
+; NONEON-NOSVE-NEXT:    udiv w6, w5, w4
+; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
+; NONEON-NOSVE-NEXT:    umov w1, v0.b[13]
+; NONEON-NOSVE-NEXT:    mov v2.b[4], w8
+; NONEON-NOSVE-NEXT:    udiv w20, w19, w7
+; NONEON-NOSVE-NEXT:    msub w8, w6, w4, w5
+; NONEON-NOSVE-NEXT:    mov v2.b[5], w8
+; NONEON-NOSVE-NEXT:    udiv w23, w22, w21
+; NONEON-NOSVE-NEXT:    msub w8, w20, w7, w19
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v2.b[6], w8
+; NONEON-NOSVE-NEXT:    udiv w26, w25, w24
+; NONEON-NOSVE-NEXT:    msub w8, w23, w21, w22
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v2.b[7], w8
+; NONEON-NOSVE-NEXT:    udiv w9, w28, w27
+; NONEON-NOSVE-NEXT:    msub w8, w26, w24, w25
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v2.b[8], w8
+; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
+; NONEON-NOSVE-NEXT:    msub w8, w9, w27, w28
+; NONEON-NOSVE-NEXT:    mov v2.b[9], w8
+; NONEON-NOSVE-NEXT:    udiv w15, w14, w13
+; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
+; NONEON-NOSVE-NEXT:    umov w10, v1.b[14]
+; NONEON-NOSVE-NEXT:    umov w11, v0.b[14]
+; NONEON-NOSVE-NEXT:    mov v2.b[10], w8
+; NONEON-NOSVE-NEXT:    udiv w18, w17, w16
+; NONEON-NOSVE-NEXT:    msub w8, w15, w13, w14
+; NONEON-NOSVE-NEXT:    umov w13, v1.b[15]
+; NONEON-NOSVE-NEXT:    umov w14, v0.b[15]
+; NONEON-NOSVE-NEXT:    mov v2.b[11], w8
+; NONEON-NOSVE-NEXT:    udiv w9, w1, w0
+; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
+; NONEON-NOSVE-NEXT:    mov v2.b[12], w8
+; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
+; NONEON-NOSVE-NEXT:    msub w8, w9, w0, w1
+; NONEON-NOSVE-NEXT:    mov v2.b[13], w8
+; NONEON-NOSVE-NEXT:    udiv w9, w14, w13
+; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
+; NONEON-NOSVE-NEXT:    mov v2.b[14], w8
+; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
+; NONEON-NOSVE-NEXT:    mov v2.b[15], w8
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp], #80 // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -578,6 +1579,279 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z2.b, p0/m, z7.b, z4.b
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #320
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #272] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #288] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #304] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 320
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0]
+; NONEON-NOSVE-NEXT:    str x0, [sp, #216] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[1]
+; NONEON-NOSVE-NEXT:    umov w4, v3.b[1]
+; NONEON-NOSVE-NEXT:    umov w1, v2.b[1]
+; NONEON-NOSVE-NEXT:    umov w7, v3.b[7]
+; NONEON-NOSVE-NEXT:    umov w5, v2.b[7]
+; NONEON-NOSVE-NEXT:    umov w6, v3.b[8]
+; NONEON-NOSVE-NEXT:    umov w3, v2.b[8]
+; NONEON-NOSVE-NEXT:    umov w22, v3.b[9]
+; NONEON-NOSVE-NEXT:    umov w20, v2.b[9]
+; NONEON-NOSVE-NEXT:    umov w13, v3.b[0]
+; NONEON-NOSVE-NEXT:    umov w17, v3.b[3]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #100] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[0]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #108] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[0]
+; NONEON-NOSVE-NEXT:    umov w14, v2.b[3]
+; NONEON-NOSVE-NEXT:    umov w15, v3.b[4]
+; NONEON-NOSVE-NEXT:    umov w12, v2.b[4]
+; NONEON-NOSVE-NEXT:    umov w2, v3.b[5]
+; NONEON-NOSVE-NEXT:    umov w18, v2.b[5]
+; NONEON-NOSVE-NEXT:    umov w0, v3.b[6]
+; NONEON-NOSVE-NEXT:    umov w16, v2.b[6]
+; NONEON-NOSVE-NEXT:    umov w21, v3.b[10]
+; NONEON-NOSVE-NEXT:    umov w19, v2.b[10]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str w10, [sp, #116] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[2]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[2]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #44] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[3]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #52] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[3]
+; NONEON-NOSVE-NEXT:    udiv w26, w14, w17
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w11, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[4]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[4]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #60] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[5]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[5]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #96] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #68] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[6]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[6]
+; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #112] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[7]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[7]
+; NONEON-NOSVE-NEXT:    udiv w25, w12, w15
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #132] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[8]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #140] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[9]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[9]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #148] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w9, [sp, #156] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w11, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[10]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[10]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #128] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #204] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[11]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[11]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #212] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[12]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[12]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #172] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w9, [sp, #180] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #200] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[13]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[13]
+; NONEON-NOSVE-NEXT:    stp w11, w8, [sp, #164] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    umov w11, v3.b[2]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #176] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #188] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.b[14]
+; NONEON-NOSVE-NEXT:    umov w9, v0.b[14]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #144] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w9, [sp, #152] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    str w10, [sp, #184] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    umov w9, v2.b[2]
+; NONEON-NOSVE-NEXT:    udiv w8, w1, w4
+; NONEON-NOSVE-NEXT:    str w10, [sp, #160] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    umov w10, v2.b[0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w8, w5, w7
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w8, w3, w6
+; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w8, w20, w22
+; NONEON-NOSVE-NEXT:    udiv w24, w10, w13
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w29, w8, [sp, #40] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w8, w8, w30, w29
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fmov s4, w8
+; NONEON-NOSVE-NEXT:    udiv w23, w9, w11
+; NONEON-NOSVE-NEXT:    msub w10, w24, w13, w10
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w13, w13, w4, w1
+; NONEON-NOSVE-NEXT:    ldr w1, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fmov s5, w10
+; NONEON-NOSVE-NEXT:    msub w1, w1, w24, w4
+; NONEON-NOSVE-NEXT:    mov v5.b[1], w13
+; NONEON-NOSVE-NEXT:    mov v4.b[1], w1
+; NONEON-NOSVE-NEXT:    ldr w1, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w8, w23, w11, w9
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    udiv w28, w18, w2
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #52] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #272] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[2], w8
+; NONEON-NOSVE-NEXT:    msub w8, w26, w17, w14
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w9, w9, w11, w10
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    umov w10, v3.b[11]
+; NONEON-NOSVE-NEXT:    umov w11, v2.b[11]
+; NONEON-NOSVE-NEXT:    mov v4.b[2], w9
+; NONEON-NOSVE-NEXT:    mov v5.b[3], w8
+; NONEON-NOSVE-NEXT:    msub w8, w25, w15, w12
+; NONEON-NOSVE-NEXT:    ldp w13, w9, [sp, #76] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    udiv w27, w16, w0
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w13
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[4], w8
+; NONEON-NOSVE-NEXT:    msub w8, w28, w2, w18
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #156] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.b[3], w9
+; NONEON-NOSVE-NEXT:    ldp w12, w9, [sp, #64] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[5], w8
+; NONEON-NOSVE-NEXT:    msub w8, w27, w0, w16
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    udiv w4, w19, w21
+; NONEON-NOSVE-NEXT:    msub w9, w9, w14, w12
+; NONEON-NOSVE-NEXT:    umov w12, v3.b[12]
+; NONEON-NOSVE-NEXT:    umov w14, v2.b[12]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[6], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.b[4], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w8, w8, w7, w5
+; NONEON-NOSVE-NEXT:    ldr w5, [sp, #204] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w15
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[7], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    udiv w13, w11, w10
+; NONEON-NOSVE-NEXT:    mov v4.b[5], w9
+; NONEON-NOSVE-NEXT:    ldp w16, w9, [sp, #88] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w8, w8, w6, w3
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #148] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w9, w9, w17, w16
+; NONEON-NOSVE-NEXT:    umov w16, v3.b[13]
+; NONEON-NOSVE-NEXT:    umov w17, v2.b[13]
+; NONEON-NOSVE-NEXT:    mov v5.b[8], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.b[6], w9
+; NONEON-NOSVE-NEXT:    msub w8, w8, w22, w20
+; NONEON-NOSVE-NEXT:    udiv w15, w14, w12
+; NONEON-NOSVE-NEXT:    ldp w18, w9, [sp, #136] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[9], w8
+; NONEON-NOSVE-NEXT:    msub w8, w4, w21, w19
+; NONEON-NOSVE-NEXT:    msub w9, w9, w0, w18
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #304] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #288] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.b[7], w9
+; NONEON-NOSVE-NEXT:    mov v5.b[10], w8
+; NONEON-NOSVE-NEXT:    msub w8, w13, w10, w11
+; NONEON-NOSVE-NEXT:    ldp w0, w9, [sp, #124] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #196] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #192] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    udiv w18, w17, w16
+; NONEON-NOSVE-NEXT:    msub w9, w9, w1, w0
+; NONEON-NOSVE-NEXT:    mov v5.b[11], w8
+; NONEON-NOSVE-NEXT:    umov w0, v3.b[14]
+; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w11
+; NONEON-NOSVE-NEXT:    umov w1, v2.b[14]
+; NONEON-NOSVE-NEXT:    msub w8, w15, w12, w14
+; NONEON-NOSVE-NEXT:    mov v4.b[8], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #164] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp w15, w13, [sp, #168] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w9, w9, w3, w2
+; NONEON-NOSVE-NEXT:    mov v5.b[12], w8
+; NONEON-NOSVE-NEXT:    ldp w4, w3, [sp, #208] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp w14, w12, [sp, #176] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.b[9], w9
+; NONEON-NOSVE-NEXT:    udiv w2, w1, w0
+; NONEON-NOSVE-NEXT:    umov w9, v3.b[15]
+; NONEON-NOSVE-NEXT:    msub w3, w3, w5, w4
+; NONEON-NOSVE-NEXT:    umov w4, v2.b[15]
+; NONEON-NOSVE-NEXT:    msub w8, w18, w16, w17
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #144] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.b[10], w3
+; NONEON-NOSVE-NEXT:    mov v5.b[13], w8
+; NONEON-NOSVE-NEXT:    mov v4.b[11], w10
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #188] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    udiv w11, w4, w9
+; NONEON-NOSVE-NEXT:    msub w8, w2, w0, w1
+; NONEON-NOSVE-NEXT:    msub w10, w10, w13, w12
+; NONEON-NOSVE-NEXT:    umov w12, v1.b[15]
+; NONEON-NOSVE-NEXT:    umov w13, v0.b[15]
+; NONEON-NOSVE-NEXT:    mov v5.b[14], w8
+; NONEON-NOSVE-NEXT:    mov v4.b[12], w10
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #184] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w10, w10, w15, w14
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #152] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    udiv w14, w13, w12
+; NONEON-NOSVE-NEXT:    msub w8, w11, w9, w4
+; NONEON-NOSVE-NEXT:    mov v4.b[13], w10
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #160] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.b[15], w8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #216] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w10, w10, w16, w15
+; NONEON-NOSVE-NEXT:    mov v4.b[14], w10
+; NONEON-NOSVE-NEXT:    msub w9, w14, w12, w13
+; NONEON-NOSVE-NEXT:    mov v4.b[15], w9
+; NONEON-NOSVE-NEXT:    stp q5, q4, [x8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #320
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = urem <32 x i8> %op1, %op2
@@ -599,6 +1873,33 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    umov w11, v1.h[0]
+; NONEON-NOSVE-NEXT:    umov w12, v0.h[0]
+; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    umov w14, v1.h[2]
+; NONEON-NOSVE-NEXT:    umov w15, v0.h[2]
+; NONEON-NOSVE-NEXT:    umov w17, v1.h[3]
+; NONEON-NOSVE-NEXT:    umov w18, v0.h[3]
+; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    fmov s0, w11
+; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    mov v0.h[1], w8
+; NONEON-NOSVE-NEXT:    udiv w9, w18, w17
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    mov v0.h[2], w8
+; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
+; NONEON-NOSVE-NEXT:    mov v0.h[3], w8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -627,6 +1928,51 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    umov w11, v1.h[0]
+; NONEON-NOSVE-NEXT:    umov w12, v0.h[0]
+; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    umov w14, v1.h[2]
+; NONEON-NOSVE-NEXT:    umov w15, v0.h[2]
+; NONEON-NOSVE-NEXT:    umov w17, v1.h[3]
+; NONEON-NOSVE-NEXT:    umov w18, v0.h[3]
+; NONEON-NOSVE-NEXT:    umov w1, v1.h[4]
+; NONEON-NOSVE-NEXT:    umov w2, v0.h[4]
+; NONEON-NOSVE-NEXT:    umov w4, v1.h[5]
+; NONEON-NOSVE-NEXT:    umov w5, v0.h[5]
+; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    umov w13, v1.h[7]
+; NONEON-NOSVE-NEXT:    fmov s2, w11
+; NONEON-NOSVE-NEXT:    umov w11, v0.h[6]
+; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    umov w10, v1.h[6]
+; NONEON-NOSVE-NEXT:    mov v2.h[1], w8
+; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    umov w14, v0.h[7]
+; NONEON-NOSVE-NEXT:    mov v2.h[2], w8
+; NONEON-NOSVE-NEXT:    udiv w3, w2, w1
+; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
+; NONEON-NOSVE-NEXT:    mov v2.h[3], w8
+; NONEON-NOSVE-NEXT:    udiv w9, w5, w4
+; NONEON-NOSVE-NEXT:    msub w8, w3, w1, w2
+; NONEON-NOSVE-NEXT:    mov v2.h[4], w8
+; NONEON-NOSVE-NEXT:    udiv w12, w11, w10
+; NONEON-NOSVE-NEXT:    msub w8, w9, w4, w5
+; NONEON-NOSVE-NEXT:    mov v2.h[5], w8
+; NONEON-NOSVE-NEXT:    udiv w9, w14, w13
+; NONEON-NOSVE-NEXT:    msub w8, w12, w10, w11
+; NONEON-NOSVE-NEXT:    mov v2.h[6], w8
+; NONEON-NOSVE-NEXT:    msub w8, w9, w13, w14
+; NONEON-NOSVE-NEXT:    mov v2.h[7], w8
+; NONEON-NOSVE-NEXT:    mov v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -671,6 +2017,139 @@ define void @urem_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z7.h, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #96] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #112] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    umov w8, v1.h[1]
+; NONEON-NOSVE-NEXT:    umov w9, v0.h[1]
+; NONEON-NOSVE-NEXT:    umov w20, v1.h[0]
+; NONEON-NOSVE-NEXT:    umov w21, v0.h[0]
+; NONEON-NOSVE-NEXT:    umov w19, v0.h[3]
+; NONEON-NOSVE-NEXT:    umov w5, v1.h[4]
+; NONEON-NOSVE-NEXT:    umov w2, v0.h[4]
+; NONEON-NOSVE-NEXT:    umov w1, v3.h[1]
+; NONEON-NOSVE-NEXT:    umov w23, v2.h[1]
+; NONEON-NOSVE-NEXT:    umov w25, v3.h[0]
+; NONEON-NOSVE-NEXT:    umov w26, v2.h[0]
+; NONEON-NOSVE-NEXT:    umov w6, v1.h[5]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #36] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    umov w8, v1.h[2]
+; NONEON-NOSVE-NEXT:    umov w9, v0.h[2]
+; NONEON-NOSVE-NEXT:    umov w3, v0.h[5]
+; NONEON-NOSVE-NEXT:    umov w4, v1.h[6]
+; NONEON-NOSVE-NEXT:    umov w7, v0.h[6]
+; NONEON-NOSVE-NEXT:    umov w28, v3.h[2]
+; NONEON-NOSVE-NEXT:    umov w29, v2.h[2]
+; NONEON-NOSVE-NEXT:    umov w15, v3.h[3]
+; NONEON-NOSVE-NEXT:    umov w13, v2.h[3]
+; NONEON-NOSVE-NEXT:    umov w12, v3.h[4]
+; NONEON-NOSVE-NEXT:    umov w14, v3.h[5]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w11, w21, w20
+; NONEON-NOSVE-NEXT:    str w10, [sp, #44] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    umov w8, v1.h[3]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    umov w11, v2.h[4]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #4] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w20, w22, w20, w21
+; NONEON-NOSVE-NEXT:    udiv w9, w19, w8
+; NONEON-NOSVE-NEXT:    str w10, [sp, #32] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    umov w10, v3.h[6]
+; NONEON-NOSVE-NEXT:    fmov s5, w20
+; NONEON-NOSVE-NEXT:    umov w20, v3.h[7]
+; NONEON-NOSVE-NEXT:    udiv w8, w2, w5
+; NONEON-NOSVE-NEXT:    udiv w24, w23, w1
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    udiv w27, w26, w25
+; NONEON-NOSVE-NEXT:    msub w1, w24, w1, w23
+; NONEON-NOSVE-NEXT:    ldp w24, w23, [sp, #40] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    udiv w9, w3, w6
+; NONEON-NOSVE-NEXT:    msub w21, w27, w25, w26
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w23, w23, w25, w24
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fmov s4, w21
+; NONEON-NOSVE-NEXT:    mov v5.h[1], w23
+; NONEON-NOSVE-NEXT:    ldp w23, w21, [sp, #28] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.h[1], w1
+; NONEON-NOSVE-NEXT:    udiv w8, w7, w4
+; NONEON-NOSVE-NEXT:    msub w21, w21, w25, w23
+; NONEON-NOSVE-NEXT:    umov w23, v2.h[7]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.h[2], w21
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #112] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    udiv w30, w29, w28
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    umov w9, v2.h[5]
+; NONEON-NOSVE-NEXT:    umov w8, v2.h[6]
+; NONEON-NOSVE-NEXT:    udiv w18, w13, w15
+; NONEON-NOSVE-NEXT:    msub w1, w30, w28, w29
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.h[2], w1
+; NONEON-NOSVE-NEXT:    udiv w16, w11, w12
+; NONEON-NOSVE-NEXT:    msub w13, w18, w15, w13
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w18, [sp] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w15, w15, w18, w19
+; NONEON-NOSVE-NEXT:    mov v4.h[3], w13
+; NONEON-NOSVE-NEXT:    umov w13, v1.h[7]
+; NONEON-NOSVE-NEXT:    mov v5.h[3], w15
+; NONEON-NOSVE-NEXT:    umov w15, v0.h[7]
+; NONEON-NOSVE-NEXT:    udiv w17, w9, w14
+; NONEON-NOSVE-NEXT:    msub w11, w16, w12, w11
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #16] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w12, w12, w5, w2
+; NONEON-NOSVE-NEXT:    mov v4.h[4], w11
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v5.h[4], w12
+; NONEON-NOSVE-NEXT:    msub w11, w11, w6, w3
+; NONEON-NOSVE-NEXT:    udiv w24, w8, w10
+; NONEON-NOSVE-NEXT:    msub w9, w17, w14, w9
+; NONEON-NOSVE-NEXT:    mov v5.h[5], w11
+; NONEON-NOSVE-NEXT:    mov v4.h[5], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    msub w9, w9, w4, w7
+; NONEON-NOSVE-NEXT:    udiv w18, w23, w20
+; NONEON-NOSVE-NEXT:    msub w8, w24, w10, w8
+; NONEON-NOSVE-NEXT:    mov v5.h[6], w9
+; NONEON-NOSVE-NEXT:    mov v4.h[6], w8
+; NONEON-NOSVE-NEXT:    udiv w12, w15, w13
+; NONEON-NOSVE-NEXT:    msub w8, w18, w20, w23
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #96] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v4.h[7], w8
+; NONEON-NOSVE-NEXT:    msub w9, w12, w13, w15
+; NONEON-NOSVE-NEXT:    mov v5.h[7], w9
+; NONEON-NOSVE-NEXT:    stp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = urem <16 x i16> %op1, %op2
@@ -689,6 +2168,23 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    fmov w9, s0
+; NONEON-NOSVE-NEXT:    mov w11, v1.s[1]
+; NONEON-NOSVE-NEXT:    mov w12, v0.s[1]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w9
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -704,6 +2200,30 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov w11, s1
+; NONEON-NOSVE-NEXT:    fmov w12, s0
+; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
+; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
+; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
+; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
+; NONEON-NOSVE-NEXT:    mov w17, v1.s[3]
+; NONEON-NOSVE-NEXT:    mov w18, v0.s[3]
+; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    fmov s0, w11
+; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w8
+; NONEON-NOSVE-NEXT:    udiv w9, w18, w17
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    mov v0.s[2], w8
+; NONEON-NOSVE-NEXT:    msub w8, w9, w17, w18
+; NONEON-NOSVE-NEXT:    mov v0.s[3], w8
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -723,6 +2243,65 @@ define void @urem_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z1.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -48
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    fmov w12, s0
+; NONEON-NOSVE-NEXT:    fmov w3, s2
+; NONEON-NOSVE-NEXT:    mov w9, v0.s[1]
+; NONEON-NOSVE-NEXT:    fmov w11, s1
+; NONEON-NOSVE-NEXT:    fmov w2, s3
+; NONEON-NOSVE-NEXT:    mov w8, v1.s[1]
+; NONEON-NOSVE-NEXT:    mov w17, v3.s[1]
+; NONEON-NOSVE-NEXT:    mov w18, v2.s[1]
+; NONEON-NOSVE-NEXT:    mov w14, v1.s[2]
+; NONEON-NOSVE-NEXT:    mov w15, v0.s[2]
+; NONEON-NOSVE-NEXT:    mov w5, v3.s[2]
+; NONEON-NOSVE-NEXT:    mov w6, v2.s[2]
+; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    mov w19, v3.s[3]
+; NONEON-NOSVE-NEXT:    mov w20, v2.s[3]
+; NONEON-NOSVE-NEXT:    mov w22, v1.s[3]
+; NONEON-NOSVE-NEXT:    mov w23, v0.s[3]
+; NONEON-NOSVE-NEXT:    udiv w4, w3, w2
+; NONEON-NOSVE-NEXT:    msub w11, w13, w11, w12
+; NONEON-NOSVE-NEXT:    fmov s1, w11
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w12, w4, w2, w3
+; NONEON-NOSVE-NEXT:    fmov s0, w12
+; NONEON-NOSVE-NEXT:    udiv w1, w18, w17
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    mov v1.s[1], w8
+; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w13, w1, w17, w18
+; NONEON-NOSVE-NEXT:    mov v0.s[1], w13
+; NONEON-NOSVE-NEXT:    udiv w7, w6, w5
+; NONEON-NOSVE-NEXT:    msub w8, w16, w14, w15
+; NONEON-NOSVE-NEXT:    mov v1.s[2], w8
+; NONEON-NOSVE-NEXT:    udiv w21, w20, w19
+; NONEON-NOSVE-NEXT:    msub w10, w7, w5, w6
+; NONEON-NOSVE-NEXT:    mov v0.s[2], w10
+; NONEON-NOSVE-NEXT:    udiv w9, w23, w22
+; NONEON-NOSVE-NEXT:    msub w10, w21, w19, w20
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v0.s[3], w10
+; NONEON-NOSVE-NEXT:    msub w8, w9, w22, w23
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mov v1.s[3], w8
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = urem <8 x i32> %op1, %op2
@@ -741,6 +2320,17 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -756,6 +2346,20 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    mov x11, v1.d[1]
+; NONEON-NOSVE-NEXT:    mov x12, v0.d[1]
+; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    udiv x13, x12, x11
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -775,6 +2379,33 @@ define void @urem_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z1.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    fmov x15, d2
+; NONEON-NOSVE-NEXT:    mov x12, v2.d[1]
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x14, d3
+; NONEON-NOSVE-NEXT:    mov x11, v3.d[1]
+; NONEON-NOSVE-NEXT:    mov x17, v1.d[1]
+; NONEON-NOSVE-NEXT:    mov x18, v0.d[1]
+; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    udiv x16, x15, x14
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    fmov d1, x8
+; NONEON-NOSVE-NEXT:    udiv x13, x12, x11
+; NONEON-NOSVE-NEXT:    msub x10, x16, x14, x15
+; NONEON-NOSVE-NEXT:    fmov d0, x10
+; NONEON-NOSVE-NEXT:    udiv x1, x18, x17
+; NONEON-NOSVE-NEXT:    msub x9, x13, x11, x12
+; NONEON-NOSVE-NEXT:    mov v0.d[1], x9
+; NONEON-NOSVE-NEXT:    msub x11, x1, x17, x18
+; NONEON-NOSVE-NEXT:    mov v1.d[1], x11
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = urem <4 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
index 906112f7ac39..b3adf4720ece 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -16,6 +17,14 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    dup v2.4h, w8
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i8> %op1, <4 x i8> %op2
   ret <4 x i8> %sel
 }
@@ -31,6 +40,14 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    dup v2.8b, w8
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2
   ret <8 x i8> %sel
 }
@@ -46,6 +63,14 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    dup v2.16b, w8
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2
   ret <16 x i8> %sel
 }
@@ -64,6 +89,20 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.b, p0, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
+; NONEON-NOSVE-NEXT:    dup v0.16b, w8
+; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <32 x i8>, ptr %a
   %op2 = load volatile <32 x i8>, ptr %b
   %sel = select i1 %mask, <32 x i8> %op1, <32 x i8> %op2
@@ -83,6 +122,14 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    dup v2.2s, w8
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i16> %op1, <2 x i16> %op2
   ret <2 x i16> %sel
 }
@@ -99,6 +146,14 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    dup v2.4h, w8
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2
   ret <4 x i16> %sel
 }
@@ -115,6 +170,14 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    dup v2.8h, w8
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2
   ret <8 x i16> %sel
 }
@@ -134,6 +197,20 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.h, p0, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <16 x i16>, ptr %a
   %op2 = load volatile <16 x i16>, ptr %b
   %sel = select i1 %mask, <16 x i16> %op1, <16 x i16> %op2
@@ -153,6 +230,14 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    dup v2.2s, w8
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2
   ret <2 x i32> %sel
 }
@@ -169,6 +254,14 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    dup v2.4s, w8
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2
   ret <4 x i32> %sel
 }
@@ -188,6 +281,20 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.s, p0, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <8 x i32>, ptr %a
   %op2 = load volatile <8 x i32>, ptr %b
   %sel = select i1 %mask, <8 x i32> %op1, <8 x i32> %op2
@@ -208,6 +315,14 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm x8, ne
+; NONEON-NOSVE-NEXT:    fmov d2, x8
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2
   ret <1 x i64> %sel
 }
@@ -225,6 +340,14 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm x8, ne
+; NONEON-NOSVE-NEXT:    dup v2.2d, x8
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2
   ret <2 x i64> %sel
 }
@@ -245,6 +368,20 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    csetm x8, ne
+; NONEON-NOSVE-NEXT:    ldr q3, [x1]
+; NONEON-NOSVE-NEXT:    ldr q4, [x1, #16]
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    bif v1.16b, v3.16b, v0.16b
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <4 x i64>, ptr %a
   %op2 = load volatile <4 x i64>, ptr %b
   %sel = select i1 %mask, <4 x i64> %op1, <4 x i64> %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
index 9ed52e321d9a..a429cd82a449 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -19,6 +20,16 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sshl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -32,6 +43,12 @@ define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.8b, v1.8b
+; NONEON-NOSVE-NEXT:    sshl v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -45,6 +62,12 @@ define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    sshl v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -60,6 +83,17 @@ define void @ashr_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    asr z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    neg v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    sshl v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    sshl v1.16b, v3.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = ashr <32 x i8> %op1, %op2
@@ -78,6 +112,16 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
+; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sshl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -91,6 +135,12 @@ define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
+; NONEON-NOSVE-NEXT:    sshl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -104,6 +154,12 @@ define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    sshl v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -119,6 +175,17 @@ define void @ashr_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    asr z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    neg v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    sshl v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    sshl v1.8h, v3.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = ashr <16 x i16> %op1, %op2
@@ -135,6 +202,12 @@ define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
+; NONEON-NOSVE-NEXT:    sshl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -148,6 +221,12 @@ define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -163,6 +242,17 @@ define void @ashr_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    asr z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    neg v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    sshl v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sshl v1.4s, v3.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = ashr <8 x i32> %op1, %op2
@@ -179,6 +269,12 @@ define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg d1, d1
+; NONEON-NOSVE-NEXT:    sshl d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -192,6 +288,12 @@ define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    sshl v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -207,6 +309,17 @@ define void @ashr_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    asr z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    neg v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    sshl v0.2d, v2.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sshl v1.2d, v3.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = ashr <4 x i64> %op1, %op2
@@ -229,6 +342,15 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
+; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
+; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -242,6 +364,12 @@ define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -255,6 +383,12 @@ define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ushl v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -270,6 +404,17 @@ define void @lshr_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsr z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    neg v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    neg v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ushl v0.16b, v2.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ushl v1.16b, v3.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = lshr <32 x i8> %op1, %op2
@@ -288,6 +433,15 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d2, #0x00ffff0000ffff
+; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v2.8b
+; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -301,6 +455,12 @@ define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -314,6 +474,12 @@ define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ushl v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -329,6 +495,17 @@ define void @lshr_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsr z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    neg v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    neg v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ushl v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    ushl v1.8h, v3.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = lshr <16 x i16> %op1, %op2
@@ -345,6 +522,12 @@ define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -358,6 +541,12 @@ define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -373,6 +562,17 @@ define void @lshr_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsr z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    neg v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    neg v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ushl v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ushl v1.4s, v3.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = lshr <8 x i32> %op1, %op2
@@ -389,6 +589,12 @@ define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg d1, d1
+; NONEON-NOSVE-NEXT:    ushl d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -402,6 +608,12 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ushl v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -417,6 +629,17 @@ define void @lshr_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsr z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    neg v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    neg v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ushl v0.2d, v2.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ushl v1.2d, v3.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = lshr <4 x i64> %op1, %op2
@@ -438,6 +661,13 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) {
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v2i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d2, #0x0000ff000000ff
+; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i8> %op1, %op2
   ret <2 x i8> %res
 }
@@ -452,6 +682,13 @@ define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
+; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -465,6 +702,11 @@ define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushl v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -478,6 +720,11 @@ define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushl v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -493,6 +740,15 @@ define void @shl_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsl z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    ushl v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ushl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = shl <32 x i8> %op1, %op2
@@ -509,6 +765,11 @@ define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushl v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -522,6 +783,11 @@ define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushl v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -537,6 +803,15 @@ define void @shl_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsl z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    ushl v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    ushl v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = shl <16 x i16> %op1, %op2
@@ -553,6 +828,11 @@ define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushl v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -566,6 +846,11 @@ define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -581,6 +866,15 @@ define void @shl_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsl z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    ushl v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ushl v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = shl <8 x i32> %op1, %op2
@@ -597,6 +891,11 @@ define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushl d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -610,6 +909,11 @@ define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushl v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -625,6 +929,15 @@ define void @shl_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsl z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    ushl v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ushl v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = shl <4 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index b285659258f3..d9ca19baea7d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -15,6 +16,13 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) {
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i16> %op1 to <4 x half>
   ret <4 x half> %res
 }
@@ -27,6 +35,22 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v1.4s
+; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
+; NONEON-NOSVE-NEXT:    str q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x half>
   store <8 x half> %res, ptr %b
@@ -42,6 +66,29 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ushll v2.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ucvtf v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ucvtf v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v3.4s
+; NONEON-NOSVE-NEXT:    stp q2, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x half>
   store <16 x half> %res, ptr %b
@@ -61,6 +108,13 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ucvtf v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i16> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -74,6 +128,12 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i16> %op1 to <4 x float>
   ret <4 x float> %res
 }
@@ -90,6 +150,20 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -114,6 +188,26 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ucvtf v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    ucvtf v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x float>
   store <16 x float> %res, ptr %b
@@ -132,6 +226,13 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) {
 ; CHECK-NEXT:    and w8, w8, #0xffff
 ; CHECK-NEXT:    ucvtf d0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v1i16_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    umov w8, v0.h[0]
+; NONEON-NOSVE-NEXT:    ucvtf d0, w8
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <1 x i16> %op1 to <1 x double>
   ret <1 x double> %res
 }
@@ -146,6 +247,14 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d1, #0x00ffff0000ffff
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i16> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -163,6 +272,21 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %res = uitofp <4 x i16> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -190,6 +314,30 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q1, [x1]
 ; CHECK-NEXT:    stp q3, q0, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -238,6 +386,46 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    ushll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr d6, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
+; NONEON-NOSVE-NEXT:    ushll v5.2d, v5.2s, #0
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v4.2d, v4.2s, #0
+; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ushll v6.2d, v6.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v7.2d, v7.2s, #0
+; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    ucvtf v5.2d, v5.2d
+; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    ucvtf v4.2d, v4.2d
+; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v7.2d
+; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
+; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v6.2d
+; NONEON-NOSVE-NEXT:    stp q2, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x double>
   store <16 x double> %res, ptr %b
@@ -257,6 +445,13 @@ define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x half>
   ret <2 x half> %res
 }
@@ -270,6 +465,12 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i32> %op1 to <4 x half>
   ret <4 x half> %res
 }
@@ -287,6 +488,15 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x half>
   ret <8 x half> %res
@@ -311,6 +521,21 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v16i32_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ucvtf v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ucvtf v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v3.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %res = uitofp <16 x i32> %op1 to <16 x half>
   store <16 x half> %res, ptr %b
@@ -329,6 +554,11 @@ define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ucvtf v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -341,6 +571,11 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i32> %op1 to <4 x float>
   ret <4 x float> %res
 }
@@ -354,6 +589,14 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ucvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -373,6 +616,12 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -389,6 +638,20 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = uitofp <4 x i32> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -413,6 +676,26 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    ushll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    ushll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -439,6 +722,18 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h2, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s1
+; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x half>
   ret <2 x half> %res
 }
@@ -459,6 +754,16 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x half>
   ret <4 x half> %res
@@ -492,6 +797,22 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtn v2.2s, v2.2d
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtn2 v2.4s, v3.2d
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v2.4s
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i64>, ptr %a
   %res = uitofp <8 x i64> %op1 to <8 x half>
   ret <8 x half> %res
@@ -510,6 +831,12 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -527,6 +854,15 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x float>
   ret <4 x float> %res
@@ -551,6 +887,21 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ucvtf v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    ucvtf v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtn v1.2s, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v2.2d
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.4s, v3.2d
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i64>, ptr %a
   %res = uitofp <8 x i64> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -569,6 +920,11 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -582,6 +938,14 @@ define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ucvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -600,6 +964,13 @@ define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) {
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i16> %op1 to <4 x half>
   ret <4 x half> %res
 }
@@ -612,6 +983,22 @@ define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v1.4s
+; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
+; NONEON-NOSVE-NEXT:    str q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x half>
   store <8 x half> %res, ptr %b
@@ -627,6 +1014,29 @@ define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    sshll v2.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    scvtf v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    scvtf v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn v2.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v2.8h, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v3.4s
+; NONEON-NOSVE-NEXT:    stp q2, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x half>
   store <16 x half> %res, ptr %b
@@ -645,6 +1055,13 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    scvtf v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i16> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -658,6 +1075,12 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i16> %op1 to <4 x float>
   ret <4 x float> %res
 }
@@ -674,6 +1097,20 @@ define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -698,6 +1135,26 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    scvtf v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    scvtf v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x float>
   store <16 x float> %res, ptr %b
@@ -719,6 +1176,14 @@ define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i16> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -736,6 +1201,21 @@ define void @scvtf_v4i16_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %res = sitofp <4 x i16> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -763,6 +1243,30 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q1, [x1]
 ; CHECK-NEXT:    stp q3, q0, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -811,6 +1315,46 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v1.4s, v1.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    sshll v3.4s, v3.4h, #0
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr d6, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d7, [sp, #40]
+; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
+; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    sshll v6.2d, v6.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v7.2d, v7.2s, #0
+; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    scvtf v5.2d, v5.2d
+; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    scvtf v4.2d, v4.2d
+; NONEON-NOSVE-NEXT:    stp q0, q5, [x1]
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v7.2d
+; NONEON-NOSVE-NEXT:    stp q1, q4, [x1, #64]
+; NONEON-NOSVE-NEXT:    scvtf v1.2d, v6.2d
+; NONEON-NOSVE-NEXT:    stp q2, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x double>
   store <16 x double> %res, ptr %b
@@ -830,6 +1374,13 @@ define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x half>
   ret <2 x half> %res
 }
@@ -843,6 +1394,12 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i32> %op1 to <4 x half>
   ret <4 x half> %res
 }
@@ -860,6 +1417,15 @@ define <8 x half> @scvtf_v8i32_v8f16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.8h, v1.4s
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x half>
   ret <8 x half> %res
@@ -877,6 +1443,11 @@ define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    scvtf v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -889,6 +1460,11 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i32> %op1 to <4 x float>
   ret <4 x float> %res
 }
@@ -902,6 +1478,14 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    scvtf v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    scvtf v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -921,6 +1505,12 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -937,6 +1527,20 @@ define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = sitofp <4 x i32> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -961,6 +1565,26 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -1005,6 +1629,40 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q1, [x1]
 ; CHECK-NEXT:    stp q4, q0, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v16i32_v16f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d4, [sp, #24]
+; NONEON-NOSVE-NEXT:    sshll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d5, [sp, #56]
+; NONEON-NOSVE-NEXT:    sshll v3.2d, v3.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d6, [sp, #40]
+; NONEON-NOSVE-NEXT:    sshll v4.2d, v4.2s, #0
+; NONEON-NOSVE-NEXT:    ldr d7, [sp, #8]
+; NONEON-NOSVE-NEXT:    sshll v1.2d, v1.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v5.2d, v5.2s, #0
+; NONEON-NOSVE-NEXT:    scvtf v2.2d, v2.2d
+; NONEON-NOSVE-NEXT:    sshll v6.2d, v6.2s, #0
+; NONEON-NOSVE-NEXT:    scvtf v3.2d, v3.2d
+; NONEON-NOSVE-NEXT:    sshll v0.2d, v0.2s, #0
+; NONEON-NOSVE-NEXT:    sshll v7.2d, v7.2s, #0
+; NONEON-NOSVE-NEXT:    scvtf v4.2d, v4.2d
+; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    scvtf v5.2d, v5.2d
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    stp q2, q4, [x1, #96]
+; NONEON-NOSVE-NEXT:    scvtf v2.2d, v6.2d
+; NONEON-NOSVE-NEXT:    stp q3, q5, [x1, #64]
+; NONEON-NOSVE-NEXT:    scvtf v3.2d, v7.2d
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %res = sitofp <16 x i32> %op1 to <16 x double>
   store <16 x double> %res, ptr %b
@@ -1031,6 +1689,18 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov x8, v0.d[1]
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    scvtf s1, x9
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h2, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s1
+; NONEON-NOSVE-NEXT:    mov v0.h[1], v2.h[0]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x half>
   ret <2 x half> %res
 }
@@ -1051,6 +1721,16 @@ define <4 x half> @scvtf_v4i64_v4f16(ptr %a) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x half>
   ret <4 x half> %res
@@ -1069,6 +1749,12 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -1086,6 +1772,15 @@ define <4 x float> @scvtf_v4i64_v4f32(ptr %a) {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    fcvtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    fcvtn2 v0.4s, v1.2d
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x float>
   ret <4 x float> %res
@@ -1103,6 +1798,11 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -1116,6 +1816,14 @@ define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    scvtf v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    scvtf v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -1128,6 +1836,13 @@ define half @scvtf_i16_f16(ptr %0) {
 ; CHECK-NEXT:    ldrsh w8, [x0]
 ; CHECK-NEXT:    scvtf h0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i16_f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldrsh w8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = sitofp i16 %2 to half
   ret half %3
@@ -1139,6 +1854,12 @@ define float @scvtf_i16_f32(ptr %0) {
 ; CHECK-NEXT:    ldrsh w8, [x0]
 ; CHECK-NEXT:    scvtf s0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i16_f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldrsh w8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = sitofp i16 %2 to float
   ret float %3
@@ -1150,6 +1871,12 @@ define double @scvtf_i16_f64(ptr %0) {
 ; CHECK-NEXT:    ldrsh w8, [x0]
 ; CHECK-NEXT:    scvtf d0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i16_f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldrsh w8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = sitofp i16 %2 to double
   ret double %3
@@ -1161,6 +1888,13 @@ define half @scvtf_i32_f16(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    scvtf h0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i32_f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = sitofp i32 %2 to half
   ret half %3
@@ -1172,6 +1906,12 @@ define float @scvtf_i32_f32(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    scvtf s0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i32_f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = sitofp i32 %2 to float
   ret float %3
@@ -1183,6 +1923,12 @@ define double @scvtf_i32_f64(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    scvtf d0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i32_f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = sitofp i32 %2 to double
   ret double %3
@@ -1194,6 +1940,13 @@ define half @scvtf_i64_f16(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    scvtf h0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i64_f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr x8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = sitofp i64 %2 to half
   ret half %3
@@ -1205,6 +1958,12 @@ define float @scvtf_i64_f32(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    scvtf s0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i64_f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr x8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = sitofp i64 %2 to float
   ret float %3
@@ -1216,6 +1975,12 @@ define double @scvtf_i64_f64(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    scvtf d0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i64_f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr x8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = sitofp i64 %2 to double
   ret double %3
@@ -1227,6 +1992,13 @@ define half @ucvtf_i16_f16(ptr %0) {
 ; CHECK-NEXT:    ldrh w8, [x0]
 ; CHECK-NEXT:    ucvtf h0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i16_f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = uitofp i16 %2 to half
   ret half %3
@@ -1238,6 +2010,12 @@ define float @ucvtf_i16_f32(ptr %0) {
 ; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    ucvtf s0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i16_f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = uitofp i16 %2 to float
   ret float %3
@@ -1249,6 +2027,12 @@ define double @ucvtf_i16_f64(ptr %0) {
 ; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    ucvtf d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i16_f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = uitofp i16 %2 to double
   ret double %3
@@ -1260,6 +2044,13 @@ define half @ucvtf_i32_f16(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    ucvtf h0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i32_f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = uitofp i32 %2 to half
   ret half %3
@@ -1271,6 +2062,12 @@ define float @ucvtf_i32_f32(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    ucvtf s0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i32_f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = uitofp i32 %2 to float
   ret float %3
@@ -1282,6 +2079,12 @@ define double @ucvtf_i32_f64(ptr %0) {
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ucvtf d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i32_f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = uitofp i32 %2 to double
   ret double %3
@@ -1293,6 +2096,13 @@ define half @ucvtf_i64_f16(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    ucvtf h0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i64_f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr x8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = uitofp i64 %2 to half
   ret half %3
@@ -1304,6 +2114,12 @@ define float @ucvtf_i64_f32(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    ucvtf s0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i64_f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr x8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = uitofp i64 %2 to float
   ret float %3
@@ -1315,6 +2131,12 @@ define double @ucvtf_i64_f64(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    ucvtf d0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i64_f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr x8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = uitofp i64 %2 to double
   ret double %3
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
index 81bbaa92d4b4..42daa4fedc94 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,6 +19,13 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, <4 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
+; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i8> %op1, <4 x i8> %op2
   ret <4 x i8> %sel
 }
@@ -36,6 +44,13 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v2.8b, v2.8b, #7
+; NONEON-NOSVE-NEXT:    cmlt v2.8b, v2.8b, #0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x i8> %op1, <8 x i8> %op2
   ret <8 x i8> %sel
 }
@@ -54,6 +69,13 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask)
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v2.16b, v2.16b, #7
+; NONEON-NOSVE-NEXT:    cmlt v2.16b, v2.16b, #0
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <16 x i1> %mask, <16 x i8> %op1, <16 x i8> %op2
   ret <16 x i8> %sel
 }
@@ -70,6 +92,18 @@ define void @select_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.b, p0, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    cmeq v4.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    cmeq v5.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
+; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
+; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %mask = icmp eq <32 x i8> %op1, %op2
@@ -92,6 +126,13 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, <2 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v2.2s, v2.2s, #31
+; NONEON-NOSVE-NEXT:    cmlt v2.2s, v2.2s, #0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i16> %op1, <2 x i16> %op2
   ret <2 x i16> %sel
 }
@@ -110,6 +151,13 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v2.4h, v2.4h, #15
+; NONEON-NOSVE-NEXT:    cmlt v2.4h, v2.4h, #0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i16> %op1, <4 x i16> %op2
   ret <4 x i16> %sel
 }
@@ -129,6 +177,14 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v2.8h, v2.8b, #0
+; NONEON-NOSVE-NEXT:    shl v2.8h, v2.8h, #15
+; NONEON-NOSVE-NEXT:    cmlt v2.8h, v2.8h, #0
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x i16> %op1, <8 x i16> %op2
   ret <8 x i16> %sel
 }
@@ -145,6 +201,18 @@ define void @select_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    cmeq v4.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    cmeq v5.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
+; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
+; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %mask = icmp eq <16 x i16> %op1, %op2
@@ -167,6 +235,13 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v2.2s, v2.2s, #31
+; NONEON-NOSVE-NEXT:    cmlt v2.2s, v2.2s, #0
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i32> %op1, <2 x i32> %op2
   ret <2 x i32> %sel
 }
@@ -186,6 +261,14 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v2.4s, v2.4h, #0
+; NONEON-NOSVE-NEXT:    shl v2.4s, v2.4s, #31
+; NONEON-NOSVE-NEXT:    cmlt v2.4s, v2.4s, #0
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i32> %op1, <4 x i32> %op2
   ret <4 x i32> %sel
 }
@@ -202,6 +285,18 @@ define void @select_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    cmeq v4.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    cmeq v5.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
+; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
+; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %mask = icmp eq <8 x i32> %op1, %op2
@@ -223,6 +318,14 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csetm x8, ne
+; NONEON-NOSVE-NEXT:    fmov d2, x8
+; NONEON-NOSVE-NEXT:    bif v0.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2
   ret <1 x i64> %sel
 }
@@ -242,6 +345,14 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v2.2d, v2.2s, #0
+; NONEON-NOSVE-NEXT:    shl v2.2d, v2.2d, #63
+; NONEON-NOSVE-NEXT:    cmlt v2.2d, v2.2d, #0
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i64> %op1, <2 x i64> %op2
   ret <2 x i64> %sel
 }
@@ -258,6 +369,18 @@ define void @select_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.d, p0, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    cmeq v4.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    cmeq v5.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    bif v0.16b, v1.16b, v4.16b
+; NONEON-NOSVE-NEXT:    mov v1.16b, v5.16b
+; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %mask = icmp eq <4 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
index 885030861469..01a7a5cafd26 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,6 +19,19 @@ define <4 x i32> @test(ptr %arg1, ptr %arg2) {
 ; CHECK-NEXT:    stp q2, q5, [x0, #32]
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [x0]
+; NONEON-NOSVE-NEXT:    add v2.4s, v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    add v5.4s, v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    dup v0.4s, v1.s[2]
+; NONEON-NOSVE-NEXT:    add v1.4s, v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    add v3.4s, v4.4s, v4.4s
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %0 = load <16 x i32>, ptr %arg1, align 256
   %1 = load <16 x i32>, ptr %arg2, align 256
@@ -42,6 +56,19 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) {
 ; CHECK-NEXT:    stp q3, q4, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test2:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [x0]
+; NONEON-NOSVE-NEXT:    add v2.4s, v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    dup v0.2s, v1.s[2]
+; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    add v3.4s, v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    add v4.4s, v4.4s, v4.4s
+; NONEON-NOSVE-NEXT:    stp q2, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x0]
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %0 = load <16 x i32>, ptr %arg1, align 256
   %1 = load <16 x i32>, ptr %arg2, align 256
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
index 8ca8e6980913..c57f3af0d4b6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -11,6 +12,13 @@ define <4 x i8> @load_v4i8(ptr %a) {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i8>, ptr %a
   ret <4 x i8> %load
 }
@@ -20,6 +28,11 @@ define <8 x i8> @load_v8i8(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x i8>, ptr %a
   ret <8 x i8> %load
 }
@@ -29,6 +42,11 @@ define <16 x i8> @load_v16i8(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <16 x i8>, ptr %a
   ret <16 x i8> %load
 }
@@ -38,6 +56,11 @@ define <32 x i8> @load_v32i8(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <32 x i8>, ptr %a
   ret <32 x i8> %load
 }
@@ -49,6 +72,15 @@ define <2 x i16> @load_v2i16(ptr %a) {
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    add x8, x0, #2
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x i16>, ptr %a
   ret <2 x i16> %load
 }
@@ -58,6 +90,11 @@ define <2 x half> @load_v2f16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x half>, ptr %a
   ret <2 x half> %load
 }
@@ -67,6 +104,11 @@ define <4 x i16> @load_v4i16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i16>, ptr %a
   ret <4 x i16> %load
 }
@@ -76,6 +118,11 @@ define <4 x half> @load_v4f16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x half>, ptr %a
   ret <4 x half> %load
 }
@@ -85,6 +132,11 @@ define <8 x i16> @load_v8i16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x i16>, ptr %a
   ret <8 x i16> %load
 }
@@ -94,6 +146,11 @@ define <8 x half> @load_v8f16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x half>, ptr %a
   ret <8 x half> %load
 }
@@ -103,6 +160,11 @@ define <16 x i16> @load_v16i16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <16 x i16>, ptr %a
   ret <16 x i16> %load
 }
@@ -112,6 +174,11 @@ define <16 x half> @load_v16f16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <16 x half>, ptr %a
   ret <16 x half> %load
 }
@@ -121,6 +188,11 @@ define <2 x i32> @load_v2i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x i32>, ptr %a
   ret <2 x i32> %load
 }
@@ -130,6 +202,11 @@ define <2 x float> @load_v2f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x float>, ptr %a
   ret <2 x float> %load
 }
@@ -139,6 +216,11 @@ define <4 x i32> @load_v4i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i32>, ptr %a
   ret <4 x i32> %load
 }
@@ -148,6 +230,11 @@ define <4 x float> @load_v4f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x float>, ptr %a
   ret <4 x float> %load
 }
@@ -157,6 +244,11 @@ define <8 x i32> @load_v8i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x i32>, ptr %a
   ret <8 x i32> %load
 }
@@ -166,6 +258,11 @@ define <8 x float> @load_v8f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x float>, ptr %a
   ret <8 x float> %load
 }
@@ -175,6 +272,11 @@ define <1 x i64> @load_v1i64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <1 x i64>, ptr %a
   ret <1 x i64> %load
 }
@@ -184,6 +286,11 @@ define <1 x double> @load_v1f64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <1 x double>, ptr %a
   ret <1 x double> %load
 }
@@ -193,6 +300,11 @@ define <2 x i64> @load_v2i64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x i64>, ptr %a
   ret <2 x i64> %load
 }
@@ -202,6 +314,11 @@ define <2 x double> @load_v2f64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x double>, ptr %a
   ret <2 x double> %load
 }
@@ -211,6 +328,11 @@ define <4 x i64> @load_v4i64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i64>, ptr %a
   ret <4 x i64> %load
 }
@@ -220,6 +342,11 @@ define <4 x double> @load_v4f64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x double>, ptr %a
   ret <4 x double> %load
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
index c4aeb4465c53..65c45587e120 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -17,6 +18,14 @@ define i8 @andv_v4i8(<4 x i8> %a) {
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a)
   ret i8 %res
 }
@@ -29,6 +38,15 @@ define i8 @andv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #16
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -41,6 +59,20 @@ define i8 @andv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #16
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -54,6 +86,22 @@ define i8 @andv_v32i8(ptr %a) {
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #16
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -67,6 +115,13 @@ define i16 @andv_v2i16(<2 x i16> %a) {
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a)
   ret i16 %res
 }
@@ -79,6 +134,14 @@ define i16 @andv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -91,6 +154,19 @@ define i16 @andv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -104,6 +180,21 @@ define i16 @andv_v16i16(ptr %a) {
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    and x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -117,6 +208,13 @@ define i32 @andv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -129,6 +227,18 @@ define i32 @andv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -142,6 +252,20 @@ define i32 @andv_v8i32(ptr %a) {
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -155,6 +279,16 @@ define i64 @andv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    andv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -168,6 +302,18 @@ define i64 @andv_v4i64(ptr %a) {
 ; CHECK-NEXT:    andv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    and v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -185,6 +331,14 @@ define i8 @eorv_v4i8(<4 x i8> %a) {
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %a)
   ret i8 %res
 }
@@ -197,6 +351,15 @@ define i8 @eorv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #16
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -209,6 +372,20 @@ define i8 @eorv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #16
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -222,6 +399,22 @@ define i8 @eorv_v32i8(ptr %a) {
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #16
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -235,6 +428,13 @@ define i16 @eorv_v2i16(<2 x i16> %a) {
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %a)
   ret i16 %res
 }
@@ -247,6 +447,14 @@ define i16 @eorv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -259,6 +467,19 @@ define i16 @eorv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -272,6 +493,21 @@ define i16 @eorv_v16i16(ptr %a) {
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    eor x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -285,6 +521,13 @@ define i32 @eorv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -297,6 +540,18 @@ define i32 @eorv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -310,6 +565,20 @@ define i32 @eorv_v8i32(ptr %a) {
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -323,6 +592,16 @@ define i64 @eorv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    eorv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -336,6 +615,18 @@ define i64 @eorv_v4i64(ptr %a) {
 ; CHECK-NEXT:    eorv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -353,6 +644,14 @@ define i8 @orv_v4i8(<4 x i8> %a) {
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a)
   ret i8 %res
 }
@@ -365,6 +664,15 @@ define i8 @orv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #16
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -377,6 +685,20 @@ define i8 @orv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #16
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -390,6 +712,22 @@ define i8 @orv_v32i8(ptr %a) {
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #16
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #8
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -403,6 +741,13 @@ define i16 @orv_v2i16(<2 x i16> %a) {
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a)
   ret i16 %res
 }
@@ -415,6 +760,14 @@ define i16 @orv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -427,6 +780,19 @@ define i16 @orv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -440,6 +806,21 @@ define i16 @orv_v16i16(ptr %a) {
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    orr x8, x8, x8, lsr #32
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #16
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -453,6 +834,13 @@ define i32 @orv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -465,6 +853,18 @@ define i32 @orv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -478,6 +878,20 @@ define i32 @orv_v8i32(ptr %a) {
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #32
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -491,6 +905,16 @@ define i64 @orv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    orv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -504,6 +928,18 @@ define i64 @orv_v4i64(ptr %a) {
 ; CHECK-NEXT:    orv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    orr v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    fmov x0, d0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op)
   ret i64 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index ca58099244cf..886f97ed988d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -19,6 +20,44 @@ define <4 x i8> @masked_load_v4i8(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI0_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addv h0, v0.4h
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB0_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[0], [x0]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB0_3
+; NONEON-NOSVE-NEXT:    b .LBB0_4
+; NONEON-NOSVE-NEXT:  .LBB0_2:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB0_4
+; NONEON-NOSVE-NEXT:  .LBB0_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    add x9, x0, #1
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
+; NONEON-NOSVE-NEXT:  .LBB0_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB0_7
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB0_8
+; NONEON-NOSVE-NEXT:  .LBB0_6: // %else8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB0_7: // %cond.load4
+; NONEON-NOSVE-NEXT:    add x9, x0, #2
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB0_6
+; NONEON-NOSVE-NEXT:  .LBB0_8: // %cond.load7
+; NONEON-NOSVE-NEXT:    add x8, x0, #3
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x8]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x i8> @llvm.masked.load.v4i8(ptr %src, i32 8, <4 x i1> %mask, <4 x i8> zeroinitializer)
   ret <4 x i8> %load
 }
@@ -34,6 +73,67 @@ define <8 x i8> @masked_load_v8i8(ptr %src, <8 x i1> %mask) {
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI1_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addv b0, v0.8b
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB1_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr b0, [x0]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB1_3
+; NONEON-NOSVE-NEXT:    b .LBB1_4
+; NONEON-NOSVE-NEXT:  .LBB1_2:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB1_4
+; NONEON-NOSVE-NEXT:  .LBB1_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    add x9, x0, #1
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[1], [x9]
+; NONEON-NOSVE-NEXT:  .LBB1_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB1_11
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB1_12
+; NONEON-NOSVE-NEXT:  .LBB1_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB1_13
+; NONEON-NOSVE-NEXT:  .LBB1_7: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB1_14
+; NONEON-NOSVE-NEXT:  .LBB1_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB1_15
+; NONEON-NOSVE-NEXT:  .LBB1_9: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB1_16
+; NONEON-NOSVE-NEXT:  .LBB1_10: // %else20
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB1_11: // %cond.load4
+; NONEON-NOSVE-NEXT:    add x9, x0, #2
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB1_6
+; NONEON-NOSVE-NEXT:  .LBB1_12: // %cond.load7
+; NONEON-NOSVE-NEXT:    add x9, x0, #3
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[3], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB1_7
+; NONEON-NOSVE-NEXT:  .LBB1_13: // %cond.load10
+; NONEON-NOSVE-NEXT:    add x9, x0, #4
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB1_8
+; NONEON-NOSVE-NEXT:  .LBB1_14: // %cond.load13
+; NONEON-NOSVE-NEXT:    add x9, x0, #5
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[5], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB1_9
+; NONEON-NOSVE-NEXT:  .LBB1_15: // %cond.load16
+; NONEON-NOSVE-NEXT:    add x9, x0, #6
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB1_10
+; NONEON-NOSVE-NEXT:  .LBB1_16: // %cond.load19
+; NONEON-NOSVE-NEXT:    add x8, x0, #7
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[7], [x8]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %src, i32 8, <8 x i1> %mask, <8 x i8> zeroinitializer)
   ret <8 x i8> %load
 }
@@ -49,6 +149,115 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
+; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    addv h1, v0.8h
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB2_17
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_18
+; NONEON-NOSVE-NEXT:  .LBB2_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB2_19
+; NONEON-NOSVE-NEXT:  .LBB2_3: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB2_20
+; NONEON-NOSVE-NEXT:  .LBB2_4: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB2_21
+; NONEON-NOSVE-NEXT:  .LBB2_5: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB2_22
+; NONEON-NOSVE-NEXT:  .LBB2_6: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB2_23
+; NONEON-NOSVE-NEXT:  .LBB2_7: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB2_24
+; NONEON-NOSVE-NEXT:  .LBB2_8: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB2_25
+; NONEON-NOSVE-NEXT:  .LBB2_9: // %else23
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB2_26
+; NONEON-NOSVE-NEXT:  .LBB2_10: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB2_27
+; NONEON-NOSVE-NEXT:  .LBB2_11: // %else29
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB2_28
+; NONEON-NOSVE-NEXT:  .LBB2_12: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB2_29
+; NONEON-NOSVE-NEXT:  .LBB2_13: // %else35
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB2_30
+; NONEON-NOSVE-NEXT:  .LBB2_14: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB2_31
+; NONEON-NOSVE-NEXT:  .LBB2_15: // %else41
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_32
+; NONEON-NOSVE-NEXT:  .LBB2_16: // %else44
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB2_17: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr b0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB2_2
+; NONEON-NOSVE-NEXT:  .LBB2_18: // %cond.load1
+; NONEON-NOSVE-NEXT:    add x9, x0, #1
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[1], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB2_3
+; NONEON-NOSVE-NEXT:  .LBB2_19: // %cond.load4
+; NONEON-NOSVE-NEXT:    add x9, x0, #2
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB2_4
+; NONEON-NOSVE-NEXT:  .LBB2_20: // %cond.load7
+; NONEON-NOSVE-NEXT:    add x9, x0, #3
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[3], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB2_5
+; NONEON-NOSVE-NEXT:  .LBB2_21: // %cond.load10
+; NONEON-NOSVE-NEXT:    add x9, x0, #4
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB2_6
+; NONEON-NOSVE-NEXT:  .LBB2_22: // %cond.load13
+; NONEON-NOSVE-NEXT:    add x9, x0, #5
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[5], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB2_7
+; NONEON-NOSVE-NEXT:  .LBB2_23: // %cond.load16
+; NONEON-NOSVE-NEXT:    add x9, x0, #6
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB2_8
+; NONEON-NOSVE-NEXT:  .LBB2_24: // %cond.load19
+; NONEON-NOSVE-NEXT:    add x9, x0, #7
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[7], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB2_9
+; NONEON-NOSVE-NEXT:  .LBB2_25: // %cond.load22
+; NONEON-NOSVE-NEXT:    add x9, x0, #8
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[8], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB2_10
+; NONEON-NOSVE-NEXT:  .LBB2_26: // %cond.load25
+; NONEON-NOSVE-NEXT:    add x9, x0, #9
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[9], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB2_11
+; NONEON-NOSVE-NEXT:  .LBB2_27: // %cond.load28
+; NONEON-NOSVE-NEXT:    add x9, x0, #10
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[10], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB2_12
+; NONEON-NOSVE-NEXT:  .LBB2_28: // %cond.load31
+; NONEON-NOSVE-NEXT:    add x9, x0, #11
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[11], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB2_13
+; NONEON-NOSVE-NEXT:  .LBB2_29: // %cond.load34
+; NONEON-NOSVE-NEXT:    add x9, x0, #12
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[12], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB2_14
+; NONEON-NOSVE-NEXT:  .LBB2_30: // %cond.load37
+; NONEON-NOSVE-NEXT:    add x9, x0, #13
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[13], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB2_15
+; NONEON-NOSVE-NEXT:  .LBB2_31: // %cond.load40
+; NONEON-NOSVE-NEXT:    add x9, x0, #14
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[14], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_16
+; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.load43
+; NONEON-NOSVE-NEXT:    add x8, x0, #15
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[15], [x8]
+; NONEON-NOSVE-NEXT:    ret
   %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer)
   ret <16 x i8> %load
 }
@@ -130,6 +339,277 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fmov s1, w1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    mov v1.b[1], w2
+; NONEON-NOSVE-NEXT:    mov v0.b[1], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    mov v1.b[2], w3
+; NONEON-NOSVE-NEXT:    mov v0.b[2], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    mov v1.b[3], w4
+; NONEON-NOSVE-NEXT:    mov v0.b[3], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    mov v1.b[4], w5
+; NONEON-NOSVE-NEXT:    mov v0.b[4], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    mov v1.b[5], w6
+; NONEON-NOSVE-NEXT:    mov v0.b[5], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    mov v1.b[6], w7
+; NONEON-NOSVE-NEXT:    mov v0.b[6], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    mov v1.b[7], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    mov v0.b[7], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    mov v1.b[8], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    mov v0.b[8], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    mov v1.b[9], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    mov v0.b[9], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    mov v1.b[10], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    mov v0.b[10], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    mov v1.b[11], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    mov v0.b[11], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    mov v1.b[12], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    mov v0.b[12], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    mov v1.b[13], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    mov v0.b[13], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    mov v1.b[14], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    mov v0.b[14], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    mov v1.b[15], w9
+; NONEON-NOSVE-NEXT:    mov v0.b[15], w8
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
+; NONEON-NOSVE-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
+; NONEON-NOSVE-NEXT:    shl v1.16b, v1.16b, #7
+; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
+; NONEON-NOSVE-NEXT:    cmlt v1.16b, v1.16b, #0
+; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
+; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    zip1 v1.16b, v1.16b, v3.16b
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    addv h1, v1.8h
+; NONEON-NOSVE-NEXT:    addv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    fmov w9, s0
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    bfi w8, w9, #16, #16
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_33
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_34
+; NONEON-NOSVE-NEXT:  .LBB3_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_35
+; NONEON-NOSVE-NEXT:  .LBB3_3: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_36
+; NONEON-NOSVE-NEXT:  .LBB3_4: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_37
+; NONEON-NOSVE-NEXT:  .LBB3_5: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_38
+; NONEON-NOSVE-NEXT:  .LBB3_6: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_39
+; NONEON-NOSVE-NEXT:  .LBB3_7: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_40
+; NONEON-NOSVE-NEXT:  .LBB3_8: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_41
+; NONEON-NOSVE-NEXT:  .LBB3_9: // %else23
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_42
+; NONEON-NOSVE-NEXT:  .LBB3_10: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_43
+; NONEON-NOSVE-NEXT:  .LBB3_11: // %else29
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_44
+; NONEON-NOSVE-NEXT:  .LBB3_12: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_45
+; NONEON-NOSVE-NEXT:  .LBB3_13: // %else35
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_46
+; NONEON-NOSVE-NEXT:  .LBB3_14: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_47
+; NONEON-NOSVE-NEXT:  .LBB3_15: // %else41
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_48
+; NONEON-NOSVE-NEXT:  .LBB3_16: // %else44
+; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_49
+; NONEON-NOSVE-NEXT:  .LBB3_17: // %else47
+; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_50
+; NONEON-NOSVE-NEXT:  .LBB3_18: // %else50
+; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_51
+; NONEON-NOSVE-NEXT:  .LBB3_19: // %else53
+; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_52
+; NONEON-NOSVE-NEXT:  .LBB3_20: // %else56
+; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_53
+; NONEON-NOSVE-NEXT:  .LBB3_21: // %else59
+; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_54
+; NONEON-NOSVE-NEXT:  .LBB3_22: // %else62
+; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_55
+; NONEON-NOSVE-NEXT:  .LBB3_23: // %else65
+; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_56
+; NONEON-NOSVE-NEXT:  .LBB3_24: // %else68
+; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_57
+; NONEON-NOSVE-NEXT:  .LBB3_25: // %else71
+; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_58
+; NONEON-NOSVE-NEXT:  .LBB3_26: // %else74
+; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_59
+; NONEON-NOSVE-NEXT:  .LBB3_27: // %else77
+; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_60
+; NONEON-NOSVE-NEXT:  .LBB3_28: // %else80
+; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_61
+; NONEON-NOSVE-NEXT:  .LBB3_29: // %else83
+; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_62
+; NONEON-NOSVE-NEXT:  .LBB3_30: // %else86
+; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_63
+; NONEON-NOSVE-NEXT:  .LBB3_31: // %else89
+; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_64
+; NONEON-NOSVE-NEXT:  .LBB3_32: // %else92
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB3_33: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr b0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_2
+; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.load1
+; NONEON-NOSVE-NEXT:    add x9, x0, #1
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[1], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB3_3
+; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.load4
+; NONEON-NOSVE-NEXT:    add x9, x0, #2
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[2], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_4
+; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.load7
+; NONEON-NOSVE-NEXT:    add x9, x0, #3
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[3], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_5
+; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.load10
+; NONEON-NOSVE-NEXT:    add x9, x0, #4
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[4], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_6
+; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.load13
+; NONEON-NOSVE-NEXT:    add x9, x0, #5
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[5], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_7
+; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.load16
+; NONEON-NOSVE-NEXT:    add x9, x0, #6
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[6], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_8
+; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.load19
+; NONEON-NOSVE-NEXT:    add x9, x0, #7
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[7], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_9
+; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.load22
+; NONEON-NOSVE-NEXT:    add x9, x0, #8
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[8], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_10
+; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.load25
+; NONEON-NOSVE-NEXT:    add x9, x0, #9
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[9], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_11
+; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.load28
+; NONEON-NOSVE-NEXT:    add x9, x0, #10
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[10], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_12
+; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.load31
+; NONEON-NOSVE-NEXT:    add x9, x0, #11
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[11], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_13
+; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.load34
+; NONEON-NOSVE-NEXT:    add x9, x0, #12
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[12], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_14
+; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.load37
+; NONEON-NOSVE-NEXT:    add x9, x0, #13
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[13], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_15
+; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.load40
+; NONEON-NOSVE-NEXT:    add x9, x0, #14
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[14], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_16
+; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.load43
+; NONEON-NOSVE-NEXT:    add x9, x0, #15
+; NONEON-NOSVE-NEXT:    ld1 { v0.b }[15], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_17
+; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.load46
+; NONEON-NOSVE-NEXT:    add x9, x0, #16
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[0], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_18
+; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.load49
+; NONEON-NOSVE-NEXT:    add x9, x0, #17
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[1], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_19
+; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.load52
+; NONEON-NOSVE-NEXT:    add x9, x0, #18
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[2], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_20
+; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.load55
+; NONEON-NOSVE-NEXT:    add x9, x0, #19
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[3], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_21
+; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.load58
+; NONEON-NOSVE-NEXT:    add x9, x0, #20
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[4], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_22
+; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.load61
+; NONEON-NOSVE-NEXT:    add x9, x0, #21
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[5], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_23
+; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.load64
+; NONEON-NOSVE-NEXT:    add x9, x0, #22
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[6], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_24
+; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.load67
+; NONEON-NOSVE-NEXT:    add x9, x0, #23
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[7], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_25
+; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.load70
+; NONEON-NOSVE-NEXT:    add x9, x0, #24
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[8], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_26
+; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.load73
+; NONEON-NOSVE-NEXT:    add x9, x0, #25
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[9], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_27
+; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.load76
+; NONEON-NOSVE-NEXT:    add x9, x0, #26
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[10], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_28
+; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.load79
+; NONEON-NOSVE-NEXT:    add x9, x0, #27
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[11], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_29
+; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.load82
+; NONEON-NOSVE-NEXT:    add x9, x0, #28
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[12], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_30
+; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.load85
+; NONEON-NOSVE-NEXT:    add x9, x0, #29
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[13], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_31
+; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.load88
+; NONEON-NOSVE-NEXT:    add x9, x0, #30
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[14], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_32
+; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.load91
+; NONEON-NOSVE-NEXT:    add x8, x0, #31
+; NONEON-NOSVE-NEXT:    ld1 { v1.b }[15], [x8]
+; NONEON-NOSVE-NEXT:    ret
   %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %src, i32 8, <32 x i1> %mask, <32 x i8> zeroinitializer)
   ret <32 x i8> %load
 }
@@ -155,6 +635,31 @@ define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI4_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI4_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addp v1.2s, v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB4_3
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB4_4
+; NONEON-NOSVE-NEXT:  .LBB4_2: // %else2
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB4_3: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB4_2
+; NONEON-NOSVE-NEXT:  .LBB4_4: // %cond.load1
+; NONEON-NOSVE-NEXT:    add x8, x0, #2
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x8]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x half> @llvm.masked.load.v2f16(ptr %src, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer)
   ret <2 x half> %load
 }
@@ -170,6 +675,43 @@ define <4 x half> @masked_load_v4f16(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI5_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addv h1, v0.4h
+; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB5_5
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB5_6
+; NONEON-NOSVE-NEXT:  .LBB5_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB5_7
+; NONEON-NOSVE-NEXT:  .LBB5_3: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB5_8
+; NONEON-NOSVE-NEXT:  .LBB5_4: // %else8
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB5_5: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB5_2
+; NONEON-NOSVE-NEXT:  .LBB5_6: // %cond.load1
+; NONEON-NOSVE-NEXT:    add x9, x0, #2
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB5_3
+; NONEON-NOSVE-NEXT:  .LBB5_7: // %cond.load4
+; NONEON-NOSVE-NEXT:    add x9, x0, #4
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB5_4
+; NONEON-NOSVE-NEXT:  .LBB5_8: // %cond.load7
+; NONEON-NOSVE-NEXT:    add x8, x0, #6
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x8]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x half> @llvm.masked.load.v4f16(ptr %src, i32 8, <4 x i1> %mask, <4 x half> zeroinitializer)
   ret <4 x half> %load
 }
@@ -186,6 +728,65 @@ define <8 x half> @masked_load_v8f16(ptr %src, <8 x i1> %mask) {
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI6_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addv b1, v0.8b
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB6_9
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB6_10
+; NONEON-NOSVE-NEXT:  .LBB6_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB6_11
+; NONEON-NOSVE-NEXT:  .LBB6_3: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB6_12
+; NONEON-NOSVE-NEXT:  .LBB6_4: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB6_13
+; NONEON-NOSVE-NEXT:  .LBB6_5: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB6_14
+; NONEON-NOSVE-NEXT:  .LBB6_6: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB6_15
+; NONEON-NOSVE-NEXT:  .LBB6_7: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB6_16
+; NONEON-NOSVE-NEXT:  .LBB6_8: // %else20
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB6_9: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB6_2
+; NONEON-NOSVE-NEXT:  .LBB6_10: // %cond.load1
+; NONEON-NOSVE-NEXT:    add x9, x0, #2
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB6_3
+; NONEON-NOSVE-NEXT:  .LBB6_11: // %cond.load4
+; NONEON-NOSVE-NEXT:    add x9, x0, #4
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB6_4
+; NONEON-NOSVE-NEXT:  .LBB6_12: // %cond.load7
+; NONEON-NOSVE-NEXT:    add x9, x0, #6
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB6_5
+; NONEON-NOSVE-NEXT:  .LBB6_13: // %cond.load10
+; NONEON-NOSVE-NEXT:    add x9, x0, #8
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[4], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB6_6
+; NONEON-NOSVE-NEXT:  .LBB6_14: // %cond.load13
+; NONEON-NOSVE-NEXT:    add x9, x0, #10
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[5], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB6_7
+; NONEON-NOSVE-NEXT:  .LBB6_15: // %cond.load16
+; NONEON-NOSVE-NEXT:    add x9, x0, #12
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[6], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB6_8
+; NONEON-NOSVE-NEXT:  .LBB6_16: // %cond.load19
+; NONEON-NOSVE-NEXT:    add x8, x0, #14
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[7], [x8]
+; NONEON-NOSVE-NEXT:    ret
   %load = call <8 x half> @llvm.masked.load.v8f16(ptr %src, i32 8, <8 x i1> %mask, <8 x half> zeroinitializer)
   ret <8 x half> %load
 }
@@ -210,6 +811,116 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
+; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    addv h2, v0.8h
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    fmov w8, s2
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB7_17
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_18
+; NONEON-NOSVE-NEXT:  .LBB7_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB7_19
+; NONEON-NOSVE-NEXT:  .LBB7_3: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB7_20
+; NONEON-NOSVE-NEXT:  .LBB7_4: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB7_21
+; NONEON-NOSVE-NEXT:  .LBB7_5: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB7_22
+; NONEON-NOSVE-NEXT:  .LBB7_6: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB7_23
+; NONEON-NOSVE-NEXT:  .LBB7_7: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB7_24
+; NONEON-NOSVE-NEXT:  .LBB7_8: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB7_25
+; NONEON-NOSVE-NEXT:  .LBB7_9: // %else23
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB7_26
+; NONEON-NOSVE-NEXT:  .LBB7_10: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB7_27
+; NONEON-NOSVE-NEXT:  .LBB7_11: // %else29
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB7_28
+; NONEON-NOSVE-NEXT:  .LBB7_12: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB7_29
+; NONEON-NOSVE-NEXT:  .LBB7_13: // %else35
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB7_30
+; NONEON-NOSVE-NEXT:  .LBB7_14: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB7_31
+; NONEON-NOSVE-NEXT:  .LBB7_15: // %else41
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_32
+; NONEON-NOSVE-NEXT:  .LBB7_16: // %else44
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB7_17: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB7_2
+; NONEON-NOSVE-NEXT:  .LBB7_18: // %cond.load1
+; NONEON-NOSVE-NEXT:    add x9, x0, #2
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[1], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB7_3
+; NONEON-NOSVE-NEXT:  .LBB7_19: // %cond.load4
+; NONEON-NOSVE-NEXT:    add x9, x0, #4
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB7_4
+; NONEON-NOSVE-NEXT:  .LBB7_20: // %cond.load7
+; NONEON-NOSVE-NEXT:    add x9, x0, #6
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[3], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB7_5
+; NONEON-NOSVE-NEXT:  .LBB7_21: // %cond.load10
+; NONEON-NOSVE-NEXT:    add x9, x0, #8
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[4], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB7_6
+; NONEON-NOSVE-NEXT:  .LBB7_22: // %cond.load13
+; NONEON-NOSVE-NEXT:    add x9, x0, #10
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[5], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB7_7
+; NONEON-NOSVE-NEXT:  .LBB7_23: // %cond.load16
+; NONEON-NOSVE-NEXT:    add x9, x0, #12
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[6], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB7_8
+; NONEON-NOSVE-NEXT:  .LBB7_24: // %cond.load19
+; NONEON-NOSVE-NEXT:    add x9, x0, #14
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[7], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB7_9
+; NONEON-NOSVE-NEXT:  .LBB7_25: // %cond.load22
+; NONEON-NOSVE-NEXT:    add x9, x0, #16
+; NONEON-NOSVE-NEXT:    ld1 { v1.h }[0], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB7_10
+; NONEON-NOSVE-NEXT:  .LBB7_26: // %cond.load25
+; NONEON-NOSVE-NEXT:    add x9, x0, #18
+; NONEON-NOSVE-NEXT:    ld1 { v1.h }[1], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB7_11
+; NONEON-NOSVE-NEXT:  .LBB7_27: // %cond.load28
+; NONEON-NOSVE-NEXT:    add x9, x0, #20
+; NONEON-NOSVE-NEXT:    ld1 { v1.h }[2], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB7_12
+; NONEON-NOSVE-NEXT:  .LBB7_28: // %cond.load31
+; NONEON-NOSVE-NEXT:    add x9, x0, #22
+; NONEON-NOSVE-NEXT:    ld1 { v1.h }[3], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB7_13
+; NONEON-NOSVE-NEXT:  .LBB7_29: // %cond.load34
+; NONEON-NOSVE-NEXT:    add x9, x0, #24
+; NONEON-NOSVE-NEXT:    ld1 { v1.h }[4], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB7_14
+; NONEON-NOSVE-NEXT:  .LBB7_30: // %cond.load37
+; NONEON-NOSVE-NEXT:    add x9, x0, #26
+; NONEON-NOSVE-NEXT:    ld1 { v1.h }[5], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB7_15
+; NONEON-NOSVE-NEXT:  .LBB7_31: // %cond.load40
+; NONEON-NOSVE-NEXT:    add x9, x0, #28
+; NONEON-NOSVE-NEXT:    ld1 { v1.h }[6], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB7_16
+; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.load43
+; NONEON-NOSVE-NEXT:    add x8, x0, #30
+; NONEON-NOSVE-NEXT:    ld1 { v1.h }[7], [x8]
+; NONEON-NOSVE-NEXT:    ret
   %load = call <16 x half> @llvm.masked.load.v16f16(ptr %src, i32 8, <16 x i1> %mask, <16 x half> zeroinitializer)
   ret <16 x half> %load
 }
@@ -225,6 +936,31 @@ define <2 x float> @masked_load_v2f32(ptr %src, <2 x i1> %mask) {
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI8_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addp v1.2s, v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB8_3
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB8_4
+; NONEON-NOSVE-NEXT:  .LBB8_2: // %else2
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB8_3: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB8_2
+; NONEON-NOSVE-NEXT:  .LBB8_4: // %cond.load1
+; NONEON-NOSVE-NEXT:    add x8, x0, #4
+; NONEON-NOSVE-NEXT:    ld1 { v0.s }[1], [x8]
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x float> @llvm.masked.load.v2f32(ptr %src, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer)
   ret <2 x float> %load
 }
@@ -241,6 +977,41 @@ define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI9_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addv h1, v0.4h
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB9_5
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB9_6
+; NONEON-NOSVE-NEXT:  .LBB9_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB9_7
+; NONEON-NOSVE-NEXT:  .LBB9_3: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB9_8
+; NONEON-NOSVE-NEXT:  .LBB9_4: // %else8
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB9_5: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB9_2
+; NONEON-NOSVE-NEXT:  .LBB9_6: // %cond.load1
+; NONEON-NOSVE-NEXT:    add x9, x0, #4
+; NONEON-NOSVE-NEXT:    ld1 { v0.s }[1], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB9_3
+; NONEON-NOSVE-NEXT:  .LBB9_7: // %cond.load4
+; NONEON-NOSVE-NEXT:    add x9, x0, #8
+; NONEON-NOSVE-NEXT:    ld1 { v0.s }[2], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB9_4
+; NONEON-NOSVE-NEXT:  .LBB9_8: // %cond.load7
+; NONEON-NOSVE-NEXT:    add x8, x0, #12
+; NONEON-NOSVE-NEXT:    ld1 { v0.s }[3], [x8]
+; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x float> @llvm.masked.load.v4f32(ptr %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
   ret <4 x float> %load
 }
@@ -290,6 +1061,66 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI10_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI10_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    addv b2, v0.8b
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    fmov w8, s2
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB10_9
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB10_10
+; NONEON-NOSVE-NEXT:  .LBB10_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB10_11
+; NONEON-NOSVE-NEXT:  .LBB10_3: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB10_12
+; NONEON-NOSVE-NEXT:  .LBB10_4: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB10_13
+; NONEON-NOSVE-NEXT:  .LBB10_5: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB10_14
+; NONEON-NOSVE-NEXT:  .LBB10_6: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB10_15
+; NONEON-NOSVE-NEXT:  .LBB10_7: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB10_16
+; NONEON-NOSVE-NEXT:  .LBB10_8: // %else20
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB10_9: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB10_2
+; NONEON-NOSVE-NEXT:  .LBB10_10: // %cond.load1
+; NONEON-NOSVE-NEXT:    add x9, x0, #4
+; NONEON-NOSVE-NEXT:    ld1 { v0.s }[1], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB10_3
+; NONEON-NOSVE-NEXT:  .LBB10_11: // %cond.load4
+; NONEON-NOSVE-NEXT:    add x9, x0, #8
+; NONEON-NOSVE-NEXT:    ld1 { v0.s }[2], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB10_4
+; NONEON-NOSVE-NEXT:  .LBB10_12: // %cond.load7
+; NONEON-NOSVE-NEXT:    add x9, x0, #12
+; NONEON-NOSVE-NEXT:    ld1 { v0.s }[3], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB10_5
+; NONEON-NOSVE-NEXT:  .LBB10_13: // %cond.load10
+; NONEON-NOSVE-NEXT:    add x9, x0, #16
+; NONEON-NOSVE-NEXT:    ld1 { v1.s }[0], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB10_6
+; NONEON-NOSVE-NEXT:  .LBB10_14: // %cond.load13
+; NONEON-NOSVE-NEXT:    add x9, x0, #20
+; NONEON-NOSVE-NEXT:    ld1 { v1.s }[1], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB10_7
+; NONEON-NOSVE-NEXT:  .LBB10_15: // %cond.load16
+; NONEON-NOSVE-NEXT:    add x9, x0, #24
+; NONEON-NOSVE-NEXT:    ld1 { v1.s }[2], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB10_8
+; NONEON-NOSVE-NEXT:  .LBB10_16: // %cond.load19
+; NONEON-NOSVE-NEXT:    add x8, x0, #28
+; NONEON-NOSVE-NEXT:    ld1 { v1.s }[3], [x8]
+; NONEON-NOSVE-NEXT:    ret
   %load = call <8 x float> @llvm.masked.load.v8f32(ptr %src, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer)
   ret <8 x float> %load
 }
@@ -306,6 +1137,29 @@ define <2 x double> @masked_load_v2f64(ptr %src, <2 x i1> %mask) {
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI11_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI11_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addp v1.2s, v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB11_3
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB11_4
+; NONEON-NOSVE-NEXT:  .LBB11_2: // %else2
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB11_3: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB11_2
+; NONEON-NOSVE-NEXT:  .LBB11_4: // %cond.load1
+; NONEON-NOSVE-NEXT:    add x8, x0, #8
+; NONEON-NOSVE-NEXT:    ld1 { v0.d }[1], [x8]
+; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x double> @llvm.masked.load.v2f64(ptr %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer)
   ret <2 x double> %load
 }
@@ -331,6 +1185,42 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI12_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI12_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    movi v1.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    addv h2, v0.4h
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    fmov w8, s2
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB12_5
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB12_6
+; NONEON-NOSVE-NEXT:  .LBB12_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB12_7
+; NONEON-NOSVE-NEXT:  .LBB12_3: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB12_8
+; NONEON-NOSVE-NEXT:  .LBB12_4: // %else8
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB12_5: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB12_2
+; NONEON-NOSVE-NEXT:  .LBB12_6: // %cond.load1
+; NONEON-NOSVE-NEXT:    add x9, x0, #8
+; NONEON-NOSVE-NEXT:    ld1 { v0.d }[1], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB12_3
+; NONEON-NOSVE-NEXT:  .LBB12_7: // %cond.load4
+; NONEON-NOSVE-NEXT:    add x9, x0, #16
+; NONEON-NOSVE-NEXT:    ld1 { v1.d }[0], [x9]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB12_4
+; NONEON-NOSVE-NEXT:  .LBB12_8: // %cond.load7
+; NONEON-NOSVE-NEXT:    add x8, x0, #24
+; NONEON-NOSVE-NEXT:    ld1 { v1.d }[1], [x8]
+; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x double> @llvm.masked.load.v4f64(ptr %src, i32 8, <4 x i1> %mask, <4 x double> zeroinitializer)
   ret <4 x double> %load
 }
@@ -356,6 +1246,38 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_zext_v3i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
+; NONEON-NOSVE-NEXT:    bfi w8, w2, #1, #1
+; NONEON-NOSVE-NEXT:    bfi w8, w3, #2, #1
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB13_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB13_3
+; NONEON-NOSVE-NEXT:    b .LBB13_4
+; NONEON-NOSVE-NEXT:  .LBB13_2:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB13_4
+; NONEON-NOSVE-NEXT:  .LBB13_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    mov v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add x9, x0, #2
+; NONEON-NOSVE-NEXT:    ld1 { v1.h }[1], [x9]
+; NONEON-NOSVE-NEXT:    mov v1.h[2], v0.h[2]
+; NONEON-NOSVE-NEXT:    fmov d0, d1
+; NONEON-NOSVE-NEXT:  .LBB13_4: // %else2
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB13_6
+; NONEON-NOSVE-NEXT:  // %bb.5: // %cond.load4
+; NONEON-NOSVE-NEXT:    mov v0.h[1], v0.h[1]
+; NONEON-NOSVE-NEXT:    add x8, x0, #4
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
+; NONEON-NOSVE-NEXT:  .LBB13_6: // %else5
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer)
   %extend = zext <3 x i16> %load_value to <3 x i32>
   ret <3 x i32> %extend;
@@ -382,6 +1304,38 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_sext_v3i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
+; NONEON-NOSVE-NEXT:    bfi w8, w2, #1, #1
+; NONEON-NOSVE-NEXT:    bfi w8, w3, #2, #1
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB14_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB14_3
+; NONEON-NOSVE-NEXT:    b .LBB14_4
+; NONEON-NOSVE-NEXT:  .LBB14_2:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB14_4
+; NONEON-NOSVE-NEXT:  .LBB14_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    mov v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add x9, x0, #2
+; NONEON-NOSVE-NEXT:    ld1 { v1.h }[1], [x9]
+; NONEON-NOSVE-NEXT:    mov v1.h[2], v0.h[2]
+; NONEON-NOSVE-NEXT:    fmov d0, d1
+; NONEON-NOSVE-NEXT:  .LBB14_4: // %else2
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB14_6
+; NONEON-NOSVE-NEXT:  // %bb.5: // %cond.load4
+; NONEON-NOSVE-NEXT:    mov v0.h[1], v0.h[1]
+; NONEON-NOSVE-NEXT:    add x8, x0, #4
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
+; NONEON-NOSVE-NEXT:  .LBB14_6: // %else5
+; NONEON-NOSVE-NEXT:    sshll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer)
   %extend = sext <3 x i16> %load_value to <3 x i32>
   ret <3 x i32> %extend;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index f2b3f9b12ea7..b175dcf3e9a0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -19,6 +20,37 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI0_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addv h0, v0.4h
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB0_5
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB0_6
+; NONEON-NOSVE-NEXT:  .LBB0_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB0_7
+; NONEON-NOSVE-NEXT:  .LBB0_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB0_8
+; NONEON-NOSVE-NEXT:  .LBB0_4: // %else6
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB0_5: // %cond.store
+; NONEON-NOSVE-NEXT:    strb wzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB0_2
+; NONEON-NOSVE-NEXT:  .LBB0_6: // %cond.store1
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB0_3
+; NONEON-NOSVE-NEXT:  .LBB0_7: // %cond.store3
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB0_4
+; NONEON-NOSVE-NEXT:  .LBB0_8: // %cond.store5
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4i8(<4 x i8> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
 }
@@ -34,6 +66,57 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI1_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addv b0, v0.8b
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB1_9
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB1_10
+; NONEON-NOSVE-NEXT:  .LBB1_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB1_11
+; NONEON-NOSVE-NEXT:  .LBB1_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB1_12
+; NONEON-NOSVE-NEXT:  .LBB1_4: // %else6
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB1_13
+; NONEON-NOSVE-NEXT:  .LBB1_5: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB1_14
+; NONEON-NOSVE-NEXT:  .LBB1_6: // %else10
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB1_15
+; NONEON-NOSVE-NEXT:  .LBB1_7: // %else12
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB1_16
+; NONEON-NOSVE-NEXT:  .LBB1_8: // %else14
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB1_9: // %cond.store
+; NONEON-NOSVE-NEXT:    strb wzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB1_2
+; NONEON-NOSVE-NEXT:  .LBB1_10: // %cond.store1
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB1_3
+; NONEON-NOSVE-NEXT:  .LBB1_11: // %cond.store3
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB1_4
+; NONEON-NOSVE-NEXT:  .LBB1_12: // %cond.store5
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB1_5
+; NONEON-NOSVE-NEXT:  .LBB1_13: // %cond.store7
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB1_6
+; NONEON-NOSVE-NEXT:  .LBB1_14: // %cond.store9
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #5]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB1_7
+; NONEON-NOSVE-NEXT:  .LBB1_15: // %cond.store11
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #6]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB1_8
+; NONEON-NOSVE-NEXT:  .LBB1_16: // %cond.store13
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8i8(<8 x i8> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
 }
@@ -49,6 +132,99 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
+; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    addv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB2_17
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_18
+; NONEON-NOSVE-NEXT:  .LBB2_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB2_19
+; NONEON-NOSVE-NEXT:  .LBB2_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB2_20
+; NONEON-NOSVE-NEXT:  .LBB2_4: // %else6
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB2_21
+; NONEON-NOSVE-NEXT:  .LBB2_5: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB2_22
+; NONEON-NOSVE-NEXT:  .LBB2_6: // %else10
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB2_23
+; NONEON-NOSVE-NEXT:  .LBB2_7: // %else12
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB2_24
+; NONEON-NOSVE-NEXT:  .LBB2_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB2_25
+; NONEON-NOSVE-NEXT:  .LBB2_9: // %else16
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB2_26
+; NONEON-NOSVE-NEXT:  .LBB2_10: // %else18
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB2_27
+; NONEON-NOSVE-NEXT:  .LBB2_11: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB2_28
+; NONEON-NOSVE-NEXT:  .LBB2_12: // %else22
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB2_29
+; NONEON-NOSVE-NEXT:  .LBB2_13: // %else24
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB2_30
+; NONEON-NOSVE-NEXT:  .LBB2_14: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB2_31
+; NONEON-NOSVE-NEXT:  .LBB2_15: // %else28
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_32
+; NONEON-NOSVE-NEXT:  .LBB2_16: // %else30
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB2_17: // %cond.store
+; NONEON-NOSVE-NEXT:    strb wzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB2_2
+; NONEON-NOSVE-NEXT:  .LBB2_18: // %cond.store1
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB2_3
+; NONEON-NOSVE-NEXT:  .LBB2_19: // %cond.store3
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB2_4
+; NONEON-NOSVE-NEXT:  .LBB2_20: // %cond.store5
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB2_5
+; NONEON-NOSVE-NEXT:  .LBB2_21: // %cond.store7
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB2_6
+; NONEON-NOSVE-NEXT:  .LBB2_22: // %cond.store9
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #5]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB2_7
+; NONEON-NOSVE-NEXT:  .LBB2_23: // %cond.store11
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #6]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB2_8
+; NONEON-NOSVE-NEXT:  .LBB2_24: // %cond.store13
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB2_9
+; NONEON-NOSVE-NEXT:  .LBB2_25: // %cond.store15
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB2_10
+; NONEON-NOSVE-NEXT:  .LBB2_26: // %cond.store17
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #9]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB2_11
+; NONEON-NOSVE-NEXT:  .LBB2_27: // %cond.store19
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #10]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB2_12
+; NONEON-NOSVE-NEXT:  .LBB2_28: // %cond.store21
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #11]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB2_13
+; NONEON-NOSVE-NEXT:  .LBB2_29: // %cond.store23
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #12]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB2_14
+; NONEON-NOSVE-NEXT:  .LBB2_30: // %cond.store25
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #13]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB2_15
+; NONEON-NOSVE-NEXT:  .LBB2_31: // %cond.store27
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #14]
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_16
+; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.store29
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
 }
@@ -129,6 +305,244 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fmov s1, w1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    mov v1.b[1], w2
+; NONEON-NOSVE-NEXT:    mov v0.b[1], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    mov v1.b[2], w3
+; NONEON-NOSVE-NEXT:    mov v0.b[2], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    mov v1.b[3], w4
+; NONEON-NOSVE-NEXT:    mov v0.b[3], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    mov v1.b[4], w5
+; NONEON-NOSVE-NEXT:    mov v0.b[4], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    mov v1.b[5], w6
+; NONEON-NOSVE-NEXT:    mov v0.b[5], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    mov v1.b[6], w7
+; NONEON-NOSVE-NEXT:    mov v0.b[6], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    mov v1.b[7], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    mov v0.b[7], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    mov v1.b[8], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    mov v0.b[8], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    mov v1.b[9], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    mov v0.b[9], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    mov v1.b[10], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    mov v0.b[10], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    mov v1.b[11], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    mov v0.b[11], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    mov v1.b[12], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    mov v0.b[12], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    mov v1.b[13], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    mov v0.b[13], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    mov v1.b[14], w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    mov v0.b[14], w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    mov v1.b[15], w9
+; NONEON-NOSVE-NEXT:    mov v0.b[15], w8
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
+; NONEON-NOSVE-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
+; NONEON-NOSVE-NEXT:    shl v1.16b, v1.16b, #7
+; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
+; NONEON-NOSVE-NEXT:    cmlt v1.16b, v1.16b, #0
+; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
+; NONEON-NOSVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    zip1 v1.16b, v1.16b, v3.16b
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    addv h1, v1.8h
+; NONEON-NOSVE-NEXT:    addv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w8, s1
+; NONEON-NOSVE-NEXT:    fmov w9, s0
+; NONEON-NOSVE-NEXT:    bfi w8, w9, #16, #16
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_33
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_34
+; NONEON-NOSVE-NEXT:  .LBB3_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_35
+; NONEON-NOSVE-NEXT:  .LBB3_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_36
+; NONEON-NOSVE-NEXT:  .LBB3_4: // %else6
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_37
+; NONEON-NOSVE-NEXT:  .LBB3_5: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_38
+; NONEON-NOSVE-NEXT:  .LBB3_6: // %else10
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_39
+; NONEON-NOSVE-NEXT:  .LBB3_7: // %else12
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_40
+; NONEON-NOSVE-NEXT:  .LBB3_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_41
+; NONEON-NOSVE-NEXT:  .LBB3_9: // %else16
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_42
+; NONEON-NOSVE-NEXT:  .LBB3_10: // %else18
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_43
+; NONEON-NOSVE-NEXT:  .LBB3_11: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_44
+; NONEON-NOSVE-NEXT:  .LBB3_12: // %else22
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_45
+; NONEON-NOSVE-NEXT:  .LBB3_13: // %else24
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_46
+; NONEON-NOSVE-NEXT:  .LBB3_14: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_47
+; NONEON-NOSVE-NEXT:  .LBB3_15: // %else28
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_48
+; NONEON-NOSVE-NEXT:  .LBB3_16: // %else30
+; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_49
+; NONEON-NOSVE-NEXT:  .LBB3_17: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_50
+; NONEON-NOSVE-NEXT:  .LBB3_18: // %else34
+; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_51
+; NONEON-NOSVE-NEXT:  .LBB3_19: // %else36
+; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_52
+; NONEON-NOSVE-NEXT:  .LBB3_20: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_53
+; NONEON-NOSVE-NEXT:  .LBB3_21: // %else40
+; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_54
+; NONEON-NOSVE-NEXT:  .LBB3_22: // %else42
+; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_55
+; NONEON-NOSVE-NEXT:  .LBB3_23: // %else44
+; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_56
+; NONEON-NOSVE-NEXT:  .LBB3_24: // %else46
+; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_57
+; NONEON-NOSVE-NEXT:  .LBB3_25: // %else48
+; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_58
+; NONEON-NOSVE-NEXT:  .LBB3_26: // %else50
+; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_59
+; NONEON-NOSVE-NEXT:  .LBB3_27: // %else52
+; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_60
+; NONEON-NOSVE-NEXT:  .LBB3_28: // %else54
+; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_61
+; NONEON-NOSVE-NEXT:  .LBB3_29: // %else56
+; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_62
+; NONEON-NOSVE-NEXT:  .LBB3_30: // %else58
+; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_63
+; NONEON-NOSVE-NEXT:  .LBB3_31: // %else60
+; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_64
+; NONEON-NOSVE-NEXT:  .LBB3_32: // %else62
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB3_33: // %cond.store
+; NONEON-NOSVE-NEXT:    strb wzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_2
+; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.store1
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB3_3
+; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.store3
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_4
+; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.store5
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_5
+; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.store7
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_6
+; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.store9
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #5]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_7
+; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.store11
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #6]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_8
+; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.store13
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_9
+; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.store15
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_10
+; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.store17
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #9]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_11
+; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.store19
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #10]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_12
+; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.store21
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #11]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_13
+; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.store23
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #12]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_14
+; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.store25
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #13]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_15
+; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.store27
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #14]
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_16
+; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.store29
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
+; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_17
+; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.store31
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #16]
+; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_18
+; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.store33
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #17]
+; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_19
+; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.store35
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #18]
+; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_20
+; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.store37
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #19]
+; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_21
+; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.store39
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #20]
+; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_22
+; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.store41
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #21]
+; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_23
+; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.store43
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #22]
+; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_24
+; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.store45
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #23]
+; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_25
+; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.store47
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #24]
+; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_26
+; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.store49
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #25]
+; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_27
+; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.store51
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #26]
+; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_28
+; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.store53
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #27]
+; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_29
+; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.store55
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #28]
+; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_30
+; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.store57
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #29]
+; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_31
+; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.store59
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #30]
+; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_32
+; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.store61
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #31]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask)
   ret void
 }
@@ -154,6 +568,29 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) {
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI4_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI4_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addp v0.2s, v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB4_3
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB4_4
+; NONEON-NOSVE-NEXT:  .LBB4_2: // %else2
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB4_3: // %cond.store
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB4_2
+; NONEON-NOSVE-NEXT:  .LBB4_4: // %cond.store1
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v2f16(<2 x half> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask)
   ret void
 }
@@ -169,6 +606,41 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI5_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addv h0, v0.4h
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB5_5
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB5_6
+; NONEON-NOSVE-NEXT:  .LBB5_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB5_7
+; NONEON-NOSVE-NEXT:  .LBB5_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB5_8
+; NONEON-NOSVE-NEXT:  .LBB5_4: // %else6
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB5_5: // %cond.store
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB5_2
+; NONEON-NOSVE-NEXT:  .LBB5_6: // %cond.store1
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB5_3
+; NONEON-NOSVE-NEXT:  .LBB5_7: // %cond.store3
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB5_4
+; NONEON-NOSVE-NEXT:  .LBB5_8: // %cond.store5
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #6]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f16(<4 x half> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
 }
@@ -185,6 +657,65 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI6_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addv b0, v0.8b
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB6_9
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB6_10
+; NONEON-NOSVE-NEXT:  .LBB6_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB6_11
+; NONEON-NOSVE-NEXT:  .LBB6_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB6_12
+; NONEON-NOSVE-NEXT:  .LBB6_4: // %else6
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB6_13
+; NONEON-NOSVE-NEXT:  .LBB6_5: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB6_14
+; NONEON-NOSVE-NEXT:  .LBB6_6: // %else10
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB6_15
+; NONEON-NOSVE-NEXT:  .LBB6_7: // %else12
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB6_16
+; NONEON-NOSVE-NEXT:  .LBB6_8: // %else14
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB6_9: // %cond.store
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB6_2
+; NONEON-NOSVE-NEXT:  .LBB6_10: // %cond.store1
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB6_3
+; NONEON-NOSVE-NEXT:  .LBB6_11: // %cond.store3
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB6_4
+; NONEON-NOSVE-NEXT:  .LBB6_12: // %cond.store5
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #6]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB6_5
+; NONEON-NOSVE-NEXT:  .LBB6_13: // %cond.store7
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB6_6
+; NONEON-NOSVE-NEXT:  .LBB6_14: // %cond.store9
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #10]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB6_7
+; NONEON-NOSVE-NEXT:  .LBB6_15: // %cond.store11
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #12]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB6_8
+; NONEON-NOSVE-NEXT:  .LBB6_16: // %cond.store13
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #14]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
 }
@@ -209,6 +740,115 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; CHECK-NEXT:    st1h { z1.h }, p1, [x0, x8, lsl #1]
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.16b, v0.16b, #7
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
+; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.16b, v0.16b, #0
+; NONEON-NOSVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    addv h0, v0.8h
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB7_17
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_18
+; NONEON-NOSVE-NEXT:  .LBB7_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB7_19
+; NONEON-NOSVE-NEXT:  .LBB7_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB7_20
+; NONEON-NOSVE-NEXT:  .LBB7_4: // %else6
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB7_21
+; NONEON-NOSVE-NEXT:  .LBB7_5: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB7_22
+; NONEON-NOSVE-NEXT:  .LBB7_6: // %else10
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB7_23
+; NONEON-NOSVE-NEXT:  .LBB7_7: // %else12
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB7_24
+; NONEON-NOSVE-NEXT:  .LBB7_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB7_25
+; NONEON-NOSVE-NEXT:  .LBB7_9: // %else16
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB7_26
+; NONEON-NOSVE-NEXT:  .LBB7_10: // %else18
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB7_27
+; NONEON-NOSVE-NEXT:  .LBB7_11: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB7_28
+; NONEON-NOSVE-NEXT:  .LBB7_12: // %else22
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB7_29
+; NONEON-NOSVE-NEXT:  .LBB7_13: // %else24
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB7_30
+; NONEON-NOSVE-NEXT:  .LBB7_14: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB7_31
+; NONEON-NOSVE-NEXT:  .LBB7_15: // %else28
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_32
+; NONEON-NOSVE-NEXT:  .LBB7_16: // %else30
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB7_17: // %cond.store
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB7_2
+; NONEON-NOSVE-NEXT:  .LBB7_18: // %cond.store1
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB7_3
+; NONEON-NOSVE-NEXT:  .LBB7_19: // %cond.store3
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB7_4
+; NONEON-NOSVE-NEXT:  .LBB7_20: // %cond.store5
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #6]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB7_5
+; NONEON-NOSVE-NEXT:  .LBB7_21: // %cond.store7
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB7_6
+; NONEON-NOSVE-NEXT:  .LBB7_22: // %cond.store9
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #10]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB7_7
+; NONEON-NOSVE-NEXT:  .LBB7_23: // %cond.store11
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #12]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB7_8
+; NONEON-NOSVE-NEXT:  .LBB7_24: // %cond.store13
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #14]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB7_9
+; NONEON-NOSVE-NEXT:  .LBB7_25: // %cond.store15
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #16]
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB7_10
+; NONEON-NOSVE-NEXT:  .LBB7_26: // %cond.store17
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #18]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB7_11
+; NONEON-NOSVE-NEXT:  .LBB7_27: // %cond.store19
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #20]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB7_12
+; NONEON-NOSVE-NEXT:  .LBB7_28: // %cond.store21
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #22]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB7_13
+; NONEON-NOSVE-NEXT:  .LBB7_29: // %cond.store23
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #24]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB7_14
+; NONEON-NOSVE-NEXT:  .LBB7_30: // %cond.store25
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #26]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB7_15
+; NONEON-NOSVE-NEXT:  .LBB7_31: // %cond.store27
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #28]
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB7_16
+; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.store29
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #30]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
 }
@@ -225,6 +865,37 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI8_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addv h0, v0.4h
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB8_5
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB8_6
+; NONEON-NOSVE-NEXT:  .LBB8_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB8_7
+; NONEON-NOSVE-NEXT:  .LBB8_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB8_8
+; NONEON-NOSVE-NEXT:  .LBB8_4: // %else6
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB8_5: // %cond.store
+; NONEON-NOSVE-NEXT:    str wzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB8_2
+; NONEON-NOSVE-NEXT:  .LBB8_6: // %cond.store1
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB8_3
+; NONEON-NOSVE-NEXT:  .LBB8_7: // %cond.store3
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB8_4
+; NONEON-NOSVE-NEXT:  .LBB8_8: // %cond.store5
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #12]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
 }
@@ -275,6 +946,57 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.8b, v0.8b, #7
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI9_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.8b, v0.8b, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addv b0, v0.8b
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB9_9
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB9_10
+; NONEON-NOSVE-NEXT:  .LBB9_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB9_11
+; NONEON-NOSVE-NEXT:  .LBB9_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB9_12
+; NONEON-NOSVE-NEXT:  .LBB9_4: // %else6
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB9_13
+; NONEON-NOSVE-NEXT:  .LBB9_5: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB9_14
+; NONEON-NOSVE-NEXT:  .LBB9_6: // %else10
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB9_15
+; NONEON-NOSVE-NEXT:  .LBB9_7: // %else12
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB9_16
+; NONEON-NOSVE-NEXT:  .LBB9_8: // %else14
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB9_9: // %cond.store
+; NONEON-NOSVE-NEXT:    str wzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB9_2
+; NONEON-NOSVE-NEXT:  .LBB9_10: // %cond.store1
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB9_3
+; NONEON-NOSVE-NEXT:  .LBB9_11: // %cond.store3
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB9_4
+; NONEON-NOSVE-NEXT:  .LBB9_12: // %cond.store5
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #12]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB9_5
+; NONEON-NOSVE-NEXT:  .LBB9_13: // %cond.store7
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #16]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB9_6
+; NONEON-NOSVE-NEXT:  .LBB9_14: // %cond.store9
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #20]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB9_7
+; NONEON-NOSVE-NEXT:  .LBB9_15: // %cond.store11
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #24]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB9_8
+; NONEON-NOSVE-NEXT:  .LBB9_16: // %cond.store13
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #28]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
 }
@@ -291,6 +1013,27 @@ define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #31
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI10_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI10_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.2s, v0.2s, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addp v0.2s, v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB10_3
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB10_4
+; NONEON-NOSVE-NEXT:  .LBB10_2: // %else2
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB10_3: // %cond.store
+; NONEON-NOSVE-NEXT:    str xzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB10_2
+; NONEON-NOSVE-NEXT:  .LBB10_4: // %cond.store1
+; NONEON-NOSVE-NEXT:    str xzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask)
   ret void
 }
@@ -315,6 +1058,37 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT:    st1d { z0.d }, p1, [x0, x8, lsl #3]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #15
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI11_0
+; NONEON-NOSVE-NEXT:    ldr d1, [x8, :lo12:.LCPI11_0]
+; NONEON-NOSVE-NEXT:    cmlt v0.4h, v0.4h, #0
+; NONEON-NOSVE-NEXT:    and v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    addv h0, v0.4h
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB11_5
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB11_6
+; NONEON-NOSVE-NEXT:  .LBB11_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB11_7
+; NONEON-NOSVE-NEXT:  .LBB11_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB11_8
+; NONEON-NOSVE-NEXT:  .LBB11_4: // %else6
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB11_5: // %cond.store
+; NONEON-NOSVE-NEXT:    str xzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB11_2
+; NONEON-NOSVE-NEXT:  .LBB11_6: // %cond.store1
+; NONEON-NOSVE-NEXT:    str xzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB11_3
+; NONEON-NOSVE-NEXT:  .LBB11_7: // %cond.store3
+; NONEON-NOSVE-NEXT:    str xzr, [x0, #16]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB11_4
+; NONEON-NOSVE-NEXT:  .LBB11_8: // %cond.store5
+; NONEON-NOSVE-NEXT:    str xzr, [x0, #24]
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f64(<4 x double> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
index b5adea594242..d7eaf766e7df 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -14,6 +15,15 @@ define void @add_v4i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    ldr s1, [x1]
+; NONEON-NOSVE-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str s0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i8>, ptr %a
   %op2 = load <4 x i8>, ptr %b
   %res = add <4 x i8> %op1, %op2
@@ -29,6 +39,14 @@ define void @add_v8i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z0.b, z0.b, z1.b
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ldr d1, [x1]
+; NONEON-NOSVE-NEXT:    add v0.8b, v0.8b, v1.8b
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %op2 = load <8 x i8>, ptr %b
   %res = add <8 x i8> %op1, %op2
@@ -44,6 +62,14 @@ define void @add_v16i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z0.b, z0.b, z1.b
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i8>, ptr %a
   %op2 = load <16 x i8>, ptr %b
   %res = add <16 x i8> %op1, %op2
@@ -60,6 +86,15 @@ define void @add_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.b, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    add v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = add <32 x i8> %op1, %op2
@@ -76,6 +111,23 @@ define void @add_v2i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
+; NONEON-NOSVE-NEXT:    ldrh w9, [x1]
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    fmov s1, w9
+; NONEON-NOSVE-NEXT:    add x8, x0, #2
+; NONEON-NOSVE-NEXT:    add x9, x1, #2
+; NONEON-NOSVE-NEXT:    ld1 { v0.h }[2], [x8]
+; NONEON-NOSVE-NEXT:    ld1 { v1.h }[2], [x9]
+; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    mov w8, v0.s[1]
+; NONEON-NOSVE-NEXT:    fmov w9, s0
+; NONEON-NOSVE-NEXT:    strh w9, [x0]
+; NONEON-NOSVE-NEXT:    strh w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i16>, ptr %a
   %op2 = load <2 x i16>, ptr %b
   %res = add <2 x i16> %op1, %op2
@@ -91,6 +143,14 @@ define void @add_v4i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ldr d1, [x1]
+; NONEON-NOSVE-NEXT:    add v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %op2 = load <4 x i16>, ptr %b
   %res = add <4 x i16> %op1, %op2
@@ -106,6 +166,14 @@ define void @add_v8i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
   %res = add <8 x i16> %op1, %op2
@@ -122,6 +190,15 @@ define void @add_v16i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    add z1.h, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = add <16 x i16> %op1, %op2
@@ -137,6 +214,13 @@ define void @abs_v2i32(ptr %a) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    abs v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
   %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
   store <2 x i32> %res, ptr %a
@@ -151,6 +235,13 @@ define void @abs_v4i32(ptr %a) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
   store <4 x i32> %res, ptr %a
@@ -166,6 +257,14 @@ define void @abs_v8i32(ptr %a) {
 ; CHECK-NEXT:    abs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    abs v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    abs v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
   store <8 x i32> %res, ptr %a
@@ -180,6 +279,13 @@ define void @abs_v2i64(ptr %a) {
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
   store <2 x i64> %res, ptr %a
@@ -195,6 +301,14 @@ define void @abs_v4i64(ptr %a) {
 ; CHECK-NEXT:    abs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    abs v0.2d, v0.2d
+; NONEON-NOSVE-NEXT:    abs v1.2d, v1.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
   store <4 x i64> %res, ptr %a
@@ -211,6 +325,17 @@ define void @fadd_v2f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    ldr s1, [x1]
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str s0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %op2 = load <2 x half>, ptr %b
   %res = fadd <2 x half> %op1, %op2
@@ -227,6 +352,17 @@ define void @fadd_v4f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ldr d1, [x1]
+; NONEON-NOSVE-NEXT:    fcvtl v1.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v0.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %op2 = load <4 x half>, ptr %b
   %res = fadd <4 x half> %op1, %op2
@@ -243,6 +379,21 @@ define void @fadd_v8f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fcvtl v2.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v3.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fadd v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v2.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %op2 = load <8 x half>, ptr %b
   %res = fadd <8 x half> %op1, %op2
@@ -261,6 +412,29 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fcvtl v4.4s, v0.4h
+; NONEON-NOSVE-NEXT:    fcvtl v6.4s, v3.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v0.4s, v0.8h
+; NONEON-NOSVE-NEXT:    fcvtl v5.4s, v1.4h
+; NONEON-NOSVE-NEXT:    fcvtl v7.4s, v2.4h
+; NONEON-NOSVE-NEXT:    fcvtl2 v1.4s, v1.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v3.4s, v3.8h
+; NONEON-NOSVE-NEXT:    fcvtl2 v2.4s, v2.8h
+; NONEON-NOSVE-NEXT:    fadd v4.4s, v5.4s, v4.4s
+; NONEON-NOSVE-NEXT:    fadd v5.4s, v7.4s, v6.4s
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fadd v2.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    fcvtn v1.4h, v4.4s
+; NONEON-NOSVE-NEXT:    fcvtn v3.4h, v5.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v1.8h, v0.4s
+; NONEON-NOSVE-NEXT:    fcvtn2 v3.8h, v2.4s
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fadd <16 x half> %op1, %op2
@@ -277,6 +451,14 @@ define void @fadd_v2f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ldr d1, [x1]
+; NONEON-NOSVE-NEXT:    fadd v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %op2 = load <2 x float>, ptr %b
   %res = fadd <2 x float> %op1, %op2
@@ -293,6 +475,14 @@ define void @fadd_v4f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %op2 = load <4 x float>, ptr %b
   %res = fadd <4 x float> %op1, %op2
@@ -311,6 +501,15 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fadd v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fadd <8 x float> %op1, %op2
@@ -327,6 +526,14 @@ define void @fadd_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    fadd v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %op2 = load <2 x double>, ptr %b
   %res = fadd <2 x double> %op1, %op2
@@ -345,6 +552,15 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fadd v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fadd <4 x double> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
index 00413302798c..f595a4219cac 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -15,6 +16,14 @@ define void @test_revbv16i16(ptr %a) {
 ; CHECK-NEXT:    revb z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revbv16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rev16 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22, i32 undef, i32 24, i32 27, i32 undef, i32 29, i32 28, i32 undef, i32 undef>
   store <32 x i8> %tmp2, ptr %a
@@ -31,6 +40,14 @@ define void @test_revbv8i32(ptr %a) {
 ; CHECK-NEXT:    revb z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revbv8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rev32 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
   store <32 x i8> %tmp2, ptr %a
@@ -47,6 +64,14 @@ define void @test_revbv4i64(ptr %a) {
 ; CHECK-NEXT:    revb z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revbv4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rev64 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 31, i32 30, i32 29, i32 undef, i32 27, i32 undef, i32 undef, i32 undef>
   store <32 x i8> %tmp2, ptr %a
@@ -63,6 +88,14 @@ define void @test_revhv8i32(ptr %a) {
 ; CHECK-NEXT:    revh z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revhv8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev32 v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    rev32 v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
   store <16 x i16> %tmp2, ptr %a
@@ -79,6 +112,14 @@ define void @test_revhv8f32(ptr %a) {
 ; CHECK-NEXT:    revh z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revhv8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev32 v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    rev32 v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x half>, ptr %a
   %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
   store <16 x half> %tmp2, ptr %a
@@ -95,6 +136,14 @@ define void @test_revhv4i64(ptr %a) {
 ; CHECK-NEXT:    revh z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revhv4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev64 v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    rev64 v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
   store <16 x i16> %tmp2, ptr %a
@@ -111,6 +160,14 @@ define void @test_revwv4i64(ptr %a) {
 ; CHECK-NEXT:    revw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revwv4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   store <8 x i32> %tmp2, ptr %a
@@ -127,6 +184,14 @@ define void @test_revwv4f64(ptr %a) {
 ; CHECK-NEXT:    revw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revwv4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x float>, ptr %a
   %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   store <8 x float> %tmp2, ptr %a
@@ -141,6 +206,12 @@ define <16 x i8> @test_revv16i8(ptr %a) {
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revv16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %a
   %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
   ret <16 x i8> %tmp2
@@ -156,6 +227,14 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    revw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revwv8i32v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = load <8 x i32>, ptr %b
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
@@ -176,6 +255,18 @@ define void @test_revhv32i16(ptr %a) {
 ; CHECK-NEXT:    stp q0, q1, [x0, #32]
 ; CHECK-NEXT:    stp q2, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revhv32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    rev64 v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    rev64 v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    rev64 v2.8h, v2.8h
+; NONEON-NOSVE-NEXT:    rev64 v3.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i16>, ptr %a
   %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
   store <32 x i16> %tmp2, ptr %a
@@ -191,6 +282,14 @@ define void @test_rev_elts_fail(ptr %a) {
 ; CHECK-NEXT:    tbl z0.d, { z2.d }, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_rev_elts_fail:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   store <4 x i64> %tmp2, ptr %a
@@ -208,6 +307,15 @@ define void @test_revdv4i64_sve2p1(ptr %a) #1 {
 ; CHECK-NEXT:    revd z1.q, p0/m, z1.q
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revdv4i64_sve2p1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ptrue p0.d, vl2
+; NONEON-NOSVE-NEXT:    revd z0.q, p0/m, z0.q
+; NONEON-NOSVE-NEXT:    revd z1.q, p0/m, z1.q
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   store <4 x i64> %tmp2, ptr %a
@@ -223,6 +331,15 @@ define void @test_revdv4f64_sve2p1(ptr %a) #1 {
 ; CHECK-NEXT:    revd z1.q, p0/m, z1.q
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revdv4f64_sve2p1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ptrue p0.d
+; NONEON-NOSVE-NEXT:    revd z0.q, p0/m, z0.q
+; NONEON-NOSVE-NEXT:    revd z1.q, p0/m, z1.q
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   store <4 x double> %tmp2, ptr %a
@@ -238,6 +355,16 @@ define void @test_revv8i32(ptr %a) {
 ; CHECK-NEXT:    tbl z0.s, { z2.s }, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revv8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    rev64 v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    rev64 v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   store <8 x i32> %tmp2, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index cb73030306b0..df786933da88 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -68,6 +69,18 @@ define void @zip1_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip1_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    zip2 v2.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <32 x i8>, ptr %a
   %tmp2 = load volatile <32 x i8>, ptr %b
   %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47>
@@ -196,6 +209,28 @@ define void @zip_v32i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip_v32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q4, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q5, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q6, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    ldp q7, q3, [x1]
+; NONEON-NOSVE-NEXT:    zip1 v17.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    zip2 v0.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    zip1 v16.8h, v1.8h, v3.8h
+; NONEON-NOSVE-NEXT:    zip2 v1.8h, v1.8h, v3.8h
+; NONEON-NOSVE-NEXT:    zip1 v2.8h, v5.8h, v7.8h
+; NONEON-NOSVE-NEXT:    zip1 v3.8h, v4.8h, v6.8h
+; NONEON-NOSVE-NEXT:    zip2 v5.8h, v5.8h, v7.8h
+; NONEON-NOSVE-NEXT:    zip2 v4.8h, v4.8h, v6.8h
+; NONEON-NOSVE-NEXT:    add v6.8h, v16.8h, v17.8h
+; NONEON-NOSVE-NEXT:    add v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    add v2.8h, v5.8h, v4.8h
+; NONEON-NOSVE-NEXT:    stp q6, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i16>, ptr %a
   %tmp2 = load <32 x i16>, ptr %b
   %tmp3 = shufflevector <32 x i16> %tmp1, <32 x i16> %tmp2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47>
@@ -244,6 +279,18 @@ define void @zip1_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip1_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    zip2 v2.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    zip1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <16 x i16>, ptr %a
   %tmp2 = load volatile <16 x i16>, ptr %b
   %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -276,6 +323,18 @@ define void @zip1_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip1_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = load volatile <8 x i32>, ptr %b
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -298,6 +357,19 @@ define void @zip_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    zip1 v4.2d, v1.2d, v3.2d
+; NONEON-NOSVE-NEXT:    zip1 v5.2d, v0.2d, v2.2d
+; NONEON-NOSVE-NEXT:    zip2 v1.2d, v1.2d, v3.2d
+; NONEON-NOSVE-NEXT:    zip2 v0.2d, v0.2d, v2.2d
+; NONEON-NOSVE-NEXT:    fadd v2.2d, v4.2d, v5.2d
+; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    stp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
   %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -330,6 +402,16 @@ define void @zip_v4i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    zip1 v2.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    zip2 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %a
   %tmp2 = load <4 x i32>, ptr %b
   %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -351,6 +433,16 @@ define void @zip1_v8i32_undef(ptr %a) {
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip1_v8i32_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    zip2 v1.4s, v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load  volatile <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
   store volatile <8 x i32> %tmp2, ptr %a
@@ -370,6 +462,19 @@ define void @trn_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.b, z1.b, z2.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    trn1 v4.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    trn2 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    trn1 v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    trn2 v2.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    add v0.16b, v4.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = load <32 x i8>, ptr %b
   %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
@@ -392,6 +497,19 @@ define void @trn_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z0.h, z1.h, z0.h
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI8_1
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x9, :lo12:.LCPI8_1]
+; NONEON-NOSVE-NEXT:    tbl v0.16b, { v1.16b }, v0.16b
+; NONEON-NOSVE-NEXT:    tbl v1.16b, { v1.16b }, v2.16b
+; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %a
   %tmp2 = load <8 x i16>, ptr %b
   %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 7, i32 2, i32 6, i32 4, i32 5, i32 1, i32 3>
@@ -414,6 +532,19 @@ define void @trn_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.h, z1.h, z2.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    trn1 v4.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    trn2 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    trn1 v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    trn2 v2.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    add v0.8h, v4.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v2.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = load <16 x i16>, ptr %b
   %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -436,6 +567,19 @@ define void @trn_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.s, z1.s, z2.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    zip1 v4.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    trn1 v1.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    trn2 v2.4s, v2.4s, v3.4s
+; NONEON-NOSVE-NEXT:    add v0.4s, v4.4s, v0.4s
+; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = load <8 x i32>, ptr %b
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14>
@@ -459,6 +603,19 @@ define void @trn_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    zip1 v4.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    zip2 v0.2d, v0.2d, v1.2d
+; NONEON-NOSVE-NEXT:    zip1 v1.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    zip2 v2.2d, v2.2d, v3.2d
+; NONEON-NOSVE-NEXT:    fadd v0.2d, v4.2d, v0.2d
+; NONEON-NOSVE-NEXT:    fadd v1.2d, v1.2d, v2.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
   %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -479,6 +636,16 @@ define void @trn_v4f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z2.s
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    trn1 v2.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x float>, ptr %a
   %tmp2 = load <4 x float>, ptr %b
   %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -500,6 +667,18 @@ define void @trn_v8i32_undef(ptr %a) {
 ; CHECK-NEXT:    add z1.s, z3.s, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v8i32_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    trn1 v2.4s, v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    trn2 v0.4s, v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    trn1 v3.4s, v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    trn2 v1.4s, v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    add v1.4s, v3.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -571,6 +750,18 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip2_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    zip2 v2.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <32 x i8>, ptr %a
   %tmp2 = load volatile <32 x i8>, ptr %b
   %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> <i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
@@ -617,6 +808,18 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip2_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    zip2 v2.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    zip1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <16 x i16>, ptr %a
   %tmp2 = load volatile <16 x i16>, ptr %b
   %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -649,6 +852,18 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip2_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
+; NONEON-NOSVE-NEXT:    str q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = load volatile <8 x i32>, ptr %b
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -668,6 +883,16 @@ define void @zip2_v8i32_undef(ptr %a) #0{
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip2_v8i32_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    zip2 v1.4s, v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    str q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
   store volatile <8 x i32> %tmp2, ptr %a
@@ -869,6 +1094,19 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    uzp1 v4.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uzp2 v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
+; NONEON-NOSVE-NEXT:    uzp2 v2.16b, v3.16b, v2.16b
+; NONEON-NOSVE-NEXT:    add v0.16b, v4.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v2.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = load <32 x i8>, ptr %b
   %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
@@ -891,6 +1129,17 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    add z0.h, z1.h, z0.h
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ext v1.8b, v0.8b, v0.8b, #6
+; NONEON-NOSVE-NEXT:    ext v2.8b, v0.8b, v0.8b, #2
+; NONEON-NOSVE-NEXT:    trn1 v1.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    zip1 v0.4h, v2.4h, v0.4h
+; NONEON-NOSVE-NEXT:    add v0.4h, v1.4h, v0.4h
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %a
   %tmp2 = load <4 x i16>, ptr %b
   %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
@@ -1008,6 +1257,19 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    uzp1 v4.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
+; NONEON-NOSVE-NEXT:    uzp2 v2.8h, v3.8h, v2.8h
+; NONEON-NOSVE-NEXT:    add v0.8h, v4.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v2.8h
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = load <16 x i16>, ptr %b
   %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1047,6 +1309,19 @@ define void @uzp_v8f32(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    uzp2 v2.4s, v3.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fadd v0.4s, v4.4s, v0.4s
+; NONEON-NOSVE-NEXT:    fadd v1.4s, v1.4s, v2.4s
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x float>, ptr %a
   %tmp2 = load <8 x float>, ptr %b
   %tmp3 = shufflevector <8 x float> %tmp1, <8 x float> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 6, i32 undef, i32 10, i32 12, i32 14>
@@ -1069,6 +1344,19 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    add z1.d, z1.d, z2.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    zip1 v4.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    zip2 v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    zip1 v1.2d, v3.2d, v2.2d
+; NONEON-NOSVE-NEXT:    zip2 v2.2d, v3.2d, v2.2d
+; NONEON-NOSVE-NEXT:    add v0.2d, v4.2d, v0.2d
+; NONEON-NOSVE-NEXT:    add v1.2d, v1.2d, v2.2d
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = load <4 x i64>, ptr %b
   %tmp3 = shufflevector <4 x i64> %tmp1, <4 x i64> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1136,6 +1424,16 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    add v0.8h, v2.8h, v0.8h
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %a
   %tmp2 = load <8 x i16>, ptr %b
   %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1174,6 +1472,15 @@ define void @uzp_v8i32_undef(ptr %a) #0{
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v8i32_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    uzp2 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    add v0.4s, v2.4s, v0.4s
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 0, i32 2, i32 4, i32 6>
   %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 1, i32 3, i32 5, i32 7>
@@ -1197,6 +1504,19 @@ define void @zip_vscale2_4(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip_vscale2_4:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    zip1 v4.2d, v1.2d, v3.2d
+; NONEON-NOSVE-NEXT:    zip1 v5.2d, v0.2d, v2.2d
+; NONEON-NOSVE-NEXT:    zip2 v1.2d, v1.2d, v3.2d
+; NONEON-NOSVE-NEXT:    zip2 v0.2d, v0.2d, v2.2d
+; NONEON-NOSVE-NEXT:    fadd v2.2d, v4.2d, v5.2d
+; NONEON-NOSVE-NEXT:    fadd v0.2d, v1.2d, v0.2d
+; NONEON-NOSVE-NEXT:    stp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
   %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
index ab7c42b3e9e3..6b3c85f59357 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -35,6 +36,23 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ptest_v16i1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, #0.0
+; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v1.4s, #0.0
+; NONEON-NOSVE-NEXT:    fcmeq v3.4s, v3.4s, #0.0
+; NONEON-NOSVE-NEXT:    fcmeq v2.4s, v2.4s, #0.0
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
+; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
   %v2 = fcmp une <16 x float> %v1, zeroinitializer
@@ -92,6 +110,33 @@ define i1 @ptest_or_v16i1(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ptest_or_v16i1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v1.4s, #0.0
+; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, #0.0
+; NONEON-NOSVE-NEXT:    fcmeq v3.4s, v3.4s, #0.0
+; NONEON-NOSVE-NEXT:    fcmeq v2.4s, v2.4s, #0.0
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [x1]
+; NONEON-NOSVE-NEXT:    fcmeq v4.4s, v4.4s, #0.0
+; NONEON-NOSVE-NEXT:    fcmeq v5.4s, v5.4s, #0.0
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    fcmeq v7.4s, v7.4s, #0.0
+; NONEON-NOSVE-NEXT:    fcmeq v6.4s, v6.4s, #0.0
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v5.8h, v4.8h
+; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v6.8h, v7.8h
+; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
+; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    orn v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    umaxv b0, v0.16b
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
+; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
   %v2 = fcmp une <16 x float> %v1, zeroinitializer
@@ -159,6 +204,33 @@ define i1 @ptest_and_v16i1(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ptest_and_v16i1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    fcmeq v1.4s, v1.4s, #0.0
+; NONEON-NOSVE-NEXT:    fcmeq v0.4s, v0.4s, #0.0
+; NONEON-NOSVE-NEXT:    fcmeq v3.4s, v3.4s, #0.0
+; NONEON-NOSVE-NEXT:    fcmeq v2.4s, v2.4s, #0.0
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [x1]
+; NONEON-NOSVE-NEXT:    fcmeq v4.4s, v4.4s, #0.0
+; NONEON-NOSVE-NEXT:    fcmeq v5.4s, v5.4s, #0.0
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; NONEON-NOSVE-NEXT:    fcmeq v7.4s, v7.4s, #0.0
+; NONEON-NOSVE-NEXT:    fcmeq v6.4s, v6.4s, #0.0
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v5.8h, v4.8h
+; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v6.8h, v7.8h
+; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
+; NONEON-NOSVE-NEXT:    mvn v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    bic v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    uminv b0, v0.16b
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
+; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
   %v2 = fcmp une <16 x float> %v1, zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
index bfa931044bc5..0a7352bf4944 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -18,6 +19,13 @@ define <4 x i8> @bitreverse_v4i8(<4 x i8> %op) {
 ; CHECK-NEXT:    lsr z0.h, z0.h, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev16 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ushr v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
 }
@@ -30,6 +38,11 @@ define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
@@ -42,6 +55,11 @@ define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
@@ -55,6 +73,14 @@ define void @bitreverse_v32i8(ptr %a) {
 ; CHECK-NEXT:    rbit z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op)
   store <32 x i8> %res, ptr %a
@@ -70,6 +96,13 @@ define <2 x i16> @bitreverse_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    lsr z0.s, z0.s, #16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -82,6 +115,12 @@ define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev16 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -94,6 +133,12 @@ define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -107,6 +152,16 @@ define void @bitreverse_v16i16(ptr %a) {
 ; CHECK-NEXT:    rbit z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rev16 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -121,6 +176,12 @@ define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -133,6 +194,12 @@ define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -146,6 +213,16 @@ define void @bitreverse_v8i32(ptr %a) {
 ; CHECK-NEXT:    rbit z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rev32 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -160,6 +237,12 @@ define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev64 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    rbit v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -172,6 +255,12 @@ define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -185,6 +274,16 @@ define void @bitreverse_v4i64(ptr %a) {
 ; CHECK-NEXT:    rbit z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rev64 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    rbit v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rbit v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
@@ -204,6 +303,12 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    lsr z0.s, z0.s, #16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ushr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -216,6 +321,11 @@ define <4 x i16> @bswap_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    revb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev16 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -228,6 +338,11 @@ define <8 x i16> @bswap_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    revb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -241,6 +356,14 @@ define void @bswap_v16i16(ptr %a) {
 ; CHECK-NEXT:    revb z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev16 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rev16 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -255,6 +378,11 @@ define <2 x i32> @bswap_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    revb z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev32 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -267,6 +395,11 @@ define <4 x i32> @bswap_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    revb z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -280,6 +413,14 @@ define void @bswap_v8i32(ptr %a) {
 ; CHECK-NEXT:    revb z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev32 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rev32 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -294,6 +435,11 @@ define <1 x i64> @bswap_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev64 v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -306,6 +452,11 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -319,6 +470,14 @@ define void @bswap_v4i64(ptr %a) {
 ; CHECK-NEXT:    revb z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    rev64 v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    rev64 v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
index 9dd42e7831e0..d86c7d36a104 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -14,6 +15,19 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1) {
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v1.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    movi d2, #0xff00ff00ff00ff
+; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #8
+; NONEON-NOSVE-NEXT:    sshr v1.4h, v1.4h, #7
+; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    usra v0.4h, v1.4h, #3
+; NONEON-NOSVE-NEXT:    shl v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #8
+; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #5
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i8> %op1, shufflevector (<4 x i8> insertelement (<4 x i8> poison, i8 32, i32 0), <4 x i8> poison, <4 x i32> zeroinitializer)
   ret <4 x i8> %res
 }
@@ -26,6 +40,13 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) {
 ; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmlt v1.8b, v0.8b, #0
+; NONEON-NOSVE-NEXT:    usra v0.8b, v1.8b, #3
+; NONEON-NOSVE-NEXT:    sshr v0.8b, v0.8b, #5
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i8> %op1, shufflevector (<8 x i8> insertelement (<8 x i8> poison, i8 32, i32 0), <8 x i8> poison, <8 x i32> zeroinitializer)
   ret <8 x i8> %res
 }
@@ -38,6 +59,13 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) {
 ; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmlt v1.16b, v0.16b, #0
+; NONEON-NOSVE-NEXT:    usra v0.16b, v1.16b, #3
+; NONEON-NOSVE-NEXT:    sshr v0.16b, v0.16b, #5
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <16 x i8> %op1, shufflevector (<16 x i8> insertelement (<16 x i8> poison, i8 32, i32 0), <16 x i8> poison, <16 x i32> zeroinitializer)
   ret <16 x i8> %res
 }
@@ -51,6 +79,18 @@ define void @sdiv_v32i8(ptr %a) {
 ; CHECK-NEXT:    asrd z1.b, p0/m, z1.b, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cmlt v2.16b, v0.16b, #0
+; NONEON-NOSVE-NEXT:    cmlt v3.16b, v1.16b, #0
+; NONEON-NOSVE-NEXT:    usra v0.16b, v2.16b, #3
+; NONEON-NOSVE-NEXT:    usra v1.16b, v3.16b, #3
+; NONEON-NOSVE-NEXT:    sshr v0.16b, v0.16b, #5
+; NONEON-NOSVE-NEXT:    sshr v1.16b, v1.16b, #5
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %res = sdiv <32 x i8> %op1, shufflevector (<32 x i8> insertelement (<32 x i8> poison, i8 32, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer)
   store <32 x i8> %res, ptr %a
@@ -66,6 +106,20 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1) {
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    shl v1.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    dup v2.2s, w8
+; NONEON-NOSVE-NEXT:    sshr v1.2s, v1.2s, #16
+; NONEON-NOSVE-NEXT:    ushr v1.2s, v1.2s, #26
+; NONEON-NOSVE-NEXT:    and v1.8b, v1.8b, v2.8b
+; NONEON-NOSVE-NEXT:    add v0.2s, v0.2s, v1.2s
+; NONEON-NOSVE-NEXT:    shl v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #16
+; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #5
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i16> %op1, shufflevector (<2 x i16> insertelement (<2 x i16> poison, i16 32, i32 0), <2 x i16> poison, <2 x i32> zeroinitializer)
   ret <2 x i16> %res
 }
@@ -78,6 +132,13 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) {
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmlt v1.4h, v0.4h, #0
+; NONEON-NOSVE-NEXT:    usra v0.4h, v1.4h, #11
+; NONEON-NOSVE-NEXT:    sshr v0.4h, v0.4h, #5
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i16> %op1, shufflevector (<4 x i16> insertelement (<4 x i16> poison, i16 32, i32 0), <4 x i16> poison, <4 x i32> zeroinitializer)
   ret <4 x i16> %res
 }
@@ -90,6 +151,13 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) {
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmlt v1.8h, v0.8h, #0
+; NONEON-NOSVE-NEXT:    usra v0.8h, v1.8h, #11
+; NONEON-NOSVE-NEXT:    sshr v0.8h, v0.8h, #5
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i16> %op1, shufflevector (<8 x i16> insertelement (<8 x i16> poison, i16 32, i32 0), <8 x i16> poison, <8 x i32> zeroinitializer)
   ret <8 x i16> %res
 }
@@ -103,6 +171,18 @@ define void @sdiv_v16i16(ptr %a) {
 ; CHECK-NEXT:    asrd z1.h, p0/m, z1.h, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cmlt v2.8h, v0.8h, #0
+; NONEON-NOSVE-NEXT:    cmlt v3.8h, v1.8h, #0
+; NONEON-NOSVE-NEXT:    usra v0.8h, v2.8h, #11
+; NONEON-NOSVE-NEXT:    usra v1.8h, v3.8h, #11
+; NONEON-NOSVE-NEXT:    sshr v0.8h, v0.8h, #5
+; NONEON-NOSVE-NEXT:    sshr v1.8h, v1.8h, #5
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sdiv <16 x i16> %op1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 32, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer)
   store <16 x i16> %res, ptr %a
@@ -117,6 +197,13 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) {
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmlt v1.2s, v0.2s, #0
+; NONEON-NOSVE-NEXT:    usra v0.2s, v1.2s, #27
+; NONEON-NOSVE-NEXT:    sshr v0.2s, v0.2s, #5
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i32> %op1, shufflevector (<2 x i32> insertelement (<2 x i32> poison, i32 32, i32 0), <2 x i32> poison, <2 x i32> zeroinitializer)
   ret <2 x i32> %res
 }
@@ -129,6 +216,13 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) {
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmlt v1.4s, v0.4s, #0
+; NONEON-NOSVE-NEXT:    usra v0.4s, v1.4s, #27
+; NONEON-NOSVE-NEXT:    sshr v0.4s, v0.4s, #5
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i32> %op1, shufflevector (<4 x i32> insertelement (<4 x i32> poison, i32 32, i32 0), <4 x i32> poison, <4 x i32> zeroinitializer)
   ret <4 x i32> %res
 }
@@ -142,6 +236,18 @@ define void @sdiv_v8i32(ptr %a) {
 ; CHECK-NEXT:    asrd z1.s, p0/m, z1.s, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cmlt v2.4s, v0.4s, #0
+; NONEON-NOSVE-NEXT:    cmlt v3.4s, v1.4s, #0
+; NONEON-NOSVE-NEXT:    usra v0.4s, v2.4s, #27
+; NONEON-NOSVE-NEXT:    usra v1.4s, v3.4s, #27
+; NONEON-NOSVE-NEXT:    sshr v0.4s, v0.4s, #5
+; NONEON-NOSVE-NEXT:    sshr v1.4s, v1.4s, #5
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sdiv <8 x i32> %op1, shufflevector (<8 x i32> insertelement (<8 x i32> poison, i32 32, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer)
   store <8 x i32> %res, ptr %a
@@ -156,6 +262,13 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) {
 ; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmlt d1, d0, #0
+; NONEON-NOSVE-NEXT:    usra d0, d1, #59
+; NONEON-NOSVE-NEXT:    sshr d0, d0, #5
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <1 x i64> %op1, shufflevector (<1 x i64> insertelement (<1 x i64> poison, i64 32, i32 0), <1 x i64> poison, <1 x i32> zeroinitializer)
   ret <1 x i64> %res
 }
@@ -169,6 +282,13 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) {
 ; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    cmlt v1.2d, v0.2d, #0
+; NONEON-NOSVE-NEXT:    usra v0.2d, v1.2d, #59
+; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #5
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i64> %op1, shufflevector (<2 x i64> insertelement (<2 x i64> poison, i64 32, i32 0), <2 x i64> poison, <2 x i32> zeroinitializer)
   ret <2 x i64> %res
 }
@@ -182,6 +302,18 @@ define void @sdiv_v4i64(ptr %a) {
 ; CHECK-NEXT:    asrd z1.d, p0/m, z1.d, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    cmlt v2.2d, v0.2d, #0
+; NONEON-NOSVE-NEXT:    cmlt v3.2d, v1.2d, #0
+; NONEON-NOSVE-NEXT:    usra v0.2d, v2.2d, #59
+; NONEON-NOSVE-NEXT:    usra v1.2d, v3.2d, #59
+; NONEON-NOSVE-NEXT:    sshr v0.2d, v0.2d, #5
+; NONEON-NOSVE-NEXT:    sshr v1.2d, v1.2d, #5
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sdiv <4 x i64> %op1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 32, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer)
   store <4 x i64> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
index 323d5278592f..6489e8d94d31 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 
@@ -15,6 +16,11 @@ define <4 x i8> @splat_v4i8(i8 %a) {
 ; CHECK-NEXT:    mov z0.h, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v0.4h, w0
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <4 x i8> %insert, <4 x i8> undef, <4 x i32> zeroinitializer
   ret <4 x i8> %splat
@@ -26,6 +32,11 @@ define <8 x i8> @splat_v8i8(i8 %a) {
 ; CHECK-NEXT:    mov z0.b, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v0.8b, w0
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <8 x i8> %insert, <8 x i8> undef, <8 x i32> zeroinitializer
   ret <8 x i8> %splat
@@ -37,6 +48,11 @@ define <16 x i8> @splat_v16i8(i8 %a) {
 ; CHECK-NEXT:    mov z0.b, w0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v0.16b, w0
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
   ret <16 x i8> %splat
@@ -48,6 +64,12 @@ define void @splat_v32i8(i8 %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.b, w0
 ; CHECK-NEXT:    stp q0, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v0.16b, w0
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <32 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
   store <32 x i8> %splat, ptr %b
@@ -60,6 +82,11 @@ define <2 x i16> @splat_v2i16(i16 %a) {
 ; CHECK-NEXT:    mov z0.s, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v0.2s, w0
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <2 x i16> %insert, <2 x i16> undef, <2 x i32> zeroinitializer
   ret <2 x i16> %splat
@@ -71,6 +98,11 @@ define <4 x i16> @splat_v4i16(i16 %a) {
 ; CHECK-NEXT:    mov z0.h, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v0.4h, w0
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer
   ret <4 x i16> %splat
@@ -82,6 +114,11 @@ define <8 x i16> @splat_v8i16(i16 %a) {
 ; CHECK-NEXT:    mov z0.h, w0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v0.8h, w0
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
   ret <8 x i16> %splat
@@ -93,6 +130,12 @@ define void @splat_v16i16(i16 %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.h, w0
 ; CHECK-NEXT:    stp q0, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v0.8h, w0
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
   store <16 x i16> %splat, ptr %b
@@ -105,6 +148,11 @@ define <2 x i32> @splat_v2i32(i32 %a) {
 ; CHECK-NEXT:    mov z0.s, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v0.2s, w0
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <2 x i32> %insert, <2 x i32> undef, <2 x i32> zeroinitializer
   ret <2 x i32> %splat
@@ -116,6 +164,11 @@ define <4 x i32> @splat_v4i32(i32 %a) {
 ; CHECK-NEXT:    mov z0.s, w0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v0.4s, w0
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
   ret <4 x i32> %splat
@@ -127,6 +180,12 @@ define void @splat_v8i32(i32 %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.s, w0
 ; CHECK-NEXT:    stp q0, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v0.4s, w0
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
   store <8 x i32> %splat, ptr %b
@@ -139,6 +198,11 @@ define <1 x i64> @splat_v1i64(i64 %a) {
 ; CHECK-NEXT:    mov z0.d, x0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov d0, x0
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <1 x i64> %insert, <1 x i64> undef, <1 x i32> zeroinitializer
   ret <1 x i64> %splat
@@ -150,6 +214,11 @@ define <2 x i64> @splat_v2i64(i64 %a) {
 ; CHECK-NEXT:    mov z0.d, x0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v0.2d, x0
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %splat
@@ -161,6 +230,12 @@ define void @splat_v4i64(i64 %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, x0
 ; CHECK-NEXT:    stp q0, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    dup v0.2d, x0
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
   store <4 x i64> %splat, ptr %b
@@ -178,6 +253,12 @@ define <2 x half> @splat_v2f16(half %a) {
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.4h, v0.h[0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x half> undef, half %a, i64 0
   %splat = shufflevector <2 x half> %insert, <2 x half> undef, <2 x i32> zeroinitializer
   ret <2 x half> %splat
@@ -190,6 +271,12 @@ define <4 x half> @splat_v4f16(half %a) {
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.4h, v0.h[0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x half> undef, half %a, i64 0
   %splat = shufflevector <4 x half> %insert, <4 x half> undef, <4 x i32> zeroinitializer
   ret <4 x half> %splat
@@ -202,6 +289,12 @@ define <8 x half> @splat_v8f16(half %a) {
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.8h, v0.h[0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x half> undef, half %a, i64 0
   %splat = shufflevector <8 x half> %insert, <8 x half> undef, <8 x i32> zeroinitializer
   ret <8 x half> %splat
@@ -214,6 +307,13 @@ define void @splat_v16f16(half %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $h0 killed $h0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.8h, v0.h[0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x half> undef, half %a, i64 0
   %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer
   store <16 x half> %splat, ptr %b
@@ -227,6 +327,12 @@ define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) {
 ; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $s0 killed $s0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.2s, v0.s[0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x float> undef, float %a, i64 0
   %splat = shufflevector <2 x float> %insert, <2 x float> undef, <2 x i32> zeroinitializer
   ret <2 x float> %splat
@@ -239,6 +345,12 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) {
 ; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $s0 killed $s0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.4s, v0.s[0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x float> undef, float %a, i64 0
   %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer
   ret <4 x float> %splat
@@ -251,6 +363,13 @@ define void @splat_v8f32(float %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $s0 killed $s0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.4s, v0.s[0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x float> undef, float %a, i64 0
   %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer
   store <8 x float> %splat, ptr %b
@@ -261,6 +380,10 @@ define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) {
 ; CHECK-LABEL: splat_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x double> undef, double %a, i64 0
   %splat = shufflevector <1 x double> %insert, <1 x double> undef, <1 x i32> zeroinitializer
   ret <1 x double> %splat
@@ -273,6 +396,12 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) {
 ; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.2d, v0.d[0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x double> undef, double %a, i64 0
   %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer
   ret <2 x double> %splat
@@ -285,6 +414,13 @@ define void @splat_v4f64(double %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NONEON-NOSVE-NEXT:    dup v0.2d, v0.d[0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x double> undef, double %a, i64 0
   %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer
   store <4 x double> %splat, ptr %b
@@ -301,6 +437,12 @@ define void @splat_imm_v32i8(ptr %a) {
 ; CHECK-NEXT:    mov z0.b, #1 // =0x1
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.16b, #1
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <32 x i8> undef, i8 1, i64 0
   %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
   store <32 x i8> %splat, ptr %a
@@ -313,6 +455,13 @@ define void @splat_imm_v16i16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #2 // =0x2
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #2 // =0x2
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i16> undef, i16 2, i64 0
   %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
   store <16 x i16> %splat, ptr %a
@@ -325,6 +474,13 @@ define void @splat_imm_v8i32(ptr %a) {
 ; CHECK-NEXT:    mov z0.s, #3 // =0x3
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #3 // =0x3
+; NONEON-NOSVE-NEXT:    dup v0.4s, w8
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i32> undef, i32 3, i64 0
   %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
   store <8 x i32> %splat, ptr %a
@@ -337,6 +493,13 @@ define void @splat_imm_v4i64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, #4 // =0x4
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #4 // =0x4
+; NONEON-NOSVE-NEXT:    dup v0.2d, x8
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i64> undef, i64 4, i64 0
   %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
   store <4 x i64> %splat, ptr %a
@@ -353,6 +516,13 @@ define void @splat_imm_v16f16(ptr %a) {
 ; CHECK-NEXT:    fmov z0.h, #5.00000000
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #17664 // =0x4500
+; NONEON-NOSVE-NEXT:    dup v0.8h, w8
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x half> undef, half 5.0, i64 0
   %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer
   store <16 x half> %splat, ptr %a
@@ -365,6 +535,12 @@ define void @splat_imm_v8f32(ptr %a) {
 ; CHECK-NEXT:    fmov z0.s, #6.00000000
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov v0.4s, #6.00000000
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x float> undef, float 6.0, i64 0
   %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer
   store <8 x float> %splat, ptr %a
@@ -377,6 +553,12 @@ define void @splat_imm_v4f64(ptr %a) {
 ; CHECK-NEXT:    fmov z0.d, #7.00000000
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov v0.2d, #7.00000000
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x double> undef, double 7.0, i64 0
   %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer
   store <4 x double> %splat, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
index 06709ca3685c..41449aa90ba0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -12,6 +13,11 @@ define void @store_v4i8(ptr %a) {
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str wzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i8> zeroinitializer, ptr %a
   ret void
 }
@@ -22,6 +28,12 @@ define void @store_v8i8(ptr %a) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x i8> zeroinitializer, ptr %a
   ret void
 }
@@ -32,6 +44,12 @@ define void @store_v16i8(ptr %a) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <16 x i8> zeroinitializer, ptr %a
   ret void
 }
@@ -42,6 +60,12 @@ define void @store_v32i8(ptr %a) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <32 x i8> zeroinitializer, ptr %a
   ret void
 }
@@ -53,6 +77,11 @@ define void @store_v2i16(ptr %a) {
 ; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str wzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <2 x i16> zeroinitializer, ptr %a
   ret void
 }
@@ -64,6 +93,11 @@ define void @store_v2f16(ptr %a) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str wzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <2 x half> zeroinitializer, ptr %a
   ret void
 }
@@ -74,6 +108,12 @@ define void @store_v4i16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i16> zeroinitializer, ptr %a
   ret void
 }
@@ -84,6 +124,12 @@ define void @store_v4f16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x half> zeroinitializer, ptr %a
   ret void
 }
@@ -94,6 +140,12 @@ define void @store_v8i16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x i16> zeroinitializer, ptr %a
   ret void
 }
@@ -104,6 +156,12 @@ define void @store_v8f16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x half> zeroinitializer, ptr %a
   ret void
 }
@@ -114,6 +172,12 @@ define void @store_v16i16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <16 x i16> zeroinitializer, ptr %a
   ret void
 }
@@ -124,6 +188,12 @@ define void @store_v16f16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <16 x half> zeroinitializer, ptr %a
   ret void
 }
@@ -133,6 +203,11 @@ define void @store_v2i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str xzr, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str xzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <2 x i32> zeroinitializer, ptr %a
   ret void
 }
@@ -142,6 +217,11 @@ define void @store_v2f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str xzr, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str xzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <2 x float> zeroinitializer, ptr %a
   ret void
 }
@@ -151,6 +231,11 @@ define void @store_v4i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp xzr, xzr, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp xzr, xzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i32> zeroinitializer, ptr %a
   ret void
 }
@@ -160,6 +245,11 @@ define void @store_v4f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp xzr, xzr, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp xzr, xzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x float> zeroinitializer, ptr %a
   ret void
 }
@@ -170,6 +260,12 @@ define void @store_v8i32(ptr %a) {
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x i32> zeroinitializer, ptr %a
   ret void
 }
@@ -180,6 +276,12 @@ define void @store_v8f32(ptr %a) {
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x float> zeroinitializer, ptr %a
   ret void
 }
@@ -190,6 +292,12 @@ define void @store_v1i64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <1 x i64> zeroinitializer, ptr %a
   ret void
 }
@@ -200,6 +308,12 @@ define void @store_v1f64(ptr %a) {
 ; CHECK-NEXT:    fmov d0, xzr
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi d0, #0000000000000000
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <1 x double> zeroinitializer, ptr %a
   ret void
 }
@@ -209,6 +323,11 @@ define void @store_v2i64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp xzr, xzr, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp xzr, xzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <2 x i64> zeroinitializer, ptr %a
   ret void
 }
@@ -218,6 +337,11 @@ define void @store_v2f64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp xzr, xzr, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp xzr, xzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <2 x double> zeroinitializer, ptr %a
   ret void
 }
@@ -228,6 +352,12 @@ define void @store_v4i64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i64> zeroinitializer, ptr %a
   ret void
 }
@@ -238,6 +368,12 @@ define void @store_v4f64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x double> zeroinitializer, ptr %a
   ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
index 838db0ce8185..d1873f436815 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 ; Test we can code generater patterns of the form:
@@ -23,6 +24,12 @@ define void @subvector_v4i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4i8:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i8>, ptr %in
   br label %bb1
 
@@ -37,6 +44,12 @@ define void @subvector_v8i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v8i8:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i8>, ptr %in
   br label %bb1
 
@@ -51,6 +64,12 @@ define void @subvector_v16i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v16i8:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i8>, ptr %in
   br label %bb1
 
@@ -65,6 +84,12 @@ define void @subvector_v32i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v32i8:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   br label %bb1
 
@@ -81,6 +106,12 @@ define void @subvector_v2i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v2i16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i16>, ptr %in
   br label %bb1
 
@@ -95,6 +126,12 @@ define void @subvector_v4i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4i16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i16>, ptr %in
   br label %bb1
 
@@ -109,6 +146,12 @@ define void @subvector_v8i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v8i16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i16>, ptr %in
   br label %bb1
 
@@ -123,6 +166,12 @@ define void @subvector_v16i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v16i16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   br label %bb1
 
@@ -138,6 +187,12 @@ define void @subvector_v2i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v2i32:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i32>, ptr %in
   br label %bb1
 
@@ -152,6 +207,12 @@ define void @subvector_v4i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4i32:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %in
   br label %bb1
 
@@ -166,6 +227,12 @@ define void @subvector_v8i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v8i32:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   br label %bb1
 
@@ -181,6 +248,12 @@ define void @subvector_v2i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v2i64:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %in
   br label %bb1
 
@@ -195,6 +268,12 @@ define void @subvector_v4i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4i64:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   br label %bb1
 
@@ -210,6 +289,12 @@ define void @subvector_v2f16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v2f16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x half>, ptr %in
   br label %bb1
 
@@ -224,6 +309,12 @@ define void @subvector_v4f16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4f16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %in
   br label %bb1
 
@@ -238,6 +329,12 @@ define void @subvector_v8f16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v8f16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x half>, ptr %in
   br label %bb1
 
@@ -252,6 +349,12 @@ define void @subvector_v16f16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v16f16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x half>, ptr %in
   br label %bb1
 
@@ -267,6 +370,12 @@ define void @subvector_v2f32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v2f32:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x float>, ptr %in
   br label %bb1
 
@@ -281,6 +390,12 @@ define void @subvector_v4f32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4f32:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x float>, ptr %in
   br label %bb1
 
@@ -295,6 +410,12 @@ define void @subvector_v8f32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v8f32:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x float>,ptr %in
   br label %bb1
 
@@ -310,6 +431,12 @@ define void @subvector_v2f64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v2f64:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x double>, ptr %in
   br label %bb1
 
@@ -324,6 +451,12 @@ define void @subvector_v4f64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4f64:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x double>, ptr %in
   br label %bb1
 
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
index 7e3a175c40d2..f0a4368da3ee 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -12,6 +13,13 @@ define void @store_trunc_v8i16i8(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_trunc_v8i16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i16>, ptr %ap
   %val = trunc <8 x i16> %a to <8 x i8>
   store <8 x i8> %val, ptr %dest
@@ -25,6 +33,14 @@ define void @store_trunc_v4i32i8(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1b { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_trunc_v4i32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str s0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = trunc <4 x i32> %a to <4 x i8>
   store <4 x i8> %val, ptr %dest
@@ -38,6 +54,13 @@ define void @store_trunc_v4i32i16(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_trunc_v4i32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = trunc <4 x i32> %a to <4 x i16>
   store <4 x i16> %val, ptr %dest
@@ -51,6 +74,13 @@ define void @store_trunc_v2i64i8(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_trunc_v2i64i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    xtn v0.2s, v0.2d
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = trunc <2 x i64> %a to <2 x i32>
   store <2 x i32> %val, ptr %dest
@@ -66,6 +96,14 @@ define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
 ; CHECK-NEXT:    str q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_trunc_v2i256i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    mov v1.d[1], v0.d[0]
+; NONEON-NOSVE-NEXT:    str q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i256>, ptr %ap
   %val = trunc <2 x i256> %a to <2 x i64>
   store <2 x i64> %val, ptr %dest
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
index 70219dd30f76..4895ffb6858e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -19,6 +20,12 @@ define <16 x i8> @trunc_v16i16_v16i8(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v16i16_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = trunc <16 x i16> %a to <16 x i8>
   ret <16 x i8> %b
@@ -41,6 +48,17 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.b, z2.b, z2.b
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v32i16_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v2.16b
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i16>, ptr %in
   %b = trunc <32 x i16> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -76,6 +94,24 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v64i16_v64i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ldp q6, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
+; NONEON-NOSVE-NEXT:    uzp1 v3.16b, v5.16b, v4.16b
+; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v6.16b, v1.16b
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v2.16b, v2.16b, v2.16b
+; NONEON-NOSVE-NEXT:    add v3.16b, v3.16b, v3.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i16>, ptr %in
   %b = trunc <64 x i16> %a to <64 x i8>
   %c = add <64 x i8> %b, %b
@@ -133,6 +169,38 @@ define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q2, q3, [x1, #32]
 ; CHECK-NEXT:    stp q4, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v128i16_v128i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #224]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #128]
+; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    ldp q16, q1, [x0, #160]
+; NONEON-NOSVE-NEXT:    uzp1 v4.16b, v5.16b, v4.16b
+; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
+; NONEON-NOSVE-NEXT:    uzp1 v6.16b, v7.16b, v6.16b
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q18, q7, [x0, #96]
+; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v16.16b, v1.16b
+; NONEON-NOSVE-NEXT:    uzp1 v5.16b, v17.16b, v5.16b
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #32]
+; NONEON-NOSVE-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v4.16b, v4.16b, v4.16b
+; NONEON-NOSVE-NEXT:    uzp1 v7.16b, v18.16b, v7.16b
+; NONEON-NOSVE-NEXT:    add v3.16b, v6.16b, v6.16b
+; NONEON-NOSVE-NEXT:    uzp1 v6.16b, v17.16b, v16.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #96]
+; NONEON-NOSVE-NEXT:    add v0.16b, v5.16b, v5.16b
+; NONEON-NOSVE-NEXT:    add v2.16b, v2.16b, v2.16b
+; NONEON-NOSVE-NEXT:    add v4.16b, v7.16b, v7.16b
+; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #64]
+; NONEON-NOSVE-NEXT:    add v1.16b, v6.16b, v6.16b
+; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <128 x i16>, ptr %in
   %b = trunc <128 x i16> %a to <128 x i8>
   %c = add <128 x i8> %b, %b
@@ -155,6 +223,13 @@ define <8 x i8> @trunc_v8i32_v8i8(ptr %in) nounwind {
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v8i32_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = trunc <8 x i32> %a to <8 x i8>
   ret <8 x i8> %b
@@ -178,6 +253,15 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v16i32_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
+; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i32>, ptr %in
   %b = trunc <16 x i32> %a to <16 x i8>
   ret <16 x i8> %b
@@ -215,6 +299,23 @@ define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.b, z3.b, z3.b
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v32i32_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #32]
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
+; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v5.8h, v4.8h
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v7.8h, v6.8h
+; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v3.16b, v1.16b
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i32>, ptr %in
   %b = trunc <32 x i32> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -279,6 +380,36 @@ define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v64i32_v64i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #128]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #160]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #224]
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v4.8h, v5.8h, v4.8h
+; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
+; NONEON-NOSVE-NEXT:    uzp1 v6.8h, v7.8h, v6.8h
+; NONEON-NOSVE-NEXT:    ldp q16, q7, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #96]
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
+; NONEON-NOSVE-NEXT:    uzp1 v5.8h, v17.8h, v5.8h
+; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; NONEON-NOSVE-NEXT:    uzp1 v7.8h, v16.8h, v7.8h
+; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v19.8h, v18.8h
+; NONEON-NOSVE-NEXT:    uzp1 v2.16b, v4.16b, v6.16b
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v1.16b, v7.16b
+; NONEON-NOSVE-NEXT:    uzp1 v3.16b, v5.16b, v3.16b
+; NONEON-NOSVE-NEXT:    add v2.16b, v2.16b, v2.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add v3.16b, v3.16b, v3.16b
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i32>, ptr %in
   %b = trunc <64 x i32> %a to <64 x i8>
   %c = add <64 x i8> %b, %b
@@ -300,6 +431,12 @@ define <8 x i16> @trunc_v8i32_v8i16(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v8i32_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = trunc <8 x i32> %a to <8 x i16>
   ret <8 x i16> %b
@@ -322,6 +459,17 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.h, z2.h, z2.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v16i32_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
+; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i32>, ptr %in
   %b = trunc <16 x i32> %a to <16 x i16>
   %c = add <16 x i16> %b, %b
@@ -357,6 +505,24 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v32i32_v32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    ldp q6, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
+; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v5.8h, v4.8h
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v6.8h, v1.8h
+; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v2.8h, v2.8h, v2.8h
+; NONEON-NOSVE-NEXT:    add v3.8h, v3.8h, v3.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i32>, ptr %in
   %b = trunc <32 x i32> %a to <32 x i16>
   %c = add <32 x i16> %b, %b
@@ -414,6 +580,38 @@ define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q2, q3, [x1, #32]
 ; CHECK-NEXT:    stp q4, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v64i32_v64i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #224]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #128]
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    ldp q16, q1, [x0, #160]
+; NONEON-NOSVE-NEXT:    uzp1 v4.8h, v5.8h, v4.8h
+; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
+; NONEON-NOSVE-NEXT:    uzp1 v6.8h, v7.8h, v6.8h
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q18, q7, [x0, #96]
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v16.8h, v1.8h
+; NONEON-NOSVE-NEXT:    uzp1 v5.8h, v17.8h, v5.8h
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #32]
+; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
+; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v4.8h, v4.8h, v4.8h
+; NONEON-NOSVE-NEXT:    uzp1 v7.8h, v18.8h, v7.8h
+; NONEON-NOSVE-NEXT:    add v3.8h, v6.8h, v6.8h
+; NONEON-NOSVE-NEXT:    uzp1 v6.8h, v17.8h, v16.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #96]
+; NONEON-NOSVE-NEXT:    add v0.8h, v5.8h, v5.8h
+; NONEON-NOSVE-NEXT:    add v2.8h, v2.8h, v2.8h
+; NONEON-NOSVE-NEXT:    add v4.8h, v7.8h, v7.8h
+; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #64]
+; NONEON-NOSVE-NEXT:    add v1.8h, v6.8h, v6.8h
+; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i32>, ptr %in
   %b = trunc <64 x i32> %a to <64 x i16>
   %c = add <64 x i16> %b, %b
@@ -437,6 +635,13 @@ define <4 x i8> @trunc_v4i64_v4i8(ptr %in) nounwind {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v4i64_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i8>
   ret <4 x i8> %b
@@ -461,6 +666,16 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) nounwind {
 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v8i64_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i8>
   ret <8 x i8> %b
@@ -499,6 +714,21 @@ define <16 x i8> @trunc_v16i64_v16i8(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v16i64_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #64]
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
+; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v7.4s, v6.4s
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
+; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i8>
   ret <16 x i8> %b
@@ -565,6 +795,35 @@ define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v32i64_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #224]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v5.4s, v4.4s
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #128]
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #160]
+; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #64]
+; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
+; NONEON-NOSVE-NEXT:    uzp1 v16.4s, v17.4s, v16.4s
+; NONEON-NOSVE-NEXT:    uzp1 v5.4s, v7.4s, v6.4s
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; NONEON-NOSVE-NEXT:    uzp1 v7.4s, v19.4s, v18.4s
+; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v21.4s, v20.4s
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v4.8h, v16.8h
+; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v2.8h, v7.8h
+; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v6.8h, v5.8h
+; NONEON-NOSVE-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; NONEON-NOSVE-NEXT:    uzp1 v1.16b, v2.16b, v3.16b
+; NONEON-NOSVE-NEXT:    add v0.16b, v0.16b, v0.16b
+; NONEON-NOSVE-NEXT:    add v1.16b, v1.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -587,6 +846,13 @@ define <4 x i16> @trunc_v4i64_v4i16(ptr %in) nounwind {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v4i64_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    xtn v0.4h, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i16>
   ret <4 x i16> %b
@@ -610,6 +876,15 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v8i64_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i16>
   ret <8 x i16> %b
@@ -647,6 +922,23 @@ define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.h, z3.h, z3.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v16i64_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #32]
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v5.4s, v4.4s
+; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v7.4s, v6.4s
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
+; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i16>
   %c = add <16 x i16> %b, %b
@@ -711,6 +1003,36 @@ define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v32i64_v32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #128]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #160]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #224]
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
+; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
+; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v7.4s, v6.4s
+; NONEON-NOSVE-NEXT:    ldp q16, q7, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #96]
+; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v3.4s, v1.4s
+; NONEON-NOSVE-NEXT:    uzp1 v5.4s, v17.4s, v5.4s
+; NONEON-NOSVE-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; NONEON-NOSVE-NEXT:    uzp1 v7.4s, v16.4s, v7.4s
+; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v19.4s, v18.4s
+; NONEON-NOSVE-NEXT:    uzp1 v2.8h, v4.8h, v6.8h
+; NONEON-NOSVE-NEXT:    add v0.8h, v0.8h, v0.8h
+; NONEON-NOSVE-NEXT:    uzp1 v1.8h, v1.8h, v7.8h
+; NONEON-NOSVE-NEXT:    uzp1 v3.8h, v5.8h, v3.8h
+; NONEON-NOSVE-NEXT:    add v2.8h, v2.8h, v2.8h
+; NONEON-NOSVE-NEXT:    add v1.8h, v1.8h, v1.8h
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    add v3.8h, v3.8h, v3.8h
+; NONEON-NOSVE-NEXT:    stp q1, q3, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i16>
   %c = add <32 x i16> %b, %b
@@ -732,6 +1054,12 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v4i64_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i32>
   ret <4 x i32> %b
@@ -754,6 +1082,17 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.s, z2.s, z2.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v8i64_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i32>
   %c = add <8 x i32> %b, %b
@@ -789,6 +1128,24 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v16i64_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ldp q6, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    uzp1 v3.4s, v5.4s, v4.4s
+; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v6.4s, v1.4s
+; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    add v2.4s, v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    add v3.4s, v3.4s, v3.4s
+; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q2, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i32>
   %c = add <16 x i32> %b, %b
@@ -846,6 +1203,38 @@ define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q2, q3, [x1, #32]
 ; CHECK-NEXT:    stp q4, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v32i64_v32i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #224]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #128]
+; NONEON-NOSVE-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; NONEON-NOSVE-NEXT:    ldp q16, q1, [x0, #160]
+; NONEON-NOSVE-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
+; NONEON-NOSVE-NEXT:    ldp q17, q5, [x0, #64]
+; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v7.4s, v6.4s
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q18, q7, [x0, #96]
+; NONEON-NOSVE-NEXT:    uzp1 v1.4s, v16.4s, v1.4s
+; NONEON-NOSVE-NEXT:    uzp1 v5.4s, v17.4s, v5.4s
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #32]
+; NONEON-NOSVE-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
+; NONEON-NOSVE-NEXT:    add v0.4s, v0.4s, v0.4s
+; NONEON-NOSVE-NEXT:    add v4.4s, v4.4s, v4.4s
+; NONEON-NOSVE-NEXT:    uzp1 v7.4s, v18.4s, v7.4s
+; NONEON-NOSVE-NEXT:    add v3.4s, v6.4s, v6.4s
+; NONEON-NOSVE-NEXT:    uzp1 v6.4s, v17.4s, v16.4s
+; NONEON-NOSVE-NEXT:    add v1.4s, v1.4s, v1.4s
+; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #96]
+; NONEON-NOSVE-NEXT:    add v0.4s, v5.4s, v5.4s
+; NONEON-NOSVE-NEXT:    add v2.4s, v2.4s, v2.4s
+; NONEON-NOSVE-NEXT:    add v4.4s, v7.4s, v7.4s
+; NONEON-NOSVE-NEXT:    stp q3, q1, [x1, #64]
+; NONEON-NOSVE-NEXT:    add v1.4s, v6.4s, v6.4s
+; NONEON-NOSVE-NEXT:    stp q0, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i32>
   %c = add <32 x i32> %b, %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
index 175731480407..dd308dfadd80 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -14,6 +15,12 @@ define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ext v1.8b, v0.8b, v0.8b, #6
+; NONEON-NOSVE-NEXT:    trn1 v0.4h, v0.4h, v1.4h
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
   ret <4 x i8> %ret
 }
@@ -28,6 +35,11 @@ define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    insr z1.b, w8
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #7
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x i8> %ret
 }
@@ -42,6 +54,11 @@ define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    insr z1.b, w8
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #15
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <16 x i8> %op1, <16 x i8> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
                                                                    i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
   ret <16 x i8> %ret
@@ -60,6 +77,15 @@ define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.b, w8
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #15
+; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #15
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %ret = shufflevector <32 x i8> %op1, <32 x i8> %op2, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,
@@ -78,6 +104,11 @@ define <2 x i16> @shuffle_ext_byone_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    revw z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    rev64 v0.2s, v0.2s
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i16> %op1, <2 x i16> %op2, <2 x i32> <i32 1, i32 0>
   ret <2 x i16> %ret
 }
@@ -92,6 +123,11 @@ define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    insr z1.h, w8
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #6
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i16> %op1, <4 x i16> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x i16> %ret
 }
@@ -106,6 +142,11 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    insr z1.h, w8
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x i16> %ret
 }
@@ -123,6 +164,15 @@ define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.h, w8
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
+; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #14
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %ret = shufflevector <16 x i16> %op1, <16 x i16> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
@@ -141,6 +191,11 @@ define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    insr z1.s, w8
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i32> %op1, <2 x i32> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x i32> %ret
 }
@@ -155,6 +210,11 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    insr z1.s, w8
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i32> %op1, <4 x i32> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x i32> %ret
 }
@@ -172,6 +232,15 @@ define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.s, w8
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
+; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #12
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %ret = shufflevector <8 x i32> %op1, <8 x i32> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
@@ -189,6 +258,11 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    insr z1.d, x8
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i64> %op1, <2 x i64> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x i64> %ret
 }
@@ -206,6 +280,15 @@ define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.d, x8
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #8
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %ret = shufflevector <4 x i64> %op1, <4 x i64> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -223,6 +306,11 @@ define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    insr z0.h, h2
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #6
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x half> %op1, <4 x half> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x half> %ret
 }
@@ -236,6 +324,11 @@ define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    insr z0.h, h2
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x half> %op1, <8 x half> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x half> %ret
 }
@@ -251,6 +344,15 @@ define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.h, h2
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #14
+; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #14
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %ret = shufflevector <16 x half> %op1, <16 x half> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
@@ -268,6 +370,11 @@ define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2)
 ; CHECK-NEXT:    insr z0.s, s2
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x float> %op1, <2 x float> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %ret
 }
@@ -281,6 +388,11 @@ define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2)
 ; CHECK-NEXT:    insr z0.s, s2
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x float> %ret
 }
@@ -296,6 +408,15 @@ define void @shuffle_ext_byone_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.s, s2
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
+; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #12
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %ret = shufflevector <8 x float> %op1, <8 x float> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
@@ -312,6 +433,11 @@ define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op
 ; CHECK-NEXT:    insr z0.d, d2
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x double> %op1, <2 x double> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x double> %ret
 }
@@ -327,6 +453,15 @@ define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.d, d2
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v2.16b, #8
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -345,6 +480,15 @@ define void @shuffle_ext_byone_reverse(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.d, d2
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_reverse:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v1.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    ext v0.16b, v0.16b, v2.16b, #8
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
@@ -359,6 +503,13 @@ define void @shuffle_ext_invalid(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_invalid:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
index 337a2134de5b..42f3f03a5ea0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -11,6 +12,11 @@ define fp128 @test_streaming_compatible_register_mov(fp128 %q0, fp128 %q1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_streaming_compatible_register_mov:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   ret fp128 %q1
 }
 
@@ -20,6 +26,11 @@ define double @fp_zero_constant() {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, xzr
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fp_zero_constant:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov d0, xzr
+; NONEON-NOSVE-NEXT:    ret
   ret double 0.0
 }
 
@@ -29,6 +40,11 @@ define <2 x i64> @fixed_vec_zero_constant() {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fixed_vec_zero_constant:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    ret
   ret <2 x i64> zeroinitializer
 }
 
@@ -38,5 +54,10 @@ define <2 x double> @fixed_vec_fp_zero_constant() {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fixed_vec_fp_zero_constant:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    movi v0.2d, #0000000000000000
+; NONEON-NOSVE-NEXT:    ret
   ret <2 x double> <double 0.0, double 0.0>
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index 9920bc6048e8..478f4a689d3c 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -9,7 +9,7 @@ define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_n
 ; CHECK-NEXT:    uzp1 z0.d, z2.d, z1.d
 ; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
+  %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
   ret {<vscale x 2 x half>, <vscale x 2 x half>}   %retval
 }
 
@@ -21,7 +21,7 @@ define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_n
 ; CHECK-NEXT:    uzp1 z0.s, z2.s, z1.s
 ; CHECK-NEXT:    uzp2 z1.s, z2.s, z1.s
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
+  %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
   ret {<vscale x 4 x half>, <vscale x 4 x half>}   %retval
 }
 
@@ -32,7 +32,7 @@ define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_n
 ; CHECK-NEXT:    uzp2 z1.h, z0.h, z1.h
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
+  %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
   ret {<vscale x 8 x half>, <vscale x 8 x half>}   %retval
 }
 
@@ -44,7 +44,7 @@ define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32
 ; CHECK-NEXT:    uzp1 z0.d, z2.d, z1.d
 ; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
+  %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
   ret {<vscale x 2 x float>, <vscale x 2 x float>}   %retval
 }
 
@@ -55,7 +55,7 @@ define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32
 ; CHECK-NEXT:    uzp2 z1.s, z0.s, z1.s
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
+  %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
 ret  {<vscale x 4 x float>, <vscale x 4 x float>}   %retval
 }
 
@@ -66,7 +66,7 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f
 ; CHECK-NEXT:    uzp2 z1.d, z0.d, z1.d
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
+  %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
   ret {<vscale x 2 x double>, <vscale x 2 x double>}   %retval
 }
 
@@ -79,7 +79,7 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv
 ; CHECK-NEXT:    uzp2 z1.b, z0.b, z1.b
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
+  %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
   ret {<vscale x 16 x i8>, <vscale x 16 x i8>}   %retval
 }
 
@@ -90,7 +90,7 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv
 ; CHECK-NEXT:    uzp2 z1.h, z0.h, z1.h
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+  %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
   ret {<vscale x 8 x i16>, <vscale x 8 x i16>}   %retval
 }
 
@@ -101,7 +101,7 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv
 ; CHECK-NEXT:    uzp2 z1.s, z0.s, z1.s
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
+  %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
   ret {<vscale x 4 x i32>, <vscale x 4 x i32>}   %retval
 }
 
@@ -112,7 +112,7 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
 ; CHECK-NEXT:    uzp2 z1.d, z0.d, z1.d
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
+  %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>}   %retval
 }
 
@@ -124,7 +124,7 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
 ; CHECK-NEXT:    uzp2 p1.b, p0.b, p1.b
 ; CHECK-NEXT:    mov p0.b, p2.b
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
+  %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
   ret {<vscale x 16 x i1>, <vscale x 16 x i1>}   %retval
 }
 
@@ -136,7 +136,7 @@ define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i
 ; CHECK-NEXT:    uzp1 p0.h, p2.h, p1.h
 ; CHECK-NEXT:    uzp2 p1.h, p2.h, p1.h
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.experimental.vector.deinterleave2.nxv16i1(<vscale x 16 x i1> %vec)
+  %retval = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1> %vec)
   ret {<vscale x 8 x i1>, <vscale x 8 x i1>}   %retval
 }
 
@@ -148,7 +148,7 @@ define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1
 ; CHECK-NEXT:    uzp1 p0.s, p2.s, p1.s
 ; CHECK-NEXT:    uzp2 p1.s, p2.s, p1.s
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.experimental.vector.deinterleave2.nxv8i1(<vscale x 8 x i1> %vec)
+  %retval = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1> %vec)
   ret {<vscale x 4 x i1>, <vscale x 4 x i1>}   %retval
 }
 
@@ -160,7 +160,7 @@ define {<vscale x 2 x i1>, <vscale x 2 x i1>} @vector_deinterleave_nxv2i1_nxv4i1
 ; CHECK-NEXT:    uzp1 p0.d, p2.d, p1.d
 ; CHECK-NEXT:    uzp2 p1.d, p2.d, p1.d
 ; CHECK-NEXT:    ret
-  %retval = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.experimental.vector.deinterleave2.nxv4i1(<vscale x 4 x i1> %vec)
+  %retval = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1> %vec)
   ret {<vscale x 2 x i1>, <vscale x 2 x i1>}   %retval
 }
 
@@ -178,7 +178,7 @@ define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv
 ; CHECK-NEXT:    mov z1.d, z4.d
 ; CHECK-NEXT:    mov z2.d, z6.d
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
+%retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
 ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
 }
 
@@ -201,7 +201,7 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>}  @vector_deinterleave_nxv8i64_nx
 ; CHECK-NEXT:    mov z5.d, z29.d
 ; CHECK-NEXT:    mov z6.d, z30.d
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
+%retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
 ret {<vscale x 8 x i64>, <vscale x 8 x i64>}  %retval
 }
 
@@ -216,7 +216,7 @@ define {<vscale x 8 x i8>, <vscale x 8 x i8>} @vector_deinterleave_nxv8i8_nxv16i
 ; CHECK-NEXT:    uzp1 z0.h, z2.h, z1.h
 ; CHECK-NEXT:    uzp2 z1.h, z2.h, z1.h
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %vec)
+%retval = call {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %vec)
 ret {<vscale x 8 x i8>, <vscale x 8 x i8>} %retval
 }
 
@@ -228,7 +228,7 @@ define {<vscale x 4 x i16>, <vscale x 4 x i16>} @vector_deinterleave_nxv4i16_nxv
 ; CHECK-NEXT:    uzp1 z0.s, z2.s, z1.s
 ; CHECK-NEXT:    uzp2 z1.s, z2.s, z1.s
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %vec)
+%retval = call {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %vec)
 ret {<vscale x 4 x i16>, <vscale x 4 x i16>} %retval
 }
 
@@ -240,35 +240,35 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_nxv2i32_nxv
 ; CHECK-NEXT:    uzp1 z0.d, z2.d, z1.d
 ; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %vec)
+%retval = call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %vec)
 ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %retval
 }
 
 
 ; Floating declarations
-declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
 
 ; Integer declarations
-declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
 
 ; Predicated declarations
-declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
-declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.experimental.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>)
-declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.experimental.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>)
-declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.experimental.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>)
+declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
+declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>)
+declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>)
+declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>)
 
 ; Illegal size type
-declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
+declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
 
-declare {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
-declare {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
-declare {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
+declare {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
+declare {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
+declare {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index 23bf5065286e..e2c3b0abe21a 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -8,7 +8,7 @@ define <vscale x 4 x half> @interleave2_nxv4f16(<vscale x 2 x half> %vec0, <vsca
 ; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1)
+  %retval = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1)
   ret  <vscale x 4 x half>   %retval
 }
 
@@ -19,7 +19,7 @@ define <vscale x 8 x half> @interleave2_nxv8f16(<vscale x 4 x half> %vec0, <vsca
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1)
+  %retval = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1)
   ret  <vscale x 8 x half>   %retval
 }
 
@@ -30,7 +30,7 @@ define <vscale x 16 x half> @interleave2_nxv16f16(<vscale x 8 x half> %vec0, <vs
 ; CHECK-NEXT:    zip2 z1.h, z0.h, z1.h
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1)
+  %retval = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1)
   ret  <vscale x 16 x half>   %retval
 }
 
@@ -41,7 +41,7 @@ define <vscale x 4 x float> @interleave2_nxv4f32(<vscale x 2 x float> %vec0, <vs
 ; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1)
+  %retval = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1)
   ret  <vscale x 4 x float>   %retval
 }
 
@@ -52,7 +52,7 @@ define <vscale x 8 x float> @interleave2_nxv8f32(<vscale x 4 x float> %vec0, <vs
 ; CHECK-NEXT:    zip2 z1.s, z0.s, z1.s
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1)
+  %retval = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1)
   ret  <vscale x 8 x float>   %retval
 }
 
@@ -63,7 +63,7 @@ define <vscale x 4 x double> @interleave2_nxv4f64(<vscale x 2 x double> %vec0, <
 ; CHECK-NEXT:    zip2 z1.d, z0.d, z1.d
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 4 x double>@llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1)
+  %retval = call <vscale x 4 x double>@llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1)
   ret  <vscale x 4 x double>   %retval
 }
 
@@ -76,7 +76,7 @@ define <vscale x 32 x i8> @interleave2_nxv32i8(<vscale x 16 x i8> %vec0, <vscale
 ; CHECK-NEXT:    zip2 z1.b, z0.b, z1.b
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-   %retval = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1)
+   %retval = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1)
   ret  <vscale x 32 x i8>   %retval
 }
 
@@ -87,7 +87,7 @@ define <vscale x 16 x i16> @interleave2_nxv16i16(<vscale x 8 x i16> %vec0, <vsca
 ; CHECK-NEXT:    zip2 z1.h, z0.h, z1.h
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1)
+  %retval = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1)
   ret  <vscale x 16 x i16>   %retval
 }
 
@@ -98,7 +98,7 @@ define <vscale x 8 x i32> @interleave2_nxv8i32(<vscale x 4 x i32> %vec0, <vscale
 ; CHECK-NEXT:    zip2 z1.s, z0.s, z1.s
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1)
+  %retval = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1)
   ret  <vscale x 8 x i32>   %retval
 }
 
@@ -109,7 +109,7 @@ define <vscale x 4 x i64> @interleave2_nxv4i64(<vscale x 2 x i64> %vec0, <vscale
 ; CHECK-NEXT:    zip2 z1.d, z0.d, z1.d
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1)
+  %retval = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1)
   ret  <vscale x 4 x i64>   %retval
 }
 
@@ -122,7 +122,7 @@ define <vscale x 32 x i1> @interleave2_nxv32i1(<vscale x 16 x i1> %vec0, <vscale
 ; CHECK-NEXT:    zip2 p1.b, p0.b, p1.b
 ; CHECK-NEXT:    mov p0.b, p2.b
 ; CHECK-NEXT:    ret
-   %retval = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1)
+   %retval = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1)
   ret  <vscale x 32 x i1>   %retval
 }
 
@@ -133,7 +133,7 @@ define <vscale x 16 x i1> @interleave2_nxv16i1(<vscale x 8 x i1> %vec0, <vscale
 ; CHECK-NEXT:    zip1 p0.h, p0.h, p1.h
 ; CHECK-NEXT:    uzp1 p0.b, p0.b, p2.b
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 16 x i1> @llvm.experimental.vector.interleave2.nxv16i1(<vscale x 8 x i1> %vec0, <vscale x 8 x i1> %vec1)
+  %retval = call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %vec0, <vscale x 8 x i1> %vec1)
   ret  <vscale x 16 x i1>   %retval
 }
 
@@ -144,7 +144,7 @@ define <vscale x 8 x i1> @interleave2_nxv8i1(<vscale x 4 x i1> %vec0, <vscale x
 ; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
 ; CHECK-NEXT:    uzp1 p0.h, p0.h, p2.h
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1> %vec0, <vscale x 4 x i1> %vec1)
+  %retval = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %vec0, <vscale x 4 x i1> %vec1)
   ret  <vscale x 8 x i1>   %retval
 }
 
@@ -155,7 +155,7 @@ define <vscale x 4 x i1> @interleave2_nxv4i1(<vscale x 2 x i1> %vec0, <vscale x
 ; CHECK-NEXT:    zip1 p0.d, p0.d, p1.d
 ; CHECK-NEXT:    uzp1 p0.s, p0.s, p2.s
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %vec0, <vscale x 2 x i1> %vec1)
+  %retval = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %vec0, <vscale x 2 x i1> %vec1)
   ret  <vscale x 4 x i1>   %retval
 }
 
@@ -172,7 +172,7 @@ define <vscale x 16 x i32> @interleave2_nxv16i32(<vscale x 8 x i32> %vec0, <vsca
 ; CHECK-NEXT:    mov z1.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z4.d
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 16 x i32>@llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1)
+  %retval = call <vscale x 16 x i32>@llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1)
   ret <vscale x 16 x i32> %retval
 }
 
@@ -187,7 +187,7 @@ define <vscale x 8 x i64> @interleave2_nxv8i64(<vscale x 4 x i64> %vec0, <vscale
 ; CHECK-NEXT:    mov z1.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z4.d
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1)
+  %retval = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1)
   ret <vscale x 8 x i64> %retval
 }
 
@@ -200,7 +200,7 @@ define <vscale x 16 x i8> @interleave2_nxv8i8(<vscale x 8 x i8> %vec0, <vscale x
 ; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z2.b
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8> %vec0, <vscale x 8 x i8> %vec1)
+  %retval = call <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8> %vec0, <vscale x 8 x i8> %vec1)
   ret <vscale x 16 x i8> %retval
 }
 
@@ -211,7 +211,7 @@ define <vscale x 8 x i16> @interleave2_nxv4i16(<vscale x 4 x i16> %vec0, <vscale
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16> %vec0, <vscale x 4 x i16> %vec1)
+  %retval = call <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16> %vec0, <vscale x 4 x i16> %vec1)
   ret <vscale x 8 x i16> %retval
 }
 
@@ -222,34 +222,34 @@ define <vscale x 4 x i32> @interleave2_nxv2i32(<vscale x 2 x i32> %vec0, <vscale
 ; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32> %vec0, <vscale x 2 x i32> %vec1)
+  %retval = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %vec0, <vscale x 2 x i32> %vec1)
   ret <vscale x 4 x i32> %retval
 }
 
 ; Float declarations
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
 
 ; Integer declarations
-declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
 
 ; Predicated
-declare <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.interleave2.nxv16i1(<vscale x 8 x i1>, <vscale x 8 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1>, <vscale x 4 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
+declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1>, <vscale x 8 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1>, <vscale x 4 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
 
 ; Illegal type size
-declare <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
+declare <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
+declare <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
 
-declare <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8>, <vscale x 8 x i8>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
+declare <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8>, <vscale x 8 x i8>)
+declare <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 23b2622f5f58..ef7d4abe5c5f 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -41,3 +41,55 @@ define <vscale x 4 x i32> @no_bsl_fold(<vscale x 4 x i32> %a, <vscale x 4 x i32>
   %c = or <vscale x 4 x i32> %1, %2
   ret <vscale x 4 x i32> %c
 }
+
+define <vscale x 16 x i8> @nbsl_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: nbsl_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.b, #127 // =0x7f
+; CHECK-NEXT:    nbsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+  %1 = and <vscale x 16 x i8> %a, splat(i8 127)
+  %2 = and <vscale x 16 x i8> %b, splat(i8 -128)
+  %3 = or <vscale x 16 x i8> %1, %2
+  %4 = xor <vscale x 16 x i8> %3, splat(i8 -1)
+  ret <vscale x 16 x i8> %4
+}
+
+define <vscale x 8 x i16> @nbsl_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: nbsl_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.h, #32767 // =0x7fff
+; CHECK-NEXT:    nbsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+  %1 = and <vscale x 8 x i16> %a, splat(i16 32767)
+  %2 = and <vscale x 8 x i16> %b, splat(i16 -32768)
+  %3 = or <vscale x 8 x i16> %1, %2
+  %4 = xor <vscale x 8 x i16> %3, splat(i16 -1)
+  ret <vscale x 8 x i16> %4
+}
+
+define <vscale x 4 x i32> @nbsl_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: nbsl_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.s, #0x7fffffff
+; CHECK-NEXT:    nbsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+  %1 = and <vscale x 4 x i32> %a, splat(i32 2147483647)
+  %2 = and <vscale x 4 x i32> %b, splat(i32 -2147483648)
+  %3 = or <vscale x 4 x i32> %1, %2
+  %4 = xor <vscale x 4 x i32> %3, splat(i32 -1)
+  ret <vscale x 4 x i32> %4
+}
+
+define <vscale x 2 x i64> @nbsl_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: nbsl_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT:    nbsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT:    ret
+  %1 = and <vscale x 2 x i64> %a, splat(i64 9223372036854775807)
+  %2 = and <vscale x 2 x i64> %b, splat(i64 -9223372036854775808)
+  %3 = or <vscale x 2 x i64> %1, %2
+  %4 = xor <vscale x 2 x i64> %3, splat(i64 -1)
+  ret <vscale x 2 x i64> %4
+}
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll
index cb74cd8032ab..5f7476397891 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll
@@ -16,7 +16,7 @@ define <vscale x 16 x i1> @whilege_b_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilele.nxv16i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -28,7 +28,7 @@ define <vscale x 16 x i1> @whilege_b_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilele.nxv16i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -40,7 +40,7 @@ define <vscale x 8 x i1> @whilege_h_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.h, p0.h
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilele.nxv8i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -52,7 +52,7 @@ define <vscale x 8 x i1> @whilege_h_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.h, p0.h
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilele.nxv8i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -64,7 +64,7 @@ define <vscale x 4 x i1> @whilege_s_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilele.nxv4i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -76,7 +76,7 @@ define <vscale x 4 x i1> @whilege_s_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilele.nxv4i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -88,7 +88,7 @@ define <vscale x 2 x i1> @whilege_d_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilele.nxv2i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -100,7 +100,7 @@ define <vscale x 2 x i1> @whilege_d_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilele.nxv2i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -116,7 +116,7 @@ define <vscale x 16 x i1> @whilehs_b_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilels.nxv16i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -128,7 +128,7 @@ define <vscale x 16 x i1> @whilehs_b_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilels.nxv16i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -140,7 +140,7 @@ define <vscale x 8 x i1> @whilehs_h_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.h, p0.h
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilels.nxv8i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -152,7 +152,7 @@ define <vscale x 8 x i1> @whilehs_h_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.h, p0.h
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilels.nxv8i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -164,7 +164,7 @@ define <vscale x 4 x i1> @whilehs_s_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilels.nxv4i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -176,7 +176,7 @@ define <vscale x 4 x i1> @whilehs_s_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilels.nxv4i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -188,7 +188,7 @@ define <vscale x 2 x i1> @whilehs_d_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilels.nxv2i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -200,7 +200,7 @@ define <vscale x 2 x i1> @whilehs_d_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilels.nxv2i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -214,7 +214,7 @@ define <vscale x 16 x i1> @whilegt_b_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilegt p0.b, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelt.nxv16i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -224,7 +224,7 @@ define <vscale x 16 x i1> @whilegt_b_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilegt p0.b, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelt.nxv16i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -234,7 +234,7 @@ define <vscale x 8 x i1> @whilegt_h_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilegt p0.h, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelt.nxv8i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -244,7 +244,7 @@ define <vscale x 8 x i1> @whilegt_h_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilegt p0.h, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelt.nxv8i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -254,7 +254,7 @@ define <vscale x 4 x i1> @whilegt_s_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilegt p0.s, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -264,7 +264,7 @@ define <vscale x 4 x i1> @whilegt_s_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilegt p0.s, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -274,7 +274,7 @@ define <vscale x 2 x i1> @whilegt_d_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilegt p0.d, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -284,7 +284,7 @@ define <vscale x 2 x i1> @whilegt_d_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilegt p0.d, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -298,7 +298,7 @@ define <vscale x 16 x i1> @whilehi_b_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilehi p0.b, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -308,7 +308,7 @@ define <vscale x 16 x i1> @whilehi_b_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilehi p0.b, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -318,7 +318,7 @@ define <vscale x 8 x i1> @whilehi_h_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilehi p0.h, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -328,7 +328,7 @@ define <vscale x 8 x i1> @whilehi_h_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilehi p0.h, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -338,7 +338,7 @@ define <vscale x 4 x i1> @whilehi_s_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilehi p0.s, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelo.nxv4i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -348,7 +348,7 @@ define <vscale x 4 x i1> @whilehi_s_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilehi p0.s, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelo.nxv4i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -358,7 +358,7 @@ define <vscale x 2 x i1> @whilehi_d_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilehi p0.d, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelo.nxv2i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -368,7 +368,7 @@ define <vscale x 2 x i1> @whilehi_d_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilehi p0.d, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -384,7 +384,7 @@ define <vscale x 16 x i1> @whilele_b_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilege.nxv16i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -396,7 +396,7 @@ define <vscale x 16 x i1> @whilele_b_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilege.nxv16i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -408,7 +408,7 @@ define <vscale x 8 x i1> @whilele_h_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.h, p0.h
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilege.nxv8i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -420,7 +420,7 @@ define <vscale x 8 x i1> @whilele_h_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.h, p0.h
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilege.nxv8i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -432,7 +432,7 @@ define <vscale x 4 x i1> @whilele_s_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilege.nxv4i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -444,7 +444,7 @@ define <vscale x 4 x i1> @whilele_s_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilege.nxv4i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -456,7 +456,7 @@ define <vscale x 2 x i1> @whilele_d_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilege.nxv2i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -468,7 +468,7 @@ define <vscale x 2 x i1> @whilele_d_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilege.nxv2i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -482,7 +482,7 @@ define <vscale x 16 x i1> @whilelo_b_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilelo p0.b, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehi.nxv16i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -492,7 +492,7 @@ define <vscale x 16 x i1> @whilelo_b_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilelo p0.b, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehi.nxv16i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -502,7 +502,7 @@ define <vscale x 8 x i1> @whilelo_h_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilelo p0.h, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilehi.nxv8i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -512,7 +512,7 @@ define <vscale x 8 x i1> @whilelo_h_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilelo p0.h, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilehi.nxv8i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -522,7 +522,7 @@ define <vscale x 4 x i1> @whilelo_s_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilelo p0.s, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilehi.nxv4i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -532,7 +532,7 @@ define <vscale x 4 x i1> @whilelo_s_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilelo p0.s, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilehi.nxv4i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -542,7 +542,7 @@ define <vscale x 2 x i1> @whilelo_d_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilelo p0.d, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehi.nxv2i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -552,7 +552,7 @@ define <vscale x 2 x i1> @whilelo_d_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilelo p0.d, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehi.nxv2i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -568,7 +568,7 @@ define <vscale x 16 x i1> @whilels_b_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehs.nxv16i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -580,7 +580,7 @@ define <vscale x 16 x i1> @whilels_b_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.b, p0.b
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehs.nxv16i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -592,7 +592,7 @@ define <vscale x 8 x i1> @whilels_h_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.h, p0.h
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilehs.nxv8i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -604,7 +604,7 @@ define <vscale x 8 x i1> @whilels_h_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.h, p0.h
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilehs.nxv8i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -616,7 +616,7 @@ define <vscale x 4 x i1> @whilels_s_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilehs.nxv4i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -628,7 +628,7 @@ define <vscale x 4 x i1> @whilels_s_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.s, p0.s
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilehs.nxv4i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -640,7 +640,7 @@ define <vscale x 2 x i1> @whilels_d_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehs.nxv2i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -652,7 +652,7 @@ define <vscale x 2 x i1> @whilels_d_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    rev p0.d, p0.d
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehs.nxv2i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -666,7 +666,7 @@ define <vscale x 16 x i1> @whilelt_b_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilelt p0.b, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilegt.nxv16i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -676,7 +676,7 @@ define <vscale x 16 x i1> @whilelt_b_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilelt p0.b, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilegt.nxv16i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+  %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
   ret <vscale x 16 x i1> %while.rev
 }
 
@@ -686,7 +686,7 @@ define <vscale x 8 x i1> @whilelt_h_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilelt p0.h, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilegt.nxv8i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -696,7 +696,7 @@ define <vscale x 8 x i1> @whilelt_h_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilelt p0.h, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilegt.nxv8i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+  %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
   ret <vscale x 8 x i1> %while.rev
 }
 
@@ -706,7 +706,7 @@ define <vscale x 4 x i1> @whilelt_s_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilelt p0.s, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilegt.nxv4i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -716,7 +716,7 @@ define <vscale x 4 x i1> @whilelt_s_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilelt p0.s, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilegt.nxv4i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+  %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
   ret <vscale x 4 x i1> %while.rev
 }
 
@@ -726,7 +726,7 @@ define <vscale x 2 x i1> @whilelt_d_ww(i32 %a, i32 %b) {
 ; CHECK-NEXT:    whilelt p0.d, w0, w1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilegt.nxv2i1.i32(i32 %b, i32 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
 
@@ -736,6 +736,6 @@ define <vscale x 2 x i1> @whilelt_d_xx(i64 %a, i64 %b) {
 ; CHECK-NEXT:    whilelt p0.d, x0, x1
 ; CHECK-NEXT:    ret
   %while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilegt.nxv2i1.i64(i64 %b, i64 %a)
-  %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+  %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
   ret <vscale x 2 x i1> %while.rev
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 3254c5ebe9c6..ab7cea8dfb77 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -2825,10 +2825,11 @@ entry:
 define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
 ; CHECK-SD-LABEL: add_pair_v2i16_v2i64_zext:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    movi v2.2d, #0x00ffff0000ffff
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-SD-NEXT:    uaddlv d0, v0.4s
 ; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    ret
@@ -3578,10 +3579,11 @@ entry:
 define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-SD-LABEL: add_pair_v2i8_v2i64_zext:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    movi d2, #0x0000ff000000ff
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    movi v2.2d, #0x0000ff000000ff
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-SD-NEXT:    uaddlv d0, v0.4s
 ; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    ret
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index b940dc74839b..eaaeb3dc77a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -16,8 +16,8 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x8
 ; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_movk_i32 s32, 0x400
 ; GCN-NEXT:    s_mov_b32 s33, 0
+; GCN-NEXT:    s_movk_i32 s32, 0x400
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
@@ -87,8 +87,8 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x8
 ; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_movk_i32 s32, 0x1000
 ; GCN-NEXT:    s_mov_b32 s33, 0
+; GCN-NEXT:    s_movk_i32 s32, 0x1000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
 ; GCN-NEXT:    s_cbranch_scc1 .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index b7b2cb22c1b6..9d4f9434aa31 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -142,8 +142,8 @@ attributes #0 = { nounwind }
 
 ; GCN: amdpal.pipelines:
 ; GCN-NEXT:  - .registers:
-; GCN-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
-; GCN-NEXT:      0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
+; GCN-NEXT:      '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
+; GCN-NEXT:      '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
 ; GCN-NEXT:    .shader_functions:
 ; GCN-NEXT:      dynamic_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x10{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
index 98aa04f6d26e..a3fd2a942bc2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
@@ -11,8 +11,8 @@
 ; GCN-NEXT:         .entry_point:    cs_amdpal
 ; GCN-NEXT:         .scratch_memory_size: 0
 ; GCN:     .registers:
-; GCN-NEXT:       0x2e12 (COMPUTE_PGM_RSRC1):
-; GCN-NEXT:       0x2e13 (COMPUTE_PGM_RSRC2):
+; GCN-NEXT:       '0x2e12 (COMPUTE_PGM_RSRC1)':
+; GCN-NEXT:       '0x2e13 (COMPUTE_PGM_RSRC2)':
 ; GCN-NEXT: ...
 ; GCN-NEXT:         .end_amdgpu_pal_metadata
 define amdgpu_cs half @cs_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
index 012b2061756b..679e0858819e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
@@ -10,7 +10,7 @@
 ; GCN-NEXT:         .entry_point:    es_amdpal
 ; GCN-NEXT:         .scratch_memory_size: 0
 ; GCN:     .registers:
-; GCN-NEXT:       0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0
+; GCN-NEXT:       '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0
 ; GCN-NEXT: ...
 ; GCN-NEXT:         .end_amdgpu_pal_metadata
 define amdgpu_es half @es_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
index e2f67398d18a..75f7a1dc266d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
@@ -11,7 +11,7 @@
 ; GCN-NEXT:         .entry_point:    gs_amdpal
 ; GCN-NEXT:         .scratch_memory_size: 0
 ; GCN:     .registers:
-; GCN-NEXT:       0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0
+; GCN-NEXT:       '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0
 ; GCN-NEXT: ...
 ; GCN-NEXT:         .end_amdgpu_pal_metadata
 define amdgpu_gs half @gs_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
index 9ad47c1d604f..c61578a967b6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
@@ -11,7 +11,7 @@
 ; GCN-NEXT:         .entry_point:    hs_amdpal
 ; GCN-NEXT:         .scratch_memory_size: 0
 ; GCN:     .registers:
-; GCN-NEXT:       0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0
+; GCN-NEXT:       '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0
 ; GCN-NEXT: ...
 ; GCN-NEXT:         .end_amdgpu_pal_metadata
 define amdgpu_hs half @hs_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
index 8ee6f7283ce7..8162c824dc2c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
@@ -10,7 +10,7 @@
 ; GCN-NEXT:         .entry_point:    ls_amdpal
 ; GCN-NEXT:         .scratch_memory_size: 0
 ; GCN:     .registers:
-; GCN-NEXT:       0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0
+; GCN-NEXT:       '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0
 ; GCN-NEXT: ...
 ; GCN-NEXT:         .end_amdgpu_pal_metadata
 define amdgpu_ls half @ls_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll
index 0d0c70c38ace..5e21ba494df1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll
@@ -5,7 +5,7 @@
 ; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
 ; GCN-LABEL: {{^}}cs_amdpal:
 ; GCN: .amdgpu_pal_metadata
-; GCN: 0x2e12 (COMPUTE_PGM_RSRC1)
+; GCN: '0x2e12 (COMPUTE_PGM_RSRC1)'
 define amdgpu_cs half @cs_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll
index b82e3ebdde4b..dc9a33ac0141 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll
@@ -3,45 +3,45 @@
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
 
 ; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
-; SI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2f0000{{$}}
-; VI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2f0000{{$}}
+; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2f0000{{$}}
+; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2f0000{{$}}
 define amdgpu_cs half @cs_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
-; SI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2f0000{{$}}
-; VI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2f0000{{$}}
+; SI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2f0000{{$}}
+; VI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2f0000{{$}}
 define amdgpu_es half @es_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
-; SI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2f0000{{$}}
-; VI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2f0000{{$}}
+; SI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2f0000{{$}}
+; VI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2f0000{{$}}
 define amdgpu_gs half @gs_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
-; SI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2f0000{{$}}
-; VI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2f0000{{$}}
+; SI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2f0000{{$}}
+; VI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2f0000{{$}}
 define amdgpu_hs half @hs_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
-; SI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2f0000{{$}}
-; VI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2f0000{{$}}
+; SI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2f0000{{$}}
+; VI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2f0000{{$}}
 define amdgpu_ls half @ls_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
@@ -49,18 +49,18 @@ define amdgpu_ls half @ls_amdpal(half %arg0) {
 
 ; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal metadata
 ; below.
-; SI-DAG:           0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2f0000{{$}}
-; VI-DAG:           0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2f02c0{{$}}
-; GFX9-DAG:         0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2f0000{{$}}
+; SI-DAG:           '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2f0000{{$}}
+; VI-DAG:           '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2f02c0{{$}}
+; GFX9-DAG:         '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2f0000{{$}}
 define amdgpu_ps half @ps_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
-; SI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2f0000{{$}}
-; VI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2f0000{{$}}
+; SI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2f0000{{$}}
+; VI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2f0000{{$}}
 define amdgpu_vs half @vs_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
@@ -75,7 +75,7 @@ define amdgpu_vs half @vs_amdpal(half %arg0) {
 ;       - 0x123456789abcdef0
 ;       - 0xfedcba9876543210
 ;     .registers:
-;       0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+;       '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
 ; ...
 ; 	.end_amdgpu_pal_metadata
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll
index b86b42868005..ffce3ed08509 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll
@@ -3,45 +3,45 @@
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
 
 ; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
-; SI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2c0000{{$}}
-; VI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2c0000{{$}}
+; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2c0000{{$}}
+; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2c0000{{$}}
 define amdgpu_cs half @cs_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
-; SI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2c0000{{$}}
-; VI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2c0000{{$}}
+; SI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2c0000{{$}}
+; VI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2c0000{{$}}
 define amdgpu_es half @es_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
-; SI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2c0000{{$}}
-; VI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2c0000{{$}}
+; SI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2c0000{{$}}
+; VI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2c0000{{$}}
 define amdgpu_gs half @gs_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
-; SI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2c0000{{$}}
-; VI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2c0000{{$}}
+; SI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2c0000{{$}}
+; VI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2c0000{{$}}
 define amdgpu_hs half @hs_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
-; SI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2c0000{{$}}
-; VI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2c0000{{$}}
+; SI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2c0000{{$}}
+; VI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2c0000{{$}}
 define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
@@ -49,18 +49,18 @@ define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
 
 ; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal metadata
 ; below.
-; SI-DAG:           0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2c0000{{$}}
-; VI-DAG:           0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2c02c0{{$}}
-; GFX9-DAG:         0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2c0000{{$}}
+; SI-DAG:           '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2c0000{{$}}
+; VI-DAG:           '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2c02c0{{$}}
+; GFX9-DAG:         '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2c0000{{$}}
 define amdgpu_ps half @ps_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
-; SI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2c0000{{$}}
-; VI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2c0000{{$}}
+; SI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2c0000{{$}}
+; VI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2c0000{{$}}
 define amdgpu_vs half @vs_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
@@ -77,7 +77,7 @@ attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 ;       - 0x123456789abcdef0
 ;       - 0xfedcba9876543210
 ;     .registers:
-;       0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+;       '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
 ; ...
 ; 	.end_amdgpu_pal_metadata
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll
index b1db7aafacab..3ea3064fa743 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll
@@ -3,45 +3,45 @@
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
 
 ; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
-; SI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xf0000{{$}}
-; VI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xf02c0{{$}}
-; GFX9-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xf0000{{$}}
+; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xf0000{{$}}
+; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xf0000{{$}}
 define amdgpu_cs half @cs_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
-; SI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xf0000{{$}}
-; VI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xf02c0{{$}}
-; GFX9-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xf0000{{$}}
+; SI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xf0000{{$}}
+; VI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xf0000{{$}}
 define amdgpu_es half @es_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
-; SI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xf0000{{$}}
-; VI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xf0000{{$}}
+; SI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xf0000{{$}}
+; VI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xf0000{{$}}
 define amdgpu_gs half @gs_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
-; SI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xf0000{{$}}
-; VI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xf0000{{$}}
+; SI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xf0000{{$}}
+; VI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xf0000{{$}}
 define amdgpu_hs half @hs_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
-; SI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xf0000{{$}}
-; VI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xf0000{{$}}
+; SI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xf0000{{$}}
+; VI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xf0000{{$}}
 define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
@@ -49,18 +49,18 @@ define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
 
 ; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal metadata
 ; below.
-; SI-DAG:           0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xf0000{{$}}
-; VI-DAG:           0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xf02c0{{$}}
-; GFX9-DAG:         0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xf0000{{$}}
+; SI-DAG:           '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xf0000{{$}}
+; VI-DAG:           '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xf02c0{{$}}
+; GFX9-DAG:         '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xf0000{{$}}
 define amdgpu_ps half @ps_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
-; SI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xf0000{{$}}
-; VI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xf0000{{$}}
+; SI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xf0000{{$}}
+; VI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xf0000{{$}}
 define amdgpu_vs half @vs_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
@@ -77,7 +77,7 @@ attributes #0 = { "amdgpu-dx10-clamp"="false" }
 ;       - 0x123456789abcdef0
 ;       - 0xfedcba9876543210
 ;     .registers:
-;       0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+;       '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
 ; ...
 ; 	.end_amdgpu_pal_metadata
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll
index f97117f3d909..bcc8da6e1bf4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll
@@ -4,7 +4,7 @@
 ; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
 ; GCN-LABEL: {{^}}es_amdpal:
 ; GCN: .amdgpu_pal_metadata
-; GCN: 0x2cca (SPI_SHADER_PGM_RSRC1_ES)
+; GCN: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)'
 define amdgpu_es half @es_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll
index a32d10390b98..ef4c9cbd5006 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll
@@ -5,7 +5,7 @@
 ; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
 ; GCN-LABEL: {{^}}gs_amdpal:
 ; GCN: .amdgpu_pal_metadata
-; GCN: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS)
+; GCN: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)'
 define amdgpu_gs half @gs_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll
index be08c93cdb31..eb814c11bceb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll
@@ -5,7 +5,7 @@
 ; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
 ; GCN-LABEL: {{^}}hs_amdpal:
 ; GCN: .amdgpu_pal_metadata
-; GCN: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS)
+; GCN: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)'
 define amdgpu_hs half @hs_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
index 95d533544c30..d4826a22db79 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
@@ -4,50 +4,50 @@
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 -enable-var-scope %s
 
 ; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
-; SI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf0000{{$}}
-; VI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf0000{{$}}
-; GFX12-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x600f0000{{$}}
+; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}}
+; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x600f0000{{$}}
 define amdgpu_cs half @cs_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
-; SI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xaf0000{{$}}
-; VI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xaf0000{{$}}
-; GFX12-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xf0000{{$}}
+; SI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xaf0000{{$}}
+; VI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xf0000{{$}}
 define amdgpu_es half @es_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
-; SI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xaf0000{{$}}
-; VI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xa0f0000{{$}}
+; SI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xaf0000{{$}}
+; VI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xa0f0000{{$}}
 define amdgpu_gs half @gs_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
-; SI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xaf0000{{$}}
-; VI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x50f0000{{$}}
+; SI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xaf0000{{$}}
+; VI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x50f0000{{$}}
 define amdgpu_hs half @hs_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
-; SI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xaf0000{{$}}
-; VI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xf0000{{$}}
+; SI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xaf0000{{$}}
+; VI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xf0000{{$}}
 define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
@@ -55,20 +55,20 @@ define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
 
 ; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal metadata
 ; below.
-; SI-DAG:           0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xaf0000{{$}}
-; VI-DAG:           0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xaf02c0{{$}}
-; GFX9-DAG:         0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xaf0000{{$}}
-; GFX12-DAG:        0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x20f0000{{$}}
+; SI-DAG:           '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xaf0000{{$}}
+; VI-DAG:           '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xaf02c0{{$}}
+; GFX9-DAG:         '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xaf0000{{$}}
+; GFX12-DAG:        '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x20f0000{{$}}
 define amdgpu_ps half @ps_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
-; SI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xaf0000{{$}}
-; VI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x80f0000{{$}}
+; SI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xaf0000{{$}}
+; VI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x80f0000{{$}}
 define amdgpu_vs half @vs_amdpal(half %arg0) #0 {
   %add = fadd half %arg0, 1.0
   ret half %add
@@ -85,7 +85,7 @@ attributes #0 = { "amdgpu-ieee"="true" }
 ;       - 0x123456789abcdef0
 ;       - 0xfedcba9876543210
 ;     .registers:
-;       0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+;       '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
 ; ...
 ; 	.end_amdgpu_pal_metadata
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll
index 46097fa20608..0d81e70b2e4f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll
@@ -4,7 +4,7 @@
 ; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
 ; GCN-LABEL: {{^}}ls_amdpal:
 ; GCN: .amdgpu_pal_metadata
-; GCN: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS)
+; GCN: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)'
 define amdgpu_ls half @ls_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll
index 9169c651f129..d31732f995b1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll
@@ -12,8 +12,8 @@
 ; GCN-NEXT:      - 0x123456789abcdef0
 ; GCN-NEXT:      - 0xfedcba9876543210
 ; GCN:         .registers:
-; GCN:           0x2c0a (SPI_SHADER_PGM_RSRC1_PS):
-; GCN:           0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42
+; GCN:           '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)':
+; GCN:           '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42
 define amdgpu_ps half @ps_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
@@ -23,12 +23,12 @@ define amdgpu_ps half @ps_amdpal(half %arg0) {
 ;
 ; 	.amdgpu_pal_metadata
 ; ---
-; amdpal.pipelines: 
+; amdpal.pipelines:
 ;   - .internal_pipeline_hash:
 ;       - 0x123456789abcdef0
 ;       - 0xfedcba9876543210
 ;     .registers:
-;       0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+;       '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
 ; ...
 ; 	.end_amdgpu_pal_metadata
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll
index d6322e2b4d3e..15b1a652077e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll
@@ -7,8 +7,8 @@
 ; the workaround that ensures that an interpolation mode is also set in PSEnable.
 ; GCN-LABEL: {{^}}amdpal_psenable:
 ; GCN: .amdgpu_pal_metadata
-; GCN: 0xa1b3 (SPI_PS_INPUT_ENA): 0x2
-; GCN: 0xa1b4 (SPI_PS_INPUT_ADDR): 0x2
+; GCN: '0xa1b3 (SPI_PS_INPUT_ENA)': 0x2
+; GCN: '0xa1b4 (SPI_PS_INPUT_ADDR)': 0x2
 define amdgpu_ps void @amdpal_psenable(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <2 x float> %pos) #6 {
   %inst23 = extractelement <2 x float> %pos, i32 0
   %inst24 = extractelement <2 x float> %pos, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll
index 7c47129c28ce..42de6007f7e2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll
@@ -5,7 +5,7 @@
 ; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
 ; GCN-LABEL: {{^}}vs_amdpal:
 ; GCN: .amdgpu_pal_metadata
-; GCN: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS)
+; GCN: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)'
 define amdgpu_vs half @vs_amdpal(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
index 13d2050c491f..ace21207a7eb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
@@ -14,10 +14,10 @@
 ; GCN-NEXT:         .entry_point:    amdpal_psenable
 ; GCN-NEXT:         .scratch_memory_size: 0
 ; GCN:     .registers:
-; GCN-NEXT:       0x2c0a (SPI_SHADER_PGM_RSRC1_PS):
-; GCN-NEXT:       0x2c0b (SPI_SHADER_PGM_RSRC2_PS):
-; GCN-NEXT:       0xa1b3 (SPI_PS_INPUT_ENA): 0x2
-; GCN-NEXT:       0xa1b4 (SPI_PS_INPUT_ADDR): 0x2
+; GCN-NEXT:       '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)':
+; GCN-NEXT:       '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)':
+; GCN-NEXT:       '0xa1b3 (SPI_PS_INPUT_ENA)': 0x2
+; GCN-NEXT:       '0xa1b4 (SPI_PS_INPUT_ADDR)': 0x2
 ; GCN-NEXT: ...
 ; GCN-NEXT:         .end_amdgpu_pal_metadata
 define amdgpu_ps void @amdpal_psenable(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <2 x float> %pos) #6 {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll b/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll
index 52a9d57244c2..086a126b1ddc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
 
 ; We want to make sure that RSRC2 is left untouched
-; GCN:       0x2e13 (COMPUTE_PGM_RSRC2): 0x78a
+; GCN:       '0x2e13 (COMPUTE_PGM_RSRC2)': 0x78a
 define amdgpu_cs half @cs_amdpal(half %arg0, half inreg %arg1) {
   %add = fadd half %arg0, 1.0
   ret half %add
@@ -9,4 +9,4 @@ define amdgpu_cs half @cs_amdpal(half %arg0, half inreg %arg1) {
 
 !amdgpu.pal.metadata.msgpack = !{!0}
 
-!0 = !{!"\82\B0amdpal.pipelines\91\89\A4.api\A6Vulkan\B0.hardware_stages\81\A3.cs\83\AB.sgpr_limith\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E83\B3\C2\D1)\7FG\CF[\8A\DF\EE[\7FD,\AA.registers\8A\CD.\07\01\CD.\08\01\CD.\09\01\CD.\12\CE@,\00\00\CD.\13\CD\07\8A\CD.(\00\CD.*\CE\16\0B\22Y\CD.@\CE\10\00\00\00\CD.B\CE\10\00\00\06\CD.D\00\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\D3s\A6\8D\C5x\84\D4\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E5\A0\EB\F9}\C6\C1\13\CF\1A_\E7\F7\F2.mR\AD.llpc_version\A454.5\AEamdpal.version\92\02\03"}
-\ No newline at end of file
+!0 = !{!"\82\B0amdpal.pipelines\91\89\A4.api\A6Vulkan\B0.hardware_stages\81\A3.cs\83\AB.sgpr_limith\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E83\B3\C2\D1)\7FG\CF[\8A\DF\EE[\7FD,\AA.registers\8A\CD.\07\01\CD.\08\01\CD.\09\01\CD.\12\CE@,\00\00\CD.\13\CD\07\8A\CD.(\00\CD.*\CE\16\0B\22Y\CD.@\CE\10\00\00\00\CD.B\CE\10\00\00\06\CD.D\00\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\D3s\A6\8D\C5x\84\D4\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E5\A0\EB\F9}\C6\C1\13\CF\1A_\E7\F7\F2.mR\AD.llpc_version\A454.5\AEamdpal.version\92\02\03"}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
index ec8f698d69c2..c300ba187740 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
@@ -11,7 +11,7 @@
 ; GCN-NEXT:         .entry_point:    vs_amdpal
 ; GCN-NEXT:         .scratch_memory_size: 0
 ; GCN:     .registers:
-; GCN-NEXT:       0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0
+; GCN-NEXT:       '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0
 ; GCN-NEXT: ...
 ; GCN-NEXT:         .end_amdgpu_pal_metadata
 define amdgpu_vs half @vs_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
index bfd18f1b52a5..a3b6c283512f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
@@ -152,3 +152,57 @@ define ptr addrspace(3) @atomic_load_monotonic_p3i8_offset(ptr addrspace(3) %ptr
   %load = load atomic ptr addrspace(3), ptr addrspace(3) %gep monotonic, align 4
   ret ptr addrspace(3) %load
 }
+
+; GCN-LABEL: {{^}}atomic_load_monotonic_f16:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_read_u16 v0, v0{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define i16 @atomic_load_monotonic_f16(ptr addrspace(3) %ptr) {
+  %load = load atomic half, ptr addrspace(3) %ptr monotonic, align 2
+  %ret = bitcast half %load to i16
+  ret i16 %ret
+}
+
+; GCN-LABEL: {{^}}atomic_load_monotonic_f16_offset:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define i16 @atomic_load_monotonic_f16_offset(ptr addrspace(3) %ptr) {
+  %gep = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16
+  %load = load atomic half, ptr addrspace(3) %gep monotonic, align 2
+  %ret = bitcast half %load to i16
+  ret i16 %ret
+}
+
+; GCN-LABEL: {{^}}atomic_load_monotonic_bf16:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_read_u16 v0, v0{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define i16 @atomic_load_monotonic_bf16(ptr addrspace(3) %ptr) {
+  %load = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2
+  %ret = bitcast bfloat %load to i16
+  ret i16 %ret
+}
+
+; GCN-LABEL: {{^}}atomic_load_monotonic_bf16_offset:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define i16 @atomic_load_monotonic_bf16_offset(ptr addrspace(3) %ptr) {
+  %gep = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16
+  %load = load atomic bfloat, ptr addrspace(3) %gep monotonic, align 2
+  %ret = bitcast bfloat %load to i16
+  ret i16 %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
index 71e24c1692c7..cd1e1fb1add4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
@@ -101,3 +101,56 @@ define void @atomic_store_monotonic_offset_i64(ptr addrspace(3) %ptr, i64 %val)
   ret void
 }
 
+; GCN-LABEL: {{^}}atomic_store_monotonic_f16:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_write_b16 v0, v1{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @atomic_store_monotonic_f16(ptr addrspace(3) %ptr, i16 %arg.val) {
+  %val = bitcast i16 %arg.val to half
+  store atomic half %val, ptr addrspace(3) %ptr monotonic, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_monotonic_offset_f16:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @atomic_store_monotonic_offset_f16(ptr addrspace(3) %ptr, i16 %arg.val) {
+  %val = bitcast i16 %arg.val to half
+  %gep = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16
+  store atomic half %val, ptr addrspace(3) %gep monotonic, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_monotonic_bf16:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_write_b16 v0, v1{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @atomic_store_monotonic_bf16(ptr addrspace(3) %ptr, i16 %arg.val) {
+  %val = bitcast i16 %arg.val to bfloat
+  store atomic bfloat %val, ptr addrspace(3) %ptr monotonic, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_monotonic_offset_bf16:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @atomic_store_monotonic_offset_bf16(ptr addrspace(3) %ptr, i16 %arg.val) {
+  %val = bitcast i16 %arg.val to bfloat
+  %gep = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16
+  store atomic bfloat %val, ptr addrspace(3) %gep monotonic, align 2
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index c674aebabcc8..8e773cad3b33 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -321,8 +321,8 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX803-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX803-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX803-NEXT:    s_mov_b32 s32, 0
 ; GFX803-NEXT:    s_mov_b32 s33, 0
+; GFX803-NEXT:    s_mov_b32 s32, 0
 ; GFX803-NEXT:    s_getpc_b64 s[16:17]
 ; GFX803-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
 ; GFX803-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
@@ -340,8 +340,8 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX900-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX900-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX900-NEXT:    s_mov_b32 s32, 0
 ; GFX900-NEXT:    s_mov_b32 s33, 0
+; GFX900-NEXT:    s_mov_b32 s32, 0
 ; GFX900-NEXT:    s_getpc_b64 s[16:17]
 ; GFX900-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
 ; GFX900-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
@@ -351,8 +351,8 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX1010-LABEL: test_force_fp_kern_call:
 ; GFX1010:       ; %bb.0: ; %entry
 ; GFX1010-NEXT:    s_add_u32 s10, s10, s15
-; GFX1010-NEXT:    s_mov_b32 s32, 0
 ; GFX1010-NEXT:    s_mov_b32 s33, 0
+; GFX1010-NEXT:    s_mov_b32 s32, 0
 ; GFX1010-NEXT:    s_addc_u32 s11, s11, 0
 ; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
 ; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
@@ -378,16 +378,16 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX1100-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX1100-NEXT:    s_mov_b32 s13, s14
 ; GFX1100-NEXT:    s_mov_b32 s14, s15
-; GFX1100-NEXT:    s_mov_b32 s32, 0
 ; GFX1100-NEXT:    s_mov_b32 s33, 0
+; GFX1100-NEXT:    s_mov_b32 s32, 0
 ; GFX1100-NEXT:    s_getpc_b64 s[6:7]
 ; GFX1100-NEXT:    s_add_u32 s6, s6, ex@rel32@lo+4
 ; GFX1100-NEXT:    s_addc_u32 s7, s7, ex@rel32@hi+12
 ; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1100-NEXT:    s_endpgm
 ; GFX1010-NEXT    s_add_u32 s12, s12, s17
-; GFX1010-NEXT    s_mov_b32 s32, 0
 ; GFX1010-NEXT    s_mov_b32 s33, 0
+; GFX1010-NEXT    s_mov_b32 s32, 0
 ; GFX1010-NEXT    s_addc_u32 s13, s13, 0
 ; GFX1010-NEXT    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
 ; GFX1010-NEXT    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
@@ -459,8 +459,8 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
 ; GFX1010:       ; %bb.0: ; %entry
 ; GFX1010-NEXT:    s_add_u32 s10, s10, s15
-; GFX1010-NEXT:    s_movk_i32 s32, 0x200
 ; GFX1010-NEXT:    s_mov_b32 s33, 0
+; GFX1010-NEXT:    s_movk_i32 s32, 0x200
 ; GFX1010-NEXT:    s_addc_u32 s11, s11, 0
 ; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
 ; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index cf99b5d80e13..b2f9bf89d9ec 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -282,21 +282,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v16
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v14
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v9, vcc, v9, v4
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v13, vcc, v10, v4, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v13, vcc, v10, v4, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v1
+; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v1, v5, v1
@@ -312,21 +312,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v3
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v8, v3, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v8, vcc, v8, v5, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v7, v3, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v2, v5, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v2
+; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v7
+; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v7
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v5, v5, v6
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v3, v3, v4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
@@ -339,18 +339,26 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v14
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-O0-NEXT:    v_or_b32_e64 v3, v8, v7
@@ -403,7 +411,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v10, s[12:13]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
@@ -439,7 +448,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[8:9]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
@@ -690,10 +700,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
@@ -903,14 +913,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(9)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -1028,10 +1038,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
index e376c3df1ac9..96ec90b1f4d0 100644
--- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
@@ -5,12 +5,12 @@
 
 ; Check EXTRA_LDS_SIZE in SPI_SHADER_PGM_RSRC2_PS.
 
-; GFX10-PAL: 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x800
+; GFX10-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x800
 
 ; GFX10-MESA: .long 45100
 ; GFX10-MESA-NEXT: .long 2048
 
-; GFX11-PAL: 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x400
+; GFX11-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x400
 
 ; GFX11-MESA: .long 45100
 ; GFX11-MESA-NEXT: .long 1024
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index 06ba60518adc..e44572985e6d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -6741,6 +6741,81 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) {
+; GCN1-LABEL: atomic_store_bf16_offset:
+; GCN1:       ; %bb.0:
+; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
+; GCN1-NEXT:    s_load_dword s0, s[0:1], 0x9
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    v_mov_b32_e32 v0, s2
+; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    v_mov_b32_e32 v2, s0
+; GCN1-NEXT:    flat_store_short v[0:1], v2
+; GCN1-NEXT:    s_endpgm
+;
+; GCN2-LABEL: atomic_store_bf16_offset:
+; GCN2:       ; %bb.0:
+; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GCN2-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    v_mov_b32_e32 v0, s2
+; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    v_mov_b32_e32 v2, s0
+; GCN2-NEXT:    flat_store_short v[0:1], v2
+; GCN2-NEXT:    s_endpgm
+;
+; GCN3-LABEL: atomic_store_bf16_offset:
+; GCN3:       ; %bb.0:
+; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GCN3-NEXT:    s_load_dword s4, s[0:1], 0x24
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v2, s4
+; GCN3-NEXT:    flat_store_short v[0:1], v2
+; GCN3-NEXT:    s_endpgm
+  %gep = getelementptr bfloat, ptr %out, i64 8
+  store atomic bfloat %in, ptr %out seq_cst, align 2
+  ret void
+}
+
+define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) {
+; GCN1-LABEL: atomic_store_bf16:
+; GCN1:       ; %bb.0:
+; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
+; GCN1-NEXT:    s_load_dword s0, s[0:1], 0x9
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    v_mov_b32_e32 v0, s2
+; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    v_mov_b32_e32 v2, s0
+; GCN1-NEXT:    flat_store_short v[0:1], v2
+; GCN1-NEXT:    s_endpgm
+;
+; GCN2-LABEL: atomic_store_bf16:
+; GCN2:       ; %bb.0:
+; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GCN2-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    v_mov_b32_e32 v0, s2
+; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    v_mov_b32_e32 v2, s0
+; GCN2-NEXT:    flat_store_short v[0:1], v2
+; GCN2-NEXT:    s_endpgm
+;
+; GCN3-LABEL: atomic_store_bf16:
+; GCN3:       ; %bb.0:
+; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GCN3-NEXT:    s_load_dword s4, s[0:1], 0x24
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v2, s4
+; GCN3-NEXT:    flat_store_short v[0:1], v2
+; GCN3-NEXT:    s_endpgm
+  store atomic bfloat %in, ptr %out seq_cst, align 2
+  ret void
+}
+
 define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_inc_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
@@ -7868,3 +7943,201 @@ entry:
   store i32 %val, ptr %out2
   ret void
 }
+
+define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
+; GCN1-LABEL: atomic_load_f16_offset:
+; GCN1:       ; %bb.0:
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    s_add_u32 s0, s0, 16
+; GCN1-NEXT:    s_addc_u32 s1, s1, 0
+; GCN1-NEXT:    v_mov_b32_e32 v0, s0
+; GCN1-NEXT:    v_mov_b32_e32 v1, s1
+; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    v_mov_b32_e32 v0, s2
+; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    flat_store_short v[0:1], v2
+; GCN1-NEXT:    s_endpgm
+;
+; GCN2-LABEL: atomic_load_f16_offset:
+; GCN2:       ; %bb.0:
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    s_add_u32 s0, s0, 16
+; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    v_mov_b32_e32 v0, s0
+; GCN2-NEXT:    v_mov_b32_e32 v1, s1
+; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    v_mov_b32_e32 v0, s2
+; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    flat_store_short v[0:1], v2
+; GCN2-NEXT:    s_endpgm
+;
+; GCN3-LABEL: atomic_load_f16_offset:
+; GCN3:       ; %bb.0:
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    flat_store_short v[0:1], v2
+; GCN3-NEXT:    s_endpgm
+  %gep = getelementptr half, ptr %in, i64 8
+  %val = load atomic half, ptr %gep  seq_cst, align 2
+  store half %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
+; GCN1-LABEL: atomic_load_f16:
+; GCN1:       ; %bb.0:
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    v_mov_b32_e32 v0, s0
+; GCN1-NEXT:    v_mov_b32_e32 v1, s1
+; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    v_mov_b32_e32 v0, s2
+; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    flat_store_short v[0:1], v2
+; GCN1-NEXT:    s_endpgm
+;
+; GCN2-LABEL: atomic_load_f16:
+; GCN2:       ; %bb.0:
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    v_mov_b32_e32 v0, s0
+; GCN2-NEXT:    v_mov_b32_e32 v1, s1
+; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    v_mov_b32_e32 v0, s2
+; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    flat_store_short v[0:1], v2
+; GCN2-NEXT:    s_endpgm
+;
+; GCN3-LABEL: atomic_load_f16:
+; GCN3:       ; %bb.0:
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    flat_store_short v[0:1], v2
+; GCN3-NEXT:    s_endpgm
+  %val = load atomic half, ptr %in seq_cst, align 2
+  store half %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
+; GCN1-LABEL: atomic_load_bf16_offset:
+; GCN1:       ; %bb.0:
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    s_add_u32 s0, s0, 16
+; GCN1-NEXT:    s_addc_u32 s1, s1, 0
+; GCN1-NEXT:    v_mov_b32_e32 v0, s0
+; GCN1-NEXT:    v_mov_b32_e32 v1, s1
+; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    v_mov_b32_e32 v0, s2
+; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    flat_store_short v[0:1], v2
+; GCN1-NEXT:    s_endpgm
+;
+; GCN2-LABEL: atomic_load_bf16_offset:
+; GCN2:       ; %bb.0:
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    s_add_u32 s0, s0, 16
+; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    v_mov_b32_e32 v0, s0
+; GCN2-NEXT:    v_mov_b32_e32 v1, s1
+; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    v_mov_b32_e32 v0, s2
+; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    flat_store_short v[0:1], v2
+; GCN2-NEXT:    s_endpgm
+;
+; GCN3-LABEL: atomic_load_bf16_offset:
+; GCN3:       ; %bb.0:
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    flat_store_short v[0:1], v2
+; GCN3-NEXT:    s_endpgm
+  %gep = getelementptr bfloat, ptr %in, i64 8
+  %val = load atomic bfloat, ptr %gep  seq_cst, align 2
+  store bfloat %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
+; GCN1-LABEL: atomic_load_bf16:
+; GCN1:       ; %bb.0:
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    v_mov_b32_e32 v0, s0
+; GCN1-NEXT:    v_mov_b32_e32 v1, s1
+; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    v_mov_b32_e32 v0, s2
+; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    flat_store_short v[0:1], v2
+; GCN1-NEXT:    s_endpgm
+;
+; GCN2-LABEL: atomic_load_bf16:
+; GCN2:       ; %bb.0:
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    v_mov_b32_e32 v0, s0
+; GCN2-NEXT:    v_mov_b32_e32 v1, s1
+; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    v_mov_b32_e32 v0, s2
+; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    flat_store_short v[0:1], v2
+; GCN2-NEXT:    s_endpgm
+;
+; GCN3-LABEL: atomic_load_bf16:
+; GCN3:       ; %bb.0:
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    flat_store_short v[0:1], v2
+; GCN3-NEXT:    s_endpgm
+  %val = load atomic bfloat, ptr %in seq_cst, align 2
+  store bfloat %val, ptr %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll
new file mode 100644
index 000000000000..2ccc0337b8ae
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll
@@ -0,0 +1,481 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
+
+define void @scalar(float %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: scalar:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v3, v2
+; CHECK-NEXT:    v_mov_b32_e32 v2, v1
+; CHECK-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v1, v1, v0, s4
+; CHECK-NEXT:    v_or_b32_e32 v4, 0x400000, v0
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
+; CHECK-NEXT:    global_store_short_d16_hi v[2:3], v0, off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc float %num to bfloat
+  store bfloat %conv, ptr addrspace(1) %p, align 8
+  ret void
+}
+
+define void @v2(<2 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v2:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v4, v4, v0, s4
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; CHECK-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v1, s4
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
+; CHECK-NEXT:    s_mov_b32 s4, 0x7060302
+; CHECK-NEXT:    v_perm_b32 v0, v1, v0, s4
+; CHECK-NEXT:    global_store_dword v[2:3], v0, off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc <2 x float> %num to <2 x bfloat>
+  store <2 x bfloat> %conv, ptr addrspace(1) %p, align 8
+  ret void
+}
+
+define void @v3(<3 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v3:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v5, v4
+; CHECK-NEXT:    v_mov_b32_e32 v4, v3
+; CHECK-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v3, v3, v0, s4
+; CHECK-NEXT:    v_or_b32_e32 v6, 0x400000, v0
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
+; CHECK-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; CHECK-NEXT:    v_add3_u32 v3, v3, v1, s4
+; CHECK-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
+; CHECK-NEXT:    s_mov_b32 s5, 0x7060302
+; CHECK-NEXT:    v_perm_b32 v0, v1, v0, s5
+; CHECK-NEXT:    v_bfe_u32 v1, v2, 16, 1
+; CHECK-NEXT:    v_add3_u32 v1, v1, v2, s4
+; CHECK-NEXT:    v_or_b32_e32 v3, 0x400000, v2
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; CHECK-NEXT:    global_store_short_d16_hi v[4:5], v1, off offset:4
+; CHECK-NEXT:    global_store_dword v[4:5], v0, off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc <3 x float> %num to <3 x bfloat>
+  store <3 x bfloat> %conv, ptr addrspace(1) %p, align 8
+  ret void
+}
+
+define void @v4(<4 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v4:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v6, v6, v2, s4
+; CHECK-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
+; CHECK-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; CHECK-NEXT:    v_add3_u32 v6, v6, v3, s4
+; CHECK-NEXT:    v_or_b32_e32 v7, 0x400000, v3
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
+; CHECK-NEXT:    s_mov_b32 s5, 0x7060302
+; CHECK-NEXT:    v_perm_b32 v3, v3, v2, s5
+; CHECK-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v2, v2, v0, s4
+; CHECK-NEXT:    v_or_b32_e32 v6, 0x400000, v0
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
+; CHECK-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; CHECK-NEXT:    v_add3_u32 v2, v2, v1, s4
+; CHECK-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc
+; CHECK-NEXT:    v_perm_b32 v2, v1, v0, s5
+; CHECK-NEXT:    global_store_dwordx2 v[4:5], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc <4 x float> %num to <4 x bfloat>
+  store <4 x bfloat> %conv, ptr addrspace(1) %p, align 8
+  ret void
+}
+
+define void @v8(<8 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v8:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_bfe_u32 v10, v6, 16, 1
+; CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v10, v10, v6, s4
+; CHECK-NEXT:    v_or_b32_e32 v11, 0x400000, v6
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v6, v10, v11, vcc
+; CHECK-NEXT:    v_bfe_u32 v10, v7, 16, 1
+; CHECK-NEXT:    v_add3_u32 v10, v10, v7, s4
+; CHECK-NEXT:    v_or_b32_e32 v11, 0x400000, v7
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; CHECK-NEXT:    v_cndmask_b32_e32 v7, v10, v11, vcc
+; CHECK-NEXT:    s_mov_b32 s5, 0x7060302
+; CHECK-NEXT:    v_perm_b32 v7, v7, v6, s5
+; CHECK-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; CHECK-NEXT:    v_add3_u32 v6, v6, v4, s4
+; CHECK-NEXT:    v_or_b32_e32 v10, 0x400000, v4
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v6, v10, vcc
+; CHECK-NEXT:    v_bfe_u32 v6, v5, 16, 1
+; CHECK-NEXT:    v_add3_u32 v6, v6, v5, s4
+; CHECK-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v5, v6, v10, vcc
+; CHECK-NEXT:    v_perm_b32 v6, v5, v4, s5
+; CHECK-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v2, s4
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; CHECK-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v3, s4
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; CHECK-NEXT:    v_perm_b32 v5, v3, v2, s5
+; CHECK-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v2, v2, v0, s4
+; CHECK-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; CHECK-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; CHECK-NEXT:    v_add3_u32 v2, v2, v1, s4
+; CHECK-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; CHECK-NEXT:    v_perm_b32 v4, v1, v0, s5
+; CHECK-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc <8 x float> %num to <8 x bfloat>
+  store <8 x bfloat> %conv, ptr addrspace(1) %p, align 8
+  ret void
+}
+
+define void @v16(<16 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v16:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_bfe_u32 v18, v6, 16, 1
+; CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v18, v18, v6, s4
+; CHECK-NEXT:    v_or_b32_e32 v19, 0x400000, v6
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v6, v18, v19, vcc
+; CHECK-NEXT:    v_bfe_u32 v18, v7, 16, 1
+; CHECK-NEXT:    v_add3_u32 v18, v18, v7, s4
+; CHECK-NEXT:    v_or_b32_e32 v19, 0x400000, v7
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; CHECK-NEXT:    v_cndmask_b32_e32 v7, v18, v19, vcc
+; CHECK-NEXT:    s_mov_b32 s5, 0x7060302
+; CHECK-NEXT:    v_perm_b32 v7, v7, v6, s5
+; CHECK-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; CHECK-NEXT:    v_add3_u32 v6, v6, v4, s4
+; CHECK-NEXT:    v_or_b32_e32 v18, 0x400000, v4
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v6, v18, vcc
+; CHECK-NEXT:    v_bfe_u32 v6, v5, 16, 1
+; CHECK-NEXT:    v_add3_u32 v6, v6, v5, s4
+; CHECK-NEXT:    v_or_b32_e32 v18, 0x400000, v5
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v5, v6, v18, vcc
+; CHECK-NEXT:    v_perm_b32 v6, v5, v4, s5
+; CHECK-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v2, s4
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; CHECK-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v3, s4
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; CHECK-NEXT:    v_perm_b32 v5, v3, v2, s5
+; CHECK-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v2, v2, v0, s4
+; CHECK-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; CHECK-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; CHECK-NEXT:    v_add3_u32 v2, v2, v1, s4
+; CHECK-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; CHECK-NEXT:    v_perm_b32 v4, v1, v0, s5
+; CHECK-NEXT:    v_bfe_u32 v0, v14, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v14, s4
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v14
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_bfe_u32 v1, v15, 16, 1
+; CHECK-NEXT:    v_add3_u32 v1, v1, v15, s4
+; CHECK-NEXT:    v_or_b32_e32 v2, 0x400000, v15
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT:    v_perm_b32 v3, v1, v0, s5
+; CHECK-NEXT:    v_bfe_u32 v0, v12, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v12, s4
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v12
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_bfe_u32 v1, v13, 16, 1
+; CHECK-NEXT:    v_add3_u32 v1, v1, v13, s4
+; CHECK-NEXT:    v_or_b32_e32 v2, 0x400000, v13
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT:    v_perm_b32 v2, v1, v0, s5
+; CHECK-NEXT:    v_bfe_u32 v0, v10, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v10, s4
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v10
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_bfe_u32 v1, v11, 16, 1
+; CHECK-NEXT:    v_add3_u32 v1, v1, v11, s4
+; CHECK-NEXT:    v_or_b32_e32 v10, 0x400000, v11
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
+; CHECK-NEXT:    v_perm_b32 v1, v1, v0, s5
+; CHECK-NEXT:    v_bfe_u32 v0, v8, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v8, s4
+; CHECK-NEXT:    v_or_b32_e32 v10, 0x400000, v8
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; CHECK-NEXT:    v_bfe_u32 v8, v9, 16, 1
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; CHECK-NEXT:    v_add3_u32 v8, v8, v9, s4
+; CHECK-NEXT:    v_or_b32_e32 v10, 0x400000, v9
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT:    v_perm_b32 v0, v8, v0, s5
+; CHECK-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[16:17], v[4:7], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc <16 x float> %num to <16 x bfloat>
+  store <16 x bfloat> %conv, ptr addrspace(1) %p, align 8
+  ret void
+}
+
+define void @v32(<32 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v32:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CHECK-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CHECK-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CHECK-NEXT:    v_bfe_u32 v34, v6, 16, 1
+; CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v34, v34, v6, s4
+; CHECK-NEXT:    v_or_b32_e32 v35, 0x400000, v6
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v6, v34, v35, vcc
+; CHECK-NEXT:    v_bfe_u32 v34, v7, 16, 1
+; CHECK-NEXT:    v_add3_u32 v34, v34, v7, s4
+; CHECK-NEXT:    v_or_b32_e32 v35, 0x400000, v7
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; CHECK-NEXT:    v_cndmask_b32_e32 v7, v34, v35, vcc
+; CHECK-NEXT:    s_mov_b32 s5, 0x7060302
+; CHECK-NEXT:    v_perm_b32 v7, v7, v6, s5
+; CHECK-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; CHECK-NEXT:    v_add3_u32 v6, v6, v4, s4
+; CHECK-NEXT:    v_or_b32_e32 v34, 0x400000, v4
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v6, v34, vcc
+; CHECK-NEXT:    v_bfe_u32 v6, v5, 16, 1
+; CHECK-NEXT:    v_add3_u32 v6, v6, v5, s4
+; CHECK-NEXT:    v_or_b32_e32 v34, 0x400000, v5
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v5, v6, v34, vcc
+; CHECK-NEXT:    v_perm_b32 v6, v5, v4, s5
+; CHECK-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v2, s4
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; CHECK-NEXT:    v_bfe_u32 v4, v3, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v3, s4
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; CHECK-NEXT:    v_perm_b32 v5, v3, v2, s5
+; CHECK-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v2, v2, v0, s4
+; CHECK-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; CHECK-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; CHECK-NEXT:    v_add3_u32 v2, v2, v1, s4
+; CHECK-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; CHECK-NEXT:    v_perm_b32 v4, v1, v0, s5
+; CHECK-NEXT:    v_bfe_u32 v0, v14, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v14, s4
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v14
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_bfe_u32 v1, v15, 16, 1
+; CHECK-NEXT:    v_add3_u32 v1, v1, v15, s4
+; CHECK-NEXT:    v_or_b32_e32 v2, 0x400000, v15
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT:    v_perm_b32 v3, v1, v0, s5
+; CHECK-NEXT:    v_bfe_u32 v0, v12, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v12, s4
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v12
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_bfe_u32 v1, v13, 16, 1
+; CHECK-NEXT:    v_add3_u32 v1, v1, v13, s4
+; CHECK-NEXT:    v_or_b32_e32 v2, 0x400000, v13
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT:    v_perm_b32 v2, v1, v0, s5
+; CHECK-NEXT:    v_bfe_u32 v0, v10, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v10, s4
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v10
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_bfe_u32 v1, v11, 16, 1
+; CHECK-NEXT:    v_add3_u32 v1, v1, v11, s4
+; CHECK-NEXT:    v_or_b32_e32 v10, 0x400000, v11
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
+; CHECK-NEXT:    v_perm_b32 v1, v1, v0, s5
+; CHECK-NEXT:    v_bfe_u32 v0, v8, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v8, s4
+; CHECK-NEXT:    v_or_b32_e32 v10, 0x400000, v8
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; CHECK-NEXT:    v_bfe_u32 v8, v9, 16, 1
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; CHECK-NEXT:    v_add3_u32 v8, v8, v9, s4
+; CHECK-NEXT:    v_or_b32_e32 v10, 0x400000, v9
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT:    v_perm_b32 v0, v8, v0, s5
+; CHECK-NEXT:    v_bfe_u32 v8, v22, 16, 1
+; CHECK-NEXT:    v_add3_u32 v8, v8, v22, s4
+; CHECK-NEXT:    v_or_b32_e32 v9, 0x400000, v22
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT:    v_bfe_u32 v9, v23, 16, 1
+; CHECK-NEXT:    v_add3_u32 v9, v9, v23, s4
+; CHECK-NEXT:    v_or_b32_e32 v10, 0x400000, v23
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
+; CHECK-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
+; CHECK-NEXT:    v_perm_b32 v11, v9, v8, s5
+; CHECK-NEXT:    v_bfe_u32 v8, v20, 16, 1
+; CHECK-NEXT:    v_add3_u32 v8, v8, v20, s4
+; CHECK-NEXT:    v_or_b32_e32 v9, 0x400000, v20
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT:    v_bfe_u32 v9, v21, 16, 1
+; CHECK-NEXT:    v_add3_u32 v9, v9, v21, s4
+; CHECK-NEXT:    v_or_b32_e32 v10, 0x400000, v21
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
+; CHECK-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
+; CHECK-NEXT:    v_perm_b32 v10, v9, v8, s5
+; CHECK-NEXT:    v_bfe_u32 v8, v18, 16, 1
+; CHECK-NEXT:    v_add3_u32 v8, v8, v18, s4
+; CHECK-NEXT:    v_or_b32_e32 v9, 0x400000, v18
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT:    v_bfe_u32 v9, v19, 16, 1
+; CHECK-NEXT:    v_add3_u32 v9, v9, v19, s4
+; CHECK-NEXT:    v_or_b32_e32 v12, 0x400000, v19
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
+; CHECK-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
+; CHECK-NEXT:    v_perm_b32 v9, v9, v8, s5
+; CHECK-NEXT:    v_bfe_u32 v8, v16, 16, 1
+; CHECK-NEXT:    v_add3_u32 v8, v8, v16, s4
+; CHECK-NEXT:    v_or_b32_e32 v12, 0x400000, v16
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc
+; CHECK-NEXT:    v_bfe_u32 v12, v17, 16, 1
+; CHECK-NEXT:    v_add3_u32 v12, v12, v17, s4
+; CHECK-NEXT:    v_or_b32_e32 v13, 0x400000, v17
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
+; CHECK-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT:    v_perm_b32 v8, v12, v8, s5
+; CHECK-NEXT:    v_bfe_u32 v12, v30, 16, 1
+; CHECK-NEXT:    v_add3_u32 v12, v12, v30, s4
+; CHECK-NEXT:    v_or_b32_e32 v13, 0x400000, v30
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
+; CHECK-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_bfe_u32 v13, v31, 16, 1
+; CHECK-NEXT:    v_add3_u32 v13, v13, v31, s4
+; CHECK-NEXT:    v_or_b32_e32 v14, 0x400000, v31
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
+; CHECK-NEXT:    v_cndmask_b32_e32 v13, v13, v14, vcc
+; CHECK-NEXT:    v_perm_b32 v15, v13, v12, s5
+; CHECK-NEXT:    v_bfe_u32 v12, v28, 16, 1
+; CHECK-NEXT:    v_add3_u32 v12, v12, v28, s4
+; CHECK-NEXT:    v_or_b32_e32 v13, 0x400000, v28
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
+; CHECK-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT:    v_bfe_u32 v13, v29, 16, 1
+; CHECK-NEXT:    v_add3_u32 v13, v13, v29, s4
+; CHECK-NEXT:    v_or_b32_e32 v14, 0x400000, v29
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
+; CHECK-NEXT:    v_cndmask_b32_e32 v13, v13, v14, vcc
+; CHECK-NEXT:    v_perm_b32 v14, v13, v12, s5
+; CHECK-NEXT:    v_bfe_u32 v12, v26, 16, 1
+; CHECK-NEXT:    v_add3_u32 v12, v12, v26, s4
+; CHECK-NEXT:    v_or_b32_e32 v13, 0x400000, v26
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
+; CHECK-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT:    v_bfe_u32 v13, v27, 16, 1
+; CHECK-NEXT:    v_add3_u32 v13, v13, v27, s4
+; CHECK-NEXT:    v_or_b32_e32 v16, 0x400000, v27
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
+; CHECK-NEXT:    v_cndmask_b32_e32 v13, v13, v16, vcc
+; CHECK-NEXT:    v_perm_b32 v13, v13, v12, s5
+; CHECK-NEXT:    v_bfe_u32 v12, v24, 16, 1
+; CHECK-NEXT:    v_add3_u32 v12, v12, v24, s4
+; CHECK-NEXT:    v_or_b32_e32 v16, 0x400000, v24
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
+; CHECK-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
+; CHECK-NEXT:    v_bfe_u32 v16, v25, 16, 1
+; CHECK-NEXT:    v_add3_u32 v16, v16, v25, s4
+; CHECK-NEXT:    v_or_b32_e32 v17, 0x400000, v25
+; CHECK-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
+; CHECK-NEXT:    v_cndmask_b32_e32 v16, v16, v17, vcc
+; CHECK-NEXT:    v_perm_b32 v12, v16, v12, s5
+; CHECK-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; CHECK-NEXT:    global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; CHECK-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[32:33], v[4:7], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc <32 x float> %num to <32 x bfloat>
+  store <32 x bfloat> %conv, ptr addrspace(1) %p, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll
new file mode 100644
index 000000000000..d824763c22e2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll
@@ -0,0 +1,663 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
+
+define void @scalar(double %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: scalar:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; CHECK-NEXT:    v_and_b32_e32 v7, 1, v6
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v4, v6, v4
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT:    s_brev_b32 s4, 1
+; CHECK-NEXT:    v_and_or_b32 v5, v1, s4, v4
+; CHECK-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v4, v4, v5, s4
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; CHECK-NEXT:    global_store_short_d16_hi v[2:3], v0, off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc double %num to bfloat
+  store bfloat %conv, ptr addrspace(1) %p, align 8
+  ret void
+}
+
+define void @v2(<2 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v2:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v8, |v[0:1]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[6:7], v8
+; CHECK-NEXT:    v_and_b32_e32 v9, 1, v8
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[6:7]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[6:7]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v6, v8, v6
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; CHECK-NEXT:    s_brev_b32 s8, 1
+; CHECK-NEXT:    v_and_or_b32 v7, v1, s8, v6
+; CHECK-NEXT:    v_bfe_u32 v6, v6, 16, 1
+; CHECK-NEXT:    s_movk_i32 s9, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v6, v6, v7, s9
+; CHECK-NEXT:    v_or_b32_e32 v7, 0x400000, v7
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v7, |v[2:3]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[0:1], v7
+; CHECK-NEXT:    v_and_b32_e32 v8, 1, v7
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v0, v7, v0
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; CHECK-NEXT:    v_and_or_b32 v1, v3, s8, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    s_mov_b32 s4, 0x7060302
+; CHECK-NEXT:    v_perm_b32 v0, v0, v6, s4
+; CHECK-NEXT:    global_store_dword v[4:5], v0, off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc <2 x double> %num to <2 x bfloat>
+  store <2 x bfloat> %conv, ptr addrspace(1) %p, align 8
+  ret void
+}
+
+define void @v3(<3 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v3:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v10, |v[0:1]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[8:9], v10
+; CHECK-NEXT:    v_and_b32_e32 v11, 1, v10
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[8:9]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[8:9]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v8, v10, v8
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT:    s_brev_b32 s8, 1
+; CHECK-NEXT:    v_and_or_b32 v9, v1, s8, v8
+; CHECK-NEXT:    v_bfe_u32 v8, v8, 16, 1
+; CHECK-NEXT:    s_movk_i32 s9, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v8, v8, v9, s9
+; CHECK-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v9, |v[2:3]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
+; CHECK-NEXT:    v_and_b32_e32 v10, 1, v9
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v0, v9, v0
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; CHECK-NEXT:    v_and_or_b32 v1, v3, s8, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    s_mov_b32 s4, 0x7060302
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v3, |v[4:5]|
+; CHECK-NEXT:    v_perm_b32 v2, v0, v8, s4
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
+; CHECK-NEXT:    v_and_b32_e32 v8, 1, v3
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, v[0:1]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[4:5]|, v[0:1]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v0, v3, v0
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; CHECK-NEXT:    v_and_or_b32 v1, v5, s8, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    global_store_short_d16_hi v[6:7], v0, off offset:4
+; CHECK-NEXT:    global_store_dword v[6:7], v2, off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc <3 x double> %num to <3 x bfloat>
+  store <3 x bfloat> %conv, ptr addrspace(1) %p, align 8
+  ret void
+}
+
+define void @v4(<4 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v4:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v12, |v[4:5]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[10:11], v12
+; CHECK-NEXT:    v_and_b32_e32 v13, 1, v12
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, v[10:11]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[4:5]|, v[10:11]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v13
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v10, v12, v10
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
+; CHECK-NEXT:    s_brev_b32 s8, 1
+; CHECK-NEXT:    v_and_or_b32 v11, v5, s8, v10
+; CHECK-NEXT:    v_bfe_u32 v10, v10, 16, 1
+; CHECK-NEXT:    s_movk_i32 s9, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v10, v10, v11, s9
+; CHECK-NEXT:    v_or_b32_e32 v11, 0x400000, v11
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v11, |v[6:7]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[4:5], v11
+; CHECK-NEXT:    v_and_b32_e32 v12, 1, v11
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[6:7]|, v[4:5]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[6:7]|, v[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v4, v11, v4
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v11, vcc
+; CHECK-NEXT:    v_and_or_b32 v5, v7, s8, v4
+; CHECK-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v5, s9
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[6:7]
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT:    s_mov_b32 s10, 0x7060302
+; CHECK-NEXT:    v_perm_b32 v5, v4, v10, s10
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v4, |v[0:1]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
+; CHECK-NEXT:    v_and_b32_e32 v10, 1, v4
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[6:7]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[6:7]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v6, v4, v6
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; CHECK-NEXT:    v_and_or_b32 v6, v1, s8, v4
+; CHECK-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v6, s9
+; CHECK-NEXT:    v_or_b32_e32 v6, 0x400000, v6
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v6, |v[2:3]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[0:1], v6
+; CHECK-NEXT:    v_and_b32_e32 v7, 1, v6
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v0, v6, v0
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; CHECK-NEXT:    v_and_or_b32 v1, v3, s8, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_perm_b32 v4, v0, v4, s10
+; CHECK-NEXT:    global_store_dwordx2 v[8:9], v[4:5], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc <4 x double> %num to <4 x bfloat>
+  store <4 x bfloat> %conv, ptr addrspace(1) %p, align 8
+  ret void
+}
+
+define void @v8(<8 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v8:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v20, |v[12:13]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[18:19], v20
+; CHECK-NEXT:    v_and_b32_e32 v21, 1, v20
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[12:13]|, v[18:19]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[12:13]|, v[18:19]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v21
+; CHECK-NEXT:    v_cndmask_b32_e64 v18, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v18, v20, v18
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v18, v18, v20, vcc
+; CHECK-NEXT:    s_brev_b32 s8, 1
+; CHECK-NEXT:    v_and_or_b32 v19, v13, s8, v18
+; CHECK-NEXT:    v_bfe_u32 v18, v18, 16, 1
+; CHECK-NEXT:    s_movk_i32 s9, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v18, v18, v19, s9
+; CHECK-NEXT:    v_or_b32_e32 v19, 0x400000, v19
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[12:13]
+; CHECK-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v19, |v[14:15]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[12:13], v19
+; CHECK-NEXT:    v_and_b32_e32 v20, 1, v19
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[14:15]|, v[12:13]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[14:15]|, v[12:13]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v20
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v12, v19, v12
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v12, v12, v19, vcc
+; CHECK-NEXT:    v_and_or_b32 v13, v15, s8, v12
+; CHECK-NEXT:    v_bfe_u32 v12, v12, 16, 1
+; CHECK-NEXT:    v_add3_u32 v12, v12, v13, s9
+; CHECK-NEXT:    v_or_b32_e32 v13, 0x400000, v13
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[14:15]
+; CHECK-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT:    s_mov_b32 s10, 0x7060302
+; CHECK-NEXT:    v_perm_b32 v13, v12, v18, s10
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v12, |v[8:9]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[14:15], v12
+; CHECK-NEXT:    v_and_b32_e32 v18, 1, v12
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[8:9]|, v[14:15]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[8:9]|, v[14:15]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v18
+; CHECK-NEXT:    v_cndmask_b32_e64 v14, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v14, v12, v14
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v12, v14, v12, vcc
+; CHECK-NEXT:    v_and_or_b32 v14, v9, s8, v12
+; CHECK-NEXT:    v_bfe_u32 v12, v12, 16, 1
+; CHECK-NEXT:    v_add3_u32 v12, v12, v14, s9
+; CHECK-NEXT:    v_or_b32_e32 v14, 0x400000, v14
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[8:9]
+; CHECK-NEXT:    v_cndmask_b32_e32 v12, v12, v14, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v14, |v[10:11]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[8:9], v14
+; CHECK-NEXT:    v_and_b32_e32 v15, 1, v14
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[10:11]|, v[8:9]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[10:11]|, v[8:9]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v8, v14, v8
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v14, vcc
+; CHECK-NEXT:    v_and_or_b32 v9, v11, s8, v8
+; CHECK-NEXT:    v_bfe_u32 v8, v8, 16, 1
+; CHECK-NEXT:    v_add3_u32 v8, v8, v9, s9
+; CHECK-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[10:11]
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v10, |v[4:5]|
+; CHECK-NEXT:    v_perm_b32 v12, v8, v12, s10
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[8:9], v10
+; CHECK-NEXT:    v_and_b32_e32 v11, 1, v10
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, v[8:9]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[4:5]|, v[8:9]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v8, v10, v8
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT:    v_and_or_b32 v9, v5, s8, v8
+; CHECK-NEXT:    v_bfe_u32 v8, v8, 16, 1
+; CHECK-NEXT:    v_add3_u32 v8, v8, v9, s9
+; CHECK-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v9, |v[6:7]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[4:5], v9
+; CHECK-NEXT:    v_and_b32_e32 v10, 1, v9
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[6:7]|, v[4:5]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[6:7]|, v[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v4, v9, v4
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
+; CHECK-NEXT:    v_and_or_b32 v5, v7, s8, v4
+; CHECK-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v5, s9
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[6:7]
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; CHECK-NEXT:    v_perm_b32 v11, v4, v8, s10
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; CHECK-NEXT:    v_and_b32_e32 v7, 1, v6
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v4, v6, v4
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT:    v_and_or_b32 v5, v1, s8, v4
+; CHECK-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v5, s9
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v5, |v[2:3]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[0:1], v5
+; CHECK-NEXT:    v_and_b32_e32 v6, 1, v5
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v0, v5, v0
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; CHECK-NEXT:    v_and_or_b32 v1, v3, s8, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_perm_b32 v10, v0, v4, s10
+; CHECK-NEXT:    global_store_dwordx4 v[16:17], v[10:13], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc <8 x double> %num to <8 x bfloat>
+  store <8 x bfloat> %conv, ptr addrspace(1) %p, align 8
+  ret void
+}
+
+define void @v16(<16 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v16:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CHECK-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CHECK-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v36, |v[12:13]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[34:35], v36
+; CHECK-NEXT:    v_and_b32_e32 v37, 1, v36
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[12:13]|, v[34:35]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[4:5], |v[12:13]|, v[34:35]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v37
+; CHECK-NEXT:    v_cndmask_b32_e64 v34, -1, 1, s[6:7]
+; CHECK-NEXT:    v_add_u32_e32 v34, v36, v34
+; CHECK-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v34, v34, v36, vcc
+; CHECK-NEXT:    s_brev_b32 s4, 1
+; CHECK-NEXT:    v_and_or_b32 v35, v13, s4, v34
+; CHECK-NEXT:    v_bfe_u32 v34, v34, 16, 1
+; CHECK-NEXT:    s_movk_i32 s5, 0x7fff
+; CHECK-NEXT:    v_add3_u32 v34, v34, v35, s5
+; CHECK-NEXT:    v_or_b32_e32 v35, 0x400000, v35
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[12:13], v[12:13]
+; CHECK-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v35, |v[14:15]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[12:13], v35
+; CHECK-NEXT:    v_and_b32_e32 v36, 1, v35
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[8:9], |v[14:15]|, v[12:13]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[6:7], |v[14:15]|, v[12:13]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v36
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, -1, 1, s[8:9]
+; CHECK-NEXT:    v_add_u32_e32 v12, v35, v12
+; CHECK-NEXT:    s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v12, v12, v35, vcc
+; CHECK-NEXT:    v_and_or_b32 v13, v15, s4, v12
+; CHECK-NEXT:    v_bfe_u32 v12, v12, 16, 1
+; CHECK-NEXT:    v_add3_u32 v12, v12, v13, s5
+; CHECK-NEXT:    v_or_b32_e32 v13, 0x400000, v13
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[14:15], v[14:15]
+; CHECK-NEXT:    v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT:    s_mov_b32 s6, 0x7060302
+; CHECK-NEXT:    v_perm_b32 v13, v12, v34, s6
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v12, |v[8:9]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[14:15], v12
+; CHECK-NEXT:    v_and_b32_e32 v34, 1, v12
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[8:9]|, v[14:15]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[8:9]|, v[14:15]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v34
+; CHECK-NEXT:    v_cndmask_b32_e64 v14, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v14, v12, v14
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v12, v14, v12, vcc
+; CHECK-NEXT:    v_and_or_b32 v14, v9, s4, v12
+; CHECK-NEXT:    v_bfe_u32 v12, v12, 16, 1
+; CHECK-NEXT:    v_add3_u32 v12, v12, v14, s5
+; CHECK-NEXT:    v_or_b32_e32 v14, 0x400000, v14
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[8:9], v[8:9]
+; CHECK-NEXT:    v_cndmask_b32_e32 v12, v12, v14, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v14, |v[10:11]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[8:9], v14
+; CHECK-NEXT:    v_and_b32_e32 v15, 1, v14
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[10:11]|, v[8:9]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[10:11]|, v[8:9]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v8, v14, v8
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v14, vcc
+; CHECK-NEXT:    v_and_or_b32 v9, v11, s4, v8
+; CHECK-NEXT:    v_bfe_u32 v8, v8, 16, 1
+; CHECK-NEXT:    v_add3_u32 v8, v8, v9, s5
+; CHECK-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[10:11], v[10:11]
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v10, |v[4:5]|
+; CHECK-NEXT:    v_perm_b32 v12, v8, v12, s6
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[8:9], v10
+; CHECK-NEXT:    v_and_b32_e32 v11, 1, v10
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[4:5]|, v[8:9]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[4:5]|, v[8:9]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v8, v10, v8
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT:    v_and_or_b32 v9, v5, s4, v8
+; CHECK-NEXT:    v_bfe_u32 v8, v8, 16, 1
+; CHECK-NEXT:    v_add3_u32 v8, v8, v9, s5
+; CHECK-NEXT:    v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v9, |v[6:7]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[4:5], v9
+; CHECK-NEXT:    v_and_b32_e32 v10, 1, v9
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[6:7]|, v[4:5]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[6:7]|, v[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v4, v9, v4
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
+; CHECK-NEXT:    v_and_or_b32 v5, v7, s4, v4
+; CHECK-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v5, s5
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[6:7], v[6:7]
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; CHECK-NEXT:    v_perm_b32 v11, v4, v8, s6
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; CHECK-NEXT:    v_and_b32_e32 v7, 1, v6
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[0:1]|, v[4:5]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[0:1]|, v[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v4, v6, v4
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT:    v_and_or_b32 v5, v1, s4, v4
+; CHECK-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v5, s5
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v5, |v[2:3]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[0:1], v5
+; CHECK-NEXT:    v_and_b32_e32 v6, 1, v5
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[2:3]|, v[0:1]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[2:3]|, v[0:1]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v0, v5, v0
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; CHECK-NEXT:    v_and_or_b32 v1, v3, s4, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v2, |v[28:29]|
+; CHECK-NEXT:    v_perm_b32 v10, v0, v4, s6
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
+; CHECK-NEXT:    v_and_b32_e32 v3, 1, v2
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[28:29]|, v[0:1]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[28:29]|, v[0:1]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v0, v2, v0
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT:    v_and_or_b32 v1, v29, s4, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[28:29], v[28:29]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v3, |v[30:31]|
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
+; CHECK-NEXT:    v_and_b32_e32 v4, 1, v3
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[30:31]|, v[0:1]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[30:31]|, v[0:1]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v0, v3, v0
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; CHECK-NEXT:    v_and_or_b32 v1, v31, s4, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[30:31], v[30:31]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_perm_b32 v3, v0, v2, s6
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v2, |v[24:25]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
+; CHECK-NEXT:    v_and_b32_e32 v4, 1, v2
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[24:25]|, v[0:1]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[24:25]|, v[0:1]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v0, v2, v0
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT:    v_and_or_b32 v1, v25, s4, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[24:25], v[24:25]
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v4, |v[26:27]|
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; CHECK-NEXT:    v_and_b32_e32 v5, 1, v4
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[26:27]|, v[0:1]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[26:27]|, v[0:1]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v0, v4, v0
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; CHECK-NEXT:    v_and_or_b32 v1, v27, s4, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[26:27], v[26:27]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v4, |v[20:21]|
+; CHECK-NEXT:    v_perm_b32 v2, v0, v2, s6
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; CHECK-NEXT:    v_and_b32_e32 v5, 1, v4
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[20:21]|, v[0:1]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[20:21]|, v[0:1]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v0, v4, v0
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; CHECK-NEXT:    v_and_or_b32 v1, v21, s4, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[20:21], v[20:21]
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v5, |v[22:23]|
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[0:1], v5
+; CHECK-NEXT:    v_and_b32_e32 v6, 1, v5
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[22:23]|, v[0:1]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[22:23]|, v[0:1]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v0, v5, v0
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; CHECK-NEXT:    v_and_or_b32 v1, v23, s4, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[22:23], v[22:23]
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_perm_b32 v1, v0, v4, s6
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v0, |v[16:17]|
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[4:5], v0
+; CHECK-NEXT:    v_and_b32_e32 v6, 1, v0
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[16:17]|, v[4:5]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[16:17]|, v[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v4, v0, v4
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT:    v_and_or_b32 v4, v17, s4, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT:    v_add3_u32 v0, v0, v4, s5
+; CHECK-NEXT:    v_or_b32_e32 v4, 0x400000, v4
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[16:17], v[16:17]
+; CHECK-NEXT:    v_cvt_f32_f64_e64 v6, |v[18:19]|
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; CHECK-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; CHECK-NEXT:    v_and_b32_e32 v7, 1, v6
+; CHECK-NEXT:    v_cmp_gt_f64_e64 s[10:11], |v[18:19]|, v[4:5]
+; CHECK-NEXT:    v_cmp_nlg_f64_e64 s[8:9], |v[18:19]|, v[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[10:11]
+; CHECK-NEXT:    v_add_u32_e32 v4, v6, v4
+; CHECK-NEXT:    s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT:    v_and_or_b32 v5, v19, s4, v4
+; CHECK-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT:    v_add3_u32 v4, v4, v5, s5
+; CHECK-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT:    v_cmp_u_f64_e32 vcc, v[18:19], v[18:19]
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT:    v_perm_b32 v0, v4, v0, s6
+; CHECK-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[32:33], v[10:13], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %conv = fptrunc <16 x double> %num to <16 x bfloat>
+  store <16 x bfloat> %conv, ptr addrspace(1) %p, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 674d7a3c5c9b..dac3a3db7b45 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -6216,6 +6216,81 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) %out) {
+; SI-LABEL: atomic_store_bf16_offset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:16
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_store_bf16_offset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; VI-NEXT:    s_load_dword s4, s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s2, 16
+; VI-NEXT:    s_addc_u32 s1, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_store_bf16_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3] offset:16
+; GFX9-NEXT:    s_endpgm
+  %gep = getelementptr bfloat, ptr addrspace(1) %out, i64 8
+  store atomic bfloat %in, ptr addrspace(1) %gep seq_cst, align 2
+  ret void
+}
+
+define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) {
+; SI-LABEL: atomic_store_bf16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_store_bf16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_store_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+  store atomic bfloat %in, ptr addrspace(1) %out seq_cst, align 2
+  ret void
+}
+
 define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_inc_i32_offset:
 ; SI:       ; %bb.0: ; %entry
@@ -6963,3 +7038,207 @@ entry:
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
+
+define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; SI-LABEL: atomic_load_f16_offset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, s2
+; SI-NEXT:    s_mov_b32 s5, s3
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_load_f16_offset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s2
+; VI-NEXT:    s_mov_b32 s5, s3
+; VI-NEXT:    s_mov_b32 s2, s6
+; VI-NEXT:    s_mov_b32 s3, s7
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_load_f16_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+  %gep = getelementptr half, ptr addrspace(1) %in, i64 8
+  %val = load atomic half, ptr addrspace(1) %gep  seq_cst, align 2
+  store half %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; SI-LABEL: atomic_load_f16_negoffset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, s2
+; SI-NEXT:    s_mov_b32 s5, s3
+; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    v_mov_b32_e32 v0, 0xfffffe00
+; SI-NEXT:    v_mov_b32_e32 v1, -1
+; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_load_f16_negoffset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 0xfffffe00
+; VI-NEXT:    s_addc_u32 s1, s1, -1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_load_ushort v0, v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_mov_b32 s4, s2
+; VI-NEXT:    s_mov_b32 s5, s3
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_load_f16_negoffset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+  %gep = getelementptr half, ptr addrspace(1) %in, i64 -256
+  %val = load atomic half, ptr addrspace(1) %gep  seq_cst, align 2
+  store half %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; SI-LABEL: atomic_load_bf16_offset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, s2
+; SI-NEXT:    s_mov_b32 s5, s3
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_load_bf16_offset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s2
+; VI-NEXT:    s_mov_b32 s5, s3
+; VI-NEXT:    s_mov_b32 s2, s6
+; VI-NEXT:    s_mov_b32 s3, s7
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_load_bf16_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+  %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 8
+  %val = load atomic bfloat, ptr addrspace(1) %gep  seq_cst, align 2
+  store bfloat %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; SI-LABEL: atomic_load_bf16_negoffset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, s2
+; SI-NEXT:    s_mov_b32 s5, s3
+; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    v_mov_b32_e32 v0, 0xfffffe00
+; SI-NEXT:    v_mov_b32_e32 v1, -1
+; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_load_bf16_negoffset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 0xfffffe00
+; VI-NEXT:    s_addc_u32 s1, s1, -1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_load_ushort v0, v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_mov_b32 s4, s2
+; VI-NEXT:    s_mov_b32 s5, s3
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_load_bf16_negoffset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+  %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 -256
+  %val = load atomic bfloat, ptr addrspace(1) %gep  seq_cst, align 2
+  store bfloat %val, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index f5c2bd6286cb..41a883302e8f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -8907,17 +8907,17 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    v_mov_b32_e32 v4, s3
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, s10
-; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    v_mov_b32_e32 v3, s5
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:  .LBB127_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir
new file mode 100644
index 000000000000..50eea4aebd5e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir
@@ -0,0 +1,34 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
+
+---
+name: mimg_nsa
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; CHECK-LABEL: name: mimg_nsa
+    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
+    $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
+...
+
+---
+name: mimg_nsa_mixed
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; CHECK-LABEL: name: mimg_nsa_mixed
+    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+    ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
+    $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+    $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
+...
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
new file mode 100644
index 000000000000..b22de06e68a7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
@@ -0,0 +1,40 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
+
+---
+name: mimg_nsa
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; CHECK-LABEL: name: mimg_nsa
+    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
+    ; CHECK-NEXT:   S_CLAUSE 1
+    ; CHECK-NEXT:   $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    ; CHECK-NEXT:   $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    ; CHECK-NEXT: }
+    $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
+    $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
+...
+
+---
+name: mimg_nsa_mixed
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; CHECK-LABEL: name: mimg_nsa_mixed
+    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 {
+    ; CHECK-NEXT:   S_CLAUSE 2
+    ; CHECK-NEXT:   $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    ; CHECK-NEXT:   $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+    ; CHECK-NEXT:   $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    ; CHECK-NEXT: }
+    $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
+    $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+    $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
+...
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
new file mode 100644
index 000000000000..243a84562ab3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
@@ -0,0 +1,40 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
+
+---
+name: mimg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; CHECK-LABEL: name: mimg
+    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
+    ; CHECK-NEXT:   S_CLAUSE 1
+    ; CHECK-NEXT:   $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    ; CHECK-NEXT:   $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    ; CHECK-NEXT: }
+    $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
+    $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
+...
+
+---
+name: mimg_mixed
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; CHECK-LABEL: name: mimg_mixed
+    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 {
+    ; CHECK-NEXT:   S_CLAUSE 2
+    ; CHECK-NEXT:   $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    ; CHECK-NEXT:   $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+    ; CHECK-NEXT:   $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+    ; CHECK-NEXT: }
+    $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
+    $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+    $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
+...
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
index 1c6bdff51015..44b988a7121c 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
@@ -1,6 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefix=GFX11
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefix=GFX12
 
 ---
 name: nop1
@@ -19,6 +20,12 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
     ; GFX11-NEXT: S_NOP 2
+    ;
+    ; GFX12-LABEL: name: nop1
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+    ; GFX12-NEXT: S_NOP 2
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
     S_NOP 2
 ...
@@ -48,6 +55,16 @@ body: |
     ; GFX11-NEXT:   S_NOP 2
     ; GFX11-NEXT:   $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
     ; GFX11-NEXT: }
+    ;
+    ; GFX12-LABEL: name: nop2
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 {
+    ; GFX12-NEXT:   S_CLAUSE 2
+    ; GFX12-NEXT:   $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+    ; GFX12-NEXT:   S_NOP 2
+    ; GFX12-NEXT:   $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+    ; GFX12-NEXT: }
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
     S_NOP 2
     $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
@@ -80,6 +97,17 @@ body: |
     ; GFX11-NEXT:   $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
     ; GFX11-NEXT: }
     ; GFX11-NEXT: S_NOP 2
+    ;
+    ; GFX12-LABEL: name: nop3
+    ; GFX12: liveins: $sgpr0_sgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 {
+    ; GFX12-NEXT:   S_CLAUSE 2
+    ; GFX12-NEXT:   $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+    ; GFX12-NEXT:   S_NOP 2
+    ; GFX12-NEXT:   $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+    ; GFX12-NEXT: }
+    ; GFX12-NEXT: S_NOP 2
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
     S_NOP 2
     $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
@@ -274,6 +302,99 @@ body: |
     ; GFX11-NEXT:   $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, implicit $exec
     ; GFX11-NEXT:   $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, implicit $exec
     ; GFX11-NEXT: }
+    ;
+    ; GFX12-LABEL: name: long_clause
+    ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+    ; GFX12-NEXT:   S_CLAUSE 31
+    ; GFX12-NEXT:   $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec
+    ; GFX12-NEXT: }
+    ; GFX12-NEXT: BUNDLE implicit-def $vgpr33, implicit-def $vgpr33_lo16, implicit-def $vgpr33_hi16, implicit-def $vgpr34, implicit-def $vgpr34_lo16, implicit-def $vgpr34_hi16, implicit-def $vgpr35, implicit-def $vgpr35_lo16, implicit-def $vgpr35_hi16, implicit-def $vgpr36, implicit-def $vgpr36_lo16, implicit-def $vgpr36_hi16, implicit-def $vgpr37, implicit-def $vgpr37_lo16, implicit-def $vgpr37_hi16, implicit-def $vgpr38, implicit-def $vgpr38_lo16, implicit-def $vgpr38_hi16, implicit-def $vgpr39, implicit-def $vgpr39_lo16, implicit-def $vgpr39_hi16, implicit-def $vgpr40, implicit-def $vgpr40_lo16, implicit-def $vgpr40_hi16, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+    ; GFX12-NEXT:   S_CLAUSE 31
+    ; GFX12-NEXT:   $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec
+    ; GFX12-NEXT: }
+    ; GFX12-NEXT: BUNDLE implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+    ; GFX12-NEXT:   S_CLAUSE 15
+    ; GFX12-NEXT:   $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, implicit $exec
+    ; GFX12-NEXT:   $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, implicit $exec
+    ; GFX12-NEXT: }
     $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
     $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
     $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec
@@ -357,57 +478,6 @@ body: |
 ...
 
 ---
-name: mimg_nsa
-tracksRegLiveness: true
-body: |
-  bb.0:
-    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; CHECK-LABEL: name: mimg_nsa
-    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
-    ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
-    ;
-    ; GFX11-LABEL: name: mimg_nsa
-    ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr11_vgpr12, implicit-def $vgpr11_vgpr12_vgpr13, implicit-def $vgpr12_vgpr13, implicit-def $vgpr20_vgpr21_vgpr22_vgpr23, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr21_vgpr22, implicit-def $vgpr21_vgpr22_vgpr23, implicit-def $vgpr22_vgpr23, implicit $vgpr3, implicit $vgpr8, implicit $vgpr7, implicit $vgpr5, implicit $vgpr4, implicit $vgpr6, implicit $vgpr0, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
-    ; GFX11-NEXT:   S_CLAUSE 1
-    ; GFX11-NEXT:   $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
-    ; GFX11-NEXT:   $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
-    ; GFX11-NEXT: }
-    $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
-    $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
-...
-
----
-name: mimg_nsa_mixed
-tracksRegLiveness: true
-body: |
-  bb.0:
-    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; CHECK-LABEL: name: mimg_nsa_mixed
-    ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
-    ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
-    ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
-    ;
-    ; GFX11-LABEL: name: mimg_nsa_mixed
-    ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr11_vgpr12, implicit-def $vgpr11_vgpr12_vgpr13, implicit-def $vgpr12_vgpr13, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr20_vgpr21_vgpr22_vgpr23, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr21_vgpr22, implicit-def $vgpr21_vgpr22_vgpr23, implicit-def $vgpr22_vgpr23, implicit $vgpr3, implicit $vgpr8, implicit $vgpr7, implicit $vgpr5, implicit $vgpr4, implicit $vgpr6, implicit $vgpr0, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 {
-    ; GFX11-NEXT:   S_CLAUSE 2
-    ; GFX11-NEXT:   $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
-    ; GFX11-NEXT:   $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
-    ; GFX11-NEXT:   $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
-    ; GFX11-NEXT: }
-    $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
-    $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
-    $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec  :: (load (s128))
-...
-
----
 name: kill
 tracksRegLiveness: true
 body: |
@@ -432,6 +502,16 @@ body: |
     ; GFX11-NEXT:   KILL undef renamable $sgpr4
     ; GFX11-NEXT:   $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
     ; GFX11-NEXT: }
+    ;
+    ; GFX12-LABEL: name: kill
+    ; GFX12: liveins: $sgpr0_sgpr1, $sgpr4
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+    ; GFX12-NEXT:   S_CLAUSE 1
+    ; GFX12-NEXT:   $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+    ; GFX12-NEXT:   KILL undef renamable $sgpr4
+    ; GFX12-NEXT:   $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+    ; GFX12-NEXT: }
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
     KILL undef renamable $sgpr4
     $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
@@ -464,6 +544,17 @@ body: |
     ; GFX11-NEXT:   $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
     ; GFX11-NEXT: }
     ; GFX11-NEXT: KILL undef renamable $sgpr5
+    ;
+    ; GFX12-LABEL: name: kill2
+    ; GFX12: liveins: $sgpr0_sgpr1, $sgpr4, $sgpr5
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+    ; GFX12-NEXT:   S_CLAUSE 1
+    ; GFX12-NEXT:   $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+    ; GFX12-NEXT:   KILL undef renamable $sgpr4
+    ; GFX12-NEXT:   $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+    ; GFX12-NEXT: }
+    ; GFX12-NEXT: KILL undef renamable $sgpr5
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
     KILL undef renamable $sgpr4
     $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
@@ -490,6 +581,12 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
+    ;
+    ; GFX12-LABEL: name: flat_load_atomic
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX12-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
     $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
 ...
@@ -514,6 +611,12 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     ; GFX11-NEXT: $vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
+    ;
+    ; GFX12-LABEL: name: global_load_atomic
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX12-NEXT: $vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
     $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     $vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
 ...
@@ -535,6 +638,12 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
+    ;
+    ; GFX12-LABEL: name: flat_global_load
+    ; GFX12: liveins: $vgpr0_vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX12-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
     $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
 ...
@@ -559,6 +668,12 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     ; GFX11-NEXT: $vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
+    ;
+    ; GFX12-LABEL: name: buffer_load_atomic
+    ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+    ; GFX12-NEXT: $vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
     $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     $vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
 ...
@@ -580,6 +695,12 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
+    ;
+    ; GFX12-LABEL: name: flat_load_store
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX12-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
     $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
 ...
@@ -601,6 +722,12 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     ; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
+    ;
+    ; GFX12-LABEL: name: global_load_store
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
     $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
 ...
@@ -622,6 +749,12 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     ; GFX11-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
+    ;
+    ; GFX12-LABEL: name: buffer_load_store
+    ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+    ; GFX12-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
     $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
index 135a101822bf..513734388eb6 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
@@ -136,6 +136,7 @@ body: |
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $vgpr0
   ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
   ; CHECK-NEXT:   $vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
   ; CHECK-NEXT: {{  $}}
@@ -278,6 +279,31 @@ body: |
     liveins: $vgpr2, $vgpr3
 ...
 
+# Second use is an instruction that reads and writes v1.
+---
+name: multiple_uses_4
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: multiple_uses_4
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+  ; CHECK-NEXT:   $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  bb.0:
+    liveins: $vgpr0
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+  bb.1:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+...
+
 # Results are live-in to another basic block.
 ---
 name: basic_block_1
@@ -398,7 +424,6 @@ body: |
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $sgpr0_sgpr1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK-NEXT:   $exec = COPY $sgpr0_sgpr1
   ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
@@ -424,7 +449,6 @@ body: |
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $sgpr0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK-NEXT:   $exec_lo = COPY $sgpr0
   ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
@@ -450,7 +474,6 @@ body: |
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $sgpr0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_SINGLEUSE_VDST 1
   ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK-NEXT:   $exec_hi = COPY $sgpr0
   ; CHECK-NEXT:   $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
index fbf2ee1145ae..ec446f1f3bf2 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
@@ -7,8 +7,8 @@
 define amdgpu_kernel void @test_kernel(i32 %val) #0 {
 ; CHECK-LABEL: test_kernel:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_mov_b32 s32, 0x180000
 ; CHECK-NEXT:    s_mov_b32 s33, 0
+; CHECK-NEXT:    s_mov_b32 s32, 0x180000
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_add_u32 s0, s0, s15
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
index e21d61036375..ffedde9416bb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
@@ -97,9 +97,7 @@ define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr a
 define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GFX12-NEXT:    v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -114,9 +112,7 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr a
 define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1]
+; GFX12-NEXT:    v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index 9b8fdf901704..7662a3b78dea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -385,7 +385,7 @@ define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
@@ -409,7 +409,7 @@ define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1]
+; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
@@ -433,7 +433,7 @@ define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1]
+; GFX12-NEXT:    v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:3
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
@@ -479,7 +479,7 @@ define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
@@ -503,7 +503,7 @@ define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1]
+; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
@@ -527,7 +527,7 @@ define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1]
+; GFX12-NEXT:    v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:3
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
index 7b1f55e7eeba..b75723f544d1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
@@ -12,7 +12,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; GFX12-LABEL: load_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00]
-; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -50,7 +50,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrsp
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; encoding: [0x0a,0x01,0x10,0xca,0x0b,0x01,0x02,0x02]
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
 ; GFX12-NEXT:    image_msaa_load v[0:4], [v7, v6, v5], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x0e,0x20,0x86,0xe4,0x00,0x01,0x00,0x00,0x07,0x06,0x05,0x00]
-; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    global_store_b32 v8, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x08,0x00,0x00,0x00]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -71,7 +71,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3
 ; GFX12-LABEL: load_2darraymsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; encoding: [0x07,0x20,0x06,0xe5,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03]
-; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -110,7 +110,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad
 ; GFX12-NEXT:    v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02]
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e]
 ; GFX12-NEXT:    image_msaa_load v[0:4], [v8, v7, v6, v5], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x08,0x07,0x06,0x05]
-; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    global_store_b32 v9, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x09,0x00,0x00,0x00]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -131,7 +131,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_glc(<8 x i32> inreg %rsrc, i32 %s, i32
 ; GFX12-LABEL: load_2dmsaa_glc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm th:TH_LOAD_NT ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x10,0x00,0x00,0x01,0x02,0x00]
-; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 1)
@@ -148,7 +148,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_slc(<8 x i32> inreg %rsrc, i32 %s, i32
 ; GFX12-LABEL: load_2dmsaa_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm th:TH_LOAD_HT ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x20,0x00,0x00,0x01,0x02,0x00]
-; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 2)
@@ -165,7 +165,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_glc_slc(<8 x i32> inreg %rsrc, i32 %s,
 ; GFX12-LABEL: load_2dmsaa_glc_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm th:TH_LOAD_LU ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x30,0x00,0x00,0x01,0x02,0x00]
-; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 3)
@@ -182,7 +182,7 @@ define amdgpu_ps <4 x half> @load_2dmsaa_d16(<8 x i32> inreg %rsrc, i32 %s, i32
 ; GFX12-LABEL: load_2dmsaa_d16:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:1], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm d16 ; encoding: [0x26,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00]
-; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x half> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -215,7 +215,7 @@ define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addr
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf]
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
 ; GFX12-NEXT:    image_msaa_load v[0:2], [v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x2e,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x05,0x04,0x03,0x00]
-; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    global_store_b32 v6, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x06,0x00,0x00,0x00]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -236,7 +236,7 @@ define amdgpu_ps <4 x half> @load_2darraymsaa_d16(<8 x i32> inreg %rsrc, i32 %s,
 ; GFX12-LABEL: load_2darraymsaa_d16:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_msaa_load v[0:1], [v0, v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm d16 ; encoding: [0x27,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03]
-; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x half> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -269,7 +269,7 @@ define amdgpu_ps <4 x half> @load_2darraymsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf]
 ; GFX12-NEXT:    v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e]
 ; GFX12-NEXT:    image_msaa_load v[0:2], [v6, v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x2f,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x06,0x05,0x04,0x03]
-; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    global_store_b32 v7, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x07,0x00,0x00,0x00]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
@@ -292,7 +292,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x46,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00]
-; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i16(i32 1, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -313,7 +313,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_a16(<8 x i32> inreg %rsrc, i16 %s
 ; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
 ; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x47,0x20,0x06,0xe5,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00]
-; GFX12-NEXT:    s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
+; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
 ; GFX12-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i16(i32 4, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index a7d3115af29b..47c021769aa5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -96,6 +96,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT4-NEXT:    s_wait_kmcnt 0x0
 ; VARIANT4-NEXT:    v_xad_u32 v1, v0, -1, s2
 ; VARIANT4-NEXT:    global_store_b32 v3, v0, s[0:1]
+; VARIANT4-NEXT:    s_wait_storecnt 0x0
 ; VARIANT4-NEXT:    s_barrier_signal -1
 ; VARIANT4-NEXT:    s_barrier_wait -1
 ; VARIANT4-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
@@ -142,6 +143,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT6-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
 ; VARIANT6-NEXT:    v_sub_nc_u32_e32 v1, s2, v0
 ; VARIANT6-NEXT:    global_store_b32 v5, v0, s[0:1]
+; VARIANT6-NEXT:    s_wait_storecnt 0x0
 ; VARIANT6-NEXT:    s_barrier_signal -1
 ; VARIANT6-NEXT:    s_barrier_wait -1
 ; VARIANT6-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 4ab5e97964a8..38a34ec6daf7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -12,6 +12,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal -1
 ; GCN-NEXT:    s_barrier_wait -1
 ; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -28,6 +29,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal -1
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait -1
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -56,6 +58,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal 1
 ; GCN-NEXT:    s_barrier_wait 1
 ; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -72,6 +75,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal 1
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait 1
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -100,6 +104,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal 0
 ; GCN-NEXT:    s_barrier_wait 0
 ; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -116,6 +121,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal 0
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait 0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -146,6 +152,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v1, s[0:1]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal m0
 ; GCN-NEXT:    s_barrier_wait 1
 ; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -163,6 +170,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal m0
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait 1
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -192,6 +200,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    s_mov_b32 m0, s0
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal m0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -203,6 +212,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
 ; GLOBAL-ISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    v_readfirstlane_b32 m0, v0
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal m0
 ; GLOBAL-ISEL-NEXT:    s_setpc_b64 s[30:31]
   call void @llvm.amdgcn.s.barrier.signal.var(i32 %arg)
@@ -216,6 +226,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal_isfirst -1
 ; GCN-NEXT:    s_cselect_b32 s3, s3, s5
 ; GCN-NEXT:    s_cselect_b32 s2, s2, s4
@@ -235,6 +246,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst -1
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -270,6 +282,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal_isfirst 1
 ; GCN-NEXT:    s_cselect_b32 s3, s3, s5
 ; GCN-NEXT:    s_cselect_b32 s2, s2, s4
@@ -289,6 +302,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst 1
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -324,6 +338,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal_isfirst 1
 ; GCN-NEXT:    s_cselect_b32 s3, s3, s5
 ; GCN-NEXT:    s_cselect_b32 s2, s2, s4
@@ -343,6 +358,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst 1
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -379,6 +395,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
 ; GCN-NEXT:    s_mov_b32 m0, 1
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal_isfirst m0
 ; GCN-NEXT:    s_cselect_b32 s3, s3, s5
 ; GCN-NEXT:    s_cselect_b32 s2, s2, s4
@@ -399,6 +416,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
 ; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, 1
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst m0
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -444,6 +462,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
 ; GCN-NEXT:    v_add_co_u32 v7, vcc_lo, v7, v9
 ; GCN-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
 ; GCN-NEXT:    global_store_b32 v[7:8], v10, off
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal_isfirst m0
 ; GCN-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GCN-NEXT:    v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
@@ -470,6 +489,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
 ; GLOBAL-ISEL-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
 ; GLOBAL-ISEL-NEXT:    v_mov_b32_e32 v9, 0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v[7:8], v9, off
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst m0
 ; GLOBAL-ISEL-NEXT:    s_cselect_b32 s0, 1, 0
 ; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1339,6 +1359,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
 ; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_wait_storecnt 0x0
 ; GCN-NEXT:    s_barrier_signal -1
 ; GCN-NEXT:    s_barrier_wait -1
 ; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
@@ -1355,6 +1376,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
 ; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
 ; GLOBAL-ISEL-NEXT:    s_wait_kmcnt 0x0
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_wait_storecnt 0x0
 ; GLOBAL-ISEL-NEXT:    s_barrier_signal -1
 ; GLOBAL-ISEL-NEXT:    s_barrier_wait -1
 ; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir b/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
new file mode 100644
index 000000000000..e9945f005d26
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
@@ -0,0 +1,134 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -run-pass=early-machinelicm -simplify-mir -o - %s | FileCheck %s
+
+# Test to check machine LICM does not hoist convergent instructions,
+# DS_PERMUTE_B32 in this example.
+
+---
+name: licm_reg_sequence
+body: |
+  ; CHECK-LABEL: name: licm_reg_sequence
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   S_NOP 0, implicit [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   $vgpr0 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    successors: %bb.1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    %3:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
+    S_NOP 0, implicit %3
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr0 = COPY %3
+    S_ENDPGM 0
+
+...
+
+# Don't bother handling reg_sequence with physreg uses (is there any
+# reason for these to be legal)?
+---
+name: licm_reg_sequence_physreg_use
+body: |
+  ; CHECK-LABEL: name: licm_reg_sequence_physreg_use
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, $vgpr1, %subreg.sub1
+  ; CHECK-NEXT:   S_NOP 0, implicit [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   $vgpr0 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    successors: %bb.1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    liveins: $vgpr0
+
+    %3:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, $vgpr1, %subreg.sub1
+    S_NOP 0, implicit %3
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr0 = COPY %3
+    S_ENDPGM 0
+
+...
+
+---
+name: licm_insert_subreg
+body: |
+  ; CHECK-LABEL: name: licm_insert_subreg
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[INSERT_SUBREG:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.sub0
+  ; CHECK-NEXT:   [[INSERT_SUBREG1:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[INSERT_SUBREG]], [[COPY1]], %subreg.sub1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   S_NOP 0, implicit [[INSERT_SUBREG1]]
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   $vgpr0_vgpr1 = COPY [[INSERT_SUBREG1]]
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    successors: %bb.1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    %3:vreg_64 = IMPLICIT_DEF
+    %4:vreg_64 = INSERT_SUBREG %3, %0, %subreg.sub0
+    %5:vreg_64 = INSERT_SUBREG %4, %1, %subreg.sub1
+    S_NOP 0, implicit %5
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr0_vgpr1 = COPY %5
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 14bcc4f994f8..400298bcff4f 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -5,6 +5,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX1150 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s
 
 ; On GFX11, ensure vdst and src2 do not partially overlap. Full overlap is ok.
 
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index e13542f61474..4128bfe392dc 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1,17 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @singlethread_acquire_fence() {
 ; GFX6-LABEL: singlethread_acquire_fence:
@@ -928,59 +928,77 @@ entry:
 define amdgpu_kernel void @workgroup_acquire_fence() {
 ; GFX6-LABEL: workgroup_acquire_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: workgroup_acquire_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: workgroup_acquire_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: workgroup_acquire_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_acquire_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: workgroup_acquire_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: workgroup_acquire_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: workgroup_acquire_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: workgroup_acquire_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: workgroup_acquire_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: workgroup_acquire_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_acquire_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: workgroup_acquire_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
   fence syncscope("workgroup") acquire
@@ -990,54 +1008,72 @@ entry:
 define amdgpu_kernel void @workgroup_release_fence() {
 ; GFX6-LABEL: workgroup_release_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: workgroup_release_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: workgroup_release_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: workgroup_release_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_release_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: workgroup_release_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: workgroup_release_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: workgroup_release_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: workgroup_release_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: workgroup_release_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: workgroup_release_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_release_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: workgroup_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
   fence syncscope("workgroup") release
@@ -1047,59 +1083,77 @@ entry:
 define amdgpu_kernel void @workgroup_acq_rel_fence() {
 ; GFX6-LABEL: workgroup_acq_rel_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: workgroup_acq_rel_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: workgroup_acq_rel_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: workgroup_acq_rel_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: workgroup_acq_rel_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: workgroup_acq_rel_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: workgroup_acq_rel_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: workgroup_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
   fence syncscope("workgroup") acq_rel
@@ -1109,59 +1163,77 @@ entry:
 define amdgpu_kernel void @workgroup_seq_cst_fence() {
 ; GFX6-LABEL: workgroup_seq_cst_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: workgroup_seq_cst_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: workgroup_seq_cst_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: workgroup_seq_cst_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: workgroup_seq_cst_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: workgroup_seq_cst_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: workgroup_seq_cst_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: workgroup_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
   fence syncscope("workgroup") seq_cst
@@ -1179,6 +1251,8 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
 ;
 ; GFX10-WGP-LABEL: workgroup_one_as_acquire_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
@@ -1196,6 +1270,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
 ;
 ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acquire_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
@@ -1205,11 +1280,14 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
 ;
 ; GFX940-TGSPLIT-LABEL: workgroup_one_as_acquire_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: workgroup_one_as_acquire_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
@@ -1219,6 +1297,10 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
 ;
 ; GFX12-WGP-LABEL: workgroup_one_as_acquire_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
@@ -1241,6 +1323,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ;
 ; GFX10-WGP-LABEL: workgroup_one_as_release_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: workgroup_one_as_release_fence:
@@ -1257,6 +1341,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ;
 ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence:
@@ -1265,10 +1350,13 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ;
 ; GFX940-TGSPLIT-LABEL: workgroup_one_as_release_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: workgroup_one_as_release_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: workgroup_one_as_release_fence:
@@ -1277,6 +1365,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ;
 ; GFX12-WGP-LABEL: workgroup_one_as_release_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: workgroup_one_as_release_fence:
@@ -1298,6 +1390,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ;
 ; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
@@ -1315,6 +1409,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ;
 ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
@@ -1324,11 +1419,14 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ;
 ; GFX940-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
@@ -1338,6 +1436,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ;
 ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
@@ -1360,6 +1462,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ;
 ; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
@@ -1377,6 +1481,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ;
 ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
@@ -1386,11 +1491,14 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ;
 ; GFX940-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
@@ -1400,6 +1508,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ;
 ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
@@ -1414,69 +1526,92 @@ entry:
 define amdgpu_kernel void @agent_acquire_fence() {
 ; GFX6-LABEL: agent_acquire_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: agent_acquire_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: agent_acquire_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: agent_acquire_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: agent_acquire_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: agent_acquire_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: agent_acquire_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: agent_acquire_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: agent_acquire_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: agent_acquire_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: agent_acquire_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: agent_acquire_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: agent_acquire_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
@@ -1487,56 +1622,79 @@ entry:
 define amdgpu_kernel void @agent_release_fence() {
 ; GFX6-LABEL: agent_release_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: agent_release_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: agent_release_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: agent_release_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: agent_release_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: agent_release_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: agent_release_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: agent_release_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: agent_release_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: agent_release_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: agent_release_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: agent_release_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: agent_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
   fence syncscope("agent") release
@@ -1546,71 +1704,94 @@ entry:
 define amdgpu_kernel void @agent_acq_rel_fence() {
 ; GFX6-LABEL: agent_acq_rel_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: agent_acq_rel_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: agent_acq_rel_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: agent_acq_rel_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: agent_acq_rel_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: agent_acq_rel_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: agent_acq_rel_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: agent_acq_rel_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: agent_acq_rel_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: agent_acq_rel_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: agent_acq_rel_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: agent_acq_rel_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: agent_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
@@ -1621,71 +1802,94 @@ entry:
 define amdgpu_kernel void @agent_seq_cst_fence() {
 ; GFX6-LABEL: agent_seq_cst_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: agent_seq_cst_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: agent_seq_cst_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: agent_seq_cst_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: agent_seq_cst_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: agent_seq_cst_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: agent_seq_cst_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: agent_seq_cst_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: agent_seq_cst_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: agent_seq_cst_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: agent_seq_cst_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: agent_seq_cst_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: agent_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
@@ -1696,69 +1900,92 @@ entry:
 define amdgpu_kernel void @agent_one_as_acquire_fence() {
 ; GFX6-LABEL: agent_one_as_acquire_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: agent_one_as_acquire_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: agent_one_as_acquire_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: agent_one_as_acquire_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: agent_one_as_acquire_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: agent_one_as_acquire_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: agent_one_as_acquire_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: agent_one_as_acquire_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: agent_one_as_acquire_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: agent_one_as_acquire_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: agent_one_as_acquire_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
@@ -1769,56 +1996,79 @@ entry:
 define amdgpu_kernel void @agent_one_as_release_fence() {
 ; GFX6-LABEL: agent_one_as_release_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: agent_one_as_release_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: agent_one_as_release_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: agent_one_as_release_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: agent_one_as_release_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_release_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: agent_one_as_release_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: agent_one_as_release_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: agent_one_as_release_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: agent_one_as_release_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: agent_one_as_release_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: agent_one_as_release_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: agent_one_as_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
   fence syncscope("agent-one-as") release
@@ -1828,71 +2078,94 @@ entry:
 define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
 ; GFX6-LABEL: agent_one_as_acq_rel_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: agent_one_as_acq_rel_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: agent_one_as_acq_rel_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: agent_one_as_acq_rel_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: agent_one_as_acq_rel_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: agent_one_as_acq_rel_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: agent_one_as_acq_rel_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: agent_one_as_acq_rel_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: agent_one_as_acq_rel_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: agent_one_as_acq_rel_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: agent_one_as_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
@@ -1903,71 +2176,94 @@ entry:
 define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
 ; GFX6-LABEL: agent_one_as_seq_cst_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: agent_one_as_seq_cst_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: agent_one_as_seq_cst_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: agent_one_as_seq_cst_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: agent_one_as_seq_cst_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: agent_one_as_seq_cst_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: agent_one_as_seq_cst_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: agent_one_as_seq_cst_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: agent_one_as_seq_cst_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: agent_one_as_seq_cst_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: agent_one_as_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
@@ -1978,71 +2274,94 @@ entry:
 define amdgpu_kernel void @system_acquire_fence() {
 ; GFX6-LABEL: system_acquire_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: system_acquire_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: system_acquire_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: system_acquire_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: system_acquire_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: system_acquire_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: system_acquire_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: system_acquire_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: system_acquire_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: system_acquire_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: system_acquire_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: system_acquire_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_acquire_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
@@ -2053,58 +2372,81 @@ entry:
 define amdgpu_kernel void @system_release_fence() {
 ; GFX6-LABEL: system_release_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: system_release_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: system_release_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: system_release_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: system_release_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: system_release_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: system_release_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: system_release_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: system_release_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: system_release_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: system_release_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: system_release_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
   fence release
@@ -2114,33 +2456,41 @@ entry:
 define amdgpu_kernel void @system_acq_rel_fence() {
 ; GFX6-LABEL: system_acq_rel_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: system_acq_rel_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: system_acq_rel_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: system_acq_rel_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: system_acq_rel_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: system_acq_rel_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
@@ -2148,6 +2498,7 @@ define amdgpu_kernel void @system_acq_rel_fence() {
 ; GFX90A-TGSPLIT-LABEL: system_acq_rel_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
@@ -2155,34 +2506,48 @@ define amdgpu_kernel void @system_acq_rel_fence() {
 ; GFX940-NOTTGSPLIT-LABEL: system_acq_rel_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: system_acq_rel_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: system_acq_rel_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: system_acq_rel_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: system_acq_rel_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
@@ -2193,33 +2558,41 @@ entry:
 define amdgpu_kernel void @system_seq_cst_fence() {
 ; GFX6-LABEL: system_seq_cst_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: system_seq_cst_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: system_seq_cst_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: system_seq_cst_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: system_seq_cst_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: system_seq_cst_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
@@ -2227,6 +2600,7 @@ define amdgpu_kernel void @system_seq_cst_fence() {
 ; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
@@ -2234,34 +2608,48 @@ define amdgpu_kernel void @system_seq_cst_fence() {
 ; GFX940-NOTTGSPLIT-LABEL: system_seq_cst_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: system_seq_cst_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: system_seq_cst_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: system_seq_cst_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: system_seq_cst_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
@@ -2272,71 +2660,94 @@ entry:
 define amdgpu_kernel void @system_one_as_acquire_fence() {
 ; GFX6-LABEL: system_one_as_acquire_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: system_one_as_acquire_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: system_one_as_acquire_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: system_one_as_acquire_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: system_one_as_acquire_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acquire_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: system_one_as_acquire_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: system_one_as_acquire_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: system_one_as_acquire_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: system_one_as_acquire_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: system_one_as_acquire_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: system_one_as_acquire_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_one_as_acquire_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
@@ -2347,58 +2758,81 @@ entry:
 define amdgpu_kernel void @system_one_as_release_fence() {
 ; GFX6-LABEL: system_one_as_release_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: system_one_as_release_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: system_one_as_release_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: system_one_as_release_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: system_one_as_release_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_release_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: system_one_as_release_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: system_one_as_release_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: system_one_as_release_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: system_one_as_release_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: system_one_as_release_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: system_one_as_release_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_one_as_release_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
   fence syncscope("one-as") release
@@ -2408,33 +2842,41 @@ entry:
 define amdgpu_kernel void @system_one_as_acq_rel_fence() {
 ; GFX6-LABEL: system_one_as_acq_rel_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: system_one_as_acq_rel_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: system_one_as_acq_rel_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: system_one_as_acq_rel_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: system_one_as_acq_rel_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
@@ -2442,6 +2884,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
 ; GFX90A-TGSPLIT-LABEL: system_one_as_acq_rel_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
@@ -2449,34 +2892,48 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
 ; GFX940-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: system_one_as_acq_rel_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: system_one_as_acq_rel_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: system_one_as_acq_rel_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: system_one_as_acq_rel_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_one_as_acq_rel_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
@@ -2487,33 +2944,41 @@ entry:
 define amdgpu_kernel void @system_one_as_seq_cst_fence() {
 ; GFX6-LABEL: system_one_as_seq_cst_fence:
 ; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: system_one_as_seq_cst_fence:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: system_one_as_seq_cst_fence:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: system_one_as_seq_cst_fence:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: system_one_as_seq_cst_fence:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
@@ -2521,6 +2986,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
 ; GFX90A-TGSPLIT-LABEL: system_one_as_seq_cst_fence:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
@@ -2528,34 +2994,48 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
 ; GFX940-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: system_one_as_seq_cst_fence:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: system_one_as_seq_cst_fence:
 ; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: system_one_as_seq_cst_fence:
 ; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: system_one_as_seq_cst_fence:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: system_one_as_seq_cst_fence:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index a922c8cd2f30..193f642cb1fa 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -1,162 +1,192 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @flat_agent_unordered_load(
 ; GFX7-LABEL: flat_agent_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -170,148 +200,178 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_load(
 ; GFX7-LABEL: flat_agent_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -325,164 +385,198 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_load(
 ; GFX7-LABEL: flat_agent_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %in, ptr %out) {
@@ -495,164 +589,220 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_load(
 ; GFX7-LABEL: flat_agent_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %in, ptr %out) {
@@ -665,43 +815,43 @@ entry:
 define amdgpu_kernel void @flat_agent_unordered_store(
 ; GFX7-LABEL: flat_agent_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -711,84 +861,88 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -801,43 +955,43 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_store(
 ; GFX7-LABEL: flat_agent_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -847,84 +1001,88 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -937,133 +1095,159 @@ entry:
 define amdgpu_kernel void @flat_agent_release_store(
 ; GFX7-LABEL: flat_agent_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr %out) {
@@ -1075,133 +1259,159 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_store(
 ; GFX7-LABEL: flat_agent_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr %out) {
@@ -1213,41 +1423,45 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1259,81 +1473,97 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1345,12 +1575,13 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ; GFX7-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -1358,13 +1589,14 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1374,13 +1606,14 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1390,6 +1623,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1402,11 +1636,13 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -1414,11 +1650,13 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -1426,11 +1664,13 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -1438,11 +1678,13 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -1450,11 +1692,12 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1465,11 +1708,12 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1480,10 +1724,13 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -1491,10 +1738,13 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -1508,129 +1758,171 @@ entry:
 define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ; GFX7-LABEL: flat_agent_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1642,12 +1934,14 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -1655,13 +1949,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1671,13 +1968,16 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1687,23 +1987,28 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -1711,11 +2016,14 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -1723,12 +2031,15 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -1736,12 +2047,15 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -1749,12 +2063,15 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1764,12 +2081,15 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1779,10 +2099,17 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -1790,10 +2117,17 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -1807,12 +2141,14 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -1820,13 +2156,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1836,13 +2175,16 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1852,23 +2194,28 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -1876,11 +2223,14 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -1888,12 +2238,15 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -1901,12 +2254,15 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -1914,12 +2270,15 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1929,12 +2288,15 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1944,10 +2306,17 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -1955,10 +2324,17 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -1972,166 +2348,198 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -2144,168 +2552,222 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -2318,168 +2780,222 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -2492,125 +3008,225 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2623,14 +3239,26 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -2638,14 +3266,26 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2655,14 +3295,26 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2672,24 +3324,42 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -2697,10 +3367,16 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -2708,10 +3384,16 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -2719,10 +3401,16 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -2730,10 +3418,17 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2743,10 +3438,17 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2756,10 +3458,17 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -2767,10 +3476,17 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -2785,127 +3501,249 @@ entry:
 define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2918,14 +3756,27 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -2933,14 +3784,28 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2950,14 +3815,28 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2967,24 +3846,44 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -2992,10 +3891,17 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3003,11 +3909,18 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -3015,11 +3928,18 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -3027,10 +3947,19 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3040,10 +3969,19 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3053,10 +3991,21 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -3064,10 +4013,21 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -3082,14 +4042,27 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3097,14 +4070,28 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3114,14 +4101,28 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3131,24 +4132,44 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3156,10 +4177,17 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3167,11 +4195,18 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -3179,11 +4214,18 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -3191,10 +4233,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3204,10 +4255,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3217,10 +4277,21 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -3228,10 +4299,21 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -3246,14 +4328,26 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3261,14 +4355,26 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3278,14 +4384,26 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3295,24 +4413,42 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3320,10 +4456,16 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3331,10 +4473,16 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -3342,10 +4490,16 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -3353,10 +4507,17 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3366,10 +4527,17 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3379,10 +4547,17 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -3390,10 +4565,17 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -3408,14 +4590,26 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3423,14 +4617,26 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3440,14 +4646,26 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3457,24 +4675,42 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3482,10 +4718,16 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3493,10 +4735,16 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -3504,10 +4752,16 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -3515,10 +4769,17 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3528,10 +4789,17 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3541,10 +4809,17 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -3552,10 +4827,17 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -3570,14 +4852,27 @@ entry:
 define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3585,14 +4880,28 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3602,14 +4911,28 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3619,24 +4942,44 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3644,10 +4987,17 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3655,11 +5005,18 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -3667,11 +5024,18 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -3679,10 +5043,19 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3692,10 +5065,19 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3705,10 +5087,21 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -3716,10 +5109,21 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -3734,14 +5138,27 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3749,14 +5166,28 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3766,14 +5197,28 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3783,24 +5228,44 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3808,10 +5273,17 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3819,11 +5291,18 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -3831,11 +5310,18 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -3843,10 +5329,19 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3856,10 +5351,19 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3869,10 +5373,21 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -3880,10 +5395,21 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -3898,14 +5424,27 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3913,14 +5452,28 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3930,14 +5483,28 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3947,24 +5514,44 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3972,10 +5559,17 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3983,11 +5577,18 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -3995,11 +5596,18 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -4007,10 +5615,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4020,10 +5637,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4033,10 +5659,21 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -4044,10 +5681,21 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -4062,14 +5710,27 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4077,14 +5738,28 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4094,14 +5769,28 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4111,24 +5800,44 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4136,10 +5845,17 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4147,11 +5863,18 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -4159,11 +5882,18 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -4171,10 +5901,19 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4184,10 +5923,19 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4197,10 +5945,21 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -4208,10 +5967,21 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -4226,14 +5996,27 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4241,14 +6024,28 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4258,14 +6055,28 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4275,24 +6086,44 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4300,10 +6131,17 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4311,11 +6149,18 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -4323,11 +6168,18 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -4335,10 +6187,19 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4348,10 +6209,19 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4361,10 +6231,21 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -4372,10 +6253,21 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -4390,14 +6282,27 @@ entry:
 define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4405,14 +6310,28 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4422,14 +6341,28 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4439,24 +6372,44 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4464,10 +6417,17 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4475,11 +6435,18 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -4487,11 +6454,18 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -4499,10 +6473,19 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4512,10 +6495,19 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4525,10 +6517,21 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -4536,10 +6539,21 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -4554,14 +6568,27 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4569,14 +6596,28 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4586,14 +6627,28 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4603,24 +6658,44 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4628,10 +6703,17 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4639,11 +6721,18 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -4651,11 +6740,18 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -4663,10 +6759,19 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4676,10 +6781,19 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4689,10 +6803,21 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -4700,10 +6825,21 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -4718,14 +6854,27 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4733,14 +6882,28 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4750,14 +6913,28 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4767,24 +6944,44 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4792,10 +6989,17 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4803,11 +7007,18 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -4815,11 +7026,18 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -4827,10 +7045,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4840,10 +7067,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4853,10 +7089,21 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -4864,10 +7111,21 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -4882,65 +7140,109 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -4950,88 +7252,152 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5047,70 +7413,114 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5120,99 +7530,167 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -5227,65 +7705,115 @@ entry:
 define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5295,90 +7823,170 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5394,70 +8002,120 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5467,101 +8125,185 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -5576,70 +8318,120 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5649,101 +8441,185 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -5758,70 +8634,114 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5831,99 +8751,167 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -5938,70 +8926,114 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6011,99 +9043,167 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6118,70 +9218,120 @@ entry:
 define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6191,101 +9341,185 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6300,70 +9534,120 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6373,101 +9657,185 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6482,70 +9850,120 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6555,101 +9973,185 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6664,70 +10166,120 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6737,101 +10289,185 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6846,70 +10482,120 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6919,101 +10605,185 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -7028,70 +10798,120 @@ entry:
 define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -7101,101 +10921,185 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -7210,70 +11114,120 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -7283,101 +11237,185 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -7392,70 +11430,120 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -7465,101 +11553,185 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -7574,148 +11746,178 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ; GFX7-LABEL: flat_agent_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7729,148 +11931,178 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7884,173 +12116,207 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ; GFX7-LABEL: flat_agent_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8064,173 +12330,229 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8244,43 +12566,43 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ; GFX7-LABEL: flat_agent_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -8290,84 +12612,88 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8380,43 +12706,43 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -8426,84 +12752,88 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8516,133 +12846,159 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_store(
 ; GFX7-LABEL: flat_agent_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr %out) {
@@ -8654,133 +13010,159 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr %out) {
@@ -8792,41 +13174,45 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8838,81 +13224,97 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8924,12 +13326,13 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -8937,13 +13340,14 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -8952,13 +13356,14 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -8967,6 +13372,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8979,11 +13385,13 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -8991,11 +13399,13 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9003,11 +13413,13 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -9015,11 +13427,13 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -9027,11 +13441,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -9041,11 +13456,12 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -9055,10 +13471,13 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -9066,10 +13485,13 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -9083,129 +13505,171 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -9217,12 +13681,14 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -9230,13 +13696,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -9245,13 +13714,16 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -9260,23 +13732,28 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9284,11 +13761,14 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9296,12 +13776,15 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -9309,12 +13792,15 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -9322,12 +13808,15 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -9336,12 +13825,15 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -9350,10 +13842,17 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -9361,10 +13860,17 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -9378,12 +13884,14 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -9391,13 +13899,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -9406,13 +13917,16 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -9421,23 +13935,28 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9445,11 +13964,14 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9457,12 +13979,15 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -9470,12 +13995,15 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -9483,12 +14011,15 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -9497,12 +14028,15 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -9511,10 +14045,17 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -9522,10 +14063,17 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -9539,174 +14087,207 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -9720,176 +14301,231 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -9903,176 +14539,231 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -10086,125 +14777,225 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10217,14 +15008,26 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -10232,14 +15035,26 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -10248,14 +15063,26 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -10264,24 +15091,42 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10289,10 +15134,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10300,10 +15151,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -10311,10 +15168,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -10322,10 +15185,17 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -10334,10 +15204,17 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -10346,10 +15223,17 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -10357,10 +15241,17 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -10375,127 +15266,249 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10508,14 +15521,27 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -10523,14 +15549,28 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -10539,14 +15579,28 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -10555,24 +15609,44 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10580,10 +15654,17 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10591,11 +15672,18 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -10603,11 +15691,18 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -10615,10 +15710,19 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -10627,10 +15731,19 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -10639,10 +15752,21 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -10650,10 +15774,21 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -10668,14 +15803,27 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -10683,14 +15831,28 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -10699,14 +15861,28 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -10715,24 +15891,44 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10740,10 +15936,17 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10751,11 +15954,18 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -10763,11 +15973,18 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -10775,10 +15992,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -10787,10 +16013,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -10799,10 +16034,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -10810,10 +16056,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -10828,14 +16085,26 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -10843,14 +16112,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -10859,14 +16140,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -10875,24 +16168,42 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10900,10 +16211,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10911,10 +16228,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -10922,10 +16245,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -10933,10 +16262,17 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -10945,10 +16281,17 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -10957,10 +16300,17 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -10968,10 +16318,17 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -10986,14 +16343,26 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11001,14 +16370,26 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -11017,14 +16398,26 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -11033,24 +16426,42 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11058,10 +16469,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11069,10 +16486,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -11080,10 +16503,16 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -11091,10 +16520,17 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -11103,10 +16539,17 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -11115,10 +16558,17 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -11126,10 +16576,17 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -11144,14 +16601,27 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11159,14 +16629,28 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -11175,14 +16659,28 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -11191,24 +16689,44 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11216,10 +16734,17 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11227,11 +16752,18 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -11239,11 +16771,18 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -11251,10 +16790,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -11263,10 +16811,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -11275,10 +16832,21 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -11286,10 +16854,21 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -11304,14 +16883,27 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11319,14 +16911,28 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -11335,14 +16941,28 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -11351,24 +16971,44 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11376,10 +17016,17 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11387,11 +17034,18 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -11399,11 +17053,18 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -11411,10 +17072,19 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -11423,10 +17093,19 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -11435,10 +17114,21 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -11446,10 +17136,21 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -11464,14 +17165,27 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11479,14 +17193,28 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -11495,14 +17223,28 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -11511,24 +17253,44 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11536,10 +17298,17 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11547,11 +17316,18 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -11559,11 +17335,18 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -11571,10 +17354,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -11583,10 +17375,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -11595,10 +17396,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -11606,10 +17418,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -11624,14 +17447,27 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11639,14 +17475,28 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -11655,14 +17505,28 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -11671,24 +17535,44 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11696,10 +17580,17 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11707,11 +17598,18 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -11719,11 +17617,18 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -11731,10 +17636,19 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -11743,10 +17657,19 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -11755,10 +17678,21 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -11766,10 +17700,21 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -11784,14 +17729,27 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11799,14 +17757,28 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -11815,14 +17787,28 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -11831,24 +17817,44 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11856,10 +17862,17 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11867,11 +17880,18 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -11879,11 +17899,18 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -11891,10 +17918,19 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -11903,10 +17939,19 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -11915,10 +17960,21 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -11926,10 +17982,21 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -11944,14 +18011,27 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11959,14 +18039,28 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -11975,14 +18069,28 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -11991,24 +18099,44 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -12016,10 +18144,17 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -12027,11 +18162,18 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -12039,11 +18181,18 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -12051,10 +18200,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -12063,10 +18221,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -12075,10 +18242,21 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -12086,10 +18264,21 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -12104,14 +18293,27 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -12119,14 +18321,28 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -12135,14 +18351,28 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -12151,24 +18381,44 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -12176,10 +18426,17 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -12187,11 +18444,18 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -12199,11 +18463,18 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -12211,10 +18482,19 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -12223,10 +18503,19 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -12235,10 +18524,21 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -12246,10 +18546,21 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -12264,14 +18575,27 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -12279,14 +18603,28 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -12295,14 +18633,28 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -12311,24 +18663,44 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -12336,10 +18708,17 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -12347,11 +18726,18 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
@@ -12359,11 +18745,18 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
@@ -12371,10 +18764,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -12383,10 +18785,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -12395,10 +18806,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -12406,10 +18828,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -12424,65 +18857,109 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12492,88 +18969,152 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12589,73 +19130,117 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -12666,104 +19251,172 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12779,65 +19432,115 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12847,90 +19550,170 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12946,73 +19729,123 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -13023,106 +19856,190 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13138,73 +20055,123 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -13215,106 +20182,190 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13330,73 +20381,117 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -13407,104 +20502,172 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13520,73 +20683,117 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -13597,104 +20804,172 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13710,73 +20985,123 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -13787,106 +21112,190 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13902,73 +21311,123 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -13979,106 +21438,190 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -14094,73 +21637,123 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -14171,106 +21764,190 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -14286,73 +21963,123 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -14363,106 +22090,190 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -14478,73 +22289,123 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -14555,106 +22416,190 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -14670,73 +22615,123 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -14747,106 +22742,190 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -14862,73 +22941,123 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -14939,106 +23068,190 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -15054,73 +23267,123 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -15131,106 +23394,190 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
index 71ec5512c72d..e7c6044b3fb6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
@@ -1,15 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s
 
 define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
 ; GFX12-LABEL: flat_last_use_load_0:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
@@ -22,14 +26,29 @@ entry:
 define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) {
 ; GFX12-LABEL: flat_last_use_load_1:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
+; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x8
+; GFX12-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX12-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX12-NEXT:    s_mov_b32 s2, 2
+; GFX12-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
+; GFX12-NEXT:    s_mov_b32 s2, 0
+; GFX12-NEXT:    ; implicit-def: $sgpr2
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX12-NEXT:    s_mov_b32 s3, s4
+; GFX12-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-NEXT:    s_mov_b32 s2, s5
+; GFX12-NEXT:    v_mov_b32_e32 v1, v2
+; GFX12-NEXT:    v_add_co_u32 v0, s3, s3, v0
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
+; GFX12-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX12-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
-; GFX12-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
@@ -44,12 +63,18 @@ entry:
 define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
 ; GFX12-LABEL: flat_last_use_and_volatile_load:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-NEXT:    s_wait_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
@@ -62,11 +87,15 @@ entry:
 define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) {
 ; GFX12-LABEL: flat_last_use_and_nontemporal_load:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index f6fac11d634c..5fa8e6891baf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -1,162 +1,192 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @flat_nontemporal_load_0(
 ; GFX7-LABEL: flat_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc slc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] slc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] slc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc slc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc slc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc slc
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc slc
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] nt
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] nt
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] slc dlc
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_nontemporal_load_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] slc dlc
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_nontemporal_load_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -170,172 +200,330 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX7-LABEL: flat_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s6, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v1, s6, v0
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    ; implicit-def: $sgpr6
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    s_mov_b32 s6, s8
+; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_mov_b32 s8, s9
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
+; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_addc_u32_e64 v2, s[6:7], v1, v2, s[6:7]
+; GFX7-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc slc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
+; GFX10-WGP-NEXT:    v_lshlrev_b32_e64 v1, s6, v0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, 0
+; GFX10-WGP-NEXT:    ; implicit-def: $sgpr6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX10-WGP-NEXT:    v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v2
+; GFX10-WGP-NEXT:    v_add_co_u32 v0, s7, s7, v0
+; GFX10-WGP-NEXT:    v_add_co_ci_u32_e64 v2, s6, s6, v1, s7
+; GFX10-WGP-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] slc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
+; GFX10-CU-NEXT:    s_mov_b32 s6, 2
+; GFX10-CU-NEXT:    v_lshlrev_b32_e64 v1, s6, v0
+; GFX10-CU-NEXT:    s_mov_b32 s6, 0
+; GFX10-CU-NEXT:    ; implicit-def: $sgpr6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX10-CU-NEXT:    v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX10-CU-NEXT:    s_mov_b32 s7, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-CU-NEXT:    s_mov_b32 s6, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v2
+; GFX10-CU-NEXT:    v_add_co_u32 v0, s7, s7, v0
+; GFX10-CU-NEXT:    v_add_co_ci_u32_e64 v2, s6, s6, v1, s7
+; GFX10-CU-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] slc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 0
+; SKIP-CACHE-INV-NEXT:    ; implicit-def: $sgpr2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, v0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; SKIP-CACHE-INV-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[2:3], s2, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3]
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc slc
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
+; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 2
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e64 v2, s6, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 0
+; GFX90A-NOTTGSPLIT-NEXT:    ; implicit-def: $sgpr6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, s8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s8, s9
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, v3
+; GFX90A-NOTTGSPLIT-NEXT:    v_add_co_u32_e64 v0, s[6:7], s6, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90A-NOTTGSPLIT-NEXT:    v_addc_co_u32_e64 v2, s[6:7], v1, v2, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc slc
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
+; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s6
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 2
+; GFX90A-TGSPLIT-NEXT:    v_lshlrev_b32_e64 v2, s6, v0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 0
+; GFX90A-TGSPLIT-NEXT:    ; implicit-def: $sgpr6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-TGSPLIT-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX90A-TGSPLIT-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, s8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, v2
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s8, s9
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, v3
+; GFX90A-TGSPLIT-NEXT:    v_add_co_u32_e64 v0, s[6:7], s6, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90A-TGSPLIT-NEXT:    v_addc_co_u32_e64 v2, s[6:7], v1, v2, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc slc
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s4
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s4, 2
+; GFX940-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e64 v0, s4, v0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s4, 0
+; GFX940-NOTTGSPLIT-NEXT:    ; implicit-def: $sgpr4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] nt
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s4
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s4, 2
+; GFX940-TGSPLIT-NEXT:    v_lshlrev_b32_e64 v0, s4, v0
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s4, 0
+; GFX940-TGSPLIT-NEXT:    ; implicit-def: $sgpr4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX940-TGSPLIT-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] nt
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 2
+; GFX11-WGP-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 0
+; GFX11-WGP-NEXT:    ; implicit-def: $sgpr2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-WGP-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX11-WGP-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX11-WGP-NEXT:    s_mov_b32 s3, s4
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-WGP-NEXT:    s_mov_b32 s2, s5
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v2
+; GFX11-WGP-NEXT:    v_add_co_u32 v0, s3, s3, v0
+; GFX11-WGP-NEXT:    v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
+; GFX11-WGP-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] slc dlc
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_nontemporal_load_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX11-CU-NEXT:    s_mov_b32 s2, 2
+; GFX11-CU-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
+; GFX11-CU-NEXT:    s_mov_b32 s2, 0
+; GFX11-CU-NEXT:    ; implicit-def: $sgpr2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-CU-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX11-CU-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX11-CU-NEXT:    s_mov_b32 s3, s4
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-CU-NEXT:    s_mov_b32 s2, s5
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v2
+; GFX11-CU-NEXT:    v_add_co_u32 v0, s3, s3, v0
+; GFX11-CU-NEXT:    v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
+; GFX11-CU-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] slc dlc
-; GFX11-CU-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-WGP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x8
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 2
+; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 0
+; GFX12-WGP-NEXT:    ; implicit-def: $sgpr2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-WGP-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX12-WGP-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX12-WGP-NEXT:    s_mov_b32 s3, s4
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-WGP-NEXT:    s_mov_b32 s2, s5
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v2
+; GFX12-WGP-NEXT:    v_add_co_u32 v0, s3, s3, v0
+; GFX12-WGP-NEXT:    v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
+; GFX12-WGP-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_nontemporal_load_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-CU-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x8
+; GFX12-CU-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX12-CU-NEXT:    s_mov_b32 s2, 2
+; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
+; GFX12-CU-NEXT:    s_mov_b32 s2, 0
+; GFX12-CU-NEXT:    ; implicit-def: $sgpr2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-CU-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX12-CU-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX12-CU-NEXT:    s_mov_b32 s3, s4
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-CU-NEXT:    s_mov_b32 s2, s5
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v2
+; GFX12-CU-NEXT:    v_add_co_u32 v0, s3, s3, v0
+; GFX12-CU-NEXT:    v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
+; GFX12-CU-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -351,148 +539,178 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_0(
 ; GFX7-LABEL: flat_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 nt sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 nt sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2 glc slc dlc
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_nontemporal_store_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2 glc slc dlc
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 th:TH_STORE_NT
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_nontemporal_store_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 th:TH_STORE_NT
 ; GFX12-CU-NEXT:    s_endpgm
@@ -506,172 +724,334 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX7-LABEL: flat_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    flat_load_dword v2, v[1:2]
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    s_mov_b32 s4, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v3, s4, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0
+; GFX7-NEXT:    ; implicit-def: $sgpr4
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    s_mov_b32 s6, s7
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_addc_u32_e64 v3, s[4:5], v1, v3, s[4:5]
+; GFX7-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX10-WGP-NEXT:    v_add_co_u32 v0, s0, s2, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[1:2]
-; GFX10-WGP-NEXT:    v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
+; GFX10-WGP-NEXT:    s_mov_b32 s4, 2
+; GFX10-WGP-NEXT:    v_lshlrev_b32_e64 v3, s4, v0
+; GFX10-WGP-NEXT:    s_mov_b32 s4, 0
+; GFX10-WGP-NEXT:    ; implicit-def: $sgpr4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-WGP-NEXT:    v_add_co_u32 v0, s5, s5, v0
+; GFX10-WGP-NEXT:    v_add_co_ci_u32_e64 v3, s4, s4, v1, s5
+; GFX10-WGP-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT:    v_add_co_u32 v0, s0, s2, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[1:2]
-; GFX10-CU-NEXT:    v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
+; GFX10-CU-NEXT:    s_mov_b32 s4, 2
+; GFX10-CU-NEXT:    v_lshlrev_b32_e64 v3, s4, v0
+; GFX10-CU-NEXT:    s_mov_b32 s4, 0
+; GFX10-CU-NEXT:    ; implicit-def: $sgpr4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-CU-NEXT:    s_mov_b32 s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-CU-NEXT:    v_add_co_u32 v0, s5, s5, v0
+; GFX10-CU-NEXT:    v_add_co_ci_u32_e64 v3, s4, s4, v1, s5
+; GFX10-CU-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[1:2]
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; SKIP-CACHE-INV-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v3, s0, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, 0
+; SKIP-CACHE-INV-NEXT:    ; implicit-def: $sgpr0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v4, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, v3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v4
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[0:1], s0, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_addc_u32_e64 v3, s[0:1], v1, v3, s[0:1]
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[2:3]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90A-NOTTGSPLIT-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s4, 2
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e64 v4, s4, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s4, 0
+; GFX90A-NOTTGSPLIT-NEXT:    ; implicit-def: $sgpr4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v5, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s4, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, v4
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v5
+; GFX90A-NOTTGSPLIT-NEXT:    v_add_co_u32_e64 v0, s[4:5], s4, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[2:3]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90A-TGSPLIT-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
-; GFX90A-TGSPLIT-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s4, 2
+; GFX90A-TGSPLIT-NEXT:    v_lshlrev_b32_e64 v4, s4, v0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s4, 0
+; GFX90A-TGSPLIT-NEXT:    ; implicit-def: $sgpr4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v5, v0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s4, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, v4
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v5
+; GFX90A-TGSPLIT-NEXT:    v_add_co_u32_e64 v0, s[4:5], s4, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s2, 2
+; GFX940-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s2, 0
+; GFX940-NOTTGSPLIT-NEXT:    ; implicit-def: $sgpr2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v3
+; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 nt sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[2:3]
-; GFX940-TGSPLIT-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s2, 2
+; GFX940-TGSPLIT-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s2, 0
+; GFX940-TGSPLIT-NEXT:    ; implicit-def: $sgpr2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v3
+; GFX940-TGSPLIT-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 nt sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-WGP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-WGP-NEXT:    v_add_co_u32 v0, s0, s2, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[1:2]
-; GFX11-WGP-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-WGP-NEXT:    s_mov_b32 s0, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s0
+; GFX11-WGP-NEXT:    s_mov_b32 s0, 2
+; GFX11-WGP-NEXT:    v_lshlrev_b32_e64 v3, s0, v0
+; GFX11-WGP-NEXT:    s_mov_b32 s0, 0
+; GFX11-WGP-NEXT:    ; implicit-def: $sgpr0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-WGP-NEXT:    s_mov_b32 s1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-WGP-NEXT:    s_mov_b32 s0, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX11-WGP-NEXT:    v_add_co_u32 v0, s1, s1, v0
+; GFX11-WGP-NEXT:    v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
+; GFX11-WGP-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2 glc slc dlc
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_nontemporal_store_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-CU-NEXT:    v_add_co_u32 v0, s0, s2, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[1:2]
-; GFX11-CU-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-CU-NEXT:    s_mov_b32 s0, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s0
+; GFX11-CU-NEXT:    s_mov_b32 s0, 2
+; GFX11-CU-NEXT:    v_lshlrev_b32_e64 v3, s0, v0
+; GFX11-CU-NEXT:    s_mov_b32 s0, 0
+; GFX11-CU-NEXT:    ; implicit-def: $sgpr0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-CU-NEXT:    s_mov_b32 s1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-CU-NEXT:    s_mov_b32 s0, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v4
+; GFX11-CU-NEXT:    v_add_co_u32 v0, s1, s1, v0
+; GFX11-CU-NEXT:    v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
+; GFX11-CU-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2 glc slc dlc
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-WGP-NEXT:    v_add_co_u32 v0, s0, s2, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[1:2]
-; GFX12-WGP-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX12-WGP-NEXT:    s_mov_b32 s0, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s0
+; GFX12-WGP-NEXT:    s_mov_b32 s0, 2
+; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v3, s0, v0
+; GFX12-WGP-NEXT:    s_mov_b32 s0, 0
+; GFX12-WGP-NEXT:    ; implicit-def: $sgpr0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX12-WGP-NEXT:    s_mov_b32 s1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-WGP-NEXT:    s_mov_b32 s0, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX12-WGP-NEXT:    v_add_co_u32 v0, s1, s1, v0
+; GFX12-WGP-NEXT:    v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
+; GFX12-WGP-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 th:TH_STORE_NT
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_nontemporal_store_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-CU-NEXT:    v_add_co_u32 v0, s0, s2, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[1:2]
-; GFX12-CU-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX12-CU-NEXT:    s_mov_b32 s0, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v0, v0, s0
+; GFX12-CU-NEXT:    s_mov_b32 s0, 2
+; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v3, s0, v0
+; GFX12-CU-NEXT:    s_mov_b32 s0, 0
+; GFX12-CU-NEXT:    ; implicit-def: $sgpr0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v4, v0
+; GFX12-CU-NEXT:    s_mov_b32 s1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-CU-NEXT:    s_mov_b32 s0, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v4
+; GFX12-CU-NEXT:    v_add_co_u32 v0, s1, s1, v0
+; GFX12-CU-NEXT:    v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
+; GFX12-CU-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 th:TH_STORE_NT
 ; GFX12-CU-NEXT:    s_endpgm
@@ -687,158 +1067,192 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ; GFX7-LABEL: flat_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_volatile_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_nontemporal_volatile_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc dlc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_nontemporal_volatile_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc dlc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_nontemporal_volatile_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_nontemporal_volatile_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 84438d58d7e7..4c9ce15211e3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -1,162 +1,192 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @flat_singlethread_unordered_load(
 ; GFX7-LABEL: flat_singlethread_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -170,148 +200,178 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ; GFX7-LABEL: flat_singlethread_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -325,148 +385,178 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_load(
 ; GFX7-LABEL: flat_singlethread_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -480,148 +570,178 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ; GFX7-LABEL: flat_singlethread_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -635,43 +755,43 @@ entry:
 define amdgpu_kernel void @flat_singlethread_unordered_store(
 ; GFX7-LABEL: flat_singlethread_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -681,84 +801,88 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -771,43 +895,43 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ; GFX7-LABEL: flat_singlethread_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -817,84 +941,88 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -907,43 +1035,43 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_store(
 ; GFX7-LABEL: flat_singlethread_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -953,84 +1081,88 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1043,43 +1175,43 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ; GFX7-LABEL: flat_singlethread_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -1089,84 +1221,88 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1179,41 +1315,45 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1225,81 +1365,97 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1311,41 +1467,45 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1357,81 +1517,97 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1443,41 +1619,45 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1489,81 +1669,97 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1575,41 +1771,45 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1621,81 +1821,97 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1707,41 +1923,45 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1753,81 +1973,97 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1839,150 +2075,178 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1996,150 +2260,178 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2153,150 +2445,178 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2310,125 +2630,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2441,125 +2861,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2572,125 +3092,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2703,125 +3323,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2834,125 +3554,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2965,125 +3785,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3096,125 +4016,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3227,125 +4247,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3358,125 +4478,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3489,125 +4709,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3620,125 +4940,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3751,125 +5171,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3882,125 +5402,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -4013,125 +5633,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -4144,125 +5864,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -4275,65 +6095,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -4343,88 +6207,152 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4440,65 +6368,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -4508,88 +6480,152 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4605,65 +6641,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -4673,88 +6753,152 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4770,65 +6914,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -4838,88 +7026,152 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4935,65 +7187,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5003,88 +7299,152 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5100,65 +7460,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5168,88 +7572,152 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5265,65 +7733,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5333,88 +7845,152 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5430,65 +8006,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5498,88 +8118,152 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5595,65 +8279,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5663,88 +8391,152 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5760,65 +8552,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5828,88 +8664,152 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5925,65 +8825,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5993,88 +8937,152 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6090,65 +9098,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -6158,88 +9210,152 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6255,65 +9371,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -6323,88 +9483,152 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6420,65 +9644,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -6488,88 +9756,152 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6585,65 +9917,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -6653,88 +10029,152 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6750,148 +10190,178 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6905,148 +10375,178 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7060,148 +10560,178 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7215,148 +10745,178 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7370,43 +10930,43 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -7416,84 +10976,88 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7506,43 +11070,43 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -7552,84 +11116,88 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7642,43 +11210,43 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ; GFX7-LABEL: flat_singlethread_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -7688,84 +11256,88 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7778,43 +11350,43 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -7824,84 +11396,88 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7914,41 +11490,45 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7960,81 +11540,97 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8046,41 +11642,45 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8092,81 +11692,97 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8178,41 +11794,45 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8224,81 +11844,97 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8310,41 +11946,45 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8356,81 +11996,97 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8442,41 +12098,45 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8488,81 +12148,97 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8574,150 +12250,178 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8731,150 +12435,178 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8888,150 +12620,178 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -9045,125 +12805,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9176,125 +13036,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9307,125 +13267,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9438,125 +13498,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9569,125 +13729,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9700,125 +13960,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9831,125 +14191,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9962,125 +14422,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10093,125 +14653,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10224,125 +14884,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10355,125 +15115,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10486,125 +15346,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10617,125 +15577,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10748,125 +15808,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10879,125 +16039,225 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -11010,65 +16270,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11078,88 +16382,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11175,65 +16543,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11243,88 +16655,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11340,65 +16816,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11408,88 +16928,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11505,65 +17089,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11573,88 +17201,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11670,65 +17362,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11738,88 +17474,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11835,65 +17635,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11903,88 +17747,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12000,65 +17908,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12068,88 +18020,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12165,65 +18181,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12233,88 +18293,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12330,65 +18454,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12398,88 +18566,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12495,65 +18727,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12563,88 +18839,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12660,65 +19000,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12728,88 +19112,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12825,65 +19273,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12893,88 +19385,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12990,65 +19546,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -13058,88 +19658,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13155,65 +19819,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -13223,88 +19931,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13320,65 +20092,109 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -13388,88 +20204,152 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index 57a2e72c907e..3bb871467c23 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -1,162 +1,192 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @flat_system_unordered_load(
 ; GFX7-LABEL: flat_system_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -170,148 +200,178 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_load(
 ; GFX7-LABEL: flat_system_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -325,166 +385,200 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_load(
 ; GFX7-LABEL: flat_system_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %in, ptr %out) {
@@ -497,166 +591,222 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_load(
 ; GFX7-LABEL: flat_system_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %in, ptr %out) {
@@ -669,43 +819,43 @@ entry:
 define amdgpu_kernel void @flat_system_unordered_store(
 ; GFX7-LABEL: flat_system_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -715,84 +865,88 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -805,43 +959,43 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_store(
 ; GFX7-LABEL: flat_system_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -851,84 +1005,88 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -941,135 +1099,161 @@ entry:
 define amdgpu_kernel void @flat_system_release_store(
 ; GFX7-LABEL: flat_system_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr %out) {
@@ -1081,135 +1265,161 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_store(
 ; GFX7-LABEL: flat_system_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr %out) {
@@ -1221,41 +1431,45 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1267,81 +1481,97 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1353,12 +1583,13 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ; GFX7-LABEL: flat_system_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -1366,13 +1597,14 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1382,13 +1614,14 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1398,6 +1631,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1410,11 +1644,13 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -1423,11 +1659,13 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -1436,11 +1674,13 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -1448,11 +1688,13 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -1460,11 +1702,12 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_system_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1475,11 +1718,12 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_system_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1490,10 +1734,13 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: flat_system_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -1501,10 +1748,13 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_system_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -1518,131 +1768,173 @@ entry:
 define amdgpu_kernel void @flat_system_release_atomicrmw(
 ; GFX7-LABEL: flat_system_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1654,12 +1946,14 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -1667,13 +1961,16 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1683,13 +1980,16 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1699,24 +1999,29 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -1725,12 +2030,15 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -1739,12 +2047,15 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -1752,12 +2063,15 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -1765,12 +2079,15 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1780,12 +2097,15 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1795,10 +2115,17 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -1806,10 +2133,17 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -1823,12 +2157,14 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -1836,13 +2172,16 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1852,13 +2191,16 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1868,24 +2210,29 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -1894,12 +2241,15 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -1908,12 +2258,15 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -1921,12 +2274,15 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -1934,12 +2290,15 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1949,12 +2308,15 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1964,10 +2326,17 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -1975,10 +2344,17 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -1992,168 +2368,200 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -2166,172 +2574,226 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -2344,172 +2806,226 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -2522,125 +3038,225 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2653,14 +3269,26 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -2668,14 +3296,26 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2685,14 +3325,26 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2702,24 +3354,42 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -2728,10 +3398,16 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -2740,10 +3416,16 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -2751,10 +3433,16 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -2762,10 +3450,17 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2775,10 +3470,17 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2788,10 +3490,17 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -2799,10 +3508,17 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -2817,129 +3533,251 @@ entry:
 define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2952,14 +3790,27 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -2967,14 +3818,28 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2984,14 +3849,28 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3001,25 +3880,45 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -3028,11 +3927,18 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -3041,11 +3947,18 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -3053,11 +3966,18 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -3065,10 +3985,19 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3078,10 +4007,19 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3091,10 +4029,21 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -3102,10 +4051,21 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -3120,14 +4080,27 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3135,14 +4108,28 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3152,14 +4139,28 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3169,25 +4170,45 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -3196,11 +4217,18 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -3209,11 +4237,18 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -3221,11 +4256,18 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -3233,10 +4275,19 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3246,10 +4297,19 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3259,10 +4319,21 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -3270,10 +4341,21 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -3288,14 +4370,26 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3303,14 +4397,26 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3320,14 +4426,26 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3337,24 +4455,42 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -3363,10 +4499,16 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -3375,10 +4517,16 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -3386,10 +4534,16 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -3397,10 +4551,17 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3410,10 +4571,17 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3423,10 +4591,17 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -3434,10 +4609,17 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -3452,14 +4634,26 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3467,14 +4661,26 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3484,14 +4690,26 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3501,24 +4719,42 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -3527,10 +4763,16 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -3539,10 +4781,16 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -3550,10 +4798,16 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -3561,10 +4815,17 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3574,10 +4835,17 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3587,10 +4855,17 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -3598,10 +4873,17 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -3616,14 +4898,27 @@ entry:
 define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3631,14 +4926,28 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3648,14 +4957,28 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3665,25 +4988,45 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -3692,11 +5035,18 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -3705,11 +5055,18 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -3717,11 +5074,18 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -3729,10 +5093,19 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3742,10 +5115,19 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3755,10 +5137,21 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -3766,10 +5159,21 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -3784,14 +5188,27 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3799,14 +5216,28 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3816,14 +5247,28 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3833,25 +5278,45 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -3860,11 +5325,18 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -3873,11 +5345,18 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -3885,11 +5364,18 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -3897,10 +5383,19 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3910,10 +5405,19 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3923,10 +5427,21 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -3934,10 +5449,21 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -3952,14 +5478,27 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3967,14 +5506,28 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3984,14 +5537,28 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4001,25 +5568,45 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -4028,11 +5615,18 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -4041,11 +5635,18 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -4053,11 +5654,18 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -4065,10 +5673,19 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4078,10 +5695,19 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4091,10 +5717,21 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -4102,10 +5739,21 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -4120,14 +5768,27 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4135,14 +5796,28 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4152,14 +5827,28 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4169,25 +5858,45 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -4196,11 +5905,18 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -4209,11 +5925,18 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -4221,11 +5944,18 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -4233,10 +5963,19 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4246,10 +5985,19 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4259,10 +6007,21 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -4270,10 +6029,21 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -4288,14 +6058,27 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4303,14 +6086,28 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4320,14 +6117,28 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4337,25 +6148,45 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -4364,11 +6195,18 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -4377,11 +6215,18 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -4389,11 +6234,18 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -4401,10 +6253,19 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4414,10 +6275,19 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4427,10 +6297,21 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -4438,10 +6319,21 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -4456,14 +6348,27 @@ entry:
 define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4471,14 +6376,28 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4488,14 +6407,28 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4505,25 +6438,45 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -4532,11 +6485,18 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -4545,11 +6505,18 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -4557,11 +6524,18 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -4569,10 +6543,19 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4582,10 +6565,19 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4595,10 +6587,21 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -4606,10 +6609,21 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -4624,14 +6638,27 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4639,14 +6666,28 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4656,14 +6697,28 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4673,25 +6728,45 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -4700,11 +6775,18 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -4713,11 +6795,18 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -4725,11 +6814,18 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -4737,10 +6833,19 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4750,10 +6855,19 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4763,10 +6877,21 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -4774,10 +6899,21 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -4792,14 +6928,27 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4807,14 +6956,28 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4824,14 +6987,28 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4841,25 +7018,45 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -4868,11 +7065,18 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -4881,11 +7085,18 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -4893,11 +7104,18 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -4905,10 +7123,19 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4918,10 +7145,19 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -4931,10 +7167,21 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -4942,10 +7189,21 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -4960,65 +7218,109 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5028,88 +7330,152 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5125,70 +7491,114 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5198,101 +7608,169 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -5307,65 +7785,115 @@ entry:
 define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5375,92 +7903,172 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5476,70 +8084,120 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5549,105 +8207,189 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -5662,70 +8404,120 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5735,105 +8527,189 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -5848,70 +8724,114 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5921,101 +8841,169 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6030,70 +9018,114 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6103,101 +9135,169 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6212,70 +9312,120 @@ entry:
 define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6285,105 +9435,189 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6398,70 +9632,120 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6471,105 +9755,189 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6584,70 +9952,120 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6657,105 +10075,189 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6770,70 +10272,120 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6843,105 +10395,189 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6956,70 +10592,120 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -7029,105 +10715,189 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -7142,70 +10912,120 @@ entry:
 define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -7215,105 +11035,189 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -7328,70 +11232,120 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -7401,105 +11355,189 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -7514,70 +11552,120 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -7587,105 +11675,189 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -7700,148 +11872,178 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ; GFX7-LABEL: flat_system_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7855,148 +12057,178 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ; GFX7-LABEL: flat_system_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8010,175 +12242,209 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ; GFX7-LABEL: flat_system_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8192,175 +12458,231 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8374,43 +12696,43 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ; GFX7-LABEL: flat_system_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -8420,84 +12742,88 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8510,43 +12836,43 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ; GFX7-LABEL: flat_system_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -8556,84 +12882,88 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8646,135 +12976,161 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_store(
 ; GFX7-LABEL: flat_system_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr %out) {
@@ -8786,135 +13142,161 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr %out) {
@@ -8926,41 +13308,45 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8972,81 +13358,97 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -9058,12 +13460,13 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -9071,13 +13474,14 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -9086,13 +13490,14 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -9101,6 +13506,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9113,11 +13519,13 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -9126,11 +13534,13 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -9139,11 +13549,13 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -9151,11 +13563,13 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -9163,11 +13577,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -9177,11 +13592,12 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -9191,10 +13607,13 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -9202,10 +13621,13 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -9219,131 +13641,173 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -9355,12 +13819,14 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -9368,13 +13834,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -9383,13 +13852,16 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -9398,24 +13870,29 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -9424,12 +13901,15 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -9438,12 +13918,15 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -9451,12 +13934,15 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -9464,12 +13950,15 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -9478,12 +13967,15 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -9492,10 +13984,17 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -9503,10 +14002,17 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -9520,12 +14026,14 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -9533,13 +14041,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -9548,13 +14059,16 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -9563,24 +14077,29 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -9589,12 +14108,15 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -9603,12 +14125,15 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -9616,12 +14141,15 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -9629,12 +14157,15 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -9643,12 +14174,15 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -9657,10 +14191,17 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -9668,10 +14209,17 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -9685,176 +14233,209 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -9868,180 +14449,235 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -10055,180 +14691,235 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -10242,125 +14933,225 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10373,14 +15164,26 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -10388,14 +15191,26 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -10404,14 +15219,26 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -10420,24 +15247,42 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -10446,10 +15291,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -10458,10 +15309,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -10469,10 +15326,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -10480,10 +15343,17 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -10492,10 +15362,17 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -10504,10 +15381,17 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -10515,10 +15399,17 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -10533,129 +15424,251 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10668,14 +15681,27 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -10683,14 +15709,28 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -10699,14 +15739,28 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -10715,25 +15769,45 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -10742,11 +15816,18 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -10755,11 +15836,18 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -10767,11 +15855,18 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -10779,10 +15874,19 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -10791,10 +15895,19 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -10803,10 +15916,21 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -10814,10 +15938,21 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -10832,14 +15967,27 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -10847,14 +15995,28 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -10863,14 +16025,28 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -10879,25 +16055,45 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -10906,11 +16102,18 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -10919,11 +16122,18 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -10931,11 +16141,18 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -10943,10 +16160,19 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -10955,10 +16181,19 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -10967,10 +16202,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -10978,10 +16224,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -10996,14 +16253,26 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11011,14 +16280,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -11027,14 +16308,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -11043,24 +16336,42 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -11069,10 +16380,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -11081,10 +16398,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -11092,10 +16415,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -11103,10 +16432,17 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -11115,10 +16451,17 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -11127,10 +16470,17 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -11138,10 +16488,17 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -11156,14 +16513,26 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11171,14 +16540,26 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -11187,14 +16568,26 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -11203,24 +16596,42 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -11229,10 +16640,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -11241,10 +16658,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -11252,10 +16675,16 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -11263,10 +16692,17 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -11275,10 +16711,17 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -11287,10 +16730,17 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -11298,10 +16748,17 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -11316,14 +16773,27 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11331,14 +16801,28 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -11347,14 +16831,28 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -11363,25 +16861,45 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -11390,11 +16908,18 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -11403,11 +16928,18 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -11415,11 +16947,18 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -11427,10 +16966,19 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -11439,10 +16987,19 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -11451,10 +17008,21 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -11462,10 +17030,21 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -11480,14 +17059,27 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11495,14 +17087,28 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -11511,14 +17117,28 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -11527,25 +17147,45 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -11554,11 +17194,18 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -11567,11 +17214,18 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -11579,11 +17233,18 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -11591,10 +17252,19 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -11603,10 +17273,19 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -11615,10 +17294,21 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -11626,10 +17316,21 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -11644,14 +17345,27 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11659,14 +17373,28 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -11675,14 +17403,28 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -11691,25 +17433,45 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -11718,11 +17480,18 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -11731,11 +17500,18 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -11743,11 +17519,18 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -11755,10 +17538,19 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -11767,10 +17559,19 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -11779,10 +17580,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -11790,10 +17602,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -11808,14 +17631,27 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11823,14 +17659,28 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -11839,14 +17689,28 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -11855,25 +17719,45 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -11882,11 +17766,18 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -11895,11 +17786,18 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -11907,11 +17805,18 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -11919,10 +17824,19 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -11931,10 +17845,19 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -11943,10 +17866,21 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -11954,10 +17888,21 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -11972,14 +17917,27 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11987,14 +17945,28 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -12003,14 +17975,28 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -12019,25 +18005,45 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -12046,11 +18052,18 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -12059,11 +18072,18 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -12071,11 +18091,18 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -12083,10 +18110,19 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -12095,10 +18131,19 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -12107,10 +18152,21 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -12118,10 +18174,21 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -12136,14 +18203,27 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -12151,14 +18231,28 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -12167,14 +18261,28 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -12183,25 +18291,45 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -12210,11 +18338,18 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -12223,11 +18358,18 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -12235,11 +18377,18 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -12247,10 +18396,19 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -12259,10 +18417,19 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -12271,10 +18438,21 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -12282,10 +18460,21 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -12300,14 +18489,27 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -12315,14 +18517,28 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -12331,14 +18547,28 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -12347,25 +18577,45 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -12374,11 +18624,18 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -12387,11 +18644,18 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -12399,11 +18663,18 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -12411,10 +18682,19 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -12423,10 +18703,19 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -12435,10 +18724,21 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -12446,10 +18746,21 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -12464,14 +18775,27 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -12479,14 +18803,28 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
@@ -12495,14 +18833,28 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
@@ -12511,25 +18863,45 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
@@ -12538,11 +18910,18 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
@@ -12551,11 +18930,18 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -12563,11 +18949,18 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
@@ -12575,10 +18968,19 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -12587,10 +18989,19 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -12599,10 +19010,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -12610,10 +19032,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -12628,65 +19061,109 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12696,88 +19173,152 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12793,73 +19334,117 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -12870,106 +19455,174 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12985,65 +19638,115 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -13053,92 +19756,172 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13154,73 +19937,123 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -13231,110 +20064,194 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13350,73 +20267,123 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -13427,110 +20394,194 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13546,73 +20597,117 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -13623,106 +20718,174 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13738,73 +20901,117 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -13815,106 +21022,174 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13930,73 +21205,123 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -14007,110 +21332,194 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -14126,73 +21535,123 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -14203,110 +21662,194 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -14322,73 +21865,123 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -14399,110 +21992,194 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -14518,73 +22195,123 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -14595,110 +22322,194 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -14714,73 +22525,123 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -14791,110 +22652,194 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -14910,73 +22855,123 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -14987,110 +22982,194 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -15106,73 +23185,123 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -15183,110 +23312,194 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -15302,73 +23515,123 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -15379,110 +23642,194 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index 735e541b9f7c..8708fcf3b459 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -1,114 +1,144 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @flat_nontemporal_load_0(
 ; GFX7-LABEL: flat_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc dlc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_nontemporal_load_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc dlc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_nontemporal_load_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -122,122 +152,238 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX7-LABEL: flat_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s6, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v1, s6, v0
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    ; implicit-def: $sgpr6
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    s_mov_b32 s6, s8
+; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_mov_b32 s8, s9
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
+; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_addc_u32_e64 v2, s[6:7], v1, v2, s[6:7]
+; GFX7-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
+; GFX10-WGP-NEXT:    v_lshlrev_b32_e64 v1, s6, v0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, 0
+; GFX10-WGP-NEXT:    ; implicit-def: $sgpr6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX10-WGP-NEXT:    v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v2
+; GFX10-WGP-NEXT:    v_add_co_u32 v0, s7, s7, v0
+; GFX10-WGP-NEXT:    v_add_co_ci_u32_e64 v2, s6, s6, v1, s7
+; GFX10-WGP-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
+; GFX10-CU-NEXT:    s_mov_b32 s6, 2
+; GFX10-CU-NEXT:    v_lshlrev_b32_e64 v1, s6, v0
+; GFX10-CU-NEXT:    s_mov_b32 s6, 0
+; GFX10-CU-NEXT:    ; implicit-def: $sgpr6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX10-CU-NEXT:    v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX10-CU-NEXT:    s_mov_b32 s7, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-CU-NEXT:    s_mov_b32 s6, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v2
+; GFX10-CU-NEXT:    v_add_co_u32 v0, s7, s7, v0
+; GFX10-CU-NEXT:    v_add_co_ci_u32_e64 v2, s6, s6, v1, s7
+; GFX10-CU-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 0
+; SKIP-CACHE-INV-NEXT:    ; implicit-def: $sgpr2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, v0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; SKIP-CACHE-INV-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[2:3], s2, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_addc_u32_e64 v2, s[2:3], v1, v2, s[2:3]
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 2
+; GFX11-WGP-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 0
+; GFX11-WGP-NEXT:    ; implicit-def: $sgpr2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-WGP-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX11-WGP-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX11-WGP-NEXT:    s_mov_b32 s3, s4
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-WGP-NEXT:    s_mov_b32 s2, s5
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v2
+; GFX11-WGP-NEXT:    v_add_co_u32 v0, s3, s3, v0
+; GFX11-WGP-NEXT:    v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
+; GFX11-WGP-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc dlc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_nontemporal_load_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX11-CU-NEXT:    s_mov_b32 s2, 2
+; GFX11-CU-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
+; GFX11-CU-NEXT:    s_mov_b32 s2, 0
+; GFX11-CU-NEXT:    ; implicit-def: $sgpr2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-CU-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX11-CU-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX11-CU-NEXT:    s_mov_b32 s3, s4
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-CU-NEXT:    s_mov_b32 s2, s5
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v2
+; GFX11-CU-NEXT:    v_add_co_u32 v0, s3, s3, v0
+; GFX11-CU-NEXT:    v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
+; GFX11-CU-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc dlc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-WGP-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x8
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 2
+; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 0
+; GFX12-WGP-NEXT:    ; implicit-def: $sgpr2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-WGP-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX12-WGP-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX12-WGP-NEXT:    s_mov_b32 s3, s4
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-WGP-NEXT:    s_mov_b32 s2, s5
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v2
+; GFX12-WGP-NEXT:    v_add_co_u32 v0, s3, s3, v0
+; GFX12-WGP-NEXT:    v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
+; GFX12-WGP-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_nontemporal_load_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-CU-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x8
+; GFX12-CU-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX12-CU-NEXT:    s_mov_b32 s2, 2
+; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
+; GFX12-CU-NEXT:    s_mov_b32 s2, 0
+; GFX12-CU-NEXT:    ; implicit-def: $sgpr2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-CU-NEXT:    v_add_co_u32 v0, s0, s0, v0
-; GFX12-CU-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
+; GFX12-CU-NEXT:    s_mov_b32 s3, s4
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, v1
+; GFX12-CU-NEXT:    s_mov_b32 s2, s5
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v2
+; GFX12-CU-NEXT:    v_add_co_u32 v0, s3, s3, v0
+; GFX12-CU-NEXT:    v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
+; GFX12-CU-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -253,13 +399,15 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_0(
 ; GFX7-LABEL: flat_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -267,13 +415,16 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -281,13 +432,16 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -295,13 +449,15 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -309,11 +465,15 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX11-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2 dlc
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -321,11 +481,15 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX11-CU-LABEL: flat_nontemporal_store_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2 dlc
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -333,11 +497,19 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX12-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
@@ -345,11 +517,19 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX12-CU-LABEL: flat_nontemporal_store_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
@@ -364,15 +544,28 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX7-LABEL: flat_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    flat_load_dword v2, v[1:2]
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    s_mov_b32 s4, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v3, s4, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0
+; GFX7-NEXT:    ; implicit-def: $sgpr4
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    s_mov_b32 s6, s7
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_addc_u32_e64 v3, s[4:5], v1, v3, s[4:5]
+; GFX7-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
@@ -380,14 +573,28 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX10-WGP-NEXT:    v_add_co_u32 v0, s0, s2, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[1:2]
-; GFX10-WGP-NEXT:    v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
+; GFX10-WGP-NEXT:    s_mov_b32 s4, 2
+; GFX10-WGP-NEXT:    v_lshlrev_b32_e64 v3, s4, v0
+; GFX10-WGP-NEXT:    s_mov_b32 s4, 0
+; GFX10-WGP-NEXT:    ; implicit-def: $sgpr4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-WGP-NEXT:    v_add_co_u32 v0, s5, s5, v0
+; GFX10-WGP-NEXT:    v_add_co_ci_u32_e64 v3, s4, s4, v1, s5
+; GFX10-WGP-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -395,14 +602,28 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT:    v_add_co_u32 v0, s0, s2, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[1:2]
-; GFX10-CU-NEXT:    v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
+; GFX10-CU-NEXT:    s_mov_b32 s4, 2
+; GFX10-CU-NEXT:    v_lshlrev_b32_e64 v3, s4, v0
+; GFX10-CU-NEXT:    s_mov_b32 s4, 0
+; GFX10-CU-NEXT:    ; implicit-def: $sgpr4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-CU-NEXT:    s_mov_b32 s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-CU-NEXT:    v_add_co_u32 v0, s5, s5, v0
+; GFX10-CU-NEXT:    v_add_co_ci_u32_e64 v3, s4, s4, v1, s5
+; GFX10-CU-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -410,15 +631,28 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[1:2]
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; SKIP-CACHE-INV-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v3, s0, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, 0
+; SKIP-CACHE-INV-NEXT:    ; implicit-def: $sgpr0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v4, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, v3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v4
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[0:1], s0, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_addc_u32_e64 v3, s[0:1], v1, v3, s[0:1]
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -426,14 +660,29 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX11-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-WGP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-WGP-NEXT:    v_add_co_u32 v0, s0, s2, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[1:2]
-; GFX11-WGP-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-WGP-NEXT:    s_mov_b32 s0, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s0
+; GFX11-WGP-NEXT:    s_mov_b32 s0, 2
+; GFX11-WGP-NEXT:    v_lshlrev_b32_e64 v3, s0, v0
+; GFX11-WGP-NEXT:    s_mov_b32 s0, 0
+; GFX11-WGP-NEXT:    ; implicit-def: $sgpr0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-WGP-NEXT:    s_mov_b32 s1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-WGP-NEXT:    s_mov_b32 s0, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX11-WGP-NEXT:    v_add_co_u32 v0, s1, s1, v0
+; GFX11-WGP-NEXT:    v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
+; GFX11-WGP-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2 dlc
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -441,14 +690,29 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX11-CU-LABEL: flat_nontemporal_store_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-CU-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-CU-NEXT:    v_add_co_u32 v0, s0, s2, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[1:2]
-; GFX11-CU-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-CU-NEXT:    s_mov_b32 s0, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s0
+; GFX11-CU-NEXT:    s_mov_b32 s0, 2
+; GFX11-CU-NEXT:    v_lshlrev_b32_e64 v3, s0, v0
+; GFX11-CU-NEXT:    s_mov_b32 s0, 0
+; GFX11-CU-NEXT:    ; implicit-def: $sgpr0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-CU-NEXT:    s_mov_b32 s1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-CU-NEXT:    s_mov_b32 s0, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v4
+; GFX11-CU-NEXT:    v_add_co_u32 v0, s1, s1, v0
+; GFX11-CU-NEXT:    v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
+; GFX11-CU-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2 dlc
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -456,14 +720,33 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX12-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-WGP-NEXT:    v_add_co_u32 v0, s0, s2, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[1:2]
-; GFX12-WGP-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX12-WGP-NEXT:    s_mov_b32 s0, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s0
+; GFX12-WGP-NEXT:    s_mov_b32 s0, 2
+; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v3, s0, v0
+; GFX12-WGP-NEXT:    s_mov_b32 s0, 0
+; GFX12-WGP-NEXT:    ; implicit-def: $sgpr0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX12-WGP-NEXT:    s_mov_b32 s1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-WGP-NEXT:    s_mov_b32 s0, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX12-WGP-NEXT:    v_add_co_u32 v0, s1, s1, v0
+; GFX12-WGP-NEXT:    v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
+; GFX12-WGP-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
@@ -471,14 +754,33 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX12-CU-LABEL: flat_nontemporal_store_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-CU-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-CU-NEXT:    v_add_co_u32 v0, s0, s2, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[1:2]
-; GFX12-CU-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX12-CU-NEXT:    s_mov_b32 s0, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v0, v0, s0
+; GFX12-CU-NEXT:    s_mov_b32 s0, 2
+; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v3, s0, v0
+; GFX12-CU-NEXT:    s_mov_b32 s0, 0
+; GFX12-CU-NEXT:    ; implicit-def: $sgpr0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v4, v0
+; GFX12-CU-NEXT:    s_mov_b32 s1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-CU-NEXT:    s_mov_b32 s0, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v4
+; GFX12-CU-NEXT:    v_add_co_u32 v0, s1, s1, v0
+; GFX12-CU-NEXT:    v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
+; GFX12-CU-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v3
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
@@ -495,104 +797,132 @@ entry:
 define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ; GFX7-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -606,91 +936,104 @@ entry:
 define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ; GFX7-LABEL: flat_volatile_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_volatile_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_volatile_workgroup_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_volatile_workgroup_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_volatile_workgroup_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_volatile_workgroup_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
    i32 %in, ptr %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index 9a1209bc47e8..c7826181cc8d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -1,162 +1,192 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @flat_wavefront_unordered_load(
 ; GFX7-LABEL: flat_wavefront_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -170,148 +200,178 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ; GFX7-LABEL: flat_wavefront_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -325,148 +385,178 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_load(
 ; GFX7-LABEL: flat_wavefront_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -480,148 +570,178 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ; GFX7-LABEL: flat_wavefront_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -635,43 +755,43 @@ entry:
 define amdgpu_kernel void @flat_wavefront_unordered_store(
 ; GFX7-LABEL: flat_wavefront_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -681,84 +801,88 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -771,43 +895,43 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ; GFX7-LABEL: flat_wavefront_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -817,84 +941,88 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -907,43 +1035,43 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_store(
 ; GFX7-LABEL: flat_wavefront_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -953,84 +1081,88 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1043,43 +1175,43 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ; GFX7-LABEL: flat_wavefront_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -1089,84 +1221,88 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1179,41 +1315,45 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1225,81 +1365,97 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1311,41 +1467,45 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1357,81 +1517,97 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1443,41 +1619,45 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1489,81 +1669,97 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1575,41 +1771,45 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1621,81 +1821,97 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1707,41 +1923,45 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1753,81 +1973,97 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1839,150 +2075,178 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1996,150 +2260,178 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2153,150 +2445,178 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2310,125 +2630,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2441,125 +2861,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2572,125 +3092,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2703,125 +3323,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2834,125 +3554,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2965,125 +3785,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3096,125 +4016,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3227,125 +4247,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3358,125 +4478,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3489,125 +4709,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3620,125 +4940,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3751,125 +5171,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -3882,125 +5402,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -4013,125 +5633,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -4144,125 +5864,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -4275,65 +6095,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -4343,88 +6207,152 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4440,65 +6368,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -4508,88 +6480,152 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4605,65 +6641,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -4673,88 +6753,152 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4770,65 +6914,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -4838,88 +7026,152 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4935,65 +7187,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5003,88 +7299,152 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5100,65 +7460,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5168,88 +7572,152 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5265,65 +7733,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5333,88 +7845,152 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5430,65 +8006,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5498,88 +8118,152 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5595,65 +8279,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5663,88 +8391,152 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5760,65 +8552,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5828,88 +8664,152 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5925,65 +8825,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -5993,88 +8937,152 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6090,65 +9098,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -6158,88 +9210,152 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6255,65 +9371,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -6323,88 +9483,152 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6420,65 +9644,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -6488,88 +9756,152 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6585,65 +9917,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -6653,88 +10029,152 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6750,148 +10190,178 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ; GFX7-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6905,148 +10375,178 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7060,148 +10560,178 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7215,148 +10745,178 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7370,43 +10930,43 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ; GFX7-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -7416,84 +10976,88 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7506,43 +11070,43 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -7552,84 +11116,88 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7642,43 +11210,43 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ; GFX7-LABEL: flat_wavefront_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -7688,84 +11256,88 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7778,43 +11350,43 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -7824,84 +11396,88 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7914,41 +11490,45 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7960,81 +11540,97 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8046,41 +11642,45 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8092,81 +11692,97 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8178,41 +11794,45 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8224,81 +11844,97 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8310,41 +11946,45 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8356,81 +11996,97 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8442,41 +12098,45 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8488,81 +12148,97 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8574,150 +12250,178 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8731,150 +12435,178 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8888,150 +12620,178 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -9045,125 +12805,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9176,125 +13036,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9307,125 +13267,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9438,125 +13498,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9569,125 +13729,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9700,125 +13960,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9831,125 +14191,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9962,125 +14422,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10093,125 +14653,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10224,125 +14884,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10355,125 +15115,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10486,125 +15346,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10617,125 +15577,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10748,125 +15808,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10879,125 +16039,225 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -11010,65 +16270,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11078,88 +16382,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11175,65 +16543,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11243,88 +16655,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11340,65 +16816,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11408,88 +16928,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11505,65 +17089,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11573,88 +17201,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11670,65 +17362,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11738,88 +17474,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11835,65 +17635,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11903,88 +17747,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12000,65 +17908,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12068,88 +18020,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12165,65 +18181,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12233,88 +18293,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12330,65 +18454,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12398,88 +18566,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12495,65 +18727,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12563,88 +18839,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12660,65 +19000,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12728,88 +19112,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12825,65 +19273,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12893,88 +19385,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12990,65 +19546,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -13058,88 +19658,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13155,65 +19819,109 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -13223,88 +19931,152 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 13affefad5df..405168a1b5c2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -1,162 +1,192 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @flat_workgroup_unordered_load(
 ; GFX7-LABEL: flat_workgroup_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -170,148 +200,178 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ; GFX7-LABEL: flat_workgroup_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -325,160 +385,192 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_load(
 ; GFX7-LABEL: flat_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -492,160 +584,209 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ; GFX7-LABEL: flat_workgroup_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -659,43 +800,43 @@ entry:
 define amdgpu_kernel void @flat_workgroup_unordered_store(
 ; GFX7-LABEL: flat_workgroup_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -705,84 +846,88 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -795,43 +940,43 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ; GFX7-LABEL: flat_workgroup_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -841,84 +986,88 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -931,131 +1080,152 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_store(
 ; GFX7-LABEL: flat_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr %out) {
@@ -1067,131 +1237,152 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ; GFX7-LABEL: flat_workgroup_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr %out) {
@@ -1203,41 +1394,45 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1249,81 +1444,97 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1335,25 +1546,27 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1362,19 +1575,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1387,22 +1602,26 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -1410,22 +1629,26 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -1433,11 +1656,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1447,11 +1671,12 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1459,10 +1684,13 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1470,10 +1698,13 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1486,127 +1717,164 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -1618,25 +1886,30 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1645,47 +1918,57 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -1693,22 +1976,28 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -1716,12 +2005,15 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1730,22 +2022,31 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1753,10 +2054,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1769,25 +2074,30 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1796,47 +2106,57 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -1844,22 +2164,28 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -1867,12 +2193,15 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1881,22 +2210,31 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1904,10 +2242,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1920,156 +2262,193 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -2082,156 +2461,210 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -2244,156 +2677,210 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -2406,125 +2893,225 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2537,28 +3124,52 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2567,48 +3178,84 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -2616,20 +3263,32 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -2637,10 +3296,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2649,20 +3315,34 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -2670,10 +3350,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2687,125 +3374,242 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -2818,28 +3622,55 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2848,48 +3679,88 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -2897,20 +3768,34 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -2918,10 +3803,19 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2930,20 +3824,39 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -2951,10 +3864,18 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2968,28 +3889,55 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -2998,48 +3946,88 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3047,20 +4035,34 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -3068,10 +4070,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3080,20 +4091,39 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3101,10 +4131,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3118,28 +4156,52 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3148,48 +4210,84 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3197,20 +4295,32 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -3218,10 +4328,17 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3230,20 +4347,34 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3251,10 +4382,17 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3268,28 +4406,52 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3298,48 +4460,84 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3347,20 +4545,32 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -3368,10 +4578,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3380,20 +4597,34 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3401,10 +4632,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3418,28 +4656,55 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3448,48 +4713,88 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3497,20 +4802,34 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -3518,10 +4837,19 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3530,20 +4858,39 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3551,10 +4898,18 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3568,28 +4923,55 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3598,48 +4980,88 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3647,20 +5069,34 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -3668,10 +5104,19 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3680,20 +5125,39 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3701,10 +5165,18 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3718,28 +5190,55 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3748,48 +5247,88 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3797,20 +5336,34 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -3818,10 +5371,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3830,20 +5392,39 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3851,10 +5432,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3868,28 +5457,55 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3898,48 +5514,88 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3947,20 +5603,34 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -3968,10 +5638,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3980,20 +5659,39 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -4001,10 +5699,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4018,65 +5724,109 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -4086,88 +5836,152 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4183,68 +5997,112 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -4255,93 +6113,163 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -4356,65 +6284,114 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -4424,88 +6401,164 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4521,68 +6574,117 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -4593,93 +6695,175 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -4694,68 +6878,117 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -4766,93 +6999,175 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -4867,68 +7182,112 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -4939,93 +7298,163 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -5040,68 +7469,112 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5112,93 +7585,163 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -5213,68 +7756,117 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5285,93 +7877,175 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -5386,68 +8060,117 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5458,93 +8181,175 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -5559,68 +8364,117 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5631,93 +8485,175 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -5732,68 +8668,117 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5804,93 +8789,175 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -5905,68 +8972,117 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5977,93 +9093,175 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6078,68 +9276,117 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6150,93 +9397,175 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6251,68 +9580,117 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6323,93 +9701,175 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6424,68 +9884,117 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -6496,93 +10005,175 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
-; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
-; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -6597,148 +10188,178 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6752,148 +10373,178 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6907,156 +10558,188 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7070,156 +10753,198 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7233,43 +10958,43 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -7279,84 +11004,88 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7369,43 +11098,43 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -7415,84 +11144,88 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7505,43 +11238,45 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ; GFX7-LABEL: flat_workgroup_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -7551,84 +11286,96 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7641,43 +11388,45 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
@@ -7687,84 +11436,96 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x8
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -7777,41 +11538,45 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7823,81 +11588,97 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -7909,24 +11690,26 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -7934,18 +11717,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7957,21 +11742,25 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -7979,21 +11768,25 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -8001,11 +11794,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -8014,21 +11808,25 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -8036,10 +11834,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8051,41 +11852,47 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8097,81 +11904,105 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8183,24 +12014,28 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -8208,18 +12043,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8231,21 +12068,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -8253,21 +12095,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -8275,12 +12122,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -8288,21 +12138,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -8310,10 +12168,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8325,24 +12186,28 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -8350,18 +12215,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8373,21 +12240,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -8395,21 +12267,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -8417,12 +12294,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -8430,21 +12310,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -8452,10 +12340,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in) {
@@ -8467,158 +12358,188 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8632,158 +12553,198 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8797,158 +12758,198 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8962,125 +12963,225 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9093,27 +13194,51 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -9121,45 +13246,81 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9167,19 +13328,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -9187,10 +13360,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -9198,19 +13378,33 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -9218,10 +13412,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9234,125 +13435,235 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9365,27 +13676,53 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -9393,45 +13730,82 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9439,19 +13813,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -9459,10 +13846,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -9470,19 +13866,37 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -9490,10 +13904,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9506,27 +13927,53 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -9534,45 +13981,82 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9580,19 +14064,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -9600,10 +14097,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -9611,19 +14117,37 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -9631,10 +14155,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9647,27 +14178,51 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -9675,45 +14230,81 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9721,19 +14312,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -9741,10 +14344,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -9752,19 +14362,33 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -9772,10 +14396,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9788,27 +14419,51 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -9816,45 +14471,81 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9862,19 +14553,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -9882,10 +14585,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -9893,19 +14603,33 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -9913,10 +14637,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -9929,27 +14660,53 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -9957,45 +14714,82 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10003,19 +14797,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -10023,10 +14830,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -10034,19 +14850,37 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -10054,10 +14888,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10070,27 +14911,53 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -10098,45 +14965,82 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10144,19 +15048,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -10164,10 +15081,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -10175,19 +15101,37 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -10195,10 +15139,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10211,27 +15162,53 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -10239,45 +15216,82 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10285,19 +15299,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -10305,10 +15332,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -10316,19 +15352,37 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -10336,10 +15390,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10352,27 +15413,53 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -10380,45 +15467,82 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10426,19 +15550,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -10446,10 +15583,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -10457,19 +15603,37 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -10477,10 +15641,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10493,27 +15664,53 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -10521,45 +15718,82 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10567,19 +15801,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -10587,10 +15834,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -10598,19 +15854,37 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -10618,10 +15892,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10634,27 +15915,53 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -10662,45 +15969,82 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10708,19 +16052,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -10728,10 +16085,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -10739,19 +16105,37 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -10759,10 +16143,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10775,27 +16166,53 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -10803,45 +16220,82 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10849,19 +16303,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -10869,10 +16336,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -10880,19 +16356,37 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -10900,10 +16394,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -10916,27 +16417,53 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -10944,45 +16471,82 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s1, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10990,19 +16554,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
@@ -11010,10 +16587,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -11021,19 +16607,37 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -11041,10 +16645,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr %out, i32 %in, i32 %old) {
@@ -11057,65 +16668,109 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11125,88 +16780,152 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11222,67 +16941,111 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11292,94 +17055,160 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11395,65 +17224,111 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11463,88 +17338,160 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11560,67 +17507,113 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11630,94 +17623,168 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11733,67 +17800,113 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11803,94 +17916,168 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11906,67 +18093,111 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -11976,94 +18207,160 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12079,67 +18376,111 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12149,94 +18490,160 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12252,67 +18659,113 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12322,94 +18775,168 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12425,67 +18952,113 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12495,94 +19068,168 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12598,67 +19245,113 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12668,94 +19361,168 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12771,67 +19538,113 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -12841,94 +19654,168 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12944,67 +19831,113 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -13014,94 +19947,168 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13117,67 +20124,113 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -13187,94 +20240,168 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13290,67 +20417,113 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -13360,94 +20533,168 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13463,67 +20710,113 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16
-; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s0, 16
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
 ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
@@ -13533,94 +20826,168 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-CU-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index d02b937dcc92..793a1bab76d3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -1,168 +1,206 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @global_agent_unordered_load(
 ; GFX6-LABEL: global_agent_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -174,153 +212,191 @@ entry:
 define amdgpu_kernel void @global_agent_monotonic_load(
 ; GFX6-LABEL: global_agent_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -332,169 +408,211 @@ entry:
 define amdgpu_kernel void @global_agent_acquire_load(
 ; GFX6-LABEL: global_agent_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -506,169 +624,226 @@ entry:
 define amdgpu_kernel void @global_agent_seq_cst_load(
 ; GFX6-LABEL: global_agent_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -680,145 +855,163 @@ entry:
 define amdgpu_kernel void @global_agent_unordered_store(
 ; GFX6-LABEL: global_agent_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -829,145 +1022,163 @@ entry:
 define amdgpu_kernel void @global_agent_monotonic_store(
 ; GFX6-LABEL: global_agent_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -978,147 +1189,188 @@ entry:
 define amdgpu_kernel void @global_agent_release_store(
 ; GFX6-LABEL: global_agent_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -1129,147 +1381,188 @@ entry:
 define amdgpu_kernel void @global_agent_seq_cst_store(
 ; GFX6-LABEL: global_agent_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -1280,141 +1573,154 @@ entry:
 define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
 ; GFX6-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1425,25 +1731,32 @@ entry:
 define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ; GFX6-LABEL: global_agent_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -1451,13 +1764,13 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: global_agent_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1465,13 +1778,13 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: global_agent_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -1479,11 +1792,17 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -1491,59 +1810,63 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -1552,11 +1875,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_agent_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -1565,9 +1888,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: global_agent_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -1575,9 +1900,11 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_agent_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -1591,143 +1918,179 @@ entry:
 define amdgpu_kernel void @global_agent_release_atomicrmw(
 ; GFX6-LABEL: global_agent_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1738,25 +2101,34 @@ entry:
 define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ; GFX6-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -1764,13 +2136,15 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1778,13 +2152,15 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -1792,73 +2168,90 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -1867,11 +2260,13 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -1880,9 +2275,15 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -1890,9 +2291,15 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -1906,25 +2313,34 @@ entry:
 define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ; GFX6-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -1932,13 +2348,15 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1946,13 +2364,15 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -1960,73 +2380,90 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -2035,11 +2472,13 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -2048,9 +2487,15 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -2058,9 +2503,15 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -2074,69 +2525,83 @@ entry:
 define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
 ; GFX6-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -2145,112 +2610,116 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2262,70 +2731,91 @@ entry:
 define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -2333,114 +2823,134 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2452,70 +2962,91 @@ entry:
 define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -2523,114 +3054,134 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2642,141 +3193,219 @@ entry:
 define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -2788,14 +3417,22 @@ entry:
 define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -2803,14 +3440,26 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -2818,12 +3467,17 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -2831,12 +3485,17 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -2844,69 +3503,107 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -2914,11 +3611,17 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -2926,22 +3629,34 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2955,143 +3670,244 @@ entry:
 define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
 ; GFX6-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3103,14 +3919,23 @@ entry:
 define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3118,14 +3943,27 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3133,12 +3971,19 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3146,12 +3991,19 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -3159,71 +4011,116 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3231,11 +4128,19 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -3243,22 +4148,42 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3272,14 +4197,23 @@ entry:
 define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3287,14 +4221,27 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3302,12 +4249,19 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3315,12 +4269,19 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -3328,71 +4289,116 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3400,11 +4406,19 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -3412,22 +4426,42 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3441,14 +4475,22 @@ entry:
 define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3456,14 +4498,26 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3471,12 +4525,17 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3484,12 +4543,17 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -3497,69 +4561,107 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3567,11 +4669,17 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -3579,22 +4687,34 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3608,14 +4728,22 @@ entry:
 define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3623,14 +4751,26 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3638,12 +4778,17 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3651,12 +4796,17 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -3664,69 +4814,107 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3734,11 +4922,17 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -3746,22 +4940,34 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3775,14 +4981,23 @@ entry:
 define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ; GFX6-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3790,14 +5005,27 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3805,12 +5033,19 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3818,12 +5053,19 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -3831,71 +5073,116 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3903,11 +5190,19 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -3915,22 +5210,42 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3944,14 +5259,23 @@ entry:
 define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3959,14 +5283,27 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3974,12 +5311,19 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3987,12 +5331,19 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -4000,71 +5351,116 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4072,11 +5468,19 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -4084,22 +5488,42 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4113,14 +5537,23 @@ entry:
 define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -4128,14 +5561,27 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4143,12 +5589,19 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4156,12 +5609,19 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -4169,71 +5629,116 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4241,11 +5746,19 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -4253,22 +5766,42 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4282,14 +5815,23 @@ entry:
 define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -4297,14 +5839,27 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4312,12 +5867,19 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4325,12 +5887,19 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -4338,71 +5907,116 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4410,11 +6024,19 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -4422,22 +6044,42 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4451,14 +6093,23 @@ entry:
 define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -4466,14 +6117,27 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4481,12 +6145,19 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4494,12 +6165,19 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -4507,71 +6185,116 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4579,11 +6302,19 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -4591,22 +6322,42 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4620,14 +6371,23 @@ entry:
 define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -4635,14 +6395,27 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4650,12 +6423,19 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4663,12 +6443,19 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -4676,71 +6463,116 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4748,11 +6580,19 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -4760,22 +6600,42 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4789,14 +6649,23 @@ entry:
 define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -4804,14 +6673,27 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4819,12 +6701,19 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4832,12 +6721,19 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -4845,71 +6741,116 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4917,11 +6858,19 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -4929,22 +6878,42 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4958,14 +6927,23 @@ entry:
 define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -4973,14 +6951,27 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4988,12 +6979,19 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -5001,12 +6999,19 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -5014,71 +7019,116 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -5086,11 +7136,19 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -5098,22 +7156,42 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5127,169 +7205,248 @@ entry:
 define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5303,185 +7460,269 @@ entry:
 define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5495,171 +7736,273 @@ entry:
 define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5673,187 +8016,294 @@ entry:
 define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5867,187 +8317,294 @@ entry:
 define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6061,185 +8618,269 @@ entry:
 define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6253,185 +8894,269 @@ entry:
 define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6445,187 +9170,294 @@ entry:
 define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6639,187 +9471,294 @@ entry:
 define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6833,187 +9772,294 @@ entry:
 define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7027,187 +10073,294 @@ entry:
 define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7221,187 +10374,294 @@ entry:
 define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7415,187 +10675,294 @@ entry:
 define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7609,187 +10976,294 @@ entry:
 define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7803,187 +11277,294 @@ entry:
 define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7997,153 +11578,191 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_unordered_load(
 ; GFX6-LABEL: global_agent_one_as_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -8155,153 +11774,191 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_monotonic_load(
 ; GFX6-LABEL: global_agent_one_as_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -8313,169 +11970,211 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acquire_load(
 ; GFX6-LABEL: global_agent_one_as_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -8487,169 +12186,226 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
 ; GFX6-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -8661,145 +12417,163 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_unordered_store(
 ; GFX6-LABEL: global_agent_one_as_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8810,145 +12584,163 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_monotonic_store(
 ; GFX6-LABEL: global_agent_one_as_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8959,147 +12751,188 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_release_store(
 ; GFX6-LABEL: global_agent_one_as_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -9110,147 +12943,188 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
 ; GFX6-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -9261,141 +13135,154 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
 ; GFX6-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9406,25 +13293,32 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ; GFX6-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -9432,13 +13326,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -9446,13 +13340,13 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -9460,11 +13354,17 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -9472,59 +13372,63 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -9533,11 +13437,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -9546,9 +13450,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -9556,9 +13462,11 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -9572,143 +13480,179 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
 ; GFX6-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9719,25 +13663,34 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ; GFX6-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -9745,13 +13698,15 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -9759,13 +13714,15 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -9773,73 +13730,90 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -9848,11 +13822,13 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -9861,9 +13837,15 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -9871,9 +13853,15 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -9887,25 +13875,34 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ; GFX6-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -9913,13 +13910,15 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -9927,13 +13926,15 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -9941,73 +13942,90 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -10016,11 +14034,13 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -10029,9 +14049,15 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
@@ -10039,9 +14065,15 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
@@ -10055,69 +14087,83 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
 ; GFX6-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -10126,112 +14172,116 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -10243,70 +14293,91 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -10314,114 +14385,134 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -10433,70 +14524,91 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -10504,114 +14616,134 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -10623,141 +14755,219 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10769,14 +14979,22 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -10784,14 +15002,26 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -10799,12 +15029,17 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -10812,12 +15047,17 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -10825,69 +15065,107 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -10895,11 +15173,17 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -10907,22 +15191,34 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -10936,143 +15232,244 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11084,14 +15481,23 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -11099,14 +15505,27 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11114,12 +15533,19 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -11127,12 +15553,19 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -11140,71 +15573,116 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -11212,11 +15690,19 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -11224,22 +15710,42 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11253,14 +15759,23 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -11268,14 +15783,27 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11283,12 +15811,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -11296,12 +15831,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -11309,71 +15851,116 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -11381,11 +15968,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -11393,22 +15988,42 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11422,14 +16037,22 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -11437,14 +16060,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11452,12 +16087,17 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -11465,12 +16105,17 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -11478,69 +16123,107 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -11548,11 +16231,17 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -11560,22 +16249,34 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11589,14 +16290,22 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -11604,14 +16313,26 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11619,12 +16340,17 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -11632,12 +16358,17 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -11645,69 +16376,107 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -11715,11 +16484,17 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -11727,22 +16502,34 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11756,14 +16543,23 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -11771,14 +16567,27 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11786,12 +16595,19 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -11799,12 +16615,19 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -11812,71 +16635,116 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -11884,11 +16752,19 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -11896,22 +16772,42 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11925,14 +16821,23 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -11940,14 +16845,27 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11955,12 +16873,19 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -11968,12 +16893,19 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -11981,71 +16913,116 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -12053,11 +17030,19 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -12065,22 +17050,42 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12094,14 +17099,23 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -12109,14 +17123,27 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -12124,12 +17151,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -12137,12 +17171,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -12150,71 +17191,116 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -12222,11 +17308,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -12234,22 +17328,42 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12263,14 +17377,23 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -12278,14 +17401,27 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -12293,12 +17429,19 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -12306,12 +17449,19 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -12319,71 +17469,116 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -12391,11 +17586,19 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -12403,22 +17606,42 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12432,14 +17655,23 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -12447,14 +17679,27 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -12462,12 +17707,19 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -12475,12 +17727,19 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -12488,71 +17747,116 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -12560,11 +17864,19 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -12572,22 +17884,42 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12601,14 +17933,23 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -12616,14 +17957,27 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -12631,12 +17985,19 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -12644,12 +18005,19 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -12657,71 +18025,116 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -12729,11 +18142,19 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -12741,22 +18162,42 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12770,14 +18211,23 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -12785,14 +18235,27 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -12800,12 +18263,19 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -12813,12 +18283,19 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -12826,71 +18303,116 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -12898,11 +18420,19 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -12910,22 +18440,42 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12939,14 +18489,23 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -12954,14 +18513,27 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -12969,12 +18541,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -12982,12 +18561,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -12995,71 +18581,116 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -13067,11 +18698,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -13079,22 +18718,42 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13108,169 +18767,248 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13284,185 +19022,269 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13476,187 +19298,294 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13670,187 +19599,294 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13864,185 +19900,269 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14056,185 +20176,269 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14248,187 +20452,294 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14442,187 +20753,294 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14636,187 +21054,294 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14830,187 +21355,294 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -15024,187 +21656,294 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -15218,187 +21957,294 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -15412,187 +22258,294 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -15606,187 +22559,294 @@ entry:
 define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
index fa2fc3c42369..c889c67a5ca3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
@@ -1,18 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s
 
 define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GFX12-LABEL: global_last_use_load_0:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
   %val = load i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{}
@@ -23,14 +24,19 @@ entry:
 define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GFX12-LABEL: global_last_use_load_1:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT:    v_mov_b32_e32 v1, v0
+; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX12-NEXT:    v_and_b32_e64 v1, v1, s4
+; GFX12-NEXT:    s_mov_b32 s4, 2
+; GFX12-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v0, v0, s[0:1] th:TH_LOAD_LU
+; GFX12-NEXT:    global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -43,14 +49,16 @@ entry:
 define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GFX12-LABEL: global_last_use_and_volatile_load:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
   %val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{}
@@ -61,14 +69,19 @@ entry:
 define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GFX12-LABEL: global_last_use_and_nontemporal_load:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT:    v_mov_b32_e32 v1, v0
+; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX12-NEXT:    v_and_b32_e64 v1, v1, s4
+; GFX12-NEXT:    s_mov_b32 s4, 2
+; GFX12-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v0, v0, s[0:1] th:TH_LOAD_LU
+; GFX12-NEXT:    global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index c95ff2c691a3..9b2b3a4cfa9b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -1,171 +1,204 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @global_nontemporal_load_0(
 ; GFX6-LABEL: global_nontemporal_load_0:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_nontemporal_load_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_load_0:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_nontemporal_load_0:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_nontemporal_load_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_nontemporal_load_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_nontemporal_load_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_nontemporal_load_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -177,165 +210,262 @@ entry:
 define amdgpu_kernel void @global_nontemporal_load_1(
 ; GFX6-LABEL: global_nontemporal_load_1:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
-; GFX6-NEXT:    s_mov_b32 s2, 0
-; GFX6-NEXT:    s_mov_b32 s3, s7
-; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc slc
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    s_mov_b32 s12, 0
+; GFX6-NEXT:    ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13
+; GFX6-NEXT:    s_mov_b32 s13, s10
+; GFX6-NEXT:    ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b64 s[10:11], s[12:13]
+; GFX6-NEXT:    s_mov_b32 s12, 2
+; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s12, v0
+; GFX6-NEXT:    s_mov_b32 s12, 0
+; GFX6-NEXT:    ; implicit-def: $sgpr12
+; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 glc slc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s6, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v1, s6, v0
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    ; implicit-def: $sgpr6
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    s_mov_b32 s6, s8
+; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_mov_b32 s8, s9
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
+; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_addc_u32_e64 v2, s[6:7], v1, v2, s[6:7]
+; GFX7-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc slc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s8, 2
+; GFX10-WGP-NEXT:    v_lshlrev_b32_e64 v1, s8, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v0, v0, s[0:1] slc
+; GFX10-WGP-NEXT:    global_load_dword v1, v1, s[6:7] slc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_mov_b32 s8, 2
+; GFX10-CU-NEXT:    v_lshlrev_b32_e64 v1, s8, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v0, v0, s[0:1] slc
+; GFX10-CU-NEXT:    global_load_dword v1, v1, s[6:7] slc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_nontemporal_load_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s7
-; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc slc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s8, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0
+; SKIP-CACHE-INV-NEXT:    ; implicit-def: $sgpr8
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc slc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s8, 0x3ff
+; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s8, 2
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e64 v1, s8, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v0, v0, s[0:1] glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v1, s[6:7] glc slc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s8, 0x3ff
+; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s8, 2
+; GFX90A-TGSPLIT-NEXT:    v_lshlrev_b32_e64 v1, s8, v1
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v0, v0, s[0:1] glc slc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v1, s[6:7] glc slc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_load_1:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s4, 2
+; GFX940-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v0, v0, s[0:1] nt
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v1, s[2:3] nt
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_nontemporal_load_1:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s4
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s4, 2
+; GFX940-TGSPLIT-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v0, v0, s[0:1] nt
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v1, s[2:3] nt
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_nontemporal_load_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v1, v1, s4
+; GFX11-WGP-NEXT:    s_mov_b32 s4, 2
+; GFX11-WGP-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v0, v0, s[0:1] slc dlc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v1, s[2:3] slc dlc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_nontemporal_load_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v1, v1, s4
+; GFX11-CU-NEXT:    s_mov_b32 s4, 2
+; GFX11-CU-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v0, v0, s[0:1] slc dlc
+; GFX11-CU-NEXT:    global_load_b32 v1, v1, s[2:3] slc dlc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_nontemporal_load_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v1, v1, s4
+; GFX12-WGP-NEXT:    s_mov_b32 s4, 2
+; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v0, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_nontemporal_load_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v1, v1, s4
+; GFX12-CU-NEXT:    s_mov_b32 s4, 2
+; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v0, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -349,156 +479,189 @@ entry:
 define amdgpu_kernel void @global_nontemporal_store_0(
 ; GFX6-LABEL: global_nontemporal_store_0:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0 glc slc
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3] glc slc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5] glc slc
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3] glc slc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5] glc slc
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_nontemporal_store_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0 glc slc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0 glc slc
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5] glc slc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] glc slc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5] glc slc
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_store_0:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 nt sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 nt sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_nontemporal_store_0:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 nt sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 nt sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_nontemporal_store_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3] glc slc dlc
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] glc slc dlc
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_nontemporal_store_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3] glc slc dlc
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1] glc slc dlc
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_nontemporal_store_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3] th:TH_STORE_NT
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_nontemporal_store_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3] th:TH_STORE_NT
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -510,160 +673,241 @@ entry:
 define amdgpu_kernel void @global_nontemporal_store_1(
 ; GFX6-LABEL: global_nontemporal_store_1:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_mov_b32 s6, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, 0
+; GFX6-NEXT:    ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s11, s6
+; GFX6-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; GFX6-NEXT:    s_mov_b32 s9, 2
+; GFX6-NEXT:    v_lshlrev_b32_e64 v1, s9, v0
+; GFX6-NEXT:    s_mov_b32 s9, 0
+; GFX6-NEXT:    ; implicit-def: $sgpr9
+; GFX6-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
-; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 glc slc
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 glc slc
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b32 s5, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v1, s5, v0
+; GFX7-NEXT:    s_mov_b32 s5, 0
+; GFX7-NEXT:    ; implicit-def: $sgpr5
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    s_mov_b32 s6, s8
+; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
+; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_addc_u32_e64 v2, s[6:7], v1, v2, s[6:7]
+; GFX7-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2 glc slc
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_mov_b32 s7, 2
+; GFX10-WGP-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3] glc slc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5] glc slc
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_mov_b32 s7, 2
+; GFX10-CU-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3] glc slc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5] glc slc
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_nontemporal_store_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v1, s5, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0
+; SKIP-CACHE-INV-NEXT:    ; implicit-def: $sgpr5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, v0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 glc slc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 glc slc
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s7, 0x3ff
+; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s7
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s7, 2
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5] glc slc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s7, 0x3ff
+; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s7
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s7, 2
+; GFX90A-TGSPLIT-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] glc slc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5] glc slc
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_store_1:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s3
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 2
+; GFX940-NOTTGSPLIT-NEXT:    v_lshlrev_b32_e64 v0, s3, v0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 nt sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 nt sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_nontemporal_store_1:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s3
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s3, 2
+; GFX940-TGSPLIT-NEXT:    v_lshlrev_b32_e64 v0, s3, v0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 nt sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 nt sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_nontemporal_store_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s3
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 2
+; GFX11-WGP-NEXT:    v_lshlrev_b32_e64 v0, s3, v0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3] glc slc dlc
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] glc slc dlc
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_nontemporal_store_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s3
+; GFX11-CU-NEXT:    s_mov_b32 s3, 2
+; GFX11-CU-NEXT:    v_lshlrev_b32_e64 v0, s3, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3] glc slc dlc
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1] glc slc dlc
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_nontemporal_store_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s3
+; GFX12-WGP-NEXT:    s_mov_b32 s3, 2
+; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v0, s3, v0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3] th:TH_STORE_NT
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_nontemporal_store_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v0, v0, s3
+; GFX12-CU-NEXT:    s_mov_b32 s3, 2
+; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v0, s3, v0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3] th:TH_STORE_NT
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -677,153 +921,195 @@ entry:
 define amdgpu_kernel void @global_nontemporal_volatile_load(
 ; GFX6-LABEL: global_nontemporal_volatile_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_nontemporal_volatile_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_nontemporal_volatile_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_nontemporal_volatile_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_nontemporal_volatile_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_nontemporal_volatile_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_nontemporal_volatile_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index 98cd97d07d6f..afc46fbc23a6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -1,168 +1,206 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @global_singlethread_unordered_load(
 ; GFX6-LABEL: global_singlethread_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -174,153 +212,191 @@ entry:
 define amdgpu_kernel void @global_singlethread_monotonic_load(
 ; GFX6-LABEL: global_singlethread_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -332,153 +408,191 @@ entry:
 define amdgpu_kernel void @global_singlethread_acquire_load(
 ; GFX6-LABEL: global_singlethread_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -490,153 +604,191 @@ entry:
 define amdgpu_kernel void @global_singlethread_seq_cst_load(
 ; GFX6-LABEL: global_singlethread_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -648,145 +800,163 @@ entry:
 define amdgpu_kernel void @global_singlethread_unordered_store(
 ; GFX6-LABEL: global_singlethread_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -797,145 +967,163 @@ entry:
 define amdgpu_kernel void @global_singlethread_monotonic_store(
 ; GFX6-LABEL: global_singlethread_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -946,145 +1134,163 @@ entry:
 define amdgpu_kernel void @global_singlethread_release_store(
 ; GFX6-LABEL: global_singlethread_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -1095,145 +1301,163 @@ entry:
 define amdgpu_kernel void @global_singlethread_seq_cst_store(
 ; GFX6-LABEL: global_singlethread_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -1244,141 +1468,154 @@ entry:
 define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
 ; GFX6-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1389,141 +1626,154 @@ entry:
 define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
 ; GFX6-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1534,141 +1784,154 @@ entry:
 define amdgpu_kernel void @global_singlethread_release_atomicrmw(
 ; GFX6-LABEL: global_singlethread_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1679,141 +1942,154 @@ entry:
 define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
 ; GFX6-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1824,141 +2100,154 @@ entry:
 define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
 ; GFX6-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1969,63 +2258,77 @@ entry:
 define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
 ; GFX6-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -2034,102 +2337,102 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2141,63 +2444,77 @@ entry:
 define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -2206,102 +2523,102 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2313,63 +2630,77 @@ entry:
 define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -2378,102 +2709,102 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2485,141 +2816,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -2631,141 +3040,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -2777,141 +3264,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
 ; GFX6-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -2923,141 +3488,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3069,141 +3712,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3215,141 +3936,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3361,141 +4160,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3507,141 +4384,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
 ; GFX6-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3653,141 +4608,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3799,141 +4832,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3945,141 +5056,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4091,141 +5280,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4237,141 +5504,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4383,141 +5728,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4529,141 +5952,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4675,169 +6176,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4851,169 +6431,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5027,169 +6686,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5203,169 +6941,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5379,169 +7196,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5555,169 +7451,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5731,169 +7706,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5907,169 +7961,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6083,169 +8216,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6259,169 +8471,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6435,169 +8726,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6611,169 +8981,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6787,169 +9236,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6963,169 +9491,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7139,169 +9746,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7315,153 +10001,191 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
 ; GFX6-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7473,153 +10197,191 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
 ; GFX6-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7631,153 +10393,191 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
 ; GFX6-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7789,153 +10589,191 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
 ; GFX6-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7947,145 +10785,163 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
 ; GFX6-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8096,145 +10952,163 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
 ; GFX6-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8245,145 +11119,163 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_release_store(
 ; GFX6-LABEL: global_singlethread_one_as_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8394,145 +11286,163 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
 ; GFX6-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8543,141 +11453,154 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
 ; GFX6-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -8688,141 +11611,154 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
 ; GFX6-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -8833,141 +11769,154 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
 ; GFX6-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -8978,141 +11927,154 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
 ; GFX6-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9123,141 +12085,154 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
 ; GFX6-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9268,63 +12243,77 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
 ; GFX6-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -9333,102 +12322,102 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9440,63 +12429,77 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -9505,102 +12508,102 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9612,63 +12615,77 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -9677,102 +12694,102 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9784,141 +12801,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -9930,141 +13025,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10076,141 +13249,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10222,141 +13473,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10368,141 +13697,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10514,141 +13921,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10660,141 +14145,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10806,141 +14369,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10952,141 +14593,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11098,141 +14817,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11244,141 +15041,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11390,141 +15265,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11536,141 +15489,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11682,141 +15713,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11828,141 +15937,219 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11974,169 +16161,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12150,169 +16416,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12326,169 +16671,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12502,169 +16926,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12678,169 +17181,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12854,169 +17436,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13030,169 +17691,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13206,169 +17946,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13382,169 +18201,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13558,169 +18456,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13734,169 +18711,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13910,169 +18966,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14086,169 +19221,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14262,169 +19476,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14438,169 +19731,248 @@ entry:
 define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index a68a6a113830..1dbf0b2e9778 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -1,168 +1,206 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @global_system_unordered_load(
 ; GFX6-LABEL: global_system_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -174,153 +212,191 @@ entry:
 define amdgpu_kernel void @global_system_monotonic_load(
 ; GFX6-LABEL: global_system_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -332,171 +408,213 @@ entry:
 define amdgpu_kernel void @global_system_acquire_load(
 ; GFX6-LABEL: global_system_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -508,171 +626,228 @@ entry:
 define amdgpu_kernel void @global_system_seq_cst_load(
 ; GFX6-LABEL: global_system_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -684,145 +859,163 @@ entry:
 define amdgpu_kernel void @global_system_unordered_store(
 ; GFX6-LABEL: global_system_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -833,145 +1026,163 @@ entry:
 define amdgpu_kernel void @global_system_monotonic_store(
 ; GFX6-LABEL: global_system_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -982,149 +1193,190 @@ entry:
 define amdgpu_kernel void @global_system_release_store(
 ; GFX6-LABEL: global_system_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -1135,149 +1387,190 @@ entry:
 define amdgpu_kernel void @global_system_seq_cst_store(
 ; GFX6-LABEL: global_system_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -1288,141 +1581,154 @@ entry:
 define amdgpu_kernel void @global_system_monotonic_atomicrmw(
 ; GFX6-LABEL: global_system_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1433,25 +1739,32 @@ entry:
 define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ; GFX6-LABEL: global_system_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -1459,13 +1772,13 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: global_system_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1473,13 +1786,13 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: global_system_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -1487,11 +1800,17 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -1499,12 +1818,13 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -1512,12 +1832,13 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -1525,35 +1846,37 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -1562,11 +1885,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_system_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -1575,9 +1898,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: global_system_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -1585,9 +1910,11 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_system_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -1601,145 +1928,181 @@ entry:
 define amdgpu_kernel void @global_system_release_atomicrmw(
 ; GFX6-LABEL: global_system_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1750,25 +2113,34 @@ entry:
 define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ; GFX6-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -1776,13 +2148,15 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1790,13 +2164,15 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -1804,25 +2180,34 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -1830,13 +2215,15 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -1844,37 +2231,43 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -1883,11 +2276,13 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -1896,9 +2291,15 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -1906,9 +2307,15 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -1922,25 +2329,34 @@ entry:
 define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ; GFX6-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -1948,13 +2364,15 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1962,13 +2380,15 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -1976,25 +2396,34 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -2002,13 +2431,15 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -2016,37 +2447,43 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -2055,11 +2492,13 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -2068,9 +2507,15 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -2078,9 +2523,15 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -2094,69 +2545,83 @@ entry:
 define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
 ; GFX6-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -2165,114 +2630,118 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2284,70 +2753,91 @@ entry:
 define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -2355,118 +2845,138 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2478,70 +2988,91 @@ entry:
 define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -2549,118 +3080,138 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2672,141 +3223,219 @@ entry:
 define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -2818,14 +3447,22 @@ entry:
 define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -2833,14 +3470,26 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -2848,12 +3497,17 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -2861,12 +3515,17 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -2874,25 +3533,39 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -2900,11 +3573,17 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -2912,33 +3591,51 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -2946,11 +3643,17 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -2958,22 +3661,34 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2987,145 +3702,246 @@ entry:
 define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
 ; GFX6-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3137,14 +3953,23 @@ entry:
 define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3152,14 +3977,27 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3167,12 +4005,19 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3180,12 +4025,19 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -3193,26 +4045,42 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3220,12 +4088,19 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3233,35 +4108,57 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3269,11 +4166,19 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -3281,22 +4186,42 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3310,14 +4235,23 @@ entry:
 define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3325,14 +4259,27 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3340,12 +4287,19 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3353,12 +4307,19 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -3366,26 +4327,42 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3393,12 +4370,19 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3406,35 +4390,57 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3442,11 +4448,19 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -3454,22 +4468,42 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3483,14 +4517,22 @@ entry:
 define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3498,14 +4540,26 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3513,12 +4567,17 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3526,12 +4585,17 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -3539,25 +4603,39 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3565,11 +4643,17 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3577,33 +4661,51 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3611,11 +4713,17 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -3623,22 +4731,34 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3652,14 +4772,22 @@ entry:
 define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3667,14 +4795,26 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3682,12 +4822,17 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3695,12 +4840,17 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -3708,25 +4858,39 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3734,11 +4898,17 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3746,33 +4916,51 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3780,11 +4968,17 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -3792,22 +4986,34 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3821,14 +5027,23 @@ entry:
 define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ; GFX6-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3836,14 +5051,27 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -3851,12 +5079,19 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3864,12 +5099,19 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -3877,26 +5119,42 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3904,12 +5162,19 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -3917,35 +5182,57 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3953,11 +5240,19 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -3965,22 +5260,42 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3994,14 +5309,23 @@ entry:
 define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -4009,14 +5333,27 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4024,12 +5361,19 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4037,12 +5381,19 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -4050,26 +5401,42 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4077,12 +5444,19 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4090,35 +5464,57 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4126,11 +5522,19 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -4138,22 +5542,42 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4167,14 +5591,23 @@ entry:
 define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -4182,14 +5615,27 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4197,12 +5643,19 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4210,12 +5663,19 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -4223,26 +5683,42 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4250,12 +5726,19 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4263,35 +5746,57 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4299,11 +5804,19 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -4311,22 +5824,42 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4340,14 +5873,23 @@ entry:
 define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -4355,14 +5897,27 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -4370,12 +5925,19 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4383,12 +5945,19 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -4396,26 +5965,42 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4423,12 +6008,19 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -4436,35 +6028,57 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4472,11 +6086,19 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -4484,22 +6106,42 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4513,169 +6155,248 @@ entry:
 define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4689,187 +6410,271 @@ entry:
 define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4883,191 +6688,298 @@ entry:
 define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5081,191 +6993,298 @@ entry:
 define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5279,187 +7298,271 @@ entry:
 define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5473,187 +7576,271 @@ entry:
 define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5667,191 +7854,298 @@ entry:
 define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5865,191 +8159,298 @@ entry:
 define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6063,191 +8464,298 @@ entry:
 define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6261,191 +8769,298 @@ entry:
 define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6459,191 +9074,298 @@ entry:
 define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6657,191 +9379,298 @@ entry:
 define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6855,191 +9684,298 @@ entry:
 define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7053,191 +9989,298 @@ entry:
 define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7251,153 +10294,191 @@ entry:
 define amdgpu_kernel void @global_system_one_as_unordered_load(
 ; GFX6-LABEL: global_system_one_as_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7409,153 +10490,191 @@ entry:
 define amdgpu_kernel void @global_system_one_as_monotonic_load(
 ; GFX6-LABEL: global_system_one_as_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7567,171 +10686,213 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acquire_load(
 ; GFX6-LABEL: global_system_one_as_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7743,171 +10904,228 @@ entry:
 define amdgpu_kernel void @global_system_one_as_seq_cst_load(
 ; GFX6-LABEL: global_system_one_as_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7919,145 +11137,163 @@ entry:
 define amdgpu_kernel void @global_system_one_as_unordered_store(
 ; GFX6-LABEL: global_system_one_as_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8068,145 +11304,163 @@ entry:
 define amdgpu_kernel void @global_system_one_as_monotonic_store(
 ; GFX6-LABEL: global_system_one_as_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8217,149 +11471,190 @@ entry:
 define amdgpu_kernel void @global_system_one_as_release_store(
 ; GFX6-LABEL: global_system_one_as_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8370,149 +11665,190 @@ entry:
 define amdgpu_kernel void @global_system_one_as_seq_cst_store(
 ; GFX6-LABEL: global_system_one_as_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8523,141 +11859,154 @@ entry:
 define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
 ; GFX6-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -8668,25 +12017,32 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ; GFX6-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -8694,13 +12050,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -8708,13 +12064,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -8722,11 +12078,17 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -8734,12 +12096,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -8747,12 +12110,13 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -8760,35 +12124,37 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -8797,11 +12163,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -8810,9 +12176,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -8820,9 +12188,11 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -8836,145 +12206,181 @@ entry:
 define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
 ; GFX6-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -8985,25 +12391,34 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ; GFX6-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -9011,13 +12426,15 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -9025,13 +12442,15 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -9039,25 +12458,34 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9065,13 +12493,15 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9079,37 +12509,43 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -9118,11 +12554,13 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -9131,9 +12569,15 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -9141,9 +12585,15 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -9157,25 +12607,34 @@ entry:
 define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ; GFX6-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -9183,13 +12642,15 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -9197,13 +12658,15 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -9211,25 +12674,34 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9237,13 +12709,15 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -9251,37 +12725,43 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1] sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
@@ -9290,11 +12770,13 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
@@ -9303,9 +12785,15 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
@@ -9313,9 +12801,15 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
@@ -9329,69 +12823,83 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
 ; GFX6-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -9400,114 +12908,118 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9519,70 +13031,91 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -9590,118 +13123,138 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9713,70 +13266,91 @@ entry:
 define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -9784,118 +13358,138 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9907,141 +13501,219 @@ entry:
 define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10053,14 +13725,22 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -10068,14 +13748,26 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -10083,12 +13775,17 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -10096,12 +13793,17 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -10109,25 +13811,39 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10135,11 +13851,17 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10147,33 +13869,51 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -10181,11 +13921,17 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -10193,22 +13939,34 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -10222,145 +13980,246 @@ entry:
 define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10372,14 +14231,23 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -10387,14 +14255,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -10402,12 +14283,19 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -10415,12 +14303,19 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -10428,26 +14323,42 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10455,12 +14366,19 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10468,35 +14386,57 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -10504,11 +14444,19 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -10516,22 +14464,42 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -10545,14 +14513,23 @@ entry:
 define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -10560,14 +14537,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -10575,12 +14565,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -10588,12 +14585,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -10601,26 +14605,42 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10628,12 +14648,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10641,35 +14668,57 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -10677,11 +14726,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -10689,22 +14746,42 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -10718,14 +14795,22 @@ entry:
 define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -10733,14 +14818,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -10748,12 +14845,17 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -10761,12 +14863,17 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -10774,25 +14881,39 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10800,11 +14921,17 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10812,33 +14939,51 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -10846,11 +14991,17 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -10858,22 +15009,34 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -10887,14 +15050,22 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -10902,14 +15073,26 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -10917,12 +15100,17 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -10930,12 +15118,17 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -10943,25 +15136,39 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10969,11 +15176,17 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -10981,33 +15194,51 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -11015,11 +15246,17 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -11027,22 +15264,34 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11056,14 +15305,23 @@ entry:
 define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -11071,14 +15329,27 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11086,12 +15357,19 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -11099,12 +15377,19 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -11112,26 +15397,42 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11139,12 +15440,19 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11152,35 +15460,57 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -11188,11 +15518,19 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -11200,22 +15538,42 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11229,14 +15587,23 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -11244,14 +15611,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11259,12 +15639,19 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -11272,12 +15659,19 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -11285,26 +15679,42 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11312,12 +15722,19 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11325,35 +15742,57 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -11361,11 +15800,19 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -11373,22 +15820,42 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11402,14 +15869,23 @@ entry:
 define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -11417,14 +15893,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11432,12 +15921,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -11445,12 +15941,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -11458,26 +15961,42 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11485,12 +16004,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11498,35 +16024,57 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -11534,11 +16082,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -11546,22 +16102,42 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11575,14 +16151,23 @@ entry:
 define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -11590,14 +16175,27 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11605,12 +16203,19 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -11618,12 +16223,19 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -11631,26 +16243,42 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11658,12 +16286,19 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11671,35 +16306,57 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -11707,11 +16364,19 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -11719,22 +16384,42 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11748,14 +16433,23 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -11763,14 +16457,27 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11778,12 +16485,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -11791,12 +16505,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -11804,26 +16525,42 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11831,12 +16568,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -11844,35 +16588,57 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -11880,11 +16646,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -11892,22 +16666,42 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11921,14 +16715,23 @@ entry:
 define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -11936,14 +16739,27 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -11951,12 +16767,19 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -11964,12 +16787,19 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -11977,26 +16807,42 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -12004,12 +16850,19 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -12017,35 +16870,57 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -12053,11 +16928,19 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -12065,22 +16948,42 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12094,14 +16997,23 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -12109,14 +17021,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -12124,12 +17049,19 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -12137,12 +17069,19 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -12150,26 +17089,42 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -12177,12 +17132,19 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -12190,35 +17152,57 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -12226,11 +17210,19 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -12238,22 +17230,42 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12267,14 +17279,23 @@ entry:
 define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -12282,14 +17303,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
@@ -12297,12 +17331,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -12310,12 +17351,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
@@ -12323,26 +17371,42 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -12350,12 +17414,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
@@ -12363,35 +17434,57 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -12399,11 +17492,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
@@ -12411,22 +17512,42 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12440,169 +17561,248 @@ entry:
 define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12616,187 +17816,271 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12810,173 +18094,275 @@ entry:
 define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12990,191 +18376,298 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13188,191 +18681,298 @@ entry:
 define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13386,187 +18986,271 @@ entry:
 define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13580,187 +19264,271 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13774,191 +19542,298 @@ entry:
 define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13972,191 +19847,298 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14170,191 +20152,298 @@ entry:
 define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14368,191 +20457,298 @@ entry:
 define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14566,191 +20762,298 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14764,191 +21067,298 @@ entry:
 define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14962,191 +21372,298 @@ entry:
 define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -15160,191 +21677,298 @@ entry:
 define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1_vol
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl1_inv
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CU-NEXT:    buffer_gl1_inv
 ; GFX10-CU-NEXT:    buffer_gl0_inv
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbl2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_invl2
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0 sc1
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl1_inv
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    buffer_gl1_inv
 ; GFX11-CU-NEXT:    buffer_gl0_inv
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 263324ed5c03..11206cb695aa 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -1,124 +1,154 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd- -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @global_volatile_load_0(
 ; GFX6-LABEL: global_volatile_load_0:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    s_mov_b32 s2, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s8, 0xf000
+; GFX6-NEXT:    s_mov_b32 s9, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s2
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    s_mov_b32 s7, s8
+; GFX6-NEXT:    s_mov_b32 s10, s1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX6-NEXT:    s_mov_b32 s1, s10
+; GFX6-NEXT:    s_mov_b32 s2, s9
+; GFX6-NEXT:    s_mov_b32 s3, s8
 ; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
-; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_volatile_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_volatile_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_volatile_load_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_volatile_load_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_volatile_load_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_volatile_load_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_volatile_load_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -130,121 +160,194 @@ entry:
 define amdgpu_kernel void @global_volatile_load_1(
 ; GFX6-LABEL: global_volatile_load_1:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
-; GFX6-NEXT:    s_mov_b32 s2, 0
-; GFX6-NEXT:    s_mov_b32 s3, s7
-; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
+; GFX6-NEXT:    s_mov_b32 s8, s1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX6-NEXT:    s_mov_b32 s6, 0xf000
+; GFX6-NEXT:    s_mov_b32 s7, -1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX6-NEXT:    s_mov_b32 s1, s8
+; GFX6-NEXT:    s_mov_b32 s2, s7
+; GFX6-NEXT:    s_mov_b32 s3, s6
+; GFX6-NEXT:    s_mov_b32 s8, 0
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_mov_b32 s8, 2
+; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s8, v0
+; GFX6-NEXT:    s_mov_b32 s8, 0
+; GFX6-NEXT:    ; implicit-def: $sgpr8
+; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s6, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v1, s6, v0
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    ; implicit-def: $sgpr6
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    s_mov_b32 s6, s8
+; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_mov_b32 s8, s9
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
+; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_addc_u32_e64 v2, s[6:7], v1, v2, s[6:7]
+; GFX7-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_volatile_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s8, 2
+; GFX10-WGP-NEXT:    v_lshlrev_b32_e64 v1, s8, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v0, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT:    global_load_dword v1, v1, s[6:7] glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_volatile_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_mov_b32 s8, 2
+; GFX10-CU-NEXT:    v_lshlrev_b32_e64 v1, s8, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v0, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT:    global_load_dword v1, v1, s[6:7] glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_volatile_load_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s7
-; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s8, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0
+; SKIP-CACHE-INV-NEXT:    ; implicit-def: $sgpr8
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_volatile_load_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v1, v1, s4
+; GFX11-WGP-NEXT:    s_mov_b32 s4, 2
+; GFX11-WGP-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v1, s[2:3] glc dlc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_volatile_load_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v1, v1, s4
+; GFX11-CU-NEXT:    s_mov_b32 s4, 2
+; GFX11-CU-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT:    global_load_b32 v1, v1, s[2:3] glc dlc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_volatile_load_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v1, v1, s4
+; GFX12-WGP-NEXT:    s_mov_b32 s4, 2
+; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    global_load_b32 v1, v1, s[2:3] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_volatile_load_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_mov_b32 s4, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v1, v1, s4
+; GFX12-CU-NEXT:    s_mov_b32 s4, 2
+; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s4, v1
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-CU-NEXT:    global_load_b32 v1, v1, s[2:3] scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -258,121 +361,152 @@ entry:
 define amdgpu_kernel void @global_volatile_store_0(
 ; GFX6-LABEL: global_volatile_store_0:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s8, s1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX6-NEXT:    s_mov_b32 s6, 0xf000
+; GFX6-NEXT:    s_mov_b32 s7, -1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX6-NEXT:    s_mov_b32 s1, s8
+; GFX6-NEXT:    s_mov_b32 s2, s7
+; GFX6-NEXT:    s_mov_b32 s3, s6
+; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_volatile_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_volatile_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_volatile_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_volatile_store_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_volatile_store_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3] dlc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_volatile_store_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3] dlc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_volatile_store_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_volatile_store_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -384,125 +518,192 @@ entry:
 define amdgpu_kernel void @global_volatile_store_1(
 ; GFX6-LABEL: global_volatile_store_1:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX6-NEXT:    s_mov_b32 s2, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s7, s2
+; GFX6-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GFX6-NEXT:    s_mov_b32 s5, 2
+; GFX6-NEXT:    v_lshlrev_b32_e64 v1, s5, v0
+; GFX6-NEXT:    s_mov_b32 s5, 0
+; GFX6-NEXT:    ; implicit-def: $sgpr5
+; GFX6-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
-; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_volatile_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b32 s5, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v1, s5, v0
+; GFX7-NEXT:    s_mov_b32 s5, 0
+; GFX7-NEXT:    ; implicit-def: $sgpr5
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    s_mov_b32 s6, s8
+; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
+; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_addc_u32_e64 v2, s[6:7], v1, v2, s[6:7]
+; GFX7-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_volatile_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_mov_b32 s7, 2
+; GFX10-WGP-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_volatile_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_mov_b32 s7, 2
+; GFX10-CU-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_volatile_store_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v1, s5, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0
+; SKIP-CACHE-INV-NEXT:    ; implicit-def: $sgpr5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, 0
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, v0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_volatile_store_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s3
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 2
+; GFX11-WGP-NEXT:    v_lshlrev_b32_e64 v0, s3, v0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3] dlc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_volatile_store_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s3
+; GFX11-CU-NEXT:    s_mov_b32 s3, 2
+; GFX11-CU-NEXT:    v_lshlrev_b32_e64 v0, s3, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3] dlc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_volatile_store_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s3
+; GFX12-WGP-NEXT:    s_mov_b32 s3, 2
+; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v0, s3, v0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_volatile_store_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v0, v0, s3
+; GFX12-CU-NEXT:    s_mov_b32 s3, 2
+; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v0, s3, v0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -516,116 +717,144 @@ entry:
 define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
 ; GFX6-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    s_mov_b32 s2, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s8, 0xf000
+; GFX6-NEXT:    s_mov_b32 s9, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s2
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    s_mov_b32 s7, s8
+; GFX6-NEXT:    s_mov_b32 s10, s1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX6-NEXT:    s_mov_b32 s1, s10
+; GFX6-NEXT:    s_mov_b32 s2, s9
+; GFX6-NEXT:    s_mov_b32 s3, s8
 ; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -637,105 +866,129 @@ entry:
 define amdgpu_kernel void @global_volatile_workgroup_release_store(
 ; GFX6-LABEL: global_volatile_workgroup_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s4, s[2:3], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s7, s1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX6-NEXT:    s_mov_b32 s5, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX6-NEXT:    s_mov_b32 s1, s7
+; GFX6-NEXT:    s_mov_b32 s2, s6
+; GFX6-NEXT:    s_mov_b32 s3, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_volatile_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_volatile_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_volatile_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_volatile_workgroup_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_volatile_workgroup_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_volatile_workgroup_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_volatile_workgroup_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
    i32 %in, ptr addrspace(1) %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index 968353063176..f805e2cf3700 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -1,168 +1,206 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @global_wavefront_unordered_load(
 ; GFX6-LABEL: global_wavefront_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -174,153 +212,191 @@ entry:
 define amdgpu_kernel void @global_wavefront_monotonic_load(
 ; GFX6-LABEL: global_wavefront_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -332,153 +408,191 @@ entry:
 define amdgpu_kernel void @global_wavefront_acquire_load(
 ; GFX6-LABEL: global_wavefront_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -490,153 +604,191 @@ entry:
 define amdgpu_kernel void @global_wavefront_seq_cst_load(
 ; GFX6-LABEL: global_wavefront_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -648,145 +800,163 @@ entry:
 define amdgpu_kernel void @global_wavefront_unordered_store(
 ; GFX6-LABEL: global_wavefront_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -797,145 +967,163 @@ entry:
 define amdgpu_kernel void @global_wavefront_monotonic_store(
 ; GFX6-LABEL: global_wavefront_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -946,145 +1134,163 @@ entry:
 define amdgpu_kernel void @global_wavefront_release_store(
 ; GFX6-LABEL: global_wavefront_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -1095,145 +1301,163 @@ entry:
 define amdgpu_kernel void @global_wavefront_seq_cst_store(
 ; GFX6-LABEL: global_wavefront_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -1244,141 +1468,154 @@ entry:
 define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
 ; GFX6-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1389,141 +1626,154 @@ entry:
 define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
 ; GFX6-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1534,141 +1784,154 @@ entry:
 define amdgpu_kernel void @global_wavefront_release_atomicrmw(
 ; GFX6-LABEL: global_wavefront_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1679,141 +1942,154 @@ entry:
 define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
 ; GFX6-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1824,141 +2100,154 @@ entry:
 define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
 ; GFX6-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1969,63 +2258,77 @@ entry:
 define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
 ; GFX6-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -2034,102 +2337,102 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2141,63 +2444,77 @@ entry:
 define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -2206,102 +2523,102 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2313,63 +2630,77 @@ entry:
 define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -2378,102 +2709,102 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2485,141 +2816,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -2631,141 +3040,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -2777,141 +3264,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
 ; GFX6-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -2923,141 +3488,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3069,141 +3712,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3215,141 +3936,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3361,141 +4160,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3507,141 +4384,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
 ; GFX6-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3653,141 +4608,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3799,141 +4832,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3945,141 +5056,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4091,141 +5280,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4237,141 +5504,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4383,141 +5728,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4529,141 +5952,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4675,169 +6176,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
    ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4851,169 +6431,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5027,169 +6686,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
    ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5203,169 +6941,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5379,169 +7196,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5555,169 +7451,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5731,169 +7706,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5907,169 +7961,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6083,169 +8216,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6259,169 +8471,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6435,169 +8726,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6611,169 +8981,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6787,169 +9236,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6963,169 +9491,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7139,169 +9746,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7315,153 +10001,191 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
 ; GFX6-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7473,153 +10197,191 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
 ; GFX6-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7631,153 +10393,191 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
 ; GFX6-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7789,153 +10589,191 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
 ; GFX6-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7947,145 +10785,163 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
 ; GFX6-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8096,145 +10952,163 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
 ; GFX6-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8245,145 +11119,163 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_release_store(
 ; GFX6-LABEL: global_wavefront_one_as_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8394,145 +11286,163 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
 ; GFX6-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8543,141 +11453,154 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
 ; GFX6-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -8688,141 +11611,154 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
 ; GFX6-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -8833,141 +11769,154 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
 ; GFX6-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -8978,141 +11927,154 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
 ; GFX6-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9123,141 +12085,154 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
 ; GFX6-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9268,63 +12243,77 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
 ; GFX6-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -9333,102 +12322,102 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9440,63 +12429,77 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -9505,102 +12508,102 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9612,63 +12615,77 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -9677,102 +12694,102 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9784,141 +12801,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -9930,141 +13025,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10076,141 +13249,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10222,141 +13473,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10368,141 +13697,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10514,141 +13921,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10660,141 +14145,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10806,141 +14369,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10952,141 +14593,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11098,141 +14817,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11244,141 +15041,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11390,141 +15265,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11536,141 +15489,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11682,141 +15713,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11828,141 +15937,219 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11974,169 +16161,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12150,169 +16416,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12326,169 +16671,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12502,169 +16926,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12678,169 +17181,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12854,169 +17436,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13030,169 +17691,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13206,169 +17946,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13382,169 +18201,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13558,169 +18456,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13734,169 +18711,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13910,169 +18966,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14086,169 +19221,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14262,169 +19476,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14438,169 +19731,248 @@ entry:
 define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 550f06b970bc..643684b7550a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -1,168 +1,206 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @global_workgroup_unordered_load(
 ; GFX6-LABEL: global_workgroup_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -174,153 +212,191 @@ entry:
 define amdgpu_kernel void @global_workgroup_monotonic_load(
 ; GFX6-LABEL: global_workgroup_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -332,158 +408,198 @@ entry:
 define amdgpu_kernel void @global_workgroup_acquire_load(
 ; GFX6-LABEL: global_workgroup_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -495,158 +611,208 @@ entry:
 define amdgpu_kernel void @global_workgroup_seq_cst_load(
 ; GFX6-LABEL: global_workgroup_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -658,145 +824,163 @@ entry:
 define amdgpu_kernel void @global_workgroup_unordered_store(
 ; GFX6-LABEL: global_workgroup_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -807,145 +991,163 @@ entry:
 define amdgpu_kernel void @global_workgroup_monotonic_store(
 ; GFX6-LABEL: global_workgroup_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -956,145 +1158,181 @@ entry:
 define amdgpu_kernel void @global_workgroup_release_store(
 ; GFX6-LABEL: global_workgroup_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -1105,145 +1343,181 @@ entry:
 define amdgpu_kernel void @global_workgroup_seq_cst_store(
 ; GFX6-LABEL: global_workgroup_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -1254,141 +1528,154 @@ entry:
 define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
 ; GFX6-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1399,112 +1686,129 @@ entry:
 define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
 ; GFX6-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -1512,21 +1816,21 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1534,12 +1838,12 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1550,141 +1854,172 @@ entry:
 define amdgpu_kernel void @global_workgroup_release_atomicrmw(
 ; GFX6-LABEL: global_workgroup_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1695,112 +2030,141 @@ entry:
 define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
 ; GFX6-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -1808,21 +2172,26 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1830,12 +2199,13 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1846,112 +2216,141 @@ entry:
 define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
 ; GFX6-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -1959,21 +2358,26 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1981,12 +2385,13 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -1997,64 +2402,78 @@ entry:
 define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
 ; GFX6-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -2063,106 +2482,108 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2174,65 +2595,85 @@ entry:
 define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -2240,106 +2681,120 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2351,65 +2806,85 @@ entry:
 define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -2417,106 +2892,120 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -2528,141 +3017,219 @@ entry:
 define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -2674,147 +3241,229 @@ entry:
 define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -2826,141 +3475,237 @@ entry:
 define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
 ; GFX6-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -2972,147 +3717,247 @@ entry:
 define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3124,147 +3969,247 @@ entry:
 define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3276,147 +4221,229 @@ entry:
 define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3428,147 +4455,229 @@ entry:
 define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3580,147 +4689,247 @@ entry:
 define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
 ; GFX6-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3732,147 +4941,247 @@ entry:
 define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -3884,147 +5193,247 @@ entry:
 define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4036,147 +5445,247 @@ entry:
 define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4188,147 +5697,247 @@ entry:
 define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4340,147 +5949,247 @@ entry:
 define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4492,147 +6201,247 @@ entry:
 define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4644,147 +6453,247 @@ entry:
 define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4796,169 +6705,248 @@ entry:
 define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -4972,174 +6960,255 @@ entry:
 define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5153,169 +7222,266 @@ entry:
 define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5329,174 +7495,273 @@ entry:
 define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5510,174 +7775,273 @@ entry:
 define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5691,174 +8055,255 @@ entry:
 define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -5872,174 +8317,255 @@ entry:
 define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6053,174 +8579,273 @@ entry:
 define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6234,174 +8859,273 @@ entry:
 define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6415,174 +9139,273 @@ entry:
 define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6596,174 +9419,273 @@ entry:
 define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6777,174 +9699,273 @@ entry:
 define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -6958,174 +9979,273 @@ entry:
 define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7139,174 +10259,273 @@ entry:
 define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7320,174 +10539,273 @@ entry:
 define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -7501,153 +10819,191 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
 ; GFX6-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7659,153 +11015,191 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
 ; GFX6-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7817,158 +11211,198 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
 ; GFX6-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -7980,158 +11414,204 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
 ; GFX6-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX6-NEXT:    s_mov_b32 s4, s2
-; GFX6-NEXT:    s_mov_b32 s5, s3
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX7-NEXT:    flat_load_dword v2, v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
 ; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[0:1] sc0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
@@ -8143,145 +11623,163 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
 ; GFX6-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8292,145 +11790,163 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
 ; GFX6-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8441,145 +11957,173 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_release_store(
 ; GFX6-LABEL: global_workgroup_one_as_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8590,145 +12134,173 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
 ; GFX6-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(1) %out) {
 entry:
@@ -8739,141 +12311,154 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
 ; GFX6-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -8884,112 +12469,129 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
 ; GFX6-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -8997,21 +12599,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -9019,12 +12621,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9035,141 +12637,164 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
 ; GFX6-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9180,112 +12805,135 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX6-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -9293,21 +12941,25 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -9315,12 +12967,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9331,112 +12983,135 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX6-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -9444,21 +13119,25 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -9466,12 +13145,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9482,64 +13161,78 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
 ; GFX6-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -9548,106 +13241,108 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9659,64 +13354,80 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -9725,106 +13436,116 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -9836,64 +13557,80 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x2
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
@@ -9902,106 +13639,116 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x8
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in) {
 entry:
@@ -10013,141 +13760,219 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10159,147 +13984,229 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10311,141 +14218,229 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10457,147 +14452,239 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10609,147 +14696,239 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10761,147 +14940,229 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -10913,147 +15174,229 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11065,147 +15408,239 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11217,147 +15652,239 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11369,147 +15896,239 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11521,147 +16140,239 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11673,147 +16384,239 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11825,147 +16628,239 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -11977,147 +16872,239 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12129,147 +17116,239 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s0, 16
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12281,169 +17360,248 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12457,174 +17615,255 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12638,169 +17877,258 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12814,174 +18142,265 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -12995,174 +18414,265 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13176,174 +18686,255 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13357,174 +18948,255 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13538,174 +19210,265 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13719,174 +19482,265 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -13900,174 +19754,265 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14081,174 +20026,265 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14262,174 +20298,265 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14443,174 +20570,265 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14624,174 +20842,265 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
@@ -14805,174 +21114,265 @@ entry:
 define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s4, s0, 16
-; GFX7-NEXT:    s_addc_u32 s5, s1, 0
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s1
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
-; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %out, i32 %in, i32 %old) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 1e3333c4287e..ff1538b146d2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -1,159 +1,181 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @local_agent_unordered_load(
 ; GFX6-LABEL: local_agent_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -165,144 +187,166 @@ entry:
 define amdgpu_kernel void @local_agent_monotonic_load(
 ; GFX6-LABEL: local_agent_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -314,149 +358,171 @@ entry:
 define amdgpu_kernel void @local_agent_acquire_load(
 ; GFX6-LABEL: local_agent_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -468,149 +534,189 @@ entry:
 define amdgpu_kernel void @local_agent_seq_cst_load(
 ; GFX6-LABEL: local_agent_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_read_b32 v1, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_read_b32 v1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -622,45 +728,51 @@ entry:
 define amdgpu_kernel void @local_agent_unordered_store(
 ; GFX6-LABEL: local_agent_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -670,25 +782,28 @@ define amdgpu_kernel void @local_agent_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -697,7 +812,8 @@ define amdgpu_kernel void @local_agent_unordered_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -706,33 +822,41 @@ define amdgpu_kernel void @local_agent_unordered_store(
 ;
 ; GFX11-WGP-LABEL: local_agent_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -744,45 +868,51 @@ entry:
 define amdgpu_kernel void @local_agent_monotonic_store(
 ; GFX6-LABEL: local_agent_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -792,25 +922,28 @@ define amdgpu_kernel void @local_agent_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -819,7 +952,8 @@ define amdgpu_kernel void @local_agent_monotonic_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -828,33 +962,41 @@ define amdgpu_kernel void @local_agent_monotonic_store(
 ;
 ; GFX11-WGP-LABEL: local_agent_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -866,117 +1008,153 @@ entry:
 define amdgpu_kernel void @local_agent_release_store(
 ; GFX6-LABEL: local_agent_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -988,117 +1166,153 @@ entry:
 define amdgpu_kernel void @local_agent_seq_cst_store(
 ; GFX6-LABEL: local_agent_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -1110,117 +1324,135 @@ entry:
 define amdgpu_kernel void @local_agent_monotonic_atomicrmw(
 ; GFX6-LABEL: local_agent_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1232,32 +1464,36 @@ entry:
 define amdgpu_kernel void @local_agent_acquire_atomicrmw(
 ; GFX6-LABEL: local_agent_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1265,70 +1501,78 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: local_agent_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -1336,18 +1580,22 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: local_agent_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1355,9 +1603,11 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: local_agent_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1370,117 +1620,153 @@ entry:
 define amdgpu_kernel void @local_agent_release_atomicrmw(
 ; GFX6-LABEL: local_agent_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1492,32 +1778,40 @@ entry:
 define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
 ; GFX6-LABEL: local_agent_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1525,70 +1819,86 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: local_agent_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -1596,18 +1906,27 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: local_agent_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1615,9 +1934,12 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: local_agent_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1630,32 +1952,40 @@ entry:
 define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
 ; GFX6-LABEL: local_agent_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1663,70 +1993,86 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: local_agent_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -1734,18 +2080,27 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: local_agent_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1753,9 +2108,12 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: local_agent_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1768,148 +2126,182 @@ entry:
 define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw(
 ; GFX6-LABEL: local_agent_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1922,148 +2314,200 @@ entry:
 define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -2076,148 +2520,200 @@ entry:
 define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -2230,131 +2726,174 @@ entry:
 define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2367,36 +2906,45 @@ entry:
 define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -2404,77 +2952,99 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -2482,20 +3052,28 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -2503,10 +3081,14 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2520,131 +3102,192 @@ entry:
 define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
 ; GFX6-LABEL: local_agent_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2657,36 +3300,49 @@ entry:
 define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -2694,77 +3350,107 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -2772,20 +3458,33 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -2793,10 +3492,15 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2810,36 +3514,49 @@ entry:
 define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -2847,77 +3564,107 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -2925,20 +3672,33 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -2946,10 +3706,15 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2963,36 +3728,45 @@ entry:
 define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3000,77 +3774,99 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3078,20 +3874,28 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3099,10 +3903,14 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3116,36 +3924,45 @@ entry:
 define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3153,77 +3970,99 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3231,20 +4070,28 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3252,10 +4099,14 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3269,36 +4120,49 @@ entry:
 define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
 ; GFX6-LABEL: local_agent_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3306,77 +4170,107 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_agent_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3384,20 +4278,33 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_agent_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3405,10 +4312,15 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_agent_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3422,36 +4334,49 @@ entry:
 define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3459,77 +4384,107 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3537,20 +4492,33 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3558,10 +4526,15 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3575,36 +4548,49 @@ entry:
 define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3612,77 +4598,107 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3690,20 +4706,33 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3711,10 +4740,15 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3728,36 +4762,49 @@ entry:
 define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3765,77 +4812,107 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3843,20 +4920,33 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3864,10 +4954,15 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3881,36 +4976,49 @@ entry:
 define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3918,77 +5026,107 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3996,20 +5134,33 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -4017,10 +5168,15 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4034,36 +5190,49 @@ entry:
 define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4071,77 +5240,107 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4149,20 +5348,33 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -4170,10 +5382,15 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4187,36 +5404,49 @@ entry:
 define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4224,77 +5454,107 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4302,20 +5562,33 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -4323,10 +5596,15 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4340,36 +5618,49 @@ entry:
 define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4377,77 +5668,107 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4455,20 +5776,33 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -4476,10 +5810,15 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4493,156 +5832,212 @@ entry:
 define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4658,59 +6053,77 @@ entry:
 define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -4718,102 +6131,140 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -4828,156 +6279,230 @@ entry:
 define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4993,162 +6518,236 @@ entry:
 define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5163,162 +6762,236 @@ entry:
 define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5333,59 +7006,77 @@ entry:
 define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5393,102 +7084,140 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5503,59 +7232,77 @@ entry:
 define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5563,102 +7310,140 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5673,162 +7458,236 @@ entry:
 define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5843,162 +7702,236 @@ entry:
 define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6013,162 +7946,236 @@ entry:
 define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6183,162 +8190,236 @@ entry:
 define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6353,162 +8434,236 @@ entry:
 define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6523,162 +8678,236 @@ entry:
 define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6693,162 +8922,236 @@ entry:
 define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6863,162 +9166,236 @@ entry:
 define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -7033,144 +9410,166 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_unordered_load(
 ; GFX6-LABEL: local_agent_one_as_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7182,144 +9581,166 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_monotonic_load(
 ; GFX6-LABEL: local_agent_one_as_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7331,144 +9752,166 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acquire_load(
 ; GFX6-LABEL: local_agent_one_as_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7480,144 +9923,166 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_seq_cst_load(
 ; GFX6-LABEL: local_agent_one_as_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7629,45 +10094,51 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_unordered_store(
 ; GFX6-LABEL: local_agent_one_as_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7677,25 +10148,28 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7704,7 +10178,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7713,33 +10188,41 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store(
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7751,45 +10234,51 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_monotonic_store(
 ; GFX6-LABEL: local_agent_one_as_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7799,25 +10288,28 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7826,7 +10318,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7835,33 +10328,41 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store(
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7873,45 +10374,51 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_release_store(
 ; GFX6-LABEL: local_agent_one_as_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7921,25 +10428,28 @@ define amdgpu_kernel void @local_agent_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7948,7 +10458,8 @@ define amdgpu_kernel void @local_agent_one_as_release_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7957,33 +10468,41 @@ define amdgpu_kernel void @local_agent_one_as_release_store(
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7995,45 +10514,51 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_seq_cst_store(
 ; GFX6-LABEL: local_agent_one_as_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -8043,25 +10568,28 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -8070,7 +10598,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -8079,33 +10608,41 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store(
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -8117,117 +10654,135 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw(
 ; GFX6-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8239,117 +10794,135 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw(
 ; GFX6-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8361,117 +10934,135 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_release_atomicrmw(
 ; GFX6-LABEL: local_agent_one_as_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8483,117 +11074,135 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw(
 ; GFX6-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8605,117 +11214,135 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw(
 ; GFX6-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8727,142 +11354,176 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw(
 ; GFX6-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8876,142 +11537,176 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -9025,142 +11720,176 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -9174,131 +11903,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9311,131 +12083,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9448,131 +12263,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9585,131 +12443,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9722,131 +12623,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9859,131 +12803,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9996,131 +12983,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10133,131 +13163,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10270,131 +13343,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10407,131 +13523,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10544,131 +13703,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10681,131 +13883,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10818,131 +14063,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10955,131 +14243,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -11092,131 +14423,174 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -11229,156 +14603,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11394,156 +14824,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11559,156 +15045,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11724,156 +15266,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11889,156 +15487,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12054,156 +15708,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12219,156 +15929,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12384,156 +16150,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12549,156 +16371,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12714,156 +16592,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12879,156 +16813,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13044,156 +17034,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13209,156 +17255,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13374,156 +17476,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13539,156 +17697,212 @@ entry:
 define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index 16ede2665400..ba9711333a19 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -1,189 +1,209 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @local_nontemporal_load_0(
 ; GFX6-LABEL: local_nontemporal_load_0:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 m0, -1
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr8
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7-NEXT:    ds_read_b32 v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    ds_read_b32 v1, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_nontemporal_load_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v1
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_nontemporal_load_0:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_nontemporal_load_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_nontemporal_load_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    ds_load_b32 v1, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_nontemporal_load_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_nontemporal_load_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    ds_load_b32 v1, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(1) %out) {
 entry:
@@ -195,181 +215,240 @@ entry:
 define amdgpu_kernel void @local_nontemporal_load_1(
 ; GFX6-LABEL: local_nontemporal_load_1:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    s_mov_b32 m0, -1
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr8
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    s_mov_b32 s9, 2
+; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s9, v0
+; GFX6-NEXT:    v_add_i32_e64 v0, s[8:9], s8, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b32 v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, 2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-CU-NEXT:    ds_read_b32 v1, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_nontemporal_load_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s5, v0
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
+; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s6, v2
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
+; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s6, v2
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v1
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s2, 2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s4
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s2, v2
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_nontemporal_load_1:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s2, 2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s4
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s2, v2
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_nontemporal_load_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_nontemporal_load_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX11-CU-NEXT:    s_mov_b32 s2, 2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-CU-NEXT:    ds_load_b32 v1, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_nontemporal_load_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 2
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_nontemporal_load_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX12-CU-NEXT:    s_mov_b32 s2, 2
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
+; GFX12-CU-NEXT:    ds_load_b32 v1, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(1) %out) {
 entry:
@@ -383,100 +462,111 @@ entry:
 define amdgpu_kernel void @local_nontemporal_store_0(
 ; GFX6-LABEL: local_nontemporal_store_0:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x2
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_nontemporal_store_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
@@ -484,11 +574,13 @@ define amdgpu_kernel void @local_nontemporal_store_0(
 ;
 ; GFX940-TGSPLIT-LABEL: local_nontemporal_store_0:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
@@ -496,45 +588,53 @@ define amdgpu_kernel void @local_nontemporal_store_0(
 ;
 ; GFX11-WGP-LABEL: local_nontemporal_store_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_nontemporal_store_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s1, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_nontemporal_store_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_nontemporal_store_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(3) %out) {
@@ -547,103 +647,126 @@ entry:
 define amdgpu_kernel void @local_nontemporal_store_1(
 ; GFX6-LABEL: local_nontemporal_store_1:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x2
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_mov_b32 s6, 2
+; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
+; GFX6-NEXT:    v_add_i32_e64 v0, s[6:7], s5, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s6, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
+; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s5, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b32 s5, 2
+; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, s5, s6
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b32 s5, 2
+; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, s5, s6
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_nontemporal_store_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[2:3], s1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s5, 0x3ff
+; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s5, 2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, s5, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s5, 0x3ff
+; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s5
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s5, 2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, s5, v1
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s4
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, s1, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
@@ -651,11 +774,16 @@ define amdgpu_kernel void @local_nontemporal_store_1(
 ;
 ; GFX940-TGSPLIT-LABEL: local_nontemporal_store_1:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s4
-; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s1, 2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, s1, v1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
@@ -663,36 +791,44 @@ define amdgpu_kernel void @local_nontemporal_store_1(
 ;
 ; GFX11-WGP-LABEL: local_nontemporal_store_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
-; GFX11-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 2
+; GFX11-WGP-NEXT:    v_lshl_add_u32 v0, v0, s1, s2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_nontemporal_store_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
-; GFX11-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX11-CU-NEXT:    s_mov_b32 s1, 2
+; GFX11-CU-NEXT:    v_lshl_add_u32 v0, v0, s1, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_nontemporal_store_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX12-WGP-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX12-WGP-NEXT:    s_mov_b32 s1, 2
+; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, s1, s2
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
@@ -700,10 +836,14 @@ define amdgpu_kernel void @local_nontemporal_store_1(
 ;
 ; GFX12-CU-LABEL: local_nontemporal_store_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX12-CU-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX12-CU-NEXT:    s_mov_b32 s1, 2
+; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, s1, s2
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
@@ -720,174 +860,194 @@ entry:
 define amdgpu_kernel void @local_nontemporal_volatile_load(
 ; GFX6-LABEL: local_nontemporal_volatile_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_mov_b32 m0, -1
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr8
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7-NEXT:    ds_read_b32 v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    ds_read_b32 v1, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_nontemporal_volatile_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v1
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_nontemporal_volatile_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_nontemporal_volatile_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_nontemporal_volatile_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    ds_load_b32 v1, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_nontemporal_volatile_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_nontemporal_volatile_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    ds_load_b32 v1, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(1) %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
index 664203a8f730..fe5f2c51734f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
@@ -1,159 +1,181 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @local_singlethread_unordered_load(
 ; GFX6-LABEL: local_singlethread_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -165,144 +187,166 @@ entry:
 define amdgpu_kernel void @local_singlethread_monotonic_load(
 ; GFX6-LABEL: local_singlethread_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -314,144 +358,166 @@ entry:
 define amdgpu_kernel void @local_singlethread_acquire_load(
 ; GFX6-LABEL: local_singlethread_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -463,144 +529,166 @@ entry:
 define amdgpu_kernel void @local_singlethread_seq_cst_load(
 ; GFX6-LABEL: local_singlethread_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -612,45 +700,51 @@ entry:
 define amdgpu_kernel void @local_singlethread_unordered_store(
 ; GFX6-LABEL: local_singlethread_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -660,25 +754,28 @@ define amdgpu_kernel void @local_singlethread_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -687,7 +784,8 @@ define amdgpu_kernel void @local_singlethread_unordered_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -696,33 +794,41 @@ define amdgpu_kernel void @local_singlethread_unordered_store(
 ;
 ; GFX11-WGP-LABEL: local_singlethread_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -734,45 +840,51 @@ entry:
 define amdgpu_kernel void @local_singlethread_monotonic_store(
 ; GFX6-LABEL: local_singlethread_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -782,25 +894,28 @@ define amdgpu_kernel void @local_singlethread_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -809,7 +924,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -818,33 +934,41 @@ define amdgpu_kernel void @local_singlethread_monotonic_store(
 ;
 ; GFX11-WGP-LABEL: local_singlethread_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -856,45 +980,51 @@ entry:
 define amdgpu_kernel void @local_singlethread_release_store(
 ; GFX6-LABEL: local_singlethread_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -904,25 +1034,28 @@ define amdgpu_kernel void @local_singlethread_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -931,7 +1064,8 @@ define amdgpu_kernel void @local_singlethread_release_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -940,33 +1074,41 @@ define amdgpu_kernel void @local_singlethread_release_store(
 ;
 ; GFX11-WGP-LABEL: local_singlethread_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -978,45 +1120,51 @@ entry:
 define amdgpu_kernel void @local_singlethread_seq_cst_store(
 ; GFX6-LABEL: local_singlethread_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -1026,25 +1174,28 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -1053,7 +1204,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -1062,33 +1214,41 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store(
 ;
 ; GFX11-WGP-LABEL: local_singlethread_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -1100,117 +1260,135 @@ entry:
 define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw(
 ; GFX6-LABEL: local_singlethread_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1222,117 +1400,135 @@ entry:
 define amdgpu_kernel void @local_singlethread_acquire_atomicrmw(
 ; GFX6-LABEL: local_singlethread_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1344,117 +1540,135 @@ entry:
 define amdgpu_kernel void @local_singlethread_release_atomicrmw(
 ; GFX6-LABEL: local_singlethread_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1466,117 +1680,135 @@ entry:
 define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw(
 ; GFX6-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1588,117 +1820,135 @@ entry:
 define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw(
 ; GFX6-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1710,142 +1960,176 @@ entry:
 define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw(
 ; GFX6-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1859,142 +2143,176 @@ entry:
 define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2008,142 +2326,176 @@ entry:
 define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2157,131 +2509,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2294,131 +2689,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2431,131 +2869,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg(
 ; GFX6-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2568,131 +3049,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2705,131 +3229,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2842,131 +3409,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2979,131 +3589,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3116,131 +3769,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg(
 ; GFX6-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3253,131 +3949,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3390,131 +4129,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3527,131 +4309,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3664,131 +4489,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3801,131 +4669,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3938,131 +4849,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -4075,131 +5029,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -4212,156 +5209,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4377,156 +5430,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4542,156 +5651,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4707,156 +5872,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4872,156 +6093,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5037,156 +6314,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5202,156 +6535,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5367,156 +6756,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5532,156 +6977,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5697,156 +7198,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5862,156 +7419,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6027,156 +7640,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6192,156 +7861,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6357,156 +8082,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6522,156 +8303,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6687,144 +8524,166 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_unordered_load(
 ; GFX6-LABEL: local_singlethread_one_as_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -6836,144 +8695,166 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_monotonic_load(
 ; GFX6-LABEL: local_singlethread_one_as_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -6985,144 +8866,166 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acquire_load(
 ; GFX6-LABEL: local_singlethread_one_as_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7134,144 +9037,166 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load(
 ; GFX6-LABEL: local_singlethread_one_as_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7283,45 +9208,51 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_unordered_store(
 ; GFX6-LABEL: local_singlethread_one_as_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7331,25 +9262,28 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7358,7 +9292,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7367,33 +9302,41 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store(
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7405,45 +9348,51 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_monotonic_store(
 ; GFX6-LABEL: local_singlethread_one_as_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7453,25 +9402,28 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7480,7 +9432,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7489,33 +9442,41 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store(
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7527,45 +9488,51 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_release_store(
 ; GFX6-LABEL: local_singlethread_one_as_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7575,25 +9542,28 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7602,7 +9572,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7611,33 +9582,41 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store(
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7649,45 +9628,51 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store(
 ; GFX6-LABEL: local_singlethread_one_as_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7697,25 +9682,28 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7724,7 +9712,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7733,33 +9722,41 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store(
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7771,117 +9768,135 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw(
 ; GFX6-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -7893,117 +9908,135 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw(
 ; GFX6-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8015,117 +10048,135 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw(
 ; GFX6-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8137,117 +10188,135 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw(
 ; GFX6-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8259,117 +10328,135 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw(
 ; GFX6-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8381,142 +10468,176 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw(
 ; GFX6-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8530,142 +10651,176 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8679,142 +10834,176 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8828,131 +11017,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -8965,131 +11197,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9102,131 +11377,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9239,131 +11557,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9376,131 +11737,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9513,131 +11917,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9650,131 +12097,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9787,131 +12277,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9924,131 +12457,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10061,131 +12637,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10198,131 +12817,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10335,131 +12997,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10472,131 +13177,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10609,131 +13357,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10746,131 +13537,174 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10883,156 +13717,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11048,156 +13938,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11213,156 +14159,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11378,156 +14380,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11543,156 +14601,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11708,156 +14822,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11873,156 +15043,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12038,156 +15264,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12203,156 +15485,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12368,156 +15706,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12533,156 +15927,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12698,156 +16148,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12863,156 +16369,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13028,156 +16590,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13193,156 +16811,212 @@ entry:
 define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 1a4268548b02..6d52a4d69b18 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -1,159 +1,181 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @local_system_unordered_load(
 ; GFX6-LABEL: local_system_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -165,144 +187,166 @@ entry:
 define amdgpu_kernel void @local_system_monotonic_load(
 ; GFX6-LABEL: local_system_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -314,149 +358,171 @@ entry:
 define amdgpu_kernel void @local_system_acquire_load(
 ; GFX6-LABEL: local_system_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -468,149 +534,189 @@ entry:
 define amdgpu_kernel void @local_system_seq_cst_load(
 ; GFX6-LABEL: local_system_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_read_b32 v1, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_read_b32 v1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -622,45 +728,51 @@ entry:
 define amdgpu_kernel void @local_system_unordered_store(
 ; GFX6-LABEL: local_system_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -670,25 +782,28 @@ define amdgpu_kernel void @local_system_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -697,7 +812,8 @@ define amdgpu_kernel void @local_system_unordered_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -706,33 +822,41 @@ define amdgpu_kernel void @local_system_unordered_store(
 ;
 ; GFX11-WGP-LABEL: local_system_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -744,45 +868,51 @@ entry:
 define amdgpu_kernel void @local_system_monotonic_store(
 ; GFX6-LABEL: local_system_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -792,25 +922,28 @@ define amdgpu_kernel void @local_system_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -819,7 +952,8 @@ define amdgpu_kernel void @local_system_monotonic_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -828,33 +962,41 @@ define amdgpu_kernel void @local_system_monotonic_store(
 ;
 ; GFX11-WGP-LABEL: local_system_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -866,117 +1008,153 @@ entry:
 define amdgpu_kernel void @local_system_release_store(
 ; GFX6-LABEL: local_system_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -988,117 +1166,153 @@ entry:
 define amdgpu_kernel void @local_system_seq_cst_store(
 ; GFX6-LABEL: local_system_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -1110,117 +1324,135 @@ entry:
 define amdgpu_kernel void @local_system_monotonic_atomicrmw(
 ; GFX6-LABEL: local_system_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1232,32 +1464,36 @@ entry:
 define amdgpu_kernel void @local_system_acquire_atomicrmw(
 ; GFX6-LABEL: local_system_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1265,70 +1501,78 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: local_system_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -1336,18 +1580,22 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: local_system_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1355,9 +1603,11 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: local_system_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1370,117 +1620,153 @@ entry:
 define amdgpu_kernel void @local_system_release_atomicrmw(
 ; GFX6-LABEL: local_system_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1492,32 +1778,40 @@ entry:
 define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
 ; GFX6-LABEL: local_system_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1525,70 +1819,86 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: local_system_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -1596,18 +1906,27 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: local_system_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1615,9 +1934,12 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: local_system_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1630,32 +1952,40 @@ entry:
 define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
 ; GFX6-LABEL: local_system_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1663,70 +1993,86 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: local_system_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -1734,18 +2080,27 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: local_system_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1753,9 +2108,12 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: local_system_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1768,148 +2126,182 @@ entry:
 define amdgpu_kernel void @local_system_acquire_ret_atomicrmw(
 ; GFX6-LABEL: local_system_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1922,148 +2314,200 @@ entry:
 define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -2076,148 +2520,200 @@ entry:
 define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -2230,131 +2726,174 @@ entry:
 define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2367,36 +2906,45 @@ entry:
 define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -2404,77 +2952,99 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -2482,20 +3052,28 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -2503,10 +3081,14 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2520,131 +3102,192 @@ entry:
 define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
 ; GFX6-LABEL: local_system_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2657,36 +3300,49 @@ entry:
 define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -2694,77 +3350,107 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -2772,20 +3458,33 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -2793,10 +3492,15 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2810,36 +3514,49 @@ entry:
 define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -2847,77 +3564,107 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -2925,20 +3672,33 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -2946,10 +3706,15 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2963,36 +3728,45 @@ entry:
 define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3000,77 +3774,99 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3078,20 +3874,28 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3099,10 +3903,14 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3116,36 +3924,45 @@ entry:
 define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: local_system_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3153,77 +3970,99 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_system_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3231,20 +4070,28 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_system_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3252,10 +4099,14 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_system_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3269,36 +4120,49 @@ entry:
 define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
 ; GFX6-LABEL: local_system_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3306,77 +4170,107 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_system_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3384,20 +4278,33 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_system_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3405,10 +4312,15 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_system_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3422,36 +4334,49 @@ entry:
 define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3459,77 +4384,107 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3537,20 +4492,33 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3558,10 +4526,15 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3575,36 +4548,49 @@ entry:
 define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3612,77 +4598,107 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3690,20 +4706,33 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3711,10 +4740,15 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3728,36 +4762,49 @@ entry:
 define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3765,77 +4812,107 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3843,20 +4920,33 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3864,10 +4954,15 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3881,36 +4976,49 @@ entry:
 define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3918,77 +5026,107 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3996,20 +5134,33 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -4017,10 +5168,15 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4034,36 +5190,49 @@ entry:
 define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_system_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4071,77 +5240,107 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_system_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4149,20 +5348,33 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_system_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -4170,10 +5382,15 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_system_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4187,36 +5404,49 @@ entry:
 define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4224,77 +5454,107 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4302,20 +5562,33 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -4323,10 +5596,15 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4340,36 +5618,49 @@ entry:
 define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4377,77 +5668,107 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4455,20 +5776,33 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -4476,10 +5810,15 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4493,156 +5832,212 @@ entry:
 define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4658,59 +6053,77 @@ entry:
 define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -4718,102 +6131,140 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -4828,156 +6279,230 @@ entry:
 define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4993,162 +6518,236 @@ entry:
 define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5163,162 +6762,236 @@ entry:
 define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5333,59 +7006,77 @@ entry:
 define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5393,102 +7084,140 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5503,59 +7232,77 @@ entry:
 define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5563,102 +7310,140 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5673,162 +7458,236 @@ entry:
 define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5843,162 +7702,236 @@ entry:
 define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6013,162 +7946,236 @@ entry:
 define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6183,162 +8190,236 @@ entry:
 define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6353,162 +8434,236 @@ entry:
 define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6523,162 +8678,236 @@ entry:
 define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6693,162 +8922,236 @@ entry:
 define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6863,162 +9166,236 @@ entry:
 define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -7033,144 +9410,166 @@ entry:
 define amdgpu_kernel void @local_system_one_as_unordered_load(
 ; GFX6-LABEL: local_system_one_as_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7182,144 +9581,166 @@ entry:
 define amdgpu_kernel void @local_system_one_as_monotonic_load(
 ; GFX6-LABEL: local_system_one_as_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7331,144 +9752,166 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acquire_load(
 ; GFX6-LABEL: local_system_one_as_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7480,144 +9923,166 @@ entry:
 define amdgpu_kernel void @local_system_one_as_seq_cst_load(
 ; GFX6-LABEL: local_system_one_as_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7629,45 +10094,51 @@ entry:
 define amdgpu_kernel void @local_system_one_as_unordered_store(
 ; GFX6-LABEL: local_system_one_as_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7677,25 +10148,28 @@ define amdgpu_kernel void @local_system_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7704,7 +10178,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7713,33 +10188,41 @@ define amdgpu_kernel void @local_system_one_as_unordered_store(
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7751,45 +10234,51 @@ entry:
 define amdgpu_kernel void @local_system_one_as_monotonic_store(
 ; GFX6-LABEL: local_system_one_as_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7799,25 +10288,28 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7826,7 +10318,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7835,33 +10328,41 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store(
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7873,45 +10374,51 @@ entry:
 define amdgpu_kernel void @local_system_one_as_release_store(
 ; GFX6-LABEL: local_system_one_as_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7921,25 +10428,28 @@ define amdgpu_kernel void @local_system_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7948,7 +10458,8 @@ define amdgpu_kernel void @local_system_one_as_release_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7957,33 +10468,41 @@ define amdgpu_kernel void @local_system_one_as_release_store(
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7995,45 +10514,51 @@ entry:
 define amdgpu_kernel void @local_system_one_as_seq_cst_store(
 ; GFX6-LABEL: local_system_one_as_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -8043,25 +10568,28 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -8070,7 +10598,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -8079,33 +10608,41 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store(
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -8117,117 +10654,135 @@ entry:
 define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw(
 ; GFX6-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8239,117 +10794,135 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw(
 ; GFX6-LABEL: local_system_one_as_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8361,117 +10934,135 @@ entry:
 define amdgpu_kernel void @local_system_one_as_release_atomicrmw(
 ; GFX6-LABEL: local_system_one_as_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8483,117 +11074,135 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw(
 ; GFX6-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8605,117 +11214,135 @@ entry:
 define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw(
 ; GFX6-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8727,142 +11354,176 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw(
 ; GFX6-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8876,142 +11537,176 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -9025,142 +11720,176 @@ entry:
 define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -9174,131 +11903,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9311,131 +12083,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9448,131 +12263,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9585,131 +12443,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9722,131 +12623,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9859,131 +12803,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9996,131 +12983,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10133,131 +13163,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10270,131 +13343,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10407,131 +13523,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10544,131 +13703,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10681,131 +13883,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10818,131 +14063,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10955,131 +14243,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -11092,131 +14423,174 @@ entry:
 define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -11229,156 +14603,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11394,156 +14824,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11559,156 +15045,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11724,156 +15266,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11889,156 +15487,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12054,156 +15708,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12219,156 +15929,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12384,156 +16150,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12549,156 +16371,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12714,156 +16592,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12879,156 +16813,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13044,156 +17034,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13209,156 +17255,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13374,156 +17476,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13539,156 +17697,212 @@ entry:
 define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index ac09f65c32f4..2a15a0d0727f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -1,137 +1,149 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd- -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @local_volatile_load_0(
 ; GFX6-LABEL: local_volatile_load_0:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; GFX6-NEXT:    s_mov_b32 m0, -1
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_load_dword s4, s[2:3], 0x9
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr4
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    s_mov_b32 s7, s1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX6-NEXT:    s_mov_b32 s5, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX6-NEXT:    s_mov_b32 s1, s7
+; GFX6-NEXT:    s_mov_b32 s2, s6
+; GFX6-NEXT:    s_mov_b32 s3, s5
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7-NEXT:    ds_read_b32 v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_volatile_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_volatile_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    ds_read_b32 v1, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_volatile_load_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
 ; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_volatile_load_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_volatile_load_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    ds_load_b32 v1, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_volatile_load_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_volatile_load_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    ds_load_b32 v1, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(1) %out) {
 entry:
@@ -143,133 +155,164 @@ entry:
 define amdgpu_kernel void @local_volatile_load_1(
 ; GFX6-LABEL: local_volatile_load_1:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    s_mov_b32 m0, -1
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_load_dword s4, s[2:3], 0x9
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr4
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    s_mov_b32 s7, s1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX6-NEXT:    s_mov_b32 s5, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX6-NEXT:    s_mov_b32 s1, s7
+; GFX6-NEXT:    s_mov_b32 s2, s6
+; GFX6-NEXT:    s_mov_b32 s3, s5
+; GFX6-NEXT:    s_mov_b32 s5, 2
+; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s5, v0
+; GFX6-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b32 v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_volatile_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_volatile_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, 2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-CU-NEXT:    ds_read_b32 v1, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_volatile_load_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s5, v0
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_volatile_load_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_volatile_load_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX11-CU-NEXT:    s_mov_b32 s2, 2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-CU-NEXT:    ds_load_b32 v1, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_volatile_load_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
-; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 2
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_volatile_load_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
-; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX12-CU-NEXT:    s_mov_b32 s2, 2
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
+; GFX12-CU-NEXT:    ds_load_b32 v1, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(1) %out) {
 entry:
@@ -283,110 +326,133 @@ entry:
 define amdgpu_kernel void @local_volatile_store_0(
 ; GFX6-LABEL: local_volatile_store_0:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
-; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_load_dword s1, s[2:3], 0xb
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr1
+; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x9
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x0
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_volatile_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_volatile_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_volatile_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_volatile_store_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_volatile_store_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_volatile_store_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s1, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_volatile_store_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_volatile_store_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(3) %out) {
@@ -399,117 +465,147 @@ entry:
 define amdgpu_kernel void @local_volatile_store_1(
 ; GFX6-LABEL: local_volatile_store_1:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_load_dword s1, s[2:3], 0xb
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr1
+; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x9
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x0
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX6-NEXT:    s_mov_b32 s2, 2
+; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
+; GFX6-NEXT:    v_add_i32_e64 v0, s[2:3], s1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_volatile_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s6, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
+; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s5, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_volatile_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b32 s5, 2
+; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, s5, s6
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_volatile_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b32 s5, 2
+; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, s5, s6
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_volatile_store_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[2:3], s1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_volatile_store_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
-; GFX11-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 2
+; GFX11-WGP-NEXT:    v_lshl_add_u32 v0, v0, s1, s2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_volatile_store_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
-; GFX11-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX11-CU-NEXT:    s_mov_b32 s1, 2
+; GFX11-CU-NEXT:    v_lshl_add_u32 v0, v0, s1, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_volatile_store_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX12-WGP-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX12-WGP-NEXT:    s_mov_b32 s1, 2
+; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, s1, s2
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_volatile_store_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX12-CU-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX12-CU-NEXT:    s_mov_b32 s1, 2
+; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, s1, s2
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(3) %out) {
@@ -524,103 +620,121 @@ entry:
 define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
 ; GFX6-LABEL: local_volatile_workgroup_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x9
+; GFX6-NEXT:    s_load_dword s0, s[2:3], 0xa
+; GFX6-NEXT:    ; kill: def $sgpr2 killed $sgpr0
+; GFX6-NEXT:    ; kill: def $sgpr2 killed $sgpr1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    ds_read_b32 v1, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_volatile_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_volatile_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_volatile_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_volatile_workgroup_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_volatile_workgroup_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_volatile_workgroup_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_volatile_workgroup_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -632,81 +746,109 @@ entry:
 define amdgpu_kernel void @local_volatile_workgroup_release_store(
 ; GFX6-LABEL: local_volatile_workgroup_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dword s1, s[2:3], 0xa
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr1
+; GFX6-NEXT:    s_load_dword s0, s[2:3], 0x9
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_volatile_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_volatile_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_volatile_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_volatile_workgroup_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_volatile_workgroup_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_volatile_workgroup_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_volatile_workgroup_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
    i32 %in, ptr addrspace(3) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
index d210de64b3fb..02e4e0d69dc2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
@@ -1,159 +1,181 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @local_wavefront_unordered_load(
 ; GFX6-LABEL: local_wavefront_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -165,144 +187,166 @@ entry:
 define amdgpu_kernel void @local_wavefront_monotonic_load(
 ; GFX6-LABEL: local_wavefront_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -314,144 +358,166 @@ entry:
 define amdgpu_kernel void @local_wavefront_acquire_load(
 ; GFX6-LABEL: local_wavefront_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -463,144 +529,166 @@ entry:
 define amdgpu_kernel void @local_wavefront_seq_cst_load(
 ; GFX6-LABEL: local_wavefront_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -612,45 +700,51 @@ entry:
 define amdgpu_kernel void @local_wavefront_unordered_store(
 ; GFX6-LABEL: local_wavefront_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -660,25 +754,28 @@ define amdgpu_kernel void @local_wavefront_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -687,7 +784,8 @@ define amdgpu_kernel void @local_wavefront_unordered_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -696,33 +794,41 @@ define amdgpu_kernel void @local_wavefront_unordered_store(
 ;
 ; GFX11-WGP-LABEL: local_wavefront_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -734,45 +840,51 @@ entry:
 define amdgpu_kernel void @local_wavefront_monotonic_store(
 ; GFX6-LABEL: local_wavefront_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -782,25 +894,28 @@ define amdgpu_kernel void @local_wavefront_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -809,7 +924,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -818,33 +934,41 @@ define amdgpu_kernel void @local_wavefront_monotonic_store(
 ;
 ; GFX11-WGP-LABEL: local_wavefront_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -856,45 +980,51 @@ entry:
 define amdgpu_kernel void @local_wavefront_release_store(
 ; GFX6-LABEL: local_wavefront_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -904,25 +1034,28 @@ define amdgpu_kernel void @local_wavefront_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -931,7 +1064,8 @@ define amdgpu_kernel void @local_wavefront_release_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -940,33 +1074,41 @@ define amdgpu_kernel void @local_wavefront_release_store(
 ;
 ; GFX11-WGP-LABEL: local_wavefront_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -978,45 +1120,51 @@ entry:
 define amdgpu_kernel void @local_wavefront_seq_cst_store(
 ; GFX6-LABEL: local_wavefront_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -1026,25 +1174,28 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -1053,7 +1204,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -1062,33 +1214,41 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store(
 ;
 ; GFX11-WGP-LABEL: local_wavefront_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -1100,117 +1260,135 @@ entry:
 define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw(
 ; GFX6-LABEL: local_wavefront_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1222,117 +1400,135 @@ entry:
 define amdgpu_kernel void @local_wavefront_acquire_atomicrmw(
 ; GFX6-LABEL: local_wavefront_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1344,117 +1540,135 @@ entry:
 define amdgpu_kernel void @local_wavefront_release_atomicrmw(
 ; GFX6-LABEL: local_wavefront_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1466,117 +1680,135 @@ entry:
 define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw(
 ; GFX6-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1588,117 +1820,135 @@ entry:
 define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw(
 ; GFX6-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1710,142 +1960,176 @@ entry:
 define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw(
 ; GFX6-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1859,142 +2143,176 @@ entry:
 define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2008,142 +2326,176 @@ entry:
 define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2157,131 +2509,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2294,131 +2689,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2431,131 +2869,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg(
 ; GFX6-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2568,131 +3049,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2705,131 +3229,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2842,131 +3409,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2979,131 +3589,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3116,131 +3769,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg(
 ; GFX6-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3253,131 +3949,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3390,131 +4129,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3527,131 +4309,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3664,131 +4489,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3801,131 +4669,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -3938,131 +4849,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -4075,131 +5029,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -4212,156 +5209,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4377,156 +5430,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4542,156 +5651,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4707,156 +5872,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4872,156 +6093,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5037,156 +6314,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5202,156 +6535,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5367,156 +6756,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5532,156 +6977,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5697,156 +7198,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -5862,156 +7419,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6027,156 +7640,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6192,156 +7861,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6357,156 +8082,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6522,156 +8303,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -6687,144 +8524,166 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_unordered_load(
 ; GFX6-LABEL: local_wavefront_one_as_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -6836,144 +8695,166 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_monotonic_load(
 ; GFX6-LABEL: local_wavefront_one_as_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -6985,144 +8866,166 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acquire_load(
 ; GFX6-LABEL: local_wavefront_one_as_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7134,144 +9037,166 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load(
 ; GFX6-LABEL: local_wavefront_one_as_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7283,45 +9208,51 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_unordered_store(
 ; GFX6-LABEL: local_wavefront_one_as_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7331,25 +9262,28 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7358,7 +9292,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7367,33 +9302,41 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store(
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7405,45 +9348,51 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_monotonic_store(
 ; GFX6-LABEL: local_wavefront_one_as_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7453,25 +9402,28 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7480,7 +9432,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7489,33 +9442,41 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store(
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7527,45 +9488,51 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_release_store(
 ; GFX6-LABEL: local_wavefront_one_as_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7575,25 +9542,28 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7602,7 +9572,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7611,33 +9582,41 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store(
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7649,45 +9628,51 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store(
 ; GFX6-LABEL: local_wavefront_one_as_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7697,25 +9682,28 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7724,7 +9712,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7733,33 +9722,41 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store(
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7771,117 +9768,135 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw(
 ; GFX6-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -7893,117 +9908,135 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw(
 ; GFX6-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8015,117 +10048,135 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw(
 ; GFX6-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8137,117 +10188,135 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw(
 ; GFX6-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8259,117 +10328,135 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw(
 ; GFX6-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8381,142 +10468,176 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw(
 ; GFX6-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8530,142 +10651,176 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8679,142 +10834,176 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8828,131 +11017,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -8965,131 +11197,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9102,131 +11377,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9239,131 +11557,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9376,131 +11737,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9513,131 +11917,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9650,131 +12097,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9787,131 +12277,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9924,131 +12457,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10061,131 +12637,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10198,131 +12817,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10335,131 +12997,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10472,131 +13177,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10609,131 +13357,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10746,131 +13537,174 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10883,156 +13717,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11048,156 +13938,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11213,156 +14159,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11378,156 +14380,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11543,156 +14601,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11708,156 +14822,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11873,156 +15043,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12038,156 +15264,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12203,156 +15485,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12368,156 +15706,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12533,156 +15927,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12698,156 +16148,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12863,156 +16369,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13028,156 +16590,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13193,156 +16811,212 @@ entry:
 define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 6f288bcf98af..0f56342c825b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -1,159 +1,181 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @local_workgroup_unordered_load(
 ; GFX6-LABEL: local_workgroup_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -165,144 +187,166 @@ entry:
 define amdgpu_kernel void @local_workgroup_monotonic_load(
 ; GFX6-LABEL: local_workgroup_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -314,149 +358,171 @@ entry:
 define amdgpu_kernel void @local_workgroup_acquire_load(
 ; GFX6-LABEL: local_workgroup_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -468,149 +534,189 @@ entry:
 define amdgpu_kernel void @local_workgroup_seq_cst_load(
 ; GFX6-LABEL: local_workgroup_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_read_b32 v1, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_read_b32 v1, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -622,45 +728,51 @@ entry:
 define amdgpu_kernel void @local_workgroup_unordered_store(
 ; GFX6-LABEL: local_workgroup_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -670,25 +782,28 @@ define amdgpu_kernel void @local_workgroup_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -697,7 +812,8 @@ define amdgpu_kernel void @local_workgroup_unordered_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -706,33 +822,41 @@ define amdgpu_kernel void @local_workgroup_unordered_store(
 ;
 ; GFX11-WGP-LABEL: local_workgroup_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -744,45 +868,51 @@ entry:
 define amdgpu_kernel void @local_workgroup_monotonic_store(
 ; GFX6-LABEL: local_workgroup_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -792,25 +922,28 @@ define amdgpu_kernel void @local_workgroup_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -819,7 +952,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -828,33 +962,41 @@ define amdgpu_kernel void @local_workgroup_monotonic_store(
 ;
 ; GFX11-WGP-LABEL: local_workgroup_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -866,117 +1008,153 @@ entry:
 define amdgpu_kernel void @local_workgroup_release_store(
 ; GFX6-LABEL: local_workgroup_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -988,117 +1166,153 @@ entry:
 define amdgpu_kernel void @local_workgroup_seq_cst_store(
 ; GFX6-LABEL: local_workgroup_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -1110,117 +1324,135 @@ entry:
 define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw(
 ; GFX6-LABEL: local_workgroup_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1232,32 +1464,36 @@ entry:
 define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
 ; GFX6-LABEL: local_workgroup_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1265,70 +1501,78 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: local_workgroup_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -1336,18 +1580,22 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
 ;
 ; GFX11-CU-LABEL: local_workgroup_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1355,9 +1603,11 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
 ;
 ; GFX12-CU-LABEL: local_workgroup_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1370,117 +1620,153 @@ entry:
 define amdgpu_kernel void @local_workgroup_release_atomicrmw(
 ; GFX6-LABEL: local_workgroup_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1492,32 +1778,40 @@ entry:
 define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
 ; GFX6-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1525,70 +1819,86 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -1596,18 +1906,27 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX11-CU-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1615,9 +1934,12 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX12-CU-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1630,32 +1952,40 @@ entry:
 define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
 ; GFX6-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -1663,70 +1993,86 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -1734,18 +2080,27 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX11-CU-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -1753,9 +2108,12 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX12-CU-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -1768,148 +2126,182 @@ entry:
 define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw(
 ; GFX6-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -1922,148 +2314,200 @@ entry:
 define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -2076,148 +2520,200 @@ entry:
 define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -2230,131 +2726,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2367,36 +2906,45 @@ entry:
 define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -2404,77 +2952,99 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -2482,20 +3052,28 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -2503,10 +3081,14 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2520,131 +3102,192 @@ entry:
 define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
 ; GFX6-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -2657,36 +3300,49 @@ entry:
 define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -2694,77 +3350,107 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -2772,20 +3458,33 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -2793,10 +3492,15 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2810,36 +3514,49 @@ entry:
 define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -2847,77 +3564,107 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -2925,20 +3672,33 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -2946,10 +3706,15 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -2963,36 +3728,45 @@ entry:
 define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3000,77 +3774,99 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3078,20 +3874,28 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3099,10 +3903,14 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3116,36 +3924,45 @@ entry:
 define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3153,77 +3970,99 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3231,20 +4070,28 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3252,10 +4099,14 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3269,36 +4120,49 @@ entry:
 define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
 ; GFX6-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3306,77 +4170,107 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3384,20 +4278,33 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3405,10 +4312,15 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3422,36 +4334,49 @@ entry:
 define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3459,77 +4384,107 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3537,20 +4492,33 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3558,10 +4526,15 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3575,36 +4548,49 @@ entry:
 define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3612,77 +4598,107 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3690,20 +4706,33 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3711,10 +4740,15 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3728,36 +4762,49 @@ entry:
 define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3765,77 +4812,107 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3843,20 +4920,33 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -3864,10 +4954,15 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -3881,36 +4976,49 @@ entry:
 define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -3918,77 +5026,107 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -3996,20 +5134,33 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -4017,10 +5168,15 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4034,36 +5190,49 @@ entry:
 define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4071,77 +5240,107 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4149,20 +5348,33 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -4170,10 +5382,15 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4187,36 +5404,49 @@ entry:
 define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4224,77 +5454,107 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4302,20 +5562,33 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -4323,10 +5596,15 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4340,36 +5618,49 @@ entry:
 define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
@@ -4377,77 +5668,107 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
@@ -4455,20 +5776,33 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX11-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
@@ -4476,10 +5810,15 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4493,156 +5832,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4658,59 +6053,77 @@ entry:
 define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -4718,102 +6131,140 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -4828,156 +6279,230 @@ entry:
 define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -4993,162 +6518,236 @@ entry:
 define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5163,162 +6762,236 @@ entry:
 define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5333,59 +7006,77 @@ entry:
 define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5393,102 +7084,140 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5503,59 +7232,77 @@ entry:
 define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
@@ -5563,102 +7310,140 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5673,162 +7458,236 @@ entry:
 define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -5843,162 +7702,236 @@ entry:
 define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6013,162 +7946,236 @@ entry:
 define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6183,162 +8190,236 @@ entry:
 define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6353,162 +8434,236 @@ entry:
 define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6523,162 +8678,236 @@ entry:
 define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6693,162 +8922,236 @@ entry:
 define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -6863,162 +9166,236 @@ entry:
 define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -7033,144 +9410,166 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_unordered_load(
 ; GFX6-LABEL: local_workgroup_one_as_unordered_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_unordered_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_unordered_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_unordered_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_unordered_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7182,144 +9581,166 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_monotonic_load(
 ; GFX6-LABEL: local_workgroup_one_as_monotonic_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7331,144 +9752,166 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acquire_load(
 ; GFX6-LABEL: local_workgroup_one_as_acquire_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7480,144 +9923,166 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load(
 ; GFX6-LABEL: local_workgroup_one_as_seq_cst_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    ds_read_b32 v0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-WGP-NEXT:    ds_read_b32 v0, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    ds_write_b32 v1, v0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT:    ds_read_b32 v0, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    ds_write_b32 v1, v0
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    ds_read_b32 v0, v0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    ds_load_b32 v0, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    ds_store_b32 v1, v0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    ds_load_b32 v0, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
-; GFX12-WGP-NEXT:    ds_store_b32 v1, v0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
-; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %in, ptr addrspace(3) %out) {
 entry:
@@ -7629,45 +10094,51 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_unordered_store(
 ; GFX6-LABEL: local_workgroup_one_as_unordered_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7677,25 +10148,28 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7704,7 +10178,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7713,33 +10188,41 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store(
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_unordered_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_unordered_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_unordered_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_unordered_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7751,45 +10234,51 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_monotonic_store(
 ; GFX6-LABEL: local_workgroup_one_as_monotonic_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7799,25 +10288,28 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7826,7 +10318,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7835,33 +10328,41 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store(
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7873,45 +10374,51 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_release_store(
 ; GFX6-LABEL: local_workgroup_one_as_release_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -7921,25 +10428,28 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7948,7 +10458,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -7957,33 +10468,41 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store(
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_release_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_release_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_release_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_release_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -7995,45 +10514,51 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store(
 ; GFX6-LABEL: local_workgroup_one_as_seq_cst_store:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_store:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
@@ -8043,25 +10568,28 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -8070,7 +10598,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store(
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
@@ -8079,33 +10608,41 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store(
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_store:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_store:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_store:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_store:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     i32 %in, ptr addrspace(3) %out) {
@@ -8117,117 +10654,135 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw(
 ; GFX6-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8239,117 +10794,135 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw(
 ; GFX6-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8361,117 +10934,135 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw(
 ; GFX6-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8483,117 +11074,135 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX6-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8605,117 +11214,135 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX6-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in) {
@@ -8727,142 +11354,176 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw(
 ; GFX6-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -8876,142 +11537,176 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX6-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -9025,142 +11720,176 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX6-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
 ; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x4
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -9174,131 +11903,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9311,131 +12083,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9448,131 +12263,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9585,131 +12443,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9722,131 +12623,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9859,131 +12803,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -9996,131 +12983,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10133,131 +13163,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10270,131 +13343,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10407,131 +13523,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10544,131 +13703,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10681,131 +13883,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10818,131 +14063,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -10955,131 +14243,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -11092,131 +14423,174 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(3) %out, i32 %in, i32 %old) {
@@ -11229,156 +14603,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11394,156 +14824,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11559,156 +15045,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11724,156 +15266,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -11889,156 +15487,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12054,156 +15708,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12219,156 +15929,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12384,156 +16150,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12549,156 +16371,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12714,156 +16592,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -12879,156 +16813,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13044,156 +17034,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13209,156 +17255,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13374,156 +17476,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
@@ -13539,156 +17697,212 @@ entry:
 define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    ds_write_b32 v0, v1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    ds_write_b32 v0, v1
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-CU-NEXT:    ds_write_b32 v0, v1
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
 ; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x4
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX940-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
 ; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-CU-NEXT:    s_wait_dscnt 0x0
 ; GFX12-CU-NEXT:    ds_store_b32 v0, v1
 ; GFX12-CU-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
index 1ce9fc308af3..1f835349b12b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
@@ -1,20 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s
 
 define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr addrspace(1) %out) {
 ; GFX12-LABEL: private_last_use_load_0:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_LU
+; GFX12-NEXT:    scratch_load_b32 v1, off, s2 th:TH_LOAD_LU
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
   %val = load i32, ptr addrspace(5) %in, align 4, !amdgpu.last.use !{}
@@ -25,15 +23,19 @@ entry:
 define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr addrspace(1) %out) {
 ; GFX12-LABEL: private_last_use_load_1:
 ; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX12-NEXT:    s_load_b32 s2, s[0:1], 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX12-NEXT:    v_and_b32_e64 v1, v1, s3
+; GFX12-NEXT:    s_mov_b32 s3, 2
+; GFX12-NEXT:    v_lshlrev_b32_e64 v1, s3, v1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v0, s2 th:TH_LOAD_LU
+; GFX12-NEXT:    scratch_load_b32 v1, v1, s2 th:TH_LOAD_LU
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -46,16 +48,16 @@ entry:
 define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) %in, ptr addrspace(1) %out) {
 ; GFX12-LABEL: private_last_use_and_volatile_load:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b32 v1, off, s2 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
   %val = load volatile i32, ptr addrspace(5) %in, align 4, !amdgpu.last.use !{}
@@ -66,16 +68,14 @@ entry:
 define amdgpu_kernel void @private_last_use_and_nontemporal_load(ptr addrspace(5) %in, ptr addrspace(1) %out) {
 ; GFX12-LABEL: private_last_use_and_nontemporal_load:
 ; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_LU
+; GFX12-NEXT:    scratch_load_b32 v1, off, s2 th:TH_LOAD_LU
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_endpgm
 entry:
   %val = load i32, ptr addrspace(5) %in, align 4, !amdgpu.last.use !{}, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 30296a9c3d89..4e08065e879f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -1,213 +1,217 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @private_nontemporal_load_0(
 ; GFX6-LABEL: private_nontemporal_load_0:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX6-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_add_u32 s8, s8, s7
-; GFX6-NEXT:    s_addc_u32 s9, s9, 0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
+; GFX6-NEXT:    s_add_u32 s0, s0, s13
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc slc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: private_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_add_u32 s8, s8, s7
-; GFX7-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen glc slc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen slc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen slc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: private_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen slc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen slc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: private_nontemporal_load_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[4:5]
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s4, s3
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s5, 0
-; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s9
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc slc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s8, s8, s7
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s8, s8, s7
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s4 nt
+; GFX940-NOTTGSPLIT-NEXT:    scratch_load_dword v1, off, s2 nt
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    scratch_load_dword v0, off, s4 nt
+; GFX940-TGSPLIT-NEXT:    scratch_load_dword v1, off, s2 nt
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: private_nontemporal_load_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s2 slc dlc
+; GFX11-WGP-NEXT:    scratch_load_b32 v1, off, s2 slc dlc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: private_nontemporal_load_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s2 slc dlc
+; GFX11-CU-NEXT:    scratch_load_b32 v1, off, s2 slc dlc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: private_nontemporal_load_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    scratch_load_b32 v1, off, s2 th:TH_LOAD_NT
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_nontemporal_load_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_NT
+; GFX12-CU-NEXT:    scratch_load_b32 v1, off, s2 th:TH_LOAD_NT
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(1) %out) {
 entry:
@@ -219,203 +223,254 @@ entry:
 define amdgpu_kernel void @private_nontemporal_load_1(
 ; GFX6-LABEL: private_nontemporal_load_1:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX6-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_add_u32 s8, s8, s7
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    s_addc_u32 s9, s9, 0
+; GFX6-NEXT:    s_add_u32 s0, s0, s13
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    s_mov_b32 s9, 2
+; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s9, v0
+; GFX6-NEXT:    v_add_i32_e64 v0, s[8:9], s8, v0
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc slc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: private_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_add_u32 s8, s8, s7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GFX7-NEXT:    buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
+; GFX7-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen glc slc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen slc
+; GFX10-WGP-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-WGP-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen slc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: private_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, 2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen slc
+; GFX10-CU-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-CU-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen slc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: private_nontemporal_load_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[4:5]
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s9
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s4, s3
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s5, 0
-; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s5, v0
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc slc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s8, s8, s7
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
+; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 2
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s6, v2
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s8, s8, s7
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
+; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 2
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s6, v2
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s2, 2
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s4
-; GFX940-NOTTGSPLIT-NEXT:    scratch_load_dword v0, v0, off nt
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s2, v2
+; GFX940-NOTTGSPLIT-NEXT:    scratch_load_dword v1, v1, off nt
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s2, 2
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s4
-; GFX940-TGSPLIT-NEXT:    scratch_load_dword v0, v0, off nt
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s2, v2
+; GFX940-TGSPLIT-NEXT:    scratch_load_dword v1, v1, off nt
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: private_nontemporal_load_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX11-WGP-NEXT:    scratch_load_b32 v0, v0, off slc dlc
+; GFX11-WGP-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v1, v1, off slc dlc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: private_nontemporal_load_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX11-CU-NEXT:    s_mov_b32 s2, 2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX11-CU-NEXT:    scratch_load_b32 v0, v0, off slc dlc
+; GFX11-CU-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v1, v1, off slc dlc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: private_nontemporal_load_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v1, v1, s3
+; GFX12-WGP-NEXT:    s_mov_b32 s3, 2
+; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s3, v1
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    scratch_load_b32 v0, v0, s2 th:TH_LOAD_NT
+; GFX12-WGP-NEXT:    scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_nontemporal_load_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v1, v1, s3
+; GFX12-CU-NEXT:    s_mov_b32 s3, 2
+; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s3, v1
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, s2 th:TH_LOAD_NT
+; GFX12-CU-NEXT:    scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(1) %out) {
 entry:
@@ -429,145 +484,146 @@ entry:
 define amdgpu_kernel void @private_nontemporal_store_0(
 ; GFX6-LABEL: private_nontemporal_store_0:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX6-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX6-NEXT:    s_add_u32 s8, s8, s7
-; GFX6-NEXT:    s_addc_u32 s9, s9, 0
+; GFX6-NEXT:    s_add_u32 s0, s0, s13
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s2
+; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: private_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX7-NEXT:    s_add_u32 s8, s8, s7
-; GFX7-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: private_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[4:5]
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s4, s3
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s9
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s5, 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen glc slc
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s8, s8, s7
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s8, s8, s7
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s4 sc0 nt sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0 sc0 nt sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-TGSPLIT-NEXT:    scratch_store_dword off, v0, s4 sc0 nt sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0 sc0 nt sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: private_nontemporal_store_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -579,7 +635,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX11-CU-LABEL: private_nontemporal_store_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
@@ -591,22 +647,26 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX12-WGP-LABEL: private_nontemporal_store_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s2 th:TH_STORE_NT
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_nontemporal_store_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s2 th:TH_STORE_NT
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {
 entry:
@@ -618,191 +678,223 @@ entry:
 define amdgpu_kernel void @private_nontemporal_store_1(
 ; GFX6-LABEL: private_nontemporal_store_1:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX6-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX6-NEXT:    s_add_u32 s8, s8, s7
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    s_addc_u32 s9, s9, 0
+; GFX6-NEXT:    s_add_u32 s0, s0, s13
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[4:5], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_mov_b32 s6, 2
+; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
+; GFX6-NEXT:    v_add_i32_e64 v1, s[6:7], s5, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: private_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX7-NEXT:    s_add_u32 s8, s8, s7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s6, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
+; GFX7-NEXT:    v_add_i32_e64 v1, s[6:7], s5, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b32 s5, 2
+; GFX10-WGP-NEXT:    v_lshl_add_u32 v1, v0, s5, s6
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-WGP-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: private_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b32 s5, 2
+; GFX10-CU-NEXT:    v_lshl_add_u32 v1, v0, s5, s6
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-CU-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[4:5]
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s4, s3
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s9
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s5, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v1, s[2:3], s1, v0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v0, s[4:7], 0 offen glc slc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen glc slc
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s8, s8, s7
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s5, 0x3ff
+; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s5, 2
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v0, s5, v1
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s8, s8, s7
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s5, 0x3ff
+; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s5
+; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s5, 2
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v0, s5, v1
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s4
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 2
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v0, s1, v1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-NOTTGSPLIT-NEXT:    scratch_store_dword v0, v1, off sc0 nt sc1
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT:    scratch_store_dword v1, v0, off sc0 nt sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, 2, s4
-; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX940-TGSPLIT-NEXT:    s_mov_b32 s1, 2
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v0, s1, v1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-TGSPLIT-NEXT:    scratch_store_dword v0, v1, off sc0 nt sc1
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT:    scratch_store_dword v1, v0, off sc0 nt sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: private_nontemporal_store_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
-; GFX11-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 2
+; GFX11-WGP-NEXT:    v_lshl_add_u32 v1, v0, s1, s2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-WGP-NEXT:    scratch_store_b32 v0, v1, off glc slc dlc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    scratch_store_b32 v1, v0, off glc slc dlc
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: private_nontemporal_store_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
-; GFX11-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX11-CU-NEXT:    s_mov_b32 s1, 2
+; GFX11-CU-NEXT:    v_lshl_add_u32 v1, v0, s1, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    scratch_store_b32 v0, v1, off glc slc dlc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    scratch_store_b32 v1, v0, off glc slc dlc
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: private_nontemporal_store_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 2
+; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-WGP-NEXT:    scratch_store_b32 v0, v1, s2 th:TH_STORE_NT
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 v1, v0, s0 th:TH_STORE_NT
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_nontemporal_store_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX12-CU-NEXT:    s_mov_b32 s2, 2
+; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-CU-NEXT:    scratch_store_b32 v0, v1, s2 th:TH_STORE_NT
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 v1, v0, s0 th:TH_STORE_NT
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {
 entry:
@@ -816,198 +908,206 @@ entry:
 define amdgpu_kernel void @private_nontemporal_volatile_load(
 ; GFX6-LABEL: private_nontemporal_volatile_load:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX6-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX6-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX6-NEXT:    s_add_u32 s8, s8, s7
-; GFX6-NEXT:    s_addc_u32 s9, s9, 0
-; GFX6-NEXT:    s_mov_b32 s3, 0x100f000
+; GFX6-NEXT:    s_add_u32 s0, s0, s13
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: private_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_add_u32 s8, s8, s7
-; GFX7-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    buffer_load_dword v2, v0, s[8:11], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: private_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: private_nontemporal_volatile_load:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[4:5]
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s9
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s4, s3
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s5, 0
-; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s8, s8, s7
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc
 ; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX90A-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s8, s8, s7
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s9, s9, 0
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_nop 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-TGSPLIT-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX90A-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s4 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    scratch_load_dword v1, off, s2 sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX940-TGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX940-TGSPLIT:       ; %bb.0: ; %entry
-; GFX940-TGSPLIT-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT:    s_nop 0
+; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-TGSPLIT-NEXT:    scratch_load_dword v0, off, s4 sc0 sc1
+; GFX940-TGSPLIT-NEXT:    scratch_load_dword v1, off, s2 sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-TGSPLIT-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
 ; GFX940-TGSPLIT-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: private_nontemporal_volatile_load:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s2 glc dlc
+; GFX11-WGP-NEXT:    scratch_load_b32 v1, off, s2 glc dlc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: private_nontemporal_volatile_load:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s2 glc dlc
+; GFX11-CU-NEXT:    scratch_load_b32 v1, off, s2 glc dlc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: private_nontemporal_volatile_load:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_nontemporal_volatile_load:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-CU-NEXT:    scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(1) %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index 216b1eccf3a6..a68b5f36b806 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -1,157 +1,163 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd- -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
 
 define amdgpu_kernel void @private_volatile_load_0(
 ; GFX6-LABEL: private_volatile_load_0:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GFX6-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    s_mov_b32 s7, 0xe8f000
-; GFX6-NEXT:    s_add_u32 s4, s4, s3
-; GFX6-NEXT:    s_addc_u32 s5, s5, 0
+; GFX6-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX6-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX6-NEXT:    s_mov_b32 s14, -1
+; GFX6-NEXT:    s_mov_b32 s15, 0xe8f000
+; GFX6-NEXT:    s_add_u32 s12, s12, s9
+; GFX6-NEXT:    s_addc_u32 s13, s13, 0
+; GFX6-NEXT:    s_load_dword s4, s[2:3], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc
+; GFX6-NEXT:    s_mov_b32 s7, s1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX6-NEXT:    s_mov_b32 s5, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX6-NEXT:    s_mov_b32 s1, s7
+; GFX6-NEXT:    s_mov_b32 s2, s6
+; GFX6-NEXT:    s_mov_b32 s3, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: private_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_add_u32 s8, s8, s7
-; GFX7-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    buffer_load_dword v2, v0, s[8:11], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: private_volatile_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: private_volatile_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: private_volatile_load_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[4:5]
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s9
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s4, s3
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s5, 0
-; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: private_volatile_load_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s2 glc dlc
+; GFX11-WGP-NEXT:    scratch_load_b32 v1, off, s2 glc dlc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: private_volatile_load_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s2 glc dlc
+; GFX11-CU-NEXT:    scratch_load_b32 v1, off, s2 glc dlc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: private_volatile_load_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s2 scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    scratch_load_b32 v1, off, s2 scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_volatile_load_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s2 scope:SCOPE_SYS
+; GFX12-CU-NEXT:    scratch_load_b32 v1, off, s2 scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(1) %out) {
 entry:
@@ -163,149 +169,182 @@ entry:
 define amdgpu_kernel void @private_volatile_load_1(
 ; GFX6-LABEL: private_volatile_load_1:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GFX6-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    s_mov_b32 s7, 0xe8f000
-; GFX6-NEXT:    s_add_u32 s4, s4, s3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    s_addc_u32 s5, s5, 0
+; GFX6-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX6-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX6-NEXT:    s_mov_b32 s14, -1
+; GFX6-NEXT:    s_mov_b32 s15, 0xe8f000
+; GFX6-NEXT:    s_add_u32 s12, s12, s9
+; GFX6-NEXT:    s_addc_u32 s13, s13, 0
+; GFX6-NEXT:    s_load_dword s4, s[2:3], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GFX6-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc
+; GFX6-NEXT:    s_mov_b32 s7, s1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX6-NEXT:    s_mov_b32 s5, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX6-NEXT:    s_mov_b32 s1, s7
+; GFX6-NEXT:    s_mov_b32 s2, s6
+; GFX6-NEXT:    s_mov_b32 s3, s5
+; GFX6-NEXT:    s_mov_b32 s5, 2
+; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s5, v0
+; GFX6-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: private_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-NEXT:    s_add_u32 s8, s8, s7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GFX7-NEXT:    buffer_load_dword v2, v0, s[8:11], 0 offen glc
+; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
+; GFX7-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: private_volatile_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
+; GFX10-WGP-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-WGP-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
 ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: private_volatile_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_mov_b32 s6, 2
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
+; GFX10-CU-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
+; GFX10-CU-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
 ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: private_volatile_load_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[4:5]
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s4, s3
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s5, 0
-; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s9
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s5, v0
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
 ; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: private_volatile_load_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX11-WGP-NEXT:    s_mov_b32 s2, 2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX11-WGP-NEXT:    scratch_load_b32 v0, v0, off glc dlc
+; GFX11-WGP-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v1, v1, off glc dlc
 ; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-WGP-NEXT:    s_nop 0
-; GFX11-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: private_volatile_load_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-CU-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v1, v1, s2
+; GFX11-CU-NEXT:    s_mov_b32 s2, 2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX11-CU-NEXT:    scratch_load_b32 v0, v0, off glc dlc
+; GFX11-CU-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v1, v1, off glc dlc
 ; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-CU-NEXT:    s_nop 0
-; GFX11-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: private_volatile_load_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v1, v1, s3
+; GFX12-WGP-NEXT:    s_mov_b32 s3, 2
+; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s3, v1
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    scratch_load_b32 v0, v0, s2 scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-WGP-NEXT:    s_nop 0
-; GFX12-WGP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_volatile_load_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v1, v1, s3
+; GFX12-CU-NEXT:    s_mov_b32 s3, 2
+; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s3, v1
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, s2 scope:SCOPE_SYS
+; GFX12-CU-NEXT:    scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
-; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-CU-NEXT:    s_nop 0
-; GFX12-CU-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(1) %out) {
 entry:
@@ -319,98 +358,97 @@ entry:
 define amdgpu_kernel void @private_volatile_store_0(
 ; GFX6-LABEL: private_volatile_store_0:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GFX6-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    s_mov_b32 s7, 0xe8f000
-; GFX6-NEXT:    s_add_u32 s4, s4, s3
+; GFX6-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX6-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX6-NEXT:    s_mov_b32 s14, -1
+; GFX6-NEXT:    s_mov_b32 s15, 0xe8f000
+; GFX6-NEXT:    s_add_u32 s12, s12, s9
+; GFX6-NEXT:    s_addc_u32 s13, s13, 0
+; GFX6-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
-; GFX6-NEXT:    s_addc_u32 s5, s5, 0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x0
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s1
-; GFX6-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: private_volatile_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX7-NEXT:    s_add_u32 s8, s8, s7
-; GFX7-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: private_volatile_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: private_volatile_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: private_volatile_store_0:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[4:5]
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s4, s3
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s9
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s5, 0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: private_volatile_store_0:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
+; GFX11-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -423,7 +461,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX11-CU-LABEL: private_volatile_store_0:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
+; GFX11-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
@@ -436,23 +474,37 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX12-WGP-LABEL: private_volatile_store_0:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s2 scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_volatile_store_0:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s2 scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {
@@ -465,145 +517,173 @@ entry:
 define amdgpu_kernel void @private_volatile_store_1(
 ; GFX6-LABEL: private_volatile_store_1:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GFX6-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    s_mov_b32 s7, 0xe8f000
-; GFX6-NEXT:    s_add_u32 s4, s4, s3
+; GFX6-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX6-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX6-NEXT:    s_mov_b32 s14, -1
+; GFX6-NEXT:    s_mov_b32 s15, 0xe8f000
+; GFX6-NEXT:    s_add_u32 s12, s12, s9
+; GFX6-NEXT:    s_addc_u32 s13, s13, 0
+; GFX6-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    s_addc_u32 s5, s5, 0
+; GFX6-NEXT:    s_load_dword s1, s[0:1], 0xb
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x0
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX6-NEXT:    s_mov_b32 s2, 2
+; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
+; GFX6-NEXT:    v_add_i32_e64 v1, s[2:3], s1, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    buffer_store_dword v1, v0, s[4:7], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: private_volatile_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
-; GFX7-NEXT:    s_add_u32 s8, s8, s7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT:    s_addc_u32 s9, s9, 0
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[4:5], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_mov_b32 s6, 2
+; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
+; GFX7-NEXT:    v_add_i32_e64 v1, s[6:7], s5, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-WGP-LABEL: private_volatile_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-WGP-NEXT:    s_clause 0x1
-; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_nop 0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_mov_b32 s5, 2
+; GFX10-WGP-NEXT:    v_lshl_add_u32 v1, v0, s5, s6
 ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-WGP-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-WGP-NEXT:    s_endpgm
 ;
 ; GFX10-CU-LABEL: private_volatile_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
-; GFX10-CU-NEXT:    s_clause 0x1
-; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
-; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_nop 0
+; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-CU-NEXT:    s_mov_b32 s5, 2
+; GFX10-CU-NEXT:    v_lshl_add_u32 v1, v0, s5, s6
 ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-CU-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-CU-NEXT:    s_endpgm
 ;
 ; SKIP-CACHE-INV-LABEL: private_volatile_store_1:
 ; SKIP-CACHE-INV:       ; %bb.0: ; %entry
-; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[4:5]
-; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
-; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s4, s3
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s9
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x2
-; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s5, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[0:1], 0x2
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
-; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 2
+; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
+; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v1, s[2:3], s1, v0
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
-; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v0, s[4:7], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
 ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
 ; SKIP-CACHE-INV-NEXT:    s_endpgm
 ;
 ; GFX11-WGP-LABEL: private_volatile_store_1:
 ; GFX11-WGP:       ; %bb.0: ; %entry
-; GFX11-WGP-NEXT:    s_clause 0x1
-; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
-; GFX11-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 2
+; GFX11-WGP-NEXT:    v_lshl_add_u32 v1, v0, s1, s2
 ; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-WGP-NEXT:    scratch_store_b32 v0, v1, off dlc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    scratch_store_b32 v1, v0, off dlc
 ; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-WGP-NEXT:    s_endpgm
 ;
 ; GFX11-CU-LABEL: private_volatile_store_1:
 ; GFX11-CU:       ; %bb.0: ; %entry
-; GFX11-CU-NEXT:    s_clause 0x1
-; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
-; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[2:3], 0x8
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
-; GFX11-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT:    s_mov_b32 s1, 0x3ff
+; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s1
+; GFX11-CU-NEXT:    s_mov_b32 s1, 2
+; GFX11-CU-NEXT:    v_lshl_add_u32 v1, v0, s1, s2
 ; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-CU-NEXT:    scratch_store_b32 v0, v1, off dlc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    scratch_store_b32 v1, v0, off dlc
 ; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-CU-NEXT:    s_endpgm
 ;
 ; GFX12-WGP-LABEL: private_volatile_store_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x8
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX12-WGP-NEXT:    s_mov_b32 s2, 2
+; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-WGP-NEXT:    scratch_store_b32 v0, v1, s2 scope:SCOPE_SYS
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 v1, v0, s0 scope:SCOPE_SYS
 ; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_volatile_store_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-CU-NEXT:    s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT:    v_and_b32_e64 v0, v0, s2
+; GFX12-CU-NEXT:    s_mov_b32 s2, 2
+; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
-; GFX12-CU-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-CU-NEXT:    scratch_store_b32 v0, v1, s2 scope:SCOPE_SYS
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 v1, v0, s0 scope:SCOPE_SYS
 ; GFX12-CU-NEXT:    s_wait_storecnt 0x0
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/mode-register-fpconstrain.ll b/llvm/test/CodeGen/AMDGPU/mode-register-fpconstrain.ll
new file mode 100644
index 000000000000..2403aeaa4428
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mode-register-fpconstrain.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 %s -o - | FileCheck -check-prefix=GCN %s
+
+; The si-mode-register pass is changing the default mode for FP constrained  operations.
+; It must ignore for strictfp functions.
+
+define double @ignoreStrictfp(double noundef %a, double noundef %b) #0 {
+; GCN-LABEL: ignoreStrictfp:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 1
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 0
+; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  tail call void @llvm.amdgcn.s.setreg(i32 2177, i32 1)
+  %val = tail call double @llvm.experimental.constrained.fadd.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+define double @set_fpenv(double noundef %a, double noundef %b) #0 {
+; GCN-LABEL: set_fpenv:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 23), 4
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 0, 5), 0
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 0
+; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  call void @llvm.set.fpenv.i64(i64 4)
+  %val = tail call double @llvm.experimental.constrained.fadd.f64(double %a, double %b, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
+declare void @llvm.amdgcn.s.setreg(i32 immarg, i32)
+
+declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
+
+declare void @llvm.set.fpenv.i64(i64)
+
+attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 125e6bc0f787..ba012b208c95 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -21,8 +21,8 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; MUBUF-NEXT:    s_add_u32 s0, s0, s9
 ; MUBUF-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
 ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
-; MUBUF-NEXT:    s_movk_i32 s32, 0x400
 ; MUBUF-NEXT:    s_mov_b32 s33, 0
+; MUBUF-NEXT:    s_movk_i32 s32, 0x400
 ; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
 ; MUBUF-NEXT:    s_cmp_lg_u32 s8, 0
 ; MUBUF-NEXT:    s_cbranch_scc1 .LBB0_3
@@ -57,8 +57,8 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; FLATSCR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; FLATSCR-NEXT:    s_mov_b32 s32, 16
 ; FLATSCR-NEXT:    s_mov_b32 s33, 0
+; FLATSCR-NEXT:    s_mov_b32 s32, 16
 ; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLATSCR-NEXT:    s_cmp_lg_u32 s4, 0
 ; FLATSCR-NEXT:    s_cbranch_scc1 .LBB0_3
@@ -125,8 +125,8 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; MUBUF-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
 ; MUBUF-NEXT:    s_add_u32 s0, s0, s9
 ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
-; MUBUF-NEXT:    s_movk_i32 s32, 0x1000
 ; MUBUF-NEXT:    s_mov_b32 s33, 0
+; MUBUF-NEXT:    s_movk_i32 s32, 0x1000
 ; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
 ; MUBUF-NEXT:    s_cmp_lg_u32 s6, 0
 ; MUBUF-NEXT:    s_cbranch_scc1 .LBB1_2
@@ -159,8 +159,8 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; FLATSCR-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; FLATSCR-NEXT:    s_mov_b32 s32, 64
 ; FLATSCR-NEXT:    s_mov_b32 s33, 0
+; FLATSCR-NEXT:    s_mov_b32 s32, 64
 ; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLATSCR-NEXT:    s_cmp_lg_u32 s2, 0
 ; FLATSCR-NEXT:    s_cbranch_scc1 .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index 7c351d2b8443..a50a0766f67c 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -8,9 +8,10 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s10, -1
 ; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s11, s7
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s9, s5
 ; GCN-NEXT:    s_mov_b32 s8, s4
+; GCN-NEXT:    s_mov_b32 s9, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_branch .LBB0_2
 ; GCN-NEXT:  .LBB0_1: ; %loop.exit.guard
@@ -20,7 +21,6 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-NEXT:  .LBB0_2: ; %bb1
 ; GCN-NEXT:    ; =>This Loop Header: Depth=1
 ; GCN-NEXT:    ; Child Loop BB0_4 Depth 2
-; GCN-NEXT:    s_mov_b32 s11, s7
 ; GCN-NEXT:    buffer_load_dword v1, off, s[8:11], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll b/llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll
index 6d043e2b6b0a..591deda611b2 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll
@@ -4,12 +4,12 @@
 ; full tessellation-and-geometry pipeline, compiled on gfx8 so it uses all six
 ; hardware shader types.
 
-; CHECK-DAG: 0x2c0c (SPI_SHADER_USER_DATA_PS_0): 0x10000000
-; CHECK-DAG: 0x2c4c (SPI_SHADER_USER_DATA_VS_0): 0x10000000
-; CHECK-DAG: 0x2c8c (SPI_SHADER_USER_DATA_GS_0): 0x10000000
-; CHECK-DAG: 0x2ccc (SPI_SHADER_USER_DATA_ES_0): 0x10000000
-; CHECK-DAG: 0x2d0c (SPI_SHADER_USER_DATA_HS_0): 0x10000000
-; CHECK-DAG: 0x2d4c (SPI_SHADER_USER_DATA_LS_0): 0x10000000
+; CHECK-DAG: '0x2c0c (SPI_SHADER_USER_DATA_PS_0)': 0x10000000
+; CHECK-DAG: '0x2c4c (SPI_SHADER_USER_DATA_VS_0)': 0x10000000
+; CHECK-DAG: '0x2c8c (SPI_SHADER_USER_DATA_GS_0)': 0x10000000
+; CHECK-DAG: '0x2ccc (SPI_SHADER_USER_DATA_ES_0)': 0x10000000
+; CHECK-DAG: '0x2d0c (SPI_SHADER_USER_DATA_HS_0)': 0x10000000
+; CHECK-DAG: '0x2d4c (SPI_SHADER_USER_DATA_LS_0)': 0x10000000
 
 !amdgpu.pal.metadata.msgpack = !{!0}
 
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 6ba66ccf7186..b068d87c4d6f 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -242,130 +242,137 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0:       ; %bb.0: ; %_udiv-special-cases
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v2
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v2
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v9
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v1
+; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v3
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v15
-; GFX9-O0-NEXT:    v_ashrrev_i64 v[12:13], s4, v[6:7]
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT:    v_ashrrev_i64 v[11:12], s4, v[10:11]
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v9
-; GFX9-O0-NEXT:    v_ashrrev_i64 v[6:7], s4, v[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT:    v_xor_b32_e64 v1, v1, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v14
-; GFX9-O0-NEXT:    v_xor_b32_e64 v13, v11, v12
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v6
+; GFX9-O0-NEXT:    v_ashrrev_i64 v[15:16], s4, v[13:14]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v12
+; GFX9-O0-NEXT:    v_xor_b32_e64 v3, v3, v10
+; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v11
+; GFX9-O0-NEXT:    v_xor_b32_e64 v13, v8, v12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT:    v_xor_b32_e64 v1, v1, v10
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_xor_b32_e64 v15, v4, v12
-; GFX9-O0-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v7
-; GFX9-O0-NEXT:    v_xor_b32_e64 v1, v1, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT:    v_xor_b32_e64 v7, v5, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT:    v_xor_b32_e64 v3, v3, v10
+; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT:    v_xor_b32_e64 v1, v1, v12
+; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v16
+; GFX9-O0-NEXT:    v_xor_b32_e64 v9, v8, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v15
+; GFX9-O0-NEXT:    v_xor_b32_e64 v7, v7, v6
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    v_xor_b32_e64 v1, v1, v4
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_xor_b32_e64 v2, v2, v6
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
+; GFX9-O0-NEXT:    v_xor_b32_e64 v9, v9, v3
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, v6
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v1
+; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v14
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v9, vcc, v9, v12
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v10, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v13, vcc, v11, v12, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v10, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v6
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v3, vcc
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v5, v6, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v3, vcc, v2, v3, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v2
+; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v14
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v12
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -438,7 +445,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v10, s[12:13]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
@@ -474,7 +482,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[8:9]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
@@ -589,27 +598,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v0, 5
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:  ; %bb.2: ; %Flow
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_5
 ; GFX9-O0-NEXT:  .LBB0_3: ; %Flow2
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
@@ -624,22 +633,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_9
 ; GFX9-O0-NEXT:  .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 1
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s4, v[0:1]
@@ -679,27 +688,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_readlane_b32 s4, v8, 6
 ; GFX9-O0-NEXT:    v_readlane_b32 s5, v8, 7
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_4
 ; GFX9-O0-NEXT:  .LBB0_6: ; %udiv-do-while
 ; GFX9-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -709,30 +718,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_readlane_b32 s6, v16, 8
 ; GFX9-O0-NEXT:    v_readlane_b32 s7, v16, 9
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[29:30], s4, v[2:3]
@@ -872,24 +881,24 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v2
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v0
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v15
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v14
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v13
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v12
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; GFX9-O0-NEXT:    v_writelane_b32 v16, s6, 4
 ; GFX9-O0-NEXT:    v_writelane_b32 v16, s7, 5
@@ -899,42 +908,42 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:    s_cbranch_execnz .LBB0_6
 ; GFX9-O0-NEXT:    s_branch .LBB0_1
 ; GFX9-O0-NEXT:  .LBB0_7: ; %udiv-preheader
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
@@ -1018,12 +1027,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v17
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s8
@@ -1034,30 +1043,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_6
 ; GFX9-O0-NEXT:  .LBB0_8: ; %udiv-bb1
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
@@ -1099,14 +1108,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0x7f
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v3, s4, v4
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[5:6], v3, v[11:12]
@@ -1152,12 +1161,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v3
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v10
 ; GFX9-O0-NEXT:    v_or_b32_e64 v3, v3, v4
@@ -1172,18 +1181,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
@@ -1203,18 +1212,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 32
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[5:6]
@@ -1486,11 +1495,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX9-O0-NEXT:    ; kill: killed $vgpr4
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
new file mode 100644
index 000000000000..8eb4be266dd3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
@@ -0,0 +1,24 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX11 %s
+
+---
+name:            sample_load_msaa
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+    ; GFX11-LABEL: name: sample_load_msaa
+    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: S_WAITCNT 0
+    ; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V1_gfx11 killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+    ; GFX11-NEXT: S_WAITCNT 1015
+    ; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+    ; GFX11-NEXT: S_WAITCNT 1015
+    ; GFX11-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
+    renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V1_gfx11 killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+    renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+    SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
index e73235857728..29520cb7468c 100644
--- a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
@@ -21,10 +21,10 @@
 ; VI-NEXT:          .vgpr_count:     0x5
 ; GFX9-NEXT:        .vgpr_count:     0x5
 ; GCN-NEXT:     .registers:
-; SI-NEXT:        0x2e12 (COMPUTE_PGM_RSRC1): 0x{{[0-9a-f]*}}81
-; VI-NEXT:        0x2e12 (COMPUTE_PGM_RSRC1): 0x{{[0-9a-f]*}}c1
-; GFX9-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0x{{[0-9a-f]*}}81
-; GCN-NEXT:       0x2e13 (COMPUTE_PGM_RSRC2): 0
+; SI-NEXT:        '0x2e12 (COMPUTE_PGM_RSRC1)': 0x{{[0-9a-f]*}}81
+; VI-NEXT:        '0x2e12 (COMPUTE_PGM_RSRC1)': 0x{{[0-9a-f]*}}c1
+; GFX9-NEXT:      '0x2e12 (COMPUTE_PGM_RSRC1)': 0x{{[0-9a-f]*}}81
+; GCN-NEXT:       '0x2e13 (COMPUTE_PGM_RSRC2)': 0
 ; GCN-NEXT: ...
 ; GCN-NEXT:         .end_amdgpu_pal_metadata
 
diff --git a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
index be741f536ac7..528bfe041173 100644
--- a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
+++ b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
@@ -46,12 +46,10 @@ define float @fminnum32_intrinsic(float %x, float %y) {
 define float @fminnum32_nsz_intrinsic(float %x, float %y) {
 ; ARMV7-LABEL: fminnum32_nsz_intrinsic:
 ; ARMV7:       @ %bb.0:
-; ARMV7-NEXT:    vmov s0, r0
-; ARMV7-NEXT:    vmov s2, r1
-; ARMV7-NEXT:    vcmp.f32 s0, s2
-; ARMV7-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT:    vmovlt.f32 s2, s0
-; ARMV7-NEXT:    vmov r0, s2
+; ARMV7-NEXT:    vmov s0, r1
+; ARMV7-NEXT:    vmov s2, r0
+; ARMV7-NEXT:    vmin.f32 d0, d1, d0
+; ARMV7-NEXT:    vmov r0, s0
 ; ARMV7-NEXT:    bx lr
 ;
 ; ARMV8-LABEL: fminnum32_nsz_intrinsic:
@@ -78,9 +76,7 @@ define float @fminnum32_non_zero_intrinsic(float %x) {
 ; ARMV7:       @ %bb.0:
 ; ARMV7-NEXT:    vmov.f32 s0, #-1.000000e+00
 ; ARMV7-NEXT:    vmov s2, r0
-; ARMV7-NEXT:    vcmp.f32 s2, s0
-; ARMV7-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT:    vmovlt.f32 s0, s2
+; ARMV7-NEXT:    vmin.f32 d0, d1, d0
 ; ARMV7-NEXT:    vmov r0, s0
 ; ARMV7-NEXT:    bx lr
 ;
@@ -136,12 +132,10 @@ define float @fmaxnum32_intrinsic(float %x, float %y) {
 define float @fmaxnum32_nsz_intrinsic(float %x, float %y) {
 ; ARMV7-LABEL: fmaxnum32_nsz_intrinsic:
 ; ARMV7:       @ %bb.0:
-; ARMV7-NEXT:    vmov s0, r0
-; ARMV7-NEXT:    vmov s2, r1
-; ARMV7-NEXT:    vcmp.f32 s0, s2
-; ARMV7-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT:    vmovgt.f32 s2, s0
-; ARMV7-NEXT:    vmov r0, s2
+; ARMV7-NEXT:    vmov s0, r1
+; ARMV7-NEXT:    vmov s2, r0
+; ARMV7-NEXT:    vmax.f32 d0, d1, d0
+; ARMV7-NEXT:    vmov r0, s0
 ; ARMV7-NEXT:    bx lr
 ;
 ; ARMV8-LABEL: fmaxnum32_nsz_intrinsic:
@@ -210,9 +204,7 @@ define float @fmaxnum32_non_zero_intrinsic(float %x) {
 ; ARMV7:       @ %bb.0:
 ; ARMV7-NEXT:    vmov.f32 s0, #1.000000e+00
 ; ARMV7-NEXT:    vmov s2, r0
-; ARMV7-NEXT:    vcmp.f32 s2, s0
-; ARMV7-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT:    vmovgt.f32 s0, s2
+; ARMV7-NEXT:    vmax.f32 d0, d1, d0
 ; ARMV7-NEXT:    vmov r0, s0
 ; ARMV7-NEXT:    bx lr
 ;
diff --git a/llvm/test/CodeGen/ARM/pacbti-indirect-tail-calls-function-flags.ll b/llvm/test/CodeGen/ARM/pacbti-indirect-tail-calls-function-flags.ll
new file mode 100644
index 000000000000..7362b63a0ad6
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/pacbti-indirect-tail-calls-function-flags.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+pacbti< %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-m.main-unknown"
+
+; When PACBTI is enabled, indirect tail-calls must not use R12 that is used
+; to store authentication code.
+
+define void @pacbti_disabled(ptr %p) "sign-return-address"="none" {
+entry:
+  tail call void %p()
+; CHECK: bx {{r0|r1|r2|r3|r12}}
+  ret void
+}
+
+define void @pacbti_enabled(ptr %p) "sign-return-address"="all" {
+entry:
+  tail call void %p()
+; CHECK: bx {{r0|r1|r2|r3}}
+  ret void
+}
+
+define void @pacbti_disabled_force_r12(ptr %p) "sign-return-address"="none" {
+entry:
+  %p_r12 = tail call ptr asm "", "={r12},{r12},~{lr}"(ptr %p)
+  tail call void %p_r12()
+; CHECK: bx r12
+  ret void
+}
+
+define void @pacbti_enabled_force_r12(ptr %p) "sign-return-address"="all" {
+entry:
+  %p_r12 = tail call ptr asm "", "={r12},{r12},~{lr}"(ptr %p)
+  tail call void %p_r12()
+; CHECK: bx {{r0|r1|r2|r3}}
+  ret void
+}
diff --git a/llvm/test/CodeGen/ARM/pacbti-indirect-tail-calls-module-flags1.ll b/llvm/test/CodeGen/ARM/pacbti-indirect-tail-calls-module-flags1.ll
new file mode 100644
index 000000000000..59499a240fb3
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/pacbti-indirect-tail-calls-module-flags1.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+pacbti< %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-m.main-unknown"
+
+define dso_local void @sgign_return_address(ptr noundef readonly %fptr_arg) local_unnamed_addr #0 {
+entry:
+  %0 = tail call ptr asm "", "={r12},{r12},~{lr}"(ptr %fptr_arg)
+  tail call void %0()
+; CHECK: bx {{r0|r1|r2|r3}}
+  ret void
+}
+
+!llvm.module.flags = !{!1}
+
+!1 = !{i32 8, !"sign-return-address", i32 1}
+
diff --git a/llvm/test/CodeGen/ARM/pacbti-indirect-tail-calls-module-flags2.ll b/llvm/test/CodeGen/ARM/pacbti-indirect-tail-calls-module-flags2.ll
new file mode 100644
index 000000000000..b2ae55c43c33
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/pacbti-indirect-tail-calls-module-flags2.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+pacbti< %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-m.main-unknown"
+
+define dso_local void @sgign_return_address_all(ptr noundef readonly %fptr_arg) local_unnamed_addr #0 {
+entry:
+  %0 = tail call ptr asm "", "={r12},{r12},~{lr}"(ptr %fptr_arg)
+  tail call void %0()
+; CHECK: bx {{r0|r1|r2|r3}}
+  ret void
+}
+
+!llvm.module.flags = !{!1}
+
+!1 = !{i32 8, !"sign-return-address", i32 1}
+!2 = !{i32 8, !"sign-return-address-all", i32 1}
+
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
index 8ca8a6602737..024ed04f6e5e 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
@@ -108,8 +108,8 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
 ; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   20
 ; CHECK-NEXT:        .long   20
-; CHECK-NEXT:        .long   124
-; CHECK-NEXT:        .long   144
+; CHECK-NEXT:        .long   108
+; CHECK-NEXT:        .long   128
 ; CHECK-NEXT:        .long   28
 ; CHECK-NEXT:        .long   8                       # FuncInfo
 
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/EmptySignature.ll b/llvm/test/CodeGen/DirectX/ContainerData/EmptySignature.ll
new file mode 100644
index 000000000000..796f7942a80b
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/EmptySignature.ll
@@ -0,0 +1,26 @@
+; RUN: opt %s -dxil-embed -dxil-globals -S -o - | FileCheck %s
+; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK: @dx.isg1 = private constant [8 x i8] c"\00\00\00\00\08\00\00\00", section "ISG1", align 4
+; CHECK: @dx.osg1 = private constant [8 x i8] c"\00\00\00\00\08\00\00\00", section "OSG1", align 4
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+!dx.valver = !{!0}
+
+!0 = !{i32 1, i32 7}
+
+; DXC: - Name:            ISG1
+; DXC-NEXT:   Size:            8
+; DXC-NEXT:   Signature:
+; DXC-NEXT:     Parameters:      []
+; DXC: - Name:            OSG1
+; DXC-NEXT:   Size:            8
+; DXC-NEXT:   Signature:
+; DXC-NEXT:     Parameters:      []
diff --git a/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll b/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
index 867ce3b930f8..69ba26622726 100644
--- a/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
+++ b/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
@@ -69,6 +69,7 @@ b18:                                              ; preds = %b16, %b7
   br label %b22
 
 b21:                                              ; preds = %b22
+  store volatile <64 x i32> %v20, ptr null
   tail call void @sammy() #3
   br label %b7
 
diff --git a/llvm/test/CodeGen/Hexagon/hexagon-copy-hoisting.mir b/llvm/test/CodeGen/Hexagon/hexagon-copy-hoisting.mir
new file mode 100644
index 000000000000..6f2d562cbe09
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/hexagon-copy-hoisting.mir
@@ -0,0 +1,49 @@
+# RUN: llc -march=hexagon -run-pass hexagon-move-phicopy -o - %s | FileCheck %s
+
+# CHECK-COUNT-1: %4:intregs = COPY %1
+
+# CHECK: bb.1
+# CHECK-NOT: %4:intregs = COPY %1
+
+# CHECK: bb.2
+# CHECK-NOT: %4:intregs = COPY %1
+# CHECK: %5:intregs = COPY %0
+
+---
+name:            f0
+tracksRegLiveness: false
+registers:
+  - { id: 0, class: intregs, preferred-register: '' }
+  - { id: 1, class: intregs, preferred-register: '' }
+  - { id: 2, class: predregs, preferred-register: '' }
+  - { id: 3, class: predregs, preferred-register: '' }
+  - { id: 4, class: intregs, preferred-register: '' }
+  - { id: 5, class: intregs, preferred-register: '' }
+stack:
+  - { id: 0, offset: 0, size: 4, alignment: 8 }
+body: |
+  bb.0:
+    successors: %bb.1, %bb.2
+
+    %1:intregs = COPY $r1
+    %0:intregs = COPY $r0
+    %2:predregs = C2_cmpgt %0, %1
+    %3:predregs = C2_not %2
+    J2_jumpt %3, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1:
+    successors: %bb.0
+
+    %4:intregs = COPY %1
+    $r1 = COPY %4
+    J2_jump %bb.0, implicit-def dead $pc
+
+  bb.2:
+    successors: %bb.0
+
+    %4:intregs = COPY %1
+    %5:intregs = COPY %0
+    $r1 = COPY %4
+    J2_jump %bb.0, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
index 6629d3440549..25106b456d2f 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
@@ -123,10 +123,9 @@ define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind {
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
 ; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT:    addi.d $a3, $sp, 0
-; CHECK-NEXT:    bstrins.d $a3, $a0, 4, 0
-; CHECK-NEXT:    st.b $a2, $a3, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bstrins.d $a0, $a3, 4, 0
+; CHECK-NEXT:    st.b $a2, $a0, 0
 ; CHECK-NEXT:    xvld $xr0, $sp, 0
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
 ; CHECK-NEXT:    addi.d $sp, $fp, -64
@@ -150,10 +149,9 @@ define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind {
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
 ; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT:    addi.d $a3, $sp, 0
-; CHECK-NEXT:    bstrins.d $a3, $a0, 4, 1
-; CHECK-NEXT:    st.h $a2, $a3, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bstrins.d $a0, $a3, 4, 1
+; CHECK-NEXT:    st.h $a2, $a0, 0
 ; CHECK-NEXT:    xvld $xr0, $sp, 0
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
 ; CHECK-NEXT:    addi.d $sp, $fp, -64
@@ -177,10 +175,9 @@ define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind {
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
 ; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT:    addi.d $a3, $sp, 0
-; CHECK-NEXT:    bstrins.d $a3, $a0, 4, 2
-; CHECK-NEXT:    st.w $a2, $a3, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bstrins.d $a0, $a3, 4, 2
+; CHECK-NEXT:    st.w $a2, $a0, 0
 ; CHECK-NEXT:    xvld $xr0, $sp, 0
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
 ; CHECK-NEXT:    addi.d $sp, $fp, -64
@@ -204,10 +201,9 @@ define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind {
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
 ; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT:    addi.d $a3, $sp, 0
-; CHECK-NEXT:    bstrins.d $a3, $a0, 4, 3
-; CHECK-NEXT:    st.d $a2, $a3, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bstrins.d $a0, $a3, 4, 3
+; CHECK-NEXT:    st.d $a2, $a0, 0
 ; CHECK-NEXT:    xvld $xr0, $sp, 0
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
 ; CHECK-NEXT:    addi.d $sp, $fp, -64
@@ -231,10 +227,9 @@ define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwin
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr1, $a0, 0
 ; CHECK-NEXT:    xvst $xr1, $sp, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a2, 31, 0
-; CHECK-NEXT:    addi.d $a2, $sp, 0
-; CHECK-NEXT:    bstrins.d $a2, $a0, 4, 2
-; CHECK-NEXT:    fst.s $fa0, $a2, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 2
+; CHECK-NEXT:    fst.s $fa0, $a0, 0
 ; CHECK-NEXT:    xvld $xr0, $sp, 0
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
 ; CHECK-NEXT:    addi.d $sp, $fp, -64
@@ -258,10 +253,9 @@ define void @insert_4xdouble_idx(ptr %src, ptr %dst, double %in, i32 %idx) nounw
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr1, $a0, 0
 ; CHECK-NEXT:    xvst $xr1, $sp, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a2, 31, 0
-; CHECK-NEXT:    addi.d $a2, $sp, 0
-; CHECK-NEXT:    bstrins.d $a2, $a0, 4, 3
-; CHECK-NEXT:    fst.d $fa0, $a2, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 3
+; CHECK-NEXT:    fst.d $fa0, $a0, 0
 ; CHECK-NEXT:    xvld $xr0, $sp, 0
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
 ; CHECK-NEXT:    addi.d $sp, $fp, -64
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
index 19171b7d8ed7..7f232073ae12 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
@@ -87,10 +87,9 @@ define void @insert_16xi8_idx(ptr %src, ptr %dst, i8 %ins, i32 %idx) nounwind {
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    vld $vr0, $a0, 0
 ; CHECK-NEXT:    vst $vr0, $sp, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT:    addi.d $a3, $sp, 0
-; CHECK-NEXT:    bstrins.d $a3, $a0, 3, 0
-; CHECK-NEXT:    st.b $a2, $a3, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bstrins.d $a0, $a3, 3, 0
+; CHECK-NEXT:    st.b $a2, $a0, 0
 ; CHECK-NEXT:    vld $vr0, $sp, 0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -107,10 +106,9 @@ define void @insert_8xi16_idx(ptr %src, ptr %dst, i16 %ins, i32 %idx) nounwind {
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    vld $vr0, $a0, 0
 ; CHECK-NEXT:    vst $vr0, $sp, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT:    addi.d $a3, $sp, 0
-; CHECK-NEXT:    bstrins.d $a3, $a0, 3, 1
-; CHECK-NEXT:    st.h $a2, $a3, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bstrins.d $a0, $a3, 3, 1
+; CHECK-NEXT:    st.h $a2, $a0, 0
 ; CHECK-NEXT:    vld $vr0, $sp, 0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -127,10 +125,9 @@ define void @insert_4xi32_idx(ptr %src, ptr %dst, i32 %ins, i32 %idx) nounwind {
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    vld $vr0, $a0, 0
 ; CHECK-NEXT:    vst $vr0, $sp, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT:    addi.d $a3, $sp, 0
-; CHECK-NEXT:    bstrins.d $a3, $a0, 3, 2
-; CHECK-NEXT:    st.w $a2, $a3, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bstrins.d $a0, $a3, 3, 2
+; CHECK-NEXT:    st.w $a2, $a0, 0
 ; CHECK-NEXT:    vld $vr0, $sp, 0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -147,10 +144,9 @@ define void @insert_2xi64_idx(ptr %src, ptr %dst, i64 %ins, i32 %idx) nounwind {
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    vld $vr0, $a0, 0
 ; CHECK-NEXT:    vst $vr0, $sp, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT:    addi.d $a3, $sp, 0
-; CHECK-NEXT:    bstrins.d $a3, $a0, 3, 3
-; CHECK-NEXT:    st.d $a2, $a3, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bstrins.d $a0, $a3, 3, 3
+; CHECK-NEXT:    st.d $a2, $a0, 0
 ; CHECK-NEXT:    vld $vr0, $sp, 0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -167,10 +163,9 @@ define void @insert_4xfloat_idx(ptr %src, ptr %dst, float %ins, i32 %idx) nounwi
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    vld $vr1, $a0, 0
 ; CHECK-NEXT:    vst $vr1, $sp, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a2, 31, 0
-; CHECK-NEXT:    addi.d $a2, $sp, 0
-; CHECK-NEXT:    bstrins.d $a2, $a0, 3, 2
-; CHECK-NEXT:    fst.s $fa0, $a2, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bstrins.d $a0, $a2, 3, 2
+; CHECK-NEXT:    fst.s $fa0, $a0, 0
 ; CHECK-NEXT:    vld $vr0, $sp, 0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -187,10 +182,9 @@ define void @insert_2xdouble_idx(ptr %src, ptr %dst, double %ins, i32 %idx) noun
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    vld $vr1, $a0, 0
 ; CHECK-NEXT:    vst $vr1, $sp, 0
-; CHECK-NEXT:    bstrpick.d $a0, $a2, 31, 0
-; CHECK-NEXT:    addi.d $a2, $sp, 0
-; CHECK-NEXT:    bstrins.d $a2, $a0, 3, 3
-; CHECK-NEXT:    fst.d $fa0, $a2, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bstrins.d $a0, $a2, 3, 3
+; CHECK-NEXT:    fst.d $fa0, $a0, 0
 ; CHECK-NEXT:    vld $vr0, $sp, 0
 ; CHECK-NEXT:    vst $vr0, $a1, 0
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
diff --git a/llvm/test/CodeGen/LoongArch/sextw-removal.ll b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
new file mode 100644
index 000000000000..6db9c1608b3c
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
@@ -0,0 +1,921 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mtriple=loongarch64 | FileCheck %s --check-prefixes=CHECK
+
+define void @test1(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    addi.d $sp, $sp, -32
+; CHECK-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    move $fp, $a1
+; CHECK-NEXT:    sra.w $s0, $a0, $a1
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB0_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addi.w $a0, $s0, 0
+; CHECK-NEXT:    bl %plt(bar)
+; CHECK-NEXT:    sll.w $s0, $s0, $fp
+; CHECK-NEXT:    bnez $a0, .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 32
+; CHECK-NEXT:    ret
+bb:
+  %i = ashr i32 %arg, %arg1
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+  %i4 = tail call signext i32 @bar(i32 signext %i3)
+  %i5 = shl i32 %i3, %arg1
+  %i6 = icmp eq i32 %i4, 0
+  br i1 %i6, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  ret void
+}
+
+declare signext i32 @bar(i32 signext)
+
+define signext i32 @test2(ptr %p, i32 signext %b) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.w $a0, $a0, 0
+; CHECK-NEXT:    ori $a2, $zero, 1
+; CHECK-NEXT:    sll.w $a1, $a2, $a1
+; CHECK-NEXT:    andn $a0, $a0, $a1
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    ret
+  %a = load i32, ptr %p
+  %shl = shl i32 1, %b
+  %neg = xor i32 %shl, -1
+  %and1 = and i32 %neg, %a
+  ret i32 %and1
+}
+
+define signext i32 @test3(ptr %p, i32 signext %b) nounwind {
+; CHECK-LABEL: test3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.w $a0, $a0, 0
+; CHECK-NEXT:    ori $a2, $zero, 1
+; CHECK-NEXT:    sll.w $a1, $a2, $a1
+; CHECK-NEXT:    orn $a0, $a0, $a1
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    ret
+  %a = load i32, ptr %p
+  %shl = shl i32 1, %b
+  %neg = xor i32 %shl, -1
+  %and1 = or i32 %neg, %a
+  ret i32 %and1
+}
+
+define signext i32 @test4(ptr %p, i32 signext %b) nounwind {
+; CHECK-LABEL: test4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.w $a0, $a0, 0
+; CHECK-NEXT:    ori $a2, $zero, 1
+; CHECK-NEXT:    sll.w $a1, $a2, $a1
+; CHECK-NEXT:    xor $a0, $a1, $a0
+; CHECK-NEXT:    nor $a0, $a0, $zero
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    ret
+  %a = load i32, ptr %p
+  %shl = shl i32 1, %b
+  %neg = xor i32 %shl, -1
+  %and1 = xor i32 %neg, %a
+  ret i32 %and1
+}
+
+define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test5:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    addi.d $sp, $sp, -48
+; CHECK-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s1, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s2, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    sra.w $a1, $a0, $a1
+; CHECK-NEXT:    lu12i.w $a0, 349525
+; CHECK-NEXT:    ori $fp, $a0, 1365
+; CHECK-NEXT:    lu12i.w $a0, 209715
+; CHECK-NEXT:    ori $s0, $a0, 819
+; CHECK-NEXT:    lu12i.w $a0, 61680
+; CHECK-NEXT:    ori $s1, $a0, 3855
+; CHECK-NEXT:    lu12i.w $a0, 4112
+; CHECK-NEXT:    ori $s2, $a0, 257
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB4_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addi.w $a0, $a1, 0
+; CHECK-NEXT:    bl %plt(bar)
+; CHECK-NEXT:    srli.d $a1, $a0, 1
+; CHECK-NEXT:    and $a1, $a1, $fp
+; CHECK-NEXT:    sub.d $a1, $a0, $a1
+; CHECK-NEXT:    and $a2, $a1, $s0
+; CHECK-NEXT:    srli.d $a1, $a1, 2
+; CHECK-NEXT:    and $a1, $a1, $s0
+; CHECK-NEXT:    add.d $a1, $a2, $a1
+; CHECK-NEXT:    srli.d $a2, $a1, 4
+; CHECK-NEXT:    add.d $a1, $a1, $a2
+; CHECK-NEXT:    and $a1, $a1, $s1
+; CHECK-NEXT:    mul.d $a1, $a1, $s2
+; CHECK-NEXT:    bstrpick.d $a1, $a1, 31, 24
+; CHECK-NEXT:    bnez $a0, .LBB4_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    ld.d $s2, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 48
+; CHECK-NEXT:    ret
+bb:
+  %i = ashr i32 %arg, %arg1
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+  %i4 = tail call signext i32 @bar(i32 signext %i3)
+  %i5 = tail call i32 @llvm.ctpop.i32(i32 %i4)
+  %i6 = icmp eq i32 %i4, 0
+  br i1 %i6, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  ret void
+}
+
+declare i32 @llvm.ctpop.i32(i32)
+
+define void @test6(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    addi.d $sp, $sp, -32
+; CHECK-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    sra.w $fp, $a0, $a1
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB5_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addi.w $a0, $fp, 0
+; CHECK-NEXT:    bl %plt(baz)
+; CHECK-NEXT:    bstrpick.d $s0, $a0, 31, 0
+; CHECK-NEXT:    move $a0, $s0
+; CHECK-NEXT:    bl %plt(__fixsfsi)
+; CHECK-NEXT:    move $fp, $a0
+; CHECK-NEXT:    move $a0, $s0
+; CHECK-NEXT:    move $a1, $zero
+; CHECK-NEXT:    bl %plt(__nesf2)
+; CHECK-NEXT:    bnez $a0, .LBB5_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 32
+; CHECK-NEXT:    ret
+bb:
+  %i = ashr i32 %arg, %arg1
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+  %i4 = tail call float @baz(i32 signext %i3)
+  %i5 = fptosi float %i4 to i32
+  %i6 = fcmp oeq float %i4, zeroinitializer
+  br i1 %i6, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  ret void
+}
+declare float @baz(i32 signext %i3)
+
+define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test7:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    addi.d $sp, $sp, -48
+; CHECK-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s1, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s2, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    sra.w $a0, $a0, $a1
+; CHECK-NEXT:    lu12i.w $a1, 349525
+; CHECK-NEXT:    ori $a1, $a1, 1365
+; CHECK-NEXT:    lu32i.d $a1, 349525
+; CHECK-NEXT:    lu52i.d $fp, $a1, 1365
+; CHECK-NEXT:    lu12i.w $a1, 209715
+; CHECK-NEXT:    ori $a1, $a1, 819
+; CHECK-NEXT:    lu32i.d $a1, 209715
+; CHECK-NEXT:    lu52i.d $s0, $a1, 819
+; CHECK-NEXT:    lu12i.w $a1, 61680
+; CHECK-NEXT:    ori $a1, $a1, 3855
+; CHECK-NEXT:    lu32i.d $a1, -61681
+; CHECK-NEXT:    lu52i.d $s1, $a1, 240
+; CHECK-NEXT:    lu12i.w $a1, 4112
+; CHECK-NEXT:    ori $a1, $a1, 257
+; CHECK-NEXT:    lu32i.d $a1, 65793
+; CHECK-NEXT:    lu52i.d $s2, $a1, 16
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB6_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    bl %plt(foo)
+; CHECK-NEXT:    srli.d $a1, $a0, 1
+; CHECK-NEXT:    and $a1, $a1, $fp
+; CHECK-NEXT:    sub.d $a0, $a0, $a1
+; CHECK-NEXT:    and $a1, $a0, $s0
+; CHECK-NEXT:    srli.d $a0, $a0, 2
+; CHECK-NEXT:    and $a0, $a0, $s0
+; CHECK-NEXT:    add.d $a0, $a1, $a0
+; CHECK-NEXT:    srli.d $a1, $a0, 4
+; CHECK-NEXT:    add.d $a0, $a0, $a1
+; CHECK-NEXT:    and $a0, $a0, $s1
+; CHECK-NEXT:    mul.d $a0, $a0, $s2
+; CHECK-NEXT:    srli.d $a0, $a0, 56
+; CHECK-NEXT:    bnez $a0, .LBB6_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    ld.d $s2, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 48
+; CHECK-NEXT:    ret
+bb:
+  %i = ashr i32 %arg, %arg1
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %i3 = phi i32 [ %i, %bb ], [ %i6, %bb2 ]
+  %i4 = tail call signext i64 @foo(i32 signext %i3)
+  %i5 = tail call i64 @llvm.ctpop.i64(i64 %i4)
+  %i6 = trunc i64 %i5 to i32
+  %i7 = icmp eq i32 %i6, 0
+  br i1 %i7, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  ret void
+}
+
+declare i64 @llvm.ctpop.i64(i64)
+
+define void @test8(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test8:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT:    sra.w $a0, $a0, $a1
+; CHECK-NEXT:    addi.w $fp, $zero, -256
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB7_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    bl %plt(foo)
+; CHECK-NEXT:    or $a0, $a0, $fp
+; CHECK-NEXT:    bnez $a0, .LBB7_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 16
+; CHECK-NEXT:    ret
+bb:
+  %i = ashr i32 %arg, %arg1
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %i3 = phi i32 [ %i, %bb ], [ %i6, %bb2 ]
+  %i4 = tail call signext i64 @foo(i32 signext %i3)
+  %i5 = or i64 %i4, -256
+  %i6 = trunc i64 %i5 to i32
+  %i7 = icmp eq i32 %i6, 0
+  br i1 %i7, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  ret void
+}
+
+declare i64 @foo(i32 signext)
+
+define void @test9(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test9:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT:    sra.w $a1, $a0, $a1
+; CHECK-NEXT:    ori $fp, $zero, 254
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB8_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addi.w $a0, $a1, 0
+; CHECK-NEXT:    bl %plt(bar)
+; CHECK-NEXT:    slti $a1, $a0, 255
+; CHECK-NEXT:    blt $fp, $a0, .LBB8_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 16
+; CHECK-NEXT:    ret
+bb:
+  %i = ashr i32 %arg, %arg1
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %i3 = phi i32 [ %i, %bb ], [ %i7, %bb2 ]
+  %i4 = tail call signext i32 @bar(i32 signext %i3)
+  %i5 = icmp slt i32 %i4, 255
+  %i6 = sext i1 %i5 to i32
+  %i7 = sub i32 0, %i6
+  br i1 %i5, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  ret void
+}
+
+define void @test10(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test10:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT:    sra.w $fp, $a0, $a1
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB9_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addi.w $a0, $fp, 0
+; CHECK-NEXT:    bl %plt(baz)
+; CHECK-NEXT:    move $fp, $a0
+; CHECK-NEXT:    bstrpick.d $a0, $a0, 31, 0
+; CHECK-NEXT:    move $a1, $zero
+; CHECK-NEXT:    bl %plt(__nesf2)
+; CHECK-NEXT:    bnez $a0, .LBB9_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 16
+; CHECK-NEXT:    ret
+bb:
+  %i = ashr i32 %arg, %arg1
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+  %i4 = tail call float @baz(i32 signext %i3)
+  %i5 = bitcast float %i4 to i32
+  %i6 = fcmp oeq float %i4, zeroinitializer
+  br i1 %i6, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  ret void
+}
+
+define signext i32 @test11(i64 %arg1, i64 %arg2, i64 %arg3)  {
+; CHECK-LABEL: test11:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $a2, $a2, -1
+; CHECK-NEXT:    ori $a3, $zero, 256
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB10_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    andi $a0, $a0, 1234
+; CHECK-NEXT:    addi.d $a2, $a2, 1
+; CHECK-NEXT:    add.d $a0, $a0, $a1
+; CHECK-NEXT:    bltu $a2, $a3, .LBB10_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %entry
+  %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ]
+  %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ]
+  %i3 = add i64 %i2, 1
+  %i4 = and i64 %i1, 1234
+  %i5 = add i64 %i4, %arg2
+  %i6 = icmp ugt i64 %i2, 255
+  br i1 %i6, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  %i7 = trunc i64 %i5 to i32
+  ret i32 %i7
+}
+
+define signext i32 @test12(i64 %arg1, i64 %arg2, i64 %arg3)  {
+; CHECK-LABEL: test12:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $a2, $a2, -1
+; CHECK-NEXT:    ori $a3, $zero, 256
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB11_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    xor $a0, $a0, $a1
+; CHECK-NEXT:    mul.d $a4, $a0, $a1
+; CHECK-NEXT:    add.d $a0, $a0, $a4
+; CHECK-NEXT:    and $a4, $a4, $a0
+; CHECK-NEXT:    addi.d $a2, $a2, 1
+; CHECK-NEXT:    add.d $a0, $a4, $a1
+; CHECK-NEXT:    bltu $a2, $a3, .LBB11_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    addi.w $a0, $a4, 0
+; CHECK-NEXT:    ret
+entry:
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %entry
+  %i1 = phi i64 [ %arg1, %entry ], [ %i6, %bb2 ]
+  %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ]
+  %i3 = add i64 %i2, 1
+  %i4 = xor i64 %i1, %arg2
+  %i5 = mul i64 %i4, %arg2
+  %i9 = add i64 %i4, %i5
+  %i8 = and i64 %i5, %i9
+  %i6 = add i64 %i8, %arg2
+  %i7 = icmp ugt i64 %i2, 255
+  br i1 %i7, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  %r = trunc i64 %i8 to i32
+  ret i32 %r
+}
+
+define signext i32 @test13(i64 %arg1, i64 %arg2, i64 %arg3)  {
+; CHECK-LABEL: test13:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $a2, $a2, -1
+; CHECK-NEXT:    ori $a3, $zero, 256
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB12_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    div.d $a0, $a0, $a1
+; CHECK-NEXT:    addi.d $a2, $a2, 1
+; CHECK-NEXT:    add.d $a0, $a0, $a1
+; CHECK-NEXT:    bltu $a2, $a3, .LBB12_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %entry
+  %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ]
+  %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ]
+  %i3 = add i64 %i2, 1
+  %i4 = sdiv i64 %i1, %arg2
+  %i5 = add i64 %i4, %arg2
+  %i6 = icmp ugt i64 %i2, 255
+  br i1 %i6, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  %i8 = trunc i64 %i5 to i32
+  ret i32 %i8
+}
+
+
+define signext i32 @test14(i32 signext %0, i32 signext %1) {
+; CHECK-LABEL: test14:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ori $a2, $zero, 2
+; CHECK-NEXT:    blt $a1, $a2, .LBB13_4
+; CHECK-NEXT:  # %bb.1: # %.preheader
+; CHECK-NEXT:    ori $a3, $zero, 1
+; CHECK-NEXT:    addi.w $a2, $zero, -1
+; CHECK-NEXT:    lu32i.d $a2, 0
+; CHECK-NEXT:    ori $a4, $zero, 1000
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB13_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addi.w $a5, $a0, 0
+; CHECK-NEXT:    blt $a4, $a5, .LBB13_5
+; CHECK-NEXT:  # %bb.3: # in Loop: Header=BB13_2 Depth=1
+; CHECK-NEXT:    add.d $a0, $a3, $a0
+; CHECK-NEXT:    addi.w $a3, $a3, 1
+; CHECK-NEXT:    blt $a3, $a1, .LBB13_2
+; CHECK-NEXT:  .LBB13_4:
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB13_5:
+; CHECK-NEXT:    addi.w $a0, $a2, 0
+; CHECK-NEXT:    ret
+  %3 = icmp sgt i32 %1, 1
+  br i1 %3, label %4, label %12
+
+4:                                                ; preds = %2, %8
+  %5 = phi i32 [ %10, %8 ], [ 1, %2 ]
+  %6 = phi i32 [ %9, %8 ], [ %0, %2 ]
+  %7 = icmp sgt i32 %6, 1000
+  br i1 %7, label %12, label %8
+
+8:                                                ; preds = %4
+  %9 = add nsw i32 %5, %6
+  %10 = add nuw nsw i32 %5, 1
+  %11 = icmp slt i32 %10, %1
+  br i1 %11, label %4, label %12
+
+12:                                               ; preds = %8, %4, %2
+  %13 = phi i32 [ %0, %2 ], [ -1, %4 ], [ %9, %8 ]
+  ret i32 %13
+}
+
+define signext i32 @test14b(i32 %0, i32 signext %1) {
+; CHECK-LABEL: test14b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ori $a2, $zero, 2
+; CHECK-NEXT:    blt $a1, $a2, .LBB14_4
+; CHECK-NEXT:  # %bb.1: # %.preheader
+; CHECK-NEXT:    ori $a3, $zero, 1
+; CHECK-NEXT:    addi.w $a2, $zero, -1
+; CHECK-NEXT:    lu32i.d $a2, 0
+; CHECK-NEXT:    ori $a4, $zero, 1000
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB14_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addi.w $a5, $a0, 0
+; CHECK-NEXT:    blt $a4, $a5, .LBB14_5
+; CHECK-NEXT:  # %bb.3: # in Loop: Header=BB14_2 Depth=1
+; CHECK-NEXT:    add.d $a0, $a3, $a0
+; CHECK-NEXT:    addi.w $a3, $a3, 1
+; CHECK-NEXT:    blt $a3, $a1, .LBB14_2
+; CHECK-NEXT:  .LBB14_4:
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB14_5:
+; CHECK-NEXT:    addi.w $a0, $a2, 0
+; CHECK-NEXT:    ret
+  %3 = icmp sgt i32 %1, 1
+  br i1 %3, label %4, label %12
+
+4:                                                ; preds = %2, %8
+  %5 = phi i32 [ %10, %8 ], [ 1, %2 ]
+  %6 = phi i32 [ %9, %8 ], [ %0, %2 ]
+  %7 = icmp sgt i32 %6, 1000
+  br i1 %7, label %12, label %8
+
+8:                                                ; preds = %4
+  %9 = add nsw i32 %5, %6
+  %10 = add nuw nsw i32 %5, 1
+  %11 = icmp slt i32 %10, %1
+  br i1 %11, label %4, label %12
+
+12:                                               ; preds = %8, %4, %2
+  %13 = phi i32 [ %0, %2 ], [ -1, %4 ], [ %9, %8 ]
+  ret i32 %13
+}
+
+define signext i32 @test14c(i32 zeroext %0, i32 signext %1) {
+; CHECK-LABEL: test14c:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ori $a2, $zero, 2
+; CHECK-NEXT:    blt $a1, $a2, .LBB15_4
+; CHECK-NEXT:  # %bb.1: # %.preheader
+; CHECK-NEXT:    ori $a3, $zero, 1
+; CHECK-NEXT:    addi.w $a2, $zero, -1
+; CHECK-NEXT:    lu32i.d $a2, 0
+; CHECK-NEXT:    ori $a4, $zero, 1000
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB15_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addi.w $a5, $a0, 0
+; CHECK-NEXT:    blt $a4, $a5, .LBB15_5
+; CHECK-NEXT:  # %bb.3: # in Loop: Header=BB15_2 Depth=1
+; CHECK-NEXT:    add.d $a0, $a3, $a0
+; CHECK-NEXT:    addi.w $a3, $a3, 1
+; CHECK-NEXT:    blt $a3, $a1, .LBB15_2
+; CHECK-NEXT:  .LBB15_4:
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB15_5:
+; CHECK-NEXT:    addi.w $a0, $a2, 0
+; CHECK-NEXT:    ret
+  %3 = icmp sgt i32 %1, 1
+  br i1 %3, label %4, label %12
+
+4:                                                ; preds = %2, %8
+  %5 = phi i32 [ %10, %8 ], [ 1, %2 ]
+  %6 = phi i32 [ %9, %8 ], [ %0, %2 ]
+  %7 = icmp sgt i32 %6, 1000
+  br i1 %7, label %12, label %8
+
+8:                                                ; preds = %4
+  %9 = add nsw i32 %5, %6
+  %10 = add nuw nsw i32 %5, 1
+  %11 = icmp slt i32 %10, %1
+  br i1 %11, label %4, label %12
+
+12:                                               ; preds = %8, %4, %2
+  %13 = phi i32 [ %0, %2 ], [ -1, %4 ], [ %9, %8 ]
+  ret i32 %13
+}
+
+define signext i32 @test14d(i31 zeroext %0, i32 signext %1) {
+; CHECK-LABEL: test14d:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ori $a2, $zero, 2
+; CHECK-NEXT:    blt $a1, $a2, .LBB16_4
+; CHECK-NEXT:  # %bb.1: # %.preheader
+; CHECK-NEXT:    ori $a3, $zero, 1
+; CHECK-NEXT:    addi.w $a2, $zero, -1
+; CHECK-NEXT:    lu32i.d $a2, 0
+; CHECK-NEXT:    ori $a4, $zero, 1000
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB16_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addi.w $a5, $a0, 0
+; CHECK-NEXT:    blt $a4, $a5, .LBB16_5
+; CHECK-NEXT:  # %bb.3: # in Loop: Header=BB16_2 Depth=1
+; CHECK-NEXT:    add.d $a0, $a3, $a0
+; CHECK-NEXT:    addi.w $a3, $a3, 1
+; CHECK-NEXT:    blt $a3, $a1, .LBB16_2
+; CHECK-NEXT:  .LBB16_4:
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB16_5:
+; CHECK-NEXT:    addi.w $a0, $a2, 0
+; CHECK-NEXT:    ret
+  %zext = zext i31 %0 to i32
+  %3 = icmp sgt i32 %1, 1
+  br i1 %3, label %4, label %12
+
+4:                                                ; preds = %2, %8
+  %5 = phi i32 [ %10, %8 ], [ 1, %2 ]
+  %6 = phi i32 [ %9, %8 ], [ %zext, %2 ]
+  %7 = icmp sgt i32 %6, 1000
+  br i1 %7, label %12, label %8
+
+8:                                                ; preds = %4
+  %9 = add nsw i32 %5, %6
+  %10 = add nuw nsw i32 %5, 1
+  %11 = icmp slt i32 %10, %1
+  br i1 %11, label %4, label %12
+
+12:                                               ; preds = %8, %4, %2
+  %13 = phi i32 [ %zext, %2 ], [ -1, %4 ], [ %9, %8 ]
+  ret i32 %13
+}
+
+define signext i32 @test15(i64 %arg1, i64 %arg2, i64 %arg3, ptr %arg4)  {
+; CHECK-LABEL: test15:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $a2, $a2, -1
+; CHECK-NEXT:    ori $a4, $zero, 256
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB17_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    andi $a0, $a0, 1234
+; CHECK-NEXT:    add.d $a0, $a0, $a1
+; CHECK-NEXT:    addi.d $a2, $a2, 1
+; CHECK-NEXT:    st.w $a0, $a3, 0
+; CHECK-NEXT:    bltu $a2, $a4, .LBB17_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %entry
+  %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ]
+  %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ]
+  %i3 = add i64 %i2, 1
+  %i4 = and i64 %i1, 1234
+  %i5 = add i64 %i4, %arg2
+  %i8 = trunc i64 %i5 to i32
+  store i32 %i8, ptr %arg4
+  %i6 = icmp ugt i64 %i2, 255
+  br i1 %i6, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  %i7 = trunc i64 %i5 to i32
+  ret i32 %i7
+}
+
+define signext i32 @bug(i32 signext %x) {
+; CHECK-LABEL: bug:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    beqz $a0, .LBB18_2
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    bstrpick.d $a1, $a0, 31, 16
+; CHECK-NEXT:    sltui $a1, $a1, 1
+; CHECK-NEXT:    slli.d $a2, $a0, 16
+; CHECK-NEXT:    masknez $a0, $a0, $a1
+; CHECK-NEXT:    maskeqz $a2, $a2, $a1
+; CHECK-NEXT:    or $a0, $a2, $a0
+; CHECK-NEXT:    ori $a2, $zero, 32
+; CHECK-NEXT:    masknez $a2, $a2, $a1
+; CHECK-NEXT:    ori $a3, $zero, 16
+; CHECK-NEXT:    maskeqz $a1, $a3, $a1
+; CHECK-NEXT:    or $a1, $a1, $a2
+; CHECK-NEXT:    bstrpick.d $a2, $a0, 31, 24
+; CHECK-NEXT:    sltui $a2, $a2, 1
+; CHECK-NEXT:    slli.d $a3, $a0, 8
+; CHECK-NEXT:    addi.d $a4, $a1, -8
+; CHECK-NEXT:    masknez $a0, $a0, $a2
+; CHECK-NEXT:    maskeqz $a3, $a3, $a2
+; CHECK-NEXT:    or $a0, $a3, $a0
+; CHECK-NEXT:    masknez $a1, $a1, $a2
+; CHECK-NEXT:    maskeqz $a2, $a4, $a2
+; CHECK-NEXT:    or $a1, $a2, $a1
+; CHECK-NEXT:    bstrpick.d $a2, $a0, 31, 28
+; CHECK-NEXT:    sltui $a2, $a2, 1
+; CHECK-NEXT:    slli.d $a3, $a0, 4
+; CHECK-NEXT:    addi.d $a4, $a1, -4
+; CHECK-NEXT:    masknez $a0, $a0, $a2
+; CHECK-NEXT:    maskeqz $a3, $a3, $a2
+; CHECK-NEXT:    or $a0, $a3, $a0
+; CHECK-NEXT:    masknez $a1, $a1, $a2
+; CHECK-NEXT:    maskeqz $a2, $a4, $a2
+; CHECK-NEXT:    or $a1, $a2, $a1
+; CHECK-NEXT:    bstrpick.d $a2, $a0, 31, 30
+; CHECK-NEXT:    sltui $a2, $a2, 1
+; CHECK-NEXT:    slli.d $a3, $a0, 2
+; CHECK-NEXT:    addi.d $a4, $a1, -2
+; CHECK-NEXT:    masknez $a0, $a0, $a2
+; CHECK-NEXT:    maskeqz $a3, $a3, $a2
+; CHECK-NEXT:    or $a0, $a3, $a0
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    masknez $a1, $a1, $a2
+; CHECK-NEXT:    maskeqz $a2, $a4, $a2
+; CHECK-NEXT:    or $a1, $a2, $a1
+; CHECK-NEXT:    srai.d $a0, $a0, 31
+; CHECK-NEXT:    nor $a0, $a0, $zero
+; CHECK-NEXT:    add.d $a0, $a1, $a0
+; CHECK-NEXT:    addi.w $a0, $a0, 0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB18_2:
+; CHECK-NEXT:    addi.w $a0, $zero, 0
+; CHECK-NEXT:    ret
+entry:
+  %tobool.not = icmp eq i32 %x, 0
+  br i1 %tobool.not, label %cleanup, label %if.end
+
+if.end:                                           ; preds = %entry
+  %tobool1.not = icmp ult i32 %x, 65536
+  %shl = shl i32 %x, 16
+  %spec.select = select i1 %tobool1.not, i32 %shl, i32 %x
+  %spec.select43 = select i1 %tobool1.not, i32 16, i32 32
+  %tobool5.not = icmp ult i32 %spec.select, 16777216
+  %shl7 = shl i32 %spec.select, 8
+  %sub8 = add nsw i32 %spec.select43, -8
+  %x.addr.1 = select i1 %tobool5.not, i32 %shl7, i32 %spec.select
+  %r.1 = select i1 %tobool5.not, i32 %sub8, i32 %spec.select43
+  %tobool11.not = icmp ult i32 %x.addr.1, 268435456
+  %shl13 = shl i32 %x.addr.1, 4
+  %sub14 = add nsw i32 %r.1, -4
+  %x.addr.2 = select i1 %tobool11.not, i32 %shl13, i32 %x.addr.1
+  %r.2 = select i1 %tobool11.not, i32 %sub14, i32 %r.1
+  %tobool17.not = icmp ult i32 %x.addr.2, 1073741824
+  %shl19 = shl i32 %x.addr.2, 2
+  %sub20 = add nsw i32 %r.2, -2
+  %x.addr.3 = select i1 %tobool17.not, i32 %shl19, i32 %x.addr.2
+  %r.3 = select i1 %tobool17.not, i32 %sub20, i32 %r.2
+  %x.addr.3.lobit = ashr i32 %x.addr.3, 31
+  %x.addr.3.lobit.not = xor i32 %x.addr.3.lobit, -1
+  %r.4 = add nsw i32 %r.3, %x.addr.3.lobit.not
+  br label %cleanup
+
+cleanup:                                          ; preds = %entry, %if.end
+  %retval.0 = phi i32 [ %r.4, %if.end ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+define void @test16(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test16:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    addi.d $sp, $sp, -32
+; CHECK-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    move $fp, $a1
+; CHECK-NEXT:    bl %plt(bar)
+; CHECK-NEXT:    move $s0, $a0
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB19_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addi.w $a0, $s0, 0
+; CHECK-NEXT:    bl %plt(bar)
+; CHECK-NEXT:    sll.w $s0, $s0, $fp
+; CHECK-NEXT:    bnez $a0, .LBB19_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 32
+; CHECK-NEXT:    ret
+bb:
+  %i = call signext i32 @bar(i32 signext %arg)
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+  %i4 = tail call signext i32 @bar(i32 signext %i3)
+  %i5 = shl i32 %i3, %arg1
+  %i6 = icmp eq i32 %i4, 0
+  br i1 %i6, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  ret void
+}
+
+define void @test17(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test17:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    addi.d $sp, $sp, -32
+; CHECK-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    move $fp, $a1
+; CHECK-NEXT:    bl %plt(bat)
+; CHECK-NEXT:    move $s0, $a0
+; CHECK-NEXT:    .p2align 4, , 16
+; CHECK-NEXT:  .LBB20_1: # %bb2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addi.w $a0, $s0, 0
+; CHECK-NEXT:    bl %plt(bar)
+; CHECK-NEXT:    sll.w $s0, $s0, $fp
+; CHECK-NEXT:    bnez $a0, .LBB20_1
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 32
+; CHECK-NEXT:    ret
+bb:
+  %i = call zeroext i16 @bat(i32 signext %arg)
+  %zext = zext i16 %i to i32
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %i3 = phi i32 [ %zext, %bb ], [ %i5, %bb2 ]
+  %i4 = tail call signext i32 @bar(i32 signext %i3)
+  %i5 = shl i32 %i3, %arg1
+  %i6 = icmp eq i32 %i4, 0
+  br i1 %i6, label %bb7, label %bb2
+
+bb7:                                              ; preds = %bb2
+  ret void
+}
+declare zeroext i16 @bat(i32 signext)
+
+define signext i32 @sextw_sh2add(i1 zeroext %0, ptr %1, i32 signext %2, i32 signext %3, i32 signext %4) {
+; CHECK-LABEL: sextw_sh2add:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    alsl.d $a2, $a2, $a3, 2
+; CHECK-NEXT:    beqz $a0, .LBB21_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    st.w $a2, $a1, 0
+; CHECK-NEXT:  .LBB21_2:
+; CHECK-NEXT:    add.w $a0, $a2, $a4
+; CHECK-NEXT:    ret
+  %6 = shl i32 %2, 2
+  %7 = add i32 %6, %3
+  br i1 %0, label %8, label %9
+
+8:                                                ; preds = %5
+  store i32 %7, ptr %1, align 4
+  br label %9
+
+9:                                                ; preds = %5, %8
+  %10 = add i32 %7, %4
+  ret i32 %10
+}
+
+define signext i32 @test19(i64 %arg, i1 zeroext %c1, i1 zeroext %c2, ptr %p) nounwind {
+; CHECK-LABEL: test19:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT:    ori $a0, $zero, 35
+; CHECK-NEXT:    lu32i.d $a0, 1
+; CHECK-NEXT:    maskeqz $fp, $a0, $a1
+; CHECK-NEXT:    st.d $fp, $a3, 0
+; CHECK-NEXT:    beqz $a2, .LBB22_2
+; CHECK-NEXT:  # %bb.1: # %bb2
+; CHECK-NEXT:    move $a0, $zero
+; CHECK-NEXT:    bl %plt(bar)
+; CHECK-NEXT:    move $fp, $a0
+; CHECK-NEXT:  .LBB22_2: # %bb7
+; CHECK-NEXT:    bl %plt(side_effect)
+; CHECK-NEXT:    addi.w $a0, $fp, 0
+; CHECK-NEXT:    ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 16
+; CHECK-NEXT:    ret
+bb:
+  %sel = select i1 %c1, i64 4294967331, i64 0
+  store i64 %sel, ptr %p, align 8
+  br i1 %c2, label %bb2, label %bb7
+
+bb2:                                              ; preds = %bb2, %bb
+  %i4 = call signext i32 @bar(i32 0)
+  %i4.sext = sext i32 %i4 to i64
+  br label %bb7
+
+bb7:                                              ; preds = %bb2
+  %phi = phi i64 [ %sel, %bb ], [ %i4.sext, %bb2 ]
+  %trunc = trunc i64 %phi to i32
+  call void @side_effect()
+  ret i32 %trunc
+}
+
+ declare void @side_effect(i64)
diff --git a/llvm/test/CodeGen/LoongArch/tls-models.ll b/llvm/test/CodeGen/LoongArch/tls-models.ll
index 3994df1da716..6b250ec02162 100644
--- a/llvm/test/CodeGen/LoongArch/tls-models.ll
+++ b/llvm/test/CodeGen/LoongArch/tls-models.ll
@@ -5,6 +5,12 @@
 ; RUN: llc --mtriple=loongarch32 < %s | FileCheck %s --check-prefix=LA32NOPIC
 ; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s --check-prefix=LA64NOPIC
 ; RUN: llc --mtriple=loongarch64 --code-model=large < %s | FileCheck %s --check-prefix=LA64LARGENOPIC
+; RUN: llc --mtriple=loongarch32 --relocation-model=pic --enable-tlsdesc < %s \
+; RUN:     | FileCheck %s --check-prefix=LA32DESC
+; RUN: llc --mtriple=loongarch64 --relocation-model=pic --enable-tlsdesc < %s \
+; RUN:     | FileCheck %s --check-prefix=LA64DESC
+; RUN: llc --mtriple=loongarch64 --relocation-model=pic --enable-tlsdesc \
+; RUN:     --code-model=large < %s | FileCheck %s --check-prefix=DESC64
 
 ;; Check that TLS symbols are lowered correctly based on the specified
 ;; model. Make sure they're external to avoid them all being optimised to Local
@@ -82,6 +88,49 @@ define ptr @f1() nounwind {
 ; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT:    ret
+;
+; LA32DESC-LABEL: f1:
+; LA32DESC:       # %bb.0: # %entry
+; LA32DESC-NEXT:    addi.w $sp, $sp, -16
+; LA32DESC-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32DESC-NEXT:    pcalau12i $a0, %desc_pc_hi20(unspecified)
+; LA32DESC-NEXT:    addi.w $a0, $a0, %desc_pc_lo12(unspecified)
+; LA32DESC-NEXT:    ld.w $ra, $a0, %desc_ld(unspecified)
+; LA32DESC-NEXT:    jirl $ra, $ra, %desc_call(unspecified)
+; LA32DESC-NEXT:    add.w $a0, $a0, $tp
+; LA32DESC-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32DESC-NEXT:    addi.w $sp, $sp, 16
+; LA32DESC-NEXT:    ret
+;
+; LA64DESC-LABEL: f1:
+; LA64DESC:       # %bb.0: # %entry
+; LA64DESC-NEXT:    addi.d $sp, $sp, -16
+; LA64DESC-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64DESC-NEXT:    pcalau12i $a0, %desc_pc_hi20(unspecified)
+; LA64DESC-NEXT:    addi.d $a0, $a0, %desc_pc_lo12(unspecified)
+; LA64DESC-NEXT:    ld.d $ra, $a0, %desc_ld(unspecified)
+; LA64DESC-NEXT:    jirl $ra, $ra, %desc_call(unspecified)
+; LA64DESC-NEXT:    add.d $a0, $a0, $tp
+; LA64DESC-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64DESC-NEXT:    addi.d $sp, $sp, 16
+; LA64DESC-NEXT:    ret
+;
+; DESC64-LABEL: f1:
+; DESC64:       # %bb.0: # %entry
+; DESC64-NEXT:    addi.d $sp, $sp, -16
+; DESC64-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; DESC64-NEXT:    pcalau12i $a0, %desc_pc_hi20(unspecified)
+; DESC64-NEXT:    addi.d $t8, $zero, %desc_pc_lo12(unspecified)
+; DESC64-NEXT:    lu32i.d $t8, %desc64_pc_lo20(unspecified)
+; DESC64-NEXT:    lu52i.d $t8, $t8, %desc64_pc_hi12(unspecified)
+; DESC64-NEXT:    add.d $a0, $t8, $a0
+; DESC64-NEXT:    ld.d $ra, $a0, %desc_ld(unspecified)
+; DESC64-NEXT:    jirl $ra, $ra, %desc_call(unspecified)
+; DESC64-NEXT:    add.d $a1, $a0, $tp
+; DESC64-NEXT:    move $a0, $a1
+; DESC64-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; DESC64-NEXT:    addi.d $sp, $sp, 16
+; DESC64-NEXT:    ret
 entry:
   ret ptr @unspecified
 }
@@ -153,6 +202,49 @@ define ptr @f2() nounwind {
 ; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT:    ret
+;
+; LA32DESC-LABEL: f2:
+; LA32DESC:       # %bb.0: # %entry
+; LA32DESC-NEXT:    addi.w $sp, $sp, -16
+; LA32DESC-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32DESC-NEXT:    pcalau12i $a0, %desc_pc_hi20(ld)
+; LA32DESC-NEXT:    addi.w $a0, $a0, %desc_pc_lo12(ld)
+; LA32DESC-NEXT:    ld.w $ra, $a0, %desc_ld(ld)
+; LA32DESC-NEXT:    jirl $ra, $ra, %desc_call(ld)
+; LA32DESC-NEXT:    add.w $a0, $a0, $tp
+; LA32DESC-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32DESC-NEXT:    addi.w $sp, $sp, 16
+; LA32DESC-NEXT:    ret
+;
+; LA64DESC-LABEL: f2:
+; LA64DESC:       # %bb.0: # %entry
+; LA64DESC-NEXT:    addi.d $sp, $sp, -16
+; LA64DESC-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64DESC-NEXT:    pcalau12i $a0, %desc_pc_hi20(ld)
+; LA64DESC-NEXT:    addi.d $a0, $a0, %desc_pc_lo12(ld)
+; LA64DESC-NEXT:    ld.d $ra, $a0, %desc_ld(ld)
+; LA64DESC-NEXT:    jirl $ra, $ra, %desc_call(ld)
+; LA64DESC-NEXT:    add.d $a0, $a0, $tp
+; LA64DESC-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64DESC-NEXT:    addi.d $sp, $sp, 16
+; LA64DESC-NEXT:    ret
+;
+; DESC64-LABEL: f2:
+; DESC64:       # %bb.0: # %entry
+; DESC64-NEXT:    addi.d $sp, $sp, -16
+; DESC64-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; DESC64-NEXT:    pcalau12i $a0, %desc_pc_hi20(ld)
+; DESC64-NEXT:    addi.d $t8, $zero, %desc_pc_lo12(ld)
+; DESC64-NEXT:    lu32i.d $t8, %desc64_pc_lo20(ld)
+; DESC64-NEXT:    lu52i.d $t8, $t8, %desc64_pc_hi12(ld)
+; DESC64-NEXT:    add.d $a0, $t8, $a0
+; DESC64-NEXT:    ld.d $ra, $a0, %desc_ld(ld)
+; DESC64-NEXT:    jirl $ra, $ra, %desc_call(ld)
+; DESC64-NEXT:    add.d $a1, $a0, $tp
+; DESC64-NEXT:    move $a0, $a1
+; DESC64-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; DESC64-NEXT:    addi.d $sp, $sp, 16
+; DESC64-NEXT:    ret
 entry:
   ret ptr @ld
 }
@@ -207,6 +299,30 @@ define ptr @f3() nounwind {
 ; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT:    ret
+;
+; LA32DESC-LABEL: f3:
+; LA32DESC:       # %bb.0: # %entry
+; LA32DESC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+; LA32DESC-NEXT:    ld.w $a0, $a0, %ie_pc_lo12(ie)
+; LA32DESC-NEXT:    add.w $a0, $a0, $tp
+; LA32DESC-NEXT:    ret
+;
+; LA64DESC-LABEL: f3:
+; LA64DESC:       # %bb.0: # %entry
+; LA64DESC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+; LA64DESC-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(ie)
+; LA64DESC-NEXT:    add.d $a0, $a0, $tp
+; LA64DESC-NEXT:    ret
+;
+; DESC64-LABEL: f3:
+; DESC64:       # %bb.0: # %entry
+; DESC64-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+; DESC64-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
+; DESC64-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
+; DESC64-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
+; DESC64-NEXT:    ldx.d $a0, $t8, $a0
+; DESC64-NEXT:    add.d $a0, $a0, $tp
+; DESC64-NEXT:    ret
 entry:
   ret ptr @ie
 }
@@ -259,6 +375,29 @@ define ptr @f4() nounwind {
 ; LA64LARGENOPIC-NEXT:    lu52i.d $a0, $a0, %le64_hi12(le)
 ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT:    ret
+;
+; LA32DESC-LABEL: f4:
+; LA32DESC:       # %bb.0: # %entry
+; LA32DESC-NEXT:    lu12i.w $a0, %le_hi20(le)
+; LA32DESC-NEXT:    ori $a0, $a0, %le_lo12(le)
+; LA32DESC-NEXT:    add.w $a0, $a0, $tp
+; LA32DESC-NEXT:    ret
+;
+; LA64DESC-LABEL: f4:
+; LA64DESC:       # %bb.0: # %entry
+; LA64DESC-NEXT:    lu12i.w $a0, %le_hi20(le)
+; LA64DESC-NEXT:    ori $a0, $a0, %le_lo12(le)
+; LA64DESC-NEXT:    add.d $a0, $a0, $tp
+; LA64DESC-NEXT:    ret
+;
+; DESC64-LABEL: f4:
+; DESC64:       # %bb.0: # %entry
+; DESC64-NEXT:    lu12i.w $a0, %le_hi20(le)
+; DESC64-NEXT:    ori $a0, $a0, %le_lo12(le)
+; DESC64-NEXT:    lu32i.d $a0, %le64_lo20(le)
+; DESC64-NEXT:    lu52i.d $a0, $a0, %le64_hi12(le)
+; DESC64-NEXT:    add.d $a0, $a0, $tp
+; DESC64-NEXT:    ret
 entry:
   ret ptr @le
 }
diff --git a/llvm/test/CodeGen/M68k/Arith/add-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/add-with-overflow.ll
index bd5e593edb33..70479b0b3ec6 100644
--- a/llvm/test/CodeGen/M68k/Arith/add-with-overflow.ll
+++ b/llvm/test/CodeGen/M68k/Arith/add-with-overflow.ll
@@ -35,7 +35,7 @@ define fastcc i1 @test6(i32 %v1, i32 %v2, ptr %X) nounwind {
 ; CHECK-NEXT:  ; %bb.1: ; %normal
 ; CHECK-NEXT:    move.l #0, (%a0)
 ; CHECK-NEXT:  .LBB1_2: ; %carry
-; CHECK-NEXT:    move.b #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    rts
 entry:
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
diff --git a/llvm/test/CodeGen/M68k/Arith/add.ll b/llvm/test/CodeGen/M68k/Arith/add.ll
index 281751e3e183..a9eb0bb815b0 100644
--- a/llvm/test/CodeGen/M68k/Arith/add.ll
+++ b/llvm/test/CodeGen/M68k/Arith/add.ll
@@ -43,7 +43,7 @@ define fastcc void @test3(ptr inreg %a) nounwind {
 ; CHECK-NEXT:    suba.l #4, %sp
 ; CHECK-NEXT:    movem.l %d2, (0,%sp) ; 8-byte Folded Spill
 ; CHECK-NEXT:    move.l (%a0), %d0
-; CHECK-NEXT:    move.l #0, %d1
+; CHECK-NEXT:    moveq #0, %d1
 ; CHECK-NEXT:    move.l #-2147483648, %d2
 ; CHECK-NEXT:    add.l (4,%a0), %d2
 ; CHECK-NEXT:    addx.l %d0, %d1
@@ -64,7 +64,7 @@ define fastcc void @test4(ptr inreg %a) nounwind {
 ; CHECK-NEXT:    suba.l #4, %sp
 ; CHECK-NEXT:    movem.l %d2, (0,%sp) ; 8-byte Folded Spill
 ; CHECK-NEXT:    move.l (%a0), %d0
-; CHECK-NEXT:    move.l #0, %d1
+; CHECK-NEXT:    moveq #0, %d1
 ; CHECK-NEXT:    move.l #128, %d2
 ; CHECK-NEXT:    add.l (4,%a0), %d2
 ; CHECK-NEXT:    addx.l %d0, %d1
diff --git a/llvm/test/CodeGen/M68k/Arith/bitwise.ll b/llvm/test/CodeGen/M68k/Arith/bitwise.ll
index 70e4dd42bfb6..74fc543a5fb8 100644
--- a/llvm/test/CodeGen/M68k/Arith/bitwise.ll
+++ b/llvm/test/CodeGen/M68k/Arith/bitwise.ll
@@ -242,7 +242,7 @@ define i64 @lshr64(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    add.l #-32, %d1
 ; CHECK-NEXT:    bmi .LBB18_1
 ; CHECK-NEXT:  ; %bb.2:
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    bra .LBB18_3
 ; CHECK-NEXT:  .LBB18_1:
 ; CHECK-NEXT:    move.l %d2, %d0
@@ -301,7 +301,7 @@ define i64 @ashr64(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    add.l #-32, %d3
 ; CHECK-NEXT:    bmi .LBB19_5
 ; CHECK-NEXT:  ; %bb.4:
-; CHECK-NEXT:    move.l #31, %d2
+; CHECK-NEXT:    moveq #31, %d2
 ; CHECK-NEXT:  .LBB19_5:
 ; CHECK-NEXT:    asr.l %d2, %d0
 ; CHECK-NEXT:    movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload
@@ -322,7 +322,7 @@ define i64 @shl64(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    add.l #-32, %d0
 ; CHECK-NEXT:    bmi .LBB20_1
 ; CHECK-NEXT:  ; %bb.2:
-; CHECK-NEXT:    move.l #0, %d1
+; CHECK-NEXT:    moveq #0, %d1
 ; CHECK-NEXT:    bra .LBB20_3
 ; CHECK-NEXT:  .LBB20_1:
 ; CHECK-NEXT:    move.l %d2, %d1
diff --git a/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll b/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll
index 834dfe1c26f0..fcc8dd3e7662 100644
--- a/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll
+++ b/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll
@@ -40,7 +40,7 @@ define zeroext i8 @test3(i8 zeroext %x, i8 zeroext %c) {
 ; CHECK-NEXT:    move.b (11,%sp), %d0
 ; CHECK-NEXT:    and.l #255, %d0
 ; CHECK-NEXT:    muls #171, %d0
-; CHECK-NEXT:    move.w #9, %d1
+; CHECK-NEXT:    moveq #9, %d1
 ; CHECK-NEXT:    lsr.w %d1, %d0
 ; CHECK-NEXT:    and.l #65535, %d0
 ; CHECK-NEXT:    rts
@@ -58,7 +58,7 @@ define signext i16 @test4(i16 signext %x) nounwind {
 ; CHECK-NEXT:    muls #1986, %d0
 ; CHECK-NEXT:    asr.l #8, %d0
 ; CHECK-NEXT:    asr.l #8, %d0
-; CHECK-NEXT:    move.w #15, %d1
+; CHECK-NEXT:    moveq #15, %d1
 ; CHECK-NEXT:    move.w %d0, %d2
 ; CHECK-NEXT:    lsr.w %d1, %d2
 ; CHECK-NEXT:    add.w %d2, %d0
@@ -94,7 +94,7 @@ define signext i16 @test6(i16 signext %x) nounwind {
 ; CHECK-NEXT:    muls #26215, %d0
 ; CHECK-NEXT:    asr.l #8, %d0
 ; CHECK-NEXT:    asr.l #8, %d0
-; CHECK-NEXT:    move.w #15, %d1
+; CHECK-NEXT:    moveq #15, %d1
 ; CHECK-NEXT:    move.w %d0, %d2
 ; CHECK-NEXT:    lsr.w %d1, %d2
 ; CHECK-NEXT:    asr.w #2, %d0
@@ -128,7 +128,7 @@ define i8 @test8(i8 %x) nounwind {
 ; CHECK-NEXT:    lsr.b #1, %d0
 ; CHECK-NEXT:    and.l #255, %d0
 ; CHECK-NEXT:    muls #211, %d0
-; CHECK-NEXT:    move.w #13, %d1
+; CHECK-NEXT:    moveq #13, %d1
 ; CHECK-NEXT:    lsr.w %d1, %d0
 ; CHECK-NEXT:    ; kill: def $bd0 killed $bd0 killed $d0
 ; CHECK-NEXT:    rts
@@ -143,7 +143,7 @@ define i8 @test9(i8 %x) nounwind {
 ; CHECK-NEXT:    lsr.b #2, %d0
 ; CHECK-NEXT:    and.l #255, %d0
 ; CHECK-NEXT:    muls #71, %d0
-; CHECK-NEXT:    move.w #11, %d1
+; CHECK-NEXT:    moveq #11, %d1
 ; CHECK-NEXT:    lsr.w %d1, %d0
 ; CHECK-NEXT:    ; kill: def $bd0 killed $bd0 killed $d0
 ; CHECK-NEXT:    rts
@@ -156,11 +156,11 @@ define i32 @testsize1(i32 %x) minsize nounwind {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    suba.l #4, %sp
 ; CHECK-NEXT:    movem.l %d2, (0,%sp) ; 8-byte Folded Spill
-; CHECK-NEXT:    move.l #31, %d1
+; CHECK-NEXT:    moveq #31, %d1
 ; CHECK-NEXT:    move.l (8,%sp), %d0
 ; CHECK-NEXT:    move.l %d0, %d2
 ; CHECK-NEXT:    asr.l %d1, %d2
-; CHECK-NEXT:    move.l #27, %d1
+; CHECK-NEXT:    moveq #27, %d1
 ; CHECK-NEXT:    lsr.l %d1, %d2
 ; CHECK-NEXT:    add.l %d2, %d0
 ; CHECK-NEXT:    asr.l #5, %d0
diff --git a/llvm/test/CodeGen/M68k/Arith/imul.ll b/llvm/test/CodeGen/M68k/Arith/imul.ll
index f53568395c29..a1846e4d51bd 100644
--- a/llvm/test/CodeGen/M68k/Arith/imul.ll
+++ b/llvm/test/CodeGen/M68k/Arith/imul.ll
@@ -19,7 +19,7 @@ define i64 @mul4_64(i64 %A) {
 ; CHECK-NEXT:    suba.l #4, %sp
 ; CHECK-NEXT:    .cfi_def_cfa_offset -8
 ; CHECK-NEXT:    movem.l %d2, (0,%sp) ; 8-byte Folded Spill
-; CHECK-NEXT:    move.l #30, %d0
+; CHECK-NEXT:    moveq #30, %d0
 ; CHECK-NEXT:    move.l (12,%sp), %d1
 ; CHECK-NEXT:    move.l %d1, %d2
 ; CHECK-NEXT:    lsr.l %d0, %d2
@@ -38,7 +38,7 @@ define i32 @mul4096_32(i32 %A) {
 ; CHECK-LABEL: mul4096_32:
 ; CHECK:         .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0:
-; CHECK-NEXT:    move.l #12, %d1
+; CHECK-NEXT:    moveq #12, %d1
 ; CHECK-NEXT:    move.l (4,%sp), %d0
 ; CHECK-NEXT:    lsl.l %d1, %d0
 ; CHECK-NEXT:    rts
@@ -53,11 +53,11 @@ define i64 @mul4096_64(i64 %A) {
 ; CHECK-NEXT:    suba.l #8, %sp
 ; CHECK-NEXT:    .cfi_def_cfa_offset -12
 ; CHECK-NEXT:    movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill
-; CHECK-NEXT:    move.l #20, %d0
+; CHECK-NEXT:    moveq #20, %d0
 ; CHECK-NEXT:    move.l (16,%sp), %d1
 ; CHECK-NEXT:    move.l %d1, %d2
 ; CHECK-NEXT:    lsr.l %d0, %d2
-; CHECK-NEXT:    move.l #12, %d3
+; CHECK-NEXT:    moveq #12, %d3
 ; CHECK-NEXT:    move.l (12,%sp), %d0
 ; CHECK-NEXT:    lsl.l %d3, %d0
 ; CHECK-NEXT:    or.l %d2, %d0
@@ -73,7 +73,7 @@ define i32 @mulmin4096_32(i32 %A) {
 ; CHECK-LABEL: mulmin4096_32:
 ; CHECK:         .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0:
-; CHECK-NEXT:    move.l #12, %d1
+; CHECK-NEXT:    moveq #12, %d1
 ; CHECK-NEXT:    move.l (4,%sp), %d0
 ; CHECK-NEXT:    lsl.l %d1, %d0
 ; CHECK-NEXT:    neg.l %d0
@@ -89,11 +89,11 @@ define i64 @mulmin4096_64(i64 %A) {
 ; CHECK-NEXT:    suba.l #8, %sp
 ; CHECK-NEXT:    .cfi_def_cfa_offset -12
 ; CHECK-NEXT:    movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill
-; CHECK-NEXT:    move.l #20, %d0
+; CHECK-NEXT:    moveq #20, %d0
 ; CHECK-NEXT:    move.l (16,%sp), %d1
 ; CHECK-NEXT:    move.l %d1, %d2
 ; CHECK-NEXT:    lsr.l %d0, %d2
-; CHECK-NEXT:    move.l #12, %d3
+; CHECK-NEXT:    moveq #12, %d3
 ; CHECK-NEXT:    move.l (12,%sp), %d0
 ; CHECK-NEXT:    lsl.l %d3, %d0
 ; CHECK-NEXT:    or.l %d2, %d0
@@ -258,7 +258,7 @@ define i32 @mul0_32(i32 %A) {
 ; CHECK-LABEL: mul0_32:
 ; CHECK:         .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0:
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    rts
     %mul = mul i32 %A, 0
     ret i32 %mul
diff --git a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll
index 5bd4d5d48bc8..10a797f13441 100644
--- a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll
@@ -24,7 +24,7 @@ entry:
 define zeroext i8 @smul_i8_no_ovf(i8 signext %a, i8 signext %b) nounwind ssp {
 ; CHECK-LABEL: smul_i8_no_ovf:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    move.l #42, %d0
+; CHECK-NEXT:    moveq #42, %d0
 ; CHECK-NEXT:    rts
 entry:
   %smul = tail call { i8, i1 } @llvm.smul.with.overflow.i8(i8 %a, i8 %b)
@@ -70,7 +70,7 @@ define fastcc i1 @test1(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:    lea (no,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
 ; CHECK-NEXT:    jsr printf@PLT
-; CHECK-NEXT:    move.b #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
 ; CHECK-NEXT:  .LBB3_1: ; %normal
@@ -78,7 +78,7 @@ define fastcc i1 @test1(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:    lea (ok,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
 ; CHECK-NEXT:    jsr printf@PLT
-; CHECK-NEXT:    move.b #1, %d0
+; CHECK-NEXT:    moveq #1, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
 entry:
@@ -108,7 +108,7 @@ define fastcc i1 @test2(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:    lea (no,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
 ; CHECK-NEXT:    jsr printf@PLT
-; CHECK-NEXT:    move.b #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
 ; CHECK-NEXT:  .LBB4_2: ; %normal
@@ -116,7 +116,7 @@ define fastcc i1 @test2(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:    lea (ok,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
 ; CHECK-NEXT:    jsr printf@PLT
-; CHECK-NEXT:    move.b #1, %d0
+; CHECK-NEXT:    moveq #1, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
 entry:
@@ -155,7 +155,7 @@ define i32 @test4(i32 %a, i32 %b) nounwind readnone {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    move.l (8,%sp), %d0
 ; CHECK-NEXT:    add.l (4,%sp), %d0
-; CHECK-NEXT:    move.l #4, %d1
+; CHECK-NEXT:    moveq #4, %d1
 ; CHECK-NEXT:    muls.l %d1, %d0
 ; CHECK-NEXT:    rts
 entry:
diff --git a/llvm/test/CodeGen/M68k/Arith/sub-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/sub-with-overflow.ll
index 8d47c7ebf7e5..be3223156986 100644
--- a/llvm/test/CodeGen/M68k/Arith/sub-with-overflow.ll
+++ b/llvm/test/CodeGen/M68k/Arith/sub-with-overflow.ll
@@ -19,7 +19,7 @@ define i1 @func1(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:    lea (no,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
 ; CHECK-NEXT:    jsr printf@PLT
-; CHECK-NEXT:    move.b #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
 ; CHECK-NEXT:  .LBB0_1: ; %normal
@@ -27,7 +27,7 @@ define i1 @func1(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:    lea (ok,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
 ; CHECK-NEXT:    jsr printf@PLT
-; CHECK-NEXT:    move.b #1, %d0
+; CHECK-NEXT:    moveq #1, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
 entry:
@@ -56,7 +56,7 @@ define i1 @func2(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:    lea (no,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
 ; CHECK-NEXT:    jsr printf@PLT
-; CHECK-NEXT:    move.b #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
 ; CHECK-NEXT:  .LBB1_1: ; %normal
@@ -64,7 +64,7 @@ define i1 @func2(i32 %v1, i32 %v2) nounwind {
 ; CHECK-NEXT:    lea (ok,%pc), %a0
 ; CHECK-NEXT:    move.l %a0, (%sp)
 ; CHECK-NEXT:    jsr printf@PLT
-; CHECK-NEXT:    move.b #1, %d0
+; CHECK-NEXT:    moveq #1, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
 entry:
@@ -85,7 +85,7 @@ carry:
 define i1 @func3(i32 %x) nounwind {
 ; CHECK-LABEL: func3:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    move.l #-1, %d0
+; CHECK-NEXT:    moveq #-1, %d0
 ; CHECK-NEXT:    add.l (4,%sp), %d0
 ; CHECK-NEXT:    svs %d0
 ; CHECK-NEXT:    rts
diff --git a/llvm/test/CodeGen/M68k/Arith/sub.ll b/llvm/test/CodeGen/M68k/Arith/sub.ll
index fff3601000df..16d0498b3dbb 100644
--- a/llvm/test/CodeGen/M68k/Arith/sub.ll
+++ b/llvm/test/CodeGen/M68k/Arith/sub.ll
@@ -7,7 +7,7 @@ define i32 @test1(i32 %x) {
 ; CHECK-NEXT:  ; %bb.0:
 ; CHECK-NEXT:    move.l (4,%sp), %d1
 ; CHECK-NEXT:    eori.l #31, %d1
-; CHECK-NEXT:    move.l #32, %d0
+; CHECK-NEXT:    moveq #32, %d0
 ; CHECK-NEXT:    sub.l %d1, %d0
 ; CHECK-NEXT:    rts
   %xor = xor i32 %x, 31
diff --git a/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll
index fd128a3e52bd..3314e65399c4 100644
--- a/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll
@@ -24,7 +24,7 @@ entry:
 define zeroext i8 @umul_i8_no_ovf(i8 signext %a, i8 signext %b) nounwind ssp {
 ; CHECK-LABEL: umul_i8_no_ovf:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    move.l #42, %d0
+; CHECK-NEXT:    moveq #42, %d0
 ; CHECK-NEXT:    rts
 entry:
   %umul = tail call { i8, i1 } @llvm.umul.with.overflow.i8(i8 %a, i8 %b)
@@ -59,7 +59,7 @@ declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
 define i1 @a(i32 %x)  nounwind {
 ; CHECK-LABEL: a:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    move.l #3, %d0
+; CHECK-NEXT:    moveq #3, %d0
 ; CHECK-NEXT:    move.l (4,%sp), %d1
 ; CHECK-NEXT:    mulu.l %d0, %d1
 ; CHECK-NEXT:    svs %d0
@@ -90,7 +90,7 @@ define i32 @test3(i32 %a, i32 %b) nounwind readnone {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    move.l (8,%sp), %d0
 ; CHECK-NEXT:    add.l (4,%sp), %d0
-; CHECK-NEXT:    move.l #4, %d1
+; CHECK-NEXT:    moveq #4, %d1
 ; CHECK-NEXT:    mulu.l %d1, %d0
 ; CHECK-NEXT:    rts
 entry:
diff --git a/llvm/test/CodeGen/M68k/CConv/c-call.ll b/llvm/test/CodeGen/M68k/CConv/c-call.ll
index a9638eec6a31..badd4e31f37d 100644
--- a/llvm/test/CodeGen/M68k/CConv/c-call.ll
+++ b/llvm/test/CodeGen/M68k/CConv/c-call.ll
@@ -14,7 +14,7 @@ define i32 @test1() nounwind {
 ; CHECK-NEXT:    move.l #2, (4,%sp)
 ; CHECK-NEXT:    move.l #1, (%sp)
 ; CHECK-NEXT:    jsr (test1_callee@PLT,%pc)
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #20, %sp
 ; CHECK-NEXT:    rts
 entry:
@@ -34,7 +34,7 @@ define i16 @test2() nounwind {
 ; CHECK-NEXT:    move.l #2, (4,%sp)
 ; CHECK-NEXT:    move.l #1, (%sp)
 ; CHECK-NEXT:    jsr (test2_callee@PLT,%pc)
-; CHECK-NEXT:    move.w #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #20, %sp
 ; CHECK-NEXT:    rts
 entry:
@@ -54,7 +54,7 @@ define i8 @test3() nounwind {
 ; CHECK-NEXT:    move.l #2, (4,%sp)
 ; CHECK-NEXT:    move.l #1, (%sp)
 ; CHECK-NEXT:    jsr (test3_callee@PLT,%pc)
-; CHECK-NEXT:    move.b #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #20, %sp
 ; CHECK-NEXT:    rts
 entry:
diff --git a/llvm/test/CodeGen/M68k/CConv/fastcc-call.ll b/llvm/test/CodeGen/M68k/CConv/fastcc-call.ll
index 4b0f8ed254a5..8d40ebd5228f 100644
--- a/llvm/test/CodeGen/M68k/CConv/fastcc-call.ll
+++ b/llvm/test/CodeGen/M68k/CConv/fastcc-call.ll
@@ -11,12 +11,12 @@ define i32 @foo1() nounwind uwtable {
 ; CHECK-NEXT:    suba.l #4, %sp
 ; CHECK-NEXT:    .cfi_def_cfa_offset -8
 ; CHECK-NEXT:    move.l #5, (%sp)
-; CHECK-NEXT:    move.l #1, %d0
-; CHECK-NEXT:    move.l #2, %d1
+; CHECK-NEXT:    moveq #1, %d0
+; CHECK-NEXT:    moveq #2, %d1
 ; CHECK-NEXT:    move.l #3, %a0
 ; CHECK-NEXT:    move.l #4, %a1
 ; CHECK-NEXT:    jsr (bar1@PLT,%pc)
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #4, %sp
 ; CHECK-NEXT:    rts
 entry:
@@ -34,11 +34,11 @@ define i32 @foo2() nounwind uwtable {
 ; CHECK-NEXT:    suba.l #12, %sp
 ; CHECK-NEXT:    .cfi_def_cfa_offset -16
 ; CHECK-NEXT:    lea (8,%sp), %a0
-; CHECK-NEXT:    move.l #2, %d0
+; CHECK-NEXT:    moveq #2, %d0
 ; CHECK-NEXT:    lea (4,%sp), %a1
-; CHECK-NEXT:    move.l #4, %d1
+; CHECK-NEXT:    moveq #4, %d1
 ; CHECK-NEXT:    jsr (bar2@PLT,%pc)
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #12, %sp
 ; CHECK-NEXT:    rts
 entry:
diff --git a/llvm/test/CodeGen/M68k/CodeModel/medium-pie-global-access.ll b/llvm/test/CodeGen/M68k/CodeModel/medium-pie-global-access.ll
index ce8f2d0a6ba7..3d398afe7dc4 100644
--- a/llvm/test/CodeGen/M68k/CodeModel/medium-pie-global-access.ll
+++ b/llvm/test/CodeGen/M68k/CodeModel/medium-pie-global-access.ll
@@ -71,7 +71,7 @@ define i32 @my_access_global_store_d() #0 {
 ; CHECK-NEXT:  ; %bb.0: ; %entry
 ; CHECK-NEXT:    move.l (d@GOTPCREL,%pc), %a0
 ; CHECK-NEXT:    move.l #2, (%a0)
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    rts
 entry:
  store i32 2, ptr @d, align 4
@@ -105,7 +105,7 @@ define linkonce_odr i32 @bar() comdat {
 ; CHECK-LABEL: bar:
 ; CHECK:         .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0: ; %entry
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    rts
 entry:
  ret i32 0
diff --git a/llvm/test/CodeGen/M68k/CodeModel/small-pie-global-access.ll b/llvm/test/CodeGen/M68k/CodeModel/small-pie-global-access.ll
index 668f8a96ac6f..030f72bb3753 100644
--- a/llvm/test/CodeGen/M68k/CodeModel/small-pie-global-access.ll
+++ b/llvm/test/CodeGen/M68k/CodeModel/small-pie-global-access.ll
@@ -69,7 +69,7 @@ define i32 @my_access_global_store_d() #0 {
 ; CHECK-NEXT:  ; %bb.0: ; %entry
 ; CHECK-NEXT:    move.l (d@GOTPCREL,%pc), %a0
 ; CHECK-NEXT:    move.l #2, (%a0)
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    rts
 entry:
  store i32 2, ptr @d, align 4
@@ -103,7 +103,7 @@ define linkonce_odr i32 @bar() comdat {
 ; CHECK-LABEL: bar:
 ; CHECK:         .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0: ; %entry
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    rts
 entry:
  ret i32 0
diff --git a/llvm/test/CodeGen/M68k/Control/cmp.ll b/llvm/test/CodeGen/M68k/Control/cmp.ll
index 634c08760a4e..d3a8bbb0b0c8 100644
--- a/llvm/test/CodeGen/M68k/Control/cmp.ll
+++ b/llvm/test/CodeGen/M68k/Control/cmp.ll
@@ -8,10 +8,10 @@ define i32 @test1(ptr %y) nounwind {
 ; CHECK-NEXT:    cmpi.l #0, (%a0)
 ; CHECK-NEXT:    beq .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %cond_false
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    rts
 ; CHECK-NEXT:  .LBB0_2: ; %cond_true
-; CHECK-NEXT:    move.l #1, %d0
+; CHECK-NEXT:    moveq #1, %d0
 ; CHECK-NEXT:    rts
  %tmp = load i32, ptr %y  ; <i32> [#uses=1]
  %tmp.upgrd.1 = icmp eq i32 %tmp, 0  ; <i1> [#uses=1]
@@ -33,10 +33,10 @@ define i32 @test2(ptr %y) nounwind {
 ; CHECK-NEXT:    cmpi.l #0, %d0
 ; CHECK-NEXT:    beq .LBB1_2
 ; CHECK-NEXT:  ; %bb.1: ; %cond_false
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    rts
 ; CHECK-NEXT:  .LBB1_2: ; %cond_true
-; CHECK-NEXT:    move.l #1, %d0
+; CHECK-NEXT:    moveq #1, %d0
 ; CHECK-NEXT:    rts
  %tmp = load i32, ptr %y  ; <i32> [#uses=1]
  %tmp1 = shl i32 %tmp, 3  ; <i32> [#uses=1]
@@ -59,10 +59,10 @@ define i8 @test2b(ptr %y) nounwind {
 ; CHECK-NEXT:    cmpi.b #0, %d0
 ; CHECK-NEXT:    beq .LBB2_2
 ; CHECK-NEXT:  ; %bb.1: ; %cond_false
-; CHECK-NEXT:    move.b #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    rts
 ; CHECK-NEXT:  .LBB2_2: ; %cond_true
-; CHECK-NEXT:    move.b #1, %d0
+; CHECK-NEXT:    moveq #1, %d0
 ; CHECK-NEXT:    rts
  %tmp = load i8, ptr %y  ; <i8> [#uses=1]
  %tmp1 = shl i8 %tmp, 3  ; <i8> [#uses=1]
@@ -84,7 +84,7 @@ define i64 @test3(i64 %x) nounwind {
 ; CHECK-NEXT:    seq %d0
 ; CHECK-NEXT:    move.l %d0, %d1
 ; CHECK-NEXT:    and.l #255, %d1
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    rts
   %t = icmp eq i64 %x, 0
   %r = zext i1 %t to i64
@@ -97,7 +97,7 @@ define i64 @test4(i64 %x) nounwind {
 ; CHECK-NEXT:    suba.l #4, %sp
 ; CHECK-NEXT:    movem.l %d2, (0,%sp) ; 8-byte Folded Spill
 ; CHECK-NEXT:    move.l (8,%sp), %d1
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    move.l (12,%sp), %d2
 ; CHECK-NEXT:    sub.l #1, %d2
 ; CHECK-NEXT:    subx.l %d0, %d1
@@ -119,11 +119,11 @@ define i32 @test6() nounwind align 2 {
 ; CHECK-NEXT:    or.l (8,%sp), %d0
 ; CHECK-NEXT:    beq .LBB5_1
 ; CHECK-NEXT:  ; %bb.2: ; %F
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #20, %sp
 ; CHECK-NEXT:    rts
 ; CHECK-NEXT:  .LBB5_1: ; %T
-; CHECK-NEXT:    move.l #1, %d0
+; CHECK-NEXT:    moveq #1, %d0
 ; CHECK-NEXT:    adda.l #20, %sp
 ; CHECK-NEXT:    rts
   %A = alloca {i64, i64}, align 8
@@ -229,7 +229,7 @@ define zeroext i1 @test15(i32 %bf.load, i32 %n) {
 ; CHECK-LABEL: test15:
 ; CHECK:         .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0:
-; CHECK-NEXT:    move.l #16, %d0
+; CHECK-NEXT:    moveq #16, %d0
 ; CHECK-NEXT:    move.l (4,%sp), %d1
 ; CHECK-NEXT:    lsr.l %d0, %d1
 ; CHECK-NEXT:    move.l %d1, %d0
@@ -252,7 +252,7 @@ define i8 @test16(i16 signext %L) {
 ; CHECK-LABEL: test16:
 ; CHECK:         .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0:
-; CHECK-NEXT:    move.w #15, %d1
+; CHECK-NEXT:    moveq #15, %d1
 ; CHECK-NEXT:    move.w (6,%sp), %d0
 ; CHECK-NEXT:    lsr.w %d1, %d0
 ; CHECK-NEXT:    eori.b #1, %d0
@@ -268,7 +268,7 @@ define i8 @test18(i64 %L) {
 ; CHECK-LABEL: test18:
 ; CHECK:         .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0:
-; CHECK-NEXT:    move.l #31, %d1
+; CHECK-NEXT:    moveq #31, %d1
 ; CHECK-NEXT:    move.l (4,%sp), %d0
 ; CHECK-NEXT:    lsr.l %d1, %d0
 ; CHECK-NEXT:    eori.b #1, %d0
diff --git a/llvm/test/CodeGen/M68k/Control/long-setcc.ll b/llvm/test/CodeGen/M68k/Control/long-setcc.ll
index b089af5f2ae8..45a617599c1e 100644
--- a/llvm/test/CodeGen/M68k/Control/long-setcc.ll
+++ b/llvm/test/CodeGen/M68k/Control/long-setcc.ll
@@ -4,7 +4,7 @@
 define i1 @t1(i64 %x) nounwind {
 ; CHECK-LABEL: t1:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    move.l #31, %d1
+; CHECK-NEXT:    moveq #31, %d1
 ; CHECK-NEXT:    move.l (4,%sp), %d0
 ; CHECK-NEXT:    lsr.l %d1, %d0
 ; CHECK-NEXT:    ; kill: def $bd0 killed $bd0 killed $d0
@@ -26,7 +26,7 @@ define i1 @t2(i64 %x) nounwind {
 define i1 @t3(i32 %x) nounwind {
 ; CHECK-LABEL: t3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    move.b #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    rts
   %tmp = icmp ugt i32 %x, -1
   ret i1 %tmp
diff --git a/llvm/test/CodeGen/M68k/Control/setcc.ll b/llvm/test/CodeGen/M68k/Control/setcc.ll
index 63856e278c9e..9e03f9b90842 100644
--- a/llvm/test/CodeGen/M68k/Control/setcc.ll
+++ b/llvm/test/CodeGen/M68k/Control/setcc.ll
@@ -40,7 +40,7 @@ define fastcc i64 @t3(i64 %x) nounwind readnone ssp {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    suba.l #4, %sp
 ; CHECK-NEXT:    movem.l %d2, (0,%sp) ; 8-byte Folded Spill
-; CHECK-NEXT:    move.l #0, %d2
+; CHECK-NEXT:    moveq #0, %d2
 ; CHECK-NEXT:    sub.l #18, %d1
 ; CHECK-NEXT:    subx.l %d2, %d0
 ; CHECK-NEXT:    scs %d0
@@ -61,7 +61,7 @@ define i8 @t5(i32 %a) {
 ; CHECK-LABEL: t5:
 ; CHECK:         .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0: ; %entry
-; CHECK-NEXT:    move.l #31, %d1
+; CHECK-NEXT:    moveq #31, %d1
 ; CHECK-NEXT:    move.l (4,%sp), %d0
 ; CHECK-NEXT:    lsr.l %d1, %d0
 ; CHECK-NEXT:    eori.b #1, %d0
@@ -86,7 +86,7 @@ define zeroext i1 @t6(i32 %a) {
 ; CHECK-LABEL: t6:
 ; CHECK:         .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0: ; %entry
-; CHECK-NEXT:    move.l #31, %d0
+; CHECK-NEXT:    moveq #31, %d0
 ; CHECK-NEXT:    move.l (4,%sp), %d1
 ; CHECK-NEXT:    lsr.l %d0, %d1
 ; CHECK-NEXT:    eori.b #1, %d1
diff --git a/llvm/test/CodeGen/M68k/PR57660.ll b/llvm/test/CodeGen/M68k/PR57660.ll
index 184c30a33d79..bad949b08caf 100644
--- a/llvm/test/CodeGen/M68k/PR57660.ll
+++ b/llvm/test/CodeGen/M68k/PR57660.ll
@@ -7,7 +7,7 @@ define dso_local void @foo1() {
 ; CHECK-NEXT:  ; %bb.0: ; %entry
 ; CHECK-NEXT:    suba.l #2, %sp
 ; CHECK-NEXT:    .cfi_def_cfa_offset -6
-; CHECK-NEXT:    move.b #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    move.b %d0, (0,%sp) ; 1-byte Folded Spill
 ; CHECK-NEXT:  .LBB0_1: ; %do.body
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/M68k/gcc_except_table.ll b/llvm/test/CodeGen/M68k/gcc_except_table.ll
index a7d2a6662724..fe0ed7861dfe 100644
--- a/llvm/test/CodeGen/M68k/gcc_except_table.ll
+++ b/llvm/test/CodeGen/M68k/gcc_except_table.ll
@@ -19,7 +19,7 @@ define i32 @foo() uwtable ssp personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:    jsr _Z1fv@PLT
 ; CHECK-NEXT:  .Ltmp1:
 ; CHECK-NEXT:  ; %bb.1: ; %try.cont
-; CHECK-NEXT:    move.l #0, %d0
+; CHECK-NEXT:    moveq #0, %d0
 ; CHECK-NEXT:    adda.l #4, %sp
 ; CHECK-NEXT:    rts
 ; CHECK-NEXT:  .LBB0_2: ; %lpad
diff --git a/llvm/test/CodeGen/M68k/link-unlnk.ll b/llvm/test/CodeGen/M68k/link-unlnk.ll
index dfdd80e66ade..fe39a9a13494 100644
--- a/llvm/test/CodeGen/M68k/link-unlnk.ll
+++ b/llvm/test/CodeGen/M68k/link-unlnk.ll
@@ -105,7 +105,7 @@ define i32 @test_gep() {
 ; FP-NEXT:    .cfi_def_cfa_register %a6
 ; FP-NEXT:    move.l #21, (-4,%a6)
 ; FP-NEXT:    move.l #12, (-256,%a6)
-; FP-NEXT:    move.l #0, %d0
+; FP-NEXT:    moveq #0, %d0
 ; FP-NEXT:    unlk %a6
 ; FP-NEXT:    rts
 ;
@@ -116,7 +116,7 @@ define i32 @test_gep() {
 ; NO-FP-NEXT:    .cfi_def_cfa_offset -260
 ; NO-FP-NEXT:    move.l #21, (252,%sp)
 ; NO-FP-NEXT:    move.l #12, (0,%sp)
-; NO-FP-NEXT:    move.l #0, %d0
+; NO-FP-NEXT:    moveq #0, %d0
 ; NO-FP-NEXT:    adda.l #256, %sp
 ; NO-FP-NEXT:    rts
 entry:
diff --git a/llvm/test/CodeGen/MIR/AArch64/calleesavedinfovalid.mir b/llvm/test/CodeGen/MIR/AArch64/calleesavedinfovalid.mir
new file mode 100644
index 000000000000..829eccddb27f
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AArch64/calleesavedinfovalid.mir
@@ -0,0 +1,41 @@
+
+# RUN: llc -run-pass=none -mtriple=aarch64-- -o - %s | FileCheck %s
+
+---
+# CHECK-LABEL:  name:                   no_stack_no_calleesavedinfo
+# CHECK:        isCalleeSavedInfoValid: false
+name:            no_stack_no_calleesavedinfo
+frameInfo:
+  isCalleeSavedInfoValid: false
+stack: []
+
+...
+---
+# CHECK-LABEL:  name:                   no_stack_calleesavedinfo
+# CHECK:        isCalleeSavedInfoValid: true
+name:            no_stack_calleesavedinfo
+frameInfo:
+  isCalleeSavedInfoValid: true
+stack: []
+
+...
+---
+# CHECK-LABEL:  name:                   stack_no_calleesavedinfo
+# CHECK:        isCalleeSavedInfoValid: true
+name:            stack_no_calleesavedinfo
+frameInfo:
+  isCalleeSavedInfoValid: false
+stack:
+  - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8, callee-saved-register: '$lr' }
+
+...
+---
+# CHECK-LABEL:  name:                   stack_calleesavedinfo
+# CHECK:        isCalleeSavedInfoValid: true
+name:            stack_calleesavedinfo
+frameInfo:
+  isCalleeSavedInfoValid: true
+stack:
+  - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8, callee-saved-register: '$lr' }
+
+...
diff --git a/llvm/test/CodeGen/MIR/Generic/frame-info.mir b/llvm/test/CodeGen/MIR/Generic/frame-info.mir
index 4a897a9ec5e3..d5e014cf6299 100644
--- a/llvm/test/CodeGen/MIR/Generic/frame-info.mir
+++ b/llvm/test/CodeGen/MIR/Generic/frame-info.mir
@@ -44,6 +44,7 @@ tracksRegLiveness: true
 # CHECK-NEXT: hasVAStart: false
 # CHECK-NEXT: hasMustTailInVarArgFunc: false
 # CHECK-NEXT: hasTailCall: false
+# CHECK-NEXT: isCalleeSavedInfoValid: false
 # CHECK-NEXT: localFrameSize: 0
 # CHECK-NEXT: savePoint:       ''
 # CHECK-NEXT: restorePoint:    ''
diff --git a/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll b/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll
new file mode 100644
index 000000000000..e14e89916e6d
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll
@@ -0,0 +1,69 @@
+; RUN: llc %s -mtriple=mipsisa32r6el-linux-gnu -o - | \
+; RUN:     FileCheck %s --check-prefix=MIPS32R6EL
+; RUN: llc %s -mtriple=mipsisa64r6el-linux-gnuabi64 -o - | \
+; RUN:     FileCheck %s --check-prefix=MIPS64R6EL
+
+define float @mins(float %x, float %y) {
+; MIPS32R6EL-LABEL:	mins
+; MIPS32R6EL:		# %bb.0:
+; MIPS32R6EL-NEXT:	jr	$ra
+; MIPS32R6EL-NEXT:	min.s	$f0, $f12, $f14
+;
+; MIPS64R6EL-LABEL:	mins
+; MIPS64R6EL:		# %bb.0:
+; MIPS64R6EL-NEXT:	jr	$ra
+; MIPS64R6EL-NEXT:	min.s	$f0, $f12, $f13
+
+  %r = tail call float @llvm.minnum.f32(float %x, float %y)
+  ret float %r
+}
+
+define float @maxs(float %x, float %y) {
+; MIPS32R6EL-LABEL:	maxs
+; MIPS32R6EL:		# %bb.0:
+; MIPS32R6EL-NEXT:	jr	$ra
+; MIPS32R6EL-NEXT:	max.s	$f0, $f12, $f14
+;
+; MIPS64R6EL-LABEL:	maxs
+; MIPS64R6EL:		# %bb.0:
+; MIPS64R6EL-NEXT:	jr	$ra
+; MIPS64R6EL-NEXT:	max.s	$f0, $f12, $f13
+
+  %r = tail call float @llvm.maxnum.f32(float %x, float %y)
+  ret float %r
+}
+
+define double @mind(double %x, double %y) {
+; MIPS32R6EL-LABEL:	mind
+; MIPS32R6EL:		# %bb.0:
+; MIPS32R6EL-NEXT:	jr	$ra
+; MIPS32R6EL-NEXT:	min.d	$f0, $f12, $f14
+;
+; MIPS64R6EL-LABEL:	mind
+; MIPS64R6EL:		# %bb.0:
+; MIPS64R6EL-NEXT:	jr	$ra
+; MIPS64R6EL-NEXT:	min.d	$f0, $f12, $f13
+
+  %r = tail call double @llvm.minnum.f64(double %x, double %y)
+  ret double %r
+}
+
+define double @maxd(double %x, double %y) {
+; MIPS32R6EL-LABEL:	maxd
+; MIPS32R6EL:		# %bb.0:
+; MIPS32R6EL-NEXT:	jr	$ra
+; MIPS32R6EL-NEXT:	max.d	$f0, $f12, $f14
+;
+; MIPS64R6EL-LABEL:	maxd
+; MIPS64R6EL:		# %bb.0:
+; MIPS64R6EL-NEXT:	jr	$ra
+; MIPS64R6EL-NEXT:	max.d	$f0, $f12, $f13
+
+  %r = tail call double @llvm.maxnum.f64(double %x, double %y)
+  ret double %r
+}
+
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
+declare double @llvm.maxnum.f64(double, double)
diff --git a/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll b/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
index 45c7ab980edd..fe68bee408fc 100644
--- a/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
+++ b/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
@@ -2365,101 +2365,159 @@ entry:
 declare float @llvm.minnum.f32(float %Val, float %b)
 
 define void @fminnum(float %b) {
-; MIPS32-LABEL: fminnum:
-; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
-; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
-; MIPS32-NEXT:    addiu $sp, $sp, -24
-; MIPS32-NEXT:    .cfi_def_cfa_offset 24
-; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
-; MIPS32-NEXT:    .cfi_offset 31, -4
-; MIPS32-NEXT:    .cfi_offset 16, -8
-; MIPS32-NEXT:    addu $gp, $2, $25
-; MIPS32-NEXT:    mov.s $f14, $f12
-; MIPS32-NEXT:    lw $16, %got(g)($gp)
-; MIPS32-NEXT:    lh $1, 0($16)
-; MIPS32-NEXT:    fill.h $w0, $1
-; MIPS32-NEXT:    fexupr.w $w0, $w0
-; MIPS32-NEXT:    copy_s.w $1, $w0[0]
-; MIPS32-NEXT:    lw $25, %call16(fminf)($gp)
-; MIPS32-NEXT:    jalr $25
-; MIPS32-NEXT:    mtc1 $1, $f12
-; MIPS32-NEXT:    mfc1 $1, $f0
-; MIPS32-NEXT:    fill.w $w0, $1
-; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
-; MIPS32-NEXT:    copy_u.h $1, $w0[0]
-; MIPS32-NEXT:    sh $1, 0($16)
-; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
-; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; MIPS32-NEXT:    jr $ra
-; MIPS32-NEXT:    addiu $sp, $sp, 24
+; MIPS32-O32-LABEL: fminnum:
+; MIPS32-O32:       # %bb.0: # %entry
+; MIPS32-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-O32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-O32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-O32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-O32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-O32-NEXT:    .cfi_offset 31, -4
+; MIPS32-O32-NEXT:    .cfi_offset 16, -8
+; MIPS32-O32-NEXT:    addu $gp, $2, $25
+; MIPS32-O32-NEXT:    mov.s $f14, $f12
+; MIPS32-O32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-O32-NEXT:    lh $1, 0($16)
+; MIPS32-O32-NEXT:    fill.h $w0, $1
+; MIPS32-O32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-O32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-O32-NEXT:    lw $25, %call16(fminf)($gp)
+; MIPS32-O32-NEXT:    jalr $25
+; MIPS32-O32-NEXT:    mtc1 $1, $f12
+; MIPS32-O32-NEXT:    mfc1 $1, $f0
+; MIPS32-O32-NEXT:    fill.w $w0, $1
+; MIPS32-O32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-O32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-O32-NEXT:    sh $1, 0($16)
+; MIPS32-O32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-O32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-O32-NEXT:    jr $ra
+; MIPS32-O32-NEXT:    addiu $sp, $sp, 24
 ;
-; MIPS64-N32-LABEL: fminnum:
-; MIPS64-N32:       # %bb.0: # %entry
-; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
-; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
-; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
-; MIPS64-N32-NEXT:    .cfi_offset 31, -8
-; MIPS64-N32-NEXT:    .cfi_offset 28, -16
-; MIPS64-N32-NEXT:    .cfi_offset 16, -24
-; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fminnum)))
-; MIPS64-N32-NEXT:    addu $1, $1, $25
-; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fminnum)))
-; MIPS64-N32-NEXT:    mov.s $f13, $f12
-; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
-; MIPS64-N32-NEXT:    lh $1, 0($16)
-; MIPS64-N32-NEXT:    fill.h $w0, $1
-; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
-; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
-; MIPS64-N32-NEXT:    lw $25, %call16(fminf)($gp)
-; MIPS64-N32-NEXT:    jalr $25
-; MIPS64-N32-NEXT:    mtc1 $1, $f12
-; MIPS64-N32-NEXT:    mfc1 $1, $f0
-; MIPS64-N32-NEXT:    fill.w $w0, $1
-; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
-; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
-; MIPS64-N32-NEXT:    sh $1, 0($16)
-; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
-; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64-N32-NEXT:    jr $ra
-; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+; MIPS64R5-N32-LABEL: fminnum:
+; MIPS64R5-N32:       # %bb.0: # %entry
+; MIPS64R5-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64R5-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64R5-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64R5-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64R5-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64R5-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64R5-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fminnum)))
+; MIPS64R5-N32-NEXT:    addu $1, $1, $25
+; MIPS64R5-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPS64R5-N32-NEXT:    mov.s $f13, $f12
+; MIPS64R5-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64R5-N32-NEXT:    lh $1, 0($16)
+; MIPS64R5-N32-NEXT:    fill.h $w0, $1
+; MIPS64R5-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64R5-N32-NEXT:    lw $25, %call16(fminf)($gp)
+; MIPS64R5-N32-NEXT:    jalr $25
+; MIPS64R5-N32-NEXT:    mtc1 $1, $f12
+; MIPS64R5-N32-NEXT:    mfc1 $1, $f0
+; MIPS64R5-N32-NEXT:    fill.w $w0, $1
+; MIPS64R5-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64R5-N32-NEXT:    sh $1, 0($16)
+; MIPS64R5-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64R5-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64R5-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64R5-N32-NEXT:    jr $ra
+; MIPS64R5-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64R5-N64-LABEL: fminnum:
+; MIPS64R5-N64:       # %bb.0: # %entry
+; MIPS64R5-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64R5-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64R5-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64R5-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64R5-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64R5-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fminnum)))
+; MIPS64R5-N64-NEXT:    daddu $1, $1, $25
+; MIPS64R5-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPS64R5-N64-NEXT:    mov.s $f13, $f12
+; MIPS64R5-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64R5-N64-NEXT:    lh $1, 0($16)
+; MIPS64R5-N64-NEXT:    fill.h $w0, $1
+; MIPS64R5-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64R5-N64-NEXT:    ld $25, %call16(fminf)($gp)
+; MIPS64R5-N64-NEXT:    jalr $25
+; MIPS64R5-N64-NEXT:    mtc1 $1, $f12
+; MIPS64R5-N64-NEXT:    mfc1 $1, $f0
+; MIPS64R5-N64-NEXT:    fill.w $w0, $1
+; MIPS64R5-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64R5-N64-NEXT:    sh $1, 0($16)
+; MIPS64R5-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64R5-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64R5-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64R5-N64-NEXT:    jr $ra
+; MIPS64R5-N64-NEXT:    daddiu $sp, $sp, 32
+;
+; MIPSR6-O32-LABEL: fminnum:
+; MIPSR6-O32:       # %bb.0: # %entry
+; MIPSR6-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPSR6-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPSR6-O32-NEXT:    addu $1, $2, $25
+; MIPSR6-O32-NEXT:    lw $1, %got(g)($1)
+; MIPSR6-O32-NEXT:    lh $2, 0($1)
+; MIPSR6-O32-NEXT:    fill.h $w0, $2
+; MIPSR6-O32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-O32-NEXT:    mtc1 $2, $f0
+; MIPSR6-O32-NEXT:    min.s $f0, $f0, $f12
+; MIPSR6-O32-NEXT:    mfc1 $2, $f0
+; MIPSR6-O32-NEXT:    fill.w $w0, $2
+; MIPSR6-O32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-O32-NEXT:    copy_u.h $2, $w0[0]
+; MIPSR6-O32-NEXT:    jr $ra
+; MIPSR6-O32-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-N32-LABEL: fminnum:
+; MIPSR6-N32:       # %bb.0: # %entry
+; MIPSR6-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fminnum)))
+; MIPSR6-N32-NEXT:    addu $1, $1, $25
+; MIPSR6-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPSR6-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPSR6-N32-NEXT:    lh $2, 0($1)
+; MIPSR6-N32-NEXT:    fill.h $w0, $2
+; MIPSR6-N32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-N32-NEXT:    mtc1 $2, $f0
+; MIPSR6-N32-NEXT:    min.s $f0, $f0, $f12
+; MIPSR6-N32-NEXT:    mfc1 $2, $f0
+; MIPSR6-N32-NEXT:    fill.w $w0, $2
+; MIPSR6-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPSR6-N32-NEXT:    jr $ra
+; MIPSR6-N32-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-N64-LABEL: fminnum:
+; MIPSR6-N64:       # %bb.0: # %entry
+; MIPSR6-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fminnum)))
+; MIPSR6-N64-NEXT:    daddu $1, $1, $25
+; MIPSR6-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPSR6-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPSR6-N64-NEXT:    lh $2, 0($1)
+; MIPSR6-N64-NEXT:    fill.h $w0, $2
+; MIPSR6-N64-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-N64-NEXT:    mtc1 $2, $f0
+; MIPSR6-N64-NEXT:    min.s $f0, $f0, $f12
+; MIPSR6-N64-NEXT:    mfc1 $2, $f0
+; MIPSR6-N64-NEXT:    fill.w $w0, $2
+; MIPSR6-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPSR6-N64-NEXT:    jr $ra
+; MIPSR6-N64-NEXT:    sh $2, 0($1)
 ;
-; MIPS64-N64-LABEL: fminnum:
-; MIPS64-N64:       # %bb.0: # %entry
-; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
-; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
-; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
-; MIPS64-N64-NEXT:    .cfi_offset 31, -8
-; MIPS64-N64-NEXT:    .cfi_offset 28, -16
-; MIPS64-N64-NEXT:    .cfi_offset 16, -24
-; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fminnum)))
-; MIPS64-N64-NEXT:    daddu $1, $1, $25
-; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fminnum)))
-; MIPS64-N64-NEXT:    mov.s $f13, $f12
-; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
-; MIPS64-N64-NEXT:    lh $1, 0($16)
-; MIPS64-N64-NEXT:    fill.h $w0, $1
-; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
-; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
-; MIPS64-N64-NEXT:    ld $25, %call16(fminf)($gp)
-; MIPS64-N64-NEXT:    jalr $25
-; MIPS64-N64-NEXT:    mtc1 $1, $f12
-; MIPS64-N64-NEXT:    mfc1 $1, $f0
-; MIPS64-N64-NEXT:    fill.w $w0, $1
-; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
-; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
-; MIPS64-N64-NEXT:    sh $1, 0($16)
-; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
-; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64-N64-NEXT:    jr $ra
-; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
   %0 = load i16, ptr @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
@@ -2477,101 +2535,158 @@ entry:
 declare float @llvm.maxnum.f32(float %Val, float %b)
 
 define void @fmaxnum(float %b) {
-; MIPS32-LABEL: fmaxnum:
-; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
-; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
-; MIPS32-NEXT:    addiu $sp, $sp, -24
-; MIPS32-NEXT:    .cfi_def_cfa_offset 24
-; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
-; MIPS32-NEXT:    .cfi_offset 31, -4
-; MIPS32-NEXT:    .cfi_offset 16, -8
-; MIPS32-NEXT:    addu $gp, $2, $25
-; MIPS32-NEXT:    mov.s $f14, $f12
-; MIPS32-NEXT:    lw $16, %got(g)($gp)
-; MIPS32-NEXT:    lh $1, 0($16)
-; MIPS32-NEXT:    fill.h $w0, $1
-; MIPS32-NEXT:    fexupr.w $w0, $w0
-; MIPS32-NEXT:    copy_s.w $1, $w0[0]
-; MIPS32-NEXT:    lw $25, %call16(fmaxf)($gp)
-; MIPS32-NEXT:    jalr $25
-; MIPS32-NEXT:    mtc1 $1, $f12
-; MIPS32-NEXT:    mfc1 $1, $f0
-; MIPS32-NEXT:    fill.w $w0, $1
-; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
-; MIPS32-NEXT:    copy_u.h $1, $w0[0]
-; MIPS32-NEXT:    sh $1, 0($16)
-; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
-; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; MIPS32-NEXT:    jr $ra
-; MIPS32-NEXT:    addiu $sp, $sp, 24
+; MIPS32-O32-LABEL: fmaxnum:
+; MIPS32-O32:       # %bb.0: # %entry
+; MIPS32-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-O32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-O32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-O32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-O32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-O32-NEXT:    .cfi_offset 31, -4
+; MIPS32-O32-NEXT:    .cfi_offset 16, -8
+; MIPS32-O32-NEXT:    addu $gp, $2, $25
+; MIPS32-O32-NEXT:    mov.s $f14, $f12
+; MIPS32-O32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-O32-NEXT:    lh $1, 0($16)
+; MIPS32-O32-NEXT:    fill.h $w0, $1
+; MIPS32-O32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-O32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-O32-NEXT:    lw $25, %call16(fmaxf)($gp)
+; MIPS32-O32-NEXT:    jalr $25
+; MIPS32-O32-NEXT:    mtc1 $1, $f12
+; MIPS32-O32-NEXT:    mfc1 $1, $f0
+; MIPS32-O32-NEXT:    fill.w $w0, $1
+; MIPS32-O32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-O32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-O32-NEXT:    sh $1, 0($16)
+; MIPS32-O32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-O32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-O32-NEXT:    jr $ra
+; MIPS32-O32-NEXT:    addiu $sp, $sp, 24
 ;
-; MIPS64-N32-LABEL: fmaxnum:
-; MIPS64-N32:       # %bb.0: # %entry
-; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
-; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
-; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
-; MIPS64-N32-NEXT:    .cfi_offset 31, -8
-; MIPS64-N32-NEXT:    .cfi_offset 28, -16
-; MIPS64-N32-NEXT:    .cfi_offset 16, -24
-; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fmaxnum)))
-; MIPS64-N32-NEXT:    addu $1, $1, $25
-; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fmaxnum)))
-; MIPS64-N32-NEXT:    mov.s $f13, $f12
-; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
-; MIPS64-N32-NEXT:    lh $1, 0($16)
-; MIPS64-N32-NEXT:    fill.h $w0, $1
-; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
-; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
-; MIPS64-N32-NEXT:    lw $25, %call16(fmaxf)($gp)
-; MIPS64-N32-NEXT:    jalr $25
-; MIPS64-N32-NEXT:    mtc1 $1, $f12
-; MIPS64-N32-NEXT:    mfc1 $1, $f0
-; MIPS64-N32-NEXT:    fill.w $w0, $1
-; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
-; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
-; MIPS64-N32-NEXT:    sh $1, 0($16)
-; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
-; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64-N32-NEXT:    jr $ra
-; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+; MIPS64R5-N32-LABEL: fmaxnum:
+; MIPS64R5-N32:       # %bb.0: # %entry
+; MIPS64R5-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64R5-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64R5-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64R5-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64R5-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64R5-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64R5-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fmaxnum)))
+; MIPS64R5-N32-NEXT:    addu $1, $1, $25
+; MIPS64R5-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPS64R5-N32-NEXT:    mov.s $f13, $f12
+; MIPS64R5-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64R5-N32-NEXT:    lh $1, 0($16)
+; MIPS64R5-N32-NEXT:    fill.h $w0, $1
+; MIPS64R5-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64R5-N32-NEXT:    lw $25, %call16(fmaxf)($gp)
+; MIPS64R5-N32-NEXT:    jalr $25
+; MIPS64R5-N32-NEXT:    mtc1 $1, $f12
+; MIPS64R5-N32-NEXT:    mfc1 $1, $f0
+; MIPS64R5-N32-NEXT:    fill.w $w0, $1
+; MIPS64R5-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64R5-N32-NEXT:    sh $1, 0($16)
+; MIPS64R5-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64R5-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64R5-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64R5-N32-NEXT:    jr $ra
+; MIPS64R5-N32-NEXT:    addiu $sp, $sp, 32
 ;
-; MIPS64-N64-LABEL: fmaxnum:
-; MIPS64-N64:       # %bb.0: # %entry
-; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
-; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
-; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
-; MIPS64-N64-NEXT:    .cfi_offset 31, -8
-; MIPS64-N64-NEXT:    .cfi_offset 28, -16
-; MIPS64-N64-NEXT:    .cfi_offset 16, -24
-; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fmaxnum)))
-; MIPS64-N64-NEXT:    daddu $1, $1, $25
-; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fmaxnum)))
-; MIPS64-N64-NEXT:    mov.s $f13, $f12
-; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
-; MIPS64-N64-NEXT:    lh $1, 0($16)
-; MIPS64-N64-NEXT:    fill.h $w0, $1
-; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
-; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
-; MIPS64-N64-NEXT:    ld $25, %call16(fmaxf)($gp)
-; MIPS64-N64-NEXT:    jalr $25
-; MIPS64-N64-NEXT:    mtc1 $1, $f12
-; MIPS64-N64-NEXT:    mfc1 $1, $f0
-; MIPS64-N64-NEXT:    fill.w $w0, $1
-; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
-; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
-; MIPS64-N64-NEXT:    sh $1, 0($16)
-; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
-; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64-N64-NEXT:    jr $ra
-; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5-N64-LABEL: fmaxnum:
+; MIPS64R5-N64:       # %bb.0: # %entry
+; MIPS64R5-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64R5-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64R5-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64R5-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64R5-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64R5-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fmaxnum)))
+; MIPS64R5-N64-NEXT:    daddu $1, $1, $25
+; MIPS64R5-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPS64R5-N64-NEXT:    mov.s $f13, $f12
+; MIPS64R5-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64R5-N64-NEXT:    lh $1, 0($16)
+; MIPS64R5-N64-NEXT:    fill.h $w0, $1
+; MIPS64R5-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64R5-N64-NEXT:    ld $25, %call16(fmaxf)($gp)
+; MIPS64R5-N64-NEXT:    jalr $25
+; MIPS64R5-N64-NEXT:    mtc1 $1, $f12
+; MIPS64R5-N64-NEXT:    mfc1 $1, $f0
+; MIPS64R5-N64-NEXT:    fill.w $w0, $1
+; MIPS64R5-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64R5-N64-NEXT:    sh $1, 0($16)
+; MIPS64R5-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64R5-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64R5-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64R5-N64-NEXT:    jr $ra
+; MIPS64R5-N64-NEXT:    daddiu $sp, $sp, 32
+;
+; MIPSR6-O32-LABEL: fmaxnum:
+; MIPSR6-O32:       # %bb.0:
+; MIPSR6-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPSR6-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPSR6-O32-NEXT:    addu $1, $2, $25
+; MIPSR6-O32-NEXT:    lw $1, %got(g)($1)
+; MIPSR6-O32-NEXT:    lh $2, 0($1)
+; MIPSR6-O32-NEXT:    fill.h $w0, $2
+; MIPSR6-O32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-O32-NEXT:    mtc1 $2, $f0
+; MIPSR6-O32-NEXT:    max.s $f0, $f0, $f12
+; MIPSR6-O32-NEXT:    mfc1 $2, $f0
+; MIPSR6-O32-NEXT:    fill.w $w0, $2
+; MIPSR6-O32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-O32-NEXT:    copy_u.h $2, $w0[0]
+; MIPSR6-O32-NEXT:    jr $ra
+; MIPSR6-O32-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-N32-LABEL: fmaxnum:
+; MIPSR6-N32:       # %bb.0:
+; MIPSR6-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fmaxnum)))
+; MIPSR6-N32-NEXT:    addu $1, $1, $25
+; MIPSR6-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPSR6-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPSR6-N32-NEXT:    lh $2, 0($1)
+; MIPSR6-N32-NEXT:    fill.h $w0, $2
+; MIPSR6-N32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-N32-NEXT:    mtc1 $2, $f0
+; MIPSR6-N32-NEXT:    max.s $f0, $f0, $f12
+; MIPSR6-N32-NEXT:    mfc1 $2, $f0
+; MIPSR6-N32-NEXT:    fill.w $w0, $2
+; MIPSR6-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPSR6-N32-NEXT:    jr $ra
+; MIPSR6-N32-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-N64-LABEL: fmaxnum:
+; MIPSR6-N64:       # %bb.0:
+; MIPSR6-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fmaxnum)))
+; MIPSR6-N64-NEXT:    daddu $1, $1, $25
+; MIPSR6-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPSR6-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPSR6-N64-NEXT:    lh $2, 0($1)
+; MIPSR6-N64-NEXT:    fill.h $w0, $2
+; MIPSR6-N64-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-N64-NEXT:    mtc1 $2, $f0
+; MIPSR6-N64-NEXT:    max.s $f0, $f0, $f12
+; MIPSR6-N64-NEXT:    mfc1 $2, $f0
+; MIPSR6-N64-NEXT:    fill.w $w0, $2
+; MIPSR6-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPSR6-N64-NEXT:    jr $ra
+; MIPSR6-N64-NEXT:    sh $2, 0($1)
 entry:
   %0 = load i16, ptr @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index 9d058662c271..20c7ae5908d2 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
+; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
+; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
 
 declare i32 @llvm.nvvm.rotate.b32(i32, i32)
@@ -11,11 +12,35 @@ declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
 ; SM20: rotate32
 ; SM35: rotate32
 define i32 @rotate32(i32 %a, i32 %b) {
-; SM20: shl.b32
-; SM20: sub.s32
-; SM20: shr.b32
-; SM20: add.u32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotate32(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<4>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u32 %r1, [rotate32_param_0];
+; SM20-NEXT:    ld.param.u32 %r2, [rotate32_param_1];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b32 %lhs;
+; SM20-NEXT:    .reg .b32 %rhs;
+; SM20-NEXT:    .reg .b32 %amt2;
+; SM20-NEXT:    shl.b32 %lhs, %r1, %r2;
+; SM20-NEXT:    sub.s32 %amt2, 32, %r2;
+; SM20-NEXT:    shr.b32 %rhs, %r1, %amt2;
+; SM20-NEXT:    add.u32 %r3, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotate32(
+; SM35:       {
+; SM35-NEXT:    .reg .b32 %r<4>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u32 %r1, [rotate32_param_0];
+; SM35-NEXT:    ld.param.u32 %r2, [rotate32_param_1];
+; SM35-NEXT:    shf.l.wrap.b32 %r3, %r1, %r1, %r2;
+; SM35-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; SM35-NEXT:    ret;
   %val = tail call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 %b)
   ret i32 %val
 }
@@ -23,12 +48,48 @@ define i32 @rotate32(i32 %a, i32 %b) {
 ; SM20: rotate64
 ; SM35: rotate64
 define i64 @rotate64(i64 %a, i32 %b) {
-; SM20: shl.b64
-; SM20: sub.u32
-; SM20: shr.b64
-; SM20: add.u64
-; SM35: shf.l.wrap.b32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotate64(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<2>;
+; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [rotate64_param_0];
+; SM20-NEXT:    ld.param.u32 %r1, [rotate64_param_1];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b64 %lhs;
+; SM20-NEXT:    .reg .b64 %rhs;
+; SM20-NEXT:    .reg .u32 %amt2;
+; SM20-NEXT:    and.b32 %amt2, %r1, 63;
+; SM20-NEXT:    shl.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT:    sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT:    shr.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotate64(
+; SM35:       {
+; SM35-NEXT:    .reg .b32 %r<6>;
+; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u64 %rd1, [rotate64_param_0];
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b32 %dummy;
+; SM35-NEXT:    mov.b64 {%dummy,%r1}, %rd1;
+; SM35-NEXT:    }
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b32 %dummy;
+; SM35-NEXT:    mov.b64 {%r2,%dummy}, %rd1;
+; SM35-NEXT:    }
+; SM35-NEXT:    ld.param.u32 %r3, [rotate64_param_1];
+; SM35-NEXT:    shf.l.wrap.b32 %r4, %r2, %r1, %r3;
+; SM35-NEXT:    shf.l.wrap.b32 %r5, %r1, %r2, %r3;
+; SM35-NEXT:    mov.b64 %rd2, {%r5, %r4};
+; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ret;
   %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b)
   ret i64 %val
 }
@@ -36,12 +97,48 @@ define i64 @rotate64(i64 %a, i32 %b) {
 ; SM20: rotateright64
 ; SM35: rotateright64
 define i64 @rotateright64(i64 %a, i32 %b) {
-; SM20: shr.b64
-; SM20: sub.u32
-; SM20: shl.b64
-; SM20: add.u64
-; SM35: shf.r.wrap.b32
-; SM35: shf.r.wrap.b32
+; SM20-LABEL: rotateright64(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<2>;
+; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [rotateright64_param_0];
+; SM20-NEXT:    ld.param.u32 %r1, [rotateright64_param_1];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b64 %lhs;
+; SM20-NEXT:    .reg .b64 %rhs;
+; SM20-NEXT:    .reg .u32 %amt2;
+; SM20-NEXT:    and.b32 %amt2, %r1, 63;
+; SM20-NEXT:    shr.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT:    sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT:    shl.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotateright64(
+; SM35:       {
+; SM35-NEXT:    .reg .b32 %r<6>;
+; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u64 %rd1, [rotateright64_param_0];
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b32 %dummy;
+; SM35-NEXT:    mov.b64 {%r1,%dummy}, %rd1;
+; SM35-NEXT:    }
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b32 %dummy;
+; SM35-NEXT:    mov.b64 {%dummy,%r2}, %rd1;
+; SM35-NEXT:    }
+; SM35-NEXT:    ld.param.u32 %r3, [rotateright64_param_1];
+; SM35-NEXT:    shf.r.wrap.b32 %r4, %r2, %r1, %r3;
+; SM35-NEXT:    shf.r.wrap.b32 %r5, %r1, %r2, %r3;
+; SM35-NEXT:    mov.b64 %rd2, {%r5, %r4};
+; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ret;
   %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b)
   ret i64 %val
 }
@@ -49,12 +146,204 @@ define i64 @rotateright64(i64 %a, i32 %b) {
 ; SM20: rotl0
 ; SM35: rotl0
 define i32 @rotl0(i32 %x) {
-; SM20: shl.b32
-; SM20: shr.b32
-; SM20: add.u32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotl0(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u32 %r1, [rotl0_param_0];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b32 %lhs;
+; SM20-NEXT:    .reg .b32 %rhs;
+; SM20-NEXT:    shl.b32 %lhs, %r1, 8;
+; SM20-NEXT:    shr.b32 %rhs, %r1, 24;
+; SM20-NEXT:    add.u32 %r2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotl0(
+; SM35:       {
+; SM35-NEXT:    .reg .b32 %r<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u32 %r1, [rotl0_param_0];
+; SM35-NEXT:    shf.l.wrap.b32 %r2, %r1, %r1, 8;
+; SM35-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; SM35-NEXT:    ret;
   %t0 = shl i32 %x, 8
   %t1 = lshr i32 %x, 24
   %t2 = or i32 %t0, %t1
   ret i32 %t2
 }
+
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+
+; SM35: rotl64
+define i64 @rotl64(i64 %a, i64 %n) {
+; SM20-LABEL: rotl64(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<2>;
+; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [rotl64_param_0];
+; SM20-NEXT:    ld.param.u32 %r1, [rotl64_param_1];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b64 %lhs;
+; SM20-NEXT:    .reg .b64 %rhs;
+; SM20-NEXT:    .reg .u32 %amt2;
+; SM20-NEXT:    and.b32 %amt2, %r1, 63;
+; SM20-NEXT:    shl.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT:    sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT:    shr.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotl64(
+; SM35:       {
+; SM35-NEXT:    .reg .b32 %r<2>;
+; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u64 %rd1, [rotl64_param_0];
+; SM35-NEXT:    ld.param.u32 %r1, [rotl64_param_1];
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b64 %lhs;
+; SM35-NEXT:    .reg .b64 %rhs;
+; SM35-NEXT:    .reg .u32 %amt2;
+; SM35-NEXT:    and.b32 %amt2, %r1, 63;
+; SM35-NEXT:    shl.b64 %lhs, %rd1, %amt2;
+; SM35-NEXT:    sub.u32 %amt2, 64, %amt2;
+; SM35-NEXT:    shr.b64 %rhs, %rd1, %amt2;
+; SM35-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT:    }
+; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ret;
+  %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
+  ret i64 %val
+}
+
+; SM35: rotl64_imm
+define i64 @rotl64_imm(i64 %a) {
+; SM20-LABEL: rotl64_imm(
+; SM20:       {
+; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [rotl64_imm_param_0];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b64 %lhs;
+; SM20-NEXT:    .reg .b64 %rhs;
+; SM20-NEXT:    shl.b64 %lhs, %rd1, 2;
+; SM20-NEXT:    shr.b64 %rhs, %rd1, 62;
+; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotl64_imm(
+; SM35:       {
+; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u64 %rd1, [rotl64_imm_param_0];
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b64 %lhs;
+; SM35-NEXT:    .reg .b64 %rhs;
+; SM35-NEXT:    shl.b64 %lhs, %rd1, 2;
+; SM35-NEXT:    shr.b64 %rhs, %rd1, 62;
+; SM35-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT:    }
+; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ret;
+  %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
+  ret i64 %val
+}
+
+; SM35: rotr64
+define i64 @rotr64(i64 %a, i64 %n) {
+; SM20-LABEL: rotr64(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<2>;
+; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [rotr64_param_0];
+; SM20-NEXT:    ld.param.u32 %r1, [rotr64_param_1];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b64 %lhs;
+; SM20-NEXT:    .reg .b64 %rhs;
+; SM20-NEXT:    .reg .u32 %amt2;
+; SM20-NEXT:    and.b32 %amt2, %r1, 63;
+; SM20-NEXT:    shr.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT:    sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT:    shl.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotr64(
+; SM35:       {
+; SM35-NEXT:    .reg .b32 %r<2>;
+; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u64 %rd1, [rotr64_param_0];
+; SM35-NEXT:    ld.param.u32 %r1, [rotr64_param_1];
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b64 %lhs;
+; SM35-NEXT:    .reg .b64 %rhs;
+; SM35-NEXT:    .reg .u32 %amt2;
+; SM35-NEXT:    and.b32 %amt2, %r1, 63;
+; SM35-NEXT:    shr.b64 %lhs, %rd1, %amt2;
+; SM35-NEXT:    sub.u32 %amt2, 64, %amt2;
+; SM35-NEXT:    shl.b64 %rhs, %rd1, %amt2;
+; SM35-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT:    }
+; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ret;
+  %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
+  ret i64 %val
+}
+
+; SM35: rotr64_imm
+define i64 @rotr64_imm(i64 %a) {
+; SM20-LABEL: rotr64_imm(
+; SM20:       {
+; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [rotr64_imm_param_0];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b64 %lhs;
+; SM20-NEXT:    .reg .b64 %rhs;
+; SM20-NEXT:    shl.b64 %lhs, %rd1, 62;
+; SM20-NEXT:    shr.b64 %rhs, %rd1, 2;
+; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotr64_imm(
+; SM35:       {
+; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u64 %rd1, [rotr64_imm_param_0];
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b64 %lhs;
+; SM35-NEXT:    .reg .b64 %rhs;
+; SM35-NEXT:    shl.b64 %lhs, %rd1, 62;
+; SM35-NEXT:    shr.b64 %rhs, %rd1, 2;
+; SM35-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT:    }
+; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ret;
+  %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
+  ret i64 %val
+}
diff --git a/llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll
new file mode 100644
index 000000000000..6d9eb1337682
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s
+
+define fp128 @f128_minimum(fp128 %a, fp128 %b) {
+; CHECK-LABEL: f128_minimum:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xscmpuqp 0, 2, 3
+; CHECK-NEXT:    vmr 4, 2
+; CHECK-NEXT:    bge 0, .LBB0_8
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    bun 0, .LBB0_9
+; CHECK-NEXT:  .LBB0_2: # %entry
+; CHECK-NEXT:    xststdcqp 0, 2, 4
+; CHECK-NEXT:    bc 4, 2, .LBB0_10
+; CHECK-NEXT:  .LBB0_3: # %entry
+; CHECK-NEXT:    xststdcqp 0, 3, 4
+; CHECK-NEXT:    bc 12, 2, .LBB0_5
+; CHECK-NEXT:  .LBB0_4: # %entry
+; CHECK-NEXT:    vmr 3, 2
+; CHECK-NEXT:  .LBB0_5: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI0_1@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI0_1@toc@l
+; CHECK-NEXT:    lxv 34, 0(3)
+; CHECK-NEXT:    xscmpuqp 0, 4, 2
+; CHECK-NEXT:    beq 0, .LBB0_7
+; CHECK-NEXT:  # %bb.6: # %entry
+; CHECK-NEXT:    vmr 3, 4
+; CHECK-NEXT:  .LBB0_7: # %entry
+; CHECK-NEXT:    vmr 2, 3
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB0_8: # %entry
+; CHECK-NEXT:    vmr 4, 3
+; CHECK-NEXT:    bnu 0, .LBB0_2
+; CHECK-NEXT:  .LBB0_9:
+; CHECK-NEXT:    addis 3, 2, .LCPI0_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI0_0@toc@l
+; CHECK-NEXT:    lxv 36, 0(3)
+; CHECK-NEXT:    xststdcqp 0, 2, 4
+; CHECK-NEXT:    bc 12, 2, .LBB0_3
+; CHECK-NEXT:  .LBB0_10: # %entry
+; CHECK-NEXT:    vmr 2, 4
+; CHECK-NEXT:    xststdcqp 0, 3, 4
+; CHECK-NEXT:    bc 4, 2, .LBB0_4
+; CHECK-NEXT:    b .LBB0_5
+entry:
+  %m = call fp128 @llvm.minimum.f128(fp128 %a, fp128 %b)
+  ret fp128 %m
+}
+
+define fp128 @f128_maximum(fp128 %a, fp128 %b) {
+; CHECK-LABEL: f128_maximum:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xscmpuqp 0, 2, 3
+; CHECK-NEXT:    vmr 4, 2
+; CHECK-NEXT:    ble 0, .LBB1_8
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    bun 0, .LBB1_9
+; CHECK-NEXT:  .LBB1_2: # %entry
+; CHECK-NEXT:    xststdcqp 0, 2, 8
+; CHECK-NEXT:    bc 4, 2, .LBB1_10
+; CHECK-NEXT:  .LBB1_3: # %entry
+; CHECK-NEXT:    xststdcqp 0, 3, 8
+; CHECK-NEXT:    bc 12, 2, .LBB1_5
+; CHECK-NEXT:  .LBB1_4: # %entry
+; CHECK-NEXT:    vmr 3, 2
+; CHECK-NEXT:  .LBB1_5: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI1_1@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI1_1@toc@l
+; CHECK-NEXT:    lxv 34, 0(3)
+; CHECK-NEXT:    xscmpuqp 0, 4, 2
+; CHECK-NEXT:    beq 0, .LBB1_7
+; CHECK-NEXT:  # %bb.6: # %entry
+; CHECK-NEXT:    vmr 3, 4
+; CHECK-NEXT:  .LBB1_7: # %entry
+; CHECK-NEXT:    vmr 2, 3
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB1_8: # %entry
+; CHECK-NEXT:    vmr 4, 3
+; CHECK-NEXT:    bnu 0, .LBB1_2
+; CHECK-NEXT:  .LBB1_9:
+; CHECK-NEXT:    addis 3, 2, .LCPI1_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI1_0@toc@l
+; CHECK-NEXT:    lxv 36, 0(3)
+; CHECK-NEXT:    xststdcqp 0, 2, 8
+; CHECK-NEXT:    bc 12, 2, .LBB1_3
+; CHECK-NEXT:  .LBB1_10: # %entry
+; CHECK-NEXT:    vmr 2, 4
+; CHECK-NEXT:    xststdcqp 0, 3, 8
+; CHECK-NEXT:    bc 4, 2, .LBB1_4
+; CHECK-NEXT:    b .LBB1_5
+entry:
+  %m = call fp128 @llvm.maximum.f128(fp128 %a, fp128 %b)
+  ret fp128 %m
+}
+
+declare fp128 @llvm.minimum.f128(fp128, fp128)
+declare fp128 @llvm.maximum.f128(fp128, fp128)
diff --git a/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll
new file mode 100644
index 000000000000..c33875dbfee4
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll
@@ -0,0 +1,847 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s --check-prefix=NOVSX
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s --check-prefix=VSX
+; RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr8 < %s | FileCheck %s --check-prefix=AIX
+
+define float @f32_minimum(float %a, float %b) {
+; NOVSX-LABEL: f32_minimum:
+; NOVSX:       # %bb.0: # %entry
+; NOVSX-NEXT:    fcmpu 0, 1, 2
+; NOVSX-NEXT:    fmr 0, 1
+; NOVSX-NEXT:    stfs 2, -8(1)
+; NOVSX-NEXT:    stfs 1, -4(1)
+; NOVSX-NEXT:    bc 12, 0, .LBB0_2
+; NOVSX-NEXT:  # %bb.1: # %entry
+; NOVSX-NEXT:    fmr 0, 2
+; NOVSX-NEXT:  .LBB0_2: # %entry
+; NOVSX-NEXT:    lwz 3, -4(1)
+; NOVSX-NEXT:    bc 4, 3, .LBB0_4
+; NOVSX-NEXT:  # %bb.3:
+; NOVSX-NEXT:    addis 4, 2, .LCPI0_0@toc@ha
+; NOVSX-NEXT:    lfs 0, .LCPI0_0@toc@l(4)
+; NOVSX-NEXT:  .LBB0_4: # %entry
+; NOVSX-NEXT:    xoris 3, 3, 32768
+; NOVSX-NEXT:    lwz 4, -8(1)
+; NOVSX-NEXT:    cmplwi 3, 0
+; NOVSX-NEXT:    bc 12, 2, .LBB0_6
+; NOVSX-NEXT:  # %bb.5: # %entry
+; NOVSX-NEXT:    fmr 1, 0
+; NOVSX-NEXT:  .LBB0_6: # %entry
+; NOVSX-NEXT:    xoris 3, 4, 32768
+; NOVSX-NEXT:    cmplwi 3, 0
+; NOVSX-NEXT:    bc 12, 2, .LBB0_8
+; NOVSX-NEXT:  # %bb.7: # %entry
+; NOVSX-NEXT:    fmr 2, 1
+; NOVSX-NEXT:  .LBB0_8: # %entry
+; NOVSX-NEXT:    addis 3, 2, .LCPI0_1@toc@ha
+; NOVSX-NEXT:    lfs 1, .LCPI0_1@toc@l(3)
+; NOVSX-NEXT:    fcmpu 0, 0, 1
+; NOVSX-NEXT:    bc 12, 2, .LBB0_10
+; NOVSX-NEXT:  # %bb.9: # %entry
+; NOVSX-NEXT:    fmr 2, 0
+; NOVSX-NEXT:  .LBB0_10: # %entry
+; NOVSX-NEXT:    fmr 1, 2
+; NOVSX-NEXT:    blr
+;
+; VSX-LABEL: f32_minimum:
+; VSX:       # %bb.0: # %entry
+; VSX-NEXT:    xscvdpspn 0, 1
+; VSX-NEXT:    fcmpu 0, 1, 2
+; VSX-NEXT:    xscvdpspn 3, 2
+; VSX-NEXT:    mffprwz 3, 0
+; VSX-NEXT:    bc 12, 3, .LBB0_2
+; VSX-NEXT:  # %bb.1: # %entry
+; VSX-NEXT:    xsmindp 0, 1, 2
+; VSX-NEXT:    b .LBB0_3
+; VSX-NEXT:  .LBB0_2:
+; VSX-NEXT:    addis 4, 2, .LCPI0_0@toc@ha
+; VSX-NEXT:    lfs 0, .LCPI0_0@toc@l(4)
+; VSX-NEXT:  .LBB0_3: # %entry
+; VSX-NEXT:    xoris 3, 3, 32768
+; VSX-NEXT:    mffprwz 4, 3
+; VSX-NEXT:    cmplwi 3, 0
+; VSX-NEXT:    bc 12, 2, .LBB0_5
+; VSX-NEXT:  # %bb.4: # %entry
+; VSX-NEXT:    fmr 1, 0
+; VSX-NEXT:  .LBB0_5: # %entry
+; VSX-NEXT:    xoris 3, 4, 32768
+; VSX-NEXT:    cmplwi 3, 0
+; VSX-NEXT:    bc 12, 2, .LBB0_7
+; VSX-NEXT:  # %bb.6: # %entry
+; VSX-NEXT:    fmr 2, 1
+; VSX-NEXT:  .LBB0_7: # %entry
+; VSX-NEXT:    xxlxor 1, 1, 1
+; VSX-NEXT:    fcmpu 0, 0, 1
+; VSX-NEXT:    bc 12, 2, .LBB0_9
+; VSX-NEXT:  # %bb.8: # %entry
+; VSX-NEXT:    fmr 2, 0
+; VSX-NEXT:  .LBB0_9: # %entry
+; VSX-NEXT:    fmr 1, 2
+; VSX-NEXT:    blr
+;
+; AIX-LABEL: f32_minimum:
+; AIX:       # %bb.0: # %entry
+; AIX-NEXT:    xscvdpspn 0, 1
+; AIX-NEXT:    fcmpu 0, 1, 2
+; AIX-NEXT:    xscvdpspn 3, 2
+; AIX-NEXT:    mffprwz 3, 0
+; AIX-NEXT:    bc 12, 3, L..BB0_2
+; AIX-NEXT:  # %bb.1: # %entry
+; AIX-NEXT:    xsmindp 0, 1, 2
+; AIX-NEXT:    b L..BB0_3
+; AIX-NEXT:  L..BB0_2:
+; AIX-NEXT:    ld 4, L..C0(2) # %const.0
+; AIX-NEXT:    lfs 0, 0(4)
+; AIX-NEXT:  L..BB0_3: # %entry
+; AIX-NEXT:    xoris 3, 3, 32768
+; AIX-NEXT:    mffprwz 4, 3
+; AIX-NEXT:    cmplwi 3, 0
+; AIX-NEXT:    bc 12, 2, L..BB0_5
+; AIX-NEXT:  # %bb.4: # %entry
+; AIX-NEXT:    fmr 1, 0
+; AIX-NEXT:  L..BB0_5: # %entry
+; AIX-NEXT:    xoris 3, 4, 32768
+; AIX-NEXT:    cmplwi 3, 0
+; AIX-NEXT:    bc 12, 2, L..BB0_7
+; AIX-NEXT:  # %bb.6: # %entry
+; AIX-NEXT:    fmr 2, 1
+; AIX-NEXT:  L..BB0_7: # %entry
+; AIX-NEXT:    xxlxor 1, 1, 1
+; AIX-NEXT:    fcmpu 0, 0, 1
+; AIX-NEXT:    bc 12, 2, L..BB0_9
+; AIX-NEXT:  # %bb.8: # %entry
+; AIX-NEXT:    fmr 2, 0
+; AIX-NEXT:  L..BB0_9: # %entry
+; AIX-NEXT:    fmr 1, 2
+; AIX-NEXT:    blr
+entry:
+  %m = call float @llvm.minimum.f32(float %a, float %b)
+  ret float %m
+}
+
+define float @f32_maximum(float %a, float %b) {
+; NOVSX-LABEL: f32_maximum:
+; NOVSX:       # %bb.0: # %entry
+; NOVSX-NEXT:    fcmpu 0, 1, 2
+; NOVSX-NEXT:    fmr 0, 1
+; NOVSX-NEXT:    stfs 2, -8(1)
+; NOVSX-NEXT:    stfs 1, -4(1)
+; NOVSX-NEXT:    bc 12, 1, .LBB1_2
+; NOVSX-NEXT:  # %bb.1: # %entry
+; NOVSX-NEXT:    fmr 0, 2
+; NOVSX-NEXT:  .LBB1_2: # %entry
+; NOVSX-NEXT:    lwz 3, -4(1)
+; NOVSX-NEXT:    bc 4, 3, .LBB1_4
+; NOVSX-NEXT:  # %bb.3:
+; NOVSX-NEXT:    addis 4, 2, .LCPI1_0@toc@ha
+; NOVSX-NEXT:    lfs 0, .LCPI1_0@toc@l(4)
+; NOVSX-NEXT:  .LBB1_4: # %entry
+; NOVSX-NEXT:    cmpwi 3, 0
+; NOVSX-NEXT:    lwz 4, -8(1)
+; NOVSX-NEXT:    bc 12, 2, .LBB1_6
+; NOVSX-NEXT:  # %bb.5: # %entry
+; NOVSX-NEXT:    fmr 1, 0
+; NOVSX-NEXT:  .LBB1_6: # %entry
+; NOVSX-NEXT:    cmpwi 4, 0
+; NOVSX-NEXT:    bc 12, 2, .LBB1_8
+; NOVSX-NEXT:  # %bb.7: # %entry
+; NOVSX-NEXT:    fmr 2, 1
+; NOVSX-NEXT:  .LBB1_8: # %entry
+; NOVSX-NEXT:    addis 3, 2, .LCPI1_1@toc@ha
+; NOVSX-NEXT:    lfs 1, .LCPI1_1@toc@l(3)
+; NOVSX-NEXT:    fcmpu 0, 0, 1
+; NOVSX-NEXT:    bc 12, 2, .LBB1_10
+; NOVSX-NEXT:  # %bb.9: # %entry
+; NOVSX-NEXT:    fmr 2, 0
+; NOVSX-NEXT:  .LBB1_10: # %entry
+; NOVSX-NEXT:    fmr 1, 2
+; NOVSX-NEXT:    blr
+;
+; VSX-LABEL: f32_maximum:
+; VSX:       # %bb.0: # %entry
+; VSX-NEXT:    xscvdpspn 0, 1
+; VSX-NEXT:    fcmpu 0, 1, 2
+; VSX-NEXT:    xscvdpspn 3, 2
+; VSX-NEXT:    mffprwz 3, 0
+; VSX-NEXT:    bc 12, 3, .LBB1_2
+; VSX-NEXT:  # %bb.1: # %entry
+; VSX-NEXT:    xsmaxdp 0, 1, 2
+; VSX-NEXT:    b .LBB1_3
+; VSX-NEXT:  .LBB1_2:
+; VSX-NEXT:    addis 4, 2, .LCPI1_0@toc@ha
+; VSX-NEXT:    lfs 0, .LCPI1_0@toc@l(4)
+; VSX-NEXT:  .LBB1_3: # %entry
+; VSX-NEXT:    mffprwz 4, 3
+; VSX-NEXT:    cmpwi 3, 0
+; VSX-NEXT:    bc 12, 2, .LBB1_5
+; VSX-NEXT:  # %bb.4: # %entry
+; VSX-NEXT:    fmr 1, 0
+; VSX-NEXT:  .LBB1_5: # %entry
+; VSX-NEXT:    cmpwi 4, 0
+; VSX-NEXT:    bc 12, 2, .LBB1_7
+; VSX-NEXT:  # %bb.6: # %entry
+; VSX-NEXT:    fmr 2, 1
+; VSX-NEXT:  .LBB1_7: # %entry
+; VSX-NEXT:    xxlxor 1, 1, 1
+; VSX-NEXT:    fcmpu 0, 0, 1
+; VSX-NEXT:    bc 12, 2, .LBB1_9
+; VSX-NEXT:  # %bb.8: # %entry
+; VSX-NEXT:    fmr 2, 0
+; VSX-NEXT:  .LBB1_9: # %entry
+; VSX-NEXT:    fmr 1, 2
+; VSX-NEXT:    blr
+;
+; AIX-LABEL: f32_maximum:
+; AIX:       # %bb.0: # %entry
+; AIX-NEXT:    xscvdpspn 0, 1
+; AIX-NEXT:    fcmpu 0, 1, 2
+; AIX-NEXT:    xscvdpspn 3, 2
+; AIX-NEXT:    mffprwz 3, 0
+; AIX-NEXT:    bc 12, 3, L..BB1_2
+; AIX-NEXT:  # %bb.1: # %entry
+; AIX-NEXT:    xsmaxdp 0, 1, 2
+; AIX-NEXT:    b L..BB1_3
+; AIX-NEXT:  L..BB1_2:
+; AIX-NEXT:    ld 4, L..C1(2) # %const.0
+; AIX-NEXT:    lfs 0, 0(4)
+; AIX-NEXT:  L..BB1_3: # %entry
+; AIX-NEXT:    mffprwz 4, 3
+; AIX-NEXT:    cmpwi 3, 0
+; AIX-NEXT:    bc 12, 2, L..BB1_5
+; AIX-NEXT:  # %bb.4: # %entry
+; AIX-NEXT:    fmr 1, 0
+; AIX-NEXT:  L..BB1_5: # %entry
+; AIX-NEXT:    cmpwi 4, 0
+; AIX-NEXT:    bc 12, 2, L..BB1_7
+; AIX-NEXT:  # %bb.6: # %entry
+; AIX-NEXT:    fmr 2, 1
+; AIX-NEXT:  L..BB1_7: # %entry
+; AIX-NEXT:    xxlxor 1, 1, 1
+; AIX-NEXT:    fcmpu 0, 0, 1
+; AIX-NEXT:    bc 12, 2, L..BB1_9
+; AIX-NEXT:  # %bb.8: # %entry
+; AIX-NEXT:    fmr 2, 0
+; AIX-NEXT:  L..BB1_9: # %entry
+; AIX-NEXT:    fmr 1, 2
+; AIX-NEXT:    blr
+entry:
+  %m = call float @llvm.maximum.f32(float %a, float %b)
+  ret float %m
+}
+
+define double @f64_minimum(double %a, double %b) {
+; NOVSX-LABEL: f64_minimum:
+; NOVSX:       # %bb.0: # %entry
+; NOVSX-NEXT:    fcmpu 0, 1, 2
+; NOVSX-NEXT:    fmr 0, 1
+; NOVSX-NEXT:    stfd 2, -16(1)
+; NOVSX-NEXT:    stfd 1, -8(1)
+; NOVSX-NEXT:    bc 12, 0, .LBB2_2
+; NOVSX-NEXT:  # %bb.1: # %entry
+; NOVSX-NEXT:    fmr 0, 2
+; NOVSX-NEXT:  .LBB2_2: # %entry
+; NOVSX-NEXT:    ld 3, -8(1)
+; NOVSX-NEXT:    bc 4, 3, .LBB2_4
+; NOVSX-NEXT:  # %bb.3:
+; NOVSX-NEXT:    addis 4, 2, .LCPI2_0@toc@ha
+; NOVSX-NEXT:    lfs 0, .LCPI2_0@toc@l(4)
+; NOVSX-NEXT:  .LBB2_4: # %entry
+; NOVSX-NEXT:    li 5, 1
+; NOVSX-NEXT:    ld 4, -16(1)
+; NOVSX-NEXT:    rldic 5, 5, 63, 0
+; NOVSX-NEXT:    cmpd 3, 5
+; NOVSX-NEXT:    bc 12, 2, .LBB2_6
+; NOVSX-NEXT:  # %bb.5: # %entry
+; NOVSX-NEXT:    fmr 1, 0
+; NOVSX-NEXT:  .LBB2_6: # %entry
+; NOVSX-NEXT:    cmpd 4, 5
+; NOVSX-NEXT:    bc 12, 2, .LBB2_8
+; NOVSX-NEXT:  # %bb.7: # %entry
+; NOVSX-NEXT:    fmr 2, 1
+; NOVSX-NEXT:  .LBB2_8: # %entry
+; NOVSX-NEXT:    addis 3, 2, .LCPI2_1@toc@ha
+; NOVSX-NEXT:    lfs 1, .LCPI2_1@toc@l(3)
+; NOVSX-NEXT:    fcmpu 0, 0, 1
+; NOVSX-NEXT:    bc 12, 2, .LBB2_10
+; NOVSX-NEXT:  # %bb.9: # %entry
+; NOVSX-NEXT:    fmr 2, 0
+; NOVSX-NEXT:  .LBB2_10: # %entry
+; NOVSX-NEXT:    fmr 1, 2
+; NOVSX-NEXT:    blr
+;
+; VSX-LABEL: f64_minimum:
+; VSX:       # %bb.0: # %entry
+; VSX-NEXT:    fcmpu 0, 1, 2
+; VSX-NEXT:    mffprd 3, 1
+; VSX-NEXT:    bc 12, 3, .LBB2_2
+; VSX-NEXT:  # %bb.1: # %entry
+; VSX-NEXT:    xsmindp 0, 1, 2
+; VSX-NEXT:    b .LBB2_3
+; VSX-NEXT:  .LBB2_2:
+; VSX-NEXT:    addis 4, 2, .LCPI2_0@toc@ha
+; VSX-NEXT:    lfs 0, .LCPI2_0@toc@l(4)
+; VSX-NEXT:  .LBB2_3: # %entry
+; VSX-NEXT:    li 5, 1
+; VSX-NEXT:    mffprd 4, 2
+; VSX-NEXT:    rldic 5, 5, 63, 0
+; VSX-NEXT:    cmpd 3, 5
+; VSX-NEXT:    bc 12, 2, .LBB2_5
+; VSX-NEXT:  # %bb.4: # %entry
+; VSX-NEXT:    fmr 1, 0
+; VSX-NEXT:  .LBB2_5: # %entry
+; VSX-NEXT:    cmpd 4, 5
+; VSX-NEXT:    bc 12, 2, .LBB2_7
+; VSX-NEXT:  # %bb.6: # %entry
+; VSX-NEXT:    fmr 2, 1
+; VSX-NEXT:  .LBB2_7: # %entry
+; VSX-NEXT:    xxlxor 1, 1, 1
+; VSX-NEXT:    fcmpu 0, 0, 1
+; VSX-NEXT:    bc 12, 2, .LBB2_9
+; VSX-NEXT:  # %bb.8: # %entry
+; VSX-NEXT:    fmr 2, 0
+; VSX-NEXT:  .LBB2_9: # %entry
+; VSX-NEXT:    fmr 1, 2
+; VSX-NEXT:    blr
+;
+; AIX-LABEL: f64_minimum:
+; AIX:       # %bb.0: # %entry
+; AIX-NEXT:    fcmpu 0, 1, 2
+; AIX-NEXT:    mffprd 3, 1
+; AIX-NEXT:    bc 12, 3, L..BB2_2
+; AIX-NEXT:  # %bb.1: # %entry
+; AIX-NEXT:    xsmindp 0, 1, 2
+; AIX-NEXT:    b L..BB2_3
+; AIX-NEXT:  L..BB2_2:
+; AIX-NEXT:    ld 4, L..C2(2) # %const.0
+; AIX-NEXT:    lfs 0, 0(4)
+; AIX-NEXT:  L..BB2_3: # %entry
+; AIX-NEXT:    li 5, 1
+; AIX-NEXT:    mffprd 4, 2
+; AIX-NEXT:    rldic 5, 5, 63, 0
+; AIX-NEXT:    cmpd 3, 5
+; AIX-NEXT:    bc 12, 2, L..BB2_5
+; AIX-NEXT:  # %bb.4: # %entry
+; AIX-NEXT:    fmr 1, 0
+; AIX-NEXT:  L..BB2_5: # %entry
+; AIX-NEXT:    cmpd 4, 5
+; AIX-NEXT:    bc 12, 2, L..BB2_7
+; AIX-NEXT:  # %bb.6: # %entry
+; AIX-NEXT:    fmr 2, 1
+; AIX-NEXT:  L..BB2_7: # %entry
+; AIX-NEXT:    xxlxor 1, 1, 1
+; AIX-NEXT:    fcmpu 0, 0, 1
+; AIX-NEXT:    bc 12, 2, L..BB2_9
+; AIX-NEXT:  # %bb.8: # %entry
+; AIX-NEXT:    fmr 2, 0
+; AIX-NEXT:  L..BB2_9: # %entry
+; AIX-NEXT:    fmr 1, 2
+; AIX-NEXT:    blr
+entry:
+  %m = call double @llvm.minimum.f64(double %a, double %b)
+  ret double %m
+}
+
+define double @f64_maximum(double %a, double %b) {
+; NOVSX-LABEL: f64_maximum:
+; NOVSX:       # %bb.0: # %entry
+; NOVSX-NEXT:    fcmpu 0, 1, 2
+; NOVSX-NEXT:    fmr 0, 1
+; NOVSX-NEXT:    stfd 2, -16(1)
+; NOVSX-NEXT:    stfd 1, -8(1)
+; NOVSX-NEXT:    bc 12, 1, .LBB3_2
+; NOVSX-NEXT:  # %bb.1: # %entry
+; NOVSX-NEXT:    fmr 0, 2
+; NOVSX-NEXT:  .LBB3_2: # %entry
+; NOVSX-NEXT:    ld 3, -8(1)
+; NOVSX-NEXT:    bc 4, 3, .LBB3_4
+; NOVSX-NEXT:  # %bb.3:
+; NOVSX-NEXT:    addis 4, 2, .LCPI3_0@toc@ha
+; NOVSX-NEXT:    lfs 0, .LCPI3_0@toc@l(4)
+; NOVSX-NEXT:  .LBB3_4: # %entry
+; NOVSX-NEXT:    cmpdi 3, 0
+; NOVSX-NEXT:    ld 4, -16(1)
+; NOVSX-NEXT:    bc 12, 2, .LBB3_6
+; NOVSX-NEXT:  # %bb.5: # %entry
+; NOVSX-NEXT:    fmr 1, 0
+; NOVSX-NEXT:  .LBB3_6: # %entry
+; NOVSX-NEXT:    cmpdi 4, 0
+; NOVSX-NEXT:    bc 12, 2, .LBB3_8
+; NOVSX-NEXT:  # %bb.7: # %entry
+; NOVSX-NEXT:    fmr 2, 1
+; NOVSX-NEXT:  .LBB3_8: # %entry
+; NOVSX-NEXT:    addis 3, 2, .LCPI3_1@toc@ha
+; NOVSX-NEXT:    lfs 1, .LCPI3_1@toc@l(3)
+; NOVSX-NEXT:    fcmpu 0, 0, 1
+; NOVSX-NEXT:    bc 12, 2, .LBB3_10
+; NOVSX-NEXT:  # %bb.9: # %entry
+; NOVSX-NEXT:    fmr 2, 0
+; NOVSX-NEXT:  .LBB3_10: # %entry
+; NOVSX-NEXT:    fmr 1, 2
+; NOVSX-NEXT:    blr
+;
+; VSX-LABEL: f64_maximum:
+; VSX:       # %bb.0: # %entry
+; VSX-NEXT:    fcmpu 0, 1, 2
+; VSX-NEXT:    mffprd 3, 1
+; VSX-NEXT:    bc 12, 3, .LBB3_2
+; VSX-NEXT:  # %bb.1: # %entry
+; VSX-NEXT:    xsmaxdp 0, 1, 2
+; VSX-NEXT:    b .LBB3_3
+; VSX-NEXT:  .LBB3_2:
+; VSX-NEXT:    addis 4, 2, .LCPI3_0@toc@ha
+; VSX-NEXT:    lfs 0, .LCPI3_0@toc@l(4)
+; VSX-NEXT:  .LBB3_3: # %entry
+; VSX-NEXT:    mffprd 4, 2
+; VSX-NEXT:    cmpdi 3, 0
+; VSX-NEXT:    bc 12, 2, .LBB3_5
+; VSX-NEXT:  # %bb.4: # %entry
+; VSX-NEXT:    fmr 1, 0
+; VSX-NEXT:  .LBB3_5: # %entry
+; VSX-NEXT:    cmpdi 4, 0
+; VSX-NEXT:    bc 12, 2, .LBB3_7
+; VSX-NEXT:  # %bb.6: # %entry
+; VSX-NEXT:    fmr 2, 1
+; VSX-NEXT:  .LBB3_7: # %entry
+; VSX-NEXT:    xxlxor 1, 1, 1
+; VSX-NEXT:    fcmpu 0, 0, 1
+; VSX-NEXT:    bc 12, 2, .LBB3_9
+; VSX-NEXT:  # %bb.8: # %entry
+; VSX-NEXT:    fmr 2, 0
+; VSX-NEXT:  .LBB3_9: # %entry
+; VSX-NEXT:    fmr 1, 2
+; VSX-NEXT:    blr
+;
+; AIX-LABEL: f64_maximum:
+; AIX:       # %bb.0: # %entry
+; AIX-NEXT:    fcmpu 0, 1, 2
+; AIX-NEXT:    mffprd 3, 1
+; AIX-NEXT:    bc 12, 3, L..BB3_2
+; AIX-NEXT:  # %bb.1: # %entry
+; AIX-NEXT:    xsmaxdp 0, 1, 2
+; AIX-NEXT:    b L..BB3_3
+; AIX-NEXT:  L..BB3_2:
+; AIX-NEXT:    ld 4, L..C3(2) # %const.0
+; AIX-NEXT:    lfs 0, 0(4)
+; AIX-NEXT:  L..BB3_3: # %entry
+; AIX-NEXT:    mffprd 4, 2
+; AIX-NEXT:    cmpdi 3, 0
+; AIX-NEXT:    bc 12, 2, L..BB3_5
+; AIX-NEXT:  # %bb.4: # %entry
+; AIX-NEXT:    fmr 1, 0
+; AIX-NEXT:  L..BB3_5: # %entry
+; AIX-NEXT:    cmpdi 4, 0
+; AIX-NEXT:    bc 12, 2, L..BB3_7
+; AIX-NEXT:  # %bb.6: # %entry
+; AIX-NEXT:    fmr 2, 1
+; AIX-NEXT:  L..BB3_7: # %entry
+; AIX-NEXT:    xxlxor 1, 1, 1
+; AIX-NEXT:    fcmpu 0, 0, 1
+; AIX-NEXT:    bc 12, 2, L..BB3_9
+; AIX-NEXT:  # %bb.8: # %entry
+; AIX-NEXT:    fmr 2, 0
+; AIX-NEXT:  L..BB3_9: # %entry
+; AIX-NEXT:    fmr 1, 2
+; AIX-NEXT:    blr
+entry:
+  %m = call double @llvm.maximum.f64(double %a, double %b)
+  ret double %m
+}
+
+define <4 x float> @v4f32_minimum(<4 x float> %a, <4 x float> %b) {
+; NOVSX-LABEL: v4f32_minimum:
+; NOVSX:       # %bb.0: # %entry
+; NOVSX-NEXT:    vcmpeqfp 0, 3, 3
+; NOVSX-NEXT:    vcmpeqfp 1, 2, 2
+; NOVSX-NEXT:    addis 3, 2, .LCPI4_0@toc@ha
+; NOVSX-NEXT:    addi 3, 3, .LCPI4_0@toc@l
+; NOVSX-NEXT:    vnot 0, 0
+; NOVSX-NEXT:    vnot 1, 1
+; NOVSX-NEXT:    vspltisb 4, -1
+; NOVSX-NEXT:    vcmpgtfp 5, 3, 2
+; NOVSX-NEXT:    vslw 4, 4, 4
+; NOVSX-NEXT:    vor 0, 1, 0
+; NOVSX-NEXT:    lvx 1, 0, 3
+; NOVSX-NEXT:    vsel 5, 3, 2, 5
+; NOVSX-NEXT:    vsel 5, 5, 1, 0
+; NOVSX-NEXT:    vcmpequw 0, 2, 4
+; NOVSX-NEXT:    vcmpequw 4, 3, 4
+; NOVSX-NEXT:    vsel 2, 5, 2, 0
+; NOVSX-NEXT:    vsel 2, 2, 3, 4
+; NOVSX-NEXT:    vxor 3, 3, 3
+; NOVSX-NEXT:    vcmpeqfp 3, 5, 3
+; NOVSX-NEXT:    vsel 2, 5, 2, 3
+; NOVSX-NEXT:    blr
+;
+; VSX-LABEL: v4f32_minimum:
+; VSX:       # %bb.0: # %entry
+; VSX-NEXT:    xvcmpeqsp 1, 35, 35
+; VSX-NEXT:    xvcmpeqsp 2, 34, 34
+; VSX-NEXT:    addis 3, 2, .LCPI4_0@toc@ha
+; VSX-NEXT:    xxleqv 36, 36, 36
+; VSX-NEXT:    xvminsp 0, 34, 35
+; VSX-NEXT:    vslw 4, 4, 4
+; VSX-NEXT:    addi 3, 3, .LCPI4_0@toc@l
+; VSX-NEXT:    xxlnor 1, 1, 1
+; VSX-NEXT:    xxlnor 2, 2, 2
+; VSX-NEXT:    vcmpequw 5, 2, 4
+; VSX-NEXT:    xxlor 1, 2, 1
+; VSX-NEXT:    lxvd2x 2, 0, 3
+; VSX-NEXT:    xxsel 0, 0, 2, 1
+; VSX-NEXT:    xxlxor 2, 2, 2
+; VSX-NEXT:    xvcmpeqsp 2, 0, 2
+; VSX-NEXT:    xxsel 1, 0, 34, 37
+; VSX-NEXT:    vcmpequw 2, 3, 4
+; VSX-NEXT:    xxsel 1, 1, 35, 34
+; VSX-NEXT:    xxsel 34, 0, 1, 2
+; VSX-NEXT:    blr
+;
+; AIX-LABEL: v4f32_minimum:
+; AIX:       # %bb.0: # %entry
+; AIX-NEXT:    xvcmpeqsp 1, 35, 35
+; AIX-NEXT:    xvcmpeqsp 2, 34, 34
+; AIX-NEXT:    ld 3, L..C4(2) # %const.0
+; AIX-NEXT:    xxleqv 36, 36, 36
+; AIX-NEXT:    xvminsp 0, 34, 35
+; AIX-NEXT:    vslw 4, 4, 4
+; AIX-NEXT:    xxlnor 1, 1, 1
+; AIX-NEXT:    xxlnor 2, 2, 2
+; AIX-NEXT:    vcmpequw 5, 2, 4
+; AIX-NEXT:    xxlor 1, 2, 1
+; AIX-NEXT:    lxvw4x 2, 0, 3
+; AIX-NEXT:    xxsel 0, 0, 2, 1
+; AIX-NEXT:    xxlxor 2, 2, 2
+; AIX-NEXT:    xvcmpeqsp 2, 0, 2
+; AIX-NEXT:    xxsel 1, 0, 34, 37
+; AIX-NEXT:    vcmpequw 2, 3, 4
+; AIX-NEXT:    xxsel 1, 1, 35, 34
+; AIX-NEXT:    xxsel 34, 0, 1, 2
+; AIX-NEXT:    blr
+entry:
+  %m = call <4 x float> @llvm.minimum.v4f32(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %m
+}
+
+define <4 x float> @v4f32_maximum(<4 x float> %a, <4 x float> %b) {
+; NOVSX-LABEL: v4f32_maximum:
+; NOVSX:       # %bb.0: # %entry
+; NOVSX-NEXT:    vcmpeqfp 5, 3, 3
+; NOVSX-NEXT:    vcmpeqfp 0, 2, 2
+; NOVSX-NEXT:    addis 3, 2, .LCPI5_0@toc@ha
+; NOVSX-NEXT:    addi 3, 3, .LCPI5_0@toc@l
+; NOVSX-NEXT:    vnot 5, 5
+; NOVSX-NEXT:    vnot 0, 0
+; NOVSX-NEXT:    vcmpgtfp 4, 2, 3
+; NOVSX-NEXT:    vor 5, 0, 5
+; NOVSX-NEXT:    lvx 0, 0, 3
+; NOVSX-NEXT:    vsel 4, 3, 2, 4
+; NOVSX-NEXT:    vsel 4, 4, 0, 5
+; NOVSX-NEXT:    vxor 5, 5, 5
+; NOVSX-NEXT:    vcmpequw 0, 2, 5
+; NOVSX-NEXT:    vsel 2, 4, 2, 0
+; NOVSX-NEXT:    vcmpequw 0, 3, 5
+; NOVSX-NEXT:    vsel 2, 2, 3, 0
+; NOVSX-NEXT:    vcmpeqfp 3, 4, 5
+; NOVSX-NEXT:    vsel 2, 4, 2, 3
+; NOVSX-NEXT:    blr
+;
+; VSX-LABEL: v4f32_maximum:
+; VSX:       # %bb.0: # %entry
+; VSX-NEXT:    xvcmpeqsp 1, 35, 35
+; VSX-NEXT:    xvcmpeqsp 2, 34, 34
+; VSX-NEXT:    addis 3, 2, .LCPI5_0@toc@ha
+; VSX-NEXT:    addi 3, 3, .LCPI5_0@toc@l
+; VSX-NEXT:    xxlnor 1, 1, 1
+; VSX-NEXT:    xxlnor 2, 2, 2
+; VSX-NEXT:    xvmaxsp 0, 34, 35
+; VSX-NEXT:    xxlxor 36, 36, 36
+; VSX-NEXT:    vcmpequw 5, 2, 4
+; VSX-NEXT:    xxlor 1, 2, 1
+; VSX-NEXT:    lxvd2x 2, 0, 3
+; VSX-NEXT:    xxsel 0, 0, 2, 1
+; VSX-NEXT:    xvcmpeqsp 2, 0, 36
+; VSX-NEXT:    xxsel 1, 0, 34, 37
+; VSX-NEXT:    vcmpequw 2, 3, 4
+; VSX-NEXT:    xxsel 1, 1, 35, 34
+; VSX-NEXT:    xxsel 34, 0, 1, 2
+; VSX-NEXT:    blr
+;
+; AIX-LABEL: v4f32_maximum:
+; AIX:       # %bb.0: # %entry
+; AIX-NEXT:    xvcmpeqsp 1, 35, 35
+; AIX-NEXT:    xvcmpeqsp 2, 34, 34
+; AIX-NEXT:    ld 3, L..C5(2) # %const.0
+; AIX-NEXT:    xvmaxsp 0, 34, 35
+; AIX-NEXT:    xxlxor 36, 36, 36
+; AIX-NEXT:    xxlnor 1, 1, 1
+; AIX-NEXT:    xxlnor 2, 2, 2
+; AIX-NEXT:    vcmpequw 5, 2, 4
+; AIX-NEXT:    xxlor 1, 2, 1
+; AIX-NEXT:    lxvw4x 2, 0, 3
+; AIX-NEXT:    xxsel 0, 0, 2, 1
+; AIX-NEXT:    xvcmpeqsp 2, 0, 36
+; AIX-NEXT:    xxsel 1, 0, 34, 37
+; AIX-NEXT:    vcmpequw 2, 3, 4
+; AIX-NEXT:    xxsel 1, 1, 35, 34
+; AIX-NEXT:    xxsel 34, 0, 1, 2
+; AIX-NEXT:    blr
+entry:
+  %m = call <4 x float> @llvm.maximum.v4f32(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %m
+}
+
+define <2 x double> @v2f64_minimum(<2 x double> %a, <2 x double> %b) {
+; NOVSX-LABEL: v2f64_minimum:
+; NOVSX:       # %bb.0: # %entry
+; NOVSX-NEXT:    fcmpu 0, 1, 3
+; NOVSX-NEXT:    fmr 6, 1
+; NOVSX-NEXT:    stfd 4, -16(1)
+; NOVSX-NEXT:    stfd 2, -8(1)
+; NOVSX-NEXT:    stfd 3, -32(1)
+; NOVSX-NEXT:    stfd 1, -24(1)
+; NOVSX-NEXT:    bc 12, 0, .LBB6_2
+; NOVSX-NEXT:  # %bb.1: # %entry
+; NOVSX-NEXT:    fmr 6, 3
+; NOVSX-NEXT:  .LBB6_2: # %entry
+; NOVSX-NEXT:    addis 3, 2, .LCPI6_0@toc@ha
+; NOVSX-NEXT:    ld 4, -24(1)
+; NOVSX-NEXT:    lfs 0, .LCPI6_0@toc@l(3)
+; NOVSX-NEXT:    fmr 5, 0
+; NOVSX-NEXT:    bc 12, 3, .LBB6_4
+; NOVSX-NEXT:  # %bb.3: # %entry
+; NOVSX-NEXT:    fmr 5, 6
+; NOVSX-NEXT:  .LBB6_4: # %entry
+; NOVSX-NEXT:    li 3, 1
+; NOVSX-NEXT:    ld 5, -32(1)
+; NOVSX-NEXT:    rldic 3, 3, 63, 0
+; NOVSX-NEXT:    cmpd 4, 3
+; NOVSX-NEXT:    bc 12, 2, .LBB6_6
+; NOVSX-NEXT:  # %bb.5: # %entry
+; NOVSX-NEXT:    fmr 1, 5
+; NOVSX-NEXT:  .LBB6_6: # %entry
+; NOVSX-NEXT:    cmpd 5, 3
+; NOVSX-NEXT:    bc 12, 2, .LBB6_8
+; NOVSX-NEXT:  # %bb.7: # %entry
+; NOVSX-NEXT:    fmr 3, 1
+; NOVSX-NEXT:  .LBB6_8: # %entry
+; NOVSX-NEXT:    addis 4, 2, .LCPI6_1@toc@ha
+; NOVSX-NEXT:    lfs 1, .LCPI6_1@toc@l(4)
+; NOVSX-NEXT:    fcmpu 0, 5, 1
+; NOVSX-NEXT:    bc 12, 2, .LBB6_10
+; NOVSX-NEXT:  # %bb.9: # %entry
+; NOVSX-NEXT:    fmr 3, 5
+; NOVSX-NEXT:  .LBB6_10: # %entry
+; NOVSX-NEXT:    fcmpu 0, 2, 4
+; NOVSX-NEXT:    fmr 5, 2
+; NOVSX-NEXT:    bc 12, 0, .LBB6_12
+; NOVSX-NEXT:  # %bb.11: # %entry
+; NOVSX-NEXT:    fmr 5, 4
+; NOVSX-NEXT:  .LBB6_12: # %entry
+; NOVSX-NEXT:    ld 5, -8(1)
+; NOVSX-NEXT:    bc 12, 3, .LBB6_14
+; NOVSX-NEXT:  # %bb.13: # %entry
+; NOVSX-NEXT:    fmr 0, 5
+; NOVSX-NEXT:  .LBB6_14: # %entry
+; NOVSX-NEXT:    cmpd 5, 3
+; NOVSX-NEXT:    ld 4, -16(1)
+; NOVSX-NEXT:    bc 4, 2, .LBB6_19
+; NOVSX-NEXT:  # %bb.15: # %entry
+; NOVSX-NEXT:    cmpd 4, 3
+; NOVSX-NEXT:    bc 4, 2, .LBB6_20
+; NOVSX-NEXT:  .LBB6_16: # %entry
+; NOVSX-NEXT:    fcmpu 0, 0, 1
+; NOVSX-NEXT:    bc 12, 2, .LBB6_18
+; NOVSX-NEXT:  .LBB6_17: # %entry
+; NOVSX-NEXT:    fmr 4, 0
+; NOVSX-NEXT:  .LBB6_18: # %entry
+; NOVSX-NEXT:    fmr 1, 3
+; NOVSX-NEXT:    fmr 2, 4
+; NOVSX-NEXT:    blr
+; NOVSX-NEXT:  .LBB6_19: # %entry
+; NOVSX-NEXT:    fmr 2, 0
+; NOVSX-NEXT:    cmpd 4, 3
+; NOVSX-NEXT:    bc 12, 2, .LBB6_16
+; NOVSX-NEXT:  .LBB6_20: # %entry
+; NOVSX-NEXT:    fmr 4, 2
+; NOVSX-NEXT:    fcmpu 0, 0, 1
+; NOVSX-NEXT:    bc 4, 2, .LBB6_17
+; NOVSX-NEXT:    b .LBB6_18
+;
+; VSX-LABEL: v2f64_minimum:
+; VSX:       # %bb.0: # %entry
+; VSX-NEXT:    addis 3, 2, .LCPI6_0@toc@ha
+; VSX-NEXT:    xvcmpeqdp 36, 35, 35
+; VSX-NEXT:    xvcmpeqdp 37, 34, 34
+; VSX-NEXT:    addi 3, 3, .LCPI6_0@toc@l
+; VSX-NEXT:    xxlnor 36, 36, 36
+; VSX-NEXT:    xxlnor 37, 37, 37
+; VSX-NEXT:    xvmindp 0, 34, 35
+; VSX-NEXT:    lxvd2x 2, 0, 3
+; VSX-NEXT:    addis 3, 2, .LCPI6_1@toc@ha
+; VSX-NEXT:    xxlor 1, 37, 36
+; VSX-NEXT:    addi 3, 3, .LCPI6_1@toc@l
+; VSX-NEXT:    lxvd2x 36, 0, 3
+; VSX-NEXT:    vcmpequd 5, 2, 4
+; VSX-NEXT:    xxsel 0, 0, 2, 1
+; VSX-NEXT:    xxlxor 2, 2, 2
+; VSX-NEXT:    xxsel 1, 0, 34, 37
+; VSX-NEXT:    vcmpequd 2, 3, 4
+; VSX-NEXT:    xxsel 1, 1, 35, 34
+; VSX-NEXT:    xvcmpeqdp 34, 0, 2
+; VSX-NEXT:    xxsel 34, 0, 1, 34
+; VSX-NEXT:    blr
+;
+; AIX-LABEL: v2f64_minimum:
+; AIX:       # %bb.0: # %entry
+; AIX-NEXT:    ld 3, L..C6(2) # %const.0
+; AIX-NEXT:    xvcmpeqdp 36, 35, 35
+; AIX-NEXT:    xvcmpeqdp 37, 34, 34
+; AIX-NEXT:    lxvd2x 2, 0, 3
+; AIX-NEXT:    ld 3, L..C7(2) # %const.1
+; AIX-NEXT:    xxlnor 36, 36, 36
+; AIX-NEXT:    xxlnor 37, 37, 37
+; AIX-NEXT:    xvmindp 0, 34, 35
+; AIX-NEXT:    xxlor 1, 37, 36
+; AIX-NEXT:    lxvd2x 36, 0, 3
+; AIX-NEXT:    vcmpequd 5, 2, 4
+; AIX-NEXT:    xxsel 0, 0, 2, 1
+; AIX-NEXT:    xxlxor 2, 2, 2
+; AIX-NEXT:    xxsel 1, 0, 34, 37
+; AIX-NEXT:    vcmpequd 2, 3, 4
+; AIX-NEXT:    xxsel 1, 1, 35, 34
+; AIX-NEXT:    xvcmpeqdp 34, 0, 2
+; AIX-NEXT:    xxsel 34, 0, 1, 34
+; AIX-NEXT:    blr
+entry:
+  %m = call <2 x double> @llvm.minimum.v2f64(<2 x double> %a, <2 x double> %b)
+  ret <2 x double> %m
+}
+
+define <2 x double> @v2f64_maximum(<2 x double> %a, <2 x double> %b) {
+; NOVSX-LABEL: v2f64_maximum:
+; NOVSX:       # %bb.0: # %entry
+; NOVSX-NEXT:    fcmpu 0, 1, 3
+; NOVSX-NEXT:    fmr 6, 1
+; NOVSX-NEXT:    stfd 4, -16(1)
+; NOVSX-NEXT:    stfd 2, -8(1)
+; NOVSX-NEXT:    stfd 3, -32(1)
+; NOVSX-NEXT:    stfd 1, -24(1)
+; NOVSX-NEXT:    bc 12, 1, .LBB7_2
+; NOVSX-NEXT:  # %bb.1: # %entry
+; NOVSX-NEXT:    fmr 6, 3
+; NOVSX-NEXT:  .LBB7_2: # %entry
+; NOVSX-NEXT:    addis 4, 2, .LCPI7_0@toc@ha
+; NOVSX-NEXT:    ld 3, -24(1)
+; NOVSX-NEXT:    lfs 0, .LCPI7_0@toc@l(4)
+; NOVSX-NEXT:    fmr 5, 0
+; NOVSX-NEXT:    bc 12, 3, .LBB7_4
+; NOVSX-NEXT:  # %bb.3: # %entry
+; NOVSX-NEXT:    fmr 5, 6
+; NOVSX-NEXT:  .LBB7_4: # %entry
+; NOVSX-NEXT:    cmpdi 3, 0
+; NOVSX-NEXT:    ld 4, -32(1)
+; NOVSX-NEXT:    bc 12, 2, .LBB7_6
+; NOVSX-NEXT:  # %bb.5: # %entry
+; NOVSX-NEXT:    fmr 1, 5
+; NOVSX-NEXT:  .LBB7_6: # %entry
+; NOVSX-NEXT:    cmpdi 4, 0
+; NOVSX-NEXT:    bc 12, 2, .LBB7_8
+; NOVSX-NEXT:  # %bb.7: # %entry
+; NOVSX-NEXT:    fmr 3, 1
+; NOVSX-NEXT:  .LBB7_8: # %entry
+; NOVSX-NEXT:    addis 3, 2, .LCPI7_1@toc@ha
+; NOVSX-NEXT:    lfs 1, .LCPI7_1@toc@l(3)
+; NOVSX-NEXT:    fcmpu 0, 5, 1
+; NOVSX-NEXT:    bc 12, 2, .LBB7_10
+; NOVSX-NEXT:  # %bb.9: # %entry
+; NOVSX-NEXT:    fmr 3, 5
+; NOVSX-NEXT:  .LBB7_10: # %entry
+; NOVSX-NEXT:    fcmpu 0, 2, 4
+; NOVSX-NEXT:    fmr 5, 2
+; NOVSX-NEXT:    bc 12, 1, .LBB7_12
+; NOVSX-NEXT:  # %bb.11: # %entry
+; NOVSX-NEXT:    fmr 5, 4
+; NOVSX-NEXT:  .LBB7_12: # %entry
+; NOVSX-NEXT:    ld 4, -8(1)
+; NOVSX-NEXT:    bc 12, 3, .LBB7_14
+; NOVSX-NEXT:  # %bb.13: # %entry
+; NOVSX-NEXT:    fmr 0, 5
+; NOVSX-NEXT:  .LBB7_14: # %entry
+; NOVSX-NEXT:    cmpdi 4, 0
+; NOVSX-NEXT:    ld 3, -16(1)
+; NOVSX-NEXT:    bc 4, 2, .LBB7_19
+; NOVSX-NEXT:  # %bb.15: # %entry
+; NOVSX-NEXT:    cmpdi 3, 0
+; NOVSX-NEXT:    bc 4, 2, .LBB7_20
+; NOVSX-NEXT:  .LBB7_16: # %entry
+; NOVSX-NEXT:    fcmpu 0, 0, 1
+; NOVSX-NEXT:    bc 12, 2, .LBB7_18
+; NOVSX-NEXT:  .LBB7_17: # %entry
+; NOVSX-NEXT:    fmr 4, 0
+; NOVSX-NEXT:  .LBB7_18: # %entry
+; NOVSX-NEXT:    fmr 1, 3
+; NOVSX-NEXT:    fmr 2, 4
+; NOVSX-NEXT:    blr
+; NOVSX-NEXT:  .LBB7_19: # %entry
+; NOVSX-NEXT:    fmr 2, 0
+; NOVSX-NEXT:    cmpdi 3, 0
+; NOVSX-NEXT:    bc 12, 2, .LBB7_16
+; NOVSX-NEXT:  .LBB7_20: # %entry
+; NOVSX-NEXT:    fmr 4, 2
+; NOVSX-NEXT:    fcmpu 0, 0, 1
+; NOVSX-NEXT:    bc 4, 2, .LBB7_17
+; NOVSX-NEXT:    b .LBB7_18
+;
+; VSX-LABEL: v2f64_maximum:
+; VSX:       # %bb.0: # %entry
+; VSX-NEXT:    addis 3, 2, .LCPI7_0@toc@ha
+; VSX-NEXT:    xvcmpeqdp 36, 35, 35
+; VSX-NEXT:    xvcmpeqdp 37, 34, 34
+; VSX-NEXT:    addi 3, 3, .LCPI7_0@toc@l
+; VSX-NEXT:    xxlnor 36, 36, 36
+; VSX-NEXT:    xxlnor 37, 37, 37
+; VSX-NEXT:    xvmaxdp 0, 34, 35
+; VSX-NEXT:    lxvd2x 2, 0, 3
+; VSX-NEXT:    xxlor 1, 37, 36
+; VSX-NEXT:    xxlxor 36, 36, 36
+; VSX-NEXT:    vcmpequd 5, 2, 4
+; VSX-NEXT:    xxsel 0, 0, 2, 1
+; VSX-NEXT:    xxsel 1, 0, 34, 37
+; VSX-NEXT:    vcmpequd 2, 3, 4
+; VSX-NEXT:    xxsel 1, 1, 35, 34
+; VSX-NEXT:    xvcmpeqdp 34, 0, 36
+; VSX-NEXT:    xxsel 34, 0, 1, 34
+; VSX-NEXT:    blr
+;
+; AIX-LABEL: v2f64_maximum:
+; AIX:       # %bb.0: # %entry
+; AIX-NEXT:    ld 3, L..C8(2) # %const.0
+; AIX-NEXT:    xvcmpeqdp 36, 35, 35
+; AIX-NEXT:    xvcmpeqdp 37, 34, 34
+; AIX-NEXT:    lxvd2x 2, 0, 3
+; AIX-NEXT:    xxlnor 36, 36, 36
+; AIX-NEXT:    xxlnor 37, 37, 37
+; AIX-NEXT:    xvmaxdp 0, 34, 35
+; AIX-NEXT:    xxlor 1, 37, 36
+; AIX-NEXT:    xxlxor 36, 36, 36
+; AIX-NEXT:    vcmpequd 5, 2, 4
+; AIX-NEXT:    xxsel 0, 0, 2, 1
+; AIX-NEXT:    xxsel 1, 0, 34, 37
+; AIX-NEXT:    vcmpequd 2, 3, 4
+; AIX-NEXT:    xxsel 1, 1, 35, 34
+; AIX-NEXT:    xvcmpeqdp 34, 0, 36
+; AIX-NEXT:    xxsel 34, 0, 1, 34
+; AIX-NEXT:    blr
+entry:
+  %m = call <2 x double> @llvm.maximum.v2f64(<2 x double> %a, <2 x double> %b)
+  ret <2 x double> %m
+}
+
+declare float @llvm.maximum.f32(float, float)
+declare double @llvm.maximum.f64(double, double)
+declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+
+declare float @llvm.minimum.f32(float, float)
+declare double @llvm.minimum.f64(double, double)
+declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/PowerPC/git_revision.ll b/llvm/test/CodeGen/PowerPC/git_revision.ll
new file mode 100644
index 000000000000..86dcc5048425
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/git_revision.ll
@@ -0,0 +1,12 @@
+; Check that the git revision is contained in the assembly/object files
+
+; REQUIRES: vc-rev-enabled 
+
+; RUN: llc < %s | FileCheck %s -DREVISION=git-revision
+; RUN: llc -filetype=obj < %s | FileCheck %s -DREVISION=git-revision
+
+; CHECK: ([[REVISION]])
+
+source_filename = "git_revision.cpp"
+target datalayout = "E-m:a-Fi64-i64:64-n32:64-S128-v256:256:256-v512:512:512"
+target triple = "powerpc64-ibm-aix7.2.0.0"
diff --git a/llvm/test/CodeGen/PowerPC/legalize-vaarg.ll b/llvm/test/CodeGen/PowerPC/legalize-vaarg.ll
index 8980049969da..b7f8b8af2472 100644
--- a/llvm/test/CodeGen/PowerPC/legalize-vaarg.ll
+++ b/llvm/test/CodeGen/PowerPC/legalize-vaarg.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ;RUN: llc < %s --mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=BE
 ;RUN: llc < %s --mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=LE
-;RUN: llc < %s --mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -ppc-gather-alias-max-depth=0 | FileCheck %s -check-prefix=FORWARD
 
 define <8 x i32> @test_large_vec_vaarg(i32 %n, ...) {
 ; BE-LABEL: test_large_vec_vaarg:
@@ -36,22 +35,6 @@ define <8 x i32> @test_large_vec_vaarg(i32 %n, ...) {
 ; LE-NEXT:    lxvd2x 0, 0, 3
 ; LE-NEXT:    xxswapd 35, 0
 ; LE-NEXT:    blr
-;
-; FORWARD-LABEL: test_large_vec_vaarg:
-; FORWARD:       # %bb.0:
-; FORWARD-NEXT:    ld 3, -8(1)
-; FORWARD-NEXT:    addi 3, 3, 15
-; FORWARD-NEXT:    rldicr 3, 3, 0, 59
-; FORWARD-NEXT:    addi 4, 3, 16
-; FORWARD-NEXT:    std 4, -8(1)
-; FORWARD-NEXT:    ld 4, -8(1)
-; FORWARD-NEXT:    lvx 2, 0, 3
-; FORWARD-NEXT:    addi 4, 4, 15
-; FORWARD-NEXT:    rldicr 3, 4, 0, 59
-; FORWARD-NEXT:    addi 4, 3, 16
-; FORWARD-NEXT:    std 4, -8(1)
-; FORWARD-NEXT:    lvx 3, 0, 3
-; FORWARD-NEXT:    blr
   %args = alloca ptr, align 4
   %x = va_arg ptr %args, <8 x i32>
   ret <8 x i32> %x
diff --git a/llvm/test/CodeGen/PowerPC/lit.local.cfg b/llvm/test/CodeGen/PowerPC/lit.local.cfg
index 3e8c0f8b1842..4cc802afef4a 100644
--- a/llvm/test/CodeGen/PowerPC/lit.local.cfg
+++ b/llvm/test/CodeGen/PowerPC/lit.local.cfg
@@ -1,4 +1,23 @@
 if not "PowerPC" in config.root.targets:
     config.unsupported = True
 
+import subprocess
+
 config.suffixes.add(".py")
+
+def get_revision(repo_path):
+    cmd = ['git', '-C', repo_path, 'rev-parse', 'HEAD']
+    try:
+        return subprocess.run(cmd, stdout=subprocess.PIPE, check=True).stdout.decode()
+    except subprocess.CalledProcessError:
+        print("An error occurred retrieving the git revision.")
+        return None
+
+if config.have_vc_rev:
+    if config.force_vc_rev:
+        git_revision = config.force_vc_rev
+    else:
+        git_revision = get_revision(config.llvm_src_root)
+    if git_revision:
+        config.substitutions.append(("git-revision", git_revision))
+        config.available_features.add("vc-rev-enabled")
diff --git a/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir b/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir
new file mode 100644
index 000000000000..d8f2b08adaf2
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir
@@ -0,0 +1,85 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# REQUIRES: asserts
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - | FileCheck %s --check-prefix=ALL
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=0,ppc-xtoi-peephole-count=8 \
+# RUN:   | FileCheck %s --check-prefix=ALL
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=3,ppc-xtoi-peephole-count=2 \
+# RUN:   | FileCheck %s --check-prefix=ONE-FIRSTSTORE
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=5,ppc-xtoi-peephole-count=2 \
+# RUN:   | FileCheck %s --check-prefix=ONE-SECONDSTORE
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=3,ppc-xtoi-peephole-count=4 \
+# RUN:   | FileCheck %s --check-prefix=TWO
+
+---
+name: foldDForm
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x3
+
+    ; ALL-LABEL: name: foldDForm
+    ; ALL: liveins: $x3
+    ; ALL-NEXT: {{  $}}
+    ; ALL-NEXT: [[COPY:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY killed $x3
+    ; ALL-NEXT: dead [[ADDI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 144
+    ; ALL-NEXT: [[LI8_:%[0-9]+]]:g8rc = LI8 0
+    ; ALL-NEXT: STD [[LI8_]], 160, [[COPY]]
+    ; ALL-NEXT: dead [[ADDI8_1:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 160
+    ; ALL-NEXT: STD [[LI8_]], 176, [[COPY]]
+    ; ALL-NEXT: dead [[ADDI8_2:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 176
+    ; ALL-NEXT: STD killed [[LI8_]], 192, killed [[COPY]]
+    ; ALL-NEXT: BLR8 implicit $lr8, implicit $rm
+    ;
+    ; ONE-FIRSTSTORE-LABEL: name: foldDForm
+    ; ONE-FIRSTSTORE: liveins: $x3
+    ; ONE-FIRSTSTORE-NEXT: {{  $}}
+    ; ONE-FIRSTSTORE-NEXT: [[COPY:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY killed $x3
+    ; ONE-FIRSTSTORE-NEXT: dead [[ADDI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 144
+    ; ONE-FIRSTSTORE-NEXT: [[LI8_:%[0-9]+]]:g8rc = LI8 0
+    ; ONE-FIRSTSTORE-NEXT: STD [[LI8_]], 160, [[COPY]]
+    ; ONE-FIRSTSTORE-NEXT: [[ADDI8_1:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 160
+    ; ONE-FIRSTSTORE-NEXT: STD [[LI8_]], 16, killed [[ADDI8_1]]
+    ; ONE-FIRSTSTORE-NEXT: [[ADDI8_2:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 killed [[COPY]], 176
+    ; ONE-FIRSTSTORE-NEXT: STD killed [[LI8_]], 16, killed [[ADDI8_2]]
+    ; ONE-FIRSTSTORE-NEXT: BLR8 implicit $lr8, implicit $rm
+    ;
+    ; ONE-SECONDSTORE-LABEL: name: foldDForm
+    ; ONE-SECONDSTORE: liveins: $x3
+    ; ONE-SECONDSTORE-NEXT: {{  $}}
+    ; ONE-SECONDSTORE-NEXT: [[COPY:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY killed $x3
+    ; ONE-SECONDSTORE-NEXT: [[ADDI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 144
+    ; ONE-SECONDSTORE-NEXT: [[LI8_:%[0-9]+]]:g8rc = LI8 0
+    ; ONE-SECONDSTORE-NEXT: STD [[LI8_]], 16, killed [[ADDI8_]]
+    ; ONE-SECONDSTORE-NEXT: dead [[ADDI8_1:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 160
+    ; ONE-SECONDSTORE-NEXT: STD [[LI8_]], 176, [[COPY]]
+    ; ONE-SECONDSTORE-NEXT: [[ADDI8_2:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 killed [[COPY]], 176
+    ; ONE-SECONDSTORE-NEXT: STD killed [[LI8_]], 16, killed [[ADDI8_2]]
+    ; ONE-SECONDSTORE-NEXT: BLR8 implicit $lr8, implicit $rm
+    ;
+    ; TWO-LABEL: name: foldDForm
+    ; TWO: liveins: $x3
+    ; TWO-NEXT: {{  $}}
+    ; TWO-NEXT: [[COPY:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY killed $x3
+    ; TWO-NEXT: dead [[ADDI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 144
+    ; TWO-NEXT: [[LI8_:%[0-9]+]]:g8rc = LI8 0
+    ; TWO-NEXT: STD [[LI8_]], 160, [[COPY]]
+    ; TWO-NEXT: dead [[ADDI8_1:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 160
+    ; TWO-NEXT: STD [[LI8_]], 176, [[COPY]]
+    ; TWO-NEXT: [[ADDI8_2:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 killed [[COPY]], 176
+    ; TWO-NEXT: STD killed [[LI8_]], 16, killed [[ADDI8_2]]
+    ; TWO-NEXT: BLR8 implicit $lr8, implicit $rm
+    %0:g8rc_and_g8rc_nox0 = COPY $x3
+    %1:g8rc_and_g8rc_nox0 = ADDI8 %0:g8rc_and_g8rc_nox0, 144
+    %2:g8rc = LI8 0
+    STD %2:g8rc, 16, %1:g8rc_and_g8rc_nox0
+    %3:g8rc_and_g8rc_nox0 = ADDI8 %0:g8rc_and_g8rc_nox0, 160
+    STD %2:g8rc, 16, %3:g8rc_and_g8rc_nox0
+    %4:g8rc_and_g8rc_nox0 = ADDI8 %0:g8rc_and_g8rc_nox0, 176
+    STD killed %2:g8rc, 16, %4:g8rc_and_g8rc_nox0
+    BLR8 implicit $lr8, implicit $rm
+...
diff --git a/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir b/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir
new file mode 100644
index 000000000000..cf3ff291e26c
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir
@@ -0,0 +1,74 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# REQUIRES: asserts
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - | FileCheck %s --check-prefix=ALL
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=0,ppc-per-op-peephole-count=6 \
+# RUN:   | FileCheck %s --check-prefix=ALL
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=3,ppc-per-op-peephole-count=1 \
+# RUN:   | FileCheck %s --check-prefix=ONE-FIRST-RLWINM
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=4,ppc-per-op-peephole-count=1 \
+# RUN:   | FileCheck %s --check-prefix=ONE-SECOND-RLWINM
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=3,ppc-per-op-peephole-count=2 \
+# RUN:   | FileCheck %s --check-prefix=TWO
+
+---
+name: testFoldRLWINM
+#CHECK: name: testFoldRLWINM
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x3
+    ; ALL-LABEL: name: testFoldRLWINM
+    ; ALL: liveins: $x3
+    ; ALL-NEXT: {{  $}}
+    ; ALL-NEXT: [[COPY:%[0-9]+]]:g8rc = COPY killed $x3
+    ; ALL-NEXT: [[COPY1:%[0-9]+]]:gprc = COPY killed [[COPY]].sub_32
+    ; ALL-NEXT: dead [[RLWINM:%[0-9]+]]:gprc = RLWINM [[COPY1]], 14, 0, 12
+    ; ALL-NEXT: dead [[RLWINM1:%[0-9]+]]:gprc = RLWINM [[COPY1]], 14, 0, 11
+    ; ALL-NEXT: dead [[RLWINM2:%[0-9]+]]:gprc = RLWINM killed [[COPY1]], 14, 0, 10
+    ; ALL-NEXT: BLR8 implicit $lr8, implicit $rm
+    ;
+    ; ONE-FIRST-RLWINM-LABEL: name: testFoldRLWINM
+    ; ONE-FIRST-RLWINM: liveins: $x3
+    ; ONE-FIRST-RLWINM-NEXT: {{  $}}
+    ; ONE-FIRST-RLWINM-NEXT: [[COPY:%[0-9]+]]:g8rc = COPY killed $x3
+    ; ONE-FIRST-RLWINM-NEXT: [[COPY1:%[0-9]+]]:gprc = COPY killed [[COPY]].sub_32
+    ; ONE-FIRST-RLWINM-NEXT: [[RLWINM:%[0-9]+]]:gprc = RLWINM [[COPY1]], 27, 5, 31
+    ; ONE-FIRST-RLWINM-NEXT: dead [[RLWINM1:%[0-9]+]]:gprc = RLWINM killed [[COPY1]], 14, 0, 12
+    ; ONE-FIRST-RLWINM-NEXT: dead [[RLWINM2:%[0-9]+]]:gprc = RLWINM [[RLWINM]], 19, 0, 11
+    ; ONE-FIRST-RLWINM-NEXT: dead [[RLWINM3:%[0-9]+]]:gprc = RLWINM killed [[RLWINM]], 19, 0, 10
+    ; ONE-FIRST-RLWINM-NEXT: BLR8 implicit $lr8, implicit $rm
+    ;
+    ; ONE-SECOND-RLWINM-LABEL: name: testFoldRLWINM
+    ; ONE-SECOND-RLWINM: liveins: $x3
+    ; ONE-SECOND-RLWINM-NEXT: {{  $}}
+    ; ONE-SECOND-RLWINM-NEXT: [[COPY:%[0-9]+]]:g8rc = COPY killed $x3
+    ; ONE-SECOND-RLWINM-NEXT: [[COPY1:%[0-9]+]]:gprc = COPY killed [[COPY]].sub_32
+    ; ONE-SECOND-RLWINM-NEXT: [[RLWINM:%[0-9]+]]:gprc = RLWINM [[COPY1]], 27, 5, 31
+    ; ONE-SECOND-RLWINM-NEXT: dead [[RLWINM1:%[0-9]+]]:gprc = RLWINM [[RLWINM]], 19, 0, 12
+    ; ONE-SECOND-RLWINM-NEXT: dead [[RLWINM2:%[0-9]+]]:gprc = RLWINM killed [[COPY1]], 14, 0, 11
+    ; ONE-SECOND-RLWINM-NEXT: dead [[RLWINM3:%[0-9]+]]:gprc = RLWINM killed [[RLWINM]], 19, 0, 10
+    ; ONE-SECOND-RLWINM-NEXT: BLR8 implicit $lr8, implicit $rm
+    ;
+    ; TWO-LABEL: name: testFoldRLWINM
+    ; TWO: liveins: $x3
+    ; TWO-NEXT: {{  $}}
+    ; TWO-NEXT: [[COPY:%[0-9]+]]:g8rc = COPY killed $x3
+    ; TWO-NEXT: [[COPY1:%[0-9]+]]:gprc = COPY killed [[COPY]].sub_32
+    ; TWO-NEXT: [[RLWINM:%[0-9]+]]:gprc = RLWINM [[COPY1]], 27, 5, 31
+    ; TWO-NEXT: dead [[RLWINM1:%[0-9]+]]:gprc = RLWINM [[COPY1]], 14, 0, 12
+    ; TWO-NEXT: dead [[RLWINM2:%[0-9]+]]:gprc = RLWINM killed [[COPY1]], 14, 0, 11
+    ; TWO-NEXT: dead [[RLWINM3:%[0-9]+]]:gprc = RLWINM killed [[RLWINM]], 19, 0, 10
+    ; TWO-NEXT: BLR8 implicit $lr8, implicit $rm
+    %0:g8rc = COPY $x3
+    %1:gprc = COPY %0.sub_32:g8rc
+    %2:gprc = RLWINM %1:gprc, 27, 5, 31
+    %3:gprc = RLWINM %2:gprc, 19, 0, 12
+    %4:gprc = RLWINM %2:gprc, 19, 0, 11
+    %5:gprc = RLWINM %2:gprc, 19, 0, 10
+    BLR8 implicit $lr8, implicit $rm
+...
diff --git a/llvm/test/CodeGen/PowerPC/peephole-replaceInstr-after-eliminate-extsw.mir b/llvm/test/CodeGen/PowerPC/peephole-replaceInstr-after-eliminate-extsw.mir
new file mode 100644
index 000000000000..71b1ad536810
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/peephole-replaceInstr-after-eliminate-extsw.mir
@@ -0,0 +1,695 @@
+# RUN: llc -run-pass=ppc-mi-peepholes  -mtriple powerpc64-ibm-aix-xcoff %s -o - \
+# RUN:   -verify-machineinstrs | FileCheck %s
+
+--- |
+  ; ModuleID = '71030_tmp_reduce-O2.ll'
+  source_filename = "71030_tmp_reduce.c"
+  target datalayout = "E-m:a-Fi64-i64:64-n32:64-S128-v256:256:256-v512:512:512"
+  target triple = "powerpc64-ibm-aix-xcoff"
+
+  @globalShortValue = local_unnamed_addr global i16 1, align 2
+  @globalCharValue = local_unnamed_addr global i8 0, align 1
+  @largeNumber = local_unnamed_addr global i64 -3664682556119382352, align 8
+  @someIntValue = local_unnamed_addr global i32 378441747, align 4
+  @unitIncrement = local_unnamed_addr global i32 1, align 4
+  @computedResultUll = local_unnamed_addr global i64 0, align 8
+  @computedResultShort = local_unnamed_addr global i16 0, align 2
+  @computedResultUChar = local_unnamed_addr global i8 0, align 1
+  @computedResultBool = local_unnamed_addr global i8 0, align 1
+  @computedResultChar = local_unnamed_addr global i8 0, align 1
+  @shortArray = local_unnamed_addr global [8 x i16] zeroinitializer, align 2
+  @charArray = local_unnamed_addr global [8 x [8 x [8 x i8]]] zeroinitializer, align 1
+  @longArray = local_unnamed_addr global [8 x [8 x i64]] zeroinitializer, align 8
+  @resultArray = local_unnamed_addr global [8 x [8 x i16]] zeroinitializer, align 2
+  @ullArray = local_unnamed_addr global [8 x i64] zeroinitializer, align 8
+  @intArray = local_unnamed_addr global [8 x [8 x [8 x i32]]] zeroinitializer, align 4
+  @_MergedGlobals = private constant <{ [29 x i8], [46 x i8] }> <{ [29 x i8] c"Computed Result (ULL): %llx\0A\00", [46 x i8] c"Computed convert largeNumber&&&& (ULL): %llx\0A\00" }>, align 1
+
+  @.str.1 = private alias [29 x i8], ptr @_MergedGlobals
+  @.str = private alias [46 x i8], getelementptr inbounds (<{ [29 x i8], [46 x i8] }>, ptr @_MergedGlobals, i32 0, i32 1)
+
+  ; Function Attrs: nofree nounwind
+  define noundef signext i32 @main() local_unnamed_addr #0 {
+  entry:
+    store i16 -1, ptr getelementptr inbounds ([8 x i16], ptr @shortArray, i64 0, i64 3), align 2, !tbaa !3
+    %0 = load i64, ptr @largeNumber, align 8, !tbaa !7
+    %conv = trunc i64 %0 to i32
+    %sext = shl i32 %conv, 16
+    %conv1 = ashr exact i32 %sext, 16
+    %sub = add nsw i32 %conv1, -1705
+    %call = tail call signext i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) getelementptr inbounds (<{ [29 x i8], [46 x i8] }>, ptr @_MergedGlobals, i32 0, i32 1), i32 noundef signext %sub)
+    %1 = load i16, ptr @globalShortValue, align 2, !tbaa !3
+    %2 = load i32, ptr @someIntValue, align 4, !tbaa !9
+    %3 = trunc i32 %2 to i8
+    %conv20 = add i8 %3, -19
+    %4 = load i32, ptr @unitIncrement, align 4
+    %5 = load i8, ptr @globalCharValue, align 1
+    %conv45 = sext i8 %5 to i32
+    %computedResultShort.promoted = load i16, ptr @computedResultShort, align 2, !tbaa !3
+    %resultArray.promoted = load i16, ptr @resultArray, align 2, !tbaa !3
+    %computedResultChar.promoted149 = load i8, ptr @computedResultChar, align 1, !tbaa !11
+    %6 = sext i8 %conv20 to i64
+    %7 = load i16, ptr getelementptr inbounds ([8 x i16], ptr @shortArray, i64 0, i64 3), align 2, !tbaa !3
+    %8 = load i16, ptr getelementptr inbounds ([8 x i16], ptr @shortArray, i64 0, i64 2), align 2
+    %conv46 = sext i16 %8 to i32
+    %cond54 = tail call i32 @llvm.smin.i32(i32 %conv45, i32 %conv46)
+    %tobool = icmp ne i32 %cond54, 0
+    %conv55 = zext i1 %tobool to i8
+    %9 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @ullArray, i64 0, i64 3), align 8
+    %tobool72 = icmp ne i64 %9, 0
+    %frombool = zext i1 %tobool72 to i8
+    %smax = tail call i64 @llvm.smax.i64(i64 %6, i64 4)
+    %10 = add nuw nsw i64 %smax, 3
+    %11 = sub i64 %10, %6
+    %12 = lshr i64 %11, 2
+    %13 = add nuw nsw i64 %12, 1
+    %n.vec = and i64 %13, 9223372036854775806
+    %14 = shl i64 %n.vec, 2
+    %ind.end = add i64 %14, %6
+    %15 = shl i64 %6, 2
+    %16 = shl i64 %6, 3
+    %17 = add nsw i64 %16, -64
+    %scevgep30 = getelementptr i8, ptr @longArray, i64 %17
+    %18 = add nsw i64 %15, 64
+    %scevgep31 = getelementptr i8, ptr @intArray, i64 %18
+    %19 = lshr i64 %13, 1
+    %20 = shl nuw nsw i64 %19, 1
+    %21 = add nsw i64 %20, -2
+    %22 = lshr i64 %21, 1
+    %23 = add nuw i64 %22, 1
+    br label %for.body16
+
+  for.cond.cleanup15:                               ; preds = %for.cond.cleanup25
+    %24 = tail call i16 @llvm.smin.i16(i16 %1, i16 %7)
+    %conv11.le = sext i16 %24 to i64
+    store i64 %conv11.le, ptr @computedResultUll, align 8, !tbaa !7
+    %call97 = tail call signext i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @_MergedGlobals, i64 noundef %conv11.le)
+    ret i32 0
+
+  for.body16:                                       ; preds = %for.cond.cleanup25, %entry
+    %lsr.iv29 = phi i32 [ %lsr.iv.next, %for.cond.cleanup25 ], [ 8, %entry ]
+    %conv36.lcssa132140 = phi i16 [ %computedResultShort.promoted, %entry ], [ %conv36.lcssa131, %for.cond.cleanup25 ]
+    %and.lcssa135139 = phi i16 [ %resultArray.promoted, %entry ], [ %and.lcssa134, %for.cond.cleanup25 ]
+    %conv81118.lcssa.lcssa137138 = phi i8 [ %computedResultChar.promoted149, %entry ], [ %conv81118.lcssa.lcssa136, %for.cond.cleanup25 ]
+    %25 = icmp slt i8 %conv20, 8
+    br i1 %25, label %for.body31.lr.ph, label %for.cond.cleanup25
+
+  for.body31.lr.ph:                                 ; preds = %for.body16
+    %26 = icmp ult i64 %11, 4
+    store i8 %conv55, ptr @computedResultUChar, align 1, !tbaa !11
+    br i1 %26, label %for.body31.preheader, label %vector.body.preheader
+
+  vector.body.preheader:                            ; preds = %for.body31.lr.ph
+    call void @llvm.set.loop.iterations.i64(i64 %23)
+    br label %vector.body
+
+  vector.body:                                      ; preds = %vector.body.preheader, %vector.body
+    %vec.phi = phi i16 [ %44, %vector.body ], [ %conv36.lcssa132140, %vector.body.preheader ]
+    %vec.phi159 = phi i16 [ %45, %vector.body ], [ 0, %vector.body.preheader ]
+    %vec.phi160 = phi i16 [ %46, %vector.body ], [ %and.lcssa135139, %vector.body.preheader ]
+    %vec.phi161 = phi i16 [ %47, %vector.body ], [ -1, %vector.body.preheader ]
+    %vec.phi162 = phi i8 [ %48, %vector.body ], [ %conv81118.lcssa.lcssa137138, %vector.body.preheader ]
+    %vec.phi163 = phi i8 [ %49, %vector.body ], [ 0, %vector.body.preheader ]
+    %27 = phi ptr [ %scevgep30, %vector.body.preheader ], [ %31, %vector.body ]
+    %28 = phi ptr [ %scevgep31, %vector.body.preheader ], [ %29, %vector.body ]
+    %29 = getelementptr i8, ptr %28, i64 32
+    %30 = getelementptr i8, ptr %29, i64 16
+    %31 = getelementptr i8, ptr %27, i64 64
+    %32 = getelementptr i8, ptr %31, i64 32
+    %33 = trunc i32 %4 to i16
+    %34 = load i64, ptr %31, align 8, !tbaa !7
+    %35 = load i64, ptr %32, align 8, !tbaa !7
+    %36 = trunc i64 %34 to i16
+    %37 = trunc i64 %35 to i16
+    %38 = load i32, ptr %29, align 4, !tbaa !9
+    %39 = load i32, ptr %30, align 4, !tbaa !9
+    %40 = trunc i32 %38 to i8
+    %41 = trunc i32 %39 to i8
+    %42 = mul i8 %40, -6
+    %43 = mul i8 %41, -6
+    %44 = sub i16 %vec.phi, %33
+    %45 = sub i16 %vec.phi159, %33
+    %46 = and i16 %vec.phi160, %36
+    %47 = and i16 %vec.phi161, %37
+    %48 = add i8 %42, %vec.phi162
+    %49 = add i8 %43, %vec.phi163
+    %50 = call i1 @llvm.loop.decrement.i64(i64 1)
+    br i1 %50, label %vector.body, label %middle.block, !llvm.loop !12
+
+  middle.block:                                     ; preds = %vector.body
+    %51 = icmp eq i64 %13, %n.vec
+    %bin.rdx = add i16 %45, %44
+    %bin.rdx164 = and i16 %47, %46
+    %bin.rdx165 = add i8 %49, %48
+    br i1 %51, label %for.cond21.for.cond.cleanup25_crit_edge, label %for.body31.preheader
+
+  for.body31.preheader:                             ; preds = %middle.block, %for.body31.lr.ph
+    %indvars.iv.ph = phi i64 [ %6, %for.body31.lr.ph ], [ %ind.end, %middle.block ]
+    %conv36121128.ph = phi i16 [ %conv36.lcssa132140, %for.body31.lr.ph ], [ %bin.rdx, %middle.block ]
+    %and122127.ph = phi i16 [ %and.lcssa135139, %for.body31.lr.ph ], [ %bin.rdx164, %middle.block ]
+    %conv81118.lcssa124126.ph = phi i8 [ %conv81118.lcssa.lcssa137138, %for.body31.lr.ph ], [ %bin.rdx165, %middle.block ]
+    %52 = shl i64 %indvars.iv.ph, 2
+    %53 = shl i64 %indvars.iv.ph, 3
+    %scevgep = getelementptr i8, ptr getelementptr ([8 x [8 x i64]], ptr @longArray, i64 -1, i64 7, i64 4), i64 %53
+    %scevgep32 = getelementptr i8, ptr getelementptr inbounds ([8 x [8 x [8 x i32]]], ptr @intArray, i64 0, i64 0, i64 2, i64 4), i64 %52
+    %smax33 = call i64 @llvm.smax.i64(i64 %indvars.iv.ph, i64 4)
+    %54 = add i64 %smax33, 3
+    %55 = sub i64 %54, %indvars.iv.ph
+    %56 = lshr i64 %55, 2
+    %57 = add nuw nsw i64 %56, 1
+    call void @llvm.set.loop.iterations.i64(i64 %57)
+    br label %for.body31
+
+  for.cond21.for.cond.cleanup25_crit_edge:          ; preds = %for.body31, %middle.block
+    %conv36.lcssa = phi i16 [ %bin.rdx, %middle.block ], [ %conv36, %for.body31 ]
+    %and.lcssa = phi i16 [ %bin.rdx164, %middle.block ], [ %and, %for.body31 ]
+    %.lcssa = phi i8 [ %bin.rdx165, %middle.block ], [ %67, %for.body31 ]
+    %58 = trunc i16 %1 to i8
+    store i16 %conv36.lcssa, ptr @computedResultShort, align 2, !tbaa !3
+    store i8 %58, ptr getelementptr inbounds ([8 x [8 x [8 x i8]]], ptr @charArray, i64 0, i64 2, i64 0, i64 3), align 1, !tbaa !11
+    store i16 %and.lcssa, ptr @resultArray, align 2, !tbaa !3
+    store i8 %frombool, ptr @computedResultBool, align 1, !tbaa !16
+    store i8 %.lcssa, ptr @computedResultChar, align 1, !tbaa !11
+    br label %for.cond.cleanup25
+
+  for.cond.cleanup25:                               ; preds = %for.cond21.for.cond.cleanup25_crit_edge, %for.body16
+    %conv81118.lcssa.lcssa136 = phi i8 [ %.lcssa, %for.cond21.for.cond.cleanup25_crit_edge ], [ %conv81118.lcssa.lcssa137138, %for.body16 ]
+    %and.lcssa134 = phi i16 [ %and.lcssa, %for.cond21.for.cond.cleanup25_crit_edge ], [ %and.lcssa135139, %for.body16 ]
+    %conv36.lcssa131 = phi i16 [ %conv36.lcssa, %for.cond21.for.cond.cleanup25_crit_edge ], [ %conv36.lcssa132140, %for.body16 ]
+    %lsr.iv.next = add nsw i32 %lsr.iv29, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    br i1 %exitcond.not, label %for.cond.cleanup15, label %for.body16, !llvm.loop !18
+
+  for.body31:                                       ; preds = %for.body31, %for.body31.preheader
+    %conv36121128 = phi i16 [ %conv36, %for.body31 ], [ %conv36121128.ph, %for.body31.preheader ]
+    %and122127 = phi i16 [ %and, %for.body31 ], [ %and122127.ph, %for.body31.preheader ]
+    %conv81118.lcssa124126 = phi i8 [ %67, %for.body31 ], [ %conv81118.lcssa124126.ph, %for.body31.preheader ]
+    %59 = phi ptr [ %scevgep, %for.body31.preheader ], [ %62, %for.body31 ]
+    %60 = phi ptr [ %scevgep32, %for.body31.preheader ], [ %61, %for.body31 ]
+    %61 = getelementptr i8, ptr %60, i64 16
+    %62 = getelementptr i8, ptr %59, i64 32
+    %63 = trunc i32 %4 to i16
+    %64 = load i64, ptr %62, align 8, !tbaa !7
+    %conv61 = trunc i64 %64 to i16
+    %65 = load i32, ptr %61, align 4, !tbaa !9
+    %66 = trunc i32 %65 to i8
+    %.neg = mul i8 %66, -6
+    %conv36 = sub i16 %conv36121128, %63
+    %and = and i16 %and122127, %conv61
+    %67 = add i8 %.neg, %conv81118.lcssa124126
+    %68 = call i1 @llvm.loop.decrement.i64(i64 1)
+    br i1 %68, label %for.body31, label %for.cond21.for.cond.cleanup25_crit_edge, !llvm.loop !19
+  }
+
+  ; Function Attrs: nofree nounwind
+  declare noundef signext i32 @printf(ptr nocapture noundef readonly, ...) local_unnamed_addr #0
+
+  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+  declare i32 @llvm.smin.i32(i32, i32) #1
+
+  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+  declare i16 @llvm.smin.i16(i16, i16) #1
+
+  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+  declare i64 @llvm.smax.i64(i64, i64) #1
+
+  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+  declare void @llvm.set.loop.iterations.i64(i64) #2
+
+  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+  declare i1 @llvm.loop.decrement.i64(i64) #2
+
+  attributes #0 = { nofree nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pwr7" "target-features"="+altivec,+bpermd,+extdiv,+isa-v206-instructions,+vsx,-aix-small-local-exec-tls,-crbits,-crypto,-direct-move,-htm,-isa-v207-instructions,-isa-v30-instructions,-power8-vector,-power9-vector,-privileged,-quadword-atomics,-rop-protect,-spe" }
+  attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+  attributes #2 = { nocallback noduplicate nofree nosync nounwind willreturn }
+
+  !llvm.module.flags = !{!0, !1}
+  !llvm.ident = !{!2}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{i32 8, !"PIC Level", i32 2}
+  !2 = !{!"IBM Open XL C/C++ for AIX 17.1.3 (5725-C72, 5765-J18), version 17.1.3.0, clang version 19.0.0git"}
+  !3 = !{!4, !4, i64 0}
+  !4 = !{!"short", !5, i64 0}
+  !5 = !{!"omnipotent char", !6, i64 0}
+  !6 = !{!"Simple C/C++ TBAA"}
+  !7 = !{!8, !8, i64 0}
+  !8 = !{!"long long", !5, i64 0}
+  !9 = !{!10, !10, i64 0}
+  !10 = !{!"int", !5, i64 0}
+  !11 = !{!5, !5, i64 0}
+  !12 = distinct !{!12, !13, !14, !15}
+  !13 = !{!"llvm.loop.mustprogress"}
+  !14 = !{!"llvm.loop.isvectorized", i32 1}
+  !15 = !{!"llvm.loop.unroll.runtime.disable"}
+  !16 = !{!17, !17, i64 0}
+  !17 = !{!"_Bool", !5, i64 0}
+  !18 = distinct !{!18, !13}
+  !19 = distinct !{!19, !13, !14}
+
+...
+---
+name:            main
+alignment:       16
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHCatchret:   false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: gprc, preferred-register: '' }
+  - { id: 1, class: gprc, preferred-register: '' }
+  - { id: 2, class: gprc, preferred-register: '' }
+  - { id: 3, class: gprc, preferred-register: '' }
+  - { id: 4, class: gprc, preferred-register: '' }
+  - { id: 5, class: gprc, preferred-register: '' }
+  - { id: 6, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 7, class: gprc, preferred-register: '' }
+  - { id: 8, class: gprc, preferred-register: '' }
+  - { id: 9, class: gprc, preferred-register: '' }
+  - { id: 10, class: g8rc, preferred-register: '' }
+  - { id: 11, class: g8rc, preferred-register: '' }
+  - { id: 12, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 13, class: g8rc, preferred-register: '' }
+  - { id: 14, class: g8rc, preferred-register: '' }
+  - { id: 15, class: g8rc, preferred-register: '' }
+  - { id: 16, class: g8rc, preferred-register: '' }
+  - { id: 17, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 18, class: gprc, preferred-register: '' }
+  - { id: 19, class: gprc, preferred-register: '' }
+  - { id: 20, class: gprc, preferred-register: '' }
+  - { id: 21, class: gprc, preferred-register: '' }
+  - { id: 22, class: gprc, preferred-register: '' }
+  - { id: 23, class: gprc, preferred-register: '' }
+  - { id: 24, class: gprc, preferred-register: '' }
+  - { id: 25, class: gprc, preferred-register: '' }
+  - { id: 26, class: gprc, preferred-register: '' }
+  - { id: 27, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 28, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 29, class: g8rc, preferred-register: '' }
+  - { id: 30, class: g8rc, preferred-register: '' }
+  - { id: 31, class: gprc, preferred-register: '' }
+  - { id: 32, class: gprc, preferred-register: '' }
+  - { id: 33, class: gprc, preferred-register: '' }
+  - { id: 34, class: gprc, preferred-register: '' }
+  - { id: 35, class: gprc, preferred-register: '' }
+  - { id: 36, class: gprc, preferred-register: '' }
+  - { id: 37, class: gprc, preferred-register: '' }
+  - { id: 38, class: gprc, preferred-register: '' }
+  - { id: 39, class: gprc, preferred-register: '' }
+  - { id: 40, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 41, class: gprc, preferred-register: '' }
+  - { id: 42, class: gprc, preferred-register: '' }
+  - { id: 43, class: gprc, preferred-register: '' }
+  - { id: 44, class: g8rc, preferred-register: '' }
+  - { id: 45, class: g8rc, preferred-register: '' }
+  - { id: 46, class: gprc, preferred-register: '' }
+  - { id: 47, class: gprc, preferred-register: '' }
+  - { id: 48, class: gprc, preferred-register: '' }
+  - { id: 49, class: gprc, preferred-register: '' }
+  - { id: 50, class: gprc, preferred-register: '' }
+  - { id: 51, class: gprc, preferred-register: '' }
+  - { id: 52, class: gprc, preferred-register: '' }
+  - { id: 53, class: gprc, preferred-register: '' }
+  - { id: 54, class: gprc, preferred-register: '' }
+  - { id: 55, class: gprc, preferred-register: '' }
+  - { id: 56, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 57, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 58, class: g8rc, preferred-register: '' }
+  - { id: 59, class: g8rc, preferred-register: '' }
+  - { id: 60, class: gprc, preferred-register: '' }
+  - { id: 61, class: gprc, preferred-register: '' }
+  - { id: 62, class: gprc, preferred-register: '' }
+  - { id: 63, class: gprc, preferred-register: '' }
+  - { id: 64, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 65, class: g8rc, preferred-register: '' }
+  - { id: 66, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 67, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 68, class: gprc, preferred-register: '' }
+  - { id: 69, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 70, class: g8rc, preferred-register: '' }
+  - { id: 71, class: g8rc, preferred-register: '' }
+  - { id: 72, class: g8rc, preferred-register: '' }
+  - { id: 73, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 74, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 75, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 76, class: gprc, preferred-register: '' }
+  - { id: 77, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 78, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 79, class: gprc, preferred-register: '' }
+  - { id: 80, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 81, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 82, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 83, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 84, class: g8rc, preferred-register: '' }
+  - { id: 85, class: g8rc, preferred-register: '' }
+  - { id: 86, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 87, class: crrc, preferred-register: '' }
+  - { id: 88, class: gprc, preferred-register: '' }
+  - { id: 89, class: crrc, preferred-register: '' }
+  - { id: 90, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 91, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 92, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 93, class: g8rc, preferred-register: '' }
+  - { id: 94, class: crrc, preferred-register: '' }
+  - { id: 95, class: gprc, preferred-register: '' }
+  - { id: 96, class: gprc, preferred-register: '' }
+  - { id: 97, class: crrc, preferred-register: '' }
+  - { id: 98, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 99, class: g8rc, preferred-register: '' }
+  - { id: 100, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 101, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 102, class: g8rc, preferred-register: '' }
+  - { id: 103, class: g8rc, preferred-register: '' }
+  - { id: 104, class: g8rc, preferred-register: '' }
+  - { id: 105, class: g8rc, preferred-register: '' }
+  - { id: 106, class: g8rc, preferred-register: '' }
+  - { id: 107, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 108, class: g8rc, preferred-register: '' }
+  - { id: 109, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 110, class: g8rc, preferred-register: '' }
+  - { id: 111, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 112, class: crrc, preferred-register: '' }
+  - { id: 113, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 114, class: crrc, preferred-register: '' }
+  - { id: 115, class: gprc, preferred-register: '' }
+  - { id: 116, class: gprc, preferred-register: '' }
+  - { id: 117, class: gprc, preferred-register: '' }
+  - { id: 118, class: gprc, preferred-register: '' }
+  - { id: 119, class: gprc, preferred-register: '' }
+  - { id: 120, class: gprc, preferred-register: '' }
+  - { id: 121, class: gprc, preferred-register: '' }
+  - { id: 122, class: gprc, preferred-register: '' }
+  - { id: 123, class: gprc, preferred-register: '' }
+  - { id: 124, class: gprc, preferred-register: '' }
+  - { id: 125, class: crbitrc, preferred-register: '' }
+  - { id: 126, class: crrc, preferred-register: '' }
+  - { id: 127, class: g8rc, preferred-register: '' }
+  - { id: 128, class: g8rc, preferred-register: '' }
+  - { id: 129, class: g8rc, preferred-register: '' }
+  - { id: 130, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 131, class: g8rc, preferred-register: '' }
+  - { id: 132, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 133, class: crrc, preferred-register: '' }
+  - { id: 134, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 135, class: g8rc, preferred-register: '' }
+  - { id: 136, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 137, class: g8rc, preferred-register: '' }
+  - { id: 138, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 139, class: g8rc, preferred-register: '' }
+  - { id: 140, class: gprc, preferred-register: '' }
+  - { id: 141, class: gprc, preferred-register: '' }
+  - { id: 142, class: gprc, preferred-register: '' }
+  - { id: 143, class: crbitrc, preferred-register: '' }
+  - { id: 144, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 145, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 146, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 147, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 148, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 149, class: crrc, preferred-register: '' }
+  - { id: 150, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 151, class: gprc_and_gprc_nor0, preferred-register: '' }
+  - { id: 152, class: crrc, preferred-register: '' }
+  - { id: 153, class: gprc, preferred-register: '' }
+  - { id: 154, class: g8rc, preferred-register: '' }
+  - { id: 155, class: g8rc_and_g8rc_nox0, preferred-register: '' }
+  - { id: 156, class: g8rc, preferred-register: '' }
+  - { id: 157, class: g8rc, preferred-register: '' }
+  - { id: 158, class: g8rc, preferred-register: '' }
+liveins:         []
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        true
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x80000000)
+  
+    %64:g8rc_and_g8rc_nox0 = LDtoc @shortArray, $x2 :: (load (s64) from got)
+    %65:g8rc = LI8 -1
+    STH8 killed %65, 6, %64 :: (store (s16) into `ptr getelementptr inbounds ([8 x i16], ptr @shortArray, i64 0, i64 3)`, !tbaa !3)
+    %66:g8rc_and_g8rc_nox0 = LDtoc @largeNumber, $x2 :: (load (s64) from got)
+    %67:gprc_and_gprc_nor0 = LHA 6, killed %66 :: (dereferenceable load (s16) from @largeNumber + 6, basealign 8, !tbaa !7)
+    %68:gprc = ADDI killed %67, -1705
+    ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+    %69:g8rc_and_g8rc_nox0 = LDtoc @_MergedGlobals, $x2 :: (load (s64) from got)
+    %70:g8rc = nuw ADDI8 killed %69, 29
+    %71:g8rc = EXTSW_32_64 killed %68
+    $x3 = COPY %70
+    $x4 = COPY %71
+    BL8_NOP <mcsymbol .printf>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def $x3
+    ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+    %73:g8rc_and_g8rc_nox0 = LDtoc @globalShortValue, $x2 :: (load (s64) from got)
+    %0:gprc = LHZ 0, killed %73 :: (dereferenceable load (s16) from @globalShortValue, !tbaa !3)
+    %74:g8rc_and_g8rc_nox0 = LDtoc @someIntValue, $x2 :: (load (s64) from got)
+    %75:gprc_and_gprc_nor0 = LBZ 3, killed %74 :: (dereferenceable load (s8) from @someIntValue + 3, basealign 4, !tbaa !9)
+    %76:gprc = ADDI killed %75, -19
+    %1:gprc = EXTSB %76
+    %77:g8rc_and_g8rc_nox0 = LDtoc @unitIncrement, $x2 :: (load (s64) from got)
+    %2:gprc = LWZ 0, killed %77 :: (dereferenceable load (s32) from @unitIncrement)
+    %78:g8rc_and_g8rc_nox0 = LDtoc @globalCharValue, $x2 :: (load (s64) from got)
+    %79:gprc = LBZ 0, killed %78 :: (dereferenceable load (s8) from @globalCharValue)
+    %80:gprc_and_gprc_nor0 = EXTSB killed %79
+    %81:g8rc_and_g8rc_nox0 = LDtoc @computedResultShort, $x2 :: (load (s64) from got)
+    %3:gprc = LHZ 0, %81 :: (dereferenceable load (s16) from @computedResultShort, !tbaa !3)
+    %82:g8rc_and_g8rc_nox0 = LDtoc @resultArray, $x2 :: (load (s64) from got)
+    %4:gprc = LHZ 0, %82 :: (dereferenceable load (s16) from @resultArray, !tbaa !3)
+    %83:g8rc_and_g8rc_nox0 = LDtoc @computedResultChar, $x2 :: (load (s64) from got)
+    %5:gprc = LBZ 0, %83 :: (dereferenceable load (s8) from @computedResultChar, !tbaa !11)
+    %85:g8rc = IMPLICIT_DEF
+    %84:g8rc = INSERT_SUBREG %85, %76, %subreg.sub_32
+    %6:g8rc_and_g8rc_nox0 = EXTSB8 killed %84
+    %7:gprc = LHZ 6, %64 :: (dereferenceable load (s16) from `ptr getelementptr inbounds ([8 x i16], ptr @shortArray, i64 0, i64 3)`, !tbaa !3)
+    %86:gprc_and_gprc_nor0 = LHA 4, %64 :: (dereferenceable load (s16) from `ptr getelementptr inbounds ([8 x i16], ptr @shortArray, i64 0, i64 2)`)
+    ; CHECK:  [[VIRREG1:%[0-9]+]]:gprc_and_gprc_nor0 = LHA 6, %64 :: (dereferenceable load (s16) from `ptr getelementptr inbounds ([8 x i16], ptr @shortArray, i64 0, i64 3)`, !tbaa !3) 
+    %87:crrc = CMPW %80, %86
+    %88:gprc = ISEL %80, %86, %87.sub_lt
+    %89:crrc = CMPLWI killed %88, 0
+    %91:gprc_and_gprc_nor0 = LI 1
+    %8:gprc = ISEL $zero, %91, %89.sub_eq
+    %92:g8rc_and_g8rc_nox0 = LDtoc @ullArray, $x2 :: (load (s64) from got)
+    %93:g8rc = LD 24, killed %92 :: (dereferenceable load (s64) from `ptr getelementptr inbounds ([8 x i64], ptr @ullArray, i64 0, i64 3)`)
+    %94:crrc = CMPLDI killed %93, 0
+    $cr7 = COPY %94
+    %95:gprc = MFOCRF $cr7
+    %96:gprc = RLWINM killed %95, 31, 31, 31
+    %9:gprc = XORI killed %96, 1
+    %97:crrc = CMPDI %6, 4
+    %98:g8rc_and_g8rc_nox0 = LI8 4
+    %99:g8rc = ISEL8 %6, %98, %97.sub_gt
+    %100:g8rc_and_g8rc_nox0 = SUBF8 %6, killed %99
+    %10:g8rc = ADDI8 killed %100, 3
+    %101:g8rc_and_g8rc_nox0 = RLDICL %10, 62, 2
+    %11:g8rc = nuw nsw ADDI8 killed %101, 1
+    %102:g8rc = RLDICL %11, 63, 1
+    %12:g8rc_and_g8rc_nox0 = RLDICL killed %102, 1, 1
+    %103:g8rc = RLDICR %11, 2, 60
+    %13:g8rc = ADD8 killed %103, %6
+    %104:g8rc = RLDICR %6, 2, 61
+    %105:g8rc = RLDICR %6, 3, 60
+    %106:g8rc = LDtoc @longArray, $x2 :: (load (s64) from got)
+    %107:g8rc_and_g8rc_nox0 = ADD8 killed %105, %106
+    %14:g8rc = ADDI8 killed %107, -64
+    %108:g8rc = LDtoc @intArray, $x2 :: (load (s64) from got)
+    %109:g8rc_and_g8rc_nox0 = ADD8 killed %104, %108
+    %15:g8rc = ADDI8 killed %109, 64
+    %110:g8rc = nsw ADDI8 %12, -2
+    %111:g8rc_and_g8rc_nox0 = RLDICL %110, 63, 1
+    %16:g8rc = nuw ADDI8 killed %111, 1
+    %63:gprc = LI 8
+    %112:crrc = CMPWI %1, 7
+    %113:g8rc_and_g8rc_nox0 = LDtoc @computedResultUChar, $x2 :: (load (s64) from got)
+    %114:crrc = CMPLDI %10, 4
+    %118:gprc = LIS 0
+    %116:gprc = ORI %118, 65535
+    %126:crrc = CMPLD %11, %12
+    B %bb.2
+  
+  bb.1.for.cond.cleanup15:
+    %150:gprc_and_gprc_nor0 = EXTSH %7
+    %151:gprc_and_gprc_nor0 = EXTSH %0
+
+    ; CHECK:      [[VIRREG2:%[0-9]+]]:gprc_and_gprc_nor0 = EXTSH killed %0 
+    %152:crrc = CMPW %151, %150
+    %153:gprc = ISEL %151, %150, %152.sub_lt
+    %154:g8rc = EXTSW_32_64 killed %153
+    ; CHECK-NEXT: [[VIRREG3:%[0-9]+]]:crrc = CMPW [[VIRREG2]], [[VIRREG1]]
+    ; CHECK-NEXT: %153:gprc = ISEL killed [[VIRREG2]], killed [[VIRREG1]], killed [[VIRREG3]].sub_lt
+    ; CHECK-NOT: EXTSW_32_64
+    %155:g8rc_and_g8rc_nox0 = LDtoc @computedResultUll, $x2 :: (load (s64) from got)
+    STD %154, 0, killed %155 :: (store (s64) into @computedResultUll, !tbaa !7)
+    ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+    %156:g8rc = LDtoc @_MergedGlobals, $x2 :: (load (s64) from got)
+    $x3 = COPY %156
+    $x4 = COPY %154
+    BL8_NOP <mcsymbol .printf>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def $x3
+    ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+    %158:g8rc = LI8 0
+    $x3 = COPY %158
+    BLR8 implicit $lr8, implicit $rm, implicit $x3
+  
+  bb.2.for.body16:
+    successors: %bb.3(0x40000000), %bb.9(0x40000000)
+  
+    %17:gprc_and_gprc_nor0 = PHI %63, %bb.0, %52, %bb.9
+    %18:gprc = PHI %3, %bb.0, %51, %bb.9
+    %19:gprc = PHI %4, %bb.0, %50, %bb.9
+    %20:gprc = PHI %5, %bb.0, %49, %bb.9
+    BCC 44, %112, %bb.9
+    B %bb.3
+  
+  bb.3.for.body31.lr.ph:
+    successors: %bb.7(0x40000000), %bb.4(0x40000000)
+  
+    STB %8, 0, %113 :: (store (s8) into @computedResultUChar, !tbaa !11)
+    BCC 12, %114, %bb.7
+    B %bb.4
+  
+  bb.4.vector.body.preheader:
+    successors: %bb.5(0x80000000)
+  
+    MTCTR8loop %16, implicit-def dead $ctr8
+    %117:gprc = LI 0
+    %115:gprc = COPY %117
+  
+  bb.5.vector.body:
+    successors: %bb.5(0x7c000000), %bb.6(0x04000000)
+  
+    %21:gprc = PHI %18, %bb.4, %31, %bb.5
+    %22:gprc = PHI %115, %bb.4, %32, %bb.5
+    %23:gprc = PHI %19, %bb.4, %33, %bb.5
+    %24:gprc = PHI %116, %bb.4, %34, %bb.5
+    %25:gprc = PHI %20, %bb.4, %35, %bb.5
+    %26:gprc = PHI %117, %bb.4, %36, %bb.5
+    %27:g8rc_and_g8rc_nox0 = PHI %14, %bb.4, %30, %bb.5
+    %28:g8rc_and_g8rc_nox0 = PHI %15, %bb.4, %29, %bb.5
+    %29:g8rc = ADDI8 %28, 32
+    %30:g8rc = ADDI8 %27, 64
+    %119:gprc = LHZ 70, %27 :: (load (s16) from %ir.31 + 6, basealign 8, !tbaa !7)
+    %120:gprc = LHZ 102, %27 :: (load (s16) from %ir.32 + 6, basealign 8, !tbaa !7)
+    %121:gprc = LBZ 35, %28 :: (load (s8) from %ir.29 + 3, basealign 4, !tbaa !9)
+    %122:gprc = LBZ 51, %28 :: (load (s8) from %ir.30 + 3, basealign 4, !tbaa !9)
+    %123:gprc = MULLI killed %121, -6
+    %124:gprc = MULLI killed %122, -6
+    %31:gprc = SUBF %2, %21
+    %32:gprc = SUBF %2, %22
+    %33:gprc = AND %23, killed %119
+    %34:gprc = AND %24, killed %120
+    %35:gprc = ADD4 killed %123, %25
+    %36:gprc = ADD4 killed %124, %26
+    BDNZ8 %bb.5, implicit-def $ctr8, implicit $ctr8
+    B %bb.6
+  
+  bb.6.middle.block:
+    successors: %bb.8(0x40000000), %bb.7(0x40000000)
+  
+    %37:gprc = ADD4 %32, %31
+    %38:gprc = AND %34, %33
+    %39:gprc = ADD4 %36, %35
+    BCC 76, %126, %bb.8
+    B %bb.7
+  
+  bb.7.for.body31.preheader:
+    successors: %bb.10(0x80000000)
+  
+    %40:g8rc_and_g8rc_nox0 = PHI %6, %bb.3, %13, %bb.6
+    %41:gprc = PHI %18, %bb.3, %37, %bb.6
+    %42:gprc = PHI %19, %bb.3, %38, %bb.6
+    %43:gprc = PHI %20, %bb.3, %39, %bb.6
+    %127:g8rc = RLDICR %40, 2, 61
+    %128:g8rc = RLDICR %40, 3, 60
+    %130:g8rc_and_g8rc_nox0 = ADD8 %106, killed %128
+    %44:g8rc = ADDI8 killed %130, -32
+    %132:g8rc_and_g8rc_nox0 = ADD8 %108, killed %127
+    %45:g8rc = ADDI8 killed %132, 80
+    %133:crrc = CMPDI %40, 4
+    %135:g8rc = ISEL8 %40, %98, %133.sub_gt
+    %136:g8rc_and_g8rc_nox0 = SUBF8 %40, killed %135
+    %137:g8rc = ADDI8 killed %136, 3
+    %138:g8rc_and_g8rc_nox0 = RLDICL %137, 62, 2
+    %139:g8rc = nuw nsw ADDI8 killed %138, 1
+    MTCTR8loop killed %139, implicit-def dead $ctr8
+    B %bb.10
+  
+  bb.8.for.cond21.for.cond.cleanup25_crit_edge:
+    successors: %bb.9(0x80000000)
+  
+    %46:gprc = PHI %37, %bb.6, %60, %bb.10
+    %47:gprc = PHI %38, %bb.6, %61, %bb.10
+    %48:gprc = PHI %39, %bb.6, %62, %bb.10
+    STH %46, 0, %81 :: (store (s16) into @computedResultShort, !tbaa !3)
+    %145:g8rc_and_g8rc_nox0 = LDtoc @charArray, $x2 :: (load (s64) from got)
+    STB %0, 131, killed %145 :: (store (s8) into `ptr getelementptr inbounds ([8 x [8 x [8 x i8]]], ptr @charArray, i64 0, i64 2, i64 0, i64 3)`, !tbaa !11)
+    STH %47, 0, %82 :: (store (s16) into @resultArray, !tbaa !3)
+    %147:g8rc_and_g8rc_nox0 = LDtoc @computedResultBool, $x2 :: (load (s64) from got)
+    STB %9, 0, killed %147 :: (store (s8) into @computedResultBool, !tbaa !16)
+    STB %48, 0, %83 :: (store (s8) into @computedResultChar, !tbaa !11)
+  
+  bb.9.for.cond.cleanup25:
+    successors: %bb.1(0x04000000), %bb.2(0x7c000000)
+  
+    %49:gprc = PHI %20, %bb.2, %48, %bb.8
+    %50:gprc = PHI %19, %bb.2, %47, %bb.8
+    %51:gprc = PHI %18, %bb.2, %46, %bb.8
+    %52:gprc = nsw ADDI %17, -1
+    %149:crrc = CMPLWI %52, 0
+    BCC 76, killed %149, %bb.1
+    B %bb.2
+  
+  bb.10.for.body31:
+    successors: %bb.10(0x7c000000), %bb.8(0x04000000)
+  
+    %53:gprc = PHI %41, %bb.7, %60, %bb.10
+    %54:gprc = PHI %42, %bb.7, %61, %bb.10
+    %55:gprc = PHI %43, %bb.7, %62, %bb.10
+    %56:g8rc_and_g8rc_nox0 = PHI %44, %bb.7, %59, %bb.10
+    %57:g8rc_and_g8rc_nox0 = PHI %45, %bb.7, %58, %bb.10
+    %58:g8rc = ADDI8 %57, 16
+    %59:g8rc = ADDI8 %56, 32
+    %140:gprc = LHZ 38, %56 :: (load (s16) from %ir.62 + 6, basealign 8, !tbaa !7)
+    %141:gprc = LBZ 19, %57 :: (load (s8) from %ir.61 + 3, basealign 4, !tbaa !9)
+    %142:gprc = MULLI killed %141, -6
+    %60:gprc = SUBF %2, %53
+    %61:gprc = AND %54, killed %140
+    %62:gprc = ADD4 killed %142, %55
+    BDNZ8 %bb.10, implicit-def $ctr8, implicit $ctr8
+    B %bb.8
+
+...
+
+
diff --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll
index d2ee80e6aa95..f032756e007b 100644
--- a/llvm/test/CodeGen/RISCV/alu64.ll
+++ b/llvm/test/CodeGen/RISCV/alu64.ll
@@ -57,8 +57,8 @@ define i64 @sltiu(i64 %a) nounwind {
 ;
 ; RV32I-LABEL: sltiu:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    sltiu a0, a0, 3
+; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    and a0, a1, a0
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index f96e1bad2e38..a5a2ae79966c 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -372,10 +372,10 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 ; RV32IA-NEXT:    # =>This Loop Header: Depth=1
 ; RV32IA-NEXT:    # Child Loop BB2_3 Depth 2
 ; RV32IA-NEXT:    mv a3, a2
-; RV32IA-NEXT:    addi a4, a2, 1
-; RV32IA-NEXT:    sltu a2, a2, a1
-; RV32IA-NEXT:    neg a2, a2
-; RV32IA-NEXT:    and a4, a2, a4
+; RV32IA-NEXT:    addi a2, a2, 1
+; RV32IA-NEXT:    sltu a4, a3, a1
+; RV32IA-NEXT:    neg a4, a4
+; RV32IA-NEXT:    and a4, a4, a2
 ; RV32IA-NEXT:  .LBB2_3: # %atomicrmw.start
 ; RV32IA-NEXT:    # Parent Loop BB2_1 Depth=1
 ; RV32IA-NEXT:    # => This Inner Loop Header: Depth=2
@@ -607,10 +607,10 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; RV64IA-NEXT:    # =>This Loop Header: Depth=1
 ; RV64IA-NEXT:    # Child Loop BB3_3 Depth 2
 ; RV64IA-NEXT:    mv a3, a2
-; RV64IA-NEXT:    addi a4, a2, 1
-; RV64IA-NEXT:    sltu a2, a2, a1
-; RV64IA-NEXT:    neg a2, a2
-; RV64IA-NEXT:    and a4, a2, a4
+; RV64IA-NEXT:    addi a2, a2, 1
+; RV64IA-NEXT:    sltu a4, a3, a1
+; RV64IA-NEXT:    neg a4, a4
+; RV64IA-NEXT:    and a4, a4, a2
 ; RV64IA-NEXT:  .LBB3_3: # %atomicrmw.start
 ; RV64IA-NEXT:    # Parent Loop BB3_1 Depth=1
 ; RV64IA-NEXT:    # => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index 141d5ea41828..7bd3440c9dc0 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -256,6 +256,19 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-supm %s -o - | FileCheck --check-prefix=RV64SUPM %s
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-ssqosid %s -o - | FileCheck --check-prefix=RV64SSQOSID %s
 
+; Tests for profile features.
+; RUN: llc -mtriple=riscv32 -mattr=+rvi20u32 %s -o - | FileCheck --check-prefix=RVI20U32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rvi20u64 %s -o - | FileCheck --check-prefix=RVI20U64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rva20u64 %s -o - | FileCheck --check-prefix=RVA20U64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rva20s64 %s -o - | FileCheck --check-prefix=RVA20S64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rva22u64 %s -o - | FileCheck --check-prefix=RVA22U64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rva22s64 %s -o - | FileCheck --check-prefix=RVA22S64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rva23u64 %s -o - | FileCheck --check-prefix=RVA23U64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rva23s64 %s -o - | FileCheck --check-prefix=RVA23S64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rvb23u64 %s -o - | FileCheck --check-prefix=RVB23U64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rvb23s64 %s -o - | FileCheck --check-prefix=RVB23S64 %s
+; RUN: llc -mtriple=riscv32 -mattr=+rvm23u32 %s -o - | FileCheck --check-prefix=RVM23U32 %s
+
 ; CHECK: .attribute 4, 16
 
 ; RV32M: .attribute 5, "rv32i2p1_m2p0"
@@ -512,6 +525,18 @@
 ; RV64SUPM: .attribute 5, "rv64i2p1_supm0p8"
 ; RV64SSQOSID: .attribute 5, "rv64i2p1_ssqosid1p0"
 
+; RVI20U32: .attribute 5, "rv32i2p1"
+; RVI20U64: .attribute 5, "rv64i2p1"
+; RVA20U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_za128rs1p0"
+; RVA20S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zifencei2p0_za128rs1p0_ssccptr1p0_sstvala1p0_sstvecd1p0_svade1p0_svbare1p0"
+; RVA22U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zihintpause2p0_zihpm2p0_za64rs1p0_zfhmin1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0"
+; RVA22S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zifencei2p0_zihintpause2p0_zihpm2p0_za64rs1p0_zfhmin1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_ssccptr1p0_sscounterenw1p0_sstvala1p0_sstvecd1p0_svade1p0_svbare1p0_svinval1p0_svpbmt1p0"
+; RVA23U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_v1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_za64rs1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0"
+; RVA23S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_v1p0_h1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_za64rs1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_shcounterenw1p0_shgatpa1p0_shtvala1p0_shvsatpa1p0_shvstvala1p0_shvstvecd1p0_ssccptr1p0_sscofpmf1p0_sscounterenw1p0_ssnpm0p8_ssstateen1p0_sstc1p0_sstvala1p0_sstvecd1p0_ssu64xl1p0_svade1p0_svbare1p0_svinval1p0_svnapot1p0_svpbmt1p0"
+; RVB23U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_za64rs1p0_zawrs1p0_zfa1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0"
+; RVB23S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_za64rs1p0_zawrs1p0_zfa1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_ssccptr1p0_sscofpmf1p0_sscounterenw1p0_sstc1p0_sstvala1p0_sstvecd1p0_ssu64xl1p0_svade1p0_svbare1p0_svinval1p0_svnapot1p0_svpbmt1p0"
+; RVM23U32: .attribute 5, "rv32i2p1_m2p0_zicbop1p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zimop1p0_zca1p0_zcb1p0_zce1p0_zcmop1p0_zcmp1p0_zcmt1p0_zba1p0_zbb1p0_zbs1p0"
+
 define i32 @addi(i32 %a) {
   %1 = add i32 %a, 1
   ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
index 9e2b0b5c3cbb..770dcccee882 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
@@ -456,92 +456,80 @@ define i64 @fcvt_l_bf16(bfloat %a) nounwind {
 define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
 ; RV32IZFBFMIN-LABEL: fcvt_l_bf16_sat:
 ; RV32IZFBFMIN:       # %bb.0: # %start
-; RV32IZFBFMIN-NEXT:    addi sp, sp, -32
-; RV32IZFBFMIN-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT:    fsw fs0, 8(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT:    lui a0, %hi(.LCPI10_0)
-; RV32IZFBFMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a0)
+; RV32IZFBFMIN-NEXT:    addi sp, sp, -16
+; RV32IZFBFMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZFBFMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32IZFBFMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fs0, fa0
-; RV32IZFBFMIN-NEXT:    flt.s s0, fa5, fs0
-; RV32IZFBFMIN-NEXT:    neg s1, s0
 ; RV32IZFBFMIN-NEXT:    lui a0, 913408
 ; RV32IZFBFMIN-NEXT:    fmv.w.x fa5, a0
-; RV32IZFBFMIN-NEXT:    fle.s s2, fa5, fs0
-; RV32IZFBFMIN-NEXT:    neg s3, s2
+; RV32IZFBFMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFBFMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFBFMIN-NEXT:    call __fixsfdi
-; RV32IZFBFMIN-NEXT:    and a0, s3, a0
-; RV32IZFBFMIN-NEXT:    or a0, s1, a0
-; RV32IZFBFMIN-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFBFMIN-NEXT:    neg a2, a2
 ; RV32IZFBFMIN-NEXT:    lui a4, 524288
-; RV32IZFBFMIN-NEXT:    lui a3, 524288
-; RV32IZFBFMIN-NEXT:    beqz s2, .LBB10_2
+; RV32IZFBFMIN-NEXT:    lui a2, 524288
+; RV32IZFBFMIN-NEXT:    beqz s0, .LBB10_2
 ; RV32IZFBFMIN-NEXT:  # %bb.1: # %start
-; RV32IZFBFMIN-NEXT:    mv a3, a1
+; RV32IZFBFMIN-NEXT:    mv a2, a1
 ; RV32IZFBFMIN-NEXT:  .LBB10_2: # %start
-; RV32IZFBFMIN-NEXT:    and a0, a2, a0
-; RV32IZFBFMIN-NEXT:    beqz s0, .LBB10_4
+; RV32IZFBFMIN-NEXT:    lui a1, %hi(.LCPI10_0)
+; RV32IZFBFMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IZFBFMIN-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFBFMIN-NEXT:    beqz a3, .LBB10_4
 ; RV32IZFBFMIN-NEXT:  # %bb.3:
-; RV32IZFBFMIN-NEXT:    addi a3, a4, -1
+; RV32IZFBFMIN-NEXT:    addi a2, a4, -1
 ; RV32IZFBFMIN-NEXT:  .LBB10_4: # %start
-; RV32IZFBFMIN-NEXT:    and a1, a2, a3
-; RV32IZFBFMIN-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT:    flw fs0, 8(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT:    addi sp, sp, 32
+; RV32IZFBFMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFBFMIN-NEXT:    neg a4, a1
+; RV32IZFBFMIN-NEXT:    and a1, a4, a2
+; RV32IZFBFMIN-NEXT:    neg a2, a3
+; RV32IZFBFMIN-NEXT:    neg a3, s0
+; RV32IZFBFMIN-NEXT:    and a0, a3, a0
+; RV32IZFBFMIN-NEXT:    or a0, a2, a0
+; RV32IZFBFMIN-NEXT:    and a0, a4, a0
+; RV32IZFBFMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT:    addi sp, sp, 16
 ; RV32IZFBFMIN-NEXT:    ret
 ;
 ; R32IDZFBFMIN-LABEL: fcvt_l_bf16_sat:
 ; R32IDZFBFMIN:       # %bb.0: # %start
-; R32IDZFBFMIN-NEXT:    addi sp, sp, -32
-; R32IDZFBFMIN-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; R32IDZFBFMIN-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; R32IDZFBFMIN-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; R32IDZFBFMIN-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; R32IDZFBFMIN-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; R32IDZFBFMIN-NEXT:    addi sp, sp, -16
+; R32IDZFBFMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; R32IDZFBFMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; R32IDZFBFMIN-NEXT:    fsd fs0, 0(sp) # 8-byte Folded Spill
-; R32IDZFBFMIN-NEXT:    lui a0, %hi(.LCPI10_0)
-; R32IDZFBFMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a0)
 ; R32IDZFBFMIN-NEXT:    fcvt.s.bf16 fs0, fa0
-; R32IDZFBFMIN-NEXT:    flt.s s0, fa5, fs0
-; R32IDZFBFMIN-NEXT:    neg s1, s0
 ; R32IDZFBFMIN-NEXT:    lui a0, 913408
 ; R32IDZFBFMIN-NEXT:    fmv.w.x fa5, a0
-; R32IDZFBFMIN-NEXT:    fle.s s2, fa5, fs0
-; R32IDZFBFMIN-NEXT:    neg s3, s2
+; R32IDZFBFMIN-NEXT:    fle.s s0, fa5, fs0
 ; R32IDZFBFMIN-NEXT:    fmv.s fa0, fs0
 ; R32IDZFBFMIN-NEXT:    call __fixsfdi
-; R32IDZFBFMIN-NEXT:    and a0, s3, a0
-; R32IDZFBFMIN-NEXT:    or a0, s1, a0
-; R32IDZFBFMIN-NEXT:    feq.s a2, fs0, fs0
-; R32IDZFBFMIN-NEXT:    neg a2, a2
 ; R32IDZFBFMIN-NEXT:    lui a4, 524288
-; R32IDZFBFMIN-NEXT:    lui a3, 524288
-; R32IDZFBFMIN-NEXT:    beqz s2, .LBB10_2
+; R32IDZFBFMIN-NEXT:    lui a2, 524288
+; R32IDZFBFMIN-NEXT:    beqz s0, .LBB10_2
 ; R32IDZFBFMIN-NEXT:  # %bb.1: # %start
-; R32IDZFBFMIN-NEXT:    mv a3, a1
+; R32IDZFBFMIN-NEXT:    mv a2, a1
 ; R32IDZFBFMIN-NEXT:  .LBB10_2: # %start
-; R32IDZFBFMIN-NEXT:    and a0, a2, a0
-; R32IDZFBFMIN-NEXT:    beqz s0, .LBB10_4
+; R32IDZFBFMIN-NEXT:    lui a1, %hi(.LCPI10_0)
+; R32IDZFBFMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
+; R32IDZFBFMIN-NEXT:    flt.s a3, fa5, fs0
+; R32IDZFBFMIN-NEXT:    beqz a3, .LBB10_4
 ; R32IDZFBFMIN-NEXT:  # %bb.3:
-; R32IDZFBFMIN-NEXT:    addi a3, a4, -1
+; R32IDZFBFMIN-NEXT:    addi a2, a4, -1
 ; R32IDZFBFMIN-NEXT:  .LBB10_4: # %start
-; R32IDZFBFMIN-NEXT:    and a1, a2, a3
-; R32IDZFBFMIN-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; R32IDZFBFMIN-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; R32IDZFBFMIN-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; R32IDZFBFMIN-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; R32IDZFBFMIN-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; R32IDZFBFMIN-NEXT:    feq.s a1, fs0, fs0
+; R32IDZFBFMIN-NEXT:    neg a4, a1
+; R32IDZFBFMIN-NEXT:    and a1, a4, a2
+; R32IDZFBFMIN-NEXT:    neg a2, a3
+; R32IDZFBFMIN-NEXT:    neg a3, s0
+; R32IDZFBFMIN-NEXT:    and a0, a3, a0
+; R32IDZFBFMIN-NEXT:    or a0, a2, a0
+; R32IDZFBFMIN-NEXT:    and a0, a4, a0
+; R32IDZFBFMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; R32IDZFBFMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; R32IDZFBFMIN-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
-; R32IDZFBFMIN-NEXT:    addi sp, sp, 32
+; R32IDZFBFMIN-NEXT:    addi sp, sp, 16
 ; R32IDZFBFMIN-NEXT:    ret
 ;
 ; RV32ID-LABEL: fcvt_l_bf16_sat:
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index c147d6ec6d9b..6024a29da33d 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -692,28 +692,27 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a3, 524288
-; RV32IFD-NEXT:    li a4, 1
+; RV32IFD-NEXT:    lui a4, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
-; RV32IFD-NEXT:    bne s0, a4, .LBB12_2
+; RV32IFD-NEXT:    beqz s0, .LBB12_2
 ; RV32IFD-NEXT:  # %bb.1: # %start
 ; RV32IFD-NEXT:    mv a2, a1
 ; RV32IFD-NEXT:  .LBB12_2: # %start
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI12_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI12_1)(a1)
-; RV32IFD-NEXT:    flt.d a4, fa5, fs0
-; RV32IFD-NEXT:    beqz a4, .LBB12_4
+; RV32IFD-NEXT:    flt.d a3, fa5, fs0
+; RV32IFD-NEXT:    beqz a3, .LBB12_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a3, -1
+; RV32IFD-NEXT:    addi a2, a4, -1
 ; RV32IFD-NEXT:  .LBB12_4: # %start
 ; RV32IFD-NEXT:    feq.d a1, fs0, fs0
-; RV32IFD-NEXT:    neg a3, a1
-; RV32IFD-NEXT:    and a1, a3, a2
-; RV32IFD-NEXT:    neg a2, a4
-; RV32IFD-NEXT:    neg a4, s0
-; RV32IFD-NEXT:    and a0, a4, a0
-; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    neg a4, a1
+; RV32IFD-NEXT:    and a1, a4, a2
+; RV32IFD-NEXT:    neg a2, a3
+; RV32IFD-NEXT:    neg a3, s0
 ; RV32IFD-NEXT:    and a0, a3, a0
+; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -790,33 +789,32 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s6, 0(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lui a3, 278016
+; RV32I-NEXT:    addi a3, a3, -1
+; RV32I-NEXT:    li a2, -1
+; RV32I-NEXT:    call __gtdf2
+; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    lui a3, 802304
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s0
 ; RV32I-NEXT:    li a2, 0
 ; RV32I-NEXT:    call __gedf2
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    mv a1, s0
 ; RV32I-NEXT:    call __fixdfdi
-; RV32I-NEXT:    mv s3, a0
-; RV32I-NEXT:    mv s4, a1
-; RV32I-NEXT:    lui s6, 524288
-; RV32I-NEXT:    bgez s2, .LBB12_2
+; RV32I-NEXT:    mv s4, a0
+; RV32I-NEXT:    mv s5, a1
+; RV32I-NEXT:    lui a0, 524288
+; RV32I-NEXT:    bgez s3, .LBB12_2
 ; RV32I-NEXT:  # %bb.1: # %start
-; RV32I-NEXT:    lui s4, 524288
+; RV32I-NEXT:    lui s5, 524288
 ; RV32I-NEXT:  .LBB12_2: # %start
-; RV32I-NEXT:    lui a3, 278016
-; RV32I-NEXT:    addi a3, a3, -1
-; RV32I-NEXT:    li a2, -1
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    call __gtdf2
-; RV32I-NEXT:    mv s5, a0
-; RV32I-NEXT:    blez a0, .LBB12_4
+; RV32I-NEXT:    blez s2, .LBB12_4
 ; RV32I-NEXT:  # %bb.3: # %start
-; RV32I-NEXT:    addi s4, s6, -1
+; RV32I-NEXT:    addi s5, a0, -1
 ; RV32I-NEXT:  .LBB12_4: # %start
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    mv a1, s0
@@ -825,11 +823,11 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32I-NEXT:    call __unorddf2
 ; RV32I-NEXT:    snez a0, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a1, a0, s4
-; RV32I-NEXT:    slti a2, s2, 0
+; RV32I-NEXT:    and a1, a0, s5
+; RV32I-NEXT:    slti a2, s3, 0
 ; RV32I-NEXT:    addi a2, a2, -1
-; RV32I-NEXT:    and a2, a2, s3
-; RV32I-NEXT:    sgtz a3, s5
+; RV32I-NEXT:    and a2, a2, s4
+; RV32I-NEXT:    sgtz a3, s2
 ; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:    or a2, a3, a2
 ; RV32I-NEXT:    and a0, a0, a2
@@ -840,7 +838,6 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s6, 0(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
@@ -949,22 +946,23 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
 ; RV32IFD-NEXT:    addi sp, sp, -16
 ; RV32IFD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    lui a0, %hi(.LCPI14_0)
-; RV32IFD-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
-; RV32IFD-NEXT:    flt.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a0
+; RV32IFD-NEXT:    fsd fs0, 0(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fcvt.d.w fa5, zero
 ; RV32IFD-NEXT:    fle.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s1, a0
+; RV32IFD-NEXT:    neg s0, a0
 ; RV32IFD-NEXT:    call __fixunsdfdi
-; RV32IFD-NEXT:    and a0, s1, a0
-; RV32IFD-NEXT:    or a0, s0, a0
-; RV32IFD-NEXT:    and a1, s1, a1
-; RV32IFD-NEXT:    or a1, s0, a1
+; RV32IFD-NEXT:    lui a2, %hi(.LCPI14_0)
+; RV32IFD-NEXT:    fld fa5, %lo(.LCPI14_0)(a2)
+; RV32IFD-NEXT:    and a0, s0, a0
+; RV32IFD-NEXT:    flt.d a2, fa5, fs0
+; RV32IFD-NEXT:    neg a2, a2
+; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    and a1, s0, a1
+; RV32IFD-NEXT:    or a1, a2, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
 ;
@@ -983,27 +981,24 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFINXZDINX-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFINXZDINX-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
 ; RV32IZFINXZDINX-NEXT:    mv s1, a1
-; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    mv s0, a0
-; RV32IZFINXZDINX-NEXT:    fle.d a0, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg s2, a0
-; RV32IZFINXZDINX-NEXT:    mv a0, s0
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
-; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI14_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI14_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI14_0)(a2)
-; RV32IZFINXZDINX-NEXT:    and a0, s2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
+; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI14_0)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI14_0+4)(a4)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI14_0)(a4)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    and a1, s2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
+; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a3, a3
+; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
+; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    addi sp, sp, 16
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
index f1c56b320b76..927eee2e9e54 100644
--- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
@@ -53,28 +53,27 @@ define i64 @test_floor_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a3, 524288
-; RV32IFD-NEXT:    li a4, 1
+; RV32IFD-NEXT:    lui a4, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
-; RV32IFD-NEXT:    bne s0, a4, .LBB1_2
+; RV32IFD-NEXT:    beqz s0, .LBB1_2
 ; RV32IFD-NEXT:  # %bb.1:
 ; RV32IFD-NEXT:    mv a2, a1
 ; RV32IFD-NEXT:  .LBB1_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI1_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI1_1)(a1)
-; RV32IFD-NEXT:    flt.d a4, fa5, fs0
-; RV32IFD-NEXT:    beqz a4, .LBB1_4
+; RV32IFD-NEXT:    flt.d a3, fa5, fs0
+; RV32IFD-NEXT:    beqz a3, .LBB1_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a3, -1
+; RV32IFD-NEXT:    addi a2, a4, -1
 ; RV32IFD-NEXT:  .LBB1_4:
 ; RV32IFD-NEXT:    feq.d a1, fs0, fs0
-; RV32IFD-NEXT:    neg a3, a1
-; RV32IFD-NEXT:    and a1, a3, a2
-; RV32IFD-NEXT:    neg a2, a4
-; RV32IFD-NEXT:    neg a4, s0
-; RV32IFD-NEXT:    and a0, a4, a0
-; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    neg a4, a1
+; RV32IFD-NEXT:    and a1, a4, a2
+; RV32IFD-NEXT:    neg a2, a3
+; RV32IFD-NEXT:    neg a3, s0
 ; RV32IFD-NEXT:    and a0, a3, a0
+; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -299,28 +298,27 @@ define i64 @test_ceil_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a3, 524288
-; RV32IFD-NEXT:    li a4, 1
+; RV32IFD-NEXT:    lui a4, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
-; RV32IFD-NEXT:    bne s0, a4, .LBB5_2
+; RV32IFD-NEXT:    beqz s0, .LBB5_2
 ; RV32IFD-NEXT:  # %bb.1:
 ; RV32IFD-NEXT:    mv a2, a1
 ; RV32IFD-NEXT:  .LBB5_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI5_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI5_1)(a1)
-; RV32IFD-NEXT:    flt.d a4, fa5, fs0
-; RV32IFD-NEXT:    beqz a4, .LBB5_4
+; RV32IFD-NEXT:    flt.d a3, fa5, fs0
+; RV32IFD-NEXT:    beqz a3, .LBB5_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a3, -1
+; RV32IFD-NEXT:    addi a2, a4, -1
 ; RV32IFD-NEXT:  .LBB5_4:
 ; RV32IFD-NEXT:    feq.d a1, fs0, fs0
-; RV32IFD-NEXT:    neg a3, a1
-; RV32IFD-NEXT:    and a1, a3, a2
-; RV32IFD-NEXT:    neg a2, a4
-; RV32IFD-NEXT:    neg a4, s0
-; RV32IFD-NEXT:    and a0, a4, a0
-; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    neg a4, a1
+; RV32IFD-NEXT:    and a1, a4, a2
+; RV32IFD-NEXT:    neg a2, a3
+; RV32IFD-NEXT:    neg a3, s0
 ; RV32IFD-NEXT:    and a0, a3, a0
+; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -545,28 +543,27 @@ define i64 @test_trunc_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a3, 524288
-; RV32IFD-NEXT:    li a4, 1
+; RV32IFD-NEXT:    lui a4, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
-; RV32IFD-NEXT:    bne s0, a4, .LBB9_2
+; RV32IFD-NEXT:    beqz s0, .LBB9_2
 ; RV32IFD-NEXT:  # %bb.1:
 ; RV32IFD-NEXT:    mv a2, a1
 ; RV32IFD-NEXT:  .LBB9_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI9_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI9_1)(a1)
-; RV32IFD-NEXT:    flt.d a4, fa5, fs0
-; RV32IFD-NEXT:    beqz a4, .LBB9_4
+; RV32IFD-NEXT:    flt.d a3, fa5, fs0
+; RV32IFD-NEXT:    beqz a3, .LBB9_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a3, -1
+; RV32IFD-NEXT:    addi a2, a4, -1
 ; RV32IFD-NEXT:  .LBB9_4:
 ; RV32IFD-NEXT:    feq.d a1, fs0, fs0
-; RV32IFD-NEXT:    neg a3, a1
-; RV32IFD-NEXT:    and a1, a3, a2
-; RV32IFD-NEXT:    neg a2, a4
-; RV32IFD-NEXT:    neg a4, s0
-; RV32IFD-NEXT:    and a0, a4, a0
-; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    neg a4, a1
+; RV32IFD-NEXT:    and a1, a4, a2
+; RV32IFD-NEXT:    neg a2, a3
+; RV32IFD-NEXT:    neg a3, s0
 ; RV32IFD-NEXT:    and a0, a3, a0
+; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -791,28 +788,27 @@ define i64 @test_round_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a3, 524288
-; RV32IFD-NEXT:    li a4, 1
+; RV32IFD-NEXT:    lui a4, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
-; RV32IFD-NEXT:    bne s0, a4, .LBB13_2
+; RV32IFD-NEXT:    beqz s0, .LBB13_2
 ; RV32IFD-NEXT:  # %bb.1:
 ; RV32IFD-NEXT:    mv a2, a1
 ; RV32IFD-NEXT:  .LBB13_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI13_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI13_1)(a1)
-; RV32IFD-NEXT:    flt.d a4, fa5, fs0
-; RV32IFD-NEXT:    beqz a4, .LBB13_4
+; RV32IFD-NEXT:    flt.d a3, fa5, fs0
+; RV32IFD-NEXT:    beqz a3, .LBB13_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a3, -1
+; RV32IFD-NEXT:    addi a2, a4, -1
 ; RV32IFD-NEXT:  .LBB13_4:
 ; RV32IFD-NEXT:    feq.d a1, fs0, fs0
-; RV32IFD-NEXT:    neg a3, a1
-; RV32IFD-NEXT:    and a1, a3, a2
-; RV32IFD-NEXT:    neg a2, a4
-; RV32IFD-NEXT:    neg a4, s0
-; RV32IFD-NEXT:    and a0, a4, a0
-; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    neg a4, a1
+; RV32IFD-NEXT:    and a1, a4, a2
+; RV32IFD-NEXT:    neg a2, a3
+; RV32IFD-NEXT:    neg a3, s0
 ; RV32IFD-NEXT:    and a0, a3, a0
+; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -1037,28 +1033,27 @@ define i64 @test_roundeven_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a3, 524288
-; RV32IFD-NEXT:    li a4, 1
+; RV32IFD-NEXT:    lui a4, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
-; RV32IFD-NEXT:    bne s0, a4, .LBB17_2
+; RV32IFD-NEXT:    beqz s0, .LBB17_2
 ; RV32IFD-NEXT:  # %bb.1:
 ; RV32IFD-NEXT:    mv a2, a1
 ; RV32IFD-NEXT:  .LBB17_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI17_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI17_1)(a1)
-; RV32IFD-NEXT:    flt.d a4, fa5, fs0
-; RV32IFD-NEXT:    beqz a4, .LBB17_4
+; RV32IFD-NEXT:    flt.d a3, fa5, fs0
+; RV32IFD-NEXT:    beqz a3, .LBB17_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a3, -1
+; RV32IFD-NEXT:    addi a2, a4, -1
 ; RV32IFD-NEXT:  .LBB17_4:
 ; RV32IFD-NEXT:    feq.d a1, fs0, fs0
-; RV32IFD-NEXT:    neg a3, a1
-; RV32IFD-NEXT:    and a1, a3, a2
-; RV32IFD-NEXT:    neg a2, a4
-; RV32IFD-NEXT:    neg a4, s0
-; RV32IFD-NEXT:    and a0, a4, a0
-; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    neg a4, a1
+; RV32IFD-NEXT:    and a1, a4, a2
+; RV32IFD-NEXT:    neg a2, a3
+; RV32IFD-NEXT:    neg a3, s0
 ; RV32IFD-NEXT:    and a0, a3, a0
+; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -1283,28 +1278,27 @@ define i64 @test_rint_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a3, 524288
-; RV32IFD-NEXT:    li a4, 1
+; RV32IFD-NEXT:    lui a4, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
-; RV32IFD-NEXT:    bne s0, a4, .LBB21_2
+; RV32IFD-NEXT:    beqz s0, .LBB21_2
 ; RV32IFD-NEXT:  # %bb.1:
 ; RV32IFD-NEXT:    mv a2, a1
 ; RV32IFD-NEXT:  .LBB21_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI21_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI21_1)(a1)
-; RV32IFD-NEXT:    flt.d a4, fa5, fs0
-; RV32IFD-NEXT:    beqz a4, .LBB21_4
+; RV32IFD-NEXT:    flt.d a3, fa5, fs0
+; RV32IFD-NEXT:    beqz a3, .LBB21_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a3, -1
+; RV32IFD-NEXT:    addi a2, a4, -1
 ; RV32IFD-NEXT:  .LBB21_4:
 ; RV32IFD-NEXT:    feq.d a1, fs0, fs0
-; RV32IFD-NEXT:    neg a3, a1
-; RV32IFD-NEXT:    and a1, a3, a2
-; RV32IFD-NEXT:    neg a2, a4
-; RV32IFD-NEXT:    neg a4, s0
-; RV32IFD-NEXT:    and a0, a4, a0
-; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    neg a4, a1
+; RV32IFD-NEXT:    and a1, a4, a2
+; RV32IFD-NEXT:    neg a2, a3
+; RV32IFD-NEXT:    neg a3, s0
 ; RV32IFD-NEXT:    and a0, a3, a0
+; RV32IFD-NEXT:    or a0, a2, a0
+; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/fixups-diff.ll b/llvm/test/CodeGen/RISCV/fixups-diff.ll
index cc1c87b1fe37..84a7d18ed150 100644
--- a/llvm/test/CodeGen/RISCV/fixups-diff.ll
+++ b/llvm/test/CodeGen/RISCV/fixups-diff.ll
@@ -27,7 +27,7 @@ entry:
 ; CHECK:      }
 
 ; CHECK:      Section {{.*}} .rela.eh_frame {
-; CHECK-NEXT:   0x1C R_RISCV_32_PCREL <null> 0x0
+; CHECK-NEXT:   0x1C R_RISCV_32_PCREL .L0  0x0
 ; CHECK-NEXT: }
 
 !llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll
index 653b64ec7304..7eabd3f5f227 100644
--- a/llvm/test/CodeGen/RISCV/float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert.ll
@@ -275,26 +275,24 @@ define i32 @fcvt_wu_s_sat(float %a) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lui a1, 325632
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    call __gtsf2
-; RV32I-NEXT:    sgtz a0, a0
-; RV32I-NEXT:    neg s1, a0
-; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    call __gesf2
 ; RV32I-NEXT:    slti a0, a0, 0
-; RV32I-NEXT:    addi s2, a0, -1
+; RV32I-NEXT:    addi s1, a0, -1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __fixunssfsi
-; RV32I-NEXT:    and a0, s2, a0
-; RV32I-NEXT:    or a0, s1, a0
+; RV32I-NEXT:    and s1, s1, a0
+; RV32I-NEXT:    lui a1, 325632
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    call __gtsf2
+; RV32I-NEXT:    sgtz a0, a0
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    or a0, a0, s1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
 ;
@@ -618,38 +616,36 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
 ; RV32IF-NEXT:    addi sp, sp, -16
 ; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    fmv.s fs0, fa0
 ; RV32IF-NEXT:    lui a0, 913408
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
 ; RV32IF-NEXT:    fle.s s0, fa5, fa0
-; RV32IF-NEXT:    neg s1, s0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a2, %hi(.LCPI12_0)
-; RV32IF-NEXT:    flw fa5, %lo(.LCPI12_0)(a2)
-; RV32IF-NEXT:    and a0, s1, a0
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    feq.s a2, fs0, fs0
-; RV32IF-NEXT:    neg a2, a2
-; RV32IF-NEXT:    lui a5, 524288
 ; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB12_2
 ; RV32IF-NEXT:  # %bb.1: # %start
-; RV32IF-NEXT:    mv a4, a1
+; RV32IF-NEXT:    mv a2, a1
 ; RV32IF-NEXT:  .LBB12_2: # %start
-; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    lui a1, %hi(.LCPI12_0)
+; RV32IF-NEXT:    flw fa5, %lo(.LCPI12_0)(a1)
+; RV32IF-NEXT:    flt.s a3, fa5, fs0
 ; RV32IF-NEXT:    beqz a3, .LBB12_4
 ; RV32IF-NEXT:  # %bb.3:
-; RV32IF-NEXT:    addi a4, a5, -1
+; RV32IF-NEXT:    addi a2, a4, -1
 ; RV32IF-NEXT:  .LBB12_4: # %start
-; RV32IF-NEXT:    and a1, a2, a4
+; RV32IF-NEXT:    feq.s a1, fs0, fs0
+; RV32IF-NEXT:    neg a4, a1
+; RV32IF-NEXT:    and a1, a4, a2
+; RV32IF-NEXT:    neg a2, s0
+; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    neg a2, a3
+; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    and a0, a4, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
 ;
@@ -867,22 +863,23 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
 ; RV32IF-NEXT:    addi sp, sp, -16
 ; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    lui a0, %hi(.LCPI14_0)
-; RV32IF-NEXT:    flw fa5, %lo(.LCPI14_0)(a0)
-; RV32IF-NEXT:    flt.s a0, fa5, fa0
-; RV32IF-NEXT:    neg s0, a0
+; RV32IF-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT:    fmv.s fs0, fa0
 ; RV32IF-NEXT:    fmv.w.x fa5, zero
 ; RV32IF-NEXT:    fle.s a0, fa5, fa0
-; RV32IF-NEXT:    neg s1, a0
+; RV32IF-NEXT:    neg s0, a0
 ; RV32IF-NEXT:    call __fixunssfdi
-; RV32IF-NEXT:    and a0, s1, a0
-; RV32IF-NEXT:    or a0, s0, a0
-; RV32IF-NEXT:    and a1, s1, a1
-; RV32IF-NEXT:    or a1, s0, a1
+; RV32IF-NEXT:    lui a2, %hi(.LCPI14_0)
+; RV32IF-NEXT:    flw fa5, %lo(.LCPI14_0)(a2)
+; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    flt.s a2, fa5, fs0
+; RV32IF-NEXT:    neg a2, a2
+; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    and a1, s0, a1
+; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
 ;
@@ -901,17 +898,19 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
 ; RV32IZFINX-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFINX-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFINX-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFINX-NEXT:    lui a1, %hi(.LCPI14_0)
-; RV32IZFINX-NEXT:    lw a1, %lo(.LCPI14_0)(a1)
-; RV32IZFINX-NEXT:    flt.s a1, a1, a0
-; RV32IZFINX-NEXT:    neg s0, a1
-; RV32IZFINX-NEXT:    fle.s a1, zero, a0
-; RV32IZFINX-NEXT:    neg s1, a1
+; RV32IZFINX-NEXT:    mv s0, a0
+; RV32IZFINX-NEXT:    fle.s a0, zero, a0
+; RV32IZFINX-NEXT:    neg s1, a0
+; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixunssfdi
+; RV32IZFINX-NEXT:    lui a2, %hi(.LCPI14_0)
+; RV32IZFINX-NEXT:    lw a2, %lo(.LCPI14_0)(a2)
 ; RV32IZFINX-NEXT:    and a0, s1, a0
-; RV32IZFINX-NEXT:    or a0, s0, a0
+; RV32IZFINX-NEXT:    flt.s a2, a2, s0
+; RV32IZFINX-NEXT:    neg a2, a2
+; RV32IZFINX-NEXT:    or a0, a2, a0
 ; RV32IZFINX-NEXT:    and a1, s1, a1
-; RV32IZFINX-NEXT:    or a1, s0, a1
+; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -929,33 +928,36 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
 ;
 ; RV32I-LABEL: fcvt_lu_s_sat:
 ; RV32I:       # %bb.0: # %start
-; RV32I-NEXT:    addi sp, sp, -16
-; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lui a1, 391168
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    call __gtsf2
-; RV32I-NEXT:    sgtz a0, a0
-; RV32I-NEXT:    neg s1, a0
-; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    call __gesf2
 ; RV32I-NEXT:    slti a0, a0, 0
 ; RV32I-NEXT:    addi s2, a0, -1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __fixunssfdi
-; RV32I-NEXT:    and a0, s2, a0
-; RV32I-NEXT:    or a0, s1, a0
-; RV32I-NEXT:    and a1, s2, a1
-; RV32I-NEXT:    or a1, s1, a1
-; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    and s3, s2, a0
+; RV32I-NEXT:    lui a1, 391168
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    call __gtsf2
+; RV32I-NEXT:    sgtz a0, a0
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    or a0, a1, s3
+; RV32I-NEXT:    and a2, s2, s1
+; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: fcvt_lu_s_sat:
@@ -2089,26 +2091,24 @@ define zeroext i32 @fcvt_wu_s_sat_zext(float %a) nounwind {
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lui a1, 325632
-; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    call __gtsf2
-; RV32I-NEXT:    sgtz a0, a0
-; RV32I-NEXT:    neg s1, a0
-; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    call __gesf2
 ; RV32I-NEXT:    slti a0, a0, 0
-; RV32I-NEXT:    addi s2, a0, -1
+; RV32I-NEXT:    addi s1, a0, -1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __fixunssfsi
-; RV32I-NEXT:    and a0, s2, a0
-; RV32I-NEXT:    or a0, s1, a0
+; RV32I-NEXT:    and s1, s1, a0
+; RV32I-NEXT:    lui a1, 325632
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    call __gtsf2
+; RV32I-NEXT:    sgtz a0, a0
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    or a0, a0, s1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
index 4f747c278da0..5e99c7eb9056 100644
--- a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
@@ -37,8 +37,7 @@ define i64 @test_floor_si64(float %x) nounwind {
 ; RV32IF-NEXT:    addi sp, sp, -16
 ; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    fmv.s fs0, fa0
 ; RV32IF-NEXT:    lui a0, 307200
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
@@ -53,33 +52,32 @@ define i64 @test_floor_si64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a0, 913408
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
-; RV32IF-NEXT:    neg s1, s0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a2, %hi(.LCPI1_0)
-; RV32IF-NEXT:    flw fa5, %lo(.LCPI1_0)(a2)
-; RV32IF-NEXT:    and a0, s1, a0
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    feq.s a2, fs0, fs0
-; RV32IF-NEXT:    neg a2, a2
-; RV32IF-NEXT:    lui a5, 524288
 ; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB1_4
 ; RV32IF-NEXT:  # %bb.3:
-; RV32IF-NEXT:    mv a4, a1
+; RV32IF-NEXT:    mv a2, a1
 ; RV32IF-NEXT:  .LBB1_4:
-; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV32IF-NEXT:    flw fa5, %lo(.LCPI1_0)(a1)
+; RV32IF-NEXT:    flt.s a3, fa5, fs0
 ; RV32IF-NEXT:    beqz a3, .LBB1_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a4, a5, -1
+; RV32IF-NEXT:    addi a2, a4, -1
 ; RV32IF-NEXT:  .LBB1_6:
-; RV32IF-NEXT:    and a1, a2, a4
+; RV32IF-NEXT:    feq.s a1, fs0, fs0
+; RV32IF-NEXT:    neg a4, a1
+; RV32IF-NEXT:    and a1, a4, a2
+; RV32IF-NEXT:    neg a2, s0
+; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    neg a2, a3
+; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    and a0, a4, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
 ;
@@ -117,23 +115,23 @@ define i64 @test_floor_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    lui a2, %hi(.LCPI1_0)
 ; RV32IZFINX-NEXT:    lw a2, %lo(.LCPI1_0)(a2)
 ; RV32IZFINX-NEXT:    and a0, s2, a0
-; RV32IZFINX-NEXT:    flt.s a3, a2, s0
-; RV32IZFINX-NEXT:    neg a2, a3
+; RV32IZFINX-NEXT:    flt.s a4, a2, s0
+; RV32IZFINX-NEXT:    neg a2, a4
 ; RV32IZFINX-NEXT:    or a0, a2, a0
 ; RV32IZFINX-NEXT:    feq.s a2, s0, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    lui a5, 524288
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB1_4
 ; RV32IZFINX-NEXT:  # %bb.3:
-; RV32IZFINX-NEXT:    mv a4, a1
+; RV32IZFINX-NEXT:    mv a3, a1
 ; RV32IZFINX-NEXT:  .LBB1_4:
 ; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    beqz a3, .LBB1_6
+; RV32IZFINX-NEXT:    beqz a4, .LBB1_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a4, a5, -1
+; RV32IZFINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINX-NEXT:  .LBB1_6:
-; RV32IZFINX-NEXT:    and a1, a2, a4
+; RV32IZFINX-NEXT:    and a1, a2, a3
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -299,8 +297,7 @@ define i64 @test_ceil_si64(float %x) nounwind {
 ; RV32IF-NEXT:    addi sp, sp, -16
 ; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    fmv.s fs0, fa0
 ; RV32IF-NEXT:    lui a0, 307200
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
@@ -315,33 +312,32 @@ define i64 @test_ceil_si64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a0, 913408
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
-; RV32IF-NEXT:    neg s1, s0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a2, %hi(.LCPI5_0)
-; RV32IF-NEXT:    flw fa5, %lo(.LCPI5_0)(a2)
-; RV32IF-NEXT:    and a0, s1, a0
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    feq.s a2, fs0, fs0
-; RV32IF-NEXT:    neg a2, a2
-; RV32IF-NEXT:    lui a5, 524288
 ; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB5_4
 ; RV32IF-NEXT:  # %bb.3:
-; RV32IF-NEXT:    mv a4, a1
+; RV32IF-NEXT:    mv a2, a1
 ; RV32IF-NEXT:  .LBB5_4:
-; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    lui a1, %hi(.LCPI5_0)
+; RV32IF-NEXT:    flw fa5, %lo(.LCPI5_0)(a1)
+; RV32IF-NEXT:    flt.s a3, fa5, fs0
 ; RV32IF-NEXT:    beqz a3, .LBB5_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a4, a5, -1
+; RV32IF-NEXT:    addi a2, a4, -1
 ; RV32IF-NEXT:  .LBB5_6:
-; RV32IF-NEXT:    and a1, a2, a4
+; RV32IF-NEXT:    feq.s a1, fs0, fs0
+; RV32IF-NEXT:    neg a4, a1
+; RV32IF-NEXT:    and a1, a4, a2
+; RV32IF-NEXT:    neg a2, s0
+; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    neg a2, a3
+; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    and a0, a4, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
 ;
@@ -379,23 +375,23 @@ define i64 @test_ceil_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    lui a2, %hi(.LCPI5_0)
 ; RV32IZFINX-NEXT:    lw a2, %lo(.LCPI5_0)(a2)
 ; RV32IZFINX-NEXT:    and a0, s2, a0
-; RV32IZFINX-NEXT:    flt.s a3, a2, s0
-; RV32IZFINX-NEXT:    neg a2, a3
+; RV32IZFINX-NEXT:    flt.s a4, a2, s0
+; RV32IZFINX-NEXT:    neg a2, a4
 ; RV32IZFINX-NEXT:    or a0, a2, a0
 ; RV32IZFINX-NEXT:    feq.s a2, s0, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    lui a5, 524288
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB5_4
 ; RV32IZFINX-NEXT:  # %bb.3:
-; RV32IZFINX-NEXT:    mv a4, a1
+; RV32IZFINX-NEXT:    mv a3, a1
 ; RV32IZFINX-NEXT:  .LBB5_4:
 ; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    beqz a3, .LBB5_6
+; RV32IZFINX-NEXT:    beqz a4, .LBB5_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a4, a5, -1
+; RV32IZFINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINX-NEXT:  .LBB5_6:
-; RV32IZFINX-NEXT:    and a1, a2, a4
+; RV32IZFINX-NEXT:    and a1, a2, a3
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -561,8 +557,7 @@ define i64 @test_trunc_si64(float %x) nounwind {
 ; RV32IF-NEXT:    addi sp, sp, -16
 ; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    fmv.s fs0, fa0
 ; RV32IF-NEXT:    lui a0, 307200
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
@@ -577,33 +572,32 @@ define i64 @test_trunc_si64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a0, 913408
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
-; RV32IF-NEXT:    neg s1, s0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a2, %hi(.LCPI9_0)
-; RV32IF-NEXT:    flw fa5, %lo(.LCPI9_0)(a2)
-; RV32IF-NEXT:    and a0, s1, a0
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    feq.s a2, fs0, fs0
-; RV32IF-NEXT:    neg a2, a2
-; RV32IF-NEXT:    lui a5, 524288
 ; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB9_4
 ; RV32IF-NEXT:  # %bb.3:
-; RV32IF-NEXT:    mv a4, a1
+; RV32IF-NEXT:    mv a2, a1
 ; RV32IF-NEXT:  .LBB9_4:
-; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    lui a1, %hi(.LCPI9_0)
+; RV32IF-NEXT:    flw fa5, %lo(.LCPI9_0)(a1)
+; RV32IF-NEXT:    flt.s a3, fa5, fs0
 ; RV32IF-NEXT:    beqz a3, .LBB9_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a4, a5, -1
+; RV32IF-NEXT:    addi a2, a4, -1
 ; RV32IF-NEXT:  .LBB9_6:
-; RV32IF-NEXT:    and a1, a2, a4
+; RV32IF-NEXT:    feq.s a1, fs0, fs0
+; RV32IF-NEXT:    neg a4, a1
+; RV32IF-NEXT:    and a1, a4, a2
+; RV32IF-NEXT:    neg a2, s0
+; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    neg a2, a3
+; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    and a0, a4, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
 ;
@@ -641,23 +635,23 @@ define i64 @test_trunc_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    lui a2, %hi(.LCPI9_0)
 ; RV32IZFINX-NEXT:    lw a2, %lo(.LCPI9_0)(a2)
 ; RV32IZFINX-NEXT:    and a0, s2, a0
-; RV32IZFINX-NEXT:    flt.s a3, a2, s0
-; RV32IZFINX-NEXT:    neg a2, a3
+; RV32IZFINX-NEXT:    flt.s a4, a2, s0
+; RV32IZFINX-NEXT:    neg a2, a4
 ; RV32IZFINX-NEXT:    or a0, a2, a0
 ; RV32IZFINX-NEXT:    feq.s a2, s0, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    lui a5, 524288
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB9_4
 ; RV32IZFINX-NEXT:  # %bb.3:
-; RV32IZFINX-NEXT:    mv a4, a1
+; RV32IZFINX-NEXT:    mv a3, a1
 ; RV32IZFINX-NEXT:  .LBB9_4:
 ; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    beqz a3, .LBB9_6
+; RV32IZFINX-NEXT:    beqz a4, .LBB9_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a4, a5, -1
+; RV32IZFINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINX-NEXT:  .LBB9_6:
-; RV32IZFINX-NEXT:    and a1, a2, a4
+; RV32IZFINX-NEXT:    and a1, a2, a3
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -823,8 +817,7 @@ define i64 @test_round_si64(float %x) nounwind {
 ; RV32IF-NEXT:    addi sp, sp, -16
 ; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    fmv.s fs0, fa0
 ; RV32IF-NEXT:    lui a0, 307200
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
@@ -839,33 +832,32 @@ define i64 @test_round_si64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a0, 913408
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
-; RV32IF-NEXT:    neg s1, s0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a2, %hi(.LCPI13_0)
-; RV32IF-NEXT:    flw fa5, %lo(.LCPI13_0)(a2)
-; RV32IF-NEXT:    and a0, s1, a0
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    feq.s a2, fs0, fs0
-; RV32IF-NEXT:    neg a2, a2
-; RV32IF-NEXT:    lui a5, 524288
 ; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB13_4
 ; RV32IF-NEXT:  # %bb.3:
-; RV32IF-NEXT:    mv a4, a1
+; RV32IF-NEXT:    mv a2, a1
 ; RV32IF-NEXT:  .LBB13_4:
-; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    lui a1, %hi(.LCPI13_0)
+; RV32IF-NEXT:    flw fa5, %lo(.LCPI13_0)(a1)
+; RV32IF-NEXT:    flt.s a3, fa5, fs0
 ; RV32IF-NEXT:    beqz a3, .LBB13_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a4, a5, -1
+; RV32IF-NEXT:    addi a2, a4, -1
 ; RV32IF-NEXT:  .LBB13_6:
-; RV32IF-NEXT:    and a1, a2, a4
+; RV32IF-NEXT:    feq.s a1, fs0, fs0
+; RV32IF-NEXT:    neg a4, a1
+; RV32IF-NEXT:    and a1, a4, a2
+; RV32IF-NEXT:    neg a2, s0
+; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    neg a2, a3
+; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    and a0, a4, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
 ;
@@ -903,23 +895,23 @@ define i64 @test_round_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    lui a2, %hi(.LCPI13_0)
 ; RV32IZFINX-NEXT:    lw a2, %lo(.LCPI13_0)(a2)
 ; RV32IZFINX-NEXT:    and a0, s2, a0
-; RV32IZFINX-NEXT:    flt.s a3, a2, s0
-; RV32IZFINX-NEXT:    neg a2, a3
+; RV32IZFINX-NEXT:    flt.s a4, a2, s0
+; RV32IZFINX-NEXT:    neg a2, a4
 ; RV32IZFINX-NEXT:    or a0, a2, a0
 ; RV32IZFINX-NEXT:    feq.s a2, s0, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    lui a5, 524288
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB13_4
 ; RV32IZFINX-NEXT:  # %bb.3:
-; RV32IZFINX-NEXT:    mv a4, a1
+; RV32IZFINX-NEXT:    mv a3, a1
 ; RV32IZFINX-NEXT:  .LBB13_4:
 ; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    beqz a3, .LBB13_6
+; RV32IZFINX-NEXT:    beqz a4, .LBB13_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a4, a5, -1
+; RV32IZFINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINX-NEXT:  .LBB13_6:
-; RV32IZFINX-NEXT:    and a1, a2, a4
+; RV32IZFINX-NEXT:    and a1, a2, a3
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1085,8 +1077,7 @@ define i64 @test_roundeven_si64(float %x) nounwind {
 ; RV32IF-NEXT:    addi sp, sp, -16
 ; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    fmv.s fs0, fa0
 ; RV32IF-NEXT:    lui a0, 307200
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
@@ -1101,33 +1092,32 @@ define i64 @test_roundeven_si64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a0, 913408
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
-; RV32IF-NEXT:    neg s1, s0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a2, %hi(.LCPI17_0)
-; RV32IF-NEXT:    flw fa5, %lo(.LCPI17_0)(a2)
-; RV32IF-NEXT:    and a0, s1, a0
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    feq.s a2, fs0, fs0
-; RV32IF-NEXT:    neg a2, a2
-; RV32IF-NEXT:    lui a5, 524288
 ; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB17_4
 ; RV32IF-NEXT:  # %bb.3:
-; RV32IF-NEXT:    mv a4, a1
+; RV32IF-NEXT:    mv a2, a1
 ; RV32IF-NEXT:  .LBB17_4:
-; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    lui a1, %hi(.LCPI17_0)
+; RV32IF-NEXT:    flw fa5, %lo(.LCPI17_0)(a1)
+; RV32IF-NEXT:    flt.s a3, fa5, fs0
 ; RV32IF-NEXT:    beqz a3, .LBB17_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a4, a5, -1
+; RV32IF-NEXT:    addi a2, a4, -1
 ; RV32IF-NEXT:  .LBB17_6:
-; RV32IF-NEXT:    and a1, a2, a4
+; RV32IF-NEXT:    feq.s a1, fs0, fs0
+; RV32IF-NEXT:    neg a4, a1
+; RV32IF-NEXT:    and a1, a4, a2
+; RV32IF-NEXT:    neg a2, s0
+; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    neg a2, a3
+; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    and a0, a4, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
 ;
@@ -1165,23 +1155,23 @@ define i64 @test_roundeven_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    lui a2, %hi(.LCPI17_0)
 ; RV32IZFINX-NEXT:    lw a2, %lo(.LCPI17_0)(a2)
 ; RV32IZFINX-NEXT:    and a0, s2, a0
-; RV32IZFINX-NEXT:    flt.s a3, a2, s0
-; RV32IZFINX-NEXT:    neg a2, a3
+; RV32IZFINX-NEXT:    flt.s a4, a2, s0
+; RV32IZFINX-NEXT:    neg a2, a4
 ; RV32IZFINX-NEXT:    or a0, a2, a0
 ; RV32IZFINX-NEXT:    feq.s a2, s0, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    lui a5, 524288
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB17_4
 ; RV32IZFINX-NEXT:  # %bb.3:
-; RV32IZFINX-NEXT:    mv a4, a1
+; RV32IZFINX-NEXT:    mv a3, a1
 ; RV32IZFINX-NEXT:  .LBB17_4:
 ; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    beqz a3, .LBB17_6
+; RV32IZFINX-NEXT:    beqz a4, .LBB17_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a4, a5, -1
+; RV32IZFINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINX-NEXT:  .LBB17_6:
-; RV32IZFINX-NEXT:    and a1, a2, a4
+; RV32IZFINX-NEXT:    and a1, a2, a3
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1347,8 +1337,7 @@ define i64 @test_rint_si64(float %x) nounwind {
 ; RV32IF-NEXT:    addi sp, sp, -16
 ; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    fmv.s fs0, fa0
 ; RV32IF-NEXT:    lui a0, 307200
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
@@ -1363,33 +1352,32 @@ define i64 @test_rint_si64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a0, 913408
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
-; RV32IF-NEXT:    neg s1, s0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a2, %hi(.LCPI21_0)
-; RV32IF-NEXT:    flw fa5, %lo(.LCPI21_0)(a2)
-; RV32IF-NEXT:    and a0, s1, a0
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    feq.s a2, fs0, fs0
-; RV32IF-NEXT:    neg a2, a2
-; RV32IF-NEXT:    lui a5, 524288
 ; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB21_4
 ; RV32IF-NEXT:  # %bb.3:
-; RV32IF-NEXT:    mv a4, a1
+; RV32IF-NEXT:    mv a2, a1
 ; RV32IF-NEXT:  .LBB21_4:
-; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    lui a1, %hi(.LCPI21_0)
+; RV32IF-NEXT:    flw fa5, %lo(.LCPI21_0)(a1)
+; RV32IF-NEXT:    flt.s a3, fa5, fs0
 ; RV32IF-NEXT:    beqz a3, .LBB21_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a4, a5, -1
+; RV32IF-NEXT:    addi a2, a4, -1
 ; RV32IF-NEXT:  .LBB21_6:
-; RV32IF-NEXT:    and a1, a2, a4
+; RV32IF-NEXT:    feq.s a1, fs0, fs0
+; RV32IF-NEXT:    neg a4, a1
+; RV32IF-NEXT:    and a1, a4, a2
+; RV32IF-NEXT:    neg a2, s0
+; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    neg a2, a3
+; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    and a0, a4, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 16
 ; RV32IF-NEXT:    ret
 ;
@@ -1427,23 +1415,23 @@ define i64 @test_rint_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    lui a2, %hi(.LCPI21_0)
 ; RV32IZFINX-NEXT:    lw a2, %lo(.LCPI21_0)(a2)
 ; RV32IZFINX-NEXT:    and a0, s2, a0
-; RV32IZFINX-NEXT:    flt.s a3, a2, s0
-; RV32IZFINX-NEXT:    neg a2, a3
+; RV32IZFINX-NEXT:    flt.s a4, a2, s0
+; RV32IZFINX-NEXT:    neg a2, a4
 ; RV32IZFINX-NEXT:    or a0, a2, a0
 ; RV32IZFINX-NEXT:    feq.s a2, s0, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    lui a5, 524288
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB21_4
 ; RV32IZFINX-NEXT:  # %bb.3:
-; RV32IZFINX-NEXT:    mv a4, a1
+; RV32IZFINX-NEXT:    mv a3, a1
 ; RV32IZFINX-NEXT:  .LBB21_4:
 ; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    beqz a3, .LBB21_6
+; RV32IZFINX-NEXT:    beqz a4, .LBB21_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a4, a5, -1
+; RV32IZFINX-NEXT:    addi a3, a5, -1
 ; RV32IZFINX-NEXT:  .LBB21_6:
-; RV32IZFINX-NEXT:    and a1, a2, a4
+; RV32IZFINX-NEXT:    and a1, a2, a3
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll
index c303690aadff..f6a53a9d76dd 100644
--- a/llvm/test/CodeGen/RISCV/forced-atomics.ll
+++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll
@@ -3567,8 +3567,8 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    # in Loop: Header=BB51_2 Depth=1
 ; RV32-NEXT:    neg a3, a0
 ; RV32-NEXT:    and a3, a3, a1
-; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    sw a4, 0(sp)
+; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
@@ -3659,8 +3659,8 @@ define i64 @rmw64_umin_seq_cst(ptr %p) nounwind {
 ; RV32-NEXT:    # in Loop: Header=BB52_2 Depth=1
 ; RV32-NEXT:    neg a3, a0
 ; RV32-NEXT:    and a3, a3, a1
-; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    sw a4, 0(sp)
+; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 06ab813faf02..deb5a6d4013d 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -114,8 +114,8 @@ define i32 @utest_f64i32(double %x) {
 ; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IF-NEXT:    .cfi_offset ra, -4
 ; RV32IF-NEXT:    call __fixunsdfdi
-; RV32IF-NEXT:    seqz a1, a1
 ; RV32IF-NEXT:    sltiu a2, a0, -1
+; RV32IF-NEXT:    seqz a1, a1
 ; RV32IF-NEXT:    and a1, a1, a2
 ; RV32IF-NEXT:    addi a1, a1, -1
 ; RV32IF-NEXT:    or a0, a1, a0
@@ -429,8 +429,8 @@ define i32 @utesth_f16i32(half %x) {
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    call __extendhfsf2
 ; RV32-NEXT:    call __fixunssfdi
-; RV32-NEXT:    seqz a1, a1
 ; RV32-NEXT:    sltiu a2, a0, -1
+; RV32-NEXT:    seqz a1, a1
 ; RV32-NEXT:    and a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll
index 277749c75bbb..31fb6e2ee9c8 100644
--- a/llvm/test/CodeGen/RISCV/half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert.ll
@@ -2145,47 +2145,41 @@ define i64 @fcvt_l_h(half %a) nounwind {
 define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IZFH-LABEL: fcvt_l_h_sat:
 ; RV32IZFH:       # %bb.0: # %start
-; RV32IZFH-NEXT:    addi sp, sp, -32
-; RV32IZFH-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    fsw fs0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    lui a0, %hi(.LCPI10_0)
-; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI10_0)(a0)
+; RV32IZFH-NEXT:    addi sp, sp, -16
+; RV32IZFH-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    fcvt.s.h fs0, fa0
-; RV32IZFH-NEXT:    flt.s s0, fa5, fs0
-; RV32IZFH-NEXT:    neg s1, s0
 ; RV32IZFH-NEXT:    lui a0, 913408
 ; RV32IZFH-NEXT:    fmv.w.x fa5, a0
-; RV32IZFH-NEXT:    fle.s s2, fa5, fs0
-; RV32IZFH-NEXT:    neg s3, s2
+; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    and a0, s3, a0
-; RV32IZFH-NEXT:    or a0, s1, a0
-; RV32IZFH-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFH-NEXT:    neg a2, a2
 ; RV32IZFH-NEXT:    lui a4, 524288
-; RV32IZFH-NEXT:    lui a3, 524288
-; RV32IZFH-NEXT:    beqz s2, .LBB10_2
+; RV32IZFH-NEXT:    lui a2, 524288
+; RV32IZFH-NEXT:    beqz s0, .LBB10_2
 ; RV32IZFH-NEXT:  # %bb.1: # %start
-; RV32IZFH-NEXT:    mv a3, a1
+; RV32IZFH-NEXT:    mv a2, a1
 ; RV32IZFH-NEXT:  .LBB10_2: # %start
-; RV32IZFH-NEXT:    and a0, a2, a0
-; RV32IZFH-NEXT:    beqz s0, .LBB10_4
+; RV32IZFH-NEXT:    lui a1, %hi(.LCPI10_0)
+; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFH-NEXT:    beqz a3, .LBB10_4
 ; RV32IZFH-NEXT:  # %bb.3:
-; RV32IZFH-NEXT:    addi a3, a4, -1
+; RV32IZFH-NEXT:    addi a2, a4, -1
 ; RV32IZFH-NEXT:  .LBB10_4: # %start
-; RV32IZFH-NEXT:    and a1, a2, a3
-; RV32IZFH-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    flw fs0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    addi sp, sp, 32
+; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, a1
+; RV32IZFH-NEXT:    and a1, a4, a2
+; RV32IZFH-NEXT:    neg a2, a3
+; RV32IZFH-NEXT:    neg a3, s0
+; RV32IZFH-NEXT:    and a0, a3, a0
+; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    and a0, a4, a0
+; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT:    addi sp, sp, 16
 ; RV32IZFH-NEXT:    ret
 ;
 ; RV64IZFH-LABEL: fcvt_l_h_sat:
@@ -2199,47 +2193,41 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ;
 ; RV32IDZFH-LABEL: fcvt_l_h_sat:
 ; RV32IDZFH:       # %bb.0: # %start
-; RV32IDZFH-NEXT:    addi sp, sp, -32
-; RV32IDZFH-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IDZFH-NEXT:    addi sp, sp, -16
+; RV32IDZFH-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IDZFH-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IDZFH-NEXT:    fsd fs0, 0(sp) # 8-byte Folded Spill
-; RV32IDZFH-NEXT:    lui a0, %hi(.LCPI10_0)
-; RV32IDZFH-NEXT:    flw fa5, %lo(.LCPI10_0)(a0)
 ; RV32IDZFH-NEXT:    fcvt.s.h fs0, fa0
-; RV32IDZFH-NEXT:    flt.s s0, fa5, fs0
-; RV32IDZFH-NEXT:    neg s1, s0
 ; RV32IDZFH-NEXT:    lui a0, 913408
 ; RV32IDZFH-NEXT:    fmv.w.x fa5, a0
-; RV32IDZFH-NEXT:    fle.s s2, fa5, fs0
-; RV32IDZFH-NEXT:    neg s3, s2
+; RV32IDZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IDZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IDZFH-NEXT:    call __fixsfdi
-; RV32IDZFH-NEXT:    and a0, s3, a0
-; RV32IDZFH-NEXT:    or a0, s1, a0
-; RV32IDZFH-NEXT:    feq.s a2, fs0, fs0
-; RV32IDZFH-NEXT:    neg a2, a2
 ; RV32IDZFH-NEXT:    lui a4, 524288
-; RV32IDZFH-NEXT:    lui a3, 524288
-; RV32IDZFH-NEXT:    beqz s2, .LBB10_2
+; RV32IDZFH-NEXT:    lui a2, 524288
+; RV32IDZFH-NEXT:    beqz s0, .LBB10_2
 ; RV32IDZFH-NEXT:  # %bb.1: # %start
-; RV32IDZFH-NEXT:    mv a3, a1
+; RV32IDZFH-NEXT:    mv a2, a1
 ; RV32IDZFH-NEXT:  .LBB10_2: # %start
-; RV32IDZFH-NEXT:    and a0, a2, a0
-; RV32IDZFH-NEXT:    beqz s0, .LBB10_4
+; RV32IDZFH-NEXT:    lui a1, %hi(.LCPI10_0)
+; RV32IDZFH-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IDZFH-NEXT:    flt.s a3, fa5, fs0
+; RV32IDZFH-NEXT:    beqz a3, .LBB10_4
 ; RV32IDZFH-NEXT:  # %bb.3:
-; RV32IDZFH-NEXT:    addi a3, a4, -1
+; RV32IDZFH-NEXT:    addi a2, a4, -1
 ; RV32IDZFH-NEXT:  .LBB10_4: # %start
-; RV32IDZFH-NEXT:    and a1, a2, a3
-; RV32IDZFH-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IDZFH-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IDZFH-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IDZFH-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IDZFH-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IDZFH-NEXT:    feq.s a1, fs0, fs0
+; RV32IDZFH-NEXT:    neg a4, a1
+; RV32IDZFH-NEXT:    and a1, a4, a2
+; RV32IDZFH-NEXT:    neg a2, a3
+; RV32IDZFH-NEXT:    neg a3, s0
+; RV32IDZFH-NEXT:    and a0, a3, a0
+; RV32IDZFH-NEXT:    or a0, a2, a0
+; RV32IDZFH-NEXT:    and a0, a4, a0
+; RV32IDZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IDZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IDZFH-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
-; RV32IDZFH-NEXT:    addi sp, sp, 32
+; RV32IDZFH-NEXT:    addi sp, sp, 16
 ; RV32IDZFH-NEXT:    ret
 ;
 ; RV64IDZFH-LABEL: fcvt_l_h_sat:
@@ -2515,47 +2503,41 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ;
 ; RV32ID-LABEL: fcvt_l_h_sat:
 ; RV32ID:       # %bb.0: # %start
-; RV32ID-NEXT:    addi sp, sp, -32
-; RV32ID-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32ID-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32ID-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32ID-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32ID-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32ID-NEXT:    addi sp, sp, -16
+; RV32ID-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32ID-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    fsd fs0, 0(sp) # 8-byte Folded Spill
 ; RV32ID-NEXT:    call __extendhfsf2
-; RV32ID-NEXT:    lui a0, %hi(.LCPI10_0)
-; RV32ID-NEXT:    flw fa5, %lo(.LCPI10_0)(a0)
 ; RV32ID-NEXT:    fmv.s fs0, fa0
-; RV32ID-NEXT:    flt.s s0, fa5, fa0
-; RV32ID-NEXT:    neg s1, s0
 ; RV32ID-NEXT:    lui a0, 913408
 ; RV32ID-NEXT:    fmv.w.x fa5, a0
-; RV32ID-NEXT:    fle.s s2, fa5, fa0
-; RV32ID-NEXT:    neg s3, s2
+; RV32ID-NEXT:    fle.s s0, fa5, fa0
 ; RV32ID-NEXT:    call __fixsfdi
-; RV32ID-NEXT:    and a0, s3, a0
-; RV32ID-NEXT:    or a0, s1, a0
-; RV32ID-NEXT:    feq.s a2, fs0, fs0
-; RV32ID-NEXT:    neg a2, a2
 ; RV32ID-NEXT:    lui a4, 524288
-; RV32ID-NEXT:    lui a3, 524288
-; RV32ID-NEXT:    beqz s2, .LBB10_2
+; RV32ID-NEXT:    lui a2, 524288
+; RV32ID-NEXT:    beqz s0, .LBB10_2
 ; RV32ID-NEXT:  # %bb.1: # %start
-; RV32ID-NEXT:    mv a3, a1
+; RV32ID-NEXT:    mv a2, a1
 ; RV32ID-NEXT:  .LBB10_2: # %start
-; RV32ID-NEXT:    and a0, a2, a0
-; RV32ID-NEXT:    beqz s0, .LBB10_4
+; RV32ID-NEXT:    lui a1, %hi(.LCPI10_0)
+; RV32ID-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
+; RV32ID-NEXT:    flt.s a3, fa5, fs0
+; RV32ID-NEXT:    beqz a3, .LBB10_4
 ; RV32ID-NEXT:  # %bb.3:
-; RV32ID-NEXT:    addi a3, a4, -1
+; RV32ID-NEXT:    addi a2, a4, -1
 ; RV32ID-NEXT:  .LBB10_4: # %start
-; RV32ID-NEXT:    and a1, a2, a3
-; RV32ID-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32ID-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32ID-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32ID-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32ID-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32ID-NEXT:    feq.s a1, fs0, fs0
+; RV32ID-NEXT:    neg a4, a1
+; RV32ID-NEXT:    and a1, a4, a2
+; RV32ID-NEXT:    neg a2, s0
+; RV32ID-NEXT:    and a0, a2, a0
+; RV32ID-NEXT:    neg a2, a3
+; RV32ID-NEXT:    or a0, a2, a0
+; RV32ID-NEXT:    and a0, a4, a0
+; RV32ID-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32ID-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
-; RV32ID-NEXT:    addi sp, sp, 32
+; RV32ID-NEXT:    addi sp, sp, 16
 ; RV32ID-NEXT:    ret
 ;
 ; RV64ID-LABEL: fcvt_l_h_sat:
@@ -2574,47 +2556,41 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ;
 ; RV32IFZFHMIN-LABEL: fcvt_l_h_sat:
 ; RV32IFZFHMIN:       # %bb.0: # %start
-; RV32IFZFHMIN-NEXT:    addi sp, sp, -32
-; RV32IFZFHMIN-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT:    fsw fs0, 8(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT:    lui a0, %hi(.LCPI10_0)
-; RV32IFZFHMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a0)
+; RV32IFZFHMIN-NEXT:    addi sp, sp, -16
+; RV32IFZFHMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IFZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32IFZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IFZFHMIN-NEXT:    fcvt.s.h fs0, fa0
-; RV32IFZFHMIN-NEXT:    flt.s s0, fa5, fs0
-; RV32IFZFHMIN-NEXT:    neg s1, s0
 ; RV32IFZFHMIN-NEXT:    lui a0, 913408
 ; RV32IFZFHMIN-NEXT:    fmv.w.x fa5, a0
-; RV32IFZFHMIN-NEXT:    fle.s s2, fa5, fs0
-; RV32IFZFHMIN-NEXT:    neg s3, s2
+; RV32IFZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IFZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IFZFHMIN-NEXT:    call __fixsfdi
-; RV32IFZFHMIN-NEXT:    and a0, s3, a0
-; RV32IFZFHMIN-NEXT:    or a0, s1, a0
-; RV32IFZFHMIN-NEXT:    feq.s a2, fs0, fs0
-; RV32IFZFHMIN-NEXT:    neg a2, a2
 ; RV32IFZFHMIN-NEXT:    lui a4, 524288
-; RV32IFZFHMIN-NEXT:    lui a3, 524288
-; RV32IFZFHMIN-NEXT:    beqz s2, .LBB10_2
+; RV32IFZFHMIN-NEXT:    lui a2, 524288
+; RV32IFZFHMIN-NEXT:    beqz s0, .LBB10_2
 ; RV32IFZFHMIN-NEXT:  # %bb.1: # %start
-; RV32IFZFHMIN-NEXT:    mv a3, a1
+; RV32IFZFHMIN-NEXT:    mv a2, a1
 ; RV32IFZFHMIN-NEXT:  .LBB10_2: # %start
-; RV32IFZFHMIN-NEXT:    and a0, a2, a0
-; RV32IFZFHMIN-NEXT:    beqz s0, .LBB10_4
+; RV32IFZFHMIN-NEXT:    lui a1, %hi(.LCPI10_0)
+; RV32IFZFHMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IFZFHMIN-NEXT:    flt.s a3, fa5, fs0
+; RV32IFZFHMIN-NEXT:    beqz a3, .LBB10_4
 ; RV32IFZFHMIN-NEXT:  # %bb.3:
-; RV32IFZFHMIN-NEXT:    addi a3, a4, -1
+; RV32IFZFHMIN-NEXT:    addi a2, a4, -1
 ; RV32IFZFHMIN-NEXT:  .LBB10_4: # %start
-; RV32IFZFHMIN-NEXT:    and a1, a2, a3
-; RV32IFZFHMIN-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT:    flw fs0, 8(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT:    addi sp, sp, 32
+; RV32IFZFHMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IFZFHMIN-NEXT:    neg a4, a1
+; RV32IFZFHMIN-NEXT:    and a1, a4, a2
+; RV32IFZFHMIN-NEXT:    neg a2, a3
+; RV32IFZFHMIN-NEXT:    neg a3, s0
+; RV32IFZFHMIN-NEXT:    and a0, a3, a0
+; RV32IFZFHMIN-NEXT:    or a0, a2, a0
+; RV32IFZFHMIN-NEXT:    and a0, a4, a0
+; RV32IFZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT:    addi sp, sp, 16
 ; RV32IFZFHMIN-NEXT:    ret
 ;
 ; CHECK64-IZFHMIN-LABEL: fcvt_l_h_sat:
@@ -2629,47 +2605,41 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ;
 ; RV32IDZFHMIN-LABEL: fcvt_l_h_sat:
 ; RV32IDZFHMIN:       # %bb.0: # %start
-; RV32IDZFHMIN-NEXT:    addi sp, sp, -32
-; RV32IDZFHMIN-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IDZFHMIN-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IDZFHMIN-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IDZFHMIN-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IDZFHMIN-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IDZFHMIN-NEXT:    addi sp, sp, -16
+; RV32IDZFHMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IDZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IDZFHMIN-NEXT:    fsd fs0, 0(sp) # 8-byte Folded Spill
-; RV32IDZFHMIN-NEXT:    lui a0, %hi(.LCPI10_0)
-; RV32IDZFHMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a0)
 ; RV32IDZFHMIN-NEXT:    fcvt.s.h fs0, fa0
-; RV32IDZFHMIN-NEXT:    flt.s s0, fa5, fs0
-; RV32IDZFHMIN-NEXT:    neg s1, s0
 ; RV32IDZFHMIN-NEXT:    lui a0, 913408
 ; RV32IDZFHMIN-NEXT:    fmv.w.x fa5, a0
-; RV32IDZFHMIN-NEXT:    fle.s s2, fa5, fs0
-; RV32IDZFHMIN-NEXT:    neg s3, s2
+; RV32IDZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IDZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IDZFHMIN-NEXT:    call __fixsfdi
-; RV32IDZFHMIN-NEXT:    and a0, s3, a0
-; RV32IDZFHMIN-NEXT:    or a0, s1, a0
-; RV32IDZFHMIN-NEXT:    feq.s a2, fs0, fs0
-; RV32IDZFHMIN-NEXT:    neg a2, a2
 ; RV32IDZFHMIN-NEXT:    lui a4, 524288
-; RV32IDZFHMIN-NEXT:    lui a3, 524288
-; RV32IDZFHMIN-NEXT:    beqz s2, .LBB10_2
+; RV32IDZFHMIN-NEXT:    lui a2, 524288
+; RV32IDZFHMIN-NEXT:    beqz s0, .LBB10_2
 ; RV32IDZFHMIN-NEXT:  # %bb.1: # %start
-; RV32IDZFHMIN-NEXT:    mv a3, a1
+; RV32IDZFHMIN-NEXT:    mv a2, a1
 ; RV32IDZFHMIN-NEXT:  .LBB10_2: # %start
-; RV32IDZFHMIN-NEXT:    and a0, a2, a0
-; RV32IDZFHMIN-NEXT:    beqz s0, .LBB10_4
+; RV32IDZFHMIN-NEXT:    lui a1, %hi(.LCPI10_0)
+; RV32IDZFHMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IDZFHMIN-NEXT:    flt.s a3, fa5, fs0
+; RV32IDZFHMIN-NEXT:    beqz a3, .LBB10_4
 ; RV32IDZFHMIN-NEXT:  # %bb.3:
-; RV32IDZFHMIN-NEXT:    addi a3, a4, -1
+; RV32IDZFHMIN-NEXT:    addi a2, a4, -1
 ; RV32IDZFHMIN-NEXT:  .LBB10_4: # %start
-; RV32IDZFHMIN-NEXT:    and a1, a2, a3
-; RV32IDZFHMIN-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IDZFHMIN-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IDZFHMIN-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IDZFHMIN-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IDZFHMIN-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IDZFHMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IDZFHMIN-NEXT:    neg a4, a1
+; RV32IDZFHMIN-NEXT:    and a1, a4, a2
+; RV32IDZFHMIN-NEXT:    neg a2, a3
+; RV32IDZFHMIN-NEXT:    neg a3, s0
+; RV32IDZFHMIN-NEXT:    and a0, a3, a0
+; RV32IDZFHMIN-NEXT:    or a0, a2, a0
+; RV32IDZFHMIN-NEXT:    and a0, a4, a0
+; RV32IDZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IDZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IDZFHMIN-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
-; RV32IDZFHMIN-NEXT:    addi sp, sp, 32
+; RV32IDZFHMIN-NEXT:    addi sp, sp, 16
 ; RV32IDZFHMIN-NEXT:    ret
 ;
 ; CHECK32-IZHINXMIN-LABEL: fcvt_l_h_sat:
diff --git a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
index 9c95210bfa7c..04a8a66f4459 100644
--- a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
@@ -108,40 +108,38 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    addi sp, sp, -16
 ; RV32IZFH-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    fcvt.s.h fs0, fa0
 ; RV32IZFH-NEXT:    lui a0, 913408
 ; RV32IZFH-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
-; RV32IZFH-NEXT:    neg s1, s0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a2, %hi(.LCPI1_1)
-; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI1_1)(a2)
-; RV32IZFH-NEXT:    and a0, s1, a0
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFH-NEXT:    neg a2, a2
-; RV32IZFH-NEXT:    lui a5, 524288
 ; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB1_4
 ; RV32IZFH-NEXT:  # %bb.3:
-; RV32IZFH-NEXT:    mv a4, a1
+; RV32IZFH-NEXT:    mv a2, a1
 ; RV32IZFH-NEXT:  .LBB1_4:
+; RV32IZFH-NEXT:    lui a1, %hi(.LCPI1_1)
+; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI1_1)(a1)
+; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFH-NEXT:    beqz a3, .LBB1_6
+; RV32IZFH-NEXT:  # %bb.5:
+; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:  .LBB1_6:
+; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, a1
+; RV32IZFH-NEXT:    and a1, a4, a2
+; RV32IZFH-NEXT:    neg a2, s0
 ; RV32IZFH-NEXT:    and a0, a2, a0
+; RV32IZFH-NEXT:    neg a2, a3
+; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    and a0, a4, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    addi sp, sp, 16
-; RV32IZFH-NEXT:    beqz a3, .LBB1_6
-; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a4, a5, -1
-; RV32IZFH-NEXT:  .LBB1_6:
-; RV32IZFH-NEXT:    and a1, a2, a4
 ; RV32IZFH-NEXT:    ret
 ;
 ; RV64IZFH-LABEL: test_floor_si64:
@@ -179,16 +177,16 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    lui a2, %hi(.LCPI1_1)
 ; RV32IZHINX-NEXT:    lw a2, %lo(.LCPI1_1)(a2)
 ; RV32IZHINX-NEXT:    and a0, s2, a0
-; RV32IZHINX-NEXT:    flt.s a3, a2, s0
-; RV32IZHINX-NEXT:    neg a2, a3
+; RV32IZHINX-NEXT:    flt.s a4, a2, s0
+; RV32IZHINX-NEXT:    neg a2, a4
 ; RV32IZHINX-NEXT:    or a0, a2, a0
 ; RV32IZHINX-NEXT:    feq.s a2, s0, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    lui a5, 524288
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB1_4
 ; RV32IZHINX-NEXT:  # %bb.3:
-; RV32IZHINX-NEXT:    mv a4, a1
+; RV32IZHINX-NEXT:    mv a3, a1
 ; RV32IZHINX-NEXT:  .LBB1_4:
 ; RV32IZHINX-NEXT:    and a0, a2, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -196,11 +194,11 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    addi sp, sp, 16
-; RV32IZHINX-NEXT:    beqz a3, .LBB1_6
+; RV32IZHINX-NEXT:    beqz a4, .LBB1_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a4, a5, -1
+; RV32IZHINX-NEXT:    addi a3, a5, -1
 ; RV32IZHINX-NEXT:  .LBB1_6:
-; RV32IZHINX-NEXT:    and a1, a2, a4
+; RV32IZHINX-NEXT:    and a1, a2, a3
 ; RV32IZHINX-NEXT:    ret
 ;
 ; RV64IZHINX-LABEL: test_floor_si64:
@@ -238,41 +236,39 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    addi sp, sp, -16
 ; RV32IZFHMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT:    neg s1, s0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI1_0)
-; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI1_0)(a2)
-; RV32IZFHMIN-NEXT:    and a0, s1, a0
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a2, a2
-; RV32IZFHMIN-NEXT:    lui a5, 524288
 ; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB1_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
-; RV32IZFHMIN-NEXT:    mv a4, a1
+; RV32IZFHMIN-NEXT:    mv a2, a1
 ; RV32IZFHMIN-NEXT:  .LBB1_4:
+; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI1_0)(a1)
+; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a3, .LBB1_6
+; RV32IZFHMIN-NEXT:  # %bb.5:
+; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:  .LBB1_6:
+; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, a1
+; RV32IZFHMIN-NEXT:    and a1, a4, a2
+; RV32IZFHMIN-NEXT:    neg a2, s0
 ; RV32IZFHMIN-NEXT:    and a0, a2, a0
+; RV32IZFHMIN-NEXT:    neg a2, a3
+; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    and a0, a4, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    addi sp, sp, 16
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB1_6
-; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a4, a5, -1
-; RV32IZFHMIN-NEXT:  .LBB1_6:
-; RV32IZFHMIN-NEXT:    and a1, a2, a4
 ; RV32IZFHMIN-NEXT:    ret
 ;
 ; RV64IZFHMIN-LABEL: test_floor_si64:
@@ -324,16 +320,16 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    lui a2, %hi(.LCPI1_0)
 ; RV32IZHINXMIN-NEXT:    lw a2, %lo(.LCPI1_0)(a2)
 ; RV32IZHINXMIN-NEXT:    and a0, s2, a0
-; RV32IZHINXMIN-NEXT:    flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
+; RV32IZHINXMIN-NEXT:    flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT:    neg a2, a4
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
 ; RV32IZHINXMIN-NEXT:    feq.s a2, s0, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    lui a5, 524288
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB1_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
-; RV32IZHINXMIN-NEXT:    mv a4, a1
+; RV32IZHINXMIN-NEXT:    mv a3, a1
 ; RV32IZHINXMIN-NEXT:  .LBB1_4:
 ; RV32IZHINXMIN-NEXT:    and a0, a2, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -341,11 +337,11 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    addi sp, sp, 16
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB1_6
+; RV32IZHINXMIN-NEXT:    beqz a4, .LBB1_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a4, a5, -1
+; RV32IZHINXMIN-NEXT:    addi a3, a5, -1
 ; RV32IZHINXMIN-NEXT:  .LBB1_6:
-; RV32IZHINXMIN-NEXT:    and a1, a2, a4
+; RV32IZHINXMIN-NEXT:    and a1, a2, a3
 ; RV32IZHINXMIN-NEXT:    ret
 ;
 ; RV64IZHINXMIN-LABEL: test_floor_si64:
@@ -824,40 +820,38 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    addi sp, sp, -16
 ; RV32IZFH-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    fcvt.s.h fs0, fa0
 ; RV32IZFH-NEXT:    lui a0, 913408
 ; RV32IZFH-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
-; RV32IZFH-NEXT:    neg s1, s0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a2, %hi(.LCPI5_1)
-; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI5_1)(a2)
-; RV32IZFH-NEXT:    and a0, s1, a0
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFH-NEXT:    neg a2, a2
-; RV32IZFH-NEXT:    lui a5, 524288
 ; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB5_4
 ; RV32IZFH-NEXT:  # %bb.3:
-; RV32IZFH-NEXT:    mv a4, a1
+; RV32IZFH-NEXT:    mv a2, a1
 ; RV32IZFH-NEXT:  .LBB5_4:
+; RV32IZFH-NEXT:    lui a1, %hi(.LCPI5_1)
+; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI5_1)(a1)
+; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFH-NEXT:    beqz a3, .LBB5_6
+; RV32IZFH-NEXT:  # %bb.5:
+; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:  .LBB5_6:
+; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, a1
+; RV32IZFH-NEXT:    and a1, a4, a2
+; RV32IZFH-NEXT:    neg a2, s0
 ; RV32IZFH-NEXT:    and a0, a2, a0
+; RV32IZFH-NEXT:    neg a2, a3
+; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    and a0, a4, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    addi sp, sp, 16
-; RV32IZFH-NEXT:    beqz a3, .LBB5_6
-; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a4, a5, -1
-; RV32IZFH-NEXT:  .LBB5_6:
-; RV32IZFH-NEXT:    and a1, a2, a4
 ; RV32IZFH-NEXT:    ret
 ;
 ; RV64IZFH-LABEL: test_ceil_si64:
@@ -895,16 +889,16 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    lui a2, %hi(.LCPI5_1)
 ; RV32IZHINX-NEXT:    lw a2, %lo(.LCPI5_1)(a2)
 ; RV32IZHINX-NEXT:    and a0, s2, a0
-; RV32IZHINX-NEXT:    flt.s a3, a2, s0
-; RV32IZHINX-NEXT:    neg a2, a3
+; RV32IZHINX-NEXT:    flt.s a4, a2, s0
+; RV32IZHINX-NEXT:    neg a2, a4
 ; RV32IZHINX-NEXT:    or a0, a2, a0
 ; RV32IZHINX-NEXT:    feq.s a2, s0, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    lui a5, 524288
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB5_4
 ; RV32IZHINX-NEXT:  # %bb.3:
-; RV32IZHINX-NEXT:    mv a4, a1
+; RV32IZHINX-NEXT:    mv a3, a1
 ; RV32IZHINX-NEXT:  .LBB5_4:
 ; RV32IZHINX-NEXT:    and a0, a2, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -912,11 +906,11 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    addi sp, sp, 16
-; RV32IZHINX-NEXT:    beqz a3, .LBB5_6
+; RV32IZHINX-NEXT:    beqz a4, .LBB5_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a4, a5, -1
+; RV32IZHINX-NEXT:    addi a3, a5, -1
 ; RV32IZHINX-NEXT:  .LBB5_6:
-; RV32IZHINX-NEXT:    and a1, a2, a4
+; RV32IZHINX-NEXT:    and a1, a2, a3
 ; RV32IZHINX-NEXT:    ret
 ;
 ; RV64IZHINX-LABEL: test_ceil_si64:
@@ -954,41 +948,39 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    addi sp, sp, -16
 ; RV32IZFHMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT:    neg s1, s0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI5_0)
-; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI5_0)(a2)
-; RV32IZFHMIN-NEXT:    and a0, s1, a0
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a2, a2
-; RV32IZFHMIN-NEXT:    lui a5, 524288
 ; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB5_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
-; RV32IZFHMIN-NEXT:    mv a4, a1
+; RV32IZFHMIN-NEXT:    mv a2, a1
 ; RV32IZFHMIN-NEXT:  .LBB5_4:
+; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI5_0)
+; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI5_0)(a1)
+; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a3, .LBB5_6
+; RV32IZFHMIN-NEXT:  # %bb.5:
+; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:  .LBB5_6:
+; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, a1
+; RV32IZFHMIN-NEXT:    and a1, a4, a2
+; RV32IZFHMIN-NEXT:    neg a2, s0
 ; RV32IZFHMIN-NEXT:    and a0, a2, a0
+; RV32IZFHMIN-NEXT:    neg a2, a3
+; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    and a0, a4, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    addi sp, sp, 16
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB5_6
-; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a4, a5, -1
-; RV32IZFHMIN-NEXT:  .LBB5_6:
-; RV32IZFHMIN-NEXT:    and a1, a2, a4
 ; RV32IZFHMIN-NEXT:    ret
 ;
 ; RV64IZFHMIN-LABEL: test_ceil_si64:
@@ -1040,16 +1032,16 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    lui a2, %hi(.LCPI5_0)
 ; RV32IZHINXMIN-NEXT:    lw a2, %lo(.LCPI5_0)(a2)
 ; RV32IZHINXMIN-NEXT:    and a0, s2, a0
-; RV32IZHINXMIN-NEXT:    flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
+; RV32IZHINXMIN-NEXT:    flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT:    neg a2, a4
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
 ; RV32IZHINXMIN-NEXT:    feq.s a2, s0, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    lui a5, 524288
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB5_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
-; RV32IZHINXMIN-NEXT:    mv a4, a1
+; RV32IZHINXMIN-NEXT:    mv a3, a1
 ; RV32IZHINXMIN-NEXT:  .LBB5_4:
 ; RV32IZHINXMIN-NEXT:    and a0, a2, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -1057,11 +1049,11 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    addi sp, sp, 16
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB5_6
+; RV32IZHINXMIN-NEXT:    beqz a4, .LBB5_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a4, a5, -1
+; RV32IZHINXMIN-NEXT:    addi a3, a5, -1
 ; RV32IZHINXMIN-NEXT:  .LBB5_6:
-; RV32IZHINXMIN-NEXT:    and a1, a2, a4
+; RV32IZHINXMIN-NEXT:    and a1, a2, a3
 ; RV32IZHINXMIN-NEXT:    ret
 ;
 ; RV64IZHINXMIN-LABEL: test_ceil_si64:
@@ -1540,40 +1532,38 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    addi sp, sp, -16
 ; RV32IZFH-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    fcvt.s.h fs0, fa0
 ; RV32IZFH-NEXT:    lui a0, 913408
 ; RV32IZFH-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
-; RV32IZFH-NEXT:    neg s1, s0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a2, %hi(.LCPI9_1)
-; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI9_1)(a2)
-; RV32IZFH-NEXT:    and a0, s1, a0
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFH-NEXT:    neg a2, a2
-; RV32IZFH-NEXT:    lui a5, 524288
 ; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB9_4
 ; RV32IZFH-NEXT:  # %bb.3:
-; RV32IZFH-NEXT:    mv a4, a1
+; RV32IZFH-NEXT:    mv a2, a1
 ; RV32IZFH-NEXT:  .LBB9_4:
+; RV32IZFH-NEXT:    lui a1, %hi(.LCPI9_1)
+; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI9_1)(a1)
+; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFH-NEXT:    beqz a3, .LBB9_6
+; RV32IZFH-NEXT:  # %bb.5:
+; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:  .LBB9_6:
+; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, a1
+; RV32IZFH-NEXT:    and a1, a4, a2
+; RV32IZFH-NEXT:    neg a2, s0
 ; RV32IZFH-NEXT:    and a0, a2, a0
+; RV32IZFH-NEXT:    neg a2, a3
+; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    and a0, a4, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    addi sp, sp, 16
-; RV32IZFH-NEXT:    beqz a3, .LBB9_6
-; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a4, a5, -1
-; RV32IZFH-NEXT:  .LBB9_6:
-; RV32IZFH-NEXT:    and a1, a2, a4
 ; RV32IZFH-NEXT:    ret
 ;
 ; RV64IZFH-LABEL: test_trunc_si64:
@@ -1611,16 +1601,16 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    lui a2, %hi(.LCPI9_1)
 ; RV32IZHINX-NEXT:    lw a2, %lo(.LCPI9_1)(a2)
 ; RV32IZHINX-NEXT:    and a0, s2, a0
-; RV32IZHINX-NEXT:    flt.s a3, a2, s0
-; RV32IZHINX-NEXT:    neg a2, a3
+; RV32IZHINX-NEXT:    flt.s a4, a2, s0
+; RV32IZHINX-NEXT:    neg a2, a4
 ; RV32IZHINX-NEXT:    or a0, a2, a0
 ; RV32IZHINX-NEXT:    feq.s a2, s0, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    lui a5, 524288
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB9_4
 ; RV32IZHINX-NEXT:  # %bb.3:
-; RV32IZHINX-NEXT:    mv a4, a1
+; RV32IZHINX-NEXT:    mv a3, a1
 ; RV32IZHINX-NEXT:  .LBB9_4:
 ; RV32IZHINX-NEXT:    and a0, a2, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -1628,11 +1618,11 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    addi sp, sp, 16
-; RV32IZHINX-NEXT:    beqz a3, .LBB9_6
+; RV32IZHINX-NEXT:    beqz a4, .LBB9_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a4, a5, -1
+; RV32IZHINX-NEXT:    addi a3, a5, -1
 ; RV32IZHINX-NEXT:  .LBB9_6:
-; RV32IZHINX-NEXT:    and a1, a2, a4
+; RV32IZHINX-NEXT:    and a1, a2, a3
 ; RV32IZHINX-NEXT:    ret
 ;
 ; RV64IZHINX-LABEL: test_trunc_si64:
@@ -1670,41 +1660,39 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    addi sp, sp, -16
 ; RV32IZFHMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT:    neg s1, s0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI9_0)
-; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI9_0)(a2)
-; RV32IZFHMIN-NEXT:    and a0, s1, a0
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a2, a2
-; RV32IZFHMIN-NEXT:    lui a5, 524288
 ; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB9_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
-; RV32IZFHMIN-NEXT:    mv a4, a1
+; RV32IZFHMIN-NEXT:    mv a2, a1
 ; RV32IZFHMIN-NEXT:  .LBB9_4:
+; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI9_0)
+; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI9_0)(a1)
+; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a3, .LBB9_6
+; RV32IZFHMIN-NEXT:  # %bb.5:
+; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:  .LBB9_6:
+; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, a1
+; RV32IZFHMIN-NEXT:    and a1, a4, a2
+; RV32IZFHMIN-NEXT:    neg a2, s0
 ; RV32IZFHMIN-NEXT:    and a0, a2, a0
+; RV32IZFHMIN-NEXT:    neg a2, a3
+; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    and a0, a4, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    addi sp, sp, 16
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB9_6
-; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a4, a5, -1
-; RV32IZFHMIN-NEXT:  .LBB9_6:
-; RV32IZFHMIN-NEXT:    and a1, a2, a4
 ; RV32IZFHMIN-NEXT:    ret
 ;
 ; RV64IZFHMIN-LABEL: test_trunc_si64:
@@ -1756,16 +1744,16 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    lui a2, %hi(.LCPI9_0)
 ; RV32IZHINXMIN-NEXT:    lw a2, %lo(.LCPI9_0)(a2)
 ; RV32IZHINXMIN-NEXT:    and a0, s2, a0
-; RV32IZHINXMIN-NEXT:    flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
+; RV32IZHINXMIN-NEXT:    flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT:    neg a2, a4
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
 ; RV32IZHINXMIN-NEXT:    feq.s a2, s0, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    lui a5, 524288
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB9_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
-; RV32IZHINXMIN-NEXT:    mv a4, a1
+; RV32IZHINXMIN-NEXT:    mv a3, a1
 ; RV32IZHINXMIN-NEXT:  .LBB9_4:
 ; RV32IZHINXMIN-NEXT:    and a0, a2, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -1773,11 +1761,11 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    addi sp, sp, 16
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB9_6
+; RV32IZHINXMIN-NEXT:    beqz a4, .LBB9_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a4, a5, -1
+; RV32IZHINXMIN-NEXT:    addi a3, a5, -1
 ; RV32IZHINXMIN-NEXT:  .LBB9_6:
-; RV32IZHINXMIN-NEXT:    and a1, a2, a4
+; RV32IZHINXMIN-NEXT:    and a1, a2, a3
 ; RV32IZHINXMIN-NEXT:    ret
 ;
 ; RV64IZHINXMIN-LABEL: test_trunc_si64:
@@ -2256,40 +2244,38 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    addi sp, sp, -16
 ; RV32IZFH-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    fcvt.s.h fs0, fa0
 ; RV32IZFH-NEXT:    lui a0, 913408
 ; RV32IZFH-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
-; RV32IZFH-NEXT:    neg s1, s0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a2, %hi(.LCPI13_1)
-; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI13_1)(a2)
-; RV32IZFH-NEXT:    and a0, s1, a0
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFH-NEXT:    neg a2, a2
-; RV32IZFH-NEXT:    lui a5, 524288
 ; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB13_4
 ; RV32IZFH-NEXT:  # %bb.3:
-; RV32IZFH-NEXT:    mv a4, a1
+; RV32IZFH-NEXT:    mv a2, a1
 ; RV32IZFH-NEXT:  .LBB13_4:
+; RV32IZFH-NEXT:    lui a1, %hi(.LCPI13_1)
+; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI13_1)(a1)
+; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFH-NEXT:    beqz a3, .LBB13_6
+; RV32IZFH-NEXT:  # %bb.5:
+; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:  .LBB13_6:
+; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, a1
+; RV32IZFH-NEXT:    and a1, a4, a2
+; RV32IZFH-NEXT:    neg a2, s0
 ; RV32IZFH-NEXT:    and a0, a2, a0
+; RV32IZFH-NEXT:    neg a2, a3
+; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    and a0, a4, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    addi sp, sp, 16
-; RV32IZFH-NEXT:    beqz a3, .LBB13_6
-; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a4, a5, -1
-; RV32IZFH-NEXT:  .LBB13_6:
-; RV32IZFH-NEXT:    and a1, a2, a4
 ; RV32IZFH-NEXT:    ret
 ;
 ; RV64IZFH-LABEL: test_round_si64:
@@ -2327,16 +2313,16 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    lui a2, %hi(.LCPI13_1)
 ; RV32IZHINX-NEXT:    lw a2, %lo(.LCPI13_1)(a2)
 ; RV32IZHINX-NEXT:    and a0, s2, a0
-; RV32IZHINX-NEXT:    flt.s a3, a2, s0
-; RV32IZHINX-NEXT:    neg a2, a3
+; RV32IZHINX-NEXT:    flt.s a4, a2, s0
+; RV32IZHINX-NEXT:    neg a2, a4
 ; RV32IZHINX-NEXT:    or a0, a2, a0
 ; RV32IZHINX-NEXT:    feq.s a2, s0, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    lui a5, 524288
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB13_4
 ; RV32IZHINX-NEXT:  # %bb.3:
-; RV32IZHINX-NEXT:    mv a4, a1
+; RV32IZHINX-NEXT:    mv a3, a1
 ; RV32IZHINX-NEXT:  .LBB13_4:
 ; RV32IZHINX-NEXT:    and a0, a2, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -2344,11 +2330,11 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    addi sp, sp, 16
-; RV32IZHINX-NEXT:    beqz a3, .LBB13_6
+; RV32IZHINX-NEXT:    beqz a4, .LBB13_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a4, a5, -1
+; RV32IZHINX-NEXT:    addi a3, a5, -1
 ; RV32IZHINX-NEXT:  .LBB13_6:
-; RV32IZHINX-NEXT:    and a1, a2, a4
+; RV32IZHINX-NEXT:    and a1, a2, a3
 ; RV32IZHINX-NEXT:    ret
 ;
 ; RV64IZHINX-LABEL: test_round_si64:
@@ -2386,41 +2372,39 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    addi sp, sp, -16
 ; RV32IZFHMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT:    neg s1, s0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI13_0)
-; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI13_0)(a2)
-; RV32IZFHMIN-NEXT:    and a0, s1, a0
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a2, a2
-; RV32IZFHMIN-NEXT:    lui a5, 524288
 ; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB13_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
-; RV32IZFHMIN-NEXT:    mv a4, a1
+; RV32IZFHMIN-NEXT:    mv a2, a1
 ; RV32IZFHMIN-NEXT:  .LBB13_4:
+; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI13_0)
+; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI13_0)(a1)
+; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a3, .LBB13_6
+; RV32IZFHMIN-NEXT:  # %bb.5:
+; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:  .LBB13_6:
+; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, a1
+; RV32IZFHMIN-NEXT:    and a1, a4, a2
+; RV32IZFHMIN-NEXT:    neg a2, s0
 ; RV32IZFHMIN-NEXT:    and a0, a2, a0
+; RV32IZFHMIN-NEXT:    neg a2, a3
+; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    and a0, a4, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    addi sp, sp, 16
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB13_6
-; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a4, a5, -1
-; RV32IZFHMIN-NEXT:  .LBB13_6:
-; RV32IZFHMIN-NEXT:    and a1, a2, a4
 ; RV32IZFHMIN-NEXT:    ret
 ;
 ; RV64IZFHMIN-LABEL: test_round_si64:
@@ -2472,16 +2456,16 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    lui a2, %hi(.LCPI13_0)
 ; RV32IZHINXMIN-NEXT:    lw a2, %lo(.LCPI13_0)(a2)
 ; RV32IZHINXMIN-NEXT:    and a0, s2, a0
-; RV32IZHINXMIN-NEXT:    flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
+; RV32IZHINXMIN-NEXT:    flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT:    neg a2, a4
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
 ; RV32IZHINXMIN-NEXT:    feq.s a2, s0, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    lui a5, 524288
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB13_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
-; RV32IZHINXMIN-NEXT:    mv a4, a1
+; RV32IZHINXMIN-NEXT:    mv a3, a1
 ; RV32IZHINXMIN-NEXT:  .LBB13_4:
 ; RV32IZHINXMIN-NEXT:    and a0, a2, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -2489,11 +2473,11 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    addi sp, sp, 16
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB13_6
+; RV32IZHINXMIN-NEXT:    beqz a4, .LBB13_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a4, a5, -1
+; RV32IZHINXMIN-NEXT:    addi a3, a5, -1
 ; RV32IZHINXMIN-NEXT:  .LBB13_6:
-; RV32IZHINXMIN-NEXT:    and a1, a2, a4
+; RV32IZHINXMIN-NEXT:    and a1, a2, a3
 ; RV32IZHINXMIN-NEXT:    ret
 ;
 ; RV64IZHINXMIN-LABEL: test_round_si64:
@@ -2972,40 +2956,38 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    addi sp, sp, -16
 ; RV32IZFH-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    fcvt.s.h fs0, fa0
 ; RV32IZFH-NEXT:    lui a0, 913408
 ; RV32IZFH-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
-; RV32IZFH-NEXT:    neg s1, s0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a2, %hi(.LCPI17_1)
-; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI17_1)(a2)
-; RV32IZFH-NEXT:    and a0, s1, a0
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFH-NEXT:    neg a2, a2
-; RV32IZFH-NEXT:    lui a5, 524288
 ; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB17_4
 ; RV32IZFH-NEXT:  # %bb.3:
-; RV32IZFH-NEXT:    mv a4, a1
+; RV32IZFH-NEXT:    mv a2, a1
 ; RV32IZFH-NEXT:  .LBB17_4:
+; RV32IZFH-NEXT:    lui a1, %hi(.LCPI17_1)
+; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI17_1)(a1)
+; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFH-NEXT:    beqz a3, .LBB17_6
+; RV32IZFH-NEXT:  # %bb.5:
+; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:  .LBB17_6:
+; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, a1
+; RV32IZFH-NEXT:    and a1, a4, a2
+; RV32IZFH-NEXT:    neg a2, s0
 ; RV32IZFH-NEXT:    and a0, a2, a0
+; RV32IZFH-NEXT:    neg a2, a3
+; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    and a0, a4, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    addi sp, sp, 16
-; RV32IZFH-NEXT:    beqz a3, .LBB17_6
-; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a4, a5, -1
-; RV32IZFH-NEXT:  .LBB17_6:
-; RV32IZFH-NEXT:    and a1, a2, a4
 ; RV32IZFH-NEXT:    ret
 ;
 ; RV64IZFH-LABEL: test_roundeven_si64:
@@ -3043,16 +3025,16 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    lui a2, %hi(.LCPI17_1)
 ; RV32IZHINX-NEXT:    lw a2, %lo(.LCPI17_1)(a2)
 ; RV32IZHINX-NEXT:    and a0, s2, a0
-; RV32IZHINX-NEXT:    flt.s a3, a2, s0
-; RV32IZHINX-NEXT:    neg a2, a3
+; RV32IZHINX-NEXT:    flt.s a4, a2, s0
+; RV32IZHINX-NEXT:    neg a2, a4
 ; RV32IZHINX-NEXT:    or a0, a2, a0
 ; RV32IZHINX-NEXT:    feq.s a2, s0, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    lui a5, 524288
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB17_4
 ; RV32IZHINX-NEXT:  # %bb.3:
-; RV32IZHINX-NEXT:    mv a4, a1
+; RV32IZHINX-NEXT:    mv a3, a1
 ; RV32IZHINX-NEXT:  .LBB17_4:
 ; RV32IZHINX-NEXT:    and a0, a2, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -3060,11 +3042,11 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    addi sp, sp, 16
-; RV32IZHINX-NEXT:    beqz a3, .LBB17_6
+; RV32IZHINX-NEXT:    beqz a4, .LBB17_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a4, a5, -1
+; RV32IZHINX-NEXT:    addi a3, a5, -1
 ; RV32IZHINX-NEXT:  .LBB17_6:
-; RV32IZHINX-NEXT:    and a1, a2, a4
+; RV32IZHINX-NEXT:    and a1, a2, a3
 ; RV32IZHINX-NEXT:    ret
 ;
 ; RV64IZHINX-LABEL: test_roundeven_si64:
@@ -3102,41 +3084,39 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    addi sp, sp, -16
 ; RV32IZFHMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT:    neg s1, s0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI17_0)
-; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI17_0)(a2)
-; RV32IZFHMIN-NEXT:    and a0, s1, a0
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a2, a2
-; RV32IZFHMIN-NEXT:    lui a5, 524288
 ; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB17_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
-; RV32IZFHMIN-NEXT:    mv a4, a1
+; RV32IZFHMIN-NEXT:    mv a2, a1
 ; RV32IZFHMIN-NEXT:  .LBB17_4:
+; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI17_0)
+; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI17_0)(a1)
+; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a3, .LBB17_6
+; RV32IZFHMIN-NEXT:  # %bb.5:
+; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:  .LBB17_6:
+; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, a1
+; RV32IZFHMIN-NEXT:    and a1, a4, a2
+; RV32IZFHMIN-NEXT:    neg a2, s0
 ; RV32IZFHMIN-NEXT:    and a0, a2, a0
+; RV32IZFHMIN-NEXT:    neg a2, a3
+; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    and a0, a4, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    addi sp, sp, 16
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB17_6
-; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a4, a5, -1
-; RV32IZFHMIN-NEXT:  .LBB17_6:
-; RV32IZFHMIN-NEXT:    and a1, a2, a4
 ; RV32IZFHMIN-NEXT:    ret
 ;
 ; RV64IZFHMIN-LABEL: test_roundeven_si64:
@@ -3188,16 +3168,16 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    lui a2, %hi(.LCPI17_0)
 ; RV32IZHINXMIN-NEXT:    lw a2, %lo(.LCPI17_0)(a2)
 ; RV32IZHINXMIN-NEXT:    and a0, s2, a0
-; RV32IZHINXMIN-NEXT:    flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
+; RV32IZHINXMIN-NEXT:    flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT:    neg a2, a4
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
 ; RV32IZHINXMIN-NEXT:    feq.s a2, s0, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    lui a5, 524288
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB17_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
-; RV32IZHINXMIN-NEXT:    mv a4, a1
+; RV32IZHINXMIN-NEXT:    mv a3, a1
 ; RV32IZHINXMIN-NEXT:  .LBB17_4:
 ; RV32IZHINXMIN-NEXT:    and a0, a2, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -3205,11 +3185,11 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    addi sp, sp, 16
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB17_6
+; RV32IZHINXMIN-NEXT:    beqz a4, .LBB17_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a4, a5, -1
+; RV32IZHINXMIN-NEXT:    addi a3, a5, -1
 ; RV32IZHINXMIN-NEXT:  .LBB17_6:
-; RV32IZHINXMIN-NEXT:    and a1, a2, a4
+; RV32IZHINXMIN-NEXT:    and a1, a2, a3
 ; RV32IZHINXMIN-NEXT:    ret
 ;
 ; RV64IZHINXMIN-LABEL: test_roundeven_si64:
@@ -3688,40 +3668,38 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    addi sp, sp, -16
 ; RV32IZFH-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFH-NEXT:    fcvt.s.h fs0, fa0
 ; RV32IZFH-NEXT:    lui a0, 913408
 ; RV32IZFH-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
-; RV32IZFH-NEXT:    neg s1, s0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a2, %hi(.LCPI21_1)
-; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI21_1)(a2)
-; RV32IZFH-NEXT:    and a0, s1, a0
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFH-NEXT:    neg a2, a2
-; RV32IZFH-NEXT:    lui a5, 524288
 ; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB21_4
 ; RV32IZFH-NEXT:  # %bb.3:
-; RV32IZFH-NEXT:    mv a4, a1
+; RV32IZFH-NEXT:    mv a2, a1
 ; RV32IZFH-NEXT:  .LBB21_4:
+; RV32IZFH-NEXT:    lui a1, %hi(.LCPI21_1)
+; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI21_1)(a1)
+; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFH-NEXT:    beqz a3, .LBB21_6
+; RV32IZFH-NEXT:  # %bb.5:
+; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:  .LBB21_6:
+; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, a1
+; RV32IZFH-NEXT:    and a1, a4, a2
+; RV32IZFH-NEXT:    neg a2, s0
 ; RV32IZFH-NEXT:    and a0, a2, a0
+; RV32IZFH-NEXT:    neg a2, a3
+; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    and a0, a4, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    addi sp, sp, 16
-; RV32IZFH-NEXT:    beqz a3, .LBB21_6
-; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a4, a5, -1
-; RV32IZFH-NEXT:  .LBB21_6:
-; RV32IZFH-NEXT:    and a1, a2, a4
 ; RV32IZFH-NEXT:    ret
 ;
 ; RV64IZFH-LABEL: test_rint_si64:
@@ -3759,16 +3737,16 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    lui a2, %hi(.LCPI21_1)
 ; RV32IZHINX-NEXT:    lw a2, %lo(.LCPI21_1)(a2)
 ; RV32IZHINX-NEXT:    and a0, s2, a0
-; RV32IZHINX-NEXT:    flt.s a3, a2, s0
-; RV32IZHINX-NEXT:    neg a2, a3
+; RV32IZHINX-NEXT:    flt.s a4, a2, s0
+; RV32IZHINX-NEXT:    neg a2, a4
 ; RV32IZHINX-NEXT:    or a0, a2, a0
 ; RV32IZHINX-NEXT:    feq.s a2, s0, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    lui a5, 524288
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB21_4
 ; RV32IZHINX-NEXT:  # %bb.3:
-; RV32IZHINX-NEXT:    mv a4, a1
+; RV32IZHINX-NEXT:    mv a3, a1
 ; RV32IZHINX-NEXT:  .LBB21_4:
 ; RV32IZHINX-NEXT:    and a0, a2, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -3776,11 +3754,11 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    addi sp, sp, 16
-; RV32IZHINX-NEXT:    beqz a3, .LBB21_6
+; RV32IZHINX-NEXT:    beqz a4, .LBB21_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a4, a5, -1
+; RV32IZHINX-NEXT:    addi a3, a5, -1
 ; RV32IZHINX-NEXT:  .LBB21_6:
-; RV32IZHINX-NEXT:    and a1, a2, a4
+; RV32IZHINX-NEXT:    and a1, a2, a3
 ; RV32IZHINX-NEXT:    ret
 ;
 ; RV64IZHINX-LABEL: test_rint_si64:
@@ -3818,41 +3796,39 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    addi sp, sp, -16
 ; RV32IZFHMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT:    fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT:    neg s1, s0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI21_0)
-; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI21_0)(a2)
-; RV32IZFHMIN-NEXT:    and a0, s1, a0
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a2, a2
-; RV32IZFHMIN-NEXT:    lui a5, 524288
 ; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB21_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
-; RV32IZFHMIN-NEXT:    mv a4, a1
+; RV32IZFHMIN-NEXT:    mv a2, a1
 ; RV32IZFHMIN-NEXT:  .LBB21_4:
+; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI21_0)
+; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI21_0)(a1)
+; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a3, .LBB21_6
+; RV32IZFHMIN-NEXT:  # %bb.5:
+; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:  .LBB21_6:
+; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, a1
+; RV32IZFHMIN-NEXT:    and a1, a4, a2
+; RV32IZFHMIN-NEXT:    neg a2, s0
 ; RV32IZFHMIN-NEXT:    and a0, a2, a0
+; RV32IZFHMIN-NEXT:    neg a2, a3
+; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    and a0, a4, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT:    flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    addi sp, sp, 16
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB21_6
-; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a4, a5, -1
-; RV32IZFHMIN-NEXT:  .LBB21_6:
-; RV32IZFHMIN-NEXT:    and a1, a2, a4
 ; RV32IZFHMIN-NEXT:    ret
 ;
 ; RV64IZFHMIN-LABEL: test_rint_si64:
@@ -3904,16 +3880,16 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    lui a2, %hi(.LCPI21_0)
 ; RV32IZHINXMIN-NEXT:    lw a2, %lo(.LCPI21_0)(a2)
 ; RV32IZHINXMIN-NEXT:    and a0, s2, a0
-; RV32IZHINXMIN-NEXT:    flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
+; RV32IZHINXMIN-NEXT:    flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT:    neg a2, a4
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
 ; RV32IZHINXMIN-NEXT:    feq.s a2, s0, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    lui a5, 524288
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB21_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
-; RV32IZHINXMIN-NEXT:    mv a4, a1
+; RV32IZHINXMIN-NEXT:    mv a3, a1
 ; RV32IZHINXMIN-NEXT:  .LBB21_4:
 ; RV32IZHINXMIN-NEXT:    and a0, a2, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -3921,11 +3897,11 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    addi sp, sp, 16
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB21_6
+; RV32IZHINXMIN-NEXT:    beqz a4, .LBB21_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a4, a5, -1
+; RV32IZHINXMIN-NEXT:    addi a3, a5, -1
 ; RV32IZHINXMIN-NEXT:  .LBB21_6:
-; RV32IZHINXMIN-NEXT:    and a1, a2, a4
+; RV32IZHINXMIN-NEXT:    and a1, a2, a3
 ; RV32IZHINXMIN-NEXT:    ret
 ;
 ; RV64IZHINXMIN-LABEL: test_rint_si64:
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index 98c886333d69..a0c85ab4dca7 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -630,8 +630,8 @@ define void @zext16_abs8(i8 %x, ptr %p) {
 ; RV32I-LABEL: zext16_abs8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    srai a2, a0, 31
+; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    xor a0, a0, a2
 ; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    sh a0, 0(a1)
@@ -648,8 +648,8 @@ define void @zext16_abs8(i8 %x, ptr %p) {
 ; RV64I-LABEL: zext16_abs8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    srai a2, a0, 63
+; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    xor a0, a0, a2
 ; RV64I-NEXT:    subw a0, a0, a2
 ; RV64I-NEXT:    sh a0, 0(a1)
diff --git a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
new file mode 100644
index 000000000000..3fa494e1a57d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 | FileCheck %s --check-prefixes=CHECK-NOZBB
+; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb | FileCheck %s --check-prefixes=CHECK-ZBB
+
+; This test case miscompiled for ZBB (DAGCombiner turned a SELECT into a more
+; poisonous AND operation).
+define i1 @pr84653(i32 %x) {
+; CHECK-NOZBB-LABEL: pr84653:
+; CHECK-NOZBB:       # %bb.0:
+; CHECK-NOZBB-NEXT:    sext.w a1, a0
+; CHECK-NOZBB-NEXT:    sgtz a2, a1
+; CHECK-NOZBB-NEXT:    lui a3, 524288
+; CHECK-NOZBB-NEXT:    addi a3, a3, -1
+; CHECK-NOZBB-NEXT:    xor a0, a0, a3
+; CHECK-NOZBB-NEXT:    sext.w a0, a0
+; CHECK-NOZBB-NEXT:    slt a0, a0, a1
+; CHECK-NOZBB-NEXT:    and a0, a2, a0
+; CHECK-NOZBB-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: pr84653:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    sext.w a1, a0
+; CHECK-ZBB-NEXT:    lui a2, 524288
+; CHECK-ZBB-NEXT:    addi a2, a2, -1
+; CHECK-ZBB-NEXT:    xor a0, a0, a2
+; CHECK-ZBB-NEXT:    sext.w a0, a0
+; CHECK-ZBB-NEXT:    max a0, a0, zero
+; CHECK-ZBB-NEXT:    slt a0, a0, a1
+; CHECK-ZBB-NEXT:    ret
+  %cmp1 = icmp sgt i32 %x, 0
+  %sub = sub nsw i32 2147483647, %x  ; 0x7fffffff
+  %cmp2 = icmp sgt i32 %x, %sub
+  %r = select i1 %cmp1, i1 %cmp2, i1 false
+  ret i1 %r
+}
+
+; This test case miscompiled for ZBB (DAGCombiner turned a SELECT into a more
+; poisonous AND operation).
+define i1 @pr85190(i64 %a) {
+; CHECK-NOZBB-LABEL: pr85190:
+; CHECK-NOZBB:       # %bb.0:
+; CHECK-NOZBB-NEXT:    ori a1, a0, 7
+; CHECK-NOZBB-NEXT:    slti a2, a0, 0
+; CHECK-NOZBB-NEXT:    li a3, -1
+; CHECK-NOZBB-NEXT:    slli a3, a3, 63
+; CHECK-NOZBB-NEXT:    sub a3, a3, a1
+; CHECK-NOZBB-NEXT:    slt a0, a0, a3
+; CHECK-NOZBB-NEXT:    and a0, a2, a0
+; CHECK-NOZBB-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: pr85190:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    ori a1, a0, 7
+; CHECK-ZBB-NEXT:    li a2, -1
+; CHECK-ZBB-NEXT:    slli a2, a2, 63
+; CHECK-ZBB-NEXT:    sub a2, a2, a1
+; CHECK-ZBB-NEXT:    slt a0, a0, a2
+; CHECK-ZBB-NEXT:    ret
+  %or = or i64 %a, 7
+  %cmp1 = icmp slt i64 %a, 0
+  %sub = sub nsw i64 -9223372036854775808, %or  ; 0x8000000000000000
+  %cmp2 = icmp sgt i64 %sub, %a
+  %res = select i1 %cmp1, i1 %cmp2, i1 false
+  ret i1 %res
+}
+
+define i1 @select_to_or(i32 %x) {
+; CHECK-NOZBB-LABEL: select_to_or:
+; CHECK-NOZBB:       # %bb.0:
+; CHECK-NOZBB-NEXT:    sext.w a1, a0
+; CHECK-NOZBB-NEXT:    sgtz a2, a1
+; CHECK-NOZBB-NEXT:    lui a3, 524288
+; CHECK-NOZBB-NEXT:    addi a3, a3, -1
+; CHECK-NOZBB-NEXT:    xor a0, a0, a3
+; CHECK-NOZBB-NEXT:    sext.w a0, a0
+; CHECK-NOZBB-NEXT:    slt a0, a0, a1
+; CHECK-NOZBB-NEXT:    or a0, a2, a0
+; CHECK-NOZBB-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: select_to_or:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    sext.w a1, a0
+; CHECK-ZBB-NEXT:    lui a2, 524288
+; CHECK-ZBB-NEXT:    addi a2, a2, -1
+; CHECK-ZBB-NEXT:    xor a0, a0, a2
+; CHECK-ZBB-NEXT:    sext.w a0, a0
+; CHECK-ZBB-NEXT:    min a0, a0, zero
+; CHECK-ZBB-NEXT:    slt a0, a0, a1
+; CHECK-ZBB-NEXT:    ret
+  %cmp1 = icmp sgt i32 %x, 0
+  %sub = sub nsw i32 2147483647, %x  ; 0x7fffffff
+  %cmp2 = icmp sgt i32 %x, %sub
+  %r = select i1 %cmp1, i1 true, i1 %cmp2
+  ret i1 %r
+}
diff --git a/llvm/test/CodeGen/RISCV/pr90652.ll b/llvm/test/CodeGen/RISCV/pr90652.ll
new file mode 100644
index 000000000000..2162395b92ac
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr90652.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 | FileCheck %s
+
+define i1 @test(i64 %x, i1 %cond1, i1 %cond2) {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi a3, a0, 1
+; CHECK-NEXT:    slt a0, a3, a0
+; CHECK-NEXT:    not a1, a1
+; CHECK-NEXT:    and a0, a1, a0
+; CHECK-NEXT:    or a0, a2, a0
+; CHECK-NEXT:    ret
+entry:
+  %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %x, i64 1)
+  %ov = extractvalue { i64, i1 } %sadd, 1
+  %or = or i1 %cond2, %ov
+  %sel = select i1 %cond1, i1 %cond2, i1 %or
+  ret i1 %sel
+}
diff --git a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
index 3ce56318426a..81ef6072449e 100644
--- a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
+++ b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
@@ -26,7 +26,10 @@ define void @last_chance_recoloring_failure() {
 ; CHECK-NEXT:    li a0, 55
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vloxseg2ei32.v v16, (a0), v8
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
@@ -37,37 +40,24 @@ define void @last_chance_recoloring_failure() {
 ; CHECK-NEXT:    li s0, 36
 ; CHECK-NEXT:    vsetvli zero, s0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwadd.vv v16, v8, v12, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    call func
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vrgather.vv v16, v8, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, s0, e16, m4, ta, ma
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 2
-; CHECK-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwsub.wv v8, v0, v20
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT:    vssubu.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    vsetvli zero, s0, e32, m8, tu, mu
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfdiv.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 2
+; CHECK-NEXT:    vl4r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl4r.v v20, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwsub.wv v8, v24, v16
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, tu, mu
+; CHECK-NEXT:    vfdiv.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
@@ -109,25 +99,20 @@ define void @last_chance_recoloring_failure() {
 ; SUBREGLIVENESS-NEXT:    addi a0, sp, 16
 ; SUBREGLIVENESS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; SUBREGLIVENESS-NEXT:    call func
-; SUBREGLIVENESS-NEXT:    li a0, 32
-; SUBREGLIVENESS-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; SUBREGLIVENESS-NEXT:    vrgather.vv v16, v8, v12, v0.t
 ; SUBREGLIVENESS-NEXT:    vsetvli zero, s0, e16, m4, ta, ma
+; SUBREGLIVENESS-NEXT:    csrr a0, vlenb
+; SUBREGLIVENESS-NEXT:    slli a0, a0, 3
+; SUBREGLIVENESS-NEXT:    add a0, sp, a0
+; SUBREGLIVENESS-NEXT:    addi a0, a0, 16
 ; SUBREGLIVENESS-NEXT:    csrr a1, vlenb
-; SUBREGLIVENESS-NEXT:    slli a1, a1, 3
-; SUBREGLIVENESS-NEXT:    add a1, sp, a1
-; SUBREGLIVENESS-NEXT:    addi a1, a1, 16
-; SUBREGLIVENESS-NEXT:    csrr a2, vlenb
-; SUBREGLIVENESS-NEXT:    slli a2, a2, 2
-; SUBREGLIVENESS-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
-; SUBREGLIVENESS-NEXT:    add a1, a1, a2
-; SUBREGLIVENESS-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
-; SUBREGLIVENESS-NEXT:    addi a1, sp, 16
-; SUBREGLIVENESS-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; SUBREGLIVENESS-NEXT:    vfwsub.wv v8, v24, v20
-; SUBREGLIVENESS-NEXT:    vsetvli zero, a0, e16, m4, tu, mu
-; SUBREGLIVENESS-NEXT:    vssubu.vv v16, v16, v8, v0.t
-; SUBREGLIVENESS-NEXT:    vsetvli zero, s0, e32, m8, tu, mu
+; SUBREGLIVENESS-NEXT:    slli a1, a1, 2
+; SUBREGLIVENESS-NEXT:    vl4r.v v16, (a0) # Unknown-size Folded Reload
+; SUBREGLIVENESS-NEXT:    add a0, a0, a1
+; SUBREGLIVENESS-NEXT:    vl4r.v v20, (a0) # Unknown-size Folded Reload
+; SUBREGLIVENESS-NEXT:    addi a0, sp, 16
+; SUBREGLIVENESS-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; SUBREGLIVENESS-NEXT:    vfwsub.wv v8, v24, v16
+; SUBREGLIVENESS-NEXT:    vsetvli zero, zero, e32, m8, tu, mu
 ; SUBREGLIVENESS-NEXT:    vfdiv.vv v8, v24, v8, v0.t
 ; SUBREGLIVENESS-NEXT:    vse32.v v8, (a0)
 ; SUBREGLIVENESS-NEXT:    csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
index 9f06a9dd124c..c3ae40124ba0 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
@@ -634,7 +634,6 @@ define i64 @zext_mul288(i32 signext %a) {
 }
 
 ; We can't use slli.uw becaues the shift amount is more than 31.
-; FIXME: The zext.w is unneeded.
 define i64 @zext_mul12884901888(i32 signext %a) {
 ; RV64I-LABEL: zext_mul12884901888:
 ; RV64I:       # %bb.0:
@@ -647,7 +646,6 @@ define i64 @zext_mul12884901888(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul12884901888:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    andi a0, a0, -1
 ; RV64ZBA-NEXT:    sh1add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
 ; RV64ZBA-NEXT:    ret
@@ -657,7 +655,6 @@ define i64 @zext_mul12884901888(i32 signext %a) {
 }
 
 ; We can't use slli.uw becaues the shift amount is more than 31.
-; FIXME: The zext.w is unneeded.
 define i64 @zext_mul21474836480(i32 signext %a) {
 ; RV64I-LABEL: zext_mul21474836480:
 ; RV64I:       # %bb.0:
@@ -670,7 +667,6 @@ define i64 @zext_mul21474836480(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul21474836480:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    andi a0, a0, -1
 ; RV64ZBA-NEXT:    sh2add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
 ; RV64ZBA-NEXT:    ret
@@ -680,7 +676,6 @@ define i64 @zext_mul21474836480(i32 signext %a) {
 }
 
 ; We can't use slli.uw becaues the shift amount is more than 31.
-; FIXME: The zext.w is unneeded.
 define i64 @zext_mul38654705664(i32 signext %a) {
 ; RV64I-LABEL: zext_mul38654705664:
 ; RV64I:       # %bb.0:
@@ -693,7 +688,6 @@ define i64 @zext_mul38654705664(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul38654705664:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    andi a0, a0, -1
 ; RV64ZBA-NEXT:    sh3add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
 ; RV64ZBA-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb-zbkb.ll
index 39a5b9b0f367..c98ad4592a66 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb-zbkb.ll
@@ -102,11 +102,10 @@ declare i32 @llvm.fshl.i32(i32, i32, i32)
 define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: rol_i32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    andi a2, a1, -1
-; RV64I-NEXT:    sllw a1, a0, a1
-; RV64I-NEXT:    negw a2, a2
-; RV64I-NEXT:    srlw a0, a0, a2
-; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    sllw a2, a0, a1
+; RV64I-NEXT:    negw a1, a1
+; RV64I-NEXT:    srlw a0, a0, a1
+; RV64I-NEXT:    or a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-ZBKB-LABEL: rol_i32:
@@ -121,11 +120,10 @@ define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind {
 define void @rol_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
 ; RV64I-LABEL: rol_i32_nosext:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    andi a3, a1, -1
-; RV64I-NEXT:    sllw a1, a0, a1
-; RV64I-NEXT:    negw a3, a3
-; RV64I-NEXT:    srlw a0, a0, a3
-; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    sllw a3, a0, a1
+; RV64I-NEXT:    negw a1, a1
+; RV64I-NEXT:    srlw a0, a0, a1
+; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    sw a0, 0(a2)
 ; RV64I-NEXT:    ret
 ;
@@ -142,12 +140,11 @@ define void @rol_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
 define signext i32 @rol_i32_neg_constant_rhs(i32 signext %a) nounwind {
 ; RV64I-LABEL: rol_i32_neg_constant_rhs:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    andi a1, a0, -1
-; RV64I-NEXT:    li a2, -2
-; RV64I-NEXT:    sllw a0, a2, a0
-; RV64I-NEXT:    negw a1, a1
-; RV64I-NEXT:    srlw a1, a2, a1
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    li a1, -2
+; RV64I-NEXT:    sllw a2, a1, a0
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    srlw a0, a1, a0
+; RV64I-NEXT:    or a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-ZBKB-LABEL: rol_i32_neg_constant_rhs:
@@ -183,11 +180,10 @@ declare i32 @llvm.fshr.i32(i32, i32, i32)
 define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: ror_i32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    andi a2, a1, -1
-; RV64I-NEXT:    srlw a1, a0, a1
-; RV64I-NEXT:    negw a2, a2
-; RV64I-NEXT:    sllw a0, a0, a2
-; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srlw a2, a0, a1
+; RV64I-NEXT:    negw a1, a1
+; RV64I-NEXT:    sllw a0, a0, a1
+; RV64I-NEXT:    or a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-ZBKB-LABEL: ror_i32:
@@ -202,11 +198,10 @@ define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind {
 define void @ror_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
 ; RV64I-LABEL: ror_i32_nosext:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    andi a3, a1, -1
-; RV64I-NEXT:    srlw a1, a0, a1
-; RV64I-NEXT:    negw a3, a3
-; RV64I-NEXT:    sllw a0, a0, a3
-; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    srlw a3, a0, a1
+; RV64I-NEXT:    negw a1, a1
+; RV64I-NEXT:    sllw a0, a0, a1
+; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    sw a0, 0(a2)
 ; RV64I-NEXT:    ret
 ;
@@ -223,12 +218,11 @@ define void @ror_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind {
 define signext i32 @ror_i32_neg_constant_rhs(i32 signext %a) nounwind {
 ; RV64I-LABEL: ror_i32_neg_constant_rhs:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    andi a1, a0, -1
-; RV64I-NEXT:    li a2, -2
-; RV64I-NEXT:    srlw a0, a2, a0
-; RV64I-NEXT:    negw a1, a1
-; RV64I-NEXT:    sllw a1, a2, a1
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    li a1, -2
+; RV64I-NEXT:    srlw a2, a1, a0
+; RV64I-NEXT:    negw a0, a0
+; RV64I-NEXT:    sllw a0, a1, a0
+; RV64I-NEXT:    or a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-ZBKB-LABEL: ror_i32_neg_constant_rhs:
diff --git a/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll b/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll
index 6a441e2b9f67..b8c43289bdfe 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll
@@ -192,3 +192,17 @@ entry:
   %or = or i32 %and, 255
   ret i32 %or
 }
+
+define i64 @and_allones(i32 signext %x) {
+; CHECK-LABEL: and_allones:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    li a1, 1
+; CHECK-NEXT:    sll a0, a1, a0
+; CHECK-NEXT:    ret
+entry:
+  %y = zext i32 %x to i64
+  %shamt = add nsw i64 %y, -1
+  %ret = shl i64 1, %shamt
+  ret i64 %ret
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 0efc45b99289..817e2b7d0bd9 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -4,7 +4,9 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+zba -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s -check-prefixes=CHECK,RV64ZBA,RV64ZBANOZBB
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+zba,+zbb -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefixes=CHECK,RV64ZBA,RV64ZBAZBB
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV64ZBA,RV64ZBAZBB,RV64ZBAZBBNOZBS
+; RUN: llc -mtriple=riscv64 -mattr=+m,+zba,+zbb,+zbs -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV64ZBA,RV64ZBAZBB,RV64ZBAZBBZBS
 
 define i64 @slliuw(i64 %a) nounwind {
 ; RV64I-LABEL: slliuw:
@@ -851,7 +853,6 @@ define i64 @zext_mul288(i32 signext %a) {
 }
 
 ; We can't use slli.uw becaues the shift amount is more than 31.
-; FIXME: The zext.w is unneeded.
 define i64 @zext_mul12884901888(i32 signext %a) {
 ; RV64I-LABEL: zext_mul12884901888:
 ; RV64I:       # %bb.0:
@@ -864,7 +865,6 @@ define i64 @zext_mul12884901888(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul12884901888:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    andi a0, a0, -1
 ; RV64ZBA-NEXT:    sh1add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
 ; RV64ZBA-NEXT:    ret
@@ -874,7 +874,6 @@ define i64 @zext_mul12884901888(i32 signext %a) {
 }
 
 ; We can't use slli.uw becaues the shift amount is more than 31.
-; FIXME: The zext.w is unneeded.
 define i64 @zext_mul21474836480(i32 signext %a) {
 ; RV64I-LABEL: zext_mul21474836480:
 ; RV64I:       # %bb.0:
@@ -887,7 +886,6 @@ define i64 @zext_mul21474836480(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul21474836480:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    andi a0, a0, -1
 ; RV64ZBA-NEXT:    sh2add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
 ; RV64ZBA-NEXT:    ret
@@ -897,7 +895,6 @@ define i64 @zext_mul21474836480(i32 signext %a) {
 }
 
 ; We can't use slli.uw becaues the shift amount is more than 31.
-; FIXME: The zext.w is unneeded.
 define i64 @zext_mul38654705664(i32 signext %a) {
 ; RV64I-LABEL: zext_mul38654705664:
 ; RV64I:       # %bb.0:
@@ -910,7 +907,6 @@ define i64 @zext_mul38654705664(i32 signext %a) {
 ;
 ; RV64ZBA-LABEL: zext_mul38654705664:
 ; RV64ZBA:       # %bb.0:
-; RV64ZBA-NEXT:    andi a0, a0, -1
 ; RV64ZBA-NEXT:    sh3add a0, a0, a0
 ; RV64ZBA-NEXT:    slli a0, a0, 32
 ; RV64ZBA-NEXT:    ret
@@ -2739,3 +2735,121 @@ define i64 @mul_neg8(i64 %a) {
   %c = mul i64 %a, -8
   ret i64 %c
 }
+
+define i64 @bext_mul12(i32 %1, i32 %2) {
+; RV64I-LABEL: bext_mul12:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    srlw a0, a0, a1
+; RV64I-NEXT:    andi a0, a0, 1
+; RV64I-NEXT:    li a1, 12
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBANOZBB-LABEL: bext_mul12:
+; RV64ZBANOZBB:       # %bb.0: # %entry
+; RV64ZBANOZBB-NEXT:    srlw a0, a0, a1
+; RV64ZBANOZBB-NEXT:    andi a0, a0, 1
+; RV64ZBANOZBB-NEXT:    sh1add a0, a0, a0
+; RV64ZBANOZBB-NEXT:    slli a0, a0, 2
+; RV64ZBANOZBB-NEXT:    ret
+;
+; RV64ZBAZBBNOZBS-LABEL: bext_mul12:
+; RV64ZBAZBBNOZBS:       # %bb.0: # %entry
+; RV64ZBAZBBNOZBS-NEXT:    srlw a0, a0, a1
+; RV64ZBAZBBNOZBS-NEXT:    andi a0, a0, 1
+; RV64ZBAZBBNOZBS-NEXT:    sh1add a0, a0, a0
+; RV64ZBAZBBNOZBS-NEXT:    slli a0, a0, 2
+; RV64ZBAZBBNOZBS-NEXT:    ret
+;
+; RV64ZBAZBBZBS-LABEL: bext_mul12:
+; RV64ZBAZBBZBS:       # %bb.0: # %entry
+; RV64ZBAZBBZBS-NEXT:    bext a0, a0, a1
+; RV64ZBAZBBZBS-NEXT:    sh1add a0, a0, a0
+; RV64ZBAZBBZBS-NEXT:    slli a0, a0, 2
+; RV64ZBAZBBZBS-NEXT:    ret
+entry:
+  %3 = lshr i32 %1, %2
+  %4 = and i32 %3, 1
+  %5 = zext nneg i32 %4 to i64
+  %6 = mul i64 %5, 12
+  ret i64 %6
+}
+
+define i64 @bext_mul45(i32 %1, i32 %2) {
+; RV64I-LABEL: bext_mul45:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    srlw a0, a0, a1
+; RV64I-NEXT:    andi a0, a0, 1
+; RV64I-NEXT:    li a1, 45
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBANOZBB-LABEL: bext_mul45:
+; RV64ZBANOZBB:       # %bb.0: # %entry
+; RV64ZBANOZBB-NEXT:    srlw a0, a0, a1
+; RV64ZBANOZBB-NEXT:    andi a0, a0, 1
+; RV64ZBANOZBB-NEXT:    sh2add a0, a0, a0
+; RV64ZBANOZBB-NEXT:    sh3add a0, a0, a0
+; RV64ZBANOZBB-NEXT:    ret
+;
+; RV64ZBAZBBNOZBS-LABEL: bext_mul45:
+; RV64ZBAZBBNOZBS:       # %bb.0: # %entry
+; RV64ZBAZBBNOZBS-NEXT:    srlw a0, a0, a1
+; RV64ZBAZBBNOZBS-NEXT:    andi a0, a0, 1
+; RV64ZBAZBBNOZBS-NEXT:    sh2add a0, a0, a0
+; RV64ZBAZBBNOZBS-NEXT:    sh3add a0, a0, a0
+; RV64ZBAZBBNOZBS-NEXT:    ret
+;
+; RV64ZBAZBBZBS-LABEL: bext_mul45:
+; RV64ZBAZBBZBS:       # %bb.0: # %entry
+; RV64ZBAZBBZBS-NEXT:    bext a0, a0, a1
+; RV64ZBAZBBZBS-NEXT:    sh2add a0, a0, a0
+; RV64ZBAZBBZBS-NEXT:    sh3add a0, a0, a0
+; RV64ZBAZBBZBS-NEXT:    ret
+entry:
+  %3 = lshr i32 %1, %2
+  %4 = and i32 %3, 1
+  %5 = zext nneg i32 %4 to i64
+  %6 = mul i64 %5, 45
+  ret i64 %6
+}
+
+define i64 @bext_mul132(i32 %1, i32 %2) {
+; RV64I-LABEL: bext_mul132:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    srlw a0, a0, a1
+; RV64I-NEXT:    andi a0, a0, 1
+; RV64I-NEXT:    li a1, 132
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBANOZBB-LABEL: bext_mul132:
+; RV64ZBANOZBB:       # %bb.0: # %entry
+; RV64ZBANOZBB-NEXT:    srlw a0, a0, a1
+; RV64ZBANOZBB-NEXT:    andi a0, a0, 1
+; RV64ZBANOZBB-NEXT:    slli a1, a0, 7
+; RV64ZBANOZBB-NEXT:    sh2add a0, a0, a1
+; RV64ZBANOZBB-NEXT:    ret
+;
+; RV64ZBAZBBNOZBS-LABEL: bext_mul132:
+; RV64ZBAZBBNOZBS:       # %bb.0: # %entry
+; RV64ZBAZBBNOZBS-NEXT:    srlw a0, a0, a1
+; RV64ZBAZBBNOZBS-NEXT:    andi a0, a0, 1
+; RV64ZBAZBBNOZBS-NEXT:    slli a1, a0, 7
+; RV64ZBAZBBNOZBS-NEXT:    sh2add a0, a0, a1
+; RV64ZBAZBBNOZBS-NEXT:    ret
+;
+; RV64ZBAZBBZBS-LABEL: bext_mul132:
+; RV64ZBAZBBZBS:       # %bb.0: # %entry
+; RV64ZBAZBBZBS-NEXT:    bext a0, a0, a1
+; RV64ZBAZBBZBS-NEXT:    slli a1, a0, 7
+; RV64ZBAZBBZBS-NEXT:    sh2add a0, a0, a1
+; RV64ZBAZBBZBS-NEXT:    ret
+entry:
+  %3 = lshr i32 %1, %2
+  %4 = and i32 %3, 1
+  %5 = zext nneg i32 %4 to i64
+  %6 = mul i64 %5, 132
+  ret i64 %6
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/commutable.ll b/llvm/test/CodeGen/RISCV/rvv/commutable.ll
index b59df3b743cd..d94b529bac01 100644
--- a/llvm/test/CodeGen/RISCV/rvv/commutable.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/commutable.ll
@@ -649,3 +649,173 @@ entry:
   ret <vscale x 1 x i64> %ret
 }
 
+; vsadd.vv
+declare <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, iXLen);
+define <vscale x 1 x i64> @commutable_vsadd_vv(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
+; CHECK-LABEL: commutable_vsadd_vv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v8, v8, v9
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2)
+  %b = call <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, iXLen %2)
+  %ret = add <vscale x 1 x i64> %a, %b
+  ret <vscale x 1 x i64> %ret
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vsadd_vv_masked(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2) {
+; CHECK-LABEL: commutable_vsadd_vv_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsadd.vv v10, v8, v9, v0.t
+; CHECK-NEXT:    vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    ret
+  %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2, iXLen 1)
+  %b = call <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, <vscale x 1 x i1> %mask, iXLen %2, iXLen 1)
+  %ret = add <vscale x 1 x i64> %a, %b
+  ret <vscale x 1 x i64> %ret
+}
+
+; vsaddu.vv
+declare <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, iXLen);
+define <vscale x 1 x i64> @commutable_vsaddu_vv(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
+; CHECK-LABEL: commutable_vsaddu_vv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2)
+  %b = call <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, iXLen %2)
+  %ret = add <vscale x 1 x i64> %a, %b
+  ret <vscale x 1 x i64> %ret
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vsaddu_vv_masked(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2) {
+; CHECK-LABEL: commutable_vsaddu_vv_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    vsaddu.vv v10, v8, v9, v0.t
+; CHECK-NEXT:    vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    ret
+  %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2, iXLen 1)
+  %b = call <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, <vscale x 1 x i1> %mask, iXLen %2, iXLen 1)
+  %ret = add <vscale x 1 x i64> %a, %b
+  ret <vscale x 1 x i64> %ret
+}
+
+; vaadd.vv
+declare <vscale x 1 x i64> @llvm.riscv.vaadd.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vaadd_vv(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
+; CHECK-LABEL: commutable_vaadd_vv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaadd.vv v8, v8, v9
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vaadd.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen 0, iXLen %2)
+  %b = call <vscale x 1 x i64> @llvm.riscv.vaadd.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, iXLen 0, iXLen %2)
+  %ret = add <vscale x 1 x i64> %a, %b
+  ret <vscale x 1 x i64> %ret
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vaadd.mask.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, iXLen, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vaadd_vv_masked(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2) {
+; CHECK-LABEL: commutable_vaadd_vv_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaadd.vv v10, v8, v9, v0.t
+; CHECK-NEXT:    vaadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    ret
+  %a = call <vscale x 1 x i64> @llvm.riscv.vaadd.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen 0, iXLen %2, iXLen 1)
+  %b = call <vscale x 1 x i64> @llvm.riscv.vaadd.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, <vscale x 1 x i1> %mask, iXLen 0, iXLen %2, iXLen 1)
+  %ret = add <vscale x 1 x i64> %a, %b
+  ret <vscale x 1 x i64> %ret
+}
+
+; vaaddu.vv
+declare <vscale x 1 x i64> @llvm.riscv.vaaddu.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vaaddu_vv(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
+; CHECK-LABEL: commutable_vaaddu_vv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaaddu.vv v8, v8, v9
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vaaddu.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen 0, iXLen %2)
+  %b = call <vscale x 1 x i64> @llvm.riscv.vaaddu.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, iXLen 0, iXLen %2)
+  %ret = add <vscale x 1 x i64> %a, %b
+  ret <vscale x 1 x i64> %ret
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vaaddu.mask.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, iXLen, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vaaddu_vv_masked(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2) {
+; CHECK-LABEL: commutable_vaaddu_vv_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vaaddu.vv v10, v8, v9, v0.t
+; CHECK-NEXT:    vaaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    ret
+  %a = call <vscale x 1 x i64> @llvm.riscv.vaaddu.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen 0, iXLen %2, iXLen 1)
+  %b = call <vscale x 1 x i64> @llvm.riscv.vaaddu.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, <vscale x 1 x i1> %mask, iXLen 0, iXLen %2, iXLen 1)
+  %ret = add <vscale x 1 x i64> %a, %b
+  ret <vscale x 1 x i64> %ret
+}
+
+; vsmul.vv
+declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vsmul_vv(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
+; CHECK-LABEL: commutable_vsmul_vv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsmul.vv v8, v8, v9
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen 0, iXLen %2)
+  %b = call <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, iXLen 0, iXLen %2)
+  %ret = add <vscale x 1 x i64> %a, %b
+  ret <vscale x 1 x i64> %ret
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, iXLen, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vsmul_vv_masked(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2) {
+; CHECK-LABEL: commutable_vsmul_vv_masked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsmul.vv v10, v8, v9, v0.t
+; CHECK-NEXT:    vsmul.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    ret
+  %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen 0, iXLen %2, iXLen 1)
+  %b = call <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, <vscale x 1 x i1> %mask, iXLen 0, iXLen %2, iXLen 1)
+  %ret = add <vscale x 1 x i64> %a, %b
+  ret <vscale x 1 x i64> %ret
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/copyprop.mir b/llvm/test/CodeGen/RISCV/rvv/copyprop.mir
index eb4c8bfdd67f..95c227518f5c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/copyprop.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/copyprop.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v -start-after=finalize-isel | FileCheck %s
+# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v,+xsfvcp -start-after=finalize-isel | FileCheck %s
 
 --- |
   define void @foo() {
@@ -7,23 +7,22 @@
   ; CHECK:       # %bb.0: # %entry
   ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
   ; CHECK-NEXT:    vmsne.vi v0, v8, 0
-  ; CHECK-NEXT:    vsll.vi v9, v8, 5
-  ; CHECK-NEXT:    vmerge.vim v9, v9, -1, v0
-  ; CHECK-NEXT:    csrwi vxrm, 0
-  ; CHECK-NEXT:    vssra.vi v8, v8, 2
+  ; CHECK-NEXT:    vsll.vi v8, v8, 5
+  ; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
+  ; CHECK-NEXT:    sf.vc.v.x 3, 31, v9, a1
   ; CHECK-NEXT:    bgeu a0, zero, .LBB0_3
   ; CHECK-NEXT:  # %bb.1: # %entry
   ; CHECK-NEXT:    li a2, 128
   ; CHECK-NEXT:    bltu a0, a2, .LBB0_4
   ; CHECK-NEXT:  .LBB0_2: # %entry
-  ; CHECK-NEXT:    vse64.v v8, (a1)
+  ; CHECK-NEXT:    vse64.v v9, (a1)
   ; CHECK-NEXT:    ret
   ; CHECK-NEXT:  .LBB0_3:
-  ; CHECK-NEXT:    vmv.v.i v8, 0
+  ; CHECK-NEXT:    vmv.v.i v9, 0
   ; CHECK-NEXT:    li a2, 128
   ; CHECK-NEXT:    bgeu a0, a2, .LBB0_2
   ; CHECK-NEXT:  .LBB0_4: # %entry
-  ; CHECK-NEXT:    vse64.v v9, (a1)
+  ; CHECK-NEXT:    vse64.v v8, (a1)
   ; CHECK-NEXT:    ret
   entry:
     ret void
@@ -51,7 +50,7 @@ body:             |
     %26:vrnov0 = IMPLICIT_DEF
     %25:vrnov0 = PseudoVMERGE_VIM_M1 %26, %17, -1, $v0, 1, 6 /* e64 */
     %pt8:vr = IMPLICIT_DEF
-    %29:vr = PseudoVSSRA_VI_M1 %pt8, %3, 2, 0, 1, 6 /* e64 */, 0
+    %29:vr = PseudoVC_V_X_SE_M1 3, 31, %2, 1, 6 /* e64 */, implicit-def dead $vcix_state, implicit $vcix_state
     %pt9:vr = IMPLICIT_DEF
     %30:vr = PseudoVMV_V_I_M1 %pt9, 0, 1, 6 /* e64 */, 0
     BGEU %1, $x0, %bb.2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll
index 2874db6debd7..875f4f239028 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll
@@ -26,17 +26,13 @@ define <512 x i8> @single_source(<512 x i8> %a) {
 ; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a0
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v17, v16, 5
-; CHECK-NEXT:    vmv.x.s a0, v17
-; CHECK-NEXT:    vmv.s.x v24, a0
+; CHECK-NEXT:    vslidedown.vi v24, v16, 5
 ; CHECK-NEXT:    li a0, 432
 ; CHECK-NEXT:    li a1, 431
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v24, a1
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v16, v16, 4
-; CHECK-NEXT:    vmv.x.s a0, v16
-; CHECK-NEXT:    vmv.s.x v16, a0
 ; CHECK-NEXT:    li a0, 466
 ; CHECK-NEXT:    li a1, 465
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, tu, ma
@@ -109,20 +105,17 @@ define <512 x i8> @two_source(<512 x i8> %a, <512 x i8> %b) {
 ; CHECK-NEXT:    addi a1, sp, 512
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v24, 5
 ; CHECK-NEXT:    vmv.x.s a1, v24
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a1
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v25, v24, 5
-; CHECK-NEXT:    vmv.x.s a1, v25
-; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    li a1, 432
 ; CHECK-NEXT:    li a2, 431
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v0, a2
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v24, v24, 4
-; CHECK-NEXT:    vmv.x.s a1, v24
-; CHECK-NEXT:    vmv.s.x v24, a1
 ; CHECK-NEXT:    li a1, 466
 ; CHECK-NEXT:    li a2, 465
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
@@ -130,9 +123,9 @@ define <512 x i8> @two_source(<512 x i8> %a, <512 x i8> %b) {
 ; CHECK-NEXT:    vslideup.vx v8, v24, a2
 ; CHECK-NEXT:    vmv.s.x v24, a1
 ; CHECK-NEXT:    li a1, 478
+; CHECK-NEXT:    li a2, 477
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
 ; CHECK-NEXT:    lbu a1, 1012(sp)
-; CHECK-NEXT:    li a2, 477
 ; CHECK-NEXT:    vslideup.vx v8, v24, a2
 ; CHECK-NEXT:    vmv.s.x v24, a1
 ; CHECK-NEXT:    li a1, 501
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 83edd49bc963..1587f770f87c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -35,7 +35,7 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
 ; CHECK-NEXT:    vmv.v.v v0, v9
 ; CHECK-NEXT:    ret
   %vec = load <32 x i1>, ptr %p
-  %retval = call {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1> %vec)
+  %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
   ret {<16 x i1>, <16 x i1>} %retval
 }
 
@@ -46,7 +46,7 @@ define {<16 x i8>, <16 x i8>} @vector_deinterleave_load_v16i8_v32i8(ptr %p) {
 ; CHECK-NEXT:    vlseg2e8.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <32 x i8>, ptr %p
-  %retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec)
+  %retval = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec)
   ret {<16 x i8>, <16 x i8>} %retval
 }
 
@@ -62,7 +62,7 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_load_v8i16_v16i16_align1(ptr
 ; CHECK-NEXT:    vnsrl.wi v9, v10, 16
 ; CHECK-NEXT:    ret
   %vec = load <16 x i16>, ptr %p, align 1
-  %retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec)
+  %retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec)
   ret {<8 x i16>, <8 x i16>} %retval
 }
 
@@ -73,7 +73,7 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_load_v8i16_v16i16(ptr %p) {
 ; CHECK-NEXT:    vlseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <16 x i16>, ptr %p
-  %retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec)
+  %retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec)
   ret {<8 x i16>, <8 x i16>} %retval
 }
 
@@ -84,7 +84,7 @@ define {<4 x i32>, <4 x i32>} @vector_deinterleave_load_v4i32_vv8i32(ptr %p) {
 ; CHECK-NEXT:    vlseg2e32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <8 x i32>, ptr %p
-  %retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec)
+  %retval = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32> %vec)
   ret {<4 x i32>, <4 x i32>} %retval
 }
 
@@ -95,15 +95,15 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_load_v2i64_v4i64(ptr %p) {
 ; CHECK-NEXT:    vlseg2e64.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <4 x i64>, ptr %p
-  %retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
+  %retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec)
   ret {<2 x i64>, <2 x i64>} %retval
 }
 
-declare {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1>)
-declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
-declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
-declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
-declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
+declare {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1>)
+declare {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8>)
+declare {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16>)
+declare {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+declare {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64>)
 
 ; Floats
 
@@ -114,7 +114,7 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_load_v2f16_v4f16(ptr %p) {
 ; CHECK-NEXT:    vlseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <4 x half>, ptr %p
-  %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
+  %retval = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec)
   ret {<2 x half>, <2 x half>} %retval
 }
 
@@ -125,7 +125,7 @@ define {<4 x half>, <4 x half>} @vector_deinterleave_load_v4f16_v8f16(ptr %p) {
 ; CHECK-NEXT:    vlseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <8 x half>, ptr %p
-  %retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec)
+  %retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec)
   ret {<4 x half>, <4 x half>} %retval
 }
 
@@ -136,7 +136,7 @@ define {<2 x float>, <2 x float>} @vector_deinterleave_load_v2f32_v4f32(ptr %p)
 ; CHECK-NEXT:    vlseg2e32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <4 x float>, ptr %p
-  %retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec)
+  %retval = call {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float> %vec)
   ret {<2 x float>, <2 x float>} %retval
 }
 
@@ -147,7 +147,7 @@ define {<8 x half>, <8 x half>} @vector_deinterleave_load_v8f16_v16f16(ptr %p) {
 ; CHECK-NEXT:    vlseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <16 x half>, ptr %p
-  %retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec)
+  %retval = call {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half> %vec)
   ret {<8 x half>, <8 x half>} %retval
 }
 
@@ -158,7 +158,7 @@ define {<4 x float>, <4 x float>} @vector_deinterleave_load_v4f32_v8f32(ptr %p)
 ; CHECK-NEXT:    vlseg2e32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <8 x float>, ptr %p
-  %retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec)
+  %retval = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %vec)
   ret  {<4 x float>, <4 x float>} %retval
 }
 
@@ -169,13 +169,13 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_load_v2f64_v4f64(ptr %p
 ; CHECK-NEXT:    vlseg2e64.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <4 x double>, ptr %p
-  %retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
+  %retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec)
   ret {<2 x double>, <2 x double>} %retval
 }
 
-declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
-declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
-declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>)
-declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
-declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
-declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
+declare {<2 x half>,<2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half>)
+declare {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half>)
+declare {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float>)
+declare {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half>)
+declare {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float>)
+declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index ab6df1d3e883..53de1a875535 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -9,39 +9,63 @@
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV64VLS %s
 
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v2i32_0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m4, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v2i32_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 2, e32, m4, tu, ma
+; VLA-NEXT:    vmv.v.v v8, v12
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v2i32_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; VLS-NEXT:    vmv.v.v v8, v12
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 0)
   ret <vscale x 8 x i32> %v
 }
 
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v2i32_2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 2
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v2i32_2:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 2
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v2i32_2:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v8, v12, 2
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 2)
   ret <vscale x 8 x i32> %v
 }
 
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v2i32_6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 6
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v2i32_6:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 6
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v2i32_6:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v9, v12, 2
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 6)
   ret <vscale x 8 x i32> %v
@@ -58,9 +82,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %
 ;
 ; VLS-LABEL: insert_nxv8i32_v8i32_0:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vl2re32.v v12, (a0)
-; VLS-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; VLS-NEXT:    vmv.v.v v8, v12
+; VLS-NEXT:    vl2re32.v v8, (a0)
 ; VLS-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
@@ -78,9 +100,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, ptr %
 ;
 ; VLS-LABEL: insert_nxv8i32_v8i32_8:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vl2re32.v v12, (a0)
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v12, 8
+; VLS-NEXT:    vl2re32.v v10, (a0)
 ; VLS-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
@@ -98,6 +118,31 @@ define <vscale x 8 x i32> @insert_nxv8i32_undef_v2i32_0(ptr %svp) {
   ret <vscale x 8 x i32> %v
 }
 
+define <vscale x 2 x i32> @insert_nxv8i32_v4i32_0(<vscale x 2 x i32> %vec, <4 x i32> %subvec) {
+; VLA-LABEL: insert_nxv8i32_v4i32_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
+; VLA-NEXT:    vmv.v.v v8, v9
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v4i32_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v8, v9
+; VLS-NEXT:    ret
+  %v = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v4i32(<vscale x 2 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <vscale x 2 x i32> %v
+}
+
+
+define <4 x i32> @insert_v4i32_v4i32_0(<4 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: insert_v4i32_v4i32_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <4 x i32> @llvm.vector.insert.v4i32.v4i32(<4 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <4 x i32> %v
+}
+
 define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) {
 ; VLA-LABEL: insert_v4i32_v2i32_0:
 ; VLA:       # %bb.0:
@@ -175,6 +220,31 @@ define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) {
   ret void
 }
 
+; This tests the code path in RISCVISelDAGToDAG::Select where we select an
+; insert_subvector with a fixed vector and fixed subvector type. The phi here is
+; used to prevent the fixed insert_subvector from being combined away into a
+; scalable insert_subvector.
+define <4 x i32> @insert_v4i32_undef_v2i32_0_phi(<2 x i32> %subvec, i1 %cond) {
+; CHECK-LABEL: insert_v4i32_undef_v2i32_0_phi:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    bnez a0, .LBB11_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:  .LBB11_2: # %bar
+; CHECK-NEXT:    ret
+entry:
+  br i1 %cond, label %foo, label %bar
+foo:
+  %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %subvec, i64 0)
+  br label %bar
+bar:
+  %w = phi <4 x i32> [%v, %foo], [zeroinitializer, %entry]
+  ret <4 x i32> %w
+}
+
+
 define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; VLA-LABEL: insert_v8i32_v2i32_0:
 ; VLA:       # %bb.0:
@@ -193,7 +263,7 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLS-NEXT:    vle32.v v8, (a1)
 ; VLS-NEXT:    vl2re32.v v10, (a0)
-; VLS-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; VLS-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; VLS-NEXT:    vmv.v.v v10, v8
 ; VLS-NEXT:    vs2r.v v10, (a0)
 ; VLS-NEXT:    ret
@@ -220,11 +290,11 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
 ; VLS-LABEL: insert_v8i32_v2i32_2:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; VLS-NEXT:    vl2re32.v v8, (a0)
-; VLS-NEXT:    vle32.v v10, (a1)
-; VLS-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v10, 2
-; VLS-NEXT:    vs2r.v v8, (a0)
+; VLS-NEXT:    vle32.v v8, (a1)
+; VLS-NEXT:    vl2re32.v v10, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v10, v8, 2
+; VLS-NEXT:    vs2r.v v10, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
@@ -247,11 +317,11 @@ define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) {
 ; VLS-LABEL: insert_v8i32_v2i32_6:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; VLS-NEXT:    vl2re32.v v8, (a0)
-; VLS-NEXT:    vle32.v v10, (a1)
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v10, 6
-; VLS-NEXT:    vs2r.v v8, (a0)
+; VLS-NEXT:    vle32.v v8, (a1)
+; VLS-NEXT:    vl2re32.v v10, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v11, v8, 2
+; VLS-NEXT:    vs2r.v v10, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
@@ -274,9 +344,9 @@ define void @insert_v8i32_undef_v2i32_6(ptr %vp, ptr %svp) {
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLS-NEXT:    vle32.v v8, (a1)
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v10, v8, 6
-; VLS-NEXT:    vs2r.v v10, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v9, v8, 2
+; VLS-NEXT:    vs2r.v v8, (a0)
 ; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6)
@@ -542,9 +612,7 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) {
 ; VLS-LABEL: insert_v2i64_nxv16i64:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vl1re64.v v8, (a0)
-; VLS-NEXT:    vl1re64.v v16, (a1)
-; VLS-NEXT:    vsetivli zero, 6, e64, m8, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 4
+; VLS-NEXT:    vl1re64.v v10, (a1)
 ; VLS-NEXT:    vs8r.v v8, (a2)
 ; VLS-NEXT:    ret
   %sv0 = load <2 x i64>, ptr %psv0
@@ -586,10 +654,8 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) {
 ;
 ; VLS-LABEL: insert_v2i64_nxv16i64_lo2:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vl1re64.v v8, (a0)
-; VLS-NEXT:    vsetivli zero, 4, e64, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v8, 2
-; VLS-NEXT:    vs8r.v v16, (a1)
+; VLS-NEXT:    vl1re64.v v9, (a0)
+; VLS-NEXT:    vs8r.v v8, (a1)
 ; VLS-NEXT:    ret
   %sv = load <2 x i64>, ptr %psv
   %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
@@ -633,7 +699,6 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
-;
 ; RV64-LABEL: insert_v2i64_nxv16i64_hi:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
index 9161cedd58e3..8de9cc25ae09 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
@@ -23,7 +23,7 @@ define void @vector_interleave_store_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b, ptr
 ; CHECK-NEXT:    vmsne.vi v8, v12, 0
 ; CHECK-NEXT:    vsm.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
+  %res = call <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
   store <32 x i1> %res, ptr %p
   ret void
 }
@@ -40,7 +40,7 @@ define void @vector_interleave_store_v16i16_v8i16_align1(<8 x i16> %a, <8 x i16>
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vse8.v v10, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
+  %res = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
   store <16 x i16> %res, ptr %p, align 1
   ret void
 }
@@ -51,7 +51,7 @@ define void @vector_interleave_store_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b, pt
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vsseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
+  %res = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
   store <16 x i16> %res, ptr %p
   ret void
 }
@@ -62,7 +62,7 @@ define void @vector_interleave_store_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b, ptr
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vsseg2e32.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+  %res = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
   store <8 x i32> %res, ptr %p
   ret void
 }
@@ -73,15 +73,15 @@ define void @vector_interleave_store_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b, ptr
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vsseg2e64.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b)
+  %res = call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b)
   store <4 x i64> %res, ptr %p
   ret void
 }
 
-declare <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
-declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
-declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
-declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+declare <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
+declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
 
 ; Floats
 
@@ -91,7 +91,7 @@ define void @vector_interleave_store_v4f16_v2f16(<2 x half> %a, <2 x half> %b, p
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
+  %res = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
   store <4 x half> %res, ptr %p
   ret void
 }
@@ -102,7 +102,7 @@ define void @vector_interleave_store_v8f16_v4f16(<4 x half> %a, <4 x half> %b, p
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
+  %res = call <8 x half> @llvm.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
   store <8 x half> %res, ptr %p
   ret void
 }
@@ -113,7 +113,7 @@ define void @vector_interleave_store_v4f32_v2f32(<2 x float> %a, <2 x float> %b,
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsseg2e32.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
+  %res = call <4 x float> @llvm.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
   store <4 x float> %res, ptr %p
   ret void
 }
@@ -124,7 +124,7 @@ define void @vector_interleave_store_v16f16_v8f16(<8 x half> %a, <8 x half> %b,
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vsseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
+  %res = call <16 x half> @llvm.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
   store <16 x half> %res, ptr %p
   ret void
 }
@@ -135,7 +135,7 @@ define void @vector_interleave_store_v8f32_v4f32(<4 x float> %a, <4 x float> %b,
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vsseg2e32.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
+  %res = call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
   store <8 x float> %res, ptr %p
   ret void
 }
@@ -146,15 +146,15 @@ define void @vector_interleave_store_v4f64_v2f64(<2 x double> %a, <2 x double> %
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vsseg2e64.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
+  %res = call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
   store <4 x double> %res, ptr %p
   ret void
 }
 
 
-declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
-declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
-declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>)
-declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
-declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
-declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+declare <4 x float> @llvm.vector.interleave2.v4f32(<2 x float>, <2 x float>)
+declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>)
+declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index 2495178ea762..02a989a96996 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -574,24 +574,14 @@ define signext i32 @vpreduce_add_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m
 declare i32 @llvm.vp.reduce.umax.v2i32(i32, <2 x i32>, <2 x i1>, i32)
 
 define signext i32 @vpreduce_umax_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vpreduce_umax_v2i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v9, a0
-; RV32-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; RV32-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vpreduce_umax_v2i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, -1
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.s.x v9, a0
-; RV64-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; RV64-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
-; RV64-NEXT:    vmv.x.s a0, v9
-; RV64-NEXT:    ret
+; CHECK-LABEL: vpreduce_umax_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.umax.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
   ret i32 %r
 }
@@ -614,24 +604,14 @@ define signext i32 @vpreduce_smax_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %
 declare i32 @llvm.vp.reduce.umin.v2i32(i32, <2 x i32>, <2 x i1>, i32)
 
 define signext i32 @vpreduce_umin_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vpreduce_umin_v2i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v9, a0
-; RV32-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; RV32-NEXT:    vredminu.vs v9, v8, v9, v0.t
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vpreduce_umin_v2i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, -1
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.s.x v9, a0
-; RV64-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; RV64-NEXT:    vredminu.vs v9, v8, v9, v0.t
-; RV64-NEXT:    vmv.x.s a0, v9
-; RV64-NEXT:    ret
+; CHECK-LABEL: vpreduce_umin_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vredminu.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.umin.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl)
   ret i32 %r
 }
@@ -714,24 +694,14 @@ define signext i32 @vpreduce_add_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m
 declare i32 @llvm.vp.reduce.umax.v4i32(i32, <4 x i32>, <4 x i1>, i32)
 
 define signext i32 @vpreduce_umax_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vpreduce_umax_v4i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v9, a0
-; RV32-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; RV32-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vpreduce_umax_v4i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, -1
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.s.x v9, a0
-; RV64-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; RV64-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
-; RV64-NEXT:    vmv.x.s a0, v9
-; RV64-NEXT:    ret
+; CHECK-LABEL: vpreduce_umax_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.umax.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
   ret i32 %r
 }
@@ -754,24 +724,14 @@ define signext i32 @vpreduce_smax_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %
 declare i32 @llvm.vp.reduce.umin.v4i32(i32, <4 x i32>, <4 x i1>, i32)
 
 define signext i32 @vpreduce_umin_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vpreduce_umin_v4i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v9, a0
-; RV32-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; RV32-NEXT:    vredminu.vs v9, v8, v9, v0.t
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vpreduce_umin_v4i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, -1
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.s.x v9, a0
-; RV64-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; RV64-NEXT:    vredminu.vs v9, v8, v9, v0.t
-; RV64-NEXT:    vmv.x.s a0, v9
-; RV64-NEXT:    ret
+; CHECK-LABEL: vpreduce_umin_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vredminu.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.umin.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)
   ret i32 %r
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 98e6b8f2dd76..609b4e982489 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -1,30 +1,44 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefix=VLA %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefix=VLA %s
 
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefix=VLS %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefix=VLS %s
 
 define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: concat_2xv4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v10, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_2xv4i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv1r.v v10, v9
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_2xv4i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i32> %ab
 }
 
 define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
-; CHECK-LABEL: concat_4xv2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_4xv2i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v10, v11, 2
+; VLA-NEXT:    vslideup.vi v8, v9, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_4xv2i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v13, v10
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vmv1r.v v12, v8
+; VLS-NEXT:    vslideup.vi v13, v11, 2
+; VLS-NEXT:    vslideup.vi v12, v9, 2
+; VLS-NEXT:    vmv2r.v v8, v12
+; VLS-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -32,21 +46,38 @@ define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
 }
 
 define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x i32> %d, <1 x i32> %e, <1 x i32> %f, <1 x i32> %g, <1 x i32> %h) {
-; CHECK-LABEL: concat_8xv1i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v14, v15, 1
-; CHECK-NEXT:    vslideup.vi v12, v13, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 1
-; CHECK-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 4
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_8xv1i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vslideup.vi v14, v15, 1
+; VLA-NEXT:    vslideup.vi v12, v13, 1
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 2
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vslideup.vi v10, v11, 1
+; VLA-NEXT:    vslideup.vi v8, v9, 1
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 4
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_8xv1i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v17, v12
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vslideup.vi v14, v15, 1
+; VLS-NEXT:    vmv1r.v v16, v8
+; VLS-NEXT:    vslideup.vi v17, v13, 1
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v17, v14, 2
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vslideup.vi v10, v11, 1
+; VLS-NEXT:    vslideup.vi v16, v9, 1
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v16, v10, 2
+; VLS-NEXT:    vmv2r.v v8, v16
+; VLS-NEXT:    ret
   %ab = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
   %cd = shufflevector <1 x i32> %c, <1 x i32> %d, <2 x i32> <i32 0, i32 1>
   %abcd = shufflevector <2 x i32> %ab, <2 x i32> %cd, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -58,28 +89,36 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x
 }
 
 define <16 x i32> @concat_2xv8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: concat_2xv8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv2r.v v12, v10
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_2xv8i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv2r.v v12, v10
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_2xv8i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    ret
   %v = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i32> %v
 }
 
 define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
-; CHECK-LABEL: concat_4xv4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v14, v11
-; CHECK-NEXT:    vmv1r.v v12, v10
-; CHECK-NEXT:    vmv1r.v v10, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 4
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_4xv4i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv1r.v v14, v11
+; VLA-NEXT:    vmv1r.v v12, v10
+; VLA-NEXT:    vmv1r.v v10, v9
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 4
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_4xv4i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %abcd = shufflevector <8 x i32> %ab, <8 x i32> %cd, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -87,21 +126,35 @@ define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 }
 
 define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d, <2 x i32> %e, <2 x i32> %f, <2 x i32> %g, <2 x i32> %h) {
-; CHECK-LABEL: concat_8xv2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v14, v15, 2
-; CHECK-NEXT:    vslideup.vi v12, v13, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_8xv2i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v14, v15, 2
+; VLA-NEXT:    vslideup.vi v12, v13, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 4
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v10, v11, 2
+; VLA-NEXT:    vslideup.vi v8, v9, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_8xv2i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv1r.v v19, v14
+; VLS-NEXT:    vmv1r.v v18, v12
+; VLS-NEXT:    vmv1r.v v17, v10
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vmv1r.v v16, v8
+; VLS-NEXT:    vslideup.vi v19, v15, 2
+; VLS-NEXT:    vslideup.vi v18, v13, 2
+; VLS-NEXT:    vslideup.vi v17, v11, 2
+; VLS-NEXT:    vslideup.vi v16, v9, 2
+; VLS-NEXT:    vmv4r.v v8, v16
+; VLS-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -123,9 +176,6 @@ define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) {
 ;
 ; VLS-LABEL: concat_2xv16i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv4r.v v16, v12
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <32 x i32> %ab
@@ -147,14 +197,6 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x
 ;
 ; VLS-LABEL: concat_4xv8i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv2r.v v20, v14
-; VLS-NEXT:    vmv2r.v v16, v12
-; VLS-NEXT:    vmv2r.v v12, v10
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v20, 8
-; VLS-NEXT:    vslideup.vi v8, v12, 8
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -189,25 +231,6 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 ;
 ; VLS-LABEL: concat_8xv4i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv1r.v v18, v15
-; VLS-NEXT:    vmv1r.v v20, v14
-; VLS-NEXT:    vmv1r.v v22, v13
-; VLS-NEXT:    vmv1r.v v16, v12
-; VLS-NEXT:    vmv1r.v v14, v11
-; VLS-NEXT:    vmv1r.v v12, v10
-; VLS-NEXT:    vmv1r.v v10, v9
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v20, v18, 4
-; VLS-NEXT:    vslideup.vi v16, v22, 4
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v20, 8
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v12, v14, 4
-; VLS-NEXT:    vslideup.vi v8, v10, 4
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v12, 8
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll
index d4c0477408fd..a81f740f1739 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll
@@ -16,8 +16,8 @@ define <256 x i1> @reverse_v256i1(<256 x i1> %a) {
 ; CHECK-NEXT:    vrgather.vv v12, v8, v10
 ; CHECK-NEXT:    vmsne.vi v0, v12, 0
 ; CHECK-NEXT:    ret
-  %res = call <256 x i1> @llvm.experimental.vector.reverse.v256i1(<256 x i1> %a)
+  %res = call <256 x i1> @llvm.vector.reverse.v256i1(<256 x i1> %a)
   ret <256 x i1> %res
 }
 
-declare <256 x i1> @llvm.experimental.vector.reverse.v256i1(<256 x i1>)
+declare <256 x i1> @llvm.vector.reverse.v256i1(<256 x i1>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
index 8f9f1c2729fc..47d7baade8b4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
@@ -29,7 +29,7 @@ define <2 x i1> @reverse_v2i1(<2 x i1> %a) {
 ; ZVBB-NEXT:    vbrev.v v8, v0
 ; ZVBB-NEXT:    vsrl.vi v0, v8, 6
 ; ZVBB-NEXT:    ret
-  %res = call <2 x i1> @llvm.experimental.vector.reverse.v2i1(<2 x i1> %a)
+  %res = call <2 x i1> @llvm.vector.reverse.v2i1(<2 x i1> %a)
   ret <2 x i1> %res
 }
 
@@ -51,7 +51,7 @@ define <4 x i1> @reverse_v4i1(<4 x i1> %a) {
 ; ZVBB-NEXT:    vbrev.v v8, v0
 ; ZVBB-NEXT:    vsrl.vi v0, v8, 4
 ; ZVBB-NEXT:    ret
-  %res = call <4 x i1> @llvm.experimental.vector.reverse.v4i1(<4 x i1> %a)
+  %res = call <4 x i1> @llvm.vector.reverse.v4i1(<4 x i1> %a)
   ret <4 x i1> %res
 }
 
@@ -72,7 +72,7 @@ define <8 x i1> @reverse_v8i1(<8 x i1> %a) {
 ; ZVBB-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; ZVBB-NEXT:    vbrev.v v0, v0
 ; ZVBB-NEXT:    ret
-  %res = call <8 x i1> @llvm.experimental.vector.reverse.v8i1(<8 x i1> %a)
+  %res = call <8 x i1> @llvm.vector.reverse.v8i1(<8 x i1> %a)
   ret <8 x i1> %res
 }
 
@@ -93,7 +93,7 @@ define <16 x i1> @reverse_v16i1(<16 x i1> %a) {
 ; ZVBB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; ZVBB-NEXT:    vbrev.v v0, v0
 ; ZVBB-NEXT:    ret
-  %res = call <16 x i1> @llvm.experimental.vector.reverse.v16i1(<16 x i1> %a)
+  %res = call <16 x i1> @llvm.vector.reverse.v16i1(<16 x i1> %a)
   ret <16 x i1> %res
 }
 
@@ -116,7 +116,7 @@ define <32 x i1> @reverse_v32i1(<32 x i1> %a) {
 ; ZVBB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; ZVBB-NEXT:    vbrev.v v0, v0
 ; ZVBB-NEXT:    ret
-  %res = call <32 x i1> @llvm.experimental.vector.reverse.v32i1(<32 x i1> %a)
+  %res = call <32 x i1> @llvm.vector.reverse.v32i1(<32 x i1> %a)
   ret <32 x i1> %res
 }
 
@@ -139,7 +139,7 @@ define <64 x i1> @reverse_v64i1(<64 x i1> %a) {
 ; ZVBB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; ZVBB-NEXT:    vbrev.v v0, v0
 ; ZVBB-NEXT:    ret
-  %res = call <64 x i1> @llvm.experimental.vector.reverse.v64i1(<64 x i1> %a)
+  %res = call <64 x i1> @llvm.vector.reverse.v64i1(<64 x i1> %a)
   ret <64 x i1> %res
 }
 
@@ -156,7 +156,7 @@ define <128 x i1> @reverse_v128i1(<128 x i1> %a) {
 ; CHECK-NEXT:    vrgather.vv v24, v16, v8
 ; CHECK-NEXT:    vmsne.vi v0, v24, 0
 ; CHECK-NEXT:    ret
-  %res = call <128 x i1> @llvm.experimental.vector.reverse.v128i1(<128 x i1> %a)
+  %res = call <128 x i1> @llvm.vector.reverse.v128i1(<128 x i1> %a)
   ret <128 x i1> %res
 }
 
@@ -164,7 +164,7 @@ define <1 x i8> @reverse_v1i8(<1 x i8> %a) {
 ; CHECK-LABEL: reverse_v1i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <1 x i8> @llvm.experimental.vector.reverse.v1i8(<1 x i8> %a)
+  %res = call <1 x i8> @llvm.vector.reverse.v1i8(<1 x i8> %a)
   ret <1 x i8> %res
 }
 
@@ -182,7 +182,7 @@ define <2 x i8> @reverse_v2i8(<2 x i8> %a) {
 ; ZVBB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; ZVBB-NEXT:    vrev8.v v8, v8
 ; ZVBB-NEXT:    ret
-  %res = call <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
+  %res = call <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8> %a)
   ret <2 x i8> %res
 }
 
@@ -195,7 +195,7 @@ define <4 x i8> @reverse_v4i8(<4 x i8> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <4 x i8> @llvm.experimental.vector.reverse.v4i8(<4 x i8> %a)
+  %res = call <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8> %a)
   ret <4 x i8> %res
 }
 
@@ -208,7 +208,7 @@ define <8 x i8> @reverse_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <8 x i8> @llvm.experimental.vector.reverse.v8i8(<8 x i8> %a)
+  %res = call <8 x i8> @llvm.vector.reverse.v8i8(<8 x i8> %a)
   ret <8 x i8> %res
 }
 
@@ -221,7 +221,7 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
+  %res = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> %a)
   ret <16 x i8> %res
 }
 
@@ -236,7 +236,7 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) {
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = call <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8> %a)
+  %res = call <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8> %a)
   ret <32 x i8> %res
 }
 
@@ -251,7 +251,7 @@ define <64 x i8> @reverse_v64i8(<64 x i8> %a) {
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
-  %res = call <64 x i8> @llvm.experimental.vector.reverse.v64i8(<64 x i8> %a)
+  %res = call <64 x i8> @llvm.vector.reverse.v64i8(<64 x i8> %a)
   ret <64 x i8> %res
 }
 
@@ -259,7 +259,7 @@ define <1 x i16> @reverse_v1i16(<1 x i16> %a) {
 ; CHECK-LABEL: reverse_v1i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <1 x i16> @llvm.experimental.vector.reverse.v1i16(<1 x i16> %a)
+  %res = call <1 x i16> @llvm.vector.reverse.v1i16(<1 x i16> %a)
   ret <1 x i16> %res
 }
 
@@ -277,7 +277,7 @@ define <2 x i16> @reverse_v2i16(<2 x i16> %a) {
 ; ZVBB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; ZVBB-NEXT:    vror.vi v8, v8, 16
 ; ZVBB-NEXT:    ret
-  %res = call <2 x i16> @llvm.experimental.vector.reverse.v2i16(<2 x i16> %a)
+  %res = call <2 x i16> @llvm.vector.reverse.v2i16(<2 x i16> %a)
   ret <2 x i16> %res
 }
 
@@ -290,7 +290,7 @@ define <4 x i16> @reverse_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <4 x i16> @llvm.experimental.vector.reverse.v4i16(<4 x i16> %a)
+  %res = call <4 x i16> @llvm.vector.reverse.v4i16(<4 x i16> %a)
   ret <4 x i16> %res
 }
 
@@ -303,7 +303,7 @@ define <8 x i16> @reverse_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)
+  %res = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> %a)
   ret <8 x i16> %res
 }
 
@@ -316,7 +316,7 @@ define <16 x i16> @reverse_v16i16(<16 x i16> %a) {
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = call <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16> %a)
+  %res = call <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16> %a)
   ret <16 x i16> %res
 }
 
@@ -332,7 +332,7 @@ define <32 x i16> @reverse_v32i16(<32 x i16> %a) {
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
-  %res = call <32 x i16> @llvm.experimental.vector.reverse.v32i16(<32 x i16> %a)
+  %res = call <32 x i16> @llvm.vector.reverse.v32i16(<32 x i16> %a)
   ret <32 x i16> %res
 }
 
@@ -340,7 +340,7 @@ define <1 x i32> @reverse_v1i32(<1 x i32> %a) {
 ; CHECK-LABEL: reverse_v1i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <1 x i32> @llvm.experimental.vector.reverse.v1i32(<1 x i32> %a)
+  %res = call <1 x i32> @llvm.vector.reverse.v1i32(<1 x i32> %a)
   ret <1 x i32> %res
 }
 
@@ -358,7 +358,7 @@ define <2 x i32> @reverse_v2i32(<2 x i32> %a) {
 ; ZVBB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; ZVBB-NEXT:    vror.vi v8, v8, 32
 ; ZVBB-NEXT:    ret
-  %res = call <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32> %a)
+  %res = call <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32> %a)
   ret <2 x i32> %res
 }
 
@@ -371,7 +371,7 @@ define <4 x i32> @reverse_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %a)
+  %res = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %a)
   ret <4 x i32> %res
 }
 
@@ -385,7 +385,7 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) {
 ; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> %a)
+  %res = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> %a)
   ret <8 x i32> %res
 }
 
@@ -399,7 +399,7 @@ define <16 x i32> @reverse_v16i32(<16 x i32> %a) {
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
-  %res = call <16 x i32> @llvm.experimental.vector.reverse.v16i32(<16 x i32> %a)
+  %res = call <16 x i32> @llvm.vector.reverse.v16i32(<16 x i32> %a)
   ret <16 x i32> %res
 }
 
@@ -407,7 +407,7 @@ define <1 x i64> @reverse_v1i64(<1 x i64> %a) {
 ; CHECK-LABEL: reverse_v1i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <1 x i64> @llvm.experimental.vector.reverse.v1i64(<1 x i64> %a)
+  %res = call <1 x i64> @llvm.vector.reverse.v1i64(<1 x i64> %a)
   ret <1 x i64> %res
 }
 
@@ -419,7 +419,7 @@ define <2 x i64> @reverse_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    vslideup.vi v9, v8, 1
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> %a)
+  %res = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> %a)
   ret <2 x i64> %res
 }
 
@@ -433,7 +433,7 @@ define <4 x i64> @reverse_v4i64(<4 x i64> %a) {
 ; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = call <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64> %a)
+  %res = call <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64> %a)
   ret <4 x i64> %res
 }
 
@@ -447,7 +447,7 @@ define <8 x i64> @reverse_v8i64(<8 x i64> %a) {
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
-  %res = call <8 x i64> @llvm.experimental.vector.reverse.v8i64(<8 x i64> %a)
+  %res = call <8 x i64> @llvm.vector.reverse.v8i64(<8 x i64> %a)
   ret <8 x i64> %res
 }
 
@@ -456,7 +456,7 @@ define <1 x half> @reverse_v1f16(<1 x half> %a) {
 ; CHECK-LABEL: reverse_v1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <1 x half> @llvm.experimental.vector.reverse.v1f16(<1 x half> %a)
+  %res = call <1 x half> @llvm.vector.reverse.v1f16(<1 x half> %a)
   ret <1 x half> %res
 }
 
@@ -474,7 +474,7 @@ define <2 x half> @reverse_v2f16(<2 x half> %a) {
 ; ZVBB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; ZVBB-NEXT:    vror.vi v8, v8, 16
 ; ZVBB-NEXT:    ret
-  %res = call <2 x half> @llvm.experimental.vector.reverse.v2f16(<2 x half> %a)
+  %res = call <2 x half> @llvm.vector.reverse.v2f16(<2 x half> %a)
   ret <2 x half> %res
 }
 
@@ -487,7 +487,7 @@ define <4 x half> @reverse_v4f16(<4 x half> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <4 x half> @llvm.experimental.vector.reverse.v4f16(<4 x half> %a)
+  %res = call <4 x half> @llvm.vector.reverse.v4f16(<4 x half> %a)
   ret <4 x half> %res
 }
 
@@ -500,7 +500,7 @@ define <8 x half> @reverse_v8f16(<8 x half> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> %a)
+  %res = call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> %a)
   ret <8 x half> %res
 }
 
@@ -513,7 +513,7 @@ define <16 x half> @reverse_v16f16(<16 x half> %a) {
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = call <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half> %a)
+  %res = call <16 x half> @llvm.vector.reverse.v16f16(<16 x half> %a)
   ret <16 x half> %res
 }
 
@@ -529,7 +529,7 @@ define <32 x half> @reverse_v32f16(<32 x half> %a) {
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
-  %res = call <32 x half> @llvm.experimental.vector.reverse.v32f16(<32 x half> %a)
+  %res = call <32 x half> @llvm.vector.reverse.v32f16(<32 x half> %a)
   ret <32 x half> %res
 }
 
@@ -537,7 +537,7 @@ define <1 x float> @reverse_v1f32(<1 x float> %a) {
 ; CHECK-LABEL: reverse_v1f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <1 x float> @llvm.experimental.vector.reverse.v1f32(<1 x float> %a)
+  %res = call <1 x float> @llvm.vector.reverse.v1f32(<1 x float> %a)
   ret <1 x float> %res
 }
 
@@ -555,7 +555,7 @@ define <2 x float> @reverse_v2f32(<2 x float> %a) {
 ; ZVBB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; ZVBB-NEXT:    vror.vi v8, v8, 32
 ; ZVBB-NEXT:    ret
-  %res = call <2 x float> @llvm.experimental.vector.reverse.v2f32(<2 x float> %a)
+  %res = call <2 x float> @llvm.vector.reverse.v2f32(<2 x float> %a)
   ret <2 x float> %res
 }
 
@@ -568,7 +568,7 @@ define <4 x float> @reverse_v4f32(<4 x float> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> %a)
+  %res = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %a)
   ret <4 x float> %res
 }
 
@@ -582,7 +582,7 @@ define <8 x float> @reverse_v8f32(<8 x float> %a) {
 ; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = call <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float> %a)
+  %res = call <8 x float> @llvm.vector.reverse.v8f32(<8 x float> %a)
   ret <8 x float> %res
 }
 
@@ -596,7 +596,7 @@ define <16 x float> @reverse_v16f32(<16 x float> %a) {
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
-  %res = call <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float> %a)
+  %res = call <16 x float> @llvm.vector.reverse.v16f32(<16 x float> %a)
   ret <16 x float> %res
 }
 
@@ -604,7 +604,7 @@ define <1 x double> @reverse_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: reverse_v1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <1 x double> @llvm.experimental.vector.reverse.v1f64(<1 x double> %a)
+  %res = call <1 x double> @llvm.vector.reverse.v1f64(<1 x double> %a)
   ret <1 x double> %res
 }
 
@@ -616,7 +616,7 @@ define <2 x double> @reverse_v2f64(<2 x double> %a) {
 ; CHECK-NEXT:    vslideup.vi v9, v8, 1
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> %a)
+  %res = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> %a)
   ret <2 x double> %res
 }
 
@@ -630,7 +630,7 @@ define <4 x double> @reverse_v4f64(<4 x double> %a) {
 ; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = call <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double> %a)
+  %res = call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> %a)
   ret <4 x double> %res
 }
 
@@ -644,7 +644,7 @@ define <8 x double> @reverse_v8f64(<8 x double> %a) {
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
-  %res = call <8 x double> @llvm.experimental.vector.reverse.v8f64(<8 x double> %a)
+  %res = call <8 x double> @llvm.vector.reverse.v8f64(<8 x double> %a)
   ret <8 x double> %res
 }
 
@@ -729,7 +729,7 @@ define <3 x i64> @reverse_v3i64(<3 x i64> %a) {
 ; RV64-ZVBB-NEXT:    vrgatherei16.vv v10, v8, v12
 ; RV64-ZVBB-NEXT:    vmv.v.v v8, v10
 ; RV64-ZVBB-NEXT:    ret
-  %res = call <3 x i64> @llvm.experimental.vector.reverse.v3i64(<3 x i64> %a)
+  %res = call <3 x i64> @llvm.vector.reverse.v3i64(<3 x i64> %a)
   ret <3 x i64> %res
 }
 
@@ -813,7 +813,7 @@ define <6 x i64> @reverse_v6i64(<6 x i64> %a) {
 ; RV64-ZVBB-NEXT:    vrgatherei16.vv v12, v8, v16
 ; RV64-ZVBB-NEXT:    vmv.v.v v8, v12
 ; RV64-ZVBB-NEXT:    ret
-  %res = call <6 x i64> @llvm.experimental.vector.reverse.v6i64(<6 x i64> %a)
+  %res = call <6 x i64> @llvm.vector.reverse.v6i64(<6 x i64> %a)
   ret <6 x i64> %res
 }
 
@@ -901,54 +901,54 @@ define <12 x i64> @reverse_v12i64(<12 x i64> %a) {
 ; RV64-ZVBB-NEXT:    vrgatherei16.vv v16, v8, v24
 ; RV64-ZVBB-NEXT:    vmv.v.v v8, v16
 ; RV64-ZVBB-NEXT:    ret
-  %res = call <12 x i64> @llvm.experimental.vector.reverse.v12i64(<12 x i64> %a)
+  %res = call <12 x i64> @llvm.vector.reverse.v12i64(<12 x i64> %a)
   ret <12 x i64> %res
 }
 
-declare <2 x i1> @llvm.experimental.vector.reverse.v2i1(<2 x i1>)
-declare <4 x i1> @llvm.experimental.vector.reverse.v4i1(<4 x i1>)
-declare <8 x i1> @llvm.experimental.vector.reverse.v8i1(<8 x i1>)
-declare <16 x i1> @llvm.experimental.vector.reverse.v16i1(<16 x i1>)
-declare <32 x i1> @llvm.experimental.vector.reverse.v32i1(<32 x i1>)
-declare <64 x i1> @llvm.experimental.vector.reverse.v64i1(<64 x i1>)
-declare <128 x i1> @llvm.experimental.vector.reverse.v128i1(<128 x i1>)
-declare <1 x i8> @llvm.experimental.vector.reverse.v1i8(<1 x i8>)
-declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8>)
-declare <4 x i8> @llvm.experimental.vector.reverse.v4i8(<4 x i8>)
-declare <8 x i8> @llvm.experimental.vector.reverse.v8i8(<8 x i8>)
-declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
-declare <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8>)
-declare <64 x i8> @llvm.experimental.vector.reverse.v64i8(<64 x i8>)
-declare <1 x i16> @llvm.experimental.vector.reverse.v1i16(<1 x i16>)
-declare <2 x i16> @llvm.experimental.vector.reverse.v2i16(<2 x i16>)
-declare <4 x i16> @llvm.experimental.vector.reverse.v4i16(<4 x i16>)
-declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
-declare <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16>)
-declare <32 x i16> @llvm.experimental.vector.reverse.v32i16(<32 x i16>)
-declare <1 x i32> @llvm.experimental.vector.reverse.v1i32(<1 x i32>)
-declare <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32>)
-declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
-declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
-declare <16 x i32> @llvm.experimental.vector.reverse.v16i32(<16 x i32>)
-declare <1 x i64> @llvm.experimental.vector.reverse.v1i64(<1 x i64>)
-declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
-declare <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64>)
-declare <8 x i64> @llvm.experimental.vector.reverse.v8i64(<8 x i64>)
-declare <1 x half> @llvm.experimental.vector.reverse.v1f16(<1 x half>)
-declare <2 x half> @llvm.experimental.vector.reverse.v2f16(<2 x half>)
-declare <4 x half> @llvm.experimental.vector.reverse.v4f16(<4 x half>)
-declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
-declare <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half>)
-declare <32 x half> @llvm.experimental.vector.reverse.v32f16(<32 x half>)
-declare <1 x float> @llvm.experimental.vector.reverse.v1f32(<1 x float>)
-declare <2 x float> @llvm.experimental.vector.reverse.v2f32(<2 x float>)
-declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
-declare <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float>)
-declare <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float>)
-declare <1 x double> @llvm.experimental.vector.reverse.v1f64(<1 x double>)
-declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
-declare <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double>)
-declare <8 x double> @llvm.experimental.vector.reverse.v8f64(<8 x double>)
-declare <3 x i64> @llvm.experimental.vector.reverse.v3i64(<3 x i64>)
-declare <6 x i64> @llvm.experimental.vector.reverse.v6i64(<6 x i64>)
-declare <12 x i64> @llvm.experimental.vector.reverse.v12i64(<12 x i64>)
+declare <2 x i1> @llvm.vector.reverse.v2i1(<2 x i1>)
+declare <4 x i1> @llvm.vector.reverse.v4i1(<4 x i1>)
+declare <8 x i1> @llvm.vector.reverse.v8i1(<8 x i1>)
+declare <16 x i1> @llvm.vector.reverse.v16i1(<16 x i1>)
+declare <32 x i1> @llvm.vector.reverse.v32i1(<32 x i1>)
+declare <64 x i1> @llvm.vector.reverse.v64i1(<64 x i1>)
+declare <128 x i1> @llvm.vector.reverse.v128i1(<128 x i1>)
+declare <1 x i8> @llvm.vector.reverse.v1i8(<1 x i8>)
+declare <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8>)
+declare <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8>)
+declare <8 x i8> @llvm.vector.reverse.v8i8(<8 x i8>)
+declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+declare <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8>)
+declare <64 x i8> @llvm.vector.reverse.v64i8(<64 x i8>)
+declare <1 x i16> @llvm.vector.reverse.v1i16(<1 x i16>)
+declare <2 x i16> @llvm.vector.reverse.v2i16(<2 x i16>)
+declare <4 x i16> @llvm.vector.reverse.v4i16(<4 x i16>)
+declare <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16>)
+declare <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16>)
+declare <32 x i16> @llvm.vector.reverse.v32i16(<32 x i16>)
+declare <1 x i32> @llvm.vector.reverse.v1i32(<1 x i32>)
+declare <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32>)
+declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
+declare <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32>)
+declare <16 x i32> @llvm.vector.reverse.v16i32(<16 x i32>)
+declare <1 x i64> @llvm.vector.reverse.v1i64(<1 x i64>)
+declare <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64>)
+declare <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64>)
+declare <8 x i64> @llvm.vector.reverse.v8i64(<8 x i64>)
+declare <1 x half> @llvm.vector.reverse.v1f16(<1 x half>)
+declare <2 x half> @llvm.vector.reverse.v2f16(<2 x half>)
+declare <4 x half> @llvm.vector.reverse.v4f16(<4 x half>)
+declare <8 x half> @llvm.vector.reverse.v8f16(<8 x half>)
+declare <16 x half> @llvm.vector.reverse.v16f16(<16 x half>)
+declare <32 x half> @llvm.vector.reverse.v32f16(<32 x half>)
+declare <1 x float> @llvm.vector.reverse.v1f32(<1 x float>)
+declare <2 x float> @llvm.vector.reverse.v2f32(<2 x float>)
+declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)
+declare <8 x float> @llvm.vector.reverse.v8f32(<8 x float>)
+declare <16 x float> @llvm.vector.reverse.v16f32(<16 x float>)
+declare <1 x double> @llvm.vector.reverse.v1f64(<1 x double>)
+declare <2 x double> @llvm.vector.reverse.v2f64(<2 x double>)
+declare <4 x double> @llvm.vector.reverse.v4f64(<4 x double>)
+declare <8 x double> @llvm.vector.reverse.v8f64(<8 x double>)
+declare <3 x i64> @llvm.vector.reverse.v3i64(<3 x i64>)
+declare <6 x i64> @llvm.vector.reverse.v6i64(<6 x i64>)
+declare <12 x i64> @llvm.vector.reverse.v12i64(<12 x i64>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index b3bda5973eb8..a6b2d3141f22 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -2190,65 +2190,66 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset s0, -16
 ; CHECK-NOV-NEXT:    .cfi_offset s1, -24
 ; CHECK-NOV-NEXT:    .cfi_offset fs0, -32
-; CHECK-NOV-NEXT:    fmv.d fs0, fa0
-; CHECK-NOV-NEXT:    fmv.d fa0, fa1
+; CHECK-NOV-NEXT:    fmv.d fs0, fa1
 ; CHECK-NOV-NEXT:    call __fixdfti
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    mv s1, a1
 ; CHECK-NOV-NEXT:    fmv.d fa0, fs0
 ; CHECK-NOV-NEXT:    call __fixdfti
-; CHECK-NOV-NEXT:    li a2, -1
-; CHECK-NOV-NEXT:    srli a3, a2, 1
-; CHECK-NOV-NEXT:    beqz s1, .LBB18_3
+; CHECK-NOV-NEXT:    mv a2, a0
+; CHECK-NOV-NEXT:    li a0, -1
+; CHECK-NOV-NEXT:    srli a3, a0, 1
+; CHECK-NOV-NEXT:    beqz a1, .LBB18_3
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    slti a4, s1, 0
-; CHECK-NOV-NEXT:    bnez a1, .LBB18_4
+; CHECK-NOV-NEXT:    slti a4, a1, 0
+; CHECK-NOV-NEXT:    bnez s1, .LBB18_4
 ; CHECK-NOV-NEXT:  .LBB18_2:
-; CHECK-NOV-NEXT:    sltu a5, a0, a3
+; CHECK-NOV-NEXT:    sltu a5, s0, a3
 ; CHECK-NOV-NEXT:    beqz a5, .LBB18_5
 ; CHECK-NOV-NEXT:    j .LBB18_6
 ; CHECK-NOV-NEXT:  .LBB18_3:
-; CHECK-NOV-NEXT:    sltu a4, s0, a3
-; CHECK-NOV-NEXT:    beqz a1, .LBB18_2
+; CHECK-NOV-NEXT:    sltu a4, a2, a3
+; CHECK-NOV-NEXT:    beqz s1, .LBB18_2
 ; CHECK-NOV-NEXT:  .LBB18_4: # %entry
-; CHECK-NOV-NEXT:    slti a5, a1, 0
+; CHECK-NOV-NEXT:    slti a5, s1, 0
 ; CHECK-NOV-NEXT:    bnez a5, .LBB18_6
 ; CHECK-NOV-NEXT:  .LBB18_5: # %entry
-; CHECK-NOV-NEXT:    mv a0, a3
+; CHECK-NOV-NEXT:    mv s0, a3
 ; CHECK-NOV-NEXT:  .LBB18_6: # %entry
 ; CHECK-NOV-NEXT:    neg a6, a5
 ; CHECK-NOV-NEXT:    neg a5, a4
-; CHECK-NOV-NEXT:    and a5, a5, s1
+; CHECK-NOV-NEXT:    and a5, a5, a1
 ; CHECK-NOV-NEXT:    bnez a4, .LBB18_8
 ; CHECK-NOV-NEXT:  # %bb.7: # %entry
-; CHECK-NOV-NEXT:    mv s0, a3
+; CHECK-NOV-NEXT:    mv a2, a3
 ; CHECK-NOV-NEXT:  .LBB18_8: # %entry
-; CHECK-NOV-NEXT:    and a4, a6, a1
-; CHECK-NOV-NEXT:    slli a1, a2, 63
-; CHECK-NOV-NEXT:    beq a5, a2, .LBB18_11
+; CHECK-NOV-NEXT:    and a4, a6, s1
+; CHECK-NOV-NEXT:    slli a1, a0, 63
+; CHECK-NOV-NEXT:    beq a5, a0, .LBB18_11
 ; CHECK-NOV-NEXT:  # %bb.9: # %entry
 ; CHECK-NOV-NEXT:    slti a3, a5, 0
 ; CHECK-NOV-NEXT:    xori a3, a3, 1
-; CHECK-NOV-NEXT:    bne a4, a2, .LBB18_12
+; CHECK-NOV-NEXT:    bne a4, a0, .LBB18_12
 ; CHECK-NOV-NEXT:  .LBB18_10:
-; CHECK-NOV-NEXT:    sltu a2, a1, a0
-; CHECK-NOV-NEXT:    beqz a2, .LBB18_13
+; CHECK-NOV-NEXT:    sltu a0, a1, s0
+; CHECK-NOV-NEXT:    beqz a0, .LBB18_13
 ; CHECK-NOV-NEXT:    j .LBB18_14
 ; CHECK-NOV-NEXT:  .LBB18_11:
-; CHECK-NOV-NEXT:    sltu a3, a1, s0
-; CHECK-NOV-NEXT:    beq a4, a2, .LBB18_10
+; CHECK-NOV-NEXT:    sltu a3, a1, a2
+; CHECK-NOV-NEXT:    beq a4, a0, .LBB18_10
 ; CHECK-NOV-NEXT:  .LBB18_12: # %entry
-; CHECK-NOV-NEXT:    slti a2, a4, 0
-; CHECK-NOV-NEXT:    xori a2, a2, 1
-; CHECK-NOV-NEXT:    bnez a2, .LBB18_14
+; CHECK-NOV-NEXT:    slti a0, a4, 0
+; CHECK-NOV-NEXT:    xori a0, a0, 1
+; CHECK-NOV-NEXT:    bnez a0, .LBB18_14
 ; CHECK-NOV-NEXT:  .LBB18_13: # %entry
-; CHECK-NOV-NEXT:    mv a0, a1
+; CHECK-NOV-NEXT:    mv s0, a1
 ; CHECK-NOV-NEXT:  .LBB18_14: # %entry
 ; CHECK-NOV-NEXT:    bnez a3, .LBB18_16
 ; CHECK-NOV-NEXT:  # %bb.15: # %entry
-; CHECK-NOV-NEXT:    mv s0, a1
+; CHECK-NOV-NEXT:    mv a2, a1
 ; CHECK-NOV-NEXT:  .LBB18_16: # %entry
-; CHECK-NOV-NEXT:    mv a1, s0
+; CHECK-NOV-NEXT:    mv a0, s0
+; CHECK-NOV-NEXT:    mv a1, a2
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -2273,43 +2274,43 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    vfmv.f.s fa0, v8
+; CHECK-V-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-V-NEXT:    vfmv.f.s fa0, v9
 ; CHECK-V-NEXT:    call __fixdfti
 ; CHECK-V-NEXT:    mv s0, a0
 ; CHECK-V-NEXT:    mv s1, a1
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixdfti
 ; CHECK-V-NEXT:    li a2, -1
 ; CHECK-V-NEXT:    srli a3, a2, 1
-; CHECK-V-NEXT:    beqz s1, .LBB18_3
+; CHECK-V-NEXT:    beqz a1, .LBB18_3
 ; CHECK-V-NEXT:  # %bb.1: # %entry
-; CHECK-V-NEXT:    slti a4, s1, 0
-; CHECK-V-NEXT:    bnez a1, .LBB18_4
+; CHECK-V-NEXT:    slti a4, a1, 0
+; CHECK-V-NEXT:    bnez s1, .LBB18_4
 ; CHECK-V-NEXT:  .LBB18_2:
-; CHECK-V-NEXT:    sltu a5, a0, a3
+; CHECK-V-NEXT:    sltu a5, s0, a3
 ; CHECK-V-NEXT:    beqz a5, .LBB18_5
 ; CHECK-V-NEXT:    j .LBB18_6
 ; CHECK-V-NEXT:  .LBB18_3:
-; CHECK-V-NEXT:    sltu a4, s0, a3
-; CHECK-V-NEXT:    beqz a1, .LBB18_2
+; CHECK-V-NEXT:    sltu a4, a0, a3
+; CHECK-V-NEXT:    beqz s1, .LBB18_2
 ; CHECK-V-NEXT:  .LBB18_4: # %entry
-; CHECK-V-NEXT:    slti a5, a1, 0
+; CHECK-V-NEXT:    slti a5, s1, 0
 ; CHECK-V-NEXT:    bnez a5, .LBB18_6
 ; CHECK-V-NEXT:  .LBB18_5: # %entry
-; CHECK-V-NEXT:    mv a0, a3
+; CHECK-V-NEXT:    mv s0, a3
 ; CHECK-V-NEXT:  .LBB18_6: # %entry
 ; CHECK-V-NEXT:    neg a6, a5
 ; CHECK-V-NEXT:    neg a5, a4
-; CHECK-V-NEXT:    and a5, a5, s1
+; CHECK-V-NEXT:    and a5, a5, a1
 ; CHECK-V-NEXT:    bnez a4, .LBB18_8
 ; CHECK-V-NEXT:  # %bb.7: # %entry
-; CHECK-V-NEXT:    mv s0, a3
+; CHECK-V-NEXT:    mv a0, a3
 ; CHECK-V-NEXT:  .LBB18_8: # %entry
-; CHECK-V-NEXT:    and a4, a6, a1
+; CHECK-V-NEXT:    and a4, a6, s1
 ; CHECK-V-NEXT:    slli a1, a2, 63
 ; CHECK-V-NEXT:    beq a5, a2, .LBB18_11
 ; CHECK-V-NEXT:  # %bb.9: # %entry
@@ -2317,26 +2318,26 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-V-NEXT:    xori a3, a3, 1
 ; CHECK-V-NEXT:    bne a4, a2, .LBB18_12
 ; CHECK-V-NEXT:  .LBB18_10:
-; CHECK-V-NEXT:    sltu a2, a1, a0
+; CHECK-V-NEXT:    sltu a2, a1, s0
 ; CHECK-V-NEXT:    beqz a2, .LBB18_13
 ; CHECK-V-NEXT:    j .LBB18_14
 ; CHECK-V-NEXT:  .LBB18_11:
-; CHECK-V-NEXT:    sltu a3, a1, s0
+; CHECK-V-NEXT:    sltu a3, a1, a0
 ; CHECK-V-NEXT:    beq a4, a2, .LBB18_10
 ; CHECK-V-NEXT:  .LBB18_12: # %entry
 ; CHECK-V-NEXT:    slti a2, a4, 0
 ; CHECK-V-NEXT:    xori a2, a2, 1
 ; CHECK-V-NEXT:    bnez a2, .LBB18_14
 ; CHECK-V-NEXT:  .LBB18_13: # %entry
-; CHECK-V-NEXT:    mv a0, a1
+; CHECK-V-NEXT:    mv s0, a1
 ; CHECK-V-NEXT:  .LBB18_14: # %entry
 ; CHECK-V-NEXT:    bnez a3, .LBB18_16
 ; CHECK-V-NEXT:  # %bb.15: # %entry
-; CHECK-V-NEXT:    mv s0, a1
+; CHECK-V-NEXT:    mv a0, a1
 ; CHECK-V-NEXT:  .LBB18_16: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, s0
-; CHECK-V-NEXT:    vmv.s.x v9, a0
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    vmv.s.x v9, s0
 ; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
@@ -2369,19 +2370,19 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset s0, -16
 ; CHECK-NOV-NEXT:    .cfi_offset s1, -24
 ; CHECK-NOV-NEXT:    .cfi_offset fs0, -32
-; CHECK-NOV-NEXT:    fmv.d fs0, fa0
-; CHECK-NOV-NEXT:    fmv.d fa0, fa1
+; CHECK-NOV-NEXT:    fmv.d fs0, fa1
 ; CHECK-NOV-NEXT:    call __fixunsdfti
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    mv s1, a1
 ; CHECK-NOV-NEXT:    fmv.d fa0, fs0
 ; CHECK-NOV-NEXT:    call __fixunsdfti
-; CHECK-NOV-NEXT:    snez a2, s1
 ; CHECK-NOV-NEXT:    snez a1, a1
+; CHECK-NOV-NEXT:    snez a2, s1
+; CHECK-NOV-NEXT:    addi a2, a2, -1
+; CHECK-NOV-NEXT:    and a2, a2, s0
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
-; CHECK-NOV-NEXT:    and a0, a1, a0
-; CHECK-NOV-NEXT:    addi a1, a2, -1
-; CHECK-NOV-NEXT:    and a1, a1, s0
+; CHECK-NOV-NEXT:    and a1, a1, a0
+; CHECK-NOV-NEXT:    mv a0, a2
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -2406,25 +2407,25 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    vfmv.f.s fa0, v8
+; CHECK-V-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-V-NEXT:    vfmv.f.s fa0, v9
 ; CHECK-V-NEXT:    call __fixunsdfti
 ; CHECK-V-NEXT:    mv s0, a0
 ; CHECK-V-NEXT:    mv s1, a1
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixunsdfti
-; CHECK-V-NEXT:    snez a2, s1
 ; CHECK-V-NEXT:    snez a1, a1
-; CHECK-V-NEXT:    addi a1, a1, -1
-; CHECK-V-NEXT:    and a0, a1, a0
+; CHECK-V-NEXT:    snez a2, s1
 ; CHECK-V-NEXT:    addi a2, a2, -1
 ; CHECK-V-NEXT:    and a2, a2, s0
+; CHECK-V-NEXT:    addi a1, a1, -1
+; CHECK-V-NEXT:    and a0, a1, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a2
-; CHECK-V-NEXT:    vmv.s.x v9, a0
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    vmv.s.x v9, a2
 ; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
@@ -2466,32 +2467,32 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) {
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    li a2, 1
 ; CHECK-NOV-NEXT:  .LBB20_2: # %entry
-; CHECK-NOV-NEXT:    slti a4, s1, 1
 ; CHECK-NOV-NEXT:    slti a3, a1, 1
+; CHECK-NOV-NEXT:    slti a4, s1, 1
 ; CHECK-NOV-NEXT:    blez a1, .LBB20_4
 ; CHECK-NOV-NEXT:  # %bb.3: # %entry
 ; CHECK-NOV-NEXT:    li a1, 1
 ; CHECK-NOV-NEXT:  .LBB20_4: # %entry
+; CHECK-NOV-NEXT:    neg a4, a4
 ; CHECK-NOV-NEXT:    neg a3, a3
 ; CHECK-NOV-NEXT:    and a3, a3, a0
-; CHECK-NOV-NEXT:    neg a0, a4
 ; CHECK-NOV-NEXT:    beqz a1, .LBB20_7
 ; CHECK-NOV-NEXT:  # %bb.5: # %entry
 ; CHECK-NOV-NEXT:    sgtz a1, a1
-; CHECK-NOV-NEXT:    and a0, a0, s0
+; CHECK-NOV-NEXT:    and a4, a4, s0
 ; CHECK-NOV-NEXT:    bnez a2, .LBB20_8
 ; CHECK-NOV-NEXT:  .LBB20_6:
-; CHECK-NOV-NEXT:    snez a2, a0
+; CHECK-NOV-NEXT:    snez a0, a4
 ; CHECK-NOV-NEXT:    j .LBB20_9
 ; CHECK-NOV-NEXT:  .LBB20_7:
 ; CHECK-NOV-NEXT:    snez a1, a3
-; CHECK-NOV-NEXT:    and a0, a0, s0
+; CHECK-NOV-NEXT:    and a4, a4, s0
 ; CHECK-NOV-NEXT:    beqz a2, .LBB20_6
 ; CHECK-NOV-NEXT:  .LBB20_8: # %entry
-; CHECK-NOV-NEXT:    sgtz a2, a2
+; CHECK-NOV-NEXT:    sgtz a0, a2
 ; CHECK-NOV-NEXT:  .LBB20_9: # %entry
-; CHECK-NOV-NEXT:    neg a2, a2
-; CHECK-NOV-NEXT:    and a0, a2, a0
+; CHECK-NOV-NEXT:    neg a0, a0
+; CHECK-NOV-NEXT:    and a0, a0, a4
 ; CHECK-NOV-NEXT:    neg a1, a1
 ; CHECK-NOV-NEXT:    and a1, a1, a3
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -2533,15 +2534,15 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) {
 ; CHECK-V-NEXT:  # %bb.1: # %entry
 ; CHECK-V-NEXT:    li a2, 1
 ; CHECK-V-NEXT:  .LBB20_2: # %entry
-; CHECK-V-NEXT:    slti a3, s1, 1
 ; CHECK-V-NEXT:    slti a4, a1, 1
+; CHECK-V-NEXT:    slti a3, s1, 1
 ; CHECK-V-NEXT:    blez a1, .LBB20_4
 ; CHECK-V-NEXT:  # %bb.3: # %entry
 ; CHECK-V-NEXT:    li a1, 1
 ; CHECK-V-NEXT:  .LBB20_4: # %entry
+; CHECK-V-NEXT:    neg a3, a3
 ; CHECK-V-NEXT:    neg a4, a4
 ; CHECK-V-NEXT:    and a0, a4, a0
-; CHECK-V-NEXT:    neg a3, a3
 ; CHECK-V-NEXT:    beqz a1, .LBB20_7
 ; CHECK-V-NEXT:  # %bb.5: # %entry
 ; CHECK-V-NEXT:    sgtz a1, a1
@@ -2596,65 +2597,66 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset s0, -16
 ; CHECK-NOV-NEXT:    .cfi_offset s1, -24
 ; CHECK-NOV-NEXT:    .cfi_offset fs0, -32
-; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.s fa0, fa1
+; CHECK-NOV-NEXT:    fmv.s fs0, fa1
 ; CHECK-NOV-NEXT:    call __fixsfti
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    mv s1, a1
 ; CHECK-NOV-NEXT:    fmv.s fa0, fs0
 ; CHECK-NOV-NEXT:    call __fixsfti
-; CHECK-NOV-NEXT:    li a2, -1
-; CHECK-NOV-NEXT:    srli a3, a2, 1
-; CHECK-NOV-NEXT:    beqz s1, .LBB21_3
+; CHECK-NOV-NEXT:    mv a2, a0
+; CHECK-NOV-NEXT:    li a0, -1
+; CHECK-NOV-NEXT:    srli a3, a0, 1
+; CHECK-NOV-NEXT:    beqz a1, .LBB21_3
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    slti a4, s1, 0
-; CHECK-NOV-NEXT:    bnez a1, .LBB21_4
+; CHECK-NOV-NEXT:    slti a4, a1, 0
+; CHECK-NOV-NEXT:    bnez s1, .LBB21_4
 ; CHECK-NOV-NEXT:  .LBB21_2:
-; CHECK-NOV-NEXT:    sltu a5, a0, a3
+; CHECK-NOV-NEXT:    sltu a5, s0, a3
 ; CHECK-NOV-NEXT:    beqz a5, .LBB21_5
 ; CHECK-NOV-NEXT:    j .LBB21_6
 ; CHECK-NOV-NEXT:  .LBB21_3:
-; CHECK-NOV-NEXT:    sltu a4, s0, a3
-; CHECK-NOV-NEXT:    beqz a1, .LBB21_2
+; CHECK-NOV-NEXT:    sltu a4, a2, a3
+; CHECK-NOV-NEXT:    beqz s1, .LBB21_2
 ; CHECK-NOV-NEXT:  .LBB21_4: # %entry
-; CHECK-NOV-NEXT:    slti a5, a1, 0
+; CHECK-NOV-NEXT:    slti a5, s1, 0
 ; CHECK-NOV-NEXT:    bnez a5, .LBB21_6
 ; CHECK-NOV-NEXT:  .LBB21_5: # %entry
-; CHECK-NOV-NEXT:    mv a0, a3
+; CHECK-NOV-NEXT:    mv s0, a3
 ; CHECK-NOV-NEXT:  .LBB21_6: # %entry
 ; CHECK-NOV-NEXT:    neg a6, a5
 ; CHECK-NOV-NEXT:    neg a5, a4
-; CHECK-NOV-NEXT:    and a5, a5, s1
+; CHECK-NOV-NEXT:    and a5, a5, a1
 ; CHECK-NOV-NEXT:    bnez a4, .LBB21_8
 ; CHECK-NOV-NEXT:  # %bb.7: # %entry
-; CHECK-NOV-NEXT:    mv s0, a3
+; CHECK-NOV-NEXT:    mv a2, a3
 ; CHECK-NOV-NEXT:  .LBB21_8: # %entry
-; CHECK-NOV-NEXT:    and a4, a6, a1
-; CHECK-NOV-NEXT:    slli a1, a2, 63
-; CHECK-NOV-NEXT:    beq a5, a2, .LBB21_11
+; CHECK-NOV-NEXT:    and a4, a6, s1
+; CHECK-NOV-NEXT:    slli a1, a0, 63
+; CHECK-NOV-NEXT:    beq a5, a0, .LBB21_11
 ; CHECK-NOV-NEXT:  # %bb.9: # %entry
 ; CHECK-NOV-NEXT:    slti a3, a5, 0
 ; CHECK-NOV-NEXT:    xori a3, a3, 1
-; CHECK-NOV-NEXT:    bne a4, a2, .LBB21_12
+; CHECK-NOV-NEXT:    bne a4, a0, .LBB21_12
 ; CHECK-NOV-NEXT:  .LBB21_10:
-; CHECK-NOV-NEXT:    sltu a2, a1, a0
-; CHECK-NOV-NEXT:    beqz a2, .LBB21_13
+; CHECK-NOV-NEXT:    sltu a0, a1, s0
+; CHECK-NOV-NEXT:    beqz a0, .LBB21_13
 ; CHECK-NOV-NEXT:    j .LBB21_14
 ; CHECK-NOV-NEXT:  .LBB21_11:
-; CHECK-NOV-NEXT:    sltu a3, a1, s0
-; CHECK-NOV-NEXT:    beq a4, a2, .LBB21_10
+; CHECK-NOV-NEXT:    sltu a3, a1, a2
+; CHECK-NOV-NEXT:    beq a4, a0, .LBB21_10
 ; CHECK-NOV-NEXT:  .LBB21_12: # %entry
-; CHECK-NOV-NEXT:    slti a2, a4, 0
-; CHECK-NOV-NEXT:    xori a2, a2, 1
-; CHECK-NOV-NEXT:    bnez a2, .LBB21_14
+; CHECK-NOV-NEXT:    slti a0, a4, 0
+; CHECK-NOV-NEXT:    xori a0, a0, 1
+; CHECK-NOV-NEXT:    bnez a0, .LBB21_14
 ; CHECK-NOV-NEXT:  .LBB21_13: # %entry
-; CHECK-NOV-NEXT:    mv a0, a1
+; CHECK-NOV-NEXT:    mv s0, a1
 ; CHECK-NOV-NEXT:  .LBB21_14: # %entry
 ; CHECK-NOV-NEXT:    bnez a3, .LBB21_16
 ; CHECK-NOV-NEXT:  # %bb.15: # %entry
-; CHECK-NOV-NEXT:    mv s0, a1
+; CHECK-NOV-NEXT:    mv a2, a1
 ; CHECK-NOV-NEXT:  .LBB21_16: # %entry
-; CHECK-NOV-NEXT:    mv a1, s0
+; CHECK-NOV-NEXT:    mv a0, s0
+; CHECK-NOV-NEXT:    mv a1, a2
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -2679,43 +2681,43 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vfmv.f.s fa0, v8
+; CHECK-V-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-V-NEXT:    vfmv.f.s fa0, v9
 ; CHECK-V-NEXT:    call __fixsfti
 ; CHECK-V-NEXT:    mv s0, a0
 ; CHECK-V-NEXT:    mv s1, a1
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixsfti
 ; CHECK-V-NEXT:    li a2, -1
 ; CHECK-V-NEXT:    srli a3, a2, 1
-; CHECK-V-NEXT:    beqz s1, .LBB21_3
+; CHECK-V-NEXT:    beqz a1, .LBB21_3
 ; CHECK-V-NEXT:  # %bb.1: # %entry
-; CHECK-V-NEXT:    slti a4, s1, 0
-; CHECK-V-NEXT:    bnez a1, .LBB21_4
+; CHECK-V-NEXT:    slti a4, a1, 0
+; CHECK-V-NEXT:    bnez s1, .LBB21_4
 ; CHECK-V-NEXT:  .LBB21_2:
-; CHECK-V-NEXT:    sltu a5, a0, a3
+; CHECK-V-NEXT:    sltu a5, s0, a3
 ; CHECK-V-NEXT:    beqz a5, .LBB21_5
 ; CHECK-V-NEXT:    j .LBB21_6
 ; CHECK-V-NEXT:  .LBB21_3:
-; CHECK-V-NEXT:    sltu a4, s0, a3
-; CHECK-V-NEXT:    beqz a1, .LBB21_2
+; CHECK-V-NEXT:    sltu a4, a0, a3
+; CHECK-V-NEXT:    beqz s1, .LBB21_2
 ; CHECK-V-NEXT:  .LBB21_4: # %entry
-; CHECK-V-NEXT:    slti a5, a1, 0
+; CHECK-V-NEXT:    slti a5, s1, 0
 ; CHECK-V-NEXT:    bnez a5, .LBB21_6
 ; CHECK-V-NEXT:  .LBB21_5: # %entry
-; CHECK-V-NEXT:    mv a0, a3
+; CHECK-V-NEXT:    mv s0, a3
 ; CHECK-V-NEXT:  .LBB21_6: # %entry
 ; CHECK-V-NEXT:    neg a6, a5
 ; CHECK-V-NEXT:    neg a5, a4
-; CHECK-V-NEXT:    and a5, a5, s1
+; CHECK-V-NEXT:    and a5, a5, a1
 ; CHECK-V-NEXT:    bnez a4, .LBB21_8
 ; CHECK-V-NEXT:  # %bb.7: # %entry
-; CHECK-V-NEXT:    mv s0, a3
+; CHECK-V-NEXT:    mv a0, a3
 ; CHECK-V-NEXT:  .LBB21_8: # %entry
-; CHECK-V-NEXT:    and a4, a6, a1
+; CHECK-V-NEXT:    and a4, a6, s1
 ; CHECK-V-NEXT:    slli a1, a2, 63
 ; CHECK-V-NEXT:    beq a5, a2, .LBB21_11
 ; CHECK-V-NEXT:  # %bb.9: # %entry
@@ -2723,26 +2725,26 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-V-NEXT:    xori a3, a3, 1
 ; CHECK-V-NEXT:    bne a4, a2, .LBB21_12
 ; CHECK-V-NEXT:  .LBB21_10:
-; CHECK-V-NEXT:    sltu a2, a1, a0
+; CHECK-V-NEXT:    sltu a2, a1, s0
 ; CHECK-V-NEXT:    beqz a2, .LBB21_13
 ; CHECK-V-NEXT:    j .LBB21_14
 ; CHECK-V-NEXT:  .LBB21_11:
-; CHECK-V-NEXT:    sltu a3, a1, s0
+; CHECK-V-NEXT:    sltu a3, a1, a0
 ; CHECK-V-NEXT:    beq a4, a2, .LBB21_10
 ; CHECK-V-NEXT:  .LBB21_12: # %entry
 ; CHECK-V-NEXT:    slti a2, a4, 0
 ; CHECK-V-NEXT:    xori a2, a2, 1
 ; CHECK-V-NEXT:    bnez a2, .LBB21_14
 ; CHECK-V-NEXT:  .LBB21_13: # %entry
-; CHECK-V-NEXT:    mv a0, a1
+; CHECK-V-NEXT:    mv s0, a1
 ; CHECK-V-NEXT:  .LBB21_14: # %entry
 ; CHECK-V-NEXT:    bnez a3, .LBB21_16
 ; CHECK-V-NEXT:  # %bb.15: # %entry
-; CHECK-V-NEXT:    mv s0, a1
+; CHECK-V-NEXT:    mv a0, a1
 ; CHECK-V-NEXT:  .LBB21_16: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, s0
-; CHECK-V-NEXT:    vmv.s.x v9, a0
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    vmv.s.x v9, s0
 ; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
@@ -2775,19 +2777,19 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset s0, -16
 ; CHECK-NOV-NEXT:    .cfi_offset s1, -24
 ; CHECK-NOV-NEXT:    .cfi_offset fs0, -32
-; CHECK-NOV-NEXT:    fmv.s fs0, fa0
-; CHECK-NOV-NEXT:    fmv.s fa0, fa1
+; CHECK-NOV-NEXT:    fmv.s fs0, fa1
 ; CHECK-NOV-NEXT:    call __fixunssfti
 ; CHECK-NOV-NEXT:    mv s0, a0
 ; CHECK-NOV-NEXT:    mv s1, a1
 ; CHECK-NOV-NEXT:    fmv.s fa0, fs0
 ; CHECK-NOV-NEXT:    call __fixunssfti
-; CHECK-NOV-NEXT:    snez a2, s1
 ; CHECK-NOV-NEXT:    snez a1, a1
+; CHECK-NOV-NEXT:    snez a2, s1
+; CHECK-NOV-NEXT:    addi a2, a2, -1
+; CHECK-NOV-NEXT:    and a2, a2, s0
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
-; CHECK-NOV-NEXT:    and a0, a1, a0
-; CHECK-NOV-NEXT:    addi a1, a2, -1
-; CHECK-NOV-NEXT:    and a1, a1, s0
+; CHECK-NOV-NEXT:    and a1, a1, a0
+; CHECK-NOV-NEXT:    mv a0, a2
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -2812,25 +2814,25 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vfmv.f.s fa0, v8
+; CHECK-V-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-V-NEXT:    vfmv.f.s fa0, v9
 ; CHECK-V-NEXT:    call __fixunssfti
 ; CHECK-V-NEXT:    mv s0, a0
 ; CHECK-V-NEXT:    mv s1, a1
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    addi a0, sp, 32
 ; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixunssfti
-; CHECK-V-NEXT:    snez a2, s1
 ; CHECK-V-NEXT:    snez a1, a1
-; CHECK-V-NEXT:    addi a1, a1, -1
-; CHECK-V-NEXT:    and a0, a1, a0
+; CHECK-V-NEXT:    snez a2, s1
 ; CHECK-V-NEXT:    addi a2, a2, -1
 ; CHECK-V-NEXT:    and a2, a2, s0
+; CHECK-V-NEXT:    addi a1, a1, -1
+; CHECK-V-NEXT:    and a0, a1, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a2
-; CHECK-V-NEXT:    vmv.s.x v9, a0
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    vmv.s.x v9, a2
 ; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
@@ -2872,32 +2874,32 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    li a2, 1
 ; CHECK-NOV-NEXT:  .LBB23_2: # %entry
-; CHECK-NOV-NEXT:    slti a4, s1, 1
 ; CHECK-NOV-NEXT:    slti a3, a1, 1
+; CHECK-NOV-NEXT:    slti a4, s1, 1
 ; CHECK-NOV-NEXT:    blez a1, .LBB23_4
 ; CHECK-NOV-NEXT:  # %bb.3: # %entry
 ; CHECK-NOV-NEXT:    li a1, 1
 ; CHECK-NOV-NEXT:  .LBB23_4: # %entry
+; CHECK-NOV-NEXT:    neg a4, a4
 ; CHECK-NOV-NEXT:    neg a3, a3
 ; CHECK-NOV-NEXT:    and a3, a3, a0
-; CHECK-NOV-NEXT:    neg a0, a4
 ; CHECK-NOV-NEXT:    beqz a1, .LBB23_7
 ; CHECK-NOV-NEXT:  # %bb.5: # %entry
 ; CHECK-NOV-NEXT:    sgtz a1, a1
-; CHECK-NOV-NEXT:    and a0, a0, s0
+; CHECK-NOV-NEXT:    and a4, a4, s0
 ; CHECK-NOV-NEXT:    bnez a2, .LBB23_8
 ; CHECK-NOV-NEXT:  .LBB23_6:
-; CHECK-NOV-NEXT:    snez a2, a0
+; CHECK-NOV-NEXT:    snez a0, a4
 ; CHECK-NOV-NEXT:    j .LBB23_9
 ; CHECK-NOV-NEXT:  .LBB23_7:
 ; CHECK-NOV-NEXT:    snez a1, a3
-; CHECK-NOV-NEXT:    and a0, a0, s0
+; CHECK-NOV-NEXT:    and a4, a4, s0
 ; CHECK-NOV-NEXT:    beqz a2, .LBB23_6
 ; CHECK-NOV-NEXT:  .LBB23_8: # %entry
-; CHECK-NOV-NEXT:    sgtz a2, a2
+; CHECK-NOV-NEXT:    sgtz a0, a2
 ; CHECK-NOV-NEXT:  .LBB23_9: # %entry
-; CHECK-NOV-NEXT:    neg a2, a2
-; CHECK-NOV-NEXT:    and a0, a2, a0
+; CHECK-NOV-NEXT:    neg a0, a0
+; CHECK-NOV-NEXT:    and a0, a0, a4
 ; CHECK-NOV-NEXT:    neg a1, a1
 ; CHECK-NOV-NEXT:    and a1, a1, a3
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -2939,15 +2941,15 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
 ; CHECK-V-NEXT:  # %bb.1: # %entry
 ; CHECK-V-NEXT:    li a2, 1
 ; CHECK-V-NEXT:  .LBB23_2: # %entry
-; CHECK-V-NEXT:    slti a3, s1, 1
 ; CHECK-V-NEXT:    slti a4, a1, 1
+; CHECK-V-NEXT:    slti a3, s1, 1
 ; CHECK-V-NEXT:    blez a1, .LBB23_4
 ; CHECK-V-NEXT:  # %bb.3: # %entry
 ; CHECK-V-NEXT:    li a1, 1
 ; CHECK-V-NEXT:  .LBB23_4: # %entry
+; CHECK-V-NEXT:    neg a3, a3
 ; CHECK-V-NEXT:    neg a4, a4
 ; CHECK-V-NEXT:    and a0, a4, a0
-; CHECK-V-NEXT:    neg a3, a3
 ; CHECK-V-NEXT:    beqz a1, .LBB23_7
 ; CHECK-V-NEXT:  # %bb.5: # %entry
 ; CHECK-V-NEXT:    sgtz a1, a1
@@ -3002,8 +3004,8 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset s0, -16
 ; CHECK-NOV-NEXT:    .cfi_offset s1, -24
 ; CHECK-NOV-NEXT:    .cfi_offset s2, -32
-; CHECK-NOV-NEXT:    mv s2, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    mv s2, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    call __fixsfti
 ; CHECK-NOV-NEXT:    mv s0, a0
@@ -3011,58 +3013,60 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    call __fixsfti
-; CHECK-NOV-NEXT:    li a2, -1
-; CHECK-NOV-NEXT:    srli a3, a2, 1
-; CHECK-NOV-NEXT:    beqz s1, .LBB24_3
+; CHECK-NOV-NEXT:    mv a2, a0
+; CHECK-NOV-NEXT:    li a0, -1
+; CHECK-NOV-NEXT:    srli a3, a0, 1
+; CHECK-NOV-NEXT:    beqz a1, .LBB24_3
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    slti a4, s1, 0
-; CHECK-NOV-NEXT:    bnez a1, .LBB24_4
+; CHECK-NOV-NEXT:    slti a4, a1, 0
+; CHECK-NOV-NEXT:    bnez s1, .LBB24_4
 ; CHECK-NOV-NEXT:  .LBB24_2:
-; CHECK-NOV-NEXT:    sltu a5, a0, a3
+; CHECK-NOV-NEXT:    sltu a5, s0, a3
 ; CHECK-NOV-NEXT:    beqz a5, .LBB24_5
 ; CHECK-NOV-NEXT:    j .LBB24_6
 ; CHECK-NOV-NEXT:  .LBB24_3:
-; CHECK-NOV-NEXT:    sltu a4, s0, a3
-; CHECK-NOV-NEXT:    beqz a1, .LBB24_2
+; CHECK-NOV-NEXT:    sltu a4, a2, a3
+; CHECK-NOV-NEXT:    beqz s1, .LBB24_2
 ; CHECK-NOV-NEXT:  .LBB24_4: # %entry
-; CHECK-NOV-NEXT:    slti a5, a1, 0
+; CHECK-NOV-NEXT:    slti a5, s1, 0
 ; CHECK-NOV-NEXT:    bnez a5, .LBB24_6
 ; CHECK-NOV-NEXT:  .LBB24_5: # %entry
-; CHECK-NOV-NEXT:    mv a0, a3
+; CHECK-NOV-NEXT:    mv s0, a3
 ; CHECK-NOV-NEXT:  .LBB24_6: # %entry
 ; CHECK-NOV-NEXT:    neg a6, a5
 ; CHECK-NOV-NEXT:    neg a5, a4
-; CHECK-NOV-NEXT:    and a5, a5, s1
+; CHECK-NOV-NEXT:    and a5, a5, a1
 ; CHECK-NOV-NEXT:    bnez a4, .LBB24_8
 ; CHECK-NOV-NEXT:  # %bb.7: # %entry
-; CHECK-NOV-NEXT:    mv s0, a3
+; CHECK-NOV-NEXT:    mv a2, a3
 ; CHECK-NOV-NEXT:  .LBB24_8: # %entry
-; CHECK-NOV-NEXT:    and a4, a6, a1
-; CHECK-NOV-NEXT:    slli a1, a2, 63
-; CHECK-NOV-NEXT:    beq a5, a2, .LBB24_11
+; CHECK-NOV-NEXT:    and a4, a6, s1
+; CHECK-NOV-NEXT:    slli a1, a0, 63
+; CHECK-NOV-NEXT:    beq a5, a0, .LBB24_11
 ; CHECK-NOV-NEXT:  # %bb.9: # %entry
 ; CHECK-NOV-NEXT:    slti a3, a5, 0
 ; CHECK-NOV-NEXT:    xori a3, a3, 1
-; CHECK-NOV-NEXT:    bne a4, a2, .LBB24_12
+; CHECK-NOV-NEXT:    bne a4, a0, .LBB24_12
 ; CHECK-NOV-NEXT:  .LBB24_10:
-; CHECK-NOV-NEXT:    sltu a2, a1, a0
-; CHECK-NOV-NEXT:    beqz a2, .LBB24_13
+; CHECK-NOV-NEXT:    sltu a0, a1, s0
+; CHECK-NOV-NEXT:    beqz a0, .LBB24_13
 ; CHECK-NOV-NEXT:    j .LBB24_14
 ; CHECK-NOV-NEXT:  .LBB24_11:
-; CHECK-NOV-NEXT:    sltu a3, a1, s0
-; CHECK-NOV-NEXT:    beq a4, a2, .LBB24_10
+; CHECK-NOV-NEXT:    sltu a3, a1, a2
+; CHECK-NOV-NEXT:    beq a4, a0, .LBB24_10
 ; CHECK-NOV-NEXT:  .LBB24_12: # %entry
-; CHECK-NOV-NEXT:    slti a2, a4, 0
-; CHECK-NOV-NEXT:    xori a2, a2, 1
-; CHECK-NOV-NEXT:    bnez a2, .LBB24_14
+; CHECK-NOV-NEXT:    slti a0, a4, 0
+; CHECK-NOV-NEXT:    xori a0, a0, 1
+; CHECK-NOV-NEXT:    bnez a0, .LBB24_14
 ; CHECK-NOV-NEXT:  .LBB24_13: # %entry
-; CHECK-NOV-NEXT:    mv a0, a1
+; CHECK-NOV-NEXT:    mv s0, a1
 ; CHECK-NOV-NEXT:  .LBB24_14: # %entry
 ; CHECK-NOV-NEXT:    bnez a3, .LBB24_16
 ; CHECK-NOV-NEXT:  # %bb.15: # %entry
-; CHECK-NOV-NEXT:    mv s0, a1
+; CHECK-NOV-NEXT:    mv a2, a1
 ; CHECK-NOV-NEXT:  .LBB24_16: # %entry
-; CHECK-NOV-NEXT:    mv a1, s0
+; CHECK-NOV-NEXT:    mv a0, s0
+; CHECK-NOV-NEXT:    mv a1, a2
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -3082,8 +3086,8 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s0, -16
 ; CHECK-V-NEXT:    .cfi_offset s1, -24
 ; CHECK-V-NEXT:    .cfi_offset s2, -32
-; CHECK-V-NEXT:    mv s2, a0
-; CHECK-V-NEXT:    fmv.w.x fa0, a1
+; CHECK-V-NEXT:    mv s2, a1
+; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    call __fixsfti
 ; CHECK-V-NEXT:    mv s0, a0
@@ -3093,31 +3097,31 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-V-NEXT:    call __fixsfti
 ; CHECK-V-NEXT:    li a2, -1
 ; CHECK-V-NEXT:    srli a3, a2, 1
-; CHECK-V-NEXT:    beqz s1, .LBB24_3
+; CHECK-V-NEXT:    beqz a1, .LBB24_3
 ; CHECK-V-NEXT:  # %bb.1: # %entry
-; CHECK-V-NEXT:    slti a4, s1, 0
-; CHECK-V-NEXT:    bnez a1, .LBB24_4
+; CHECK-V-NEXT:    slti a4, a1, 0
+; CHECK-V-NEXT:    bnez s1, .LBB24_4
 ; CHECK-V-NEXT:  .LBB24_2:
-; CHECK-V-NEXT:    sltu a5, a0, a3
+; CHECK-V-NEXT:    sltu a5, s0, a3
 ; CHECK-V-NEXT:    beqz a5, .LBB24_5
 ; CHECK-V-NEXT:    j .LBB24_6
 ; CHECK-V-NEXT:  .LBB24_3:
-; CHECK-V-NEXT:    sltu a4, s0, a3
-; CHECK-V-NEXT:    beqz a1, .LBB24_2
+; CHECK-V-NEXT:    sltu a4, a0, a3
+; CHECK-V-NEXT:    beqz s1, .LBB24_2
 ; CHECK-V-NEXT:  .LBB24_4: # %entry
-; CHECK-V-NEXT:    slti a5, a1, 0
+; CHECK-V-NEXT:    slti a5, s1, 0
 ; CHECK-V-NEXT:    bnez a5, .LBB24_6
 ; CHECK-V-NEXT:  .LBB24_5: # %entry
-; CHECK-V-NEXT:    mv a0, a3
+; CHECK-V-NEXT:    mv s0, a3
 ; CHECK-V-NEXT:  .LBB24_6: # %entry
 ; CHECK-V-NEXT:    neg a6, a5
 ; CHECK-V-NEXT:    neg a5, a4
-; CHECK-V-NEXT:    and a5, a5, s1
+; CHECK-V-NEXT:    and a5, a5, a1
 ; CHECK-V-NEXT:    bnez a4, .LBB24_8
 ; CHECK-V-NEXT:  # %bb.7: # %entry
-; CHECK-V-NEXT:    mv s0, a3
+; CHECK-V-NEXT:    mv a0, a3
 ; CHECK-V-NEXT:  .LBB24_8: # %entry
-; CHECK-V-NEXT:    and a4, a6, a1
+; CHECK-V-NEXT:    and a4, a6, s1
 ; CHECK-V-NEXT:    slli a1, a2, 63
 ; CHECK-V-NEXT:    beq a5, a2, .LBB24_11
 ; CHECK-V-NEXT:  # %bb.9: # %entry
@@ -3125,26 +3129,26 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-V-NEXT:    xori a3, a3, 1
 ; CHECK-V-NEXT:    bne a4, a2, .LBB24_12
 ; CHECK-V-NEXT:  .LBB24_10:
-; CHECK-V-NEXT:    sltu a2, a1, a0
+; CHECK-V-NEXT:    sltu a2, a1, s0
 ; CHECK-V-NEXT:    beqz a2, .LBB24_13
 ; CHECK-V-NEXT:    j .LBB24_14
 ; CHECK-V-NEXT:  .LBB24_11:
-; CHECK-V-NEXT:    sltu a3, a1, s0
+; CHECK-V-NEXT:    sltu a3, a1, a0
 ; CHECK-V-NEXT:    beq a4, a2, .LBB24_10
 ; CHECK-V-NEXT:  .LBB24_12: # %entry
 ; CHECK-V-NEXT:    slti a2, a4, 0
 ; CHECK-V-NEXT:    xori a2, a2, 1
 ; CHECK-V-NEXT:    bnez a2, .LBB24_14
 ; CHECK-V-NEXT:  .LBB24_13: # %entry
-; CHECK-V-NEXT:    mv a0, a1
+; CHECK-V-NEXT:    mv s0, a1
 ; CHECK-V-NEXT:  .LBB24_14: # %entry
 ; CHECK-V-NEXT:    bnez a3, .LBB24_16
 ; CHECK-V-NEXT:  # %bb.15: # %entry
-; CHECK-V-NEXT:    mv s0, a1
+; CHECK-V-NEXT:    mv a0, a1
 ; CHECK-V-NEXT:  .LBB24_16: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v9, s0
-; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    vmv.s.x v9, a0
+; CHECK-V-NEXT:    vmv.s.x v8, s0
 ; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -3175,8 +3179,8 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    .cfi_offset s0, -16
 ; CHECK-NOV-NEXT:    .cfi_offset s1, -24
 ; CHECK-NOV-NEXT:    .cfi_offset s2, -32
-; CHECK-NOV-NEXT:    mv s0, a0
-; CHECK-NOV-NEXT:    fmv.w.x fa0, a1
+; CHECK-NOV-NEXT:    mv s0, a1
+; CHECK-NOV-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    call __fixunssfti
 ; CHECK-NOV-NEXT:    mv s1, a0
@@ -3184,12 +3188,13 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s0
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    call __fixunssfti
-; CHECK-NOV-NEXT:    snez a2, s2
 ; CHECK-NOV-NEXT:    snez a1, a1
+; CHECK-NOV-NEXT:    snez a2, s2
+; CHECK-NOV-NEXT:    addi a2, a2, -1
+; CHECK-NOV-NEXT:    and a2, a2, s1
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
-; CHECK-NOV-NEXT:    and a0, a1, a0
-; CHECK-NOV-NEXT:    addi a1, a2, -1
-; CHECK-NOV-NEXT:    and a1, a1, s1
+; CHECK-NOV-NEXT:    and a1, a1, a0
+; CHECK-NOV-NEXT:    mv a0, a2
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -3209,8 +3214,8 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s0, -16
 ; CHECK-V-NEXT:    .cfi_offset s1, -24
 ; CHECK-V-NEXT:    .cfi_offset s2, -32
-; CHECK-V-NEXT:    mv s0, a0
-; CHECK-V-NEXT:    fmv.w.x fa0, a1
+; CHECK-V-NEXT:    mv s0, a1
+; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    call __fixunssfti
 ; CHECK-V-NEXT:    mv s1, a0
@@ -3218,15 +3223,15 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    call __fixunssfti
-; CHECK-V-NEXT:    snez a2, s2
 ; CHECK-V-NEXT:    snez a1, a1
-; CHECK-V-NEXT:    addi a1, a1, -1
-; CHECK-V-NEXT:    and a0, a1, a0
+; CHECK-V-NEXT:    snez a2, s2
 ; CHECK-V-NEXT:    addi a2, a2, -1
 ; CHECK-V-NEXT:    and a2, a2, s1
+; CHECK-V-NEXT:    addi a1, a1, -1
+; CHECK-V-NEXT:    and a0, a1, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v9, a2
-; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    vmv.s.x v9, a0
+; CHECK-V-NEXT:    vmv.s.x v8, a2
 ; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -3269,32 +3274,32 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) {
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    li a2, 1
 ; CHECK-NOV-NEXT:  .LBB26_2: # %entry
-; CHECK-NOV-NEXT:    slti a4, s1, 1
 ; CHECK-NOV-NEXT:    slti a3, a1, 1
+; CHECK-NOV-NEXT:    slti a4, s1, 1
 ; CHECK-NOV-NEXT:    blez a1, .LBB26_4
 ; CHECK-NOV-NEXT:  # %bb.3: # %entry
 ; CHECK-NOV-NEXT:    li a1, 1
 ; CHECK-NOV-NEXT:  .LBB26_4: # %entry
+; CHECK-NOV-NEXT:    neg a4, a4
 ; CHECK-NOV-NEXT:    neg a3, a3
 ; CHECK-NOV-NEXT:    and a3, a3, a0
-; CHECK-NOV-NEXT:    neg a0, a4
 ; CHECK-NOV-NEXT:    beqz a1, .LBB26_7
 ; CHECK-NOV-NEXT:  # %bb.5: # %entry
 ; CHECK-NOV-NEXT:    sgtz a1, a1
-; CHECK-NOV-NEXT:    and a0, a0, s0
+; CHECK-NOV-NEXT:    and a4, a4, s0
 ; CHECK-NOV-NEXT:    bnez a2, .LBB26_8
 ; CHECK-NOV-NEXT:  .LBB26_6:
-; CHECK-NOV-NEXT:    snez a2, a0
+; CHECK-NOV-NEXT:    snez a0, a4
 ; CHECK-NOV-NEXT:    j .LBB26_9
 ; CHECK-NOV-NEXT:  .LBB26_7:
 ; CHECK-NOV-NEXT:    snez a1, a3
-; CHECK-NOV-NEXT:    and a0, a0, s0
+; CHECK-NOV-NEXT:    and a4, a4, s0
 ; CHECK-NOV-NEXT:    beqz a2, .LBB26_6
 ; CHECK-NOV-NEXT:  .LBB26_8: # %entry
-; CHECK-NOV-NEXT:    sgtz a2, a2
+; CHECK-NOV-NEXT:    sgtz a0, a2
 ; CHECK-NOV-NEXT:  .LBB26_9: # %entry
-; CHECK-NOV-NEXT:    neg a2, a2
-; CHECK-NOV-NEXT:    and a0, a2, a0
+; CHECK-NOV-NEXT:    neg a0, a0
+; CHECK-NOV-NEXT:    and a0, a0, a4
 ; CHECK-NOV-NEXT:    neg a1, a1
 ; CHECK-NOV-NEXT:    and a1, a1, a3
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -3330,15 +3335,15 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) {
 ; CHECK-V-NEXT:  # %bb.1: # %entry
 ; CHECK-V-NEXT:    li a2, 1
 ; CHECK-V-NEXT:  .LBB26_2: # %entry
-; CHECK-V-NEXT:    slti a3, s1, 1
 ; CHECK-V-NEXT:    slti a4, a1, 1
+; CHECK-V-NEXT:    slti a3, s1, 1
 ; CHECK-V-NEXT:    blez a1, .LBB26_4
 ; CHECK-V-NEXT:  # %bb.3: # %entry
 ; CHECK-V-NEXT:    li a1, 1
 ; CHECK-V-NEXT:  .LBB26_4: # %entry
+; CHECK-V-NEXT:    neg a3, a3
 ; CHECK-V-NEXT:    neg a4, a4
 ; CHECK-V-NEXT:    and a0, a4, a0
-; CHECK-V-NEXT:    neg a3, a3
 ; CHECK-V-NEXT:    beqz a1, .LBB26_7
 ; CHECK-V-NEXT:  # %bb.5: # %entry
 ; CHECK-V-NEXT:    sgtz a1, a1
@@ -5811,15 +5816,15 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    mv s1, a1
 ; CHECK-NOV-NEXT:    fmv.d fa0, fs0
 ; CHECK-NOV-NEXT:    call __fixdfti
-; CHECK-NOV-NEXT:    mv a2, s1
-; CHECK-NOV-NEXT:    mv a3, a1
+; CHECK-NOV-NEXT:    mv a2, a1
 ; CHECK-NOV-NEXT:    blez a1, .LBB47_2
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    li a3, 1
+; CHECK-NOV-NEXT:    li a2, 1
 ; CHECK-NOV-NEXT:  .LBB47_2: # %entry
-; CHECK-NOV-NEXT:    blez a2, .LBB47_4
+; CHECK-NOV-NEXT:    mv a3, s1
+; CHECK-NOV-NEXT:    blez s1, .LBB47_4
 ; CHECK-NOV-NEXT:  # %bb.3: # %entry
-; CHECK-NOV-NEXT:    li a2, 1
+; CHECK-NOV-NEXT:    li a3, 1
 ; CHECK-NOV-NEXT:  .LBB47_4: # %entry
 ; CHECK-NOV-NEXT:    slti a1, a1, 1
 ; CHECK-NOV-NEXT:    neg a1, a1
@@ -5827,11 +5832,11 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    slti a0, s1, 1
 ; CHECK-NOV-NEXT:    neg a0, a0
 ; CHECK-NOV-NEXT:    and a0, a0, s0
+; CHECK-NOV-NEXT:    slti a3, a3, 0
+; CHECK-NOV-NEXT:    addi a3, a3, -1
+; CHECK-NOV-NEXT:    and a0, a3, a0
 ; CHECK-NOV-NEXT:    slti a2, a2, 0
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
-; CHECK-NOV-NEXT:    and a0, a2, a0
-; CHECK-NOV-NEXT:    slti a2, a3, 0
-; CHECK-NOV-NEXT:    addi a2, a2, -1
 ; CHECK-NOV-NEXT:    and a1, a2, a1
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -5867,15 +5872,15 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
 ; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixdfti
-; CHECK-V-NEXT:    mv a2, s1
-; CHECK-V-NEXT:    mv a3, a1
+; CHECK-V-NEXT:    mv a2, a1
 ; CHECK-V-NEXT:    blez a1, .LBB47_2
 ; CHECK-V-NEXT:  # %bb.1: # %entry
-; CHECK-V-NEXT:    li a3, 1
+; CHECK-V-NEXT:    li a2, 1
 ; CHECK-V-NEXT:  .LBB47_2: # %entry
-; CHECK-V-NEXT:    blez a2, .LBB47_4
+; CHECK-V-NEXT:    mv a3, s1
+; CHECK-V-NEXT:    blez s1, .LBB47_4
 ; CHECK-V-NEXT:  # %bb.3: # %entry
-; CHECK-V-NEXT:    li a2, 1
+; CHECK-V-NEXT:    li a3, 1
 ; CHECK-V-NEXT:  .LBB47_4: # %entry
 ; CHECK-V-NEXT:    slti a1, a1, 1
 ; CHECK-V-NEXT:    neg a1, a1
@@ -5883,11 +5888,11 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
 ; CHECK-V-NEXT:    slti a1, s1, 1
 ; CHECK-V-NEXT:    neg a1, a1
 ; CHECK-V-NEXT:    and a1, a1, s0
+; CHECK-V-NEXT:    slti a3, a3, 0
+; CHECK-V-NEXT:    addi a3, a3, -1
+; CHECK-V-NEXT:    and a1, a3, a1
 ; CHECK-V-NEXT:    slti a2, a2, 0
 ; CHECK-V-NEXT:    addi a2, a2, -1
-; CHECK-V-NEXT:    and a1, a2, a1
-; CHECK-V-NEXT:    slti a2, a3, 0
-; CHECK-V-NEXT:    addi a2, a2, -1
 ; CHECK-V-NEXT:    and a0, a2, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
@@ -6197,15 +6202,15 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    mv s1, a1
 ; CHECK-NOV-NEXT:    fmv.s fa0, fs0
 ; CHECK-NOV-NEXT:    call __fixsfti
-; CHECK-NOV-NEXT:    mv a2, s1
-; CHECK-NOV-NEXT:    mv a3, a1
+; CHECK-NOV-NEXT:    mv a2, a1
 ; CHECK-NOV-NEXT:    blez a1, .LBB50_2
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    li a3, 1
+; CHECK-NOV-NEXT:    li a2, 1
 ; CHECK-NOV-NEXT:  .LBB50_2: # %entry
-; CHECK-NOV-NEXT:    blez a2, .LBB50_4
+; CHECK-NOV-NEXT:    mv a3, s1
+; CHECK-NOV-NEXT:    blez s1, .LBB50_4
 ; CHECK-NOV-NEXT:  # %bb.3: # %entry
-; CHECK-NOV-NEXT:    li a2, 1
+; CHECK-NOV-NEXT:    li a3, 1
 ; CHECK-NOV-NEXT:  .LBB50_4: # %entry
 ; CHECK-NOV-NEXT:    slti a1, a1, 1
 ; CHECK-NOV-NEXT:    neg a1, a1
@@ -6213,11 +6218,11 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    slti a0, s1, 1
 ; CHECK-NOV-NEXT:    neg a0, a0
 ; CHECK-NOV-NEXT:    and a0, a0, s0
+; CHECK-NOV-NEXT:    slti a3, a3, 0
+; CHECK-NOV-NEXT:    addi a3, a3, -1
+; CHECK-NOV-NEXT:    and a0, a3, a0
 ; CHECK-NOV-NEXT:    slti a2, a2, 0
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
-; CHECK-NOV-NEXT:    and a0, a2, a0
-; CHECK-NOV-NEXT:    slti a2, a3, 0
-; CHECK-NOV-NEXT:    addi a2, a2, -1
 ; CHECK-NOV-NEXT:    and a1, a2, a1
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -6253,15 +6258,15 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
 ; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixsfti
-; CHECK-V-NEXT:    mv a2, s1
-; CHECK-V-NEXT:    mv a3, a1
+; CHECK-V-NEXT:    mv a2, a1
 ; CHECK-V-NEXT:    blez a1, .LBB50_2
 ; CHECK-V-NEXT:  # %bb.1: # %entry
-; CHECK-V-NEXT:    li a3, 1
+; CHECK-V-NEXT:    li a2, 1
 ; CHECK-V-NEXT:  .LBB50_2: # %entry
-; CHECK-V-NEXT:    blez a2, .LBB50_4
+; CHECK-V-NEXT:    mv a3, s1
+; CHECK-V-NEXT:    blez s1, .LBB50_4
 ; CHECK-V-NEXT:  # %bb.3: # %entry
-; CHECK-V-NEXT:    li a2, 1
+; CHECK-V-NEXT:    li a3, 1
 ; CHECK-V-NEXT:  .LBB50_4: # %entry
 ; CHECK-V-NEXT:    slti a1, a1, 1
 ; CHECK-V-NEXT:    neg a1, a1
@@ -6269,11 +6274,11 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
 ; CHECK-V-NEXT:    slti a1, s1, 1
 ; CHECK-V-NEXT:    neg a1, a1
 ; CHECK-V-NEXT:    and a1, a1, s0
+; CHECK-V-NEXT:    slti a3, a3, 0
+; CHECK-V-NEXT:    addi a3, a3, -1
+; CHECK-V-NEXT:    and a1, a3, a1
 ; CHECK-V-NEXT:    slti a2, a2, 0
 ; CHECK-V-NEXT:    addi a2, a2, -1
-; CHECK-V-NEXT:    and a1, a2, a1
-; CHECK-V-NEXT:    slti a2, a3, 0
-; CHECK-V-NEXT:    addi a2, a2, -1
 ; CHECK-V-NEXT:    and a0, a2, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
@@ -6575,15 +6580,15 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    call __fixsfti
-; CHECK-NOV-NEXT:    mv a2, s1
-; CHECK-NOV-NEXT:    mv a3, a1
+; CHECK-NOV-NEXT:    mv a2, a1
 ; CHECK-NOV-NEXT:    blez a1, .LBB53_2
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
-; CHECK-NOV-NEXT:    li a3, 1
+; CHECK-NOV-NEXT:    li a2, 1
 ; CHECK-NOV-NEXT:  .LBB53_2: # %entry
-; CHECK-NOV-NEXT:    blez a2, .LBB53_4
+; CHECK-NOV-NEXT:    mv a3, s1
+; CHECK-NOV-NEXT:    blez s1, .LBB53_4
 ; CHECK-NOV-NEXT:  # %bb.3: # %entry
-; CHECK-NOV-NEXT:    li a2, 1
+; CHECK-NOV-NEXT:    li a3, 1
 ; CHECK-NOV-NEXT:  .LBB53_4: # %entry
 ; CHECK-NOV-NEXT:    slti a1, a1, 1
 ; CHECK-NOV-NEXT:    neg a1, a1
@@ -6591,11 +6596,11 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    slti a0, s1, 1
 ; CHECK-NOV-NEXT:    neg a0, a0
 ; CHECK-NOV-NEXT:    and a0, a0, s0
+; CHECK-NOV-NEXT:    slti a3, a3, 0
+; CHECK-NOV-NEXT:    addi a3, a3, -1
+; CHECK-NOV-NEXT:    and a0, a3, a0
 ; CHECK-NOV-NEXT:    slti a2, a2, 0
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
-; CHECK-NOV-NEXT:    and a0, a2, a0
-; CHECK-NOV-NEXT:    slti a2, a3, 0
-; CHECK-NOV-NEXT:    addi a2, a2, -1
 ; CHECK-NOV-NEXT:    and a1, a2, a1
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -6625,15 +6630,15 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    call __fixsfti
-; CHECK-V-NEXT:    mv a2, s1
-; CHECK-V-NEXT:    mv a3, a1
+; CHECK-V-NEXT:    mv a2, a1
 ; CHECK-V-NEXT:    blez a1, .LBB53_2
 ; CHECK-V-NEXT:  # %bb.1: # %entry
-; CHECK-V-NEXT:    li a3, 1
+; CHECK-V-NEXT:    li a2, 1
 ; CHECK-V-NEXT:  .LBB53_2: # %entry
-; CHECK-V-NEXT:    blez a2, .LBB53_4
+; CHECK-V-NEXT:    mv a3, s1
+; CHECK-V-NEXT:    blez s1, .LBB53_4
 ; CHECK-V-NEXT:  # %bb.3: # %entry
-; CHECK-V-NEXT:    li a2, 1
+; CHECK-V-NEXT:    li a3, 1
 ; CHECK-V-NEXT:  .LBB53_4: # %entry
 ; CHECK-V-NEXT:    slti a1, a1, 1
 ; CHECK-V-NEXT:    neg a1, a1
@@ -6641,11 +6646,11 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-V-NEXT:    slti a1, s1, 1
 ; CHECK-V-NEXT:    neg a1, a1
 ; CHECK-V-NEXT:    and a1, a1, s0
+; CHECK-V-NEXT:    slti a3, a3, 0
+; CHECK-V-NEXT:    addi a3, a3, -1
+; CHECK-V-NEXT:    and a1, a3, a1
 ; CHECK-V-NEXT:    slti a2, a2, 0
 ; CHECK-V-NEXT:    addi a2, a2, -1
-; CHECK-V-NEXT:    and a1, a2, a1
-; CHECK-V-NEXT:    slti a2, a3, 0
-; CHECK-V-NEXT:    addi a2, a2, -1
 ; CHECK-V-NEXT:    and a0, a2, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v9, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
index 4e08f401ca4e..96094eea631b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
@@ -104,7 +104,7 @@ define <vscale x 2 x i1> @reverse_nxv2i1(<vscale x 2 x i1> %a) {
 ; RV64-BITS-512-NEXT:    vand.vi v8, v10, 1
 ; RV64-BITS-512-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-BITS-512-NEXT:    ret
-  %res = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %a)
+  %res = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %a)
   ret <vscale x 2 x i1> %res
 }
 
@@ -202,7 +202,7 @@ define <vscale x 4 x i1> @reverse_nxv4i1(<vscale x 4 x i1> %a) {
 ; RV64-BITS-512-NEXT:    vand.vi v8, v10, 1
 ; RV64-BITS-512-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-BITS-512-NEXT:    ret
-  %res = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %res = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
   ret <vscale x 4 x i1> %res
 }
 
@@ -294,7 +294,7 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) {
 ; RV64-BITS-512-NEXT:    vand.vi v8, v10, 1
 ; RV64-BITS-512-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-BITS-512-NEXT:    ret
-  %res = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %a)
+  %res = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %a)
   ret <vscale x 8 x i1> %res
 }
 
@@ -392,7 +392,7 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) {
 ; RV64-BITS-512-NEXT:    vand.vi v8, v12, 1
 ; RV64-BITS-512-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-BITS-512-NEXT:    ret
-  %res = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
+  %res = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
   ret <vscale x 16 x i1> %res
 }
 
@@ -490,7 +490,7 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
 ; RV64-BITS-512-NEXT:    vand.vi v8, v16, 1
 ; RV64-BITS-512-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-BITS-512-NEXT:    ret
-  %res = call <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
+  %res = call <vscale x 32 x i1> @llvm.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
   ret <vscale x 32 x i1> %res
 }
 
@@ -600,7 +600,7 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV64-BITS-512-NEXT:    vand.vi v8, v24, 1
 ; RV64-BITS-512-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-BITS-512-NEXT:    ret
-  %res = call <vscale x 64 x i1> @llvm.experimental.vector.reverse.nxv64i1(<vscale x 64 x i1> %a)
+  %res = call <vscale x 64 x i1> @llvm.vector.reverse.nxv64i1(<vscale x 64 x i1> %a)
   ret <vscale x 64 x i1> %res
 }
 
@@ -682,7 +682,7 @@ define <vscale x 1 x i8> @reverse_nxv1i8(<vscale x 1 x i8> %a) {
 ; RV64-BITS-512-NEXT:    vrgather.vv v9, v8, v10
 ; RV64-BITS-512-NEXT:    vmv1r.v v8, v9
 ; RV64-BITS-512-NEXT:    ret
-  %res = call <vscale x 1 x i8> @llvm.experimental.vector.reverse.nxv1i8(<vscale x 1 x i8> %a)
+  %res = call <vscale x 1 x i8> @llvm.vector.reverse.nxv1i8(<vscale x 1 x i8> %a)
   ret <vscale x 1 x i8> %res
 }
 
@@ -760,7 +760,7 @@ define <vscale x 2 x i8> @reverse_nxv2i8(<vscale x 2 x i8> %a) {
 ; RV64-BITS-512-NEXT:    vrgather.vv v9, v8, v10
 ; RV64-BITS-512-NEXT:    vmv1r.v v8, v9
 ; RV64-BITS-512-NEXT:    ret
-  %res = call <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8> %a)
+  %res = call <vscale x 2 x i8> @llvm.vector.reverse.nxv2i8(<vscale x 2 x i8> %a)
   ret <vscale x 2 x i8> %res
 }
 
@@ -838,7 +838,7 @@ define <vscale x 4 x i8> @reverse_nxv4i8(<vscale x 4 x i8> %a) {
 ; RV64-BITS-512-NEXT:    vrgather.vv v9, v8, v10
 ; RV64-BITS-512-NEXT:    vmv1r.v v8, v9
 ; RV64-BITS-512-NEXT:    ret
-  %res = call <vscale x 4 x i8> @llvm.experimental.vector.reverse.nxv4i8(<vscale x 4 x i8> %a)
+  %res = call <vscale x 4 x i8> @llvm.vector.reverse.nxv4i8(<vscale x 4 x i8> %a)
   ret <vscale x 4 x i8> %res
 }
 
@@ -910,7 +910,7 @@ define <vscale x 8 x i8> @reverse_nxv8i8(<vscale x 8 x i8> %a) {
 ; RV64-BITS-512-NEXT:    vrgather.vv v9, v8, v10
 ; RV64-BITS-512-NEXT:    vmv.v.v v8, v9
 ; RV64-BITS-512-NEXT:    ret
-  %res = call <vscale x 8 x i8> @llvm.experimental.vector.reverse.nxv8i8(<vscale x 8 x i8> %a)
+  %res = call <vscale x 8 x i8> @llvm.vector.reverse.nxv8i8(<vscale x 8 x i8> %a)
   ret <vscale x 8 x i8> %res
 }
 
@@ -988,7 +988,7 @@ define <vscale x 16 x i8> @reverse_nxv16i8(<vscale x 16 x i8> %a) {
 ; RV64-BITS-512-NEXT:    vrgather.vv v10, v8, v12
 ; RV64-BITS-512-NEXT:    vmv.v.v v8, v10
 ; RV64-BITS-512-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
+  %res = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
   ret <vscale x 16 x i8> %res
 }
 
@@ -1066,7 +1066,7 @@ define <vscale x 32 x i8> @reverse_nxv32i8(<vscale x 32 x i8> %a) {
 ; RV64-BITS-512-NEXT:    vrgather.vv v12, v8, v16
 ; RV64-BITS-512-NEXT:    vmv.v.v v8, v12
 ; RV64-BITS-512-NEXT:    ret
-  %res = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> %a)
+  %res = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> %a)
   ret <vscale x 32 x i8> %res
 }
 
@@ -1148,7 +1148,7 @@ define <vscale x 64 x i8> @reverse_nxv64i8(<vscale x 64 x i8> %a) {
 ; RV64-BITS-512-NEXT:    vrgather.vv v16, v12, v24
 ; RV64-BITS-512-NEXT:    vmv8r.v v8, v16
 ; RV64-BITS-512-NEXT:    ret
-  %res = call <vscale x 64 x i8> @llvm.experimental.vector.reverse.nxv64i8(<vscale x 64 x i8> %a)
+  %res = call <vscale x 64 x i8> @llvm.vector.reverse.nxv64i8(<vscale x 64 x i8> %a)
   ret <vscale x 64 x i8> %res
 }
 
@@ -1164,7 +1164,7 @@ define <vscale x 1 x i16> @reverse_nxv1i16(<vscale x 1 x i16> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i16> @llvm.experimental.vector.reverse.nxv1i16(<vscale x 1 x i16> %a)
+  %res = call <vscale x 1 x i16> @llvm.vector.reverse.nxv1i16(<vscale x 1 x i16> %a)
   ret <vscale x 1 x i16> %res
 }
 
@@ -1180,7 +1180,7 @@ define <vscale x 2 x i16> @reverse_nxv2i16(<vscale x 2 x i16> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> %a)
+  %res = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> %a)
   ret <vscale x 2 x i16> %res
 }
 
@@ -1196,7 +1196,7 @@ define <vscale x 4 x i16> @reverse_nxv4i16(<vscale x 4 x i16> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> %a)
+  %res = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> %a)
   ret <vscale x 4 x i16> %res
 }
 
@@ -1211,7 +1211,7 @@ define <vscale x 8 x i16> @reverse_nxv8i16(<vscale x 8 x i16> %a) {
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> %a)
+  %res = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> %a)
   ret <vscale x 8 x i16> %res
 }
 
@@ -1227,7 +1227,7 @@ define <vscale x 16 x i16> @reverse_nxv16i16(<vscale x 16 x i16> %a) {
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> %a)
+  %res = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> %a)
   ret <vscale x 16 x i16> %res
 }
 
@@ -1243,7 +1243,7 @@ define <vscale x 32 x i16> @reverse_nxv32i16(<vscale x 32 x i16> %a) {
 ; CHECK-NEXT:    vrgather.vv v16, v8, v24
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x i16> @llvm.experimental.vector.reverse.nxv32i16(<vscale x 32 x i16> %a)
+  %res = call <vscale x 32 x i16> @llvm.vector.reverse.nxv32i16(<vscale x 32 x i16> %a)
   ret <vscale x 32 x i16> %res
 }
 
@@ -1259,7 +1259,7 @@ define <vscale x 1 x i32> @reverse_nxv1i32(<vscale x 1 x i32> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i32> @llvm.experimental.vector.reverse.nxv1i32(<vscale x 1 x i32> %a)
+  %res = call <vscale x 1 x i32> @llvm.vector.reverse.nxv1i32(<vscale x 1 x i32> %a)
   ret <vscale x 1 x i32> %res
 }
 
@@ -1275,7 +1275,7 @@ define <vscale x 2 x i32> @reverse_nxv2i32(<vscale x 2 x i32> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i32> @llvm.experimental.vector.reverse.nxv2i32(<vscale x 2 x i32> %a)
+  %res = call <vscale x 2 x i32> @llvm.vector.reverse.nxv2i32(<vscale x 2 x i32> %a)
   ret <vscale x 2 x i32> %res
 }
 
@@ -1291,7 +1291,7 @@ define <vscale x 4 x i32> @reverse_nxv4i32(<vscale x 4 x i32> %a) {
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %res = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
   ret <vscale x 4 x i32> %res
 }
 
@@ -1306,7 +1306,7 @@ define <vscale x 8 x i32> @reverse_nxv8i32(<vscale x 8 x i32> %a) {
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
+  %res = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
   ret <vscale x 8 x i32> %res
 }
 
@@ -1322,7 +1322,7 @@ define <vscale x 16 x i32> @reverse_nxv16i32(<vscale x 16 x i32> %a) {
 ; CHECK-NEXT:    vrgather.vv v16, v8, v24
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i32> @llvm.experimental.vector.reverse.nxv16i32(<vscale x 16 x i32> %a)
+  %res = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> %a)
   ret <vscale x 16 x i32> %res
 }
 
@@ -1338,7 +1338,7 @@ define <vscale x 1 x i64> @reverse_nxv1i64(<vscale x 1 x i64> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i64> @llvm.experimental.vector.reverse.nxv1i64(<vscale x 1 x i64> %a)
+  %res = call <vscale x 1 x i64> @llvm.vector.reverse.nxv1i64(<vscale x 1 x i64> %a)
   ret <vscale x 1 x i64> %res
 }
 
@@ -1354,7 +1354,7 @@ define <vscale x 2 x i64> @reverse_nxv2i64(<vscale x 2 x i64> %a) {
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
+  %res = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
   ret <vscale x 2 x i64> %res
 }
 
@@ -1370,7 +1370,7 @@ define <vscale x 4 x i64> @reverse_nxv4i64(<vscale x 4 x i64> %a) {
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> %a)
+  %res = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> %a)
   ret <vscale x 4 x i64> %res
 }
 
@@ -1385,7 +1385,7 @@ define <vscale x 8 x i64> @reverse_nxv8i64(<vscale x 8 x i64> %a) {
 ; CHECK-NEXT:    vrgather.vv v16, v8, v24
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> %a)
+  %res = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> %a)
   ret <vscale x 8 x i64> %res
 }
 
@@ -1405,7 +1405,7 @@ define <vscale x 1 x half> @reverse_nxv1f16(<vscale x 1 x half> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x half> @llvm.experimental.vector.reverse.nxv1f16(<vscale x 1 x half> %a)
+  %res = call <vscale x 1 x half> @llvm.vector.reverse.nxv1f16(<vscale x 1 x half> %a)
   ret <vscale x 1 x half> %res
 }
 
@@ -1421,7 +1421,7 @@ define <vscale x 2 x half> @reverse_nxv2f16(<vscale x 2 x half> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> %a)
+  %res = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> %a)
   ret <vscale x 2 x half> %res
 }
 
@@ -1437,7 +1437,7 @@ define <vscale x 4 x half> @reverse_nxv4f16(<vscale x 4 x half> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> %a)
+  %res = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> %a)
   ret <vscale x 4 x half> %res
 }
 
@@ -1452,7 +1452,7 @@ define <vscale x 8 x half> @reverse_nxv8f16(<vscale x 8 x half> %a) {
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> %a)
+  %res = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> %a)
   ret <vscale x 8 x half> %res
 }
 
@@ -1468,7 +1468,7 @@ define <vscale x 16 x half> @reverse_nxv16f16(<vscale x 16 x half> %a) {
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half> %a)
+  %res = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> %a)
   ret <vscale x 16 x half> %res
 }
 
@@ -1484,7 +1484,7 @@ define <vscale x 32 x half> @reverse_nxv32f16(<vscale x 32 x half> %a) {
 ; CHECK-NEXT:    vrgather.vv v16, v8, v24
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x half> @llvm.experimental.vector.reverse.nxv32f16(<vscale x 32 x half> %a)
+  %res = call <vscale x 32 x half> @llvm.vector.reverse.nxv32f16(<vscale x 32 x half> %a)
   ret <vscale x 32 x half> %res
 }
 
@@ -1500,7 +1500,7 @@ define <vscale x 1 x float> @reverse_nxv1f32(<vscale x 1 x float> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x float> @llvm.experimental.vector.reverse.nxv1f32(<vscale x 1 x float> %a)
+  %res = call <vscale x 1 x float> @llvm.vector.reverse.nxv1f32(<vscale x 1 x float> %a)
   ret <vscale x 1 x float> %res
 }
 
@@ -1516,7 +1516,7 @@ define <vscale x 2 x float> @reverse_nxv2f32(<vscale x 2 x float> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> %a)
+  %res = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> %a)
   ret <vscale x 2 x float> %res
 }
 
@@ -1532,7 +1532,7 @@ define <vscale x 4 x float> @reverse_nxv4f32(<vscale x 4 x float> %a) {
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+  %res = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
   ret <vscale x 4 x float> %res
 }
 
@@ -1547,7 +1547,7 @@ define <vscale x 8 x float> @reverse_nxv8f32(<vscale x 8 x float> %a) {
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float> %a)
+  %res = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> %a)
   ret <vscale x 8 x float> %res
 }
 
@@ -1563,7 +1563,7 @@ define <vscale x 16 x float> @reverse_nxv16f32(<vscale x 16 x float> %a) {
 ; CHECK-NEXT:    vrgather.vv v16, v8, v24
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
+  %res = call <vscale x 16 x float> @llvm.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
   ret <vscale x 16 x float> %res
 }
 
@@ -1579,7 +1579,7 @@ define <vscale x 1 x double> @reverse_nxv1f64(<vscale x 1 x double> %a) {
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.v v8, v9
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x double> @llvm.experimental.vector.reverse.nxv1f64(<vscale x 1 x double> %a)
+  %res = call <vscale x 1 x double> @llvm.vector.reverse.nxv1f64(<vscale x 1 x double> %a)
   ret <vscale x 1 x double> %res
 }
 
@@ -1595,7 +1595,7 @@ define <vscale x 2 x double> @reverse_nxv2f64(<vscale x 2 x double> %a) {
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
+  %res = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
   ret <vscale x 2 x double> %res
 }
 
@@ -1611,7 +1611,7 @@ define <vscale x 4 x double> @reverse_nxv4f64(<vscale x 4 x double> %a) {
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> %a)
+  %res = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %a)
   ret <vscale x 4 x double> %res
 }
 
@@ -1626,7 +1626,7 @@ define <vscale x 8 x double> @reverse_nxv8f64(<vscale x 8 x double> %a) {
 ; CHECK-NEXT:    vrgather.vv v16, v8, v24
 ; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %a)
+  %res = call <vscale x 8 x double> @llvm.vector.reverse.nxv8f64(<vscale x 8 x double> %a)
   ret <vscale x 8 x double> %res
 }
 
@@ -1646,7 +1646,7 @@ define <vscale x 3 x i64> @reverse_nxv3i64(<vscale x 3 x i64> %a) {
 ; CHECK-NEXT:    vmv1r.v v9, v18
 ; CHECK-NEXT:    vmv1r.v v10, v19
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 3 x i64> @llvm.experimental.vector.reverse.nxv3i64(<vscale x 3 x i64> %a)
+  %res = call <vscale x 3 x i64> @llvm.vector.reverse.nxv3i64(<vscale x 3 x i64> %a)
   ret <vscale x 3 x i64> %res
 }
 
@@ -1663,7 +1663,7 @@ define <vscale x 6 x i64> @reverse_nxv6i64(<vscale x 6 x i64> %a) {
 ; CHECK-NEXT:    vmv2r.v v10, v28
 ; CHECK-NEXT:    vmv2r.v v12, v30
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 6 x i64> @llvm.experimental.vector.reverse.nxv6i64(<vscale x 6 x i64> %a)
+  %res = call <vscale x 6 x i64> @llvm.vector.reverse.nxv6i64(<vscale x 6 x i64> %a)
   ret <vscale x 6 x i64> %res
 }
 
@@ -1739,53 +1739,53 @@ define <vscale x 12 x i64> @reverse_nxv12i64(<vscale x 12 x i64> %a) {
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    ret
-  %res = call <vscale x 12 x i64> @llvm.experimental.vector.reverse.nxv12i64(<vscale x 12 x i64> %a)
+  %res = call <vscale x 12 x i64> @llvm.vector.reverse.nxv12i64(<vscale x 12 x i64> %a)
   ret <vscale x 12 x i64> %res
 }
 
-declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
-declare <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1>)
-declare <vscale x 64 x i1> @llvm.experimental.vector.reverse.nxv64i1(<vscale x 64 x i1>)
-declare <vscale x 1 x i8> @llvm.experimental.vector.reverse.nxv1i8(<vscale x 1 x i8>)
-declare <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8>)
-declare <vscale x 4 x i8> @llvm.experimental.vector.reverse.nxv4i8(<vscale x 4 x i8>)
-declare <vscale x 8 x i8> @llvm.experimental.vector.reverse.nxv8i8(<vscale x 8 x i8>)
-declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8>)
-declare <vscale x 64 x i8> @llvm.experimental.vector.reverse.nxv64i8(<vscale x 64 x i8>)
-declare <vscale x 1 x i16> @llvm.experimental.vector.reverse.nxv1i16(<vscale x 1 x i16>)
-declare <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 32 x i16> @llvm.experimental.vector.reverse.nxv32i16(<vscale x 32 x i16>)
-declare <vscale x 1 x i32> @llvm.experimental.vector.reverse.nxv1i32(<vscale x 1 x i32>)
-declare <vscale x 2 x i32> @llvm.experimental.vector.reverse.nxv2i32(<vscale x 2 x i32>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 16 x i32> @llvm.experimental.vector.reverse.nxv16i32(<vscale x 16 x i32>)
-declare <vscale x 1 x i64> @llvm.experimental.vector.reverse.nxv1i64(<vscale x 1 x i64>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64>)
-declare <vscale x 1 x half> @llvm.experimental.vector.reverse.nxv1f16(<vscale x 1 x half>)
-declare <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half>)
-declare <vscale x 32 x half> @llvm.experimental.vector.reverse.nxv32f16(<vscale x 32 x half>)
-declare <vscale x 1 x float> @llvm.experimental.vector.reverse.nxv1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float>)
-declare <vscale x 1 x double> @llvm.experimental.vector.reverse.nxv1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double>)
-declare <vscale x 3 x i64> @llvm.experimental.vector.reverse.nxv3i64(<vscale x 3 x i64>)
-declare <vscale x 6 x i64> @llvm.experimental.vector.reverse.nxv6i64(<vscale x 6 x i64>)
-declare <vscale x 12 x i64> @llvm.experimental.vector.reverse.nxv12i64(<vscale x 12 x i64>)
+declare <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1>)
+declare <vscale x 32 x i1> @llvm.vector.reverse.nxv32i1(<vscale x 32 x i1>)
+declare <vscale x 64 x i1> @llvm.vector.reverse.nxv64i1(<vscale x 64 x i1>)
+declare <vscale x 1 x i8> @llvm.vector.reverse.nxv1i8(<vscale x 1 x i8>)
+declare <vscale x 2 x i8> @llvm.vector.reverse.nxv2i8(<vscale x 2 x i8>)
+declare <vscale x 4 x i8> @llvm.vector.reverse.nxv4i8(<vscale x 4 x i8>)
+declare <vscale x 8 x i8> @llvm.vector.reverse.nxv8i8(<vscale x 8 x i8>)
+declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8>)
+declare <vscale x 64 x i8> @llvm.vector.reverse.nxv64i8(<vscale x 64 x i8>)
+declare <vscale x 1 x i16> @llvm.vector.reverse.nxv1i16(<vscale x 1 x i16>)
+declare <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 32 x i16> @llvm.vector.reverse.nxv32i16(<vscale x 32 x i16>)
+declare <vscale x 1 x i32> @llvm.vector.reverse.nxv1i32(<vscale x 1 x i32>)
+declare <vscale x 2 x i32> @llvm.vector.reverse.nxv2i32(<vscale x 2 x i32>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32>)
+declare <vscale x 1 x i64> @llvm.vector.reverse.nxv1i64(<vscale x 1 x i64>)
+declare <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 1 x half> @llvm.vector.reverse.nxv1f16(<vscale x 1 x half>)
+declare <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half>)
+declare <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 32 x half> @llvm.vector.reverse.nxv32f16(<vscale x 32 x half>)
+declare <vscale x 1 x float> @llvm.vector.reverse.nxv1f32(<vscale x 1 x float>)
+declare <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float>)
+declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 16 x float> @llvm.vector.reverse.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 1 x double> @llvm.vector.reverse.nxv1f64(<vscale x 1 x double>)
+declare <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 8 x double> @llvm.vector.reverse.nxv8f64(<vscale x 8 x double>)
+declare <vscale x 3 x i64> @llvm.vector.reverse.nxv3i64(<vscale x 3 x i64>)
+declare <vscale x 6 x i64> @llvm.vector.reverse.nxv6i64(<vscale x 6 x i64>)
+declare <vscale x 12 x i64> @llvm.vector.reverse.nxv12i64(<vscale x 12 x i64>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr90559.ll b/llvm/test/CodeGen/RISCV/rvv/pr90559.ll
new file mode 100644
index 000000000000..8d330b12055a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/pr90559.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+; This showcases a miscompile that was fixed in #90573:
+; - The memset will be type-legalized to a 512 bit store + 2 x 128 bit stores.
+; - the load and store of q aliases the upper 128 bits store of p.
+; - The aliasing 128 bit store will be between the chain of the scalar
+;   load/store:
+;
+;   t54: ch = store<(store (s512) into %ir.p, align 1)> t0, ...
+;   t51: ch = store<(store (s128) into %ir.p + 64, align 1)> t0, ...
+;
+;   t44: i64,ch = load<(load (s32) from %ir.q), sext from i32> t0, ...
+;   t50: ch = store<(store (s128) into %ir.p + 80, align 1)> t44:1, ...
+;   t46: ch = store<(store (s32) into %ir.q), trunc to i32> t50, ...
+;
+; Previously, the scalar load/store was incorrectly combined away:
+;
+;   t54: ch = store<(store (s512) into %ir.p, align 1)> t0, ...
+;   t51: ch = store<(store (s128) into %ir.p + 64, align 1)> t0, ...
+;
+;   // MISSING
+;   t50: ch = store<(store (s128) into %ir.p + 80, align 1)> t44:1, ...
+;   // MISSING
+; See also pr83017.ll: This is the same code, but relies on vscale_range instead
+; of -riscv-v-vector-bits-max=128.
+define void @f(ptr %p) vscale_range(2,2) {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 84(a0)
+; CHECK-NEXT:    addi a2, a0, 80
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vs1r.v v8, (a2)
+; CHECK-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vs4r.v v12, (a0)
+; CHECK-NEXT:    addi a2, a0, 64
+; CHECK-NEXT:    vs1r.v v8, (a2)
+; CHECK-NEXT:    sw a1, 84(a0)
+; CHECK-NEXT:    ret
+  %q = getelementptr inbounds i8, ptr %p, i64 84
+  %x = load i32, ptr %q
+  call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 96, i1 false)
+  store i32 %x, ptr %q
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
index 970581b4d80a..64b3a6f2b4b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
@@ -194,15 +194,12 @@ define void @vpmerge_vpload_store(<vscale x 2 x i32> %passthru, ptr %p, <vscale
   ret void
 }
 
-; FIXME: Merge vmerge.vvm and vleffN.v
 declare { <vscale x 2 x i32>, i64 } @llvm.riscv.vleff.nxv2i32(<vscale x 2 x i32>, ptr, i64)
 define <vscale x 2 x i32> @vpmerge_vleff(<vscale x 2 x i32> %passthru, ptr %p, <vscale x 2 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vpmerge_vleff:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vle32ff.v v9, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, mu
+; CHECK-NEXT:    vle32ff.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
   %1 = zext i32 %vl to i64
   %a = call { <vscale x 2 x i32>, i64 } @llvm.riscv.vleff.nxv2i32(<vscale x 2 x i32> undef, ptr %p, i64 %1)
@@ -634,14 +631,11 @@ define void @vpselect_vpload_store(<vscale x 2 x i32> %passthru, ptr %p, <vscale
   ret void
 }
 
-; FIXME: select vselect.vvm and vleffN.v
 define <vscale x 2 x i32> @vpselect_vleff(<vscale x 2 x i32> %passthru, ptr %p, <vscale x 2 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vpselect_vleff:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vle32ff.v v9, (a0)
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT:    vle32ff.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
   %1 = zext i32 %vl to i64
   %a = call { <vscale x 2 x i32>, i64 } @llvm.riscv.vleff.nxv2i32(<vscale x 2 x i32> undef, ptr %p, i64 %1)
@@ -898,22 +892,20 @@ define <vscale x 2 x i32> @vpselect_trunc(<vscale x 2 x i32> %passthru, <vscale
 define void @test_dag_loop() {
 ; CHECK-LABEL: test_dag_loop:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 1, e16, m8, ta, ma
-; CHECK-NEXT:    vle16.v v8, (zero)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmclr.m v0
-; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vsetivli zero, 0, e8, m4, tu, mu
-; CHECK-NEXT:    vmv4r.v v20, v16
-; CHECK-NEXT:    vssubu.vx v20, v16, zero, v0.t
+; CHECK-NEXT:    vmv4r.v v12, v8
+; CHECK-NEXT:    vssubu.vx v12, v8, zero, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmseq.vv v0, v20, v16
+; CHECK-NEXT:    vmseq.vv v0, v12, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vsetivli zero, 1, e16, m8, tu, ma
-; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vsetivli zero, 1, e16, m8, tu, mu
+; CHECK-NEXT:    vle16.v v8, (zero), v0.t
 ; CHECK-NEXT:    vsetivli zero, 0, e16, m8, ta, ma
-; CHECK-NEXT:    vse16.v v16, (zero)
+; CHECK-NEXT:    vse16.v v8, (zero)
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <vscale x 32 x i16> @llvm.riscv.vle.nxv32i16.i64(<vscale x 32 x i16> undef, ptr null, i64 1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index f3c70ed78c74..d02fe5b205f7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -31,7 +31,7 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) {
 ; CHECK-NEXT:    vmsne.vi v8, v13, 0
 ; CHECK-NEXT:    vmv.v.v v0, v9
 ; CHECK-NEXT:    ret
-%retval = call {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1> %vec)
+%retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
 ret {<16 x i1>, <16 x i1>} %retval
 }
 
@@ -44,7 +44,7 @@ define {<16 x i8>, <16 x i8>} @vector_deinterleave_v16i8_v32i8(<32 x i8> %vec) {
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    vmv.v.v v9, v11
 ; CHECK-NEXT:    ret
-%retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec)
+%retval = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec)
 ret {<16 x i8>, <16 x i8>} %retval
 }
 
@@ -57,7 +57,7 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_v8i16_v16i16(<16 x i16> %vec)
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    vmv.v.v v9, v11
 ; CHECK-NEXT:    ret
-%retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec)
+%retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec)
 ret {<8 x i16>, <8 x i16>} %retval
 }
 
@@ -71,7 +71,7 @@ define {<4 x i32>, <4 x i32>} @vector_deinterleave_v4i32_vv8i32(<8 x i32> %vec)
 ; CHECK-NEXT:    vmv.v.v v8, v11
 ; CHECK-NEXT:    vmv.v.v v9, v10
 ; CHECK-NEXT:    ret
-%retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec)
+%retval = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32> %vec)
 ret {<4 x i32>, <4 x i32>} %retval
 }
 
@@ -87,15 +87,15 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
 ; CHECK-NEXT:    vmerge.vvm v9, v9, v10, v0
 ; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
-%retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
+%retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec)
 ret {<2 x i64>, <2 x i64>} %retval
 }
 
-declare {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1>)
-declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
-declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
-declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
-declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
+declare {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1>)
+declare {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8>)
+declare {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16>)
+declare {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+declare {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64>)
 
 ; Floats
 
@@ -107,7 +107,7 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
 ; CHECK-NEXT:    vnsrl.wi v9, v8, 16
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
-%retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
+%retval = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec)
 ret {<2 x half>, <2 x half>} %retval
 }
 
@@ -119,7 +119,7 @@ define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec
 ; CHECK-NEXT:    vnsrl.wi v9, v8, 16
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
-%retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec)
+%retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec)
 ret {<4 x half>, <4 x half>} %retval
 }
 
@@ -131,7 +131,7 @@ define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %
 ; CHECK-NEXT:    vnsrl.wx v9, v8, a0
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
 ; CHECK-NEXT:    ret
-%retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec)
+%retval = call {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float> %vec)
 ret {<2 x float>, <2 x float>} %retval
 }
 
@@ -144,7 +144,7 @@ define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %v
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    vmv.v.v v9, v11
 ; CHECK-NEXT:    ret
-%retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec)
+%retval = call {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half> %vec)
 ret {<8 x half>, <8 x half>} %retval
 }
 
@@ -158,7 +158,7 @@ define {<4 x float>, <4 x float>} @vector_deinterleave_v4f32_v8f32(<8 x float> %
 ; CHECK-NEXT:    vmv.v.v v8, v11
 ; CHECK-NEXT:    vmv.v.v v9, v10
 ; CHECK-NEXT:    ret
-%retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec)
+%retval = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %vec)
 ret  {<4 x float>, <4 x float>} %retval
 }
 
@@ -174,13 +174,13 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double
 ; CHECK-NEXT:    vmerge.vvm v9, v9, v10, v0
 ; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
-%retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
+%retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec)
 ret {<2 x double>, <2 x double>} %retval
 }
 
-declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
-declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
-declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>)
-declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
-declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
-declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
+declare {<2 x half>,<2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half>)
+declare {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half>)
+declare {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float>)
+declare {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half>)
+declare {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float>)
+declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 6a712080fda7..8f4ff37fffb0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -24,7 +24,7 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_load_nxv16i
 ; CHECK-NEXT:    vmsne.vi v9, v10, 0
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 32 x i1>, ptr %p
-  %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
+  %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
   ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
 }
 
@@ -35,7 +35,7 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_load_nxv16i
 ; CHECK-NEXT:    vlseg2e8.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 32 x i8>, ptr %p
-  %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
+  %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
   ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
 }
 
@@ -49,7 +49,7 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_load_nxv8i1
 ; CHECK-NEXT:    vnsrl.wi v10, v12, 16
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 16 x i16>, ptr %p, align 1
-  %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+  %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
   ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
 }
 
@@ -60,7 +60,7 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_load_nxv8i1
 ; CHECK-NEXT:    vlseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 16 x i16>, ptr %p
-  %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+  %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
   ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
 }
 
@@ -71,7 +71,7 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_load_nxv4i3
 ; CHECK-NEXT:    vlseg2e32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 8 x i32>, ptr %p
-  %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
+  %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
   ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
 }
 
@@ -82,7 +82,7 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_load_nxv2i6
 ; CHECK-NEXT:    vlseg2e64.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 4 x i64>, ptr %p
-  %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
+  %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
 
@@ -93,7 +93,7 @@ define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_load_nxv4i6
 ; CHECK-NEXT:    vlseg2e64.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 8 x i64>, ptr %p
-  %retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
+  %retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
   ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
 }
 
@@ -171,17 +171,17 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 16 x i64>, ptr %p
-  %retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
+  %retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
   ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
 }
 
-declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
-declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
+declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
 
 ; Floats
 
@@ -192,7 +192,7 @@ define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_load_nxv2
 ; CHECK-NEXT:    vlseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 4 x half>, ptr %p
-  %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
+  %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
   ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
 }
 
@@ -203,7 +203,7 @@ define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_load_nxv4
 ; CHECK-NEXT:    vlseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 8 x half>, ptr %p
-  %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
+  %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
   ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
 }
 
@@ -214,7 +214,7 @@ define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_load_nx
 ; CHECK-NEXT:    vlseg2e32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 4 x float>, ptr %p
-  %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
+  %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
   ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
 }
 
@@ -225,7 +225,7 @@ define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_load_nxv8
 ; CHECK-NEXT:    vlseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 16 x half>, ptr %p
-  %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
+  %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
   ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
 }
 
@@ -236,7 +236,7 @@ define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_load_nx
 ; CHECK-NEXT:    vlseg2e32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 8 x float>, ptr %p
-  %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
+  %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
   ret  {<vscale x 4 x float>, <vscale x 4 x float>} %retval
 }
 
@@ -247,13 +247,13 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_load_
 ; CHECK-NEXT:    vlseg2e64.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 4 x double>, ptr %p
-  %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
+  %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
   ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
 }
 
-declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index d98597fabcd9..7797577362c9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -21,7 +21,7 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
 ; CHECK-NEXT:    vnsrl.wi v10, v12, 8
 ; CHECK-NEXT:    vmsne.vi v9, v10, 0
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
+%retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
 ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
 }
 
@@ -34,7 +34,7 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    vmv.v.v v10, v14
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
+%retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
 ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
 }
 
@@ -47,7 +47,7 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    vmv.v.v v10, v14
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+%retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
 ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
 }
 
@@ -61,7 +61,7 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv
 ; CHECK-NEXT:    vmv.v.v v8, v14
 ; CHECK-NEXT:    vmv.v.v v10, v12
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
+%retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
 ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
 }
 
@@ -77,15 +77,15 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
 ; CHECK-NEXT:    vmv2r.v v8, v12
 ; CHECK-NEXT:    vmv2r.v v10, v20
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
+%retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
 ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
 
-declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
-declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
 
 define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv128i1(<vscale x 128 x i1> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv64i1_nxv128i1:
@@ -110,7 +110,7 @@ define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv
 ; CHECK-NEXT:    vmsne.vi v9, v24, 0
 ; CHECK-NEXT:    vmv1r.v v8, v7
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.experimental.vector.deinterleave2.nxv128i1(<vscale x 128 x i1> %vec)
+%retval = call {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.vector.deinterleave2.nxv128i1(<vscale x 128 x i1> %vec)
 ret {<vscale x 64 x i1>, <vscale x 64 x i1>} %retval
 }
 
@@ -125,7 +125,7 @@ define {<vscale x 64 x i8>, <vscale x 64 x i8>} @vector_deinterleave_nxv64i8_nxv
 ; CHECK-NEXT:    vnsrl.wi v4, v16, 8
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.experimental.vector.deinterleave2.nxv128i8(<vscale x 128 x i8> %vec)
+%retval = call {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.vector.deinterleave2.nxv128i8(<vscale x 128 x i8> %vec)
 ret {<vscale x 64 x i8>, <vscale x 64 x i8>} %retval
 }
 
@@ -140,7 +140,7 @@ define {<vscale x 32 x i16>, <vscale x 32 x i16>} @vector_deinterleave_nxv32i16_
 ; CHECK-NEXT:    vnsrl.wi v4, v16, 16
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.experimental.vector.deinterleave2.nxv64i16(<vscale x 64 x i16> %vec)
+%retval = call {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.vector.deinterleave2.nxv64i16(<vscale x 64 x i16> %vec)
 ret {<vscale x 32 x i16>, <vscale x 32 x i16>} %retval
 }
 
@@ -156,7 +156,7 @@ define {<vscale x 16 x i32>, <vscale x 16 x i32>} @vector_deinterleave_nxv16i32_
 ; CHECK-NEXT:    vnsrl.wi v4, v24, 0
 ; CHECK-NEXT:    vmv8r.v v8, v0
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %vec)
+%retval = call {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %vec)
 ret {<vscale x 16 x i32>, <vscale x 16 x i32>} %retval
 }
 
@@ -229,15 +229,15 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
+%retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
 ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
 }
 
-declare {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.experimental.vector.deinterleave2.nxv128i1(<vscale x 128 x i1>)
-declare {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.experimental.vector.deinterleave2.nxv128i8(<vscale x 128 x i8>)
-declare {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.experimental.vector.deinterleave2.nxv64i16(<vscale x 64 x i16>)
-declare {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
-declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
+declare {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.vector.deinterleave2.nxv128i1(<vscale x 128 x i1>)
+declare {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.vector.deinterleave2.nxv128i8(<vscale x 128 x i8>)
+declare {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.vector.deinterleave2.nxv64i16(<vscale x 64 x i16>)
+declare {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
+declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
 
 ; Floats
 
@@ -249,7 +249,7 @@ define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_n
 ; CHECK-NEXT:    vnsrl.wi v9, v8, 16
 ; CHECK-NEXT:    vmv1r.v v8, v10
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
+%retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
 ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
 }
 
@@ -262,7 +262,7 @@ define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_n
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    vmv.v.v v9, v11
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
+%retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
 ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
 }
 
@@ -276,7 +276,7 @@ define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32
 ; CHECK-NEXT:    vmv.v.v v8, v11
 ; CHECK-NEXT:    vmv.v.v v9, v10
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
+%retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
 ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
 }
 
@@ -289,7 +289,7 @@ define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_n
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    vmv.v.v v10, v14
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
+%retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
 ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
 }
 
@@ -303,7 +303,7 @@ define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32
 ; CHECK-NEXT:    vmv.v.v v8, v14
 ; CHECK-NEXT:    vmv.v.v v10, v12
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
+%retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
 ret  {<vscale x 4 x float>, <vscale x 4 x float>} %retval
 }
 
@@ -319,16 +319,16 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f
 ; CHECK-NEXT:    vmv2r.v v8, v12
 ; CHECK-NEXT:    vmv2r.v v10, v20
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
+%retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
 ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
 }
 
-declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
 
 define {<vscale x 32 x half>, <vscale x 32 x half>} @vector_deinterleave_nxv32f16_nxv64f16(<vscale x 64 x half> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv32f16_nxv64f16:
@@ -341,7 +341,7 @@ define {<vscale x 32 x half>, <vscale x 32 x half>} @vector_deinterleave_nxv32f1
 ; CHECK-NEXT:    vnsrl.wi v4, v16, 16
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 32 x half>, <vscale x 32 x half>} @llvm.experimental.vector.deinterleave2.nxv64f16(<vscale x 64 x half> %vec)
+%retval = call {<vscale x 32 x half>, <vscale x 32 x half>} @llvm.vector.deinterleave2.nxv64f16(<vscale x 64 x half> %vec)
 ret {<vscale x 32 x half>, <vscale x 32 x half>} %retval
 }
 
@@ -357,7 +357,7 @@ define {<vscale x 16 x float>, <vscale x 16 x float>} @vector_deinterleave_nxv16
 ; CHECK-NEXT:    vnsrl.wi v4, v24, 0
 ; CHECK-NEXT:    vmv8r.v v8, v0
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 16 x float>, <vscale x 16 x float>} @llvm.experimental.vector.deinterleave2.nxv32f32(<vscale x 32 x float> %vec)
+%retval = call {<vscale x 16 x float>, <vscale x 16 x float>} @llvm.vector.deinterleave2.nxv32f32(<vscale x 32 x float> %vec)
 ret  {<vscale x 16 x float>, <vscale x 16 x float>} %retval
 }
 
@@ -430,10 +430,10 @@ define {<vscale x 8 x double>, <vscale x 8 x double>} @vector_deinterleave_nxv8f
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-%retval = call {<vscale x 8 x double>, <vscale x 8 x double>} @llvm.experimental.vector.deinterleave2.nxv16f64(<vscale x 16 x double> %vec)
+%retval = call {<vscale x 8 x double>, <vscale x 8 x double>} @llvm.vector.deinterleave2.nxv16f64(<vscale x 16 x double> %vec)
 ret {<vscale x 8 x double>, <vscale x 8 x double>} %retval
 }
 
-declare {<vscale x 32 x half>, <vscale x 32 x half>} @llvm.experimental.vector.deinterleave2.nxv64f16(<vscale x 64 x half>)
-declare {<vscale x 16 x float>, <vscale x 16 x float>} @llvm.experimental.vector.deinterleave2.nxv32f32(<vscale x 32 x float>)
-declare {<vscale x 8 x double>, <vscale x 8 x double>} @llvm.experimental.vector.deinterleave2.nxv16f64(<vscale x 16 x double>)
+declare {<vscale x 32 x half>, <vscale x 32 x half>} @llvm.vector.deinterleave2.nxv64f16(<vscale x 64 x half>)
+declare {<vscale x 16 x float>, <vscale x 16 x float>} @llvm.vector.deinterleave2.nxv32f32(<vscale x 32 x float>)
+declare {<vscale x 8 x double>, <vscale x 8 x double>} @llvm.vector.deinterleave2.nxv16f64(<vscale x 16 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
index 6ebe8e095469..99872c199a1e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
@@ -41,7 +41,7 @@ define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) {
 ; ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; ZVBB-NEXT:    vmsne.vi v0, v12, 0
 ; ZVBB-NEXT:    ret
-	   %res = call <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
+	   %res = call <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
 	   ret <32 x i1> %res
 }
 
@@ -62,7 +62,7 @@ define <16 x i16> @vector_interleave_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
 ; ZVBB-NEXT:    vmv2r.v v8, v10
 ; ZVBB-NEXT:    ret
-	   %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
+	   %res = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
 	   ret <16 x i16> %res
 }
 
@@ -84,7 +84,7 @@ define <8 x i32> @vector_interleave_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
 ; ZVBB-NEXT:    vmv2r.v v8, v10
 ; ZVBB-NEXT:    ret
-	   %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+	   %res = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
 	   ret <8 x i32> %res
 }
 
@@ -118,14 +118,14 @@ define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; ZVBB-NEXT:    vrgatherei16.vv v10, v8, v12
 ; ZVBB-NEXT:    vmv.v.v v8, v10
 ; ZVBB-NEXT:    ret
-	   %res = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b)
+	   %res = call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b)
 	   ret <4 x i64> %res
 }
 
-declare <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
-declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
-declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
-declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+declare <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
+declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
 
 ; Floats
 
@@ -146,7 +146,7 @@ define <4 x half> @vector_interleave_v4f16_v2f16(<2 x half> %a, <2 x half> %b) {
 ; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
 ; ZVBB-NEXT:    vmv1r.v v8, v10
 ; ZVBB-NEXT:    ret
-	   %res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
+	   %res = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
 	   ret <4 x half> %res
 }
 
@@ -167,7 +167,7 @@ define <8 x half> @vector_interleave_v8f16_v4f16(<4 x half> %a, <4 x half> %b) {
 ; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
 ; ZVBB-NEXT:    vmv1r.v v8, v10
 ; ZVBB-NEXT:    ret
-	   %res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
+	   %res = call <8 x half> @llvm.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
 	   ret <8 x half> %res
 }
 
@@ -189,7 +189,7 @@ define <4 x float> @vector_interleave_v4f32_v2f32(<2 x float> %a, <2 x float> %b
 ; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
 ; ZVBB-NEXT:    vmv1r.v v8, v10
 ; ZVBB-NEXT:    ret
-	   %res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
+	   %res = call <4 x float> @llvm.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
 	   ret <4 x float> %res
 }
 
@@ -210,7 +210,7 @@ define <16 x half> @vector_interleave_v16f16_v8f16(<8 x half> %a, <8 x half> %b)
 ; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
 ; ZVBB-NEXT:    vmv2r.v v8, v10
 ; ZVBB-NEXT:    ret
-	   %res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
+	   %res = call <16 x half> @llvm.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
 	   ret <16 x half> %res
 }
 
@@ -232,7 +232,7 @@ define <8 x float> @vector_interleave_v8f32_v4f32(<4 x float> %a, <4 x float> %b
 ; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
 ; ZVBB-NEXT:    vmv2r.v v8, v10
 ; ZVBB-NEXT:    ret
-	   %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
+	   %res = call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
 	   ret <8 x float> %res
 }
 
@@ -266,17 +266,17 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double>
 ; ZVBB-NEXT:    vrgatherei16.vv v10, v8, v12
 ; ZVBB-NEXT:    vmv.v.v v8, v10
 ; ZVBB-NEXT:    ret
-	   %res = call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
+	   %res = call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
 	   ret <4 x double> %res
 }
 
 
-declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
-declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
-declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>)
-declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
-declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
-declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+declare <4 x float> @llvm.vector.interleave2.v4f32(<2 x float>, <2 x float>)
+declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>)
+declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>)
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; RV32: {{.*}}
 ; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index 922692ed88c9..7ade47e60bc6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -27,7 +27,7 @@ define void @vector_interleave_store_nxv32i1_nxv16i1(<vscale x 16 x i1> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vsm.v v9, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
+  %res = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
   store <vscale x 32 x i1> %res, ptr %p
   ret void
 }
@@ -42,7 +42,7 @@ define void @vector_interleave_store_nxv16i16_nxv8i16_align1(<vscale x 8 x i16>
 ; CHECK-NEXT:    vwmaccu.vx v12, a1, v10
 ; CHECK-NEXT:    vs4r.v v12, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+  %res = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
   store <vscale x 16 x i16> %res, ptr %p, align 1
   ret void
 }
@@ -53,7 +53,7 @@ define void @vector_interleave_store_nxv16i16_nxv8i16(<vscale x 8 x i16> %a, <vs
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vsseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+  %res = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
   store <vscale x 16 x i16> %res, ptr %p
   ret void
 }
@@ -64,7 +64,7 @@ define void @vector_interleave_store_nxv8i32_nxv4i32(<vscale x 4 x i32> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsseg2e32.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
   store <vscale x 8 x i32> %res, ptr %p
   ret void
 }
@@ -75,7 +75,7 @@ define void @vector_interleave_store_nxv4i64_nxv2i64(<vscale x 2 x i64> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vsseg2e64.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
+  %res = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
   store <vscale x 4 x i64> %res, ptr %p
   ret void
 }
@@ -86,7 +86,7 @@ define void @vector_interleave_store_nxv8i64_nxv4i64(<vscale x 4 x i64> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vsseg2e64.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b)
+  %res = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b)
   store <vscale x 8 x i64> %res, ptr %p
   ret void
 }
@@ -138,17 +138,17 @@ define void @vector_interleave_store_nxv16i64_nxv8i64(<vscale x 8 x i64> %a, <vs
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
+  %res = call <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
   store <vscale x 16 x i64> %res, ptr %p
   ret void
 }
 
-declare <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
-declare <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
+declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
+declare <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
 
 ; Floats
 
@@ -158,7 +158,7 @@ define void @vector_interleave_store_nxv4f16_nxv2f16(<vscale x 2 x half> %a, <vs
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+  %res = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
   store <vscale x 4 x half> %res, ptr %p
   ret void
 }
@@ -169,7 +169,7 @@ define void @vector_interleave_store_nxv8f16_nxv4f16(<vscale x 4 x half> %a, <vs
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vsseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+  %res = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
   store <vscale x 8 x half> %res, ptr %p
   ret void
 }
@@ -180,7 +180,7 @@ define void @vector_interleave_store_nxv4f32_nxv2f32(<vscale x 2 x float> %a, <v
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsseg2e32.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+  %res = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
   store <vscale x 4 x float> %res, ptr %p
   ret void
 }
@@ -191,7 +191,7 @@ define void @vector_interleave_store_nxv16f16_nxv8f16(<vscale x 8 x half> %a, <v
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vsseg2e16.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  %res = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
   store <vscale x 16 x half> %res, ptr %p
   ret void
 }
@@ -202,7 +202,7 @@ define void @vector_interleave_store_nxv8f32_nxv4f32(<vscale x 4 x float> %a, <v
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsseg2e32.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  %res = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
   store <vscale x 8 x float> %res, ptr %p
   ret void
 }
@@ -213,15 +213,15 @@ define void @vector_interleave_store_nxv4f64_nxv2f64(<vscale x 2 x double> %a, <
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vsseg2e64.v v8, (a0)
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  %res = call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
   store <vscale x 4 x double> %res, ptr %p
   ret void
 }
 
 
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index 327e18e91381..a7e0ad6ee5f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -47,7 +47,7 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
 ; ZVBB-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v0, v8, a0
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
+  %res = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
   ret <vscale x 32 x i1> %res
 }
 
@@ -68,7 +68,7 @@ define <vscale x 32 x i8> @vector_interleave_nxv32i8_nxv16i8(<vscale x 16 x i8>
 ; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
 ; ZVBB-NEXT:    vmv4r.v v8, v12
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  %res = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
   ret <vscale x 32 x i8> %res
 }
 
@@ -89,7 +89,7 @@ define <vscale x 16 x i16> @vector_interleave_nxv16i16_nxv8i16(<vscale x 8 x i16
 ; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
 ; ZVBB-NEXT:    vmv4r.v v8, v12
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+  %res = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
   ret <vscale x 16 x i16> %res
 }
 
@@ -111,7 +111,7 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32(<vscale x 4 x i32>
 ; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
 ; ZVBB-NEXT:    vmv4r.v v8, v12
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
   ret <vscale x 8 x i32> %res
 }
 
@@ -145,15 +145,15 @@ define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64>
 ; ZVBB-NEXT:    vrgatherei16.vv v12, v8, v16
 ; ZVBB-NEXT:    vmv.v.v v8, v12
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
+  %res = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
   ret <vscale x 4 x i64> %res
 }
 
-declare <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
 
 define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b) {
 ; CHECK-LABEL: vector_interleave_nxv128i1_nxv64i1:
@@ -196,7 +196,7 @@ define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; ZVBB-NEXT:    vmsne.vi v8, v24, 0
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 128 x i1> @llvm.experimental.vector.interleave2.nxv128i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b)
+  %res = call <vscale x 128 x i1> @llvm.vector.interleave2.nxv128i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b)
   ret <vscale x 128 x i1> %res
 }
 
@@ -223,7 +223,7 @@ define <vscale x 128 x i8> @vector_interleave_nxv128i8_nxv64i8(<vscale x 64 x i8
 ; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
 ; ZVBB-NEXT:    vmv8r.v v16, v0
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 128 x i8> @llvm.experimental.vector.interleave2.nxv128i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b)
+  %res = call <vscale x 128 x i8> @llvm.vector.interleave2.nxv128i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b)
   ret <vscale x 128 x i8> %res
 }
 
@@ -250,7 +250,7 @@ define <vscale x 64 x i16> @vector_interleave_nxv64i16_nxv32i16(<vscale x 32 x i
 ; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
 ; ZVBB-NEXT:    vmv8r.v v16, v0
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 64 x i16> @llvm.experimental.vector.interleave2.nxv64i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b)
+  %res = call <vscale x 64 x i16> @llvm.vector.interleave2.nxv64i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b)
   ret <vscale x 64 x i16> %res
 }
 
@@ -278,7 +278,7 @@ define <vscale x 32 x i32> @vector_interleave_nxv32i32_nxv16i32(<vscale x 16 x i
 ; ZVBB-NEXT:    vmv8r.v v8, v24
 ; ZVBB-NEXT:    vmv8r.v v16, v0
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 32 x i32> @llvm.experimental.vector.interleave2.nxv32i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b)
+  %res = call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b)
   ret <vscale x 32 x i32> %res
 }
 
@@ -376,15 +376,15 @@ define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv8i64(<vscale x 8 x i64
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
+  %res = call <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
   ret <vscale x 16 x i64> %res
 }
 
-declare <vscale x 128 x i1> @llvm.experimental.vector.interleave2.nxv128i1(<vscale x 64 x i1>, <vscale x 64 x i1>)
-declare <vscale x 128 x i8> @llvm.experimental.vector.interleave2.nxv128i8(<vscale x 64 x i8>, <vscale x 64 x i8>)
-declare <vscale x 64 x i16> @llvm.experimental.vector.interleave2.nxv64i16(<vscale x 32 x i16>, <vscale x 32 x i16>)
-declare <vscale x 32 x i32> @llvm.experimental.vector.interleave2.nxv32i32(<vscale x 16 x i32>, <vscale x 16 x i32>)
-declare <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
+declare <vscale x 128 x i1> @llvm.vector.interleave2.nxv128i1(<vscale x 64 x i1>, <vscale x 64 x i1>)
+declare <vscale x 128 x i8> @llvm.vector.interleave2.nxv128i8(<vscale x 64 x i8>, <vscale x 64 x i8>)
+declare <vscale x 64 x i16> @llvm.vector.interleave2.nxv64i16(<vscale x 32 x i16>, <vscale x 32 x i16>)
+declare <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32>, <vscale x 16 x i32>)
+declare <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
 
 ; Floats
 
@@ -419,7 +419,7 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
 ; ZVBB-NEXT:    vslideup.vx v10, v8, a0
 ; ZVBB-NEXT:    vmv.v.v v8, v10
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+  %res = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
   ret <vscale x 4 x half> %res
 }
 
@@ -440,7 +440,7 @@ define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv4f16(<vscale x 4 x half
 ; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
 ; ZVBB-NEXT:    vmv2r.v v8, v10
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+  %res = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
   ret <vscale x 8 x half> %res
 }
 
@@ -462,7 +462,7 @@ define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv2f32(<vscale x 2 x flo
 ; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
 ; ZVBB-NEXT:    vmv2r.v v8, v10
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+  %res = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
   ret <vscale x 4 x float> %res
 }
 
@@ -483,7 +483,7 @@ define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv8f16(<vscale x 8 x ha
 ; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
 ; ZVBB-NEXT:    vmv4r.v v8, v12
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  %res = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
   ret <vscale x 16 x half> %res
 }
 
@@ -505,7 +505,7 @@ define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv4f32(<vscale x 4 x flo
 ; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
 ; ZVBB-NEXT:    vmv4r.v v8, v12
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  %res = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
   ret <vscale x 8 x float> %res
 }
 
@@ -539,17 +539,17 @@ define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x do
 ; ZVBB-NEXT:    vrgatherei16.vv v12, v8, v16
 ; ZVBB-NEXT:    vmv.v.v v8, v12
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  %res = call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
   ret <vscale x 4 x double> %res
 }
 
 
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
 
 define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b) {
 ; CHECK-LABEL: vector_interleave_nxv64f16_nxv32f16:
@@ -574,7 +574,7 @@ define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x
 ; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
 ; ZVBB-NEXT:    vmv8r.v v16, v0
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 64 x half> @llvm.experimental.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
+  %res = call <vscale x 64 x half> @llvm.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
   ret <vscale x 64 x half> %res
 }
 
@@ -602,7 +602,7 @@ define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x
 ; ZVBB-NEXT:    vmv8r.v v8, v24
 ; ZVBB-NEXT:    vmv8r.v v16, v0
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 32 x float> @llvm.experimental.vector.interleave2.nxv32f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b)
+  %res = call <vscale x 32 x float> @llvm.vector.interleave2.nxv32f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b)
   ret <vscale x 32 x float> %res
 }
 
@@ -700,7 +700,7 @@ define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 16 x double> @llvm.experimental.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
+  %res = call <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
   ret <vscale x 16 x double> %res
 }
 
@@ -718,7 +718,7 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32_poison(<vscale x 4
 ; ZVBB-NEXT:    vzext.vf2 v12, v8
 ; ZVBB-NEXT:    vmv.v.v v8, v12
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> poison)
+  %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> poison)
   ret <vscale x 8 x i32> %res
 }
 
@@ -738,10 +738,10 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32_poison2(<vscale x 4
 ; ZVBB-NEXT:    vwsll.vx v12, v8, a0
 ; ZVBB-NEXT:    vmv4r.v v8, v12
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a)
+  %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a)
   ret <vscale x 8 x i32> %res
 }
 
-declare <vscale x 64 x half> @llvm.experimental.vector.interleave2.nxv64f16(<vscale x 32 x half>, <vscale x 32 x half>)
-declare <vscale x 32 x float> @llvm.experimental.vector.interleave2.nxv32f32(<vscale x 16 x float>, <vscale x 16 x float>)
-declare <vscale x 16 x double> @llvm.experimental.vector.interleave2.nxv16f64(<vscale x 8 x double>, <vscale x 8 x double>)
+declare <vscale x 64 x half> @llvm.vector.interleave2.nxv64f16(<vscale x 32 x half>, <vscale x 32 x half>)
+declare <vscale x 32 x float> @llvm.vector.interleave2.nxv32f32(<vscale x 16 x float>, <vscale x 16 x float>)
+declare <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double>, <vscale x 8 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-reassociations.ll b/llvm/test/CodeGen/RISCV/rvv/vector-reassociations.ll
new file mode 100644
index 000000000000..6435c1c14e06
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-reassociations.ll
@@ -0,0 +1,253 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=riscv32 -mattr='+v' -O3 %s -o - | FileCheck %s
+
+declare <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  <vscale x 1 x i8>,
+  <vscale x 1 x i8>,
+  i32)
+
+declare <vscale x 1 x i8> @llvm.riscv.vadd.mask.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  <vscale x 1 x i8>,
+  <vscale x 1 x i8>,
+  <vscale x 1 x i1>,
+  i32, i32)
+
+declare <vscale x 1 x i8> @llvm.riscv.vsub.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  <vscale x 1 x i8>,
+  <vscale x 1 x i8>,
+  i32)
+
+declare <vscale x 1 x i8> @llvm.riscv.vmul.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  <vscale x 1 x i8>,
+  <vscale x 1 x i8>,
+  i32)
+
+define <vscale x 1 x i8> @simple_vadd_vv(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
+; CHECK-LABEL: simple_vadd_vv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vadd.vv v9, v8, v9
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    i32 %2)
+
+  %b = call <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %a,
+    i32 %2)
+
+  %c = call <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %b,
+    i32 %2)
+
+  ret <vscale x 1 x i8> %c
+}
+
+define <vscale x 1 x i8> @simple_vadd_vsub_vv(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
+; CHECK-LABEL: simple_vadd_vsub_vv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vsub.vv v9, v8, v9
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vsub.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    i32 %2)
+
+  %b = call <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %a,
+    i32 %2)
+
+  %c = call <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %b,
+    i32 %2)
+
+  ret <vscale x 1 x i8> %c
+}
+
+define <vscale x 1 x i8> @simple_vmul_vv(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
+; CHECK-LABEL: simple_vmul_vv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    vmul.vv v9, v8, v9
+; CHECK-NEXT:    vmul.vv v8, v8, v8
+; CHECK-NEXT:    vmul.vv v8, v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vmul.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    i32 %2)
+
+  %b = call <vscale x 1 x i8> @llvm.riscv.vmul.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %a,
+    i32 %2)
+
+  %c = call <vscale x 1 x i8> @llvm.riscv.vmul.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> undef,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %b,
+    i32 %2)
+
+  ret <vscale x 1 x i8> %c
+}
+
+; With passthru and masks.
+define <vscale x 1 x i8> @vadd_vv_passthru(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
+; CHECK-LABEL: vadd_vv_passthru:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vadd.vv v10, v8, v9
+; CHECK-NEXT:    vmv1r.v v9, v8
+; CHECK-NEXT:    vadd.vv v9, v8, v8
+; CHECK-NEXT:    vadd.vv v8, v9, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    i32 %2)
+
+  %b = call <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %a,
+    i32 %2)
+
+  %c = call <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %b,
+    i32 %2)
+
+  ret <vscale x 1 x i8> %c
+}
+
+define <vscale x 1 x i8> @vadd_vv_passthru_negative(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
+; CHECK-LABEL: vadd_vv_passthru_negative:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, tu, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vadd.vv v10, v8, v9
+; CHECK-NEXT:    vadd.vv v9, v8, v10
+; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    i32 %2)
+
+  %b = call <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %1,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %a,
+    i32 %2)
+
+  %c = call <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %b,
+    i32 %2)
+
+  ret <vscale x 1 x i8> %c
+}
+
+define <vscale x 1 x i8> @vadd_vv_mask(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2, <vscale x 1 x i1> %m) nounwind {
+; CHECK-LABEL: vadd_vv_mask:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vadd.vv v10, v8, v9, v0.t
+; CHECK-NEXT:    vmv1r.v v9, v8
+; CHECK-NEXT:    vadd.vv v9, v8, v8, v0.t
+; CHECK-NEXT:    vadd.vv v8, v9, v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vadd.mask.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    <vscale x 1 x i1> %m,
+    i32 %2, i32 1)
+
+  %b = call <vscale x 1 x i8> @llvm.riscv.vadd.mask.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %a,
+    <vscale x 1 x i1> %m,
+    i32 %2, i32 1)
+
+  %c = call <vscale x 1 x i8> @llvm.riscv.vadd.mask.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %b,
+    <vscale x 1 x i1> %m,
+    i32 %2, i32 1)
+
+  ret <vscale x 1 x i8> %c
+}
+
+define <vscale x 1 x i8> @vadd_vv_mask_negative(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2, <vscale x 1 x i1> %m, <vscale x 1 x i1> %m2) nounwind {
+; CHECK-LABEL: vadd_vv_mask_negative:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, mu
+; CHECK-NEXT:    vmv1r.v v11, v8
+; CHECK-NEXT:    vadd.vv v11, v8, v9, v0.t
+; CHECK-NEXT:    vmv1r.v v9, v8
+; CHECK-NEXT:    vadd.vv v9, v8, v11, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vadd.mask.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %1,
+    <vscale x 1 x i1> %m,
+    i32 %2, i32 1)
+
+  %b = call <vscale x 1 x i8> @llvm.riscv.vadd.mask.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %a,
+    <vscale x 1 x i1> %m,
+    i32 %2, i32 1)
+
+  %c = call <vscale x 1 x i8> @llvm.riscv.vadd.mask.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %0,
+    <vscale x 1 x i8> %b,
+    <vscale x 1 x i1> %m2,
+    i32 %2, i32 1)
+
+  ret <vscale x 1 x i8> %c
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
index c98242437f62..be56db52e349 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
@@ -4,7 +4,7 @@
 
 ; Tests assume VLEN=128 or vscale_range_min=2.
 
-declare <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32)
+declare <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32)
 
 define <vscale x 1 x i1> @splice_nxv1i1_offset_negone(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv1i1_offset_negone:
@@ -24,7 +24,7 @@ define <vscale x 1 x i1> @splice_nxv1i1_offset_negone(<vscale x 1 x i1> %a, <vsc
 ; CHECK-NEXT:    vand.vi v8, v9, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 -1)
+  %res = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 -1)
   ret <vscale x 1 x i1> %res
 }
 
@@ -48,11 +48,11 @@ define <vscale x 1 x i1> @splice_nxv1i1_offset_max(<vscale x 1 x i1> %a, <vscale
 ; CHECK-NEXT:    vand.vi v8, v9, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 1)
+  %res = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 1)
   ret <vscale x 1 x i1> %res
 }
 
-declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
 
 define <vscale x 2 x i1> @splice_nxv2i1_offset_negone(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i1_offset_negone:
@@ -72,7 +72,7 @@ define <vscale x 2 x i1> @splice_nxv2i1_offset_negone(<vscale x 2 x i1> %a, <vsc
 ; CHECK-NEXT:    vand.vi v8, v9, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
+  %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
   ret <vscale x 2 x i1> %res
 }
 
@@ -96,11 +96,11 @@ define <vscale x 2 x i1> @splice_nxv2i1_offset_max(<vscale x 2 x i1> %a, <vscale
 ; CHECK-NEXT:    vand.vi v8, v9, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 3)
+  %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 3)
   ret <vscale x 2 x i1> %res
 }
 
-declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
 
 define <vscale x 4 x i1> @splice_nxv4i1_offset_negone(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i1_offset_negone:
@@ -120,7 +120,7 @@ define <vscale x 4 x i1> @splice_nxv4i1_offset_negone(<vscale x 4 x i1> %a, <vsc
 ; CHECK-NEXT:    vand.vi v8, v9, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
+  %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
   ret <vscale x 4 x i1> %res
 }
 
@@ -144,11 +144,11 @@ define <vscale x 4 x i1> @splice_nxv4i1_offset_max(<vscale x 4 x i1> %a, <vscale
 ; CHECK-NEXT:    vand.vi v8, v9, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 7)
+  %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 7)
   ret <vscale x 4 x i1> %res
 }
 
-declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
 
 define <vscale x 8 x i1> @splice_nxv8i1_offset_negone(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i1_offset_negone:
@@ -167,7 +167,7 @@ define <vscale x 8 x i1> @splice_nxv8i1_offset_negone(<vscale x 8 x i1> %a, <vsc
 ; CHECK-NEXT:    vand.vi v8, v9, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
+  %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
   ret <vscale x 8 x i1> %res
 }
 
@@ -190,11 +190,11 @@ define <vscale x 8 x i1> @splice_nxv8i1_offset_max(<vscale x 8 x i1> %a, <vscale
 ; CHECK-NEXT:    vand.vi v8, v9, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 15)
+  %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 15)
   ret <vscale x 8 x i1> %res
 }
 
-declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
 
 define <vscale x 16 x i1> @splice_nxv16i1_offset_negone(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i1_offset_negone:
@@ -216,7 +216,7 @@ define <vscale x 16 x i1> @splice_nxv16i1_offset_negone(<vscale x 16 x i1> %a, <
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
+  %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
   ret <vscale x 16 x i1> %res
 }
 
@@ -240,11 +240,11 @@ define <vscale x 16 x i1> @splice_nxv16i1_offset_max(<vscale x 16 x i1> %a, <vsc
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 31)
+  %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 31)
   ret <vscale x 16 x i1> %res
 }
 
-declare <vscale x 32 x i1> @llvm.experimental.vector.splice.nxv32i1(<vscale x 32 x i1>, <vscale x 32 x i1>, i32)
+declare <vscale x 32 x i1> @llvm.vector.splice.nxv32i1(<vscale x 32 x i1>, <vscale x 32 x i1>, i32)
 
 define <vscale x 32 x i1> @splice_nxv32i1_offset_negone(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv32i1_offset_negone:
@@ -266,7 +266,7 @@ define <vscale x 32 x i1> @splice_nxv32i1_offset_negone(<vscale x 32 x i1> %a, <
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x i1> @llvm.experimental.vector.splice.nxv32i1(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b, i32 -1)
+  %res = call <vscale x 32 x i1> @llvm.vector.splice.nxv32i1(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b, i32 -1)
   ret <vscale x 32 x i1> %res
 }
 
@@ -289,11 +289,11 @@ define <vscale x 32 x i1> @splice_nxv32i1_offset_max(<vscale x 32 x i1> %a, <vsc
 ; CHECK-NEXT:    vand.vi v8, v16, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x i1> @llvm.experimental.vector.splice.nxv32i1(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b, i32 63)
+  %res = call <vscale x 32 x i1> @llvm.vector.splice.nxv32i1(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b, i32 63)
   ret <vscale x 32 x i1> %res
 }
 
-declare <vscale x 64 x i1> @llvm.experimental.vector.splice.nxv64i1(<vscale x 64 x i1>, <vscale x 64 x i1>, i32)
+declare <vscale x 64 x i1> @llvm.vector.splice.nxv64i1(<vscale x 64 x i1>, <vscale x 64 x i1>, i32)
 
 define <vscale x 64 x i1> @splice_nxv64i1_offset_negone(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv64i1_offset_negone:
@@ -315,7 +315,7 @@ define <vscale x 64 x i1> @splice_nxv64i1_offset_negone(<vscale x 64 x i1> %a, <
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 64 x i1> @llvm.experimental.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 -1)
+  %res = call <vscale x 64 x i1> @llvm.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 -1)
   ret <vscale x 64 x i1> %res
 }
 
@@ -338,17 +338,17 @@ define <vscale x 64 x i1> @splice_nxv64i1_offset_max(<vscale x 64 x i1> %a, <vsc
 ; CHECK-NEXT:    vand.vi v8, v24, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 64 x i1> @llvm.experimental.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 127)
+  %res = call <vscale x 64 x i1> @llvm.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 127)
   ret <vscale x 64 x i1> %res
 }
 
-declare <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
+declare <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
 
 define <vscale x 1 x i8> @splice_nxv1i8_offset_zero(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv1i8_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 0)
+  %res = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 0)
   ret <vscale x 1 x i8> %res
 }
 
@@ -363,7 +363,7 @@ define <vscale x 1 x i8> @splice_nxv1i8_offset_negone(<vscale x 1 x i8> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 -1)
+  %res = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 -1)
   ret <vscale x 1 x i8> %res
 }
 
@@ -378,7 +378,7 @@ define <vscale x 1 x i8> @splice_nxv1i8_offset_min(<vscale x 1 x i8> %a, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 -2)
+  %res = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 -2)
   ret <vscale x 1 x i8> %res
 }
 
@@ -393,17 +393,17 @@ define <vscale x 1 x i8> @splice_nxv1i8_offset_max(<vscale x 1 x i8> %a, <vscale
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 1)
+  %res = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 1)
   ret <vscale x 1 x i8> %res
 }
 
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
 
 define <vscale x 2 x i8> @splice_nxv2i8_offset_zero(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i8_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 0)
+  %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 0)
   ret <vscale x 2 x i8> %res
 }
 
@@ -418,7 +418,7 @@ define <vscale x 2 x i8> @splice_nxv2i8_offset_negone(<vscale x 2 x i8> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -1)
+  %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -1)
   ret <vscale x 2 x i8> %res
 }
 
@@ -433,7 +433,7 @@ define <vscale x 2 x i8> @splice_nxv2i8_offset_min(<vscale x 2 x i8> %a, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -4)
+  %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -4)
   ret <vscale x 2 x i8> %res
 }
 
@@ -448,17 +448,17 @@ define <vscale x 2 x i8> @splice_nxv2i8_offset_max(<vscale x 2 x i8> %a, <vscale
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 3)
+  %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 3)
   ret <vscale x 2 x i8> %res
 }
 
-declare <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, i32)
+declare <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, i32)
 
 define <vscale x 4 x i8> @splice_nxv4i8_offset_zero(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i8_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 0)
+  %res = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 0)
   ret <vscale x 4 x i8> %res
 }
 
@@ -473,7 +473,7 @@ define <vscale x 4 x i8> @splice_nxv4i8_offset_negone(<vscale x 4 x i8> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 -1)
+  %res = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 -1)
   ret <vscale x 4 x i8> %res
 }
 
@@ -488,7 +488,7 @@ define <vscale x 4 x i8> @splice_nxv4i8_offset_min(<vscale x 4 x i8> %a, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 -8)
+  %res = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 -8)
   ret <vscale x 4 x i8> %res
 }
 
@@ -503,17 +503,17 @@ define <vscale x 4 x i8> @splice_nxv4i8_offset_max(<vscale x 4 x i8> %a, <vscale
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 7)
+  %res = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 7)
   ret <vscale x 4 x i8> %res
 }
 
-declare <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32)
+declare <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32)
 
 define <vscale x 8 x i8> @splice_nxv8i8_offset_zero(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i8_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 0)
+  %res = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 0)
   ret <vscale x 8 x i8> %res
 }
 
@@ -527,7 +527,7 @@ define <vscale x 8 x i8> @splice_nxv8i8_offset_negone(<vscale x 8 x i8> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 -1)
+  %res = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 -1)
   ret <vscale x 8 x i8> %res
 }
 
@@ -541,7 +541,7 @@ define <vscale x 8 x i8> @splice_nxv8i8_offset_min(<vscale x 8 x i8> %a, <vscale
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 -16)
+  %res = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 -16)
   ret <vscale x 8 x i8> %res
 }
 
@@ -555,17 +555,17 @@ define <vscale x 8 x i8> @splice_nxv8i8_offset_max(<vscale x 8 x i8> %a, <vscale
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 15)
+  %res = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 15)
   ret <vscale x 8 x i8> %res
 }
 
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
 
 define <vscale x 16 x i8> @splice_nxv16i8_offset_zero(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i8_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
   ret <vscale x 16 x i8> %res
 }
 
@@ -580,7 +580,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_offset_negone(<vscale x 16 x i8> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
   ret <vscale x 16 x i8> %res
 }
 
@@ -596,7 +596,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_offset_min(<vscale x 16 x i8> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
   ret <vscale x 16 x i8> %res
 }
 
@@ -611,17 +611,17 @@ define <vscale x 16 x i8> @splice_nxv16i8_offset_max(<vscale x 16 x i8> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 31)
+  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 31)
   ret <vscale x 16 x i8> %res
 }
 
-declare <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
+declare <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
 
 define <vscale x 32 x i8> @splice_nxv32i8_offset_zero(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv32i8_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 0)
+  %res = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 0)
   ret <vscale x 32 x i8> %res
 }
 
@@ -636,7 +636,7 @@ define <vscale x 32 x i8> @splice_nxv32i8_offset_negone(<vscale x 32 x i8> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 -1)
+  %res = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 -1)
   ret <vscale x 32 x i8> %res
 }
 
@@ -652,7 +652,7 @@ define <vscale x 32 x i8> @splice_nxv32i8_offset_min(<vscale x 32 x i8> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 -64)
+  %res = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 -64)
   ret <vscale x 32 x i8> %res
 }
 
@@ -668,17 +668,17 @@ define <vscale x 32 x i8> @splice_nxv32i8_offset_max(<vscale x 32 x i8> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 63)
+  %res = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 63)
   ret <vscale x 32 x i8> %res
 }
 
-declare <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, i32)
+declare <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, i32)
 
 define <vscale x 64 x i8> @splice_nxv64i8_offset_zero(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b) #0 {
 ; CHECK-LABEL: splice_nxv64i8_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 0)
+  %res = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 0)
   ret <vscale x 64 x i8> %res
 }
 
@@ -693,7 +693,7 @@ define <vscale x 64 x i8> @splice_nxv64i8_offset_negone(<vscale x 64 x i8> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v16, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 -1)
+  %res = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 -1)
   ret <vscale x 64 x i8> %res
 }
 
@@ -709,7 +709,7 @@ define <vscale x 64 x i8> @splice_nxv64i8_offset_min(<vscale x 64 x i8> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 -128)
+  %res = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 -128)
   ret <vscale x 64 x i8> %res
 }
 
@@ -725,17 +725,17 @@ define <vscale x 64 x i8> @splice_nxv64i8_offset_max(<vscale x 64 x i8> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 127)
+  %res = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 127)
   ret <vscale x 64 x i8> %res
 }
 
-declare <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
+declare <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
 
 define <vscale x 1 x i16> @splice_nxv1i16_offset_zero(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv1i16_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 0)
+  %res = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 0)
   ret <vscale x 1 x i16> %res
 }
 
@@ -750,7 +750,7 @@ define <vscale x 1 x i16> @splice_nxv1i16_offset_negone(<vscale x 1 x i16> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 -1)
+  %res = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 -1)
   ret <vscale x 1 x i16> %res
 }
 
@@ -765,7 +765,7 @@ define <vscale x 1 x i16> @splice_nxv1i16_offset_min(<vscale x 1 x i16> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 -2)
+  %res = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 -2)
   ret <vscale x 1 x i16> %res
 }
 
@@ -780,17 +780,17 @@ define <vscale x 1 x i16> @splice_nxv1i16_offset_max(<vscale x 1 x i16> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 1)
+  %res = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 1)
   ret <vscale x 1 x i16> %res
 }
 
-declare <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+declare <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
 
 define <vscale x 2 x i16> @splice_nxv2i16_offset_zero(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i16_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 0)
+  %res = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 0)
   ret <vscale x 2 x i16> %res
 }
 
@@ -805,7 +805,7 @@ define <vscale x 2 x i16> @splice_nxv2i16_offset_negone(<vscale x 2 x i16> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 -1)
+  %res = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 -1)
   ret <vscale x 2 x i16> %res
 }
 
@@ -820,7 +820,7 @@ define <vscale x 2 x i16> @splice_nxv2i16_offset_min(<vscale x 2 x i16> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 -4)
+  %res = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 -4)
   ret <vscale x 2 x i16> %res
 }
 
@@ -835,17 +835,17 @@ define <vscale x 2 x i16> @splice_nxv2i16_offset_max(<vscale x 2 x i16> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 3)
+  %res = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 3)
   ret <vscale x 2 x i16> %res
 }
 
-declare <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+declare <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
 
 define <vscale x 4 x i16> @splice_nxv4i16_offset_zero(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i16_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 0)
+  %res = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 0)
   ret <vscale x 4 x i16> %res
 }
 
@@ -860,7 +860,7 @@ define <vscale x 4 x i16> @splice_nxv4i16_offset_negone(<vscale x 4 x i16> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 -1)
+  %res = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 -1)
   ret <vscale x 4 x i16> %res
 }
 
@@ -875,7 +875,7 @@ define <vscale x 4 x i16> @splice_nxv4i16_offset_min(<vscale x 4 x i16> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 -8)
+  %res = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 -8)
   ret <vscale x 4 x i16> %res
 }
 
@@ -890,17 +890,17 @@ define <vscale x 4 x i16> @splice_nxv4i16_offset_max(<vscale x 4 x i16> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 7)
+  %res = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 7)
   ret <vscale x 4 x i16> %res
 }
 
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
 
 define <vscale x 8 x i16> @splice_nxv8i16_offset_zero(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i16_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 0)
+  %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 0)
   ret <vscale x 8 x i16> %res
 }
 
@@ -914,7 +914,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_offset_negone(<vscale x 8 x i16> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
+  %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
   ret <vscale x 8 x i16> %res
 }
 
@@ -928,7 +928,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_offset_min(<vscale x 8 x i16> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -16)
+  %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -16)
   ret <vscale x 8 x i16> %res
 }
 
@@ -942,17 +942,17 @@ define <vscale x 8 x i16> @splice_nxv8i16_offset_max(<vscale x 8 x i16> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 15)
+  %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 15)
   ret <vscale x 8 x i16> %res
 }
 
-declare <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
+declare <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
 
 define <vscale x 16 x i16> @splice_nxv16i16_offset_zero(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i16_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 0)
+  %res = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 0)
   ret <vscale x 16 x i16> %res
 }
 
@@ -967,7 +967,7 @@ define <vscale x 16 x i16> @splice_nxv16i16_offset_negone(<vscale x 16 x i16> %a
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 -1)
+  %res = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 -1)
   ret <vscale x 16 x i16> %res
 }
 
@@ -983,7 +983,7 @@ define <vscale x 16 x i16> @splice_nxv16i16_offset_min(<vscale x 16 x i16> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 -32)
+  %res = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 -32)
   ret <vscale x 16 x i16> %res
 }
 
@@ -998,17 +998,17 @@ define <vscale x 16 x i16> @splice_nxv16i16_offset_max(<vscale x 16 x i16> %a, <
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 31)
+  %res = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 31)
   ret <vscale x 16 x i16> %res
 }
 
-declare <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, i32)
+declare <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, i32)
 
 define <vscale x 32 x i16> @splice_nxv32i16_offset_zero(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b) #0 {
 ; CHECK-LABEL: splice_nxv32i16_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 0)
+  %res = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 0)
   ret <vscale x 32 x i16> %res
 }
 
@@ -1023,7 +1023,7 @@ define <vscale x 32 x i16> @splice_nxv32i16_offset_negone(<vscale x 32 x i16> %a
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v16, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 -1)
+  %res = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 -1)
   ret <vscale x 32 x i16> %res
 }
 
@@ -1039,7 +1039,7 @@ define <vscale x 32 x i16> @splice_nxv32i16_offset_min(<vscale x 32 x i16> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 -64)
+  %res = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 -64)
   ret <vscale x 32 x i16> %res
 }
 
@@ -1055,17 +1055,17 @@ define <vscale x 32 x i16> @splice_nxv32i16_offset_max(<vscale x 32 x i16> %a, <
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 63)
+  %res = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 63)
   ret <vscale x 32 x i16> %res
 }
 
-declare <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, i32)
+declare <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, i32)
 
 define <vscale x 1 x i32> @splice_nxv1i32_offset_zero(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv1i32_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 0)
+  %res = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 0)
   ret <vscale x 1 x i32> %res
 }
 
@@ -1080,7 +1080,7 @@ define <vscale x 1 x i32> @splice_nxv1i32_offset_negone(<vscale x 1 x i32> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 -1)
+  %res = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 -1)
   ret <vscale x 1 x i32> %res
 }
 
@@ -1095,7 +1095,7 @@ define <vscale x 1 x i32> @splice_nxv1i32_offset_min(<vscale x 1 x i32> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 -2)
+  %res = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 -2)
   ret <vscale x 1 x i32> %res
 }
 
@@ -1110,17 +1110,17 @@ define <vscale x 1 x i32> @splice_nxv1i32_offset_max(<vscale x 1 x i32> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 1)
+  %res = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 1)
   ret <vscale x 1 x i32> %res
 }
 
-declare <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)
+declare <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)
 
 define <vscale x 2 x i32> @splice_nxv2i32_offset_zero(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i32_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 0)
+  %res = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 0)
   ret <vscale x 2 x i32> %res
 }
 
@@ -1135,7 +1135,7 @@ define <vscale x 2 x i32> @splice_nxv2i32_offset_negone(<vscale x 2 x i32> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 -1)
+  %res = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 -1)
   ret <vscale x 2 x i32> %res
 }
 
@@ -1150,7 +1150,7 @@ define <vscale x 2 x i32> @splice_nxv2i32_offset_min(<vscale x 2 x i32> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 -4)
+  %res = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 -4)
   ret <vscale x 2 x i32> %res
 }
 
@@ -1165,17 +1165,17 @@ define <vscale x 2 x i32> @splice_nxv2i32_offset_max(<vscale x 2 x i32> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 3)
+  %res = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 3)
   ret <vscale x 2 x i32> %res
 }
 
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
 
 define <vscale x 4 x i32> @splice_nxv4i32_offset_zero(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i32_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 0)
+  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 0)
   ret <vscale x 4 x i32> %res
 }
 
@@ -1190,7 +1190,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_offset_negone(<vscale x 4 x i32> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
+  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
   ret <vscale x 4 x i32> %res
 }
 
@@ -1205,7 +1205,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_offset_min(<vscale x 4 x i32> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -8)
+  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -8)
   ret <vscale x 4 x i32> %res
 }
 
@@ -1220,17 +1220,17 @@ define <vscale x 4 x i32> @splice_nxv4i32_offset_max(<vscale x 4 x i32> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 7)
+  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 7)
   ret <vscale x 4 x i32> %res
 }
 
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
 
 define <vscale x 8 x i32> @splice_nxv8i32_offset_zero(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i32_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 0)
+  %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 0)
   ret <vscale x 8 x i32> %res
 }
 
@@ -1244,7 +1244,7 @@ define <vscale x 8 x i32> @splice_nxv8i32_offset_negone(<vscale x 8 x i32> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -1)
+  %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -1)
   ret <vscale x 8 x i32> %res
 }
 
@@ -1258,7 +1258,7 @@ define <vscale x 8 x i32> @splice_nxv8i32_offset_min(<vscale x 8 x i32> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -16)
+  %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -16)
   ret <vscale x 8 x i32> %res
 }
 
@@ -1272,17 +1272,17 @@ define <vscale x 8 x i32> @splice_nxv8i32_offset_max(<vscale x 8 x i32> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 15)
+  %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 15)
   ret <vscale x 8 x i32> %res
 }
 
-declare <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, i32)
+declare <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, i32)
 
 define <vscale x 16 x i32> @splice_nxv16i32_offset_zero(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i32_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 0)
+  %res = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 0)
   ret <vscale x 16 x i32> %res
 }
 
@@ -1297,7 +1297,7 @@ define <vscale x 16 x i32> @splice_nxv16i32_offset_negone(<vscale x 16 x i32> %a
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v16, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 -1)
+  %res = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 -1)
   ret <vscale x 16 x i32> %res
 }
 
@@ -1313,7 +1313,7 @@ define <vscale x 16 x i32> @splice_nxv16i32_offset_min(<vscale x 16 x i32> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 -32)
+  %res = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 -32)
   ret <vscale x 16 x i32> %res
 }
 
@@ -1328,17 +1328,17 @@ define <vscale x 16 x i32> @splice_nxv16i32_offset_max(<vscale x 16 x i32> %a, <
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 31)
+  %res = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 31)
   ret <vscale x 16 x i32> %res
 }
 
-declare <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
+declare <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
 
 define <vscale x 1 x i64> @splice_nxv1i64_offset_zero(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b) #0 {
 ; CHECK-LABEL: splice_nxv1i64_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 0)
+  %res = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 0)
   ret <vscale x 1 x i64> %res
 }
 
@@ -1353,7 +1353,7 @@ define <vscale x 1 x i64> @splice_nxv1i64_offset_negone(<vscale x 1 x i64> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 -1)
+  %res = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 -1)
   ret <vscale x 1 x i64> %res
 }
 
@@ -1368,7 +1368,7 @@ define <vscale x 1 x i64> @splice_nxv1i64_offset_min(<vscale x 1 x i64> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 -2)
+  %res = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 -2)
   ret <vscale x 1 x i64> %res
 }
 
@@ -1383,17 +1383,17 @@ define <vscale x 1 x i64> @splice_nxv1i64_offset_max(<vscale x 1 x i64> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 1)
+  %res = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 1)
   ret <vscale x 1 x i64> %res
 }
 
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
 
 define <vscale x 2 x i64> @splice_nxv2i64_offset_zero(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i64_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 0)
+  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 0)
   ret <vscale x 2 x i64> %res
 }
 
@@ -1408,7 +1408,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_offset_negone(<vscale x 2 x i64> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
+  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
   ret <vscale x 2 x i64> %res
 }
 
@@ -1423,7 +1423,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_offset_min(<vscale x 2 x i64> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 4
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -4)
+  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -4)
   ret <vscale x 2 x i64> %res
 }
 
@@ -1438,17 +1438,17 @@ define <vscale x 2 x i64> @splice_nxv2i64_offset_max(<vscale x 2 x i64> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 3)
+  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 3)
   ret <vscale x 2 x i64> %res
 }
 
-declare <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
+declare <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
 
 define <vscale x 4 x i64> @splice_nxv4i64_offset_zero(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i64_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 0)
+  %res = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 0)
   ret <vscale x 4 x i64> %res
 }
 
@@ -1463,7 +1463,7 @@ define <vscale x 4 x i64> @splice_nxv4i64_offset_negone(<vscale x 4 x i64> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 -1)
+  %res = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 -1)
   ret <vscale x 4 x i64> %res
 }
 
@@ -1478,7 +1478,7 @@ define <vscale x 4 x i64> @splice_nxv4i64_offset_min(<vscale x 4 x i64> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 -8)
+  %res = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 -8)
   ret <vscale x 4 x i64> %res
 }
 
@@ -1493,17 +1493,17 @@ define <vscale x 4 x i64> @splice_nxv4i64_offset_max(<vscale x 4 x i64> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 7)
+  %res = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 7)
   ret <vscale x 4 x i64> %res
 }
 
-declare <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, i32)
+declare <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, i32)
 
 define <vscale x 8 x i64> @splice_nxv8i64_offset_zero(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i64_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 0)
+  %res = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 0)
   ret <vscale x 8 x i64> %res
 }
 
@@ -1517,7 +1517,7 @@ define <vscale x 8 x i64> @splice_nxv8i64_offset_negone(<vscale x 8 x i64> %a, <
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v16, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 -1)
+  %res = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 -1)
   ret <vscale x 8 x i64> %res
 }
 
@@ -1531,7 +1531,7 @@ define <vscale x 8 x i64> @splice_nxv8i64_offset_min(<vscale x 8 x i64> %a, <vsc
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v16, 16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 -16)
+  %res = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 -16)
   ret <vscale x 8 x i64> %res
 }
 
@@ -1545,17 +1545,17 @@ define <vscale x 8 x i64> @splice_nxv8i64_offset_max(<vscale x 8 x i64> %a, <vsc
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 15)
+  %res = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 15)
   ret <vscale x 8 x i64> %res
 }
 
-declare <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
+declare <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
 
 define <vscale x 1 x half> @splice_nxv1f16_offset_zero(<vscale x 1 x half> %a, <vscale x 1 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv1f16_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 0)
+  %res = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 0)
   ret <vscale x 1 x half> %res
 }
 
@@ -1570,7 +1570,7 @@ define <vscale x 1 x half> @splice_nxv1f16_offset_negone(<vscale x 1 x half> %a,
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 -1)
+  %res = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 -1)
   ret <vscale x 1 x half> %res
 }
 
@@ -1585,7 +1585,7 @@ define <vscale x 1 x half> @splice_nxv1f16_offset_min(<vscale x 1 x half> %a, <v
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 -2)
+  %res = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 -2)
   ret <vscale x 1 x half> %res
 }
 
@@ -1600,17 +1600,17 @@ define <vscale x 1 x half> @splice_nxv1f16_offset_max(<vscale x 1 x half> %a, <v
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 1)
+  %res = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 1)
   ret <vscale x 1 x half> %res
 }
 
-declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
+declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
 
 define <vscale x 2 x half> @splice_nxv2f16_offset_zero(<vscale x 2 x half> %a, <vscale x 2 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f16_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 0)
+  %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 0)
   ret <vscale x 2 x half> %res
 }
 
@@ -1625,7 +1625,7 @@ define <vscale x 2 x half> @splice_nxv2f16_offset_negone(<vscale x 2 x half> %a,
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
+  %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
   ret <vscale x 2 x half> %res
 }
 
@@ -1640,7 +1640,7 @@ define <vscale x 2 x half> @splice_nxv2f16_offset_min(<vscale x 2 x half> %a, <v
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -4)
+  %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -4)
   ret <vscale x 2 x half> %res
 }
 
@@ -1655,17 +1655,17 @@ define <vscale x 2 x half> @splice_nxv2f16_offset_max(<vscale x 2 x half> %a, <v
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 3)
+  %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 3)
   ret <vscale x 2 x half> %res
 }
 
-declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
+declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
 
 define <vscale x 4 x half> @splice_nxv4f16_offset_zero(<vscale x 4 x half> %a, <vscale x 4 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f16_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 0)
+  %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 0)
   ret <vscale x 4 x half> %res
 }
 
@@ -1680,7 +1680,7 @@ define <vscale x 4 x half> @splice_nxv4f16_offset_negone(<vscale x 4 x half> %a,
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
+  %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
   ret <vscale x 4 x half> %res
 }
 
@@ -1695,7 +1695,7 @@ define <vscale x 4 x half> @splice_nxv4f16_offset_min(<vscale x 4 x half> %a, <v
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -8)
+  %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -8)
   ret <vscale x 4 x half> %res
 }
 
@@ -1710,17 +1710,17 @@ define <vscale x 4 x half> @splice_nxv4f16_offset_max(<vscale x 4 x half> %a, <v
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 7)
+  %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 7)
   ret <vscale x 4 x half> %res
 }
 
-declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
 
 define <vscale x 8 x half> @splice_nxv8f16_offset_zero(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv8f16_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 0)
+  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 0)
   ret <vscale x 8 x half> %res
 }
 
@@ -1734,7 +1734,7 @@ define <vscale x 8 x half> @splice_nxv8f16_offset_negone(<vscale x 8 x half> %a,
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
+  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
   ret <vscale x 8 x half> %res
 }
 
@@ -1748,7 +1748,7 @@ define <vscale x 8 x half> @splice_nxv8f16_offset_min(<vscale x 8 x half> %a, <v
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -16)
+  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -16)
   ret <vscale x 8 x half> %res
 }
 
@@ -1762,17 +1762,17 @@ define <vscale x 8 x half> @splice_nxv8f16_offset_max(<vscale x 8 x half> %a, <v
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 15)
+  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 15)
   ret <vscale x 8 x half> %res
 }
 
-declare <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
+declare <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
 
 define <vscale x 16 x half> @splice_nxv16f16_offset_zero(<vscale x 16 x half> %a, <vscale x 16 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv16f16_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 0)
+  %res = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 0)
   ret <vscale x 16 x half> %res
 }
 
@@ -1787,7 +1787,7 @@ define <vscale x 16 x half> @splice_nxv16f16_offset_negone(<vscale x 16 x half>
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 -1)
+  %res = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 -1)
   ret <vscale x 16 x half> %res
 }
 
@@ -1803,7 +1803,7 @@ define <vscale x 16 x half> @splice_nxv16f16_offset_min(<vscale x 16 x half> %a,
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 -32)
+  %res = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 -32)
   ret <vscale x 16 x half> %res
 }
 
@@ -1818,17 +1818,17 @@ define <vscale x 16 x half> @splice_nxv16f16_offset_max(<vscale x 16 x half> %a,
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 31)
+  %res = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 31)
   ret <vscale x 16 x half> %res
 }
 
-declare <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half>, <vscale x 32 x half>, i32)
+declare <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half>, <vscale x 32 x half>, i32)
 
 define <vscale x 32 x half> @splice_nxv32f16_offset_zero(<vscale x 32 x half> %a, <vscale x 32 x half> %b) #0 {
 ; CHECK-LABEL: splice_nxv32f16_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 0)
+  %res = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 0)
   ret <vscale x 32 x half> %res
 }
 
@@ -1843,7 +1843,7 @@ define <vscale x 32 x half> @splice_nxv32f16_offset_negone(<vscale x 32 x half>
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v16, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 -1)
+  %res = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 -1)
   ret <vscale x 32 x half> %res
 }
 
@@ -1859,7 +1859,7 @@ define <vscale x 32 x half> @splice_nxv32f16_offset_min(<vscale x 32 x half> %a,
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 -64)
+  %res = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 -64)
   ret <vscale x 32 x half> %res
 }
 
@@ -1875,17 +1875,17 @@ define <vscale x 32 x half> @splice_nxv32f16_offset_max(<vscale x 32 x half> %a,
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 63)
+  %res = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 63)
   ret <vscale x 32 x half> %res
 }
 
-declare <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
+declare <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
 
 define <vscale x 1 x float> @splice_nxv1f32_offset_zero(<vscale x 1 x float> %a, <vscale x 1 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv1f32_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 0)
+  %res = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 0)
   ret <vscale x 1 x float> %res
 }
 
@@ -1900,7 +1900,7 @@ define <vscale x 1 x float> @splice_nxv1f32_offset_negone(<vscale x 1 x float> %
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 -1)
+  %res = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 -1)
   ret <vscale x 1 x float> %res
 }
 
@@ -1915,7 +1915,7 @@ define <vscale x 1 x float> @splice_nxv1f32_offset_min(<vscale x 1 x float> %a,
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 -2)
+  %res = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 -2)
   ret <vscale x 1 x float> %res
 }
 
@@ -1930,17 +1930,17 @@ define <vscale x 1 x float> @splice_nxv1f32_offset_max(<vscale x 1 x float> %a,
 ; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 1)
+  %res = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 1)
   ret <vscale x 1 x float> %res
 }
 
-declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
+declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
 
 define <vscale x 2 x float> @splice_nxv2f32_offset_zero(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f32_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 0)
+  %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 0)
   ret <vscale x 2 x float> %res
 }
 
@@ -1955,7 +1955,7 @@ define <vscale x 2 x float> @splice_nxv2f32_offset_negone(<vscale x 2 x float> %
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
+  %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
   ret <vscale x 2 x float> %res
 }
 
@@ -1970,7 +1970,7 @@ define <vscale x 2 x float> @splice_nxv2f32_offset_min(<vscale x 2 x float> %a,
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -4)
+  %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -4)
   ret <vscale x 2 x float> %res
 }
 
@@ -1985,17 +1985,17 @@ define <vscale x 2 x float> @splice_nxv2f32_offset_max(<vscale x 2 x float> %a,
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 3)
+  %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 3)
   ret <vscale x 2 x float> %res
 }
 
-declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
 
 define <vscale x 4 x float> @splice_nxv4f32_offset_zero(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f32_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 0)
+  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 0)
   ret <vscale x 4 x float> %res
 }
 
@@ -2010,7 +2010,7 @@ define <vscale x 4 x float> @splice_nxv4f32_offset_negone(<vscale x 4 x float> %
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
+  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
   ret <vscale x 4 x float> %res
 }
 
@@ -2025,7 +2025,7 @@ define <vscale x 4 x float> @splice_nxv4f32_offset_min(<vscale x 4 x float> %a,
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -8)
+  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -8)
   ret <vscale x 4 x float> %res
 }
 
@@ -2040,17 +2040,17 @@ define <vscale x 4 x float> @splice_nxv4f32_offset_max(<vscale x 4 x float> %a,
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 7)
+  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 7)
   ret <vscale x 4 x float> %res
 }
 
-declare <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
+declare <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
 
 define <vscale x 8 x float> @splice_nxv8f32_offset_zero(<vscale x 8 x float> %a, <vscale x 8 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv8f32_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 0)
+  %res = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 0)
   ret <vscale x 8 x float> %res
 }
 
@@ -2064,7 +2064,7 @@ define <vscale x 8 x float> @splice_nxv8f32_offset_negone(<vscale x 8 x float> %
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 -1)
+  %res = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 -1)
   ret <vscale x 8 x float> %res
 }
 
@@ -2078,7 +2078,7 @@ define <vscale x 8 x float> @splice_nxv8f32_offset_min(<vscale x 8 x float> %a,
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 -16)
+  %res = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 -16)
   ret <vscale x 8 x float> %res
 }
 
@@ -2092,17 +2092,17 @@ define <vscale x 8 x float> @splice_nxv8f32_offset_max(<vscale x 8 x float> %a,
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 15)
+  %res = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 15)
   ret <vscale x 8 x float> %res
 }
 
-declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
 
 define <vscale x 16 x float> @splice_nxv16f32_offset_zero(<vscale x 16 x float> %a, <vscale x 16 x float> %b) #0 {
 ; CHECK-LABEL: splice_nxv16f32_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 0)
+  %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 0)
   ret <vscale x 16 x float> %res
 }
 
@@ -2117,7 +2117,7 @@ define <vscale x 16 x float> @splice_nxv16f32_offset_negone(<vscale x 16 x float
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v16, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -1)
+  %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -1)
   ret <vscale x 16 x float> %res
 }
 
@@ -2133,7 +2133,7 @@ define <vscale x 16 x float> @splice_nxv16f32_offset_min(<vscale x 16 x float> %
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -32)
+  %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -32)
   ret <vscale x 16 x float> %res
 }
 
@@ -2148,17 +2148,17 @@ define <vscale x 16 x float> @splice_nxv16f32_offset_max(<vscale x 16 x float> %
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 31)
+  %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 31)
   ret <vscale x 16 x float> %res
 }
 
-declare <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
+declare <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
 
 define <vscale x 1 x double> @splice_nxv1f64_offset_zero(<vscale x 1 x double> %a, <vscale x 1 x double> %b) #0 {
 ; CHECK-LABEL: splice_nxv1f64_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 0)
+  %res = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 0)
   ret <vscale x 1 x double> %res
 }
 
@@ -2173,7 +2173,7 @@ define <vscale x 1 x double> @splice_nxv1f64_offset_negone(<vscale x 1 x double>
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 -1)
+  %res = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 -1)
   ret <vscale x 1 x double> %res
 }
 
@@ -2188,7 +2188,7 @@ define <vscale x 1 x double> @splice_nxv1f64_offset_min(<vscale x 1 x double> %a
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 -2)
+  %res = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 -2)
   ret <vscale x 1 x double> %res
 }
 
@@ -2203,17 +2203,17 @@ define <vscale x 1 x double> @splice_nxv1f64_offset_max(<vscale x 1 x double> %a
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 1)
+  %res = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 1)
   ret <vscale x 1 x double> %res
 }
 
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
 
 define <vscale x 2 x double> @splice_nxv2f64_offset_zero(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-LABEL: splice_nxv2f64_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 0)
+  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 0)
   ret <vscale x 2 x double> %res
 }
 
@@ -2228,7 +2228,7 @@ define <vscale x 2 x double> @splice_nxv2f64_offset_negone(<vscale x 2 x double>
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
+  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
   ret <vscale x 2 x double> %res
 }
 
@@ -2243,7 +2243,7 @@ define <vscale x 2 x double> @splice_nxv2f64_offset_min(<vscale x 2 x double> %a
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 4
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -4)
+  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -4)
   ret <vscale x 2 x double> %res
 }
 
@@ -2258,17 +2258,17 @@ define <vscale x 2 x double> @splice_nxv2f64_offset_max(<vscale x 2 x double> %a
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 3)
+  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 3)
   ret <vscale x 2 x double> %res
 }
 
-declare <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
+declare <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
 
 define <vscale x 4 x double> @splice_nxv4f64_offset_zero(<vscale x 4 x double> %a, <vscale x 4 x double> %b) #0 {
 ; CHECK-LABEL: splice_nxv4f64_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 0)
+  %res = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 0)
   ret <vscale x 4 x double> %res
 }
 
@@ -2283,7 +2283,7 @@ define <vscale x 4 x double> @splice_nxv4f64_offset_negone(<vscale x 4 x double>
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 -1)
+  %res = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 -1)
   ret <vscale x 4 x double> %res
 }
 
@@ -2298,7 +2298,7 @@ define <vscale x 4 x double> @splice_nxv4f64_offset_min(<vscale x 4 x double> %a
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 -8)
+  %res = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 -8)
   ret <vscale x 4 x double> %res
 }
 
@@ -2313,17 +2313,17 @@ define <vscale x 4 x double> @splice_nxv4f64_offset_max(<vscale x 4 x double> %a
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 7)
+  %res = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 7)
   ret <vscale x 4 x double> %res
 }
 
-declare <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double>, <vscale x 8 x double>, i32)
+declare <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double>, <vscale x 8 x double>, i32)
 
 define <vscale x 8 x double> @splice_nxv8f64_offset_zero(<vscale x 8 x double> %a, <vscale x 8 x double> %b) #0 {
 ; CHECK-LABEL: splice_nxv8f64_offset_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 0)
+  %res = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 0)
   ret <vscale x 8 x double> %res
 }
 
@@ -2337,7 +2337,7 @@ define <vscale x 8 x double> @splice_nxv8f64_offset_negone(<vscale x 8 x double>
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v16, 1
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 -1)
+  %res = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 -1)
   ret <vscale x 8 x double> %res
 }
 
@@ -2351,7 +2351,7 @@ define <vscale x 8 x double> @splice_nxv8f64_offset_min(<vscale x 8 x double> %a
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v16, 16
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 -16)
+  %res = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 -16)
   ret <vscale x 8 x double> %res
 }
 
@@ -2365,7 +2365,7 @@ define <vscale x 8 x double> @splice_nxv8f64_offset_max(<vscale x 8 x double> %a
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a0
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 15)
+  %res = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 15)
   ret <vscale x 8 x double> %res
 }
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-cttz-elts.ll b/llvm/test/CodeGen/RISCV/rvv/vp-cttz-elts.ll
new file mode 100644
index 000000000000..8b368bfaab08
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-cttz-elts.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr='+v' -verify-machineinstrs | FileCheck %s --check-prefix=RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr='+v' -verify-machineinstrs | FileCheck %s --check-prefix=RV64
+
+define iXLen @bool_vec(<vscale x 2 x i1> %src, <vscale x 2 x i1> %m, i32 %evl) {
+; RV32-LABEL: bool_vec:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v9, v0
+; RV32-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vfirst.m a1, v9, v0.t
+; RV32-NEXT:    bltz a1, .LBB0_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB0_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bool_vec:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vmv1r.v v9, v0
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    srli a0, a0, 32
+; RV64-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v8
+; RV64-NEXT:    vfirst.m a1, v9, v0.t
+; RV64-NEXT:    bltz a1, .LBB0_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB0_2:
+; RV64-NEXT:    ret
+  %r = call iXLen @llvm.vp.cttz.elts.iXLen.nxv2i1(<vscale x 2 x i1> %src, i1 0, <vscale x 2 x i1> %m, i32 %evl)
+  ret iXLen %r
+}
+
+define iXLen @bool_vec_zero_poison(<vscale x 2 x i1> %src, <vscale x 2 x i1> %m, i32 %evl) {
+; RV32-LABEL: bool_vec_zero_poison:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vmv1r.v v9, v0
+; RV32-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vfirst.m a0, v9, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bool_vec_zero_poison:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vmv1r.v v9, v0
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    srli a0, a0, 32
+; RV64-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; RV64-NEXT:    vmv1r.v v0, v8
+; RV64-NEXT:    vfirst.m a0, v9, v0.t
+; RV64-NEXT:    ret
+  %r = call iXLen @llvm.vp.cttz.elts.iXLen.nxv2i1(<vscale x 2 x i1> %src, i1 1, <vscale x 2 x i1> %m, i32 %evl)
+  ret iXLen %r
+}
+
+define iXLen @nxv2i32(<vscale x 2 x i32> %src, <vscale x 2 x i1> %m, i32 %evl) {
+; RV32-LABEL: nxv2i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; RV32-NEXT:    vmsne.vi v8, v8, 0, v0.t
+; RV32-NEXT:    vfirst.m a1, v8, v0.t
+; RV32-NEXT:    bltz a1, .LBB2_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB2_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: nxv2i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    srli a0, a0, 32
+; RV64-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; RV64-NEXT:    vmsne.vi v8, v8, 0, v0.t
+; RV64-NEXT:    vfirst.m a1, v8, v0.t
+; RV64-NEXT:    bltz a1, .LBB2_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB2_2:
+; RV64-NEXT:    ret
+  %r = call iXLen @llvm.vp.cttz.elts.iXLen.nxv2i32(<vscale x 2 x i32> %src, i1 0, <vscale x 2 x i1> %m, i32 %evl)
+  ret iXLen %r
+}
+
+define iXLen @nxv2i32_zero_poison(<vscale x 2 x i32> %src, <vscale x 2 x i1> %m, i32 %evl) {
+; RV32-LABEL: nxv2i32_zero_poison:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; RV32-NEXT:    vmsne.vi v8, v8, 0, v0.t
+; RV32-NEXT:    vfirst.m a0, v8, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: nxv2i32_zero_poison:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    srli a0, a0, 32
+; RV64-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; RV64-NEXT:    vmsne.vi v8, v8, 0, v0.t
+; RV64-NEXT:    vfirst.m a0, v8, v0.t
+; RV64-NEXT:    ret
+  %r = call iXLen @llvm.vp.cttz.elts.iXLen.nxv2i32(<vscale x 2 x i32> %src, i1 1, <vscale x 2 x i1> %m, i32 %evl)
+  ret iXLen %r
+}
+
+define iXLen @nxv2i64(<vscale x 2 x i64> %src, <vscale x 2 x i1> %m, i32 %evl) {
+; RV32-LABEL: nxv2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vmsne.vi v10, v8, 0, v0.t
+; RV32-NEXT:    vfirst.m a1, v10, v0.t
+; RV32-NEXT:    bltz a1, .LBB4_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB4_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: nxv2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    srli a0, a0, 32
+; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV64-NEXT:    vmsne.vi v10, v8, 0, v0.t
+; RV64-NEXT:    vfirst.m a1, v10, v0.t
+; RV64-NEXT:    bltz a1, .LBB4_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB4_2:
+; RV64-NEXT:    ret
+  %r = call iXLen @llvm.vp.cttz.elts.iXLen.nxv2i64(<vscale x 2 x i64> %src, i1 0, <vscale x 2 x i1> %m, i32 %evl)
+  ret iXLen %r
+}
+
+define iXLen @nxv2i64_zero_poison(<vscale x 2 x i64> %src, <vscale x 2 x i1> %m, i32 %evl) {
+; RV32-LABEL: nxv2i64_zero_poison:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vmsne.vi v10, v8, 0, v0.t
+; RV32-NEXT:    vfirst.m a0, v10, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: nxv2i64_zero_poison:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    srli a0, a0, 32
+; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV64-NEXT:    vmsne.vi v10, v8, 0, v0.t
+; RV64-NEXT:    vfirst.m a0, v10, v0.t
+; RV64-NEXT:    ret
+  %r = call iXLen @llvm.vp.cttz.elts.iXLen.nxv2i64(<vscale x 2 x i64> %src, i1 1, <vscale x 2 x i1> %m, i32 %evl)
+  ret iXLen %r
+}
+
+define i1 @nxv2i32_cmp_evl(<vscale x 2 x i32> %src, <vscale x 2 x i1> %m, i32 %evl) {
+; RV32-LABEL: nxv2i32_cmp_evl:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; RV32-NEXT:    vmsne.vi v8, v8, 0, v0.t
+; RV32-NEXT:    vfirst.m a2, v8, v0.t
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    bltz a2, .LBB6_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:  .LBB6_2:
+; RV32-NEXT:    xor a0, a1, a0
+; RV32-NEXT:    seqz a0, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: nxv2i32_cmp_evl:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a0, 32
+; RV64-NEXT:    srli a1, a1, 32
+; RV64-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT:    vmsne.vi v8, v8, 0, v0.t
+; RV64-NEXT:    vfirst.m a2, v8, v0.t
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    bltz a2, .LBB6_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a1, a2
+; RV64-NEXT:  .LBB6_2:
+; RV64-NEXT:    sext.w a1, a1
+; RV64-NEXT:    xor a0, a1, a0
+; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    ret
+  %r = call i32 @llvm.vp.cttz.elts.i32.nxv2i32(<vscale x 2 x i32> %src, i1 0, <vscale x 2 x i1> %m, i32 %evl)
+  %cmp = icmp eq i32 %r, %evl
+  ret i1 %cmp
+}
+
+define iXLen @fixed_v2i64(<2 x i64> %src, <2 x i1> %m, i32 %evl) {
+; RV32-LABEL: fixed_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vmsne.vi v8, v8, 0, v0.t
+; RV32-NEXT:    vfirst.m a1, v8, v0.t
+; RV32-NEXT:    bltz a1, .LBB7_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB7_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: fixed_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    srli a0, a0, 32
+; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV64-NEXT:    vmsne.vi v8, v8, 0, v0.t
+; RV64-NEXT:    vfirst.m a1, v8, v0.t
+; RV64-NEXT:    bltz a1, .LBB7_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB7_2:
+; RV64-NEXT:    ret
+  %r = call iXLen @llvm.vp.cttz.elts.iXLen.v2i64(<2 x i64> %src, i1 0, <2 x i1> %m, i32 %evl)
+  ret iXLen %r
+}
+
+define iXLen @fixed_v2i64_zero_poison(<2 x i64> %src, <2 x i1> %m, i32 %evl) {
+; RV32-LABEL: fixed_v2i64_zero_poison:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vmsne.vi v8, v8, 0, v0.t
+; RV32-NEXT:    vfirst.m a0, v8, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: fixed_v2i64_zero_poison:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 32
+; RV64-NEXT:    srli a0, a0, 32
+; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV64-NEXT:    vmsne.vi v8, v8, 0, v0.t
+; RV64-NEXT:    vfirst.m a0, v8, v0.t
+; RV64-NEXT:    ret
+  %r = call iXLen @llvm.vp.cttz.elts.iXLen.v2i64(<2 x i64> %src, i1 1, <2 x i1> %m, i32 %evl)
+  ret iXLen %r
+}
+
+declare iXLen @llvm.vp.cttz.elts.iXLen.nxv2i1(<vscale x 2 x i1>, i1, <vscale x 2 x i1>, i32)
+declare iXLen @llvm.vp.cttz.elts.iXLen.nxv2i32(<vscale x 2 x i32>, i1, <vscale x 2 x i1>, i32)
+declare iXLen @llvm.vp.cttz.elts.iXLen.nxv2i64(<vscale x 2 x i64>, i1, <vscale x 2 x i1>, i32)
+declare iXLen @llvm.vp.cttz.elts.iXLen.v2i64(<2 x i64>, i1, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
index 618e169e1f96..7bcf37b1af3c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
@@ -841,24 +841,14 @@ define signext i32 @vpreduce_add_nxv1i32(i32 signext %s, <vscale x 1 x i32> %v,
 declare i32 @llvm.vp.reduce.umax.nxv1i32(i32, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
 
 define signext i32 @vpreduce_umax_nxv1i32(i32 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vpreduce_umax_nxv1i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v9, a0
-; RV32-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; RV32-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vpreduce_umax_nxv1i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, -1
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.s.x v9, a0
-; RV64-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; RV64-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
-; RV64-NEXT:    vmv.x.s a0, v9
-; RV64-NEXT:    ret
+; CHECK-LABEL: vpreduce_umax_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.umax.nxv1i32(i32 %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 %evl)
   ret i32 %r
 }
@@ -881,24 +871,14 @@ define signext i32 @vpreduce_smax_nxv1i32(i32 signext %s, <vscale x 1 x i32> %v,
 declare i32 @llvm.vp.reduce.umin.nxv1i32(i32, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
 
 define signext i32 @vpreduce_umin_nxv1i32(i32 signext %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vpreduce_umin_nxv1i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v9, a0
-; RV32-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; RV32-NEXT:    vredminu.vs v9, v8, v9, v0.t
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vpreduce_umin_nxv1i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, -1
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.s.x v9, a0
-; RV64-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; RV64-NEXT:    vredminu.vs v9, v8, v9, v0.t
-; RV64-NEXT:    vmv.x.s a0, v9
-; RV64-NEXT:    ret
+; CHECK-LABEL: vpreduce_umin_nxv1i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vredminu.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.umin.nxv1i32(i32 %s, <vscale x 1 x i32> %v, <vscale x 1 x i1> %m, i32 %evl)
   ret i32 %r
 }
@@ -981,24 +961,14 @@ define signext i32 @vpreduce_add_nxv2i32(i32 signext %s, <vscale x 2 x i32> %v,
 declare i32 @llvm.vp.reduce.umax.nxv2i32(i32, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
 
 define signext i32 @vpreduce_umax_nxv2i32(i32 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vpreduce_umax_nxv2i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v9, a0
-; RV32-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; RV32-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vpreduce_umax_nxv2i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, -1
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.s.x v9, a0
-; RV64-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; RV64-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
-; RV64-NEXT:    vmv.x.s a0, v9
-; RV64-NEXT:    ret
+; CHECK-LABEL: vpreduce_umax_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.umax.nxv2i32(i32 %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 %evl)
   ret i32 %r
 }
@@ -1021,24 +991,14 @@ define signext i32 @vpreduce_smax_nxv2i32(i32 signext %s, <vscale x 2 x i32> %v,
 declare i32 @llvm.vp.reduce.umin.nxv2i32(i32, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
 
 define signext i32 @vpreduce_umin_nxv2i32(i32 signext %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vpreduce_umin_nxv2i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v9, a0
-; RV32-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; RV32-NEXT:    vredminu.vs v9, v8, v9, v0.t
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vpreduce_umin_nxv2i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, -1
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.s.x v9, a0
-; RV64-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; RV64-NEXT:    vredminu.vs v9, v8, v9, v0.t
-; RV64-NEXT:    vmv.x.s a0, v9
-; RV64-NEXT:    ret
+; CHECK-LABEL: vpreduce_umin_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vredminu.vs v9, v8, v9, v0.t
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.umin.nxv2i32(i32 %s, <vscale x 2 x i32> %v, <vscale x 2 x i1> %m, i32 %evl)
   ret i32 %r
 }
@@ -1121,24 +1081,14 @@ define signext i32 @vpreduce_add_nxv4i32(i32 signext %s, <vscale x 4 x i32> %v,
 declare i32 @llvm.vp.reduce.umax.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
 
 define signext i32 @vpreduce_umax_nxv4i32(i32 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vpreduce_umax_nxv4i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v10, a0
-; RV32-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT:    vredmaxu.vs v10, v8, v10, v0.t
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vpreduce_umax_nxv4i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, -1
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.s.x v10, a0
-; RV64-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; RV64-NEXT:    vredmaxu.vs v10, v8, v10, v0.t
-; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    ret
+; CHECK-LABEL: vpreduce_umax_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v10, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vredmaxu.vs v10, v8, v10, v0.t
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.umax.nxv4i32(i32 %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 %evl)
   ret i32 %r
 }
@@ -1146,56 +1096,30 @@ define signext i32 @vpreduce_umax_nxv4i32(i32 signext %s, <vscale x 4 x i32> %v,
 declare i32 @llvm.vp.reduce.umax.nxv32i32(i32, <vscale x 32 x i32>, <vscale x 32 x i1>, i32)
 
 define signext i32 @vpreduce_umax_nxv32i32(i32 signext %s, <vscale x 32 x i32> %v, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vpreduce_umax_nxv32i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    srli a2, a3, 2
-; RV32-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; RV32-NEXT:    vslidedown.vx v24, v0, a2
-; RV32-NEXT:    slli a3, a3, 1
-; RV32-NEXT:    sub a2, a1, a3
-; RV32-NEXT:    sltu a4, a1, a2
-; RV32-NEXT:    addi a4, a4, -1
-; RV32-NEXT:    and a2, a4, a2
-; RV32-NEXT:    bltu a1, a3, .LBB67_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:  .LBB67_2:
-; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.s.x v25, a0
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vredmaxu.vs v25, v8, v25, v0.t
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vmv1r.v v0, v24
-; RV32-NEXT:    vredmaxu.vs v25, v16, v25, v0.t
-; RV32-NEXT:    vmv.x.s a0, v25
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vpreduce_umax_nxv32i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    srli a2, a3, 2
-; RV64-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vx v24, v0, a2
-; RV64-NEXT:    andi a2, a0, -1
-; RV64-NEXT:    slli a3, a3, 1
-; RV64-NEXT:    sub a0, a1, a3
-; RV64-NEXT:    sltu a4, a1, a0
-; RV64-NEXT:    addi a4, a4, -1
-; RV64-NEXT:    and a0, a4, a0
-; RV64-NEXT:    bltu a1, a3, .LBB67_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a1, a3
-; RV64-NEXT:  .LBB67_2:
-; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; RV64-NEXT:    vmv.s.x v25, a2
-; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV64-NEXT:    vredmaxu.vs v25, v8, v25, v0.t
-; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    vredmaxu.vs v25, v16, v25, v0.t
-; RV64-NEXT:    vmv.x.s a0, v25
-; RV64-NEXT:    ret
+; CHECK-LABEL: vpreduce_umax_nxv32i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    srli a2, a3, 2
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v24, v0, a2
+; CHECK-NEXT:    slli a3, a3, 1
+; CHECK-NEXT:    sub a2, a1, a3
+; CHECK-NEXT:    sltu a4, a1, a2
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a2, a4, a2
+; CHECK-NEXT:    bltu a1, a3, .LBB67_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a1, a3
+; CHECK-NEXT:  .LBB67_2:
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v25, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vredmaxu.vs v25, v8, v25, v0.t
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vredmaxu.vs v25, v16, v25, v0.t
+; CHECK-NEXT:    vmv.x.s a0, v25
+; CHECK-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.umax.nxv32i32(i32 %s, <vscale x 32 x i32> %v, <vscale x 32 x i1> %m, i32 %evl)
   ret i32 %r
 }
@@ -1218,24 +1142,14 @@ define signext i32 @vpreduce_smax_nxv4i32(i32 signext %s, <vscale x 4 x i32> %v,
 declare i32 @llvm.vp.reduce.umin.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
 
 define signext i32 @vpreduce_umin_nxv4i32(i32 signext %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vpreduce_umin_nxv4i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v10, a0
-; RV32-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT:    vredminu.vs v10, v8, v10, v0.t
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vpreduce_umin_nxv4i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, -1
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vmv.s.x v10, a0
-; RV64-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; RV64-NEXT:    vredminu.vs v10, v8, v10, v0.t
-; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    ret
+; CHECK-LABEL: vpreduce_umin_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v10, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vredminu.vs v10, v8, v10, v0.t
+; CHECK-NEXT:    vmv.x.s a0, v10
+; CHECK-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.umin.nxv4i32(i32 %s, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i32 %evl)
   ret i32 %r
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
index 621445fb2dc5..4ff2fc7a5fff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
@@ -487,6 +487,54 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
+define void @saxpy_vec_demanded_fields(i64 %n, float %a, ptr nocapture readonly %x, ptr nocapture %y) {
+; CHECK-LABEL: saxpy_vec_demanded_fields:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a3, a0, e32, m8, ta, ma
+; CHECK-NEXT:    beqz a3, .LBB9_2
+; CHECK-NEXT:  .LBB9_1: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vle32.v v16, (a2)
+; CHECK-NEXT:    slli a4, a3, 2
+; CHECK-NEXT:    add a1, a1, a4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, tu, ma
+; CHECK-NEXT:    vfmacc.vf v16, fa0, v8
+; CHECK-NEXT:    vse32.v v16, (a2)
+; CHECK-NEXT:    sub a0, a0, a3
+; CHECK-NEXT:    vsetvli a3, a0, e16, m4, ta, ma
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    bnez a3, .LBB9_1
+; CHECK-NEXT:  .LBB9_2: # %for.end
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call i64 @llvm.riscv.vsetvli.i64(i64 %n, i64 2, i64 3)
+  %cmp.not13 = icmp eq i64 %0, 0
+  br i1 %cmp.not13, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %1 = phi i64 [ %7, %for.body ], [ %0, %entry ]
+  %n.addr.016 = phi i64 [ %sub, %for.body ], [ %n, %entry ]
+  %x.addr.015 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
+  %y.addr.014 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
+  %2 = bitcast ptr %x.addr.015 to ptr
+  %3 = tail call <vscale x 16 x float> @llvm.riscv.vle.nxv16f32.i64(<vscale x 16 x float> undef, ptr %2, i64 %1)
+  %add.ptr = getelementptr inbounds float, ptr %x.addr.015, i64 %1
+  %4 = bitcast ptr %y.addr.014 to ptr
+  %5 = tail call <vscale x 16 x float> @llvm.riscv.vle.nxv16f32.i64(<vscale x 16 x float> undef, ptr %4, i64 %1)
+  %6 = tail call <vscale x 16 x float> @llvm.riscv.vfmacc.nxv16f32.f32.i64(<vscale x 16 x float> %5, float %a, <vscale x 16 x float> %3, i64 7, i64 %1, i64 0)
+  tail call void @llvm.riscv.vse.nxv16f32.i64(<vscale x 16 x float> %6, ptr %4, i64 %1)
+  %add.ptr1 = getelementptr inbounds float, ptr %y.addr.014, i64 %1
+  %sub = sub i64 %n.addr.016, %1
+  %7 = tail call i64 @llvm.riscv.vsetvli.i64(i64 %sub, i64 1, i64 2)
+  %cmp.not = icmp eq i64 %7, 0
+  br i1 %cmp.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
 declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg)
 declare <vscale x 16 x float> @llvm.riscv.vle.nxv16f32.i64(<vscale x 16 x float>, ptr nocapture, i64)
 declare <vscale x 16 x float> @llvm.riscv.vfmacc.nxv16f32.f32.i64(<vscale x 16 x float>, float, <vscale x 16 x float>, i64, i64, i64)
@@ -501,12 +549,12 @@ define <vscale x 2 x i32> @test_vsetvli_x0_x0(ptr %x, ptr %y, <vscale x 2 x i32>
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    andi a3, a3, 1
-; CHECK-NEXT:    beqz a3, .LBB9_2
+; CHECK-NEXT:    beqz a3, .LBB10_2
 ; CHECK-NEXT:  # %bb.1: # %if
 ; CHECK-NEXT:    vle16.v v10, (a1)
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vwcvt.x.x.v v8, v10
-; CHECK-NEXT:  .LBB9_2: # %if.end
+; CHECK-NEXT:  .LBB10_2: # %if.end
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -540,19 +588,19 @@ define <vscale x 2 x i32> @test_vsetvli_x0_x0_2(ptr %x, ptr %y, ptr %z, i64 %vl,
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    andi a4, a4, 1
-; CHECK-NEXT:    beqz a4, .LBB10_2
+; CHECK-NEXT:    beqz a4, .LBB11_2
 ; CHECK-NEXT:  # %bb.1: # %if
 ; CHECK-NEXT:    vle16.v v10, (a1)
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vwadd.wv v9, v9, v10
-; CHECK-NEXT:  .LBB10_2: # %if.end
+; CHECK-NEXT:  .LBB11_2: # %if.end
 ; CHECK-NEXT:    andi a5, a5, 1
-; CHECK-NEXT:    beqz a5, .LBB10_4
+; CHECK-NEXT:    beqz a5, .LBB11_4
 ; CHECK-NEXT:  # %bb.3: # %if2
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v10, (a2)
 ; CHECK-NEXT:    vwadd.wv v9, v9, v10
-; CHECK-NEXT:  .LBB10_4: # %if2.end
+; CHECK-NEXT:  .LBB11_4: # %if2.end
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -586,11 +634,11 @@ define void @vlmax(i64 %N, ptr %c, ptr %a, ptr %b) {
 ; CHECK-LABEL: vlmax:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli a6, zero, e64, m1, ta, ma
-; CHECK-NEXT:    blez a0, .LBB11_3
+; CHECK-NEXT:    blez a0, .LBB12_3
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    li a5, 0
 ; CHECK-NEXT:    slli a4, a6, 3
-; CHECK-NEXT:  .LBB11_2: # %for.body
+; CHECK-NEXT:  .LBB12_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle64.v v8, (a2)
 ; CHECK-NEXT:    vle64.v v9, (a3)
@@ -600,8 +648,8 @@ define void @vlmax(i64 %N, ptr %c, ptr %a, ptr %b) {
 ; CHECK-NEXT:    add a1, a1, a4
 ; CHECK-NEXT:    add a3, a3, a4
 ; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    blt a5, a0, .LBB11_2
-; CHECK-NEXT:  .LBB11_3: # %for.end
+; CHECK-NEXT:    blt a5, a0, .LBB12_2
+; CHECK-NEXT:  .LBB12_3: # %for.end
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call i64 @llvm.riscv.vsetvlimax.i64(i64 3, i64 0)
@@ -633,18 +681,18 @@ define void @vector_init_vlmax(i64 %N, ptr %c) {
 ; CHECK-LABEL: vector_init_vlmax:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    blez a0, .LBB12_3
+; CHECK-NEXT:    blez a0, .LBB13_3
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    li a3, 0
 ; CHECK-NEXT:    slli a4, a2, 3
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:  .LBB12_2: # %for.body
+; CHECK-NEXT:  .LBB13_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    add a3, a3, a2
 ; CHECK-NEXT:    add a1, a1, a4
-; CHECK-NEXT:    blt a3, a0, .LBB12_2
-; CHECK-NEXT:  .LBB12_3: # %for.end
+; CHECK-NEXT:    blt a3, a0, .LBB13_2
+; CHECK-NEXT:  .LBB13_3: # %for.end
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call i64 @llvm.riscv.vsetvlimax.i64(i64 3, i64 0)
@@ -669,20 +717,20 @@ define void @vector_init_vsetvli_N(i64 %N, ptr %c) {
 ; CHECK-LABEL: vector_init_vsetvli_N:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli a2, a0, e64, m1, ta, ma
-; CHECK-NEXT:    blez a0, .LBB13_3
+; CHECK-NEXT:    blez a0, .LBB14_3
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    li a3, 0
 ; CHECK-NEXT:    slli a4, a2, 3
 ; CHECK-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:  .LBB13_2: # %for.body
+; CHECK-NEXT:  .LBB14_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    add a3, a3, a2
 ; CHECK-NEXT:    add a1, a1, a4
-; CHECK-NEXT:    blt a3, a0, .LBB13_2
-; CHECK-NEXT:  .LBB13_3: # %for.end
+; CHECK-NEXT:    blt a3, a0, .LBB14_2
+; CHECK-NEXT:  .LBB14_3: # %for.end
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call i64 @llvm.riscv.vsetvli(i64 %N, i64 3, i64 0)
@@ -711,13 +759,13 @@ define void @vector_init_vsetvli_fv(i64 %N, ptr %c) {
 ; CHECK-NEXT:    slli a4, a3, 3
 ; CHECK-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:  .LBB14_1: # %for.body
+; CHECK-NEXT:  .LBB15_1: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    add a2, a2, a3
 ; CHECK-NEXT:    add a1, a1, a4
-; CHECK-NEXT:    blt a2, a0, .LBB14_1
+; CHECK-NEXT:    blt a2, a0, .LBB15_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
 ; CHECK-NEXT:    ret
 entry:
@@ -745,13 +793,13 @@ define void @vector_init_vsetvli_fv2(i64 %N, ptr %c) {
 ; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:  .LBB15_1: # %for.body
+; CHECK-NEXT:  .LBB16_1: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a1, a1, 32
-; CHECK-NEXT:    blt a2, a0, .LBB15_1
+; CHECK-NEXT:    blt a2, a0, .LBB16_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
 ; CHECK-NEXT:    ret
 entry:
@@ -779,13 +827,13 @@ define void @vector_init_vsetvli_fv3(i64 %N, ptr %c) {
 ; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:  .LBB16_1: # %for.body
+; CHECK-NEXT:  .LBB17_1: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a1)
 ; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a1, a1, 32
-; CHECK-NEXT:    blt a2, a0, .LBB16_1
+; CHECK-NEXT:    blt a2, a0, .LBB17_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
 ; CHECK-NEXT:    ret
 entry:
@@ -861,10 +909,10 @@ define <vscale x 1 x double> @compat_store_consistency(i1 %cond, <vscale x 1 x d
 ; CHECK-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
 ; CHECK-NEXT:    vs1r.v v8, (a1)
-; CHECK-NEXT:    beqz a0, .LBB19_2
+; CHECK-NEXT:    beqz a0, .LBB20_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
 ; CHECK-NEXT:    vse32.v v10, (a2)
-; CHECK-NEXT:  .LBB19_2: # %if.end
+; CHECK-NEXT:  .LBB20_2: # %if.end
 ; CHECK-NEXT:    ret
 entry:
   %res = fadd <vscale x 1 x double> %a, %b
@@ -886,16 +934,16 @@ define <vscale x 2 x i32> @test_ratio_only_vmv_s_x(ptr %x, ptr %y, i1 %cond) nou
 ; CHECK-LABEL: test_ratio_only_vmv_s_x:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andi a2, a2, 1
-; CHECK-NEXT:    beqz a2, .LBB20_2
+; CHECK-NEXT:    beqz a2, .LBB21_2
 ; CHECK-NEXT:  # %bb.1: # %if
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a1)
 ; CHECK-NEXT:    vwcvt.x.x.v v8, v9
-; CHECK-NEXT:    j .LBB20_3
-; CHECK-NEXT:  .LBB20_2:
+; CHECK-NEXT:    j .LBB21_3
+; CHECK-NEXT:  .LBB21_2:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:  .LBB20_3: # %if.end
+; CHECK-NEXT:  .LBB21_3: # %if.end
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, zero
 ; CHECK-NEXT:    ret
@@ -918,16 +966,16 @@ define <vscale x 2 x i32> @test_ratio_only_vmv_s_x2(ptr %x, ptr %y, i1 %cond) no
 ; CHECK-LABEL: test_ratio_only_vmv_s_x2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andi a2, a2, 1
-; CHECK-NEXT:    beqz a2, .LBB21_2
+; CHECK-NEXT:    beqz a2, .LBB22_2
 ; CHECK-NEXT:  # %bb.1: # %if
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    j .LBB21_3
-; CHECK-NEXT:  .LBB21_2:
+; CHECK-NEXT:    j .LBB22_3
+; CHECK-NEXT:  .LBB22_2:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a1)
 ; CHECK-NEXT:    vwcvt.x.x.v v8, v9
-; CHECK-NEXT:  .LBB21_3: # %if.end
+; CHECK-NEXT:  .LBB22_3: # %if.end
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, zero
 ; CHECK-NEXT:    ret
@@ -953,13 +1001,13 @@ define void @pre_over_vle(ptr %A) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi a1, a0, 800
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:  .LBB22_1: # %vector.body
+; CHECK-NEXT:  .LBB23_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vsext.vf4 v9, v8
 ; CHECK-NEXT:    vse32.v v9, (a0)
 ; CHECK-NEXT:    addi a0, a0, 8
-; CHECK-NEXT:    bne a0, a1, .LBB22_1
+; CHECK-NEXT:    bne a0, a1, .LBB23_1
 ; CHECK-NEXT:  # %bb.2: # %exit
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
index fdcce72a01eb..c66eb5717048 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
@@ -130,6 +130,10 @@
     ret void
   }
 
+  define void @pre_undemanded_vl() {
+    ret void
+  }
+
   declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 
   declare <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64.i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, i64) #1
@@ -185,9 +189,8 @@ body:             |
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vr = COPY $v8
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x11
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   %pt:vr = IMPLICIT_DEF
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLI [[COPY]], 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-  ; CHECK-NEXT:   [[PseudoVLE64_V_M1_:%[0-9]+]]:vr = PseudoVLE64_V_M1 %pt, [[COPY2]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVLE64_V_M1_:%[0-9]+]]:vr = PseudoVLE64_V_M1 undef $noreg, [[COPY2]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gpr = COPY $x0
   ; CHECK-NEXT:   BEQ [[COPY3]], [[COPY4]], %bb.2
   ; CHECK-NEXT:   PseudoBR %bb.1
@@ -195,15 +198,13 @@ body:             |
   ; CHECK-NEXT: bb.1.if.then:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   %pt2:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 %pt2, [[PseudoVLE64_V_M1_]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 undef $noreg, [[PseudoVLE64_V_M1_]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   PseudoBR %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.if.else:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   %pt3:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[PseudoVSUB_VV_M1_:%[0-9]+]]:vr = PseudoVSUB_VV_M1 %pt3, [[PseudoVLE64_V_M1_]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVSUB_VV_M1_:%[0-9]+]]:vr = PseudoVSUB_VV_M1 undef $noreg, [[PseudoVLE64_V_M1_]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3.if.end:
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vr = PHI [[PseudoVADD_VV_M1_]], %bb.1, [[PseudoVSUB_VV_M1_]], %bb.2
@@ -217,20 +218,17 @@ body:             |
     %6:vr = COPY $v8
     %5:gpr = COPY $x11
     %4:gpr = COPY $x10
-    %pt:vr = IMPLICIT_DEF
-    %0:vr = PseudoVLE64_V_M1 %pt, %5, %7, 6, 0
+    %0:vr = PseudoVLE64_V_M1 undef $noreg, %5, %7, 6, 0
     %8:gpr = COPY $x0
     BEQ %4, %8, %bb.2
     PseudoBR %bb.1
 
   bb.1.if.then:
-    %pt2:vr = IMPLICIT_DEF
-    %1:vr = PseudoVADD_VV_M1 %pt2, %0, %6, %7, 6, 0
+    %1:vr = PseudoVADD_VV_M1 undef $noreg, %0, %6, %7, 6, 0
     PseudoBR %bb.3
 
   bb.2.if.else:
-    %pt3:vr = IMPLICIT_DEF
-    %2:vr = PseudoVSUB_VV_M1 %pt3, %0, %6, %7, 6, 0
+    %2:vr = PseudoVSUB_VV_M1 undef $noreg, %0, %6, %7, 6, 0
 
   bb.3.if.end:
     %3:vr = PHI %1, %bb.1, %2, %bb.2
@@ -270,9 +268,8 @@ body:             |
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x12
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x11
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   %pt:vr = IMPLICIT_DEF
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLI [[COPY]], 215 /* e32, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype
-  ; CHECK-NEXT:   [[PseudoVLE32_V_MF2_:%[0-9]+]]:vr = PseudoVLE32_V_MF2 %pt, [[COPY2]], $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVLE32_V_MF2_:%[0-9]+]]:vr = PseudoVLE32_V_MF2 undef $noreg, [[COPY2]], $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gpr = COPY $x0
   ; CHECK-NEXT:   BEQ [[COPY3]], [[COPY4]], %bb.2
   ; CHECK-NEXT:   PseudoBR %bb.1
@@ -280,17 +277,15 @@ body:             |
   ; CHECK-NEXT: bb.1.if.then:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   %dead1:vr = IMPLICIT_DEF
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
-  ; CHECK-NEXT:   early-clobber %1:vr = PseudoVZEXT_VF2_M1 %dead1, [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   early-clobber %1:vr = PseudoVZEXT_VF2_M1 undef $noreg, [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   PseudoBR %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.if.else:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   %dead2:vr = IMPLICIT_DEF
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
-  ; CHECK-NEXT:   early-clobber %2:vr = PseudoVSEXT_VF2_M1 %dead2, [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   early-clobber %2:vr = PseudoVSEXT_VF2_M1 undef $noreg, [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3.if.end:
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vr = PHI %1, %bb.1, %2, %bb.2
@@ -304,20 +299,17 @@ body:             |
     %6:gpr = COPY $x12
     %5:gpr = COPY $x11
     %4:gpr = COPY $x10
-    %pt:vr = IMPLICIT_DEF
-    %0:vr = PseudoVLE32_V_MF2 %pt, %5, %7, 5, 0
+    %0:vr = PseudoVLE32_V_MF2 undef $noreg, %5, %7, 5, 0
     %8:gpr = COPY $x0
     BEQ %4, %8, %bb.2
     PseudoBR %bb.1
 
   bb.1.if.then:
-    %dead1:vr = IMPLICIT_DEF
-    early-clobber %1:vr = PseudoVZEXT_VF2_M1 %dead1, %0, %7, 6, 0
+    early-clobber %1:vr = PseudoVZEXT_VF2_M1 undef $noreg, %0, %7, 6, 0
     PseudoBR %bb.3
 
   bb.2.if.else:
-    %dead2:vr = IMPLICIT_DEF
-    early-clobber %2:vr = PseudoVSEXT_VF2_M1 %dead2, %0, %7, 6, 0
+    early-clobber %2:vr = PseudoVSEXT_VF2_M1 undef $noreg, %0, %7, 6, 0
 
   bb.3.if.end:
     %3:vr = PHI %1, %bb.1, %2, %bb.2
@@ -364,17 +356,15 @@ body:             |
   ; CHECK-NEXT: bb.1.if.then:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   %pt:vr = IMPLICIT_DEF
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLI [[COPY]], 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-  ; CHECK-NEXT:   [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 %pt, [[COPY2]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 undef $noreg, [[COPY2]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   PseudoBR %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.if.else:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   %pt2:vr = IMPLICIT_DEF
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLI [[COPY]], 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-  ; CHECK-NEXT:   [[PseudoVSUB_VV_M1_:%[0-9]+]]:vr = PseudoVSUB_VV_M1 %pt2, [[COPY1]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVSUB_VV_M1_:%[0-9]+]]:vr = PseudoVSUB_VV_M1 undef $noreg, [[COPY1]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3.if.end:
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vr = PHI [[PseudoVADD_VV_M1_]], %bb.1, [[PseudoVSUB_VV_M1_]], %bb.2
@@ -394,13 +384,11 @@ body:             |
     PseudoBR %bb.1
 
   bb.1.if.then:
-    %pt:vr = IMPLICIT_DEF
-    %0:vr = PseudoVADD_VV_M1 %pt, %4, %5, %6, 6, 0
+    %0:vr = PseudoVADD_VV_M1 undef $noreg, %4, %5, %6, 6, 0
     PseudoBR %bb.3
 
   bb.2.if.else:
-    %pt2:vr = IMPLICIT_DEF
-    %1:vr = PseudoVSUB_VV_M1 %pt2, %5, %5, %6, 6, 0
+    %1:vr = PseudoVSUB_VV_M1 undef $noreg, %5, %5, %6, 6, 0
 
   bb.3.if.end:
     %2:vr = PHI %0, %bb.1, %1, %bb.2
@@ -449,15 +437,13 @@ body:             |
   ; CHECK-NEXT: bb.1.if.then:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   %pt:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 %pt, [[COPY2]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 undef $noreg, [[COPY2]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   PseudoBR %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.if.else:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   %pt2:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[PseudoVSUB_VV_M1_:%[0-9]+]]:vr = PseudoVSUB_VV_M1 %pt2, [[COPY2]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVSUB_VV_M1_:%[0-9]+]]:vr = PseudoVSUB_VV_M1 undef $noreg, [[COPY2]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3.if.end:
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vr = PHI [[PseudoVADD_VV_M1_]], %bb.1, [[PseudoVSUB_VV_M1_]], %bb.2
@@ -477,13 +463,11 @@ body:             |
     PseudoBR %bb.1
 
   bb.1.if.then:
-    %pt:vr = IMPLICIT_DEF
-    %1:vr = PseudoVADD_VV_M1 %pt, %5, %6, %0, 6, 0
+    %1:vr = PseudoVADD_VV_M1 undef $noreg, %5, %6, %0, 6, 0
     PseudoBR %bb.3
 
   bb.2.if.else:
-    %pt2:vr = IMPLICIT_DEF
-    %2:vr = PseudoVSUB_VV_M1 %pt2, %5, %6, %0, 6, 0
+    %2:vr = PseudoVSUB_VV_M1 undef $noreg, %5, %6, %0, 6, 0
 
   bb.3.if.end:
     %3:vr = PHI %1, %bb.1, %2, %bb.2
@@ -515,13 +499,11 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:gpr = IMPLICIT_DEF
-  ; CHECK-NEXT:   %pt:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 223 /* e64, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype
-  ; CHECK-NEXT:   [[PseudoVID_V_MF2_:%[0-9]+]]:vr = PseudoVID_V_MF2 %pt, -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
-  ; CHECK-NEXT:   %pt2:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_1:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 215 /* e32, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype
-  ; CHECK-NEXT:   [[PseudoVMV_V_I_MF2_:%[0-9]+]]:vrnov0 = PseudoVMV_V_I_MF2 %pt2, 0, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY undef $noreg
+  ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 223 /* e64, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype
+  ; CHECK-NEXT:   [[PseudoVID_V_MF2_:%[0-9]+]]:vr = PseudoVID_V_MF2 undef $noreg, -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_1:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 215 /* e32, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype
+  ; CHECK-NEXT:   [[PseudoVMV_V_I_MF2_:%[0-9]+]]:vrnov0 = PseudoVMV_V_I_MF2 undef $noreg, 0, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -532,8 +514,8 @@ body:             |
   ; CHECK-NEXT:   [[PseudoVLE32_V_MF2_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_MF2_MASK [[PseudoVMV_V_I_MF2_]], killed [[COPY]], $v0, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
   ; CHECK-NEXT:   [[PseudoVCPOP_M_B1_:%[0-9]+]]:gpr = PseudoVCPOP_M_B1 [[PseudoVMSEQ_VI_MF2_]], -1, 0 /* e8 */, implicit $vl, implicit $vtype
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x0
-  ; CHECK-NEXT:   BEQ killed [[PseudoVCPOP_M_B1_]], [[COPY2]], %bb.3
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr = COPY $x0
+  ; CHECK-NEXT:   BEQ killed [[PseudoVCPOP_M_B1_]], [[COPY3]], %bb.3
   ; CHECK-NEXT:   PseudoBR %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
@@ -542,10 +524,9 @@ body:             |
   ; CHECK-NEXT:   [[LWU:%[0-9]+]]:gpr = LWU [[COPY1]], 0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr = PHI [[DEF]], %bb.1, [[LWU]], %bb.2
-  ; CHECK-NEXT:   %pt3:vr = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr = PHI [[COPY2]], %bb.1, [[LWU]], %bb.2
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 215 /* e32, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
-  ; CHECK-NEXT:   [[PseudoVADD_VX_MF2_:%[0-9]+]]:vr = nsw PseudoVADD_VX_MF2 %pt3, [[PseudoVLE32_V_MF2_MASK]], [[PHI]], -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVADD_VX_MF2_:%[0-9]+]]:vr = nsw PseudoVADD_VX_MF2 undef $noreg, [[PseudoVLE32_V_MF2_MASK]], [[PHI]], -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   $v0 = COPY [[PseudoVADD_VX_MF2_]]
   ; CHECK-NEXT:   PseudoRET implicit $v0
   bb.0:
@@ -554,11 +535,9 @@ body:             |
 
     %0:gpr = COPY $x11
     %1:gpr = COPY $x10
-    %2:gpr = IMPLICIT_DEF
-    %pt:vr = IMPLICIT_DEF
-    %3:vr = PseudoVID_V_MF2 %pt, -1, 6, 0
-    %pt2:vr = IMPLICIT_DEF
-    %4:vrnov0 = PseudoVMV_V_I_MF2 %pt2, 0, -1, 5, 0
+    %2:gpr = COPY undef $noreg
+    %3:vr = PseudoVID_V_MF2 undef $noreg, -1, 6, 0
+    %4:vrnov0 = PseudoVMV_V_I_MF2 undef $noreg, 0, -1, 5, 0
 
   bb.1:
     successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -578,8 +557,7 @@ body:             |
 
   bb.3:
     %10:gpr = PHI %2, %bb.1, %9, %bb.2
-    %pt3:vr = IMPLICIT_DEF
-    %11:vr = nsw PseudoVADD_VX_MF2 %pt3, %6, %10, -1, 5, 0
+    %11:vr = nsw PseudoVADD_VX_MF2 undef $noreg, %6, %10, -1, 5, 0
     $v0 = COPY %11
     PseudoRET implicit $v0
 ...
@@ -608,17 +586,15 @@ body:             |
   ; CHECK-NEXT:   [[PseudoReadVLENB:%[0-9]+]]:gpr = PseudoReadVLENB
   ; CHECK-NEXT:   [[SRLI:%[0-9]+]]:gpr = SRLI [[PseudoReadVLENB]], 3
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x11
-  ; CHECK-NEXT:   %pt:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-  ; CHECK-NEXT:   [[PseudoVID_V_M1_:%[0-9]+]]:vr = PseudoVID_V_M1 %pt, -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+  ; CHECK-NEXT:   [[PseudoVID_V_M1_:%[0-9]+]]:vr = PseudoVID_V_M1 undef $noreg, -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr = PHI [[COPY2]], %bb.0, %10, %bb.1
-  ; CHECK-NEXT:   %pt2:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[PseudoVADD_VX_M1_:%[0-9]+]]:vr = PseudoVADD_VX_M1 %pt2, [[PseudoVID_V_M1_]], [[PHI]], -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVADD_VX_M1_:%[0-9]+]]:vr = PseudoVADD_VX_M1 undef $noreg, [[PseudoVID_V_M1_]], [[PHI]], -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[MUL:%[0-9]+]]:gpr = MUL [[PHI]], [[SRLI]]
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:gpr = ADD [[COPY]], [[MUL]]
   ; CHECK-NEXT:   PseudoVSE32_V_MF2 killed [[PseudoVADD_VX_M1_]], killed [[ADD]], -1, 5 /* e32 */, implicit $vl, implicit $vtype
@@ -634,16 +610,14 @@ body:             |
     %1:gpr = PseudoReadVLENB
     %2:gpr = SRLI %1:gpr, 3
     %3:gpr = COPY $x11
-    %pt:vr = IMPLICIT_DEF
-    %4:vr = PseudoVID_V_M1 %pt, -1, 6, 0
+    %4:vr = PseudoVID_V_M1 undef $noreg, -1, 6, 0
     %5:gpr = COPY $x0
 
   bb.1:
     successors: %bb.1, %bb.2
 
     %6:gpr = PHI %5:gpr, %bb.0, %10:gpr, %bb.1
-    %pt2:vr = IMPLICIT_DEF
-    %7:vr = PseudoVADD_VX_M1 %pt2, %4:vr, %6:gpr, -1, 6, 0
+    %7:vr = PseudoVADD_VX_M1 undef $noreg, %4:vr, %6:gpr, -1, 6, 0
     %8:gpr = MUL %6:gpr, %2:gpr
     %9:gpr = ADD %0:gpr, %8:gpr
     PseudoVSE32_V_MF2 killed %7:vr, killed %9:gpr, -1, 5
@@ -680,17 +654,15 @@ body:             |
   ; CHECK-NEXT:   [[PseudoReadVLENB:%[0-9]+]]:gpr = PseudoReadVLENB
   ; CHECK-NEXT:   [[SRLI:%[0-9]+]]:gpr = SRLI [[PseudoReadVLENB]], 3
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x11
-  ; CHECK-NEXT:   %pt:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-  ; CHECK-NEXT:   [[PseudoVID_V_M1_:%[0-9]+]]:vr = PseudoVID_V_M1 %pt, -1, 6 /* e64 */, 3 /* ta, ma */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+  ; CHECK-NEXT:   [[PseudoVID_V_M1_:%[0-9]+]]:vr = PseudoVID_V_M1 undef $noreg, -1, 6 /* e64 */, 3 /* ta, ma */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr = PHI [[COPY2]], %bb.0, %10, %bb.2
-  ; CHECK-NEXT:   %pt2:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[PseudoVADD_VX_M1_:%[0-9]+]]:vr = PseudoVADD_VX_M1 %pt2, [[PseudoVID_V_M1_]], [[PHI]], -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVADD_VX_M1_:%[0-9]+]]:vr = PseudoVADD_VX_M1 undef $noreg, [[PseudoVID_V_M1_]], [[PHI]], -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[MUL:%[0-9]+]]:gpr = MUL [[PHI]], [[SRLI]]
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:gpr = ADD [[COPY]], [[MUL]]
   ; CHECK-NEXT:   PseudoVSE32_V_MF2 killed [[PseudoVADD_VX_M1_]], killed [[ADD]], -1, 5 /* e32 */, implicit $vl, implicit $vtype
@@ -710,16 +682,14 @@ body:             |
     %1:gpr = PseudoReadVLENB
     %2:gpr = SRLI %1:gpr, 3
     %3:gpr = COPY $x11
-    %pt:vr = IMPLICIT_DEF
-    %4:vr = PseudoVID_V_M1 %pt, -1, 6, 3
+    %4:vr = PseudoVID_V_M1 undef $noreg, -1, 6, 3
     %5:gpr = COPY $x0
 
   bb.1:
     successors: %bb.3
 
     %6:gpr = PHI %5:gpr, %bb.0, %10:gpr, %bb.3
-    %pt2:vr = IMPLICIT_DEF
-    %7:vr = PseudoVADD_VX_M1 %pt2, %4:vr, %6:gpr, -1, 6, 0
+    %7:vr = PseudoVADD_VX_M1 undef $noreg, %4:vr, %6:gpr, -1, 6, 0
     %8:gpr = MUL %6:gpr, %2:gpr
     %9:gpr = ADD %0:gpr, %8:gpr
     PseudoVSE32_V_MF2 killed %7:vr, killed %9:gpr, -1, 5
@@ -778,9 +748,8 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr = COPY $x12
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   %dead:vr = IMPLICIT_DEF
   ; CHECK-NEXT:   dead $x0 = PseudoVSETIVLI 4, 208 /* e32, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-  ; CHECK-NEXT:   [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 %dead, 0, 4, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 undef $noreg, 0, 4, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vr = COPY [[PseudoVMV_V_I_M1_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vr = COPY [[COPY2]]
   ; CHECK-NEXT:   [[LUI:%[0-9]+]]:gpr = LUI 1
@@ -792,10 +761,8 @@ body:             |
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr = PHI [[COPY1]], %bb.0, %5, %bb.1
   ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr = PHI [[ADDIW]], %bb.0, %4, %bb.1
   ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:vr = PHI [[COPY3]], %bb.0, %16, %bb.1
-  ; CHECK-NEXT:   %pt:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[PseudoVLE32_V_M1_:%[0-9]+]]:vr = PseudoVLE32_V_M1 %pt, [[PHI]], 4, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype :: (load (s128) from %ir.lsr.iv12, align 4)
-  ; CHECK-NEXT:   %pt2:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 %pt2, killed [[PseudoVLE32_V_M1_]], [[PHI2]], 4, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVLE32_V_M1_:%[0-9]+]]:vr = PseudoVLE32_V_M1 undef $noreg, [[PHI]], 4, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype :: (load (s128) from %ir.lsr.iv12, align 4)
+  ; CHECK-NEXT:   [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 undef $noreg, killed [[PseudoVLE32_V_M1_]], [[PHI2]], 4, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[ADDI:%[0-9]+]]:gpr = nsw ADDI [[PHI1]], -4
   ; CHECK-NEXT:   [[ADDI1:%[0-9]+]]:gpr = ADDI [[PHI]], 16
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gpr = COPY $x0
@@ -804,10 +771,8 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.middle.block:
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gpr = COPY $x0
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[PseudoVMV_S_X:%[0-9]+]]:vr = PseudoVMV_S_X [[DEF]], [[COPY5]], 1, 5 /* e32 */, implicit $vl, implicit $vtype
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[PseudoVREDSUM_VS_M1_E8_:%[0-9]+]]:vr = PseudoVREDSUM_VS_M1_E8 [[DEF1]], [[PseudoVADD_VV_M1_]], killed [[PseudoVMV_S_X]], 4, 5 /* e32 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVMV_S_X:%[0-9]+]]:vr = PseudoVMV_S_X undef $noreg, [[COPY5]], 1, 5 /* e32 */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVREDSUM_VS_M1_E8_:%[0-9]+]]:vr = PseudoVREDSUM_VS_M1_E8 undef $noreg, [[PseudoVADD_VV_M1_]], killed [[PseudoVMV_S_X]], 4, 5 /* e32 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   dead $x0 = PseudoVSETIVLI 1, 208 /* e32, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
   ; CHECK-NEXT:   PseudoVSE32_V_M1 killed [[PseudoVREDSUM_VS_M1_E8_]], [[COPY]], 1, 5 /* e32 */, implicit $vl, implicit $vtype :: (store (s32) into %ir.res)
   ; CHECK-NEXT:   PseudoRET
@@ -816,8 +781,7 @@ body:             |
 
     %8:gpr = COPY $x12
     %6:gpr = COPY $x10
-    %dead:vr = IMPLICIT_DEF
-    %11:vr = PseudoVMV_V_I_M1 %dead, 0, 4, 5, 0
+    %11:vr = PseudoVMV_V_I_M1 undef $noreg, 0, 4, 5, 0
     %12:vr = COPY %11
     %10:vr = COPY %12
     %13:gpr = LUI 1
@@ -829,10 +793,8 @@ body:             |
     %0:gpr = PHI %6, %bb.0, %5, %bb.1
     %1:gpr = PHI %9, %bb.0, %4, %bb.1
     %2:vr = PHI %10, %bb.0, %16, %bb.1
-    %pt:vr = IMPLICIT_DEF
-    %14:vr = PseudoVLE32_V_M1 %pt, %0, 4, 5, 0 :: (load (s128) from %ir.lsr.iv12, align 4)
-    %pt2:vr = IMPLICIT_DEF
-    %16:vr = PseudoVADD_VV_M1 %pt2, killed %14, %2, 4, 5, 0
+    %14:vr = PseudoVLE32_V_M1 undef $noreg, %0, 4, 5, 0 :: (load (s128) from %ir.lsr.iv12, align 4)
+    %16:vr = PseudoVADD_VV_M1 undef $noreg, killed %14, %2, 4, 5, 0
     %4:gpr = nsw ADDI %1, -4
     %5:gpr = ADDI %0, 16
     %18:gpr = COPY $x0
@@ -841,10 +803,8 @@ body:             |
 
   bb.2.middle.block:
     %19:gpr = COPY $x0
-    %21:vr = IMPLICIT_DEF
-    %20:vr = PseudoVMV_S_X %21, %19, 1, 5
-    %24:vr = IMPLICIT_DEF
-    %23:vr = PseudoVREDSUM_VS_M1_E8 %24, %16, killed %20, 4, 5, 1
+    %20:vr = PseudoVMV_S_X undef $noreg, %19, 1, 5
+    %23:vr = PseudoVREDSUM_VS_M1_E8 undef $noreg, %16, killed %20, 4, 5, 1
     PseudoVSE32_V_M1 killed %23, %8, 1, 5 :: (store (s32) into %ir.res)
     PseudoRET
 
@@ -866,7 +826,7 @@ body:             |
   ; CHECK-NEXT:   %t3:vr = COPY $v2
   ; CHECK-NEXT:   %t4:vr = COPY $v3
   ; CHECK-NEXT:   %t5:vrnov0 = COPY $v1
-  ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+  ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
   ; CHECK-NEXT:   %t6:vr = PseudoVMSEQ_VI_M1 %t1, 0, -1, 6 /* e64 */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   PseudoBR %bb.1
   ; CHECK-NEXT: {{  $}}
@@ -948,20 +908,18 @@ body:             |
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x15
   ; CHECK-NEXT:   %vlenb:gpr = PseudoReadVLENB
   ; CHECK-NEXT:   %inc:gpr = SRLI killed %vlenb, 3
-  ; CHECK-NEXT:   %pt:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-  ; CHECK-NEXT:   [[PseudoVID_V_M1_:%[0-9]+]]:vr = PseudoVID_V_M1 %pt, -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+  ; CHECK-NEXT:   [[PseudoVID_V_M1_:%[0-9]+]]:vr = PseudoVID_V_M1 undef $noreg, -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr = COPY $x0
   ; CHECK-NEXT:   PseudoBR %bb.1
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr = PHI [[COPY3]], %bb.0, %12, %bb.3
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr = PHI [[COPY3]], %bb.0, %11, %bb.3
   ; CHECK-NEXT:   [[ADD:%[0-9]+]]:gpr = ADD [[COPY2]], [[PHI]]
-  ; CHECK-NEXT:   %pta:vr = IMPLICIT_DEF
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
-  ; CHECK-NEXT:   [[PseudoVADD_VX_M1_:%[0-9]+]]:vr = PseudoVADD_VX_M1 %pta, [[PseudoVID_V_M1_]], killed [[ADD]], -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVADD_VX_M1_:%[0-9]+]]:vr = PseudoVADD_VX_M1 undef $noreg, [[PseudoVID_V_M1_]], killed [[ADD]], -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[PseudoVMSLTU_VX_M1_:%[0-9]+]]:vr = PseudoVMSLTU_VX_M1 [[PseudoVADD_VX_M1_]], [[COPY1]], -1, 6 /* e64 */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[PseudoVCPOP_M_B1_:%[0-9]+]]:gpr = PseudoVCPOP_M_B1 [[PseudoVMSLTU_VX_M1_]], -1, 0 /* e8 */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gpr = COPY $x0
@@ -972,11 +930,9 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[ADD1:%[0-9]+]]:gpr = ADD %src, [[PHI]]
-  ; CHECK-NEXT:   %pt2:vrnov0 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[PseudoVLE8_V_MF8_:%[0-9]+]]:vrnov0 = PseudoVLE8_V_MF8 %pt2, killed [[ADD1]], -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
-  ; CHECK-NEXT:   %ptb:vr = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[PseudoVLE8_V_MF8_:%[0-9]+]]:vrnov0 = PseudoVLE8_V_MF8 undef $noreg, killed [[ADD1]], -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
-  ; CHECK-NEXT:   [[PseudoVADD_VI_MF8_:%[0-9]+]]:vrnov0 = PseudoVADD_VI_MF8 %ptb, [[PseudoVLE8_V_MF8_]], 4, -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   [[PseudoVADD_VI_MF8_:%[0-9]+]]:vrnov0 = PseudoVADD_VI_MF8 undef $noreg, [[PseudoVLE8_V_MF8_]], 4, -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   [[ADD2:%[0-9]+]]:gpr = ADD %dst, [[PHI]]
   ; CHECK-NEXT:   PseudoVSE8_V_MF8 killed [[PseudoVADD_VI_MF8_]], killed [[ADD2]], -1, 3 /* e8 */, implicit $vl, implicit $vtype
   ; CHECK-NEXT: {{  $}}
@@ -1001,8 +957,7 @@ body:             |
     %12:gpr = COPY $x15
     %vlenb:gpr = PseudoReadVLENB
     %inc:gpr = SRLI killed %vlenb, 3
-    %pt:vr = IMPLICIT_DEF
-    %10:vr = PseudoVID_V_M1 %pt, -1, 6, 0
+    %10:vr = PseudoVID_V_M1 undef $noreg, -1, 6, 0
     %59:gpr = COPY $x0
     PseudoBR %bb.1
 
@@ -1011,8 +966,7 @@ body:             |
 
     %26:gpr = PHI %59, %bb.0, %28, %bb.3
     %61:gpr = ADD %12, %26
-    %pta:vr = IMPLICIT_DEF
-    %27:vr = PseudoVADD_VX_M1 %pta, %10, killed %61, -1, 6, 0
+    %27:vr = PseudoVADD_VX_M1 undef $noreg, %10, killed %61, -1, 6, 0
     %62:vr = PseudoVMSLTU_VX_M1 %27, %11, -1, 6
     %63:gpr = PseudoVCPOP_M_B1 %62, -1, 0
     %64:gpr = COPY $x0
@@ -1023,10 +977,8 @@ body:             |
     successors: %bb.3(0x80000000)
 
     %66:gpr = ADD %src, %26
-    %pt2:vrnov0 = IMPLICIT_DEF
-    %67:vrnov0 = PseudoVLE8_V_MF8 %pt2, killed %66, -1, 3, 0
-    %ptb:vr = IMPLICIT_DEF
-    %76:vrnov0 = PseudoVADD_VI_MF8 %ptb, %67, 4, -1, 3, 0
+    %67:vrnov0 = PseudoVLE8_V_MF8 undef $noreg, killed %66, -1, 3, 0
+    %76:vrnov0 = PseudoVADD_VI_MF8 undef $noreg, %67, 4, -1, 3, 0
     %77:gpr = ADD %dst, %26
     PseudoVSE8_V_MF8 killed %76, killed %77, -1, 3
 
@@ -1041,3 +993,24 @@ body:             |
     PseudoRET
 
 ...
+---
+name: pre_undemanded_vl
+body: |
+  ; CHECK-LABEL: name: pre_undemanded_vl
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   PseudoBR %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $x0 = PseudoVSETIVLI 1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+  ; CHECK-NEXT:   %x:gpr = PseudoVMV_X_S undef $noreg, 6 /* e64 */, implicit $vtype
+  ; CHECK-NEXT:   PseudoBR %bb.1
+  bb.0:
+    PseudoBR %bb.1
+  bb.1:
+    %x:gpr = PseudoVMV_X_S undef $noreg, 6
+    PseudoBR %bb.1
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
index 39f517a100f5..41a68ef9903e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
@@ -121,16 +121,14 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v9
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr = COPY $v8
-    ; CHECK-NEXT: %pt:vr = IMPLICIT_DEF
     ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY]], 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 %pt, [[COPY2]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 undef $noreg, [[COPY2]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
     ; CHECK-NEXT: $v8 = COPY [[PseudoVADD_VV_M1_]]
     ; CHECK-NEXT: PseudoRET implicit $v8
     %2:gprnox0 = COPY $x10
     %1:vr = COPY $v9
     %0:vr = COPY $v8
-    %pt:vr = IMPLICIT_DEF
-    %3:vr = PseudoVADD_VV_M1 %pt, %0, %1, %2, 6, 0
+    %3:vr = PseudoVADD_VV_M1 undef $noreg, %0, %1, %2, 6, 0
     $v8 = COPY %3
     PseudoRET implicit $v8
 
@@ -162,20 +160,16 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v8
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x10
-    ; CHECK-NEXT: %pt:vr = IMPLICIT_DEF
     ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY]], 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: [[PseudoVLE64_V_M1_:%[0-9]+]]:vr = PseudoVLE64_V_M1 %pt, [[COPY2]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
-    ; CHECK-NEXT: %pt2:vr = IMPLICIT_DEF
-    ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 %pt2, [[PseudoVLE64_V_M1_]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: [[PseudoVLE64_V_M1_:%[0-9]+]]:vr = PseudoVLE64_V_M1 undef $noreg, [[COPY2]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 undef $noreg, [[PseudoVLE64_V_M1_]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
     ; CHECK-NEXT: $v8 = COPY [[PseudoVADD_VV_M1_]]
     ; CHECK-NEXT: PseudoRET implicit $v8
     %2:gprnox0 = COPY $x11
     %1:vr = COPY $v8
     %0:gpr = COPY $x10
-    %pt:vr = IMPLICIT_DEF
-    %3:vr = PseudoVLE64_V_M1 %pt, %0, %2, 6, 0
-    %pt2:vr = IMPLICIT_DEF
-    %4:vr = PseudoVADD_VV_M1 %pt2, killed %3, %1, %2, 6, 0
+    %3:vr = PseudoVLE64_V_M1 undef $noreg, %0, %2, 6, 0
+    %4:vr = PseudoVADD_VV_M1 undef $noreg, killed %3, %1, %2, 6, 0
     $v8 = COPY %4
     PseudoRET implicit $v8
 
@@ -204,19 +198,15 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x10
-    ; CHECK-NEXT: %pt:vr = IMPLICIT_DEF
     ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY]], 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: [[PseudoVLE32_V_MF2_:%[0-9]+]]:vr = PseudoVLE32_V_MF2 %pt, [[COPY1]], $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
-    ; CHECK-NEXT: %dead:vr = IMPLICIT_DEF
-    ; CHECK-NEXT: early-clobber %3:vr = PseudoVZEXT_VF2_M1 %dead, [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: [[PseudoVLE32_V_MF2_:%[0-9]+]]:vr = PseudoVLE32_V_MF2 undef $noreg, [[COPY1]], $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: early-clobber %3:vr = PseudoVZEXT_VF2_M1 undef $noreg, [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
     ; CHECK-NEXT: $v8 = COPY %3
     ; CHECK-NEXT: PseudoRET implicit $v8
     %1:gprnox0 = COPY $x11
     %0:gpr = COPY $x10
-    %pt:vr = IMPLICIT_DEF
-    %2:vr = PseudoVLE32_V_MF2 %pt, %0, %1, 5, 0
-    %dead:vr = IMPLICIT_DEF
-    early-clobber %3:vr = PseudoVZEXT_VF2_M1 %dead, killed %2, %1, 6, 0
+    %2:vr = PseudoVLE32_V_MF2 undef $noreg, %0, %1, 5, 0
+    early-clobber %3:vr = PseudoVZEXT_VF2_M1 undef $noreg, killed %2, %1, 6, 0
     $v8 = COPY %3
     PseudoRET implicit $v8
 
@@ -276,23 +266,17 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x10
-    ; CHECK-NEXT: %pt:vr = IMPLICIT_DEF
-    ; CHECK-NEXT: %pt2:vr = IMPLICIT_DEF
     ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 2, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: [[PseudoVLE64_V_M1_:%[0-9]+]]:vr = PseudoVLE64_V_M1 %pt, [[COPY1]], 2, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype :: (load (s128) from %ir.x)
-    ; CHECK-NEXT: [[PseudoVLE64_V_M1_1:%[0-9]+]]:vr = PseudoVLE64_V_M1 %pt2, [[COPY]], 2, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype :: (load (s128) from %ir.y)
-    ; CHECK-NEXT: %pt3:vr = IMPLICIT_DEF
-    ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 %pt3, [[PseudoVLE64_V_M1_]], [[PseudoVLE64_V_M1_1]], 2, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: [[PseudoVLE64_V_M1_:%[0-9]+]]:vr = PseudoVLE64_V_M1 undef $noreg, [[COPY1]], 2, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype :: (load (s128) from %ir.x)
+    ; CHECK-NEXT: [[PseudoVLE64_V_M1_1:%[0-9]+]]:vr = PseudoVLE64_V_M1 undef $noreg, [[COPY]], 2, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype :: (load (s128) from %ir.y)
+    ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 undef $noreg, [[PseudoVLE64_V_M1_]], [[PseudoVLE64_V_M1_1]], 2, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
     ; CHECK-NEXT: PseudoVSE64_V_M1 [[PseudoVADD_VV_M1_]], [[COPY1]], 2, 6 /* e64 */, implicit $vl, implicit $vtype :: (store (s128) into %ir.x)
     ; CHECK-NEXT: PseudoRET
     %1:gpr = COPY $x11
     %0:gpr = COPY $x10
-    %pt:vr = IMPLICIT_DEF
-    %pt2:vr = IMPLICIT_DEF
-    %2:vr = PseudoVLE64_V_M1 %pt, %0, 2, 6, 0 :: (load (s128) from %ir.x)
-    %3:vr = PseudoVLE64_V_M1 %pt2, %1, 2, 6, 0 :: (load (s128) from %ir.y)
-    %pt3:vr = IMPLICIT_DEF
-    %4:vr = PseudoVADD_VV_M1 %pt3, killed %2, killed %3, 2, 6, 0
+    %2:vr = PseudoVLE64_V_M1 undef $noreg, %0, 2, 6, 0 :: (load (s128) from %ir.x)
+    %3:vr = PseudoVLE64_V_M1 undef $noreg, %1, 2, 6, 0 :: (load (s128) from %ir.y)
+    %4:vr = PseudoVADD_VV_M1 undef $noreg, killed %2, killed %3, 2, 6, 0
     PseudoVSE64_V_M1 killed %4, %0, 2, 6 :: (store (s128) into %ir.x)
     PseudoRET
 
@@ -321,23 +305,19 @@ body:             |
     ; CHECK: liveins: $x10
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
-    ; CHECK-NEXT: %pt:vr = IMPLICIT_DEF
     ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 2, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: [[PseudoVLE64_V_M1_:%[0-9]+]]:vr = PseudoVLE64_V_M1 %pt, [[COPY]], 2, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype :: (load (s128) from %ir.x)
-    ; CHECK-NEXT: dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 $noreg, 0, -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; CHECK-NEXT: [[PseudoVLE64_V_M1_:%[0-9]+]]:vr = PseudoVLE64_V_M1 undef $noreg, [[COPY]], 2, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype :: (load (s128) from %ir.x)
+    ; CHECK-NEXT: dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 undef $noreg, 0, -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
     ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 2, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: [[PseudoVREDSUM_VS_M1_E8_:%[0-9]+]]:vr = PseudoVREDSUM_VS_M1_E8 [[DEF]], [[PseudoVLE64_V_M1_]], [[PseudoVMV_V_I_M1_]], 2, 6 /* e64 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: [[PseudoVREDSUM_VS_M1_E8_:%[0-9]+]]:vr = PseudoVREDSUM_VS_M1_E8 undef $noreg, [[PseudoVLE64_V_M1_]], [[PseudoVMV_V_I_M1_]], 2, 6 /* e64 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
     ; CHECK-NEXT: [[PseudoVMV_X_S:%[0-9]+]]:gpr = PseudoVMV_X_S [[PseudoVREDSUM_VS_M1_E8_]], 6 /* e64 */, implicit $vtype
     ; CHECK-NEXT: $x10 = COPY [[PseudoVMV_X_S]]
     ; CHECK-NEXT: PseudoRET implicit $x10
     %0:gpr = COPY $x10
-    %pt:vr = IMPLICIT_DEF
-    %1:vr = PseudoVLE64_V_M1 %pt, %0, 2, 6, 0 :: (load (s128) from %ir.x)
-    %2:vr = PseudoVMV_V_I_M1 $noreg, 0, -1, 6, 0
-    %4:vr = IMPLICIT_DEF
-    %3:vr = PseudoVREDSUM_VS_M1_E8 %4, killed %1, killed %2, 2, 6, 1
+    %1:vr = PseudoVLE64_V_M1 undef $noreg, %0, 2, 6, 0 :: (load (s128) from %ir.x)
+    %2:vr = PseudoVMV_V_I_M1 undef $noreg, 0, -1, 6, 0
+    %3:vr = PseudoVREDSUM_VS_M1_E8 undef $noreg, killed %1, killed %2, 2, 6, 1
     %5:gpr = PseudoVMV_X_S killed %3, 6
     $x10 = COPY %5
     PseudoRET implicit $x10
@@ -370,17 +350,15 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v9
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr = COPY $v8
-    ; CHECK-NEXT: [[PseudoVSETVLI:%[0-9]+]]:gprnox0 = PseudoVSETVLI [[COPY]], 88 /* e64, m1, ta, mu */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: %pt:vr = IMPLICIT_DEF
-    ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 %pt, [[COPY2]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead [[PseudoVSETVLI:%[0-9]+]]:gprnox0 = PseudoVSETVLI [[COPY]], 88 /* e64, m1, ta, mu */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 undef $noreg, [[COPY2]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
     ; CHECK-NEXT: $v8 = COPY [[PseudoVADD_VV_M1_]]
     ; CHECK-NEXT: PseudoRET implicit $v8
     %2:gprnox0 = COPY $x10
     %1:vr = COPY $v9
     %0:vr = COPY $v8
     %3:gprnox0 = PseudoVSETVLI %2, 88, implicit-def dead $vl, implicit-def dead $vtype
-    %pt:vr = IMPLICIT_DEF
-    %4:vr = PseudoVADD_VV_M1 %pt, %0, %1, killed %3, 6, 0
+    %4:vr = PseudoVADD_VV_M1 undef $noreg, %0, %1, killed %3, 6, 0
     $v8 = COPY %4
     PseudoRET implicit $v8
 
@@ -412,23 +390,19 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v8
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x10
-    ; CHECK-NEXT: %pt:vr = IMPLICIT_DEF
     ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY]], 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: [[PseudoVLE64_V_M1_:%[0-9]+]]:vr = PseudoVLE64_V_M1 %pt, [[COPY2]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: [[PseudoVLE64_V_M1_:%[0-9]+]]:vr = PseudoVLE64_V_M1 undef $noreg, [[COPY2]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
     ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
-    ; CHECK-NEXT: %pt2:vr = IMPLICIT_DEF
     ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY]], 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 %pt2, [[PseudoVLE64_V_M1_]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 undef $noreg, [[PseudoVLE64_V_M1_]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
     ; CHECK-NEXT: $v8 = COPY [[PseudoVADD_VV_M1_]]
     ; CHECK-NEXT: PseudoRET implicit $v8
     %2:gprnox0 = COPY $x11
     %1:vr = COPY $v8
     %0:gpr = COPY $x10
-    %pt:vr = IMPLICIT_DEF
-    %3:vr = PseudoVLE64_V_M1 %pt, %0, %2, 6, 0
+    %3:vr = PseudoVLE64_V_M1 undef $noreg, %0, %2, 6, 0
     INLINEASM &"", 1 /* sideeffect attdialect */
-    %pt2:vr = IMPLICIT_DEF
-    %4:vr = PseudoVADD_VV_M1 %pt2, killed %3, %1, %2, 6, 0
+    %4:vr = PseudoVADD_VV_M1 undef $noreg, killed %3, %1, %2, 6, 0
     $v8 = COPY %4
     PseudoRET implicit $v8
 
@@ -443,15 +417,13 @@ body:             |
     ; CHECK-LABEL: name: vmv_v_i_different_lmuls
     ; CHECK: liveins: $x10, $v8, $x11
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: %pt:vrm2 = IMPLICIT_DEF
     ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 4, 217 /* e64, m2, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: [[PseudoVID_V_M2_:%[0-9]+]]:vrm2 = PseudoVID_V_M2 %pt, 4, 6 /* e64 */, 3 /* ta, ma */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead [[PseudoVID_V_M2_:%[0-9]+]]:vrm2 = PseudoVID_V_M2 undef $noreg, 4, 6 /* e64 */, 3 /* ta, ma */, implicit $vl, implicit $vtype
     ; CHECK-NEXT: dead $x0 = PseudoVSETVLIX0 killed $x0, 198 /* e8, mf4, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
-    ; CHECK-NEXT: [[PseudoVMV_V_I_MF4_:%[0-9]+]]:vr = PseudoVMV_V_I_MF4 $noreg, 0, 4, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead [[PseudoVMV_V_I_MF4_:%[0-9]+]]:vr = PseudoVMV_V_I_MF4 undef $noreg, 0, 4, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
     ; CHECK-NEXT: PseudoRET
-    %pt:vrm2 = IMPLICIT_DEF
-    %0:vrm2 = PseudoVID_V_M2 %pt, 4, 6, 3
-    %4:vr = PseudoVMV_V_I_MF4 $noreg, 0, 4, 3, 0
+    %0:vrm2 = PseudoVID_V_M2 undef $noreg, 4, 6, 3
+    %4:vr = PseudoVMV_V_I_MF4 undef $noreg, 0, 4, 3, 0
     PseudoRET
 ...
 ---
@@ -467,14 +439,14 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   %cond:gpr = COPY $x10
   ; CHECK-NEXT:   dead $x0 = PseudoVSETIVLI 2, 215 /* e32, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype
-  ; CHECK-NEXT:   [[PseudoVMV_V_I_MF2_:%[0-9]+]]:vr = PseudoVMV_V_I_MF2 $noreg, 1, 2, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   dead [[PseudoVMV_V_I_MF2_:%[0-9]+]]:vr = PseudoVMV_V_I_MF2 undef $noreg, 1, 2, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   BEQ %cond, $x0, %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
-  ; CHECK-NEXT:   [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 $noreg, 1, 2, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   dead [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 undef $noreg, 1, 2, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -488,23 +460,23 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   $x0 = PseudoVSETIVLI 2, 215 /* e32, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype
-  ; CHECK-NEXT:   [[PseudoVMV_X_S:%[0-9]+]]:gpr = PseudoVMV_X_S $noreg, 5 /* e32 */, implicit $vtype
-  ; CHECK-NEXT:   [[PseudoVMV_V_I_MF2_1:%[0-9]+]]:vr = PseudoVMV_V_I_MF2 $noreg, 1, 2, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   dead [[PseudoVMV_X_S:%[0-9]+]]:gpr = PseudoVMV_X_S undef $noreg, 5 /* e32 */, implicit $vtype
+  ; CHECK-NEXT:   dead [[PseudoVMV_V_I_MF2_1:%[0-9]+]]:vr = PseudoVMV_V_I_MF2 undef $noreg, 1, 2, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   PseudoRET
   bb.0:
     liveins: $x10
     %cond:gpr = COPY $x10
-    %1:vr = PseudoVMV_V_I_MF2 $noreg, 1, 2, 5, 0
+    %1:vr = PseudoVMV_V_I_MF2 undef $noreg, 1, 2, 5, 0
     BEQ %cond, $x0, %bb.2
   bb.1:
-    %2:vr = PseudoVMV_V_I_M1 $noreg, 1, 2, 6, 0
+    %2:vr = PseudoVMV_V_I_M1 undef $noreg, 1, 2, 6, 0
   bb.2: ; the exit info here should have sew/lmul ratio only
     BEQ %cond, $x0, %bb.4
   bb.3:
     PseudoCALL $noreg, csr_ilp32_lp64
   bb.4: ; this block will have PRE attempted on it
-    %4:gpr = PseudoVMV_X_S $noreg, 5
-    %5:vr = PseudoVMV_V_I_MF2 $noreg, 1, 2, 5, 0
+    %4:gpr = PseudoVMV_X_S undef $noreg, 5
+    %5:vr = PseudoVMV_V_I_MF2 undef $noreg, 1, 2, 5, 0
     PseudoRET
 ...
 ---
@@ -517,15 +489,15 @@ body:             |
     ; CHECK: liveins: $x1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $vtype
+    ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:gpr = COPY $vtype
     ; CHECK-NEXT: $vl = COPY $x1
     ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
-    ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 undef $noreg, undef $noreg, undef $noreg, 3, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
     ; CHECK-NEXT: PseudoRET
     dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
     %1:gpr = COPY $vtype
     $vl = COPY $x1
     dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
-    %4:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6, 0
+    %4:vr = PseudoVADD_VV_M1 undef $noreg, undef $noreg, undef $noreg, 3, 6, 0
     PseudoRET
 ...
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
index 21ddf1a6e114..d70f619c3601 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
@@ -1484,7 +1484,6 @@ define <vscale x 8 x i64> @vwadd_vx_splat_zext(<vscale x 8 x i32> %va, i32 %b) {
 ;
 ; RV64-LABEL: vwadd_vx_splat_zext:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, -1
 ; RV64-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV64-NEXT:    vwaddu.vx v16, v8, a0
 ; RV64-NEXT:    vmv8r.v v8, v16
diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll
index e07e52091e9e..ffbbe31412ed 100644
--- a/llvm/test/CodeGen/RISCV/select.ll
+++ b/llvm/test/CodeGen/RISCV/select.ll
@@ -1858,3 +1858,113 @@ define i32 @select_cst6(i1 zeroext %cond) {
   %ret = select i1 %cond, i32 2049, i32 2047
   ret i32 %ret
 }
+
+@select_redundant_czero_eqz_data = global i32 0, align 4
+
+define void @select_redundant_czero_eqz1(ptr %0, ptr %1) {
+; RV32IM-LABEL: select_redundant_czero_eqz1:
+; RV32IM:       # %bb.0: # %entry
+; RV32IM-NEXT:    bnez a0, .LBB49_2
+; RV32IM-NEXT:  # %bb.1:
+; RV32IM-NEXT:    lui a0, %hi(select_redundant_czero_eqz_data)
+; RV32IM-NEXT:    addi a0, a0, %lo(select_redundant_czero_eqz_data)
+; RV32IM-NEXT:  .LBB49_2: # %entry
+; RV32IM-NEXT:    sw a0, 0(a1)
+; RV32IM-NEXT:    ret
+;
+; RV64IM-LABEL: select_redundant_czero_eqz1:
+; RV64IM:       # %bb.0: # %entry
+; RV64IM-NEXT:    bnez a0, .LBB49_2
+; RV64IM-NEXT:  # %bb.1:
+; RV64IM-NEXT:    lui a0, %hi(select_redundant_czero_eqz_data)
+; RV64IM-NEXT:    addi a0, a0, %lo(select_redundant_czero_eqz_data)
+; RV64IM-NEXT:  .LBB49_2: # %entry
+; RV64IM-NEXT:    sd a0, 0(a1)
+; RV64IM-NEXT:    ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_redundant_czero_eqz1:
+; RV64IMXVTCONDOPS:       # %bb.0: # %entry
+; RV64IMXVTCONDOPS-NEXT:    lui a2, %hi(select_redundant_czero_eqz_data)
+; RV64IMXVTCONDOPS-NEXT:    addi a2, a2, %lo(select_redundant_czero_eqz_data)
+; RV64IMXVTCONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV64IMXVTCONDOPS-NEXT:    or a0, a2, a0
+; RV64IMXVTCONDOPS-NEXT:    sd a0, 0(a1)
+; RV64IMXVTCONDOPS-NEXT:    ret
+;
+; RV32IMZICOND-LABEL: select_redundant_czero_eqz1:
+; RV32IMZICOND:       # %bb.0: # %entry
+; RV32IMZICOND-NEXT:    lui a2, %hi(select_redundant_czero_eqz_data)
+; RV32IMZICOND-NEXT:    addi a2, a2, %lo(select_redundant_czero_eqz_data)
+; RV32IMZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32IMZICOND-NEXT:    or a0, a2, a0
+; RV32IMZICOND-NEXT:    sw a0, 0(a1)
+; RV32IMZICOND-NEXT:    ret
+;
+; RV64IMZICOND-LABEL: select_redundant_czero_eqz1:
+; RV64IMZICOND:       # %bb.0: # %entry
+; RV64IMZICOND-NEXT:    lui a2, %hi(select_redundant_czero_eqz_data)
+; RV64IMZICOND-NEXT:    addi a2, a2, %lo(select_redundant_czero_eqz_data)
+; RV64IMZICOND-NEXT:    czero.nez a2, a2, a0
+; RV64IMZICOND-NEXT:    or a0, a2, a0
+; RV64IMZICOND-NEXT:    sd a0, 0(a1)
+; RV64IMZICOND-NEXT:    ret
+entry:
+  %3 = icmp eq ptr %0, null
+  %4 = select i1 %3, ptr @select_redundant_czero_eqz_data, ptr %0
+  store ptr %4, ptr %1, align 8
+  ret void
+}
+
+define void @select_redundant_czero_eqz2(ptr %0, ptr %1) {
+; RV32IM-LABEL: select_redundant_czero_eqz2:
+; RV32IM:       # %bb.0: # %entry
+; RV32IM-NEXT:    bnez a0, .LBB50_2
+; RV32IM-NEXT:  # %bb.1: # %entry
+; RV32IM-NEXT:    lui a0, %hi(select_redundant_czero_eqz_data)
+; RV32IM-NEXT:    addi a0, a0, %lo(select_redundant_czero_eqz_data)
+; RV32IM-NEXT:  .LBB50_2: # %entry
+; RV32IM-NEXT:    sw a0, 0(a1)
+; RV32IM-NEXT:    ret
+;
+; RV64IM-LABEL: select_redundant_czero_eqz2:
+; RV64IM:       # %bb.0: # %entry
+; RV64IM-NEXT:    bnez a0, .LBB50_2
+; RV64IM-NEXT:  # %bb.1: # %entry
+; RV64IM-NEXT:    lui a0, %hi(select_redundant_czero_eqz_data)
+; RV64IM-NEXT:    addi a0, a0, %lo(select_redundant_czero_eqz_data)
+; RV64IM-NEXT:  .LBB50_2: # %entry
+; RV64IM-NEXT:    sd a0, 0(a1)
+; RV64IM-NEXT:    ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_redundant_czero_eqz2:
+; RV64IMXVTCONDOPS:       # %bb.0: # %entry
+; RV64IMXVTCONDOPS-NEXT:    lui a2, %hi(select_redundant_czero_eqz_data)
+; RV64IMXVTCONDOPS-NEXT:    addi a2, a2, %lo(select_redundant_czero_eqz_data)
+; RV64IMXVTCONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV64IMXVTCONDOPS-NEXT:    or a0, a0, a2
+; RV64IMXVTCONDOPS-NEXT:    sd a0, 0(a1)
+; RV64IMXVTCONDOPS-NEXT:    ret
+;
+; RV32IMZICOND-LABEL: select_redundant_czero_eqz2:
+; RV32IMZICOND:       # %bb.0: # %entry
+; RV32IMZICOND-NEXT:    lui a2, %hi(select_redundant_czero_eqz_data)
+; RV32IMZICOND-NEXT:    addi a2, a2, %lo(select_redundant_czero_eqz_data)
+; RV32IMZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32IMZICOND-NEXT:    or a0, a0, a2
+; RV32IMZICOND-NEXT:    sw a0, 0(a1)
+; RV32IMZICOND-NEXT:    ret
+;
+; RV64IMZICOND-LABEL: select_redundant_czero_eqz2:
+; RV64IMZICOND:       # %bb.0: # %entry
+; RV64IMZICOND-NEXT:    lui a2, %hi(select_redundant_czero_eqz_data)
+; RV64IMZICOND-NEXT:    addi a2, a2, %lo(select_redundant_czero_eqz_data)
+; RV64IMZICOND-NEXT:    czero.nez a2, a2, a0
+; RV64IMZICOND-NEXT:    or a0, a0, a2
+; RV64IMZICOND-NEXT:    sd a0, 0(a1)
+; RV64IMZICOND-NEXT:    ret
+entry:
+  %3 = icmp ne ptr %0, null
+  %4 = select i1 %3, ptr %0, ptr @select_redundant_czero_eqz_data
+  store ptr %4, ptr %1, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/printf.ll b/llvm/test/CodeGen/SPIRV/printf.ll
new file mode 100644
index 000000000000..483fc1f244e5
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/printf.ll
@@ -0,0 +1,40 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: %[[#ExtImport:]] = OpExtInstImport "OpenCL.std"
+; CHECK: %[[#Char:]] = OpTypeInt 8 0
+; CHECK: %[[#CharPtr:]] = OpTypePointer UniformConstant %[[#Char]]
+; CHECK: %[[#GV:]] = OpVariable %[[#]] UniformConstant %[[#]]
+; CHECK: OpFunction
+; CHECK: %[[#Arg1:]] = OpFunctionParameter
+; CHECK: %[[#Arg2:]] = OpFunctionParameter
+; CHECK: %[[#CastedGV:]] = OpBitcast %[[#CharPtr]] %[[#GV]]
+; CHECK-NEXT: OpExtInst %[[#]] %[[#ExtImport]] printf %[[#CastedGV]] %[[#ArgConst:]]
+; CHECK-NEXT: OpExtInst %[[#]] %[[#ExtImport]] printf %[[#CastedGV]] %[[#ArgConst]]
+; CHECK-NEXT: OpExtInst %[[#]] %[[#ExtImport]] printf %[[#Arg1]] %[[#ArgConst:]]
+; CHECK-NEXT: OpExtInst %[[#]] %[[#ExtImport]] printf %[[#Arg1]] %[[#ArgConst]]
+; CHECK-NEXT: %[[#CastedArg2:]] = OpBitcast %[[#CharPtr]] %[[#Arg2]]
+; CHECK-NEXT: OpExtInst %[[#]] %[[#ExtImport]] printf %[[#CastedArg2]] %[[#ArgConst]]
+; CHECK-NEXT: OpExtInst %[[#]] %[[#ExtImport]] printf %[[#CastedArg2]] %[[#ArgConst]]
+; CHECK: OpFunctionEnd
+
+%struct = type { [6 x i8] }
+
+@FmtStr = internal addrspace(2) constant [6 x i8] c"c=%c\0A\00", align 1
+
+define spir_kernel void @foo(ptr addrspace(2) %_arg_fmt1, ptr addrspace(2) byval(%struct) %_arg_fmt2) {
+entry:
+  %r1 = tail call spir_func i32 (ptr addrspace(2), ...) @_Z6printfPU3AS2Kcz(ptr addrspace(2) @FmtStr, i8 signext 97)
+  %r2 = tail call spir_func i32 (ptr addrspace(2), ...) @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2) @FmtStr, i8 signext 97)
+  %r3 = tail call spir_func i32 (ptr addrspace(2), ...) @_Z6printfPU3AS2Kcz(ptr addrspace(2) %_arg_fmt1, i8 signext 97)
+  %r4 = tail call spir_func i32 (ptr addrspace(2), ...) @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2) %_arg_fmt1, i8 signext 97)
+  %r5 = tail call spir_func i32 (ptr addrspace(2), ...) @_Z6printfPU3AS2Kcz(ptr addrspace(2) %_arg_fmt2, i8 signext 97)
+  %r6 = tail call spir_func i32 (ptr addrspace(2), ...) @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2) %_arg_fmt2, i8 signext 97)
+  ret void
+}
+
+declare dso_local spir_func i32 @_Z6printfPU3AS2Kcz(ptr addrspace(2), ...)
+declare dso_local spir_func i32 @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2), ...)
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/spirv-event-null.ll b/llvm/test/CodeGen/SPIRV/transcoding/spirv-event-null.ll
new file mode 100644
index 000000000000..fe0d96f2773e
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/transcoding/spirv-event-null.ll
@@ -0,0 +1,33 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: %[[#TyEvent:]] = OpTypeEvent
+; CHECK-DAG: %[[#TyStruct:]] = OpTypeStruct %[[#TyEvent]]
+; CHECK-DAG: %[[#ConstEvent:]] = OpConstantNull %[[#TyEvent]]
+; CHECK-DAG: %[[#TyEventPtr:]] = OpTypePointer Function %[[#TyEvent]]
+; CHECK-DAG: %[[#TyStructPtr:]] = OpTypePointer Function %[[#TyStruct]]
+; CHECK: OpFunction
+; CHECK: OpFunctionParameter
+; CHECK: %[[#Src:]] = OpFunctionParameter
+; CHECK: OpVariable %[[#TyStructPtr]] Function
+; CHECK: %[[#EventVar:]] = OpVariable %[[#TyEventPtr]] Function
+; CHECK: %[[#Dest:]] = OpInBoundsPtrAccessChain
+; CHECK: %[[#CopyRes:]] = OpGroupAsyncCopy %[[#TyEvent]] %[[#]] %[[#Dest]] %[[#Src]] %[[#]] %[[#]] %[[#ConstEvent]]
+; CHECK: OpStore %[[#EventVar]] %[[#CopyRes]]
+
+%"class.sycl::_V1::device_event" = type { target("spirv.Event") }
+
+define spir_kernel void @foo(ptr addrspace(1) %_arg_out_ptr, ptr addrspace(3) noundef %_arg_local_acc) {
+entry:
+  %var = alloca %"class.sycl::_V1::device_event"
+  %dev_event.i.sroa.0 = alloca target("spirv.Event")
+  %add.ptr.i26 = getelementptr inbounds i32, ptr addrspace(1) %_arg_out_ptr, i64 0
+  %call3.i = tail call spir_func target("spirv.Event") @_Z22__spirv_GroupAsyncCopyjPU3AS1iPU3AS3Kimm9ocl_event(i32 2, ptr addrspace(1) %add.ptr.i26, ptr addrspace(3) %_arg_local_acc, i64 16, i64 10, target("spirv.Event") zeroinitializer)
+  store target("spirv.Event") %call3.i, ptr %dev_event.i.sroa.0
+  ret void
+}
+
+declare dso_local spir_func target("spirv.Event") @_Z22__spirv_GroupAsyncCopyjPU3AS1iPU3AS3Kimm9ocl_event(i32, ptr addrspace(1), ptr addrspace(3), i64, i64, target("spirv.Event"))
diff --git a/llvm/test/CodeGen/SystemZ/anyregcc.ll b/llvm/test/CodeGen/SystemZ/anyregcc.ll
index 76b9352f3004..8f477c929781 100644
--- a/llvm/test/CodeGen/SystemZ/anyregcc.ll
+++ b/llvm/test/CodeGen/SystemZ/anyregcc.ll
@@ -323,37 +323,37 @@ entry:
 ; CHECK-NEXT:   .byte 1
 ; CHECK-NEXT:   .byte 0
 ; CHECK-NEXT:   .short 8
-; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .short 13
 ; CHECK-NEXT:   .short 0
 ; CHECK-NEXT:   .long 0
-; Loc 9: Register
-; CHECK-NEXT:   .byte 1
+; Loc 9: IndirectMem
+; CHECK-NEXT:   .byte 3
 ; CHECK-NEXT:   .byte 0
 ; CHECK-NEXT:   .short 8
 ; CHECK-NEXT:   .short {{[0-9]+}}
 ; CHECK-NEXT:   .short 0
-; CHECK-NEXT:   .long 0
-; Loc 10: Register
-; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .long 344
+; Loc 10: IndirectMem
+; CHECK-NEXT:   .byte 3
 ; CHECK-NEXT:   .byte 0
 ; CHECK-NEXT:   .short 8
 ; CHECK-NEXT:   .short {{[0-9]+}}
 ; CHECK-NEXT:   .short 0
-; CHECK-NEXT:   .long 0
-; Loc 11: Register
-; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .long 352
+; Loc 11: IndirectMem
+; CHECK-NEXT:   .byte 3
 ; CHECK-NEXT:   .byte 0
 ; CHECK-NEXT:   .short 8
 ; CHECK-NEXT:   .short {{[0-9]+}}
 ; CHECK-NEXT:   .short 0
-; CHECK-NEXT:   .long 0
-; Loc 12: Register
-; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .long 360
+; Loc 12: IndirectMem
+; CHECK-NEXT:   .byte 3
 ; CHECK-NEXT:   .byte 0
 ; CHECK-NEXT:   .short 8
 ; CHECK-NEXT:   .short {{[0-9]+}}
 ; CHECK-NEXT:   .short 0
-; CHECK-NEXT:   .long 0
+; CHECK-NEXT:   .long 368
 define i64 @anyreg_test2(ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5, ptr %a6, ptr %a7, ptr %a8, ptr %a9, ptr %a10, ptr %a11, ptr %a12) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 12297829382473034410 to ptr
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
index 4d914e3ea0e1..83050ef87591 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
@@ -2,8 +2,8 @@
 ; loads with a bitcast, and this test case gets converted into that form as
 ; well by the AtomicExpand pass.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
 
 define void @f1(ptr %ret, ptr %src) {
 ; CHECK-LABEL: f1:
@@ -17,6 +17,34 @@ define void @f1(ptr %ret, ptr %src) {
   ret void
 }
 
+define void @f1_fpuse(ptr %ret, ptr %src) {
+; CHECK-LABEL: f1_fpuse:
+; CHECK:       # %bb.0:
+; BASE-NEXT: aghi	%r15, -176
+; BASE-NEXT: .cfi_def_cfa_offset 336
+
+; CHECK-NEXT:	lpq	%r0, 0(%r3)
+
+; BASE-NEXT: stg %r1, 168(%r15)
+; BASE-NEXT: stg %r0, 160(%r15)
+; BASE-NEXT: ld	%f0, 160(%r15)
+; BASE-NEXT: ld	%f2, 168(%r15)
+
+; Z13-NEXT: vlvgp %v0, %r0, %r1
+; Z13-NEXT: vrepg %v2, %v0, 1
+
+; CHECK-NEXT:	axbr	%f0, %f0
+; CHECK-NEXT:	std	%f0, 0(%r2)
+; CHECK-NEXT:	std	%f2, 8(%r2)
+; BASE-NEXT:	aghi	%r15, 176
+; CHECK-NEXT:	br	%r14
+
+  %val = load atomic fp128, ptr %src seq_cst, align 16
+  %use = fadd fp128 %val, %val
+  store fp128 %use, ptr %ret, align 8
+  ret void
+}
+
 define void @f2(ptr %ret, ptr %src) {
 ; CHECK-LABEL: f2:
 ; CHECK: brasl %r14, __atomic_load@PLT
@@ -24,3 +52,19 @@ define void @f2(ptr %ret, ptr %src) {
   store fp128 %val, ptr %ret, align 8
   ret void
 }
+
+define void @f2_fpuse(ptr %ret, ptr %src) {
+; CHECK-LABEL: f2_fpuse:
+; CHECK: brasl %r14, __atomic_load@PLT
+; CHECK-NEXT:   ld	%f0, 160(%r15)
+; CHECK-NEXT:	ld	%f2, 168(%r15)
+; CHECK-NEXT:	axbr	%f0, %f0
+; CHECK-NEXT:	std	%f0, 0(%r13)
+; CHECK-NEXT:	std	%f2, 8(%r13)
+; CHECK-NEXT:	lmg	%r13, %r15, 280(%r15)
+; CHECK-NEXT:	br	%r14
+  %val = load atomic fp128, ptr %src seq_cst, align 8
+  %use = fadd fp128 %val, %val
+  store fp128 %use, ptr %ret, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
index f7f4f4d967db..4d1693477f01 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
@@ -1,8 +1,8 @@
 ; Test long double atomic stores. The atomic store is converted to i128 by
 ; the AtomicExpand pass.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
+; xUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
 
 define void @f1(ptr %dst, ptr %src) {
 ; CHECK-LABEL: f1:
@@ -17,6 +17,29 @@ define void @f1(ptr %dst, ptr %src) {
   ret void
 }
 
+define void @f1_fpsrc(ptr %dst, ptr %src) {
+; CHECK-LABEL: f1_fpsrc:
+; CHECK:       # %bb.0:
+; CHECK-NEXT: ld	%f0, 0(%r3)
+; CHECK-NEXT: ld	%f2, 8(%r3)
+; CHECK-NEXT: axbr	%f0, %f0
+
+; BASE-NEXT: lgdr	%r1, %f2
+; BASE-NEXT: lgdr	%r0, %f0
+
+; Z13-NEXT: vmrhg	%v0, %v0, %v2
+; Z13-NEXT: vlgvg	%r1, %v0, 1
+; Z13-NEXT: vlgvg	%r0, %v0, 0
+
+; CHECK-NEXT: stpq	%r0, 0(%r2)
+; CHECK-NEXT: bcr	15, %r0
+; CHECK-NEXT: br	%r14
+  %val = load fp128, ptr %src, align 8
+  %add = fadd fp128 %val, %val
+  store atomic fp128 %add, ptr %dst seq_cst, align 16
+  ret void
+}
+
 define void @f2(ptr %dst, ptr %src) {
 ; CHECK-LABEL: f2:
 ; CHECK: brasl %r14, __atomic_store@PLT
@@ -24,3 +47,27 @@ define void @f2(ptr %dst, ptr %src) {
   store atomic fp128 %val, ptr %dst seq_cst, align 8
   ret void
 }
+
+define void @f2_fpuse(ptr %dst, ptr %src) {
+; CHECK-LABEL: f2_fpuse:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:	stmg	%r14, %r15, 112(%r15)
+; CHECK-NEXT:	.cfi_offset %r14, -48
+; CHECK-NEXT:	.cfi_offset %r15, -40
+; CHECK-NEXT:	aghi	%r15, -176
+; CHECK-NEXT:	.cfi_def_cfa_offset 336
+; CHECK-NEXT:	ld	%f0, 0(%r3)
+; CHECK-NEXT:	ld	%f2, 8(%r3)
+; CHECK-NEXT:	lgr	%r3, %r2
+; CHECK-NEXT:	axbr	%f0, %f0
+; CHECK-NEXT:	la	%r4, 160(%r15)
+; CHECK-NEXT:	lghi	%r2, 16
+; CHECK-NEXT:	lhi	%r5, 5
+; CHECK-NEXT:	std	%f0, 160(%r15)
+; CHECK-NEXT:	std	%f2, 168(%r15)
+; CHECK-NEXT: brasl %r14, __atomic_store@PLT
+  %val = load fp128, ptr %src, align 8
+  %add = fadd fp128 %val, %val
+  store atomic fp128 %add, ptr %dst seq_cst, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-07.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-07.ll
index 18fa89e6ca6c..f5d8dc092a7e 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-07.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-07.ll
@@ -26,3 +26,45 @@ define void @f1(ptr align 16 %ret, ptr align 16 %src, ptr align 16 %b) {
   store fp128 %res, ptr %ret, align 16
   ret void
 }
+
+define void @f1_fpuse(ptr align 16 %ret, ptr align 16 %src, ptr align 16 %b) {
+; CHECK-LABEL: f1_fpuse:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT:    .cfi_offset %r12, -64
+; CHECK-NEXT:    .cfi_offset %r13, -56
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -176
+; CHECK-NEXT:    .cfi_def_cfa_offset 336
+; CHECK-NEXT:    ld %f0, 0(%r4)
+; CHECK-NEXT:    ld %f2, 8(%r4)
+; CHECK-NEXT:    lg %r0, 8(%r3)
+; CHECK-NEXT:    lg %r1, 0(%r3)
+; CHECK-NEXT:    axbr %f0, %f0
+; CHECK-NEXT:    lgdr %r5, %f2
+; CHECK-NEXT:    lgdr %r4, %f0
+; CHECK-NEXT:  .LBB1_1: # %atomicrmw.start
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lgr %r12, %r1
+; CHECK-NEXT:    lgr %r13, %r0
+; CHECK-NEXT:    cdsg %r12, %r4, 0(%r3)
+; CHECK-NEXT:    lgr %r0, %r13
+; CHECK-NEXT:    lgr %r1, %r12
+; CHECK-NEXT:    jl .LBB1_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
+; CHECK-NEXT:    stg %r1, 160(%r15)
+; CHECK-NEXT:    stg %r0, 168(%r15)
+; CHECK-NEXT:    ld %f0, 160(%r15)
+; CHECK-NEXT:    ld %f2, 168(%r15)
+; CHECK-NEXT:    axbr %f0, %f0
+; CHECK-NEXT:    std %f0, 0(%r2)
+; CHECK-NEXT:    std %f2, 8(%r2)
+; CHECK-NEXT:    lmg %r12, %r15, 272(%r15)
+; CHECK-NEXT:    br %r14
+  %val = load fp128, ptr %b, align 16
+  %add.src = fadd fp128 %val, %val
+  %res = atomicrmw xchg ptr %src, fp128 %add.src seq_cst
+  %res.x2 = fadd fp128 %res, %res
+  store fp128 %res.x2, ptr %ret, align 16
+  ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/copy-phys-reg-gr128-to-vr128.mir b/llvm/test/CodeGen/SystemZ/copy-phys-reg-gr128-to-vr128.mir
new file mode 100644
index 000000000000..a2a07ac5c7f5
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/copy-phys-reg-gr128-to-vr128.mir
@@ -0,0 +1,78 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=s390x-ibm-linux -mcpu=z13 -run-pass=postrapseudos -o - %s | FileCheck %s
+
+---
+name:            copy_gr128_to_vr128__r0q_to_v0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $r0q
+    ; CHECK-LABEL: name: copy_gr128_to_vr128__r0q_to_v0
+    ; CHECK: liveins: $r0q
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $v0 = VLVGP $r0d, $r1d
+    ; CHECK-NEXT: Return implicit $v0
+    $v0 = COPY $r0q
+    Return implicit $v0
+...
+
+---
+name:            copy_gr128_to_vr128__r0q_to_v0_killed
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $r0q
+    ; CHECK-LABEL: name: copy_gr128_to_vr128__r0q_to_v0_killed
+    ; CHECK: liveins: $r0q
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $v0 = VLVGP $r0d, $r1d
+    ; CHECK-NEXT: Return implicit $v0
+    $v0 = COPY killed $r0q
+    Return implicit $v0
+...
+
+---
+name:            copy_gr128_to_vr128__r0q_to_v0_undef
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $r0q
+    ; CHECK-LABEL: name: copy_gr128_to_vr128__r0q_to_v0_undef
+    ; CHECK: liveins: $r0q
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $v0 = KILL undef $r0q
+    ; CHECK-NEXT: Return implicit $v0
+    $v0 = COPY undef $r0q
+    Return implicit $v0
+...
+
+---
+name:            copy_gr128_to_vr128__r0q_to_v0_subreg0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $r0d
+    ; CHECK-LABEL: name: copy_gr128_to_vr128__r0q_to_v0_subreg0
+    ; CHECK: liveins: $r0d
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $v0 = VLVGP $r0d, $r1d
+    ; CHECK-NEXT: Return implicit $v0
+    $v0 = COPY $r0q
+    Return implicit $v0
+...
+
+---
+name:            copy_gr128_to_vr128__r0q_to_v0_subreg1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $r1d
+    ; CHECK-LABEL: name: copy_gr128_to_vr128__r0q_to_v0_subreg1
+    ; CHECK: liveins: $r1d
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $v0 = VLVGP $r0d, $r1d
+    ; CHECK-NEXT: Return implicit $v0
+    $v0 = COPY $r0q
+    Return implicit $v0
+...
+
diff --git a/llvm/test/CodeGen/SystemZ/copy-phys-reg-vr128-to-gr128.mir b/llvm/test/CodeGen/SystemZ/copy-phys-reg-vr128-to-gr128.mir
new file mode 100644
index 000000000000..c1141aaf7a2e
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/copy-phys-reg-vr128-to-gr128.mir
@@ -0,0 +1,70 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=s390x-ibm-linux -mcpu=z13 -run-pass=postrapseudos -o - %s | FileCheck %s
+
+---
+name:            copy_vr128_to_gr128__v0_to_r0q
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $v0
+    ; CHECK-LABEL: name: copy_vr128_to_gr128__v0_to_r0q
+    ; CHECK: liveins: $v0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $r0d = VLGVG $v0, $noreg, 0, implicit-def $r0q
+    ; CHECK-NEXT: $r1d = VLGVG $v0, $noreg, 1
+    ; CHECK-NEXT: Return implicit $r0q
+    $r0q = COPY $v0
+    Return implicit $r0q
+...
+
+---
+name:            copy_vr128_to_gr128__v0_to_r0q_killed
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $v0
+    ; CHECK-LABEL: name: copy_vr128_to_gr128__v0_to_r0q_killed
+    ; CHECK: liveins: $v0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $r0d = VLGVG $v0, $noreg, 0, implicit-def $r0q
+    ; CHECK-NEXT: $r1d = VLGVG killed $v0, $noreg, 1
+    ; CHECK-NEXT: Return implicit $r0q
+    $r0q = COPY killed $v0
+    Return implicit $r0q
+...
+
+---
+name:            copy_vr128_to_gr128__v0_to_r0q_undef
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: copy_vr128_to_gr128__v0_to_r0q_undef
+    ; CHECK: $r0q = KILL undef $v0
+    ; CHECK-NEXT: Return implicit $r0q
+    $r0q = COPY undef $v0
+    Return implicit $r0q
+...
+
+---
+name:            copy_vr128_to_gr128__v0_to_r0q_undef_use_subreg0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: copy_vr128_to_gr128__v0_to_r0q_undef_use_subreg0
+    ; CHECK: $r0q = KILL undef $v0
+    ; CHECK-NEXT: Return implicit $r0d
+    $r0q = COPY undef $v0
+    Return implicit $r0d
+...
+
+---
+name:            copy_vr128_to_gr128__v0_to_r0q_undef_use_subreg1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: copy_vr128_to_gr128__v0_to_r0q_undef_use_subreg1
+    ; CHECK: $r0q = KILL undef $v0
+    ; CHECK-NEXT: Return implicit $r1d
+    $r0q = COPY undef $v0
+    Return implicit $r1d
+...
diff --git a/llvm/test/CodeGen/SystemZ/foldmem-peep.mir b/llvm/test/CodeGen/SystemZ/foldmem-peep.mir
new file mode 100644
index 000000000000..c6a244f43061
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/foldmem-peep.mir
@@ -0,0 +1,105 @@
+# RUN: llc -mtriple=s390x-linux-gnu -mcpu=z16 -start-before=peephole-opt \
+# RUN:   -stop-after=peephole-opt %s -o - | FileCheck %s
+
+--- |
+  define double @f1(ptr %x, i32 %a, i32 %b, i32 %limit, ptr %dst) #0 {
+    %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+    ret double 0.0
+  }
+  define double @f2(ptr %x, i32 %a, i32 %b, i32 %limit, ptr %dst) #0 {
+    %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+    ret double 0.0
+  }
+
+...
+
+# Do not fold where CC is live.
+# CHECK: name: f1
+# CHECK: {{.*}} WFADB
+---
+name:            f1
+alignment:       16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: addr64bit }
+  - { id: 1, class: gr32bit }
+  - { id: 2, class: gr32bit }
+  - { id: 3, class: gr32bit }
+  - { id: 4, class: addr64bit }
+  - { id: 5, class: vr64bit }
+  - { id: 6, class: vr64bit }
+  - { id: 7, class: vr64bit }
+  - { id: 8, class: grx32bit }
+liveins:
+  - { reg: '$r2d', virtual-reg: '%0' }
+  - { reg: '$r3l', virtual-reg: '%1' }
+  - { reg: '$r4l', virtual-reg: '%2' }
+  - { reg: '$r5l', virtual-reg: '%3' }
+  - { reg: '$r6d', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $r2d, $r3l, $r4l, $r5l, $r6d
+  
+    %4:addr64bit = COPY $r6d
+    %3:gr32bit = COPY $r5l
+    %2:gr32bit = COPY $r4l
+    %1:gr32bit = COPY $r3l
+    %0:addr64bit = COPY $r2d
+    CLFIMux %3, 42, implicit-def $cc
+    %5:vr64bit = VL64 %0, 0, $noreg :: (load (s64) from %ir.x)
+    %6:vr64bit = VL64 %0, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+    %7:vr64bit = nsz arcp contract afn reassoc nofpexcept WFADB killed %6, killed %5, implicit $fpc
+    %8:grx32bit = SELRMux %2, %1, 14, 4, implicit $cc
+    STMux killed %8, %4, 0, $noreg :: (store (s32) into %ir.dst)
+    $f0d = COPY %7
+    Return implicit $f0d
+
+...
+
+# Do not fold where CC is live in.
+# CHECK: name: f2
+# CHECK: {{.*}} WFADB
+---
+name:            f2
+alignment:       16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: addr64bit }
+  - { id: 1, class: gr32bit }
+  - { id: 2, class: gr32bit }
+  - { id: 3, class: gr32bit }
+  - { id: 4, class: addr64bit }
+  - { id: 5, class: vr64bit }
+  - { id: 6, class: vr64bit }
+  - { id: 7, class: vr64bit }
+  - { id: 8, class: grx32bit }
+liveins:
+  - { reg: '$r2d', virtual-reg: '%0' }
+  - { reg: '$r3l', virtual-reg: '%1' }
+  - { reg: '$r4l', virtual-reg: '%2' }
+  - { reg: '$r5l', virtual-reg: '%3' }
+  - { reg: '$r6d', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $r2d, $r3l, $r4l, $r5l, $r6d, $cc
+  
+    %4:addr64bit = COPY $r6d
+    %3:gr32bit = COPY $r5l
+    %2:gr32bit = COPY $r4l
+    %1:gr32bit = COPY $r3l
+    %0:addr64bit = COPY $r2d
+    %5:vr64bit = VL64 %0, 0, $noreg :: (load (s64) from %ir.x)
+    %6:vr64bit = VL64 %0, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+    %7:vr64bit = nsz arcp contract afn reassoc nofpexcept WFADB killed %6, killed %5, implicit $fpc
+    %8:grx32bit = SELRMux %2, %1, 14, 4, implicit $cc
+    STMux killed %8, %4, 0, $noreg :: (store (s32) into %ir.dst)
+    $f0d = COPY %7
+    Return implicit $f0d
+
+...
diff --git a/llvm/test/CodeGen/SystemZ/fp-add-01.ll b/llvm/test/CodeGen/SystemZ/fp-add-01.ll
index f60fb8345b4a..eb845bae9b80 100644
--- a/llvm/test/CodeGen/SystemZ/fp-add-01.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-add-01.ll
@@ -119,3 +119,15 @@ define float @f7(ptr %ptr0) {
 
   ret float %add10
 }
+
+; Check that reassociation flags do not get in the way of AEB.
+define float @f8(ptr %x) {
+; CHECK-LABEL: f8:
+; CHECK: aeb %f0
+entry:
+  %0 = load float, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+  %1 = load float, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn float %1, %0
+  ret float %add
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-add-02.ll b/llvm/test/CodeGen/SystemZ/fp-add-02.ll
index bb12196fb848..7866f98240ea 100644
--- a/llvm/test/CodeGen/SystemZ/fp-add-02.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-add-02.ll
@@ -118,3 +118,17 @@ define double @f7(ptr %ptr0) {
 
   ret double %add10
 }
+
+; Check that reassociation flags do not get in the way of ADB.
+define double @f8(ptr %x) {
+; CHECK-LABEL: f8:
+; CHECK: ld %f0
+; CHECK: adb %f0
+; CHECK: br %r14
+entry:
+  %0 = load double, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %1 = load double, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn double %1, %0
+  ret double %add
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-01.ll b/llvm/test/CodeGen/SystemZ/fp-mul-01.ll
index 144e3208c5eb..c5e66ff72c2a 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-01.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-01.ll
@@ -119,3 +119,15 @@ define float @f7(ptr %ptr0) {
 
   ret float %mul10
 }
+
+; Check that reassociation flags do not get in the way of MEEB.
+define float @f8(ptr %x) {
+; CHECK-LABEL: f8:
+; CHECK: meeb %f0
+entry:
+  %0 = load float, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+  %1 = load float, ptr %arrayidx1, align 8
+  %add = fmul reassoc nsz arcp contract afn float %1, %0
+  ret float %add
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-03.ll b/llvm/test/CodeGen/SystemZ/fp-mul-03.ll
index dbd6975af413..820fdbd6f5bd 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-03.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-03.ll
@@ -119,3 +119,17 @@ define double @f7(ptr %ptr0) {
 
   ret double %mul10
 }
+
+; Check that reassociation flags do not get in the way of MDB.
+define double @f8(ptr %x) {
+; CHECK-LABEL: f8:
+; CHECK: ld %f0
+; CHECK: mdb %f0
+; CHECK: br %r14
+entry:
+  %0 = load double, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %1 = load double, ptr %arrayidx1, align 8
+  %add = fmul reassoc nsz arcp contract afn double %1, %0
+  ret double %add
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-sub-01.ll b/llvm/test/CodeGen/SystemZ/fp-sub-01.ll
index a6e01112619e..e875fa3be735 100644
--- a/llvm/test/CodeGen/SystemZ/fp-sub-01.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-sub-01.ll
@@ -119,3 +119,15 @@ define float @f7(ptr %ptr0) {
 
   ret float %sub10
 }
+
+; Check that reassociation flags do not get in the way of SEB.
+define float @f8(ptr %x) {
+; CHECK-LABEL: f8:
+; CHECK: seb %f0
+entry:
+  %0 = load float, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+  %1 = load float, ptr %arrayidx1, align 8
+  %add = fsub reassoc nsz arcp contract afn float %1, %0
+  ret float %add
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-sub-02.ll b/llvm/test/CodeGen/SystemZ/fp-sub-02.ll
index c564c2de3188..3219b6e4be8f 100644
--- a/llvm/test/CodeGen/SystemZ/fp-sub-02.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-sub-02.ll
@@ -119,3 +119,17 @@ define double @f7(ptr %ptr0) {
 
   ret double %sub10
 }
+
+; Check that reassociation flags do not get in the way of SDB.
+define double @f8(ptr %x) {
+; CHECK-LABEL: f8:
+; CHECK: ld %f0
+; CHECK: sdb %f0
+; CHECK: br %r14
+entry:
+  %0 = load double, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %1 = load double, ptr %arrayidx1, align 8
+  %add = fsub reassoc nsz arcp contract afn double %1, %0
+  ret double %add
+}
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll
new file mode 100644
index 000000000000..fdf1be68a543
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll
@@ -0,0 +1,653 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
+; RUN:   | FileCheck %s
+
+; Test reassociation of fp add, subtract and multiply.
+
+define double @fun0_fadd(ptr %x) {
+; CHECK-LABEL: fun0_fadd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    ld %f0, 0(%r2)
+; CHECK-NEXT:    adb %f0, 8(%r2)
+; CHECK-NEXT:    ld %f1, 24(%r2)
+; CHECK-NEXT:    adb %f1, 16(%r2)
+; CHECK-NEXT:    adbr %f0, %f1
+; CHECK-NEXT:    ld %f1, 40(%r2)
+; CHECK-NEXT:    adb %f1, 32(%r2)
+; CHECK-NEXT:    adb %f1, 48(%r2)
+; CHECK-NEXT:    adbr %f0, %f1
+; CHECK-NEXT:    adb %f0, 56(%r2)
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load double, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %1 = load double, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn double %1, %0
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %2 = load double, ptr %arrayidx2, align 8
+  %add3 = fadd reassoc nsz arcp contract afn double %add, %2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+  %3 = load double, ptr %arrayidx4, align 8
+  %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
+  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+  %4 = load double, ptr %arrayidx6, align 8
+  %add7 = fadd reassoc nsz arcp contract afn double %add5, %4
+  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+  %5 = load double, ptr %arrayidx8, align 8
+  %add9 = fadd reassoc nsz arcp contract afn double %add7, %5
+  %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+  %6 = load double, ptr %arrayidx10, align 8
+  %add11 = fadd reassoc nsz arcp contract afn double %add9, %6
+  %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+  %7 = load double, ptr %arrayidx12, align 8
+  %add13 = fadd reassoc nsz arcp contract afn double %add11, %7
+  ret double %add13
+}
+
+define float @fun1_fadd(ptr %x) {
+; CHECK-LABEL: fun1_fadd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lde %f0, 0(%r2)
+; CHECK-NEXT:    aeb %f0, 4(%r2)
+; CHECK-NEXT:    lde %f1, 12(%r2)
+; CHECK-NEXT:    aeb %f1, 8(%r2)
+; CHECK-NEXT:    aebr %f0, %f1
+; CHECK-NEXT:    lde %f1, 20(%r2)
+; CHECK-NEXT:    aeb %f1, 16(%r2)
+; CHECK-NEXT:    aeb %f1, 24(%r2)
+; CHECK-NEXT:    aebr %f0, %f1
+; CHECK-NEXT:    aeb %f0, 28(%r2)
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load float, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+  %1 = load float, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn float %1, %0
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2
+  %2 = load float, ptr %arrayidx2, align 8
+  %add3 = fadd reassoc nsz arcp contract afn float %add, %2
+  %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3
+  %3 = load float, ptr %arrayidx4, align 8
+  %add5 = fadd reassoc nsz arcp contract afn float %add3, %3
+  %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4
+  %4 = load float, ptr %arrayidx6, align 8
+  %add7 = fadd reassoc nsz arcp contract afn float %add5, %4
+  %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5
+  %5 = load float, ptr %arrayidx8, align 8
+  %add9 = fadd reassoc nsz arcp contract afn float %add7, %5
+  %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6
+  %6 = load float, ptr %arrayidx10, align 8
+  %add11 = fadd reassoc nsz arcp contract afn float %add9, %6
+  %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7
+  %7 = load float, ptr %arrayidx12, align 8
+  %add13 = fadd reassoc nsz arcp contract afn float %add11, %7
+  ret float %add13
+}
+
+define fp128 @fun2_fadd(ptr %x) {
+; CHECK-LABEL: fun2_fadd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 16(%r3), 3
+; CHECK-NEXT:    wfaxb %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 32(%r3), 3
+; CHECK-NEXT:    vl %v2, 48(%r3), 3
+; CHECK-NEXT:    wfaxb %v1, %v1, %v2
+; CHECK-NEXT:    wfaxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r3), 3
+; CHECK-NEXT:    vl %v2, 80(%r3), 3
+; CHECK-NEXT:    wfaxb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r3), 3
+; CHECK-NEXT:    wfaxb %v1, %v1, %v2
+; CHECK-NEXT:    wfaxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r3), 3
+; CHECK-NEXT:    wfaxb %v0, %v0, %v1
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load fp128, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1
+  %1 = load fp128, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn fp128 %1, %0
+  %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2
+  %2 = load fp128, ptr %arrayidx2, align 8
+  %add3 = fadd reassoc nsz arcp contract afn fp128 %add, %2
+  %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3
+  %3 = load fp128, ptr %arrayidx4, align 8
+  %add5 = fadd reassoc nsz arcp contract afn fp128 %add3, %3
+  %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4
+  %4 = load fp128, ptr %arrayidx6, align 8
+  %add7 = fadd reassoc nsz arcp contract afn fp128 %add5, %4
+  %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5
+  %5 = load fp128, ptr %arrayidx8, align 8
+  %add9 = fadd reassoc nsz arcp contract afn fp128 %add7, %5
+  %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6
+  %6 = load fp128, ptr %arrayidx10, align 8
+  %add11 = fadd reassoc nsz arcp contract afn fp128 %add9, %6
+  %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7
+  %7 = load fp128, ptr %arrayidx12, align 8
+  %add13 = fadd reassoc nsz arcp contract afn fp128 %add11, %7
+  ret fp128 %add13
+}
+
+define <2 x double> @fun3_fadd(ptr %x) {
+; CHECK-LABEL: fun3_fadd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r2), 3
+; CHECK-NEXT:    vl %v1, 16(%r2), 3
+; CHECK-NEXT:    vfadb %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 32(%r2), 3
+; CHECK-NEXT:    vl %v2, 48(%r2), 3
+; CHECK-NEXT:    vfadb %v1, %v1, %v2
+; CHECK-NEXT:    vfadb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r2), 3
+; CHECK-NEXT:    vl %v2, 80(%r2), 3
+; CHECK-NEXT:    vfadb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r2), 3
+; CHECK-NEXT:    vfadb %v1, %v1, %v2
+; CHECK-NEXT:    vfadb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r2), 3
+; CHECK-NEXT:    vfadb %v24, %v0, %v1
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load <2 x double>, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1
+  %1 = load <2 x double>, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn <2 x double> %1, %0
+  %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2
+  %2 = load <2 x double>, ptr %arrayidx2, align 8
+  %add3 = fadd reassoc nsz arcp contract afn <2 x double> %add, %2
+  %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3
+  %3 = load <2 x double>, ptr %arrayidx4, align 8
+  %add5 = fadd reassoc nsz arcp contract afn <2 x double> %add3, %3
+  %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4
+  %4 = load <2 x double>, ptr %arrayidx6, align 8
+  %add7 = fadd reassoc nsz arcp contract afn <2 x double> %add5, %4
+  %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5
+  %5 = load <2 x double>, ptr %arrayidx8, align 8
+  %add9 = fadd reassoc nsz arcp contract afn <2 x double> %add7, %5
+  %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6
+  %6 = load <2 x double>, ptr %arrayidx10, align 8
+  %add11 = fadd reassoc nsz arcp contract afn <2 x double> %add9, %6
+  %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7
+  %7 = load <2 x double>, ptr %arrayidx12, align 8
+  %add13 = fadd reassoc nsz arcp contract afn <2 x double> %add11, %7
+  ret <2 x double> %add13
+}
+
+define <4 x float> @fun4_fadd(ptr %x) {
+; CHECK-LABEL: fun4_fadd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r2), 3
+; CHECK-NEXT:    vl %v1, 16(%r2), 3
+; CHECK-NEXT:    vfasb %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 32(%r2), 3
+; CHECK-NEXT:    vl %v2, 48(%r2), 3
+; CHECK-NEXT:    vfasb %v1, %v1, %v2
+; CHECK-NEXT:    vfasb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r2), 3
+; CHECK-NEXT:    vl %v2, 80(%r2), 3
+; CHECK-NEXT:    vfasb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r2), 3
+; CHECK-NEXT:    vfasb %v1, %v1, %v2
+; CHECK-NEXT:    vfasb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r2), 3
+; CHECK-NEXT:    vfasb %v24, %v0, %v1
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load <4 x float>, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1
+  %1 = load <4 x float>, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn <4 x float> %1, %0
+  %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2
+  %2 = load <4 x float>, ptr %arrayidx2, align 8
+  %add3 = fadd reassoc nsz arcp contract afn <4 x float> %add, %2
+  %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3
+  %3 = load <4 x float>, ptr %arrayidx4, align 8
+  %add5 = fadd reassoc nsz arcp contract afn <4 x float> %add3, %3
+  %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4
+  %4 = load <4 x float>, ptr %arrayidx6, align 8
+  %add7 = fadd reassoc nsz arcp contract afn <4 x float> %add5, %4
+  %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5
+  %5 = load <4 x float>, ptr %arrayidx8, align 8
+  %add9 = fadd reassoc nsz arcp contract afn <4 x float> %add7, %5
+  %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6
+  %6 = load <4 x float>, ptr %arrayidx10, align 8
+  %add11 = fadd reassoc nsz arcp contract afn <4 x float> %add9, %6
+  %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7
+  %7 = load <4 x float>, ptr %arrayidx12, align 8
+  %add13 = fadd reassoc nsz arcp contract afn <4 x float> %add11, %7
+  ret <4 x float> %add13
+}
+
+define double @fun5_fsub(ptr %x) {
+; CHECK-LABEL: fun5_fsub:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    ld %f0, 0(%r2)
+; CHECK-NEXT:    sdb %f0, 8(%r2)
+; CHECK-NEXT:    ld %f1, 24(%r2)
+; CHECK-NEXT:    adb %f1, 16(%r2)
+; CHECK-NEXT:    sdbr %f0, %f1
+; CHECK-NEXT:    ld %f1, 40(%r2)
+; CHECK-NEXT:    adb %f1, 32(%r2)
+; CHECK-NEXT:    adb %f1, 48(%r2)
+; CHECK-NEXT:    sdbr %f0, %f1
+; CHECK-NEXT:    sdb %f0, 56(%r2)
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load double, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %1 = load double, ptr %arrayidx1, align 8
+  %sub = fsub reassoc nsz arcp contract afn double %0, %1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %2 = load double, ptr %arrayidx2, align 8
+  %sub3 = fsub reassoc nsz arcp contract afn double %sub, %2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+  %3 = load double, ptr %arrayidx4, align 8
+  %sub5 = fsub reassoc nsz arcp contract afn double %sub3, %3
+  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+  %4 = load double, ptr %arrayidx6, align 8
+  %sub7 = fsub reassoc nsz arcp contract afn double %sub5, %4
+  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+  %5 = load double, ptr %arrayidx8, align 8
+  %sub9 = fsub reassoc nsz arcp contract afn double %sub7, %5
+  %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+  %6 = load double, ptr %arrayidx10, align 8
+  %sub11 = fsub reassoc nsz arcp contract afn double %sub9, %6
+  %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+  %7 = load double, ptr %arrayidx12, align 8
+  %sub13 = fsub reassoc nsz arcp contract afn double %sub11, %7
+  ret double %sub13
+}
+
+define float @fun6_fsub(ptr %x) {
+; CHECK-LABEL: fun6_fsub:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lde %f0, 0(%r2)
+; CHECK-NEXT:    seb %f0, 4(%r2)
+; CHECK-NEXT:    lde %f1, 12(%r2)
+; CHECK-NEXT:    aeb %f1, 8(%r2)
+; CHECK-NEXT:    sebr %f0, %f1
+; CHECK-NEXT:    lde %f1, 20(%r2)
+; CHECK-NEXT:    aeb %f1, 16(%r2)
+; CHECK-NEXT:    aeb %f1, 24(%r2)
+; CHECK-NEXT:    sebr %f0, %f1
+; CHECK-NEXT:    seb %f0, 28(%r2)
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load float, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+  %1 = load float, ptr %arrayidx1, align 8
+  %sub = fsub reassoc nsz arcp contract afn float %0, %1
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2
+  %2 = load float, ptr %arrayidx2, align 8
+  %sub3 = fsub reassoc nsz arcp contract afn float %sub, %2
+  %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3
+  %3 = load float, ptr %arrayidx4, align 8
+  %sub5 = fsub reassoc nsz arcp contract afn float %sub3, %3
+  %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4
+  %4 = load float, ptr %arrayidx6, align 8
+  %sub7 = fsub reassoc nsz arcp contract afn float %sub5, %4
+  %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5
+  %5 = load float, ptr %arrayidx8, align 8
+  %sub9 = fsub reassoc nsz arcp contract afn float %sub7, %5
+  %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6
+  %6 = load float, ptr %arrayidx10, align 8
+  %sub11 = fsub reassoc nsz arcp contract afn float %sub9, %6
+  %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7
+  %7 = load float, ptr %arrayidx12, align 8
+  %sub13 = fsub reassoc nsz arcp contract afn float %sub11, %7
+  ret float %sub13
+}
+
+define fp128 @fun7_fsub(ptr %x) {
+; CHECK-LABEL: fun7_fsub:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 16(%r3), 3
+; CHECK-NEXT:    wfsxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 32(%r3), 3
+; CHECK-NEXT:    vl %v2, 48(%r3), 3
+; CHECK-NEXT:    wfaxb %v1, %v1, %v2
+; CHECK-NEXT:    wfsxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r3), 3
+; CHECK-NEXT:    vl %v2, 80(%r3), 3
+; CHECK-NEXT:    wfaxb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r3), 3
+; CHECK-NEXT:    wfaxb %v1, %v1, %v2
+; CHECK-NEXT:    wfsxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r3), 3
+; CHECK-NEXT:    wfsxb %v0, %v0, %v1
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load fp128, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1
+  %1 = load fp128, ptr %arrayidx1, align 8
+  %sub = fsub reassoc nsz arcp contract afn fp128 %0, %1
+  %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2
+  %2 = load fp128, ptr %arrayidx2, align 8
+  %sub3 = fsub reassoc nsz arcp contract afn fp128 %sub, %2
+  %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3
+  %3 = load fp128, ptr %arrayidx4, align 8
+  %sub5 = fsub reassoc nsz arcp contract afn fp128 %sub3, %3
+  %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4
+  %4 = load fp128, ptr %arrayidx6, align 8
+  %sub7 = fsub reassoc nsz arcp contract afn fp128 %sub5, %4
+  %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5
+  %5 = load fp128, ptr %arrayidx8, align 8
+  %sub9 = fsub reassoc nsz arcp contract afn fp128 %sub7, %5
+  %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6
+  %6 = load fp128, ptr %arrayidx10, align 8
+  %sub11 = fsub reassoc nsz arcp contract afn fp128 %sub9, %6
+  %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7
+  %7 = load fp128, ptr %arrayidx12, align 8
+  %sub13 = fsub reassoc nsz arcp contract afn fp128 %sub11, %7
+  ret fp128 %sub13
+}
+
+define <2 x double> @fun8_fsub(ptr %x) {
+; CHECK-LABEL: fun8_fsub:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r2), 3
+; CHECK-NEXT:    vl %v1, 16(%r2), 3
+; CHECK-NEXT:    vfsdb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 32(%r2), 3
+; CHECK-NEXT:    vl %v2, 48(%r2), 3
+; CHECK-NEXT:    vfadb %v1, %v1, %v2
+; CHECK-NEXT:    vfsdb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r2), 3
+; CHECK-NEXT:    vl %v2, 80(%r2), 3
+; CHECK-NEXT:    vfadb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r2), 3
+; CHECK-NEXT:    vfadb %v1, %v1, %v2
+; CHECK-NEXT:    vfsdb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r2), 3
+; CHECK-NEXT:    vfsdb %v24, %v0, %v1
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load <2 x double>, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1
+  %1 = load <2 x double>, ptr %arrayidx1, align 8
+  %sub = fsub reassoc nsz arcp contract afn <2 x double> %0, %1
+  %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2
+  %2 = load <2 x double>, ptr %arrayidx2, align 8
+  %sub3 = fsub reassoc nsz arcp contract afn <2 x double> %sub, %2
+  %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3
+  %3 = load <2 x double>, ptr %arrayidx4, align 8
+  %sub5 = fsub reassoc nsz arcp contract afn <2 x double> %sub3, %3
+  %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4
+  %4 = load <2 x double>, ptr %arrayidx6, align 8
+  %sub7 = fsub reassoc nsz arcp contract afn <2 x double> %sub5, %4
+  %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5
+  %5 = load <2 x double>, ptr %arrayidx8, align 8
+  %sub9 = fsub reassoc nsz arcp contract afn <2 x double> %sub7, %5
+  %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6
+  %6 = load <2 x double>, ptr %arrayidx10, align 8
+  %sub11 = fsub reassoc nsz arcp contract afn <2 x double> %sub9, %6
+  %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7
+  %7 = load <2 x double>, ptr %arrayidx12, align 8
+  %sub13 = fsub reassoc nsz arcp contract afn <2 x double> %sub11, %7
+  ret <2 x double> %sub13
+}
+
+define <4 x float> @fun9_fsub(ptr %x) {
+; CHECK-LABEL: fun9_fsub:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r2), 3
+; CHECK-NEXT:    vl %v1, 16(%r2), 3
+; CHECK-NEXT:    vfssb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 32(%r2), 3
+; CHECK-NEXT:    vl %v2, 48(%r2), 3
+; CHECK-NEXT:    vfasb %v1, %v1, %v2
+; CHECK-NEXT:    vfssb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r2), 3
+; CHECK-NEXT:    vl %v2, 80(%r2), 3
+; CHECK-NEXT:    vfasb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r2), 3
+; CHECK-NEXT:    vfasb %v1, %v1, %v2
+; CHECK-NEXT:    vfssb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r2), 3
+; CHECK-NEXT:    vfssb %v24, %v0, %v1
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load <4 x float>, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1
+  %1 = load <4 x float>, ptr %arrayidx1, align 8
+  %sub = fsub reassoc nsz arcp contract afn <4 x float> %0, %1
+  %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2
+  %2 = load <4 x float>, ptr %arrayidx2, align 8
+  %sub3 = fsub reassoc nsz arcp contract afn <4 x float> %sub, %2
+  %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3
+  %3 = load <4 x float>, ptr %arrayidx4, align 8
+  %sub5 = fsub reassoc nsz arcp contract afn <4 x float> %sub3, %3
+  %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4
+  %4 = load <4 x float>, ptr %arrayidx6, align 8
+  %sub7 = fsub reassoc nsz arcp contract afn <4 x float> %sub5, %4
+  %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5
+  %5 = load <4 x float>, ptr %arrayidx8, align 8
+  %sub9 = fsub reassoc nsz arcp contract afn <4 x float> %sub7, %5
+  %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6
+  %6 = load <4 x float>, ptr %arrayidx10, align 8
+  %sub11 = fsub reassoc nsz arcp contract afn <4 x float> %sub9, %6
+  %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7
+  %7 = load <4 x float>, ptr %arrayidx12, align 8
+  %sub13 = fsub reassoc nsz arcp contract afn <4 x float> %sub11, %7
+  ret <4 x float> %sub13
+}
+
+define double @fun10_fmul(ptr %x) {
+; CHECK-LABEL: fun10_fmul:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    ld %f0, 8(%r2)
+; CHECK-NEXT:    mdb %f0, 0(%r2)
+; CHECK-NEXT:    ld %f1, 24(%r2)
+; CHECK-NEXT:    mdb %f1, 16(%r2)
+; CHECK-NEXT:    mdbr %f0, %f1
+; CHECK-NEXT:    ld %f1, 40(%r2)
+; CHECK-NEXT:    mdb %f1, 32(%r2)
+; CHECK-NEXT:    mdb %f1, 48(%r2)
+; CHECK-NEXT:    mdbr %f0, %f1
+; CHECK-NEXT:    mdb %f0, 56(%r2)
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load double, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %1 = load double, ptr %arrayidx1, align 8
+  %mul = fmul reassoc nsz arcp contract afn double %0, %1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %2 = load double, ptr %arrayidx2, align 8
+  %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+  %3 = load double, ptr %arrayidx4, align 8
+  %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
+  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+  %4 = load double, ptr %arrayidx6, align 8
+  %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4
+  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+  %5 = load double, ptr %arrayidx8, align 8
+  %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5
+  %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+  %6 = load double, ptr %arrayidx10, align 8
+  %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6
+  %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+  %7 = load double, ptr %arrayidx12, align 8
+  %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7
+  ret double %mul13
+}
+
+define float @fun11_fmul(ptr %x) {
+; CHECK-LABEL: fun11_fmul:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lde %f0, 4(%r2)
+; CHECK-NEXT:    meeb %f0, 0(%r2)
+; CHECK-NEXT:    lde %f1, 12(%r2)
+; CHECK-NEXT:    meeb %f1, 8(%r2)
+; CHECK-NEXT:    meebr %f0, %f1
+; CHECK-NEXT:    lde %f1, 20(%r2)
+; CHECK-NEXT:    meeb %f1, 16(%r2)
+; CHECK-NEXT:    meeb %f1, 24(%r2)
+; CHECK-NEXT:    meebr %f0, %f1
+; CHECK-NEXT:    meeb %f0, 28(%r2)
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load float, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+  %1 = load float, ptr %arrayidx1, align 8
+  %mul = fmul reassoc nsz arcp contract afn float %0, %1
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2
+  %2 = load float, ptr %arrayidx2, align 8
+  %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2
+  %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3
+  %3 = load float, ptr %arrayidx4, align 8
+  %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3
+  %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4
+  %4 = load float, ptr %arrayidx6, align 8
+  %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4
+  %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5
+  %5 = load float, ptr %arrayidx8, align 8
+  %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5
+  %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6
+  %6 = load float, ptr %arrayidx10, align 8
+  %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6
+  %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7
+  %7 = load float, ptr %arrayidx12, align 8
+  %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7
+  ret float %mul13
+}
+
+define fp128 @fun12_fmul(ptr %x) {
+; CHECK-LABEL: fun12_fmul:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 16(%r3), 3
+; CHECK-NEXT:    wfmxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 32(%r3), 3
+; CHECK-NEXT:    vl %v2, 48(%r3), 3
+; CHECK-NEXT:    wfmxb %v1, %v1, %v2
+; CHECK-NEXT:    wfmxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r3), 3
+; CHECK-NEXT:    vl %v2, 80(%r3), 3
+; CHECK-NEXT:    wfmxb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r3), 3
+; CHECK-NEXT:    wfmxb %v1, %v1, %v2
+; CHECK-NEXT:    wfmxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r3), 3
+; CHECK-NEXT:    wfmxb %v0, %v0, %v1
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load fp128, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1
+  %1 = load fp128, ptr %arrayidx1, align 8
+  %mul = fmul reassoc nsz arcp contract afn fp128 %0, %1
+  %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2
+  %2 = load fp128, ptr %arrayidx2, align 8
+  %mul3 = fmul reassoc nsz arcp contract afn fp128 %mul, %2
+  %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3
+  %3 = load fp128, ptr %arrayidx4, align 8
+  %mul5 = fmul reassoc nsz arcp contract afn fp128 %mul3, %3
+  %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4
+  %4 = load fp128, ptr %arrayidx6, align 8
+  %mul7 = fmul reassoc nsz arcp contract afn fp128 %mul5, %4
+  %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5
+  %5 = load fp128, ptr %arrayidx8, align 8
+  %mul9 = fmul reassoc nsz arcp contract afn fp128 %mul7, %5
+  %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6
+  %6 = load fp128, ptr %arrayidx10, align 8
+  %mul11 = fmul reassoc nsz arcp contract afn fp128 %mul9, %6
+  %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7
+  %7 = load fp128, ptr %arrayidx12, align 8
+  %mul13 = fmul reassoc nsz arcp contract afn fp128 %mul11, %7
+  ret fp128 %mul13
+}
+
+define <2 x double> @fun13_fmul(ptr %x) {
+; CHECK-LABEL: fun13_fmul:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r2), 3
+; CHECK-NEXT:    vl %v1, 16(%r2), 3
+; CHECK-NEXT:    vfmdb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 32(%r2), 3
+; CHECK-NEXT:    vl %v2, 48(%r2), 3
+; CHECK-NEXT:    vfmdb %v1, %v1, %v2
+; CHECK-NEXT:    vfmdb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r2), 3
+; CHECK-NEXT:    vl %v2, 80(%r2), 3
+; CHECK-NEXT:    vfmdb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r2), 3
+; CHECK-NEXT:    vfmdb %v1, %v1, %v2
+; CHECK-NEXT:    vfmdb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r2), 3
+; CHECK-NEXT:    vfmdb %v24, %v0, %v1
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load <2 x double>, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1
+  %1 = load <2 x double>, ptr %arrayidx1, align 8
+  %mul = fmul reassoc nsz arcp contract afn <2 x double> %0, %1
+  %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2
+  %2 = load <2 x double>, ptr %arrayidx2, align 8
+  %mul3 = fmul reassoc nsz arcp contract afn <2 x double> %mul, %2
+  %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3
+  %3 = load <2 x double>, ptr %arrayidx4, align 8
+  %mul5 = fmul reassoc nsz arcp contract afn <2 x double> %mul3, %3
+  %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4
+  %4 = load <2 x double>, ptr %arrayidx6, align 8
+  %mul7 = fmul reassoc nsz arcp contract afn <2 x double> %mul5, %4
+  %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5
+  %5 = load <2 x double>, ptr %arrayidx8, align 8
+  %mul9 = fmul reassoc nsz arcp contract afn <2 x double> %mul7, %5
+  %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6
+  %6 = load <2 x double>, ptr %arrayidx10, align 8
+  %mul11 = fmul reassoc nsz arcp contract afn <2 x double> %mul9, %6
+  %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7
+  %7 = load <2 x double>, ptr %arrayidx12, align 8
+  %mul13 = fmul reassoc nsz arcp contract afn <2 x double> %mul11, %7
+  ret <2 x double> %mul13
+}
+
+define <4 x float> @fun14_fmul(ptr %x) {
+; CHECK-LABEL: fun14_fmul:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r2), 3
+; CHECK-NEXT:    vl %v1, 16(%r2), 3
+; CHECK-NEXT:    vfmsb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 32(%r2), 3
+; CHECK-NEXT:    vl %v2, 48(%r2), 3
+; CHECK-NEXT:    vfmsb %v1, %v1, %v2
+; CHECK-NEXT:    vfmsb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r2), 3
+; CHECK-NEXT:    vl %v2, 80(%r2), 3
+; CHECK-NEXT:    vfmsb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r2), 3
+; CHECK-NEXT:    vfmsb %v1, %v1, %v2
+; CHECK-NEXT:    vfmsb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r2), 3
+; CHECK-NEXT:    vfmsb %v24, %v0, %v1
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load <4 x float>, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1
+  %1 = load <4 x float>, ptr %arrayidx1, align 8
+  %mul = fmul reassoc nsz arcp contract afn <4 x float> %0, %1
+  %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2
+  %2 = load <4 x float>, ptr %arrayidx2, align 8
+  %mul3 = fmul reassoc nsz arcp contract afn <4 x float> %mul, %2
+  %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3
+  %3 = load <4 x float>, ptr %arrayidx4, align 8
+  %mul5 = fmul reassoc nsz arcp contract afn <4 x float> %mul3, %3
+  %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4
+  %4 = load <4 x float>, ptr %arrayidx6, align 8
+  %mul7 = fmul reassoc nsz arcp contract afn <4 x float> %mul5, %4
+  %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5
+  %5 = load <4 x float>, ptr %arrayidx8, align 8
+  %mul9 = fmul reassoc nsz arcp contract afn <4 x float> %mul7, %5
+  %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6
+  %6 = load <4 x float>, ptr %arrayidx10, align 8
+  %mul11 = fmul reassoc nsz arcp contract afn <4 x float> %mul9, %6
+  %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7
+  %7 = load <4 x float>, ptr %arrayidx12, align 8
+  %mul13 = fmul reassoc nsz arcp contract afn <4 x float> %mul11, %7
+  ret <4 x float> %mul13
+}
diff --git a/llvm/test/CodeGen/SystemZ/stackmap.ll b/llvm/test/CodeGen/SystemZ/stackmap.ll
index 88c7336037c9..6156b7f2fc5a 100644
--- a/llvm/test/CodeGen/SystemZ/stackmap.ll
+++ b/llvm/test/CodeGen/SystemZ/stackmap.ll
@@ -38,10 +38,10 @@
 ; CHECK-NEXT:   .quad 160
 ; CHECK-NEXT:   .quad 1
 ; CHECK-NEXT:   .quad spilledValue
-; CHECK-NEXT:   .quad 240
+; CHECK-NEXT:   .quad 160
 ; CHECK-NEXT:   .quad 1
 ; CHECK-NEXT:   .quad spilledStackMapValue
-; CHECK-NEXT:   .quad 200
+; CHECK-NEXT:   .quad 160
 ; CHECK-NEXT:   .quad 1
 ; CHECK-NEXT:   .quad spillSubReg
 ; CHECK-NEXT:   .quad 168
diff --git a/llvm/test/CodeGen/WebAssembly/unreachable.ll b/llvm/test/CodeGen/WebAssembly/unreachable.ll
index 5368c2ba5b8d..ccac31a9af4a 100644
--- a/llvm/test/CodeGen/WebAssembly/unreachable.ll
+++ b/llvm/test/CodeGen/WebAssembly/unreachable.ll
@@ -30,7 +30,6 @@ define void @trap_ret_void() {
 ; CHECK:         .functype trap_ret_void () -> ()
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    unreachable
-; CHECK-NEXT:    # fallthrough-return
 ; CHECK-NEXT:    end_function
   call void @llvm.trap()
   ret void
@@ -54,7 +53,6 @@ define void @trap_unreacheable() {
 ; CHECK:         .functype trap_unreacheable () -> ()
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    unreachable
-; CHECK-NEXT:    unreachable
 ; CHECK-NEXT:    end_function
   call void @llvm.trap()
   unreachable
@@ -94,3 +92,12 @@ define i32 @missing_ret_noreturn_unreachable() {
   call void @ext_never_return()
   unreachable
 }
+
+define i32 @no_crash_for_other_instruction_after_trap(ptr %p, i32 %b) {
+; CHECK-LABEL: no_crash_for_other_instruction_after_trap:
+; CHECK:      unreachable
+; CHECK-NEXT: end_function
+  %a = load i32, ptr %p
+  call void @llvm.trap()
+  ret i32 %a
+}
diff --git a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
index 4686361ad2fc..a0085afbaf02 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
@@ -44,12 +44,8 @@ define dso_local void @test1(ptr%buf) nounwind {
 ; CHECK-NEXT:    tileloadd 3024(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
 ; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm0
 ; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm1
-; CHECK-NEXT:    # implicit-def: $rax
-; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movabsq $64, %rax
 ; CHECK-NEXT:    tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
 ; CHECK-NEXT:    tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
-; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; CHECK-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
 ; CHECK-NEXT:    tilestored %tmm2, (%rbx,%r15)
 ; CHECK-NEXT:    incl %r14d
@@ -111,16 +107,10 @@ define dso_local void @test1(ptr%buf) nounwind {
 ; EGPR-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x04,0xd0,0x0b,0x00,0x00]
 ; EGPR-NEXT:    tileloadd (%rbx,%r15), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x04,0x3b]
 ; EGPR-NEXT:    tileloadd (%rbx,%r15), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x3b]
-; EGPR-NEXT:    # implicit-def: $rax
-; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    # encoding: [0x48,0x89,0x84,0x24,0xb8,0x03,0x00,0x00]
-; EGPR-NEXT:    movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
 ; EGPR-NEXT:    tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
 ; EGPR-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x9c,0x04,0x00,0x04,0x00,0x00]
 ; EGPR-NEXT:    tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
 ; EGPR-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x94,0x24,0x00,0x04,0x00,0x00]
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; EGPR-NEXT:    # encoding: [0x48,0x8b,0x84,0x24,0xb8,0x03,0x00,0x00]
 ; EGPR-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x5e,0xd0]
 ; EGPR-NEXT:    tilestored %tmm2, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x14,0x3b]
 ; EGPR-NEXT:    incl %r14d # encoding: [0x41,0xff,0xc6]
diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll
index dd180b67e492..0c33e8973c2d 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -715,43 +715,41 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin
 ;
 ; SSE42-LABEL: abd_cmp_v2i64_multiuse_cmp:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT:    movdqa %xmm0, %xmm2
+; SSE42-NEXT:    psubq %xmm1, %xmm2
 ; SSE42-NEXT:    movdqa %xmm1, %xmm3
-; SSE42-NEXT:    pxor %xmm2, %xmm3
-; SSE42-NEXT:    pxor %xmm0, %xmm2
-; SSE42-NEXT:    pcmpgtq %xmm3, %xmm2
-; SSE42-NEXT:    movdqa %xmm0, %xmm3
-; SSE42-NEXT:    psubq %xmm1, %xmm3
-; SSE42-NEXT:    psubq %xmm0, %xmm1
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT:    paddq %xmm1, %xmm2
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
+; SSE42-NEXT:    psubq %xmm0, %xmm3
+; SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT:    pxor %xmm4, %xmm1
+; SSE42-NEXT:    pxor %xmm4, %xmm0
+; SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE42-NEXT:    paddq %xmm3, %xmm0
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    # xmm2 = mem[0,0]
-; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpsubq %xmm0, %xmm1, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    # xmm4 = mem[0,0]
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vblendvpd %xmm0, %xmm2, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm3
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
-; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpsubq %xmm0, %xmm1, %xmm3
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %xmm4, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vblendvpd %xmm0, %xmm2, %xmm3, %xmm1
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: abd_cmp_v2i64_multiuse_cmp:
diff --git a/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll b/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
index 017024c173c3..b2cb2c3e04b3 100644
--- a/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
+++ b/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
@@ -52,10 +52,7 @@ alloca_21:
 define i32 @kmovrk_1(<4 x ptr> %arg) {
 ; AVX512-LABEL: kmovrk_1:
 ; AVX512:       # %bb.0: # %bb
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT:    vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
-; AVX512-NEXT:    kmovw %k0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x93,0xc0]
-; AVX512-NEXT:    testb $15, %al # encoding: [0xa8,0x0f]
+; AVX512-NEXT:    vptest %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x17,0xc0]
 ; AVX512-NEXT:    jne .LBB2_1 # encoding: [0x75,A]
 ; AVX512-NEXT:    # fixup A - offset: 1, value: .LBB2_1-1, kind: FK_PCRel_1
 ; AVX512-NEXT:  # %bb.2: # %bb3
@@ -66,10 +63,7 @@ define i32 @kmovrk_1(<4 x ptr> %arg) {
 ;
 ; AVX512BW-LABEL: kmovrk_1:
 ; AVX512BW:       # %bb.0: # %bb
-; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
-; AVX512BW-NEXT:    kmovd %k0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x93,0xc0]
-; AVX512BW-NEXT:    testb $15, %al # encoding: [0xa8,0x0f]
+; AVX512BW-NEXT:    vptest %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x17,0xc0]
 ; AVX512BW-NEXT:    jne .LBB2_1 # encoding: [0x75,A]
 ; AVX512BW-NEXT:    # fixup A - offset: 1, value: .LBB2_1-1, kind: FK_PCRel_1
 ; AVX512BW-NEXT:  # %bb.2: # %bb3
diff --git a/llvm/test/CodeGen/X86/avgceils.ll b/llvm/test/CodeGen/X86/avgceils.ll
index 4529ea275df9..f44f98c2a41a 100644
--- a/llvm/test/CodeGen/X86/avgceils.ll
+++ b/llvm/test/CodeGen/X86/avgceils.ll
@@ -9,7 +9,7 @@
 ; 128-bit vectors
 ;
 
-define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v16i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -65,7 +65,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
   ret <16 x i8> %res
 }
 
-define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v16i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
@@ -165,7 +165,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
   ret <16 x i8> %res
 }
 
-define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v8i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -190,7 +190,7 @@ define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v8i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -289,7 +289,7 @@ define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
   ret <8 x i16> %res
 }
 
-define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v4i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -314,7 +314,7 @@ define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v4i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
@@ -410,7 +410,7 @@ define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 
-define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_fixed_v2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
@@ -472,7 +472,7 @@ define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movq %xmm0, %rax
@@ -574,7 +574,7 @@ define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
 ; 256-bit vectors
 ;
 
-define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v32i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm1, %xmm4
@@ -649,7 +649,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
   ret <32 x i8> %res
 }
 
-define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v32i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
@@ -806,7 +806,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
   ret <32 x i8> %res
 }
 
-define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v16i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm1, %xmm4
@@ -858,7 +858,7 @@ define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v16i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
@@ -1014,7 +1014,7 @@ define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
   ret <16 x i16> %res
 }
 
-define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v8i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm1, %xmm4
@@ -1066,7 +1066,7 @@ define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v8i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
@@ -1218,7 +1218,7 @@ define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
   ret <8 x i32> %res
 }
 
-define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_fixed_v4i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
@@ -1306,27 +1306,15 @@ define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v4i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pushq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
 ; SSE2-NEXT:    pushq %r15
-; SSE2-NEXT:    .cfi_def_cfa_offset 24
 ; SSE2-NEXT:    pushq %r14
-; SSE2-NEXT:    .cfi_def_cfa_offset 32
 ; SSE2-NEXT:    pushq %r13
-; SSE2-NEXT:    .cfi_def_cfa_offset 40
 ; SSE2-NEXT:    pushq %r12
-; SSE2-NEXT:    .cfi_def_cfa_offset 48
 ; SSE2-NEXT:    pushq %rbx
-; SSE2-NEXT:    .cfi_def_cfa_offset 56
-; SSE2-NEXT:    .cfi_offset %rbx, -56
-; SSE2-NEXT:    .cfi_offset %r12, -48
-; SSE2-NEXT:    .cfi_offset %r13, -40
-; SSE2-NEXT:    .cfi_offset %r14, -32
-; SSE2-NEXT:    .cfi_offset %r15, -24
-; SSE2-NEXT:    .cfi_offset %rbp, -16
 ; SSE2-NEXT:    movq %xmm0, %r11
 ; SSE2-NEXT:    movq %r11, %r12
 ; SSE2-NEXT:    sarq $63, %r12
@@ -1382,39 +1370,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; SSE2-NEXT:    popq %rbx
-; SSE2-NEXT:    .cfi_def_cfa_offset 48
 ; SSE2-NEXT:    popq %r12
-; SSE2-NEXT:    .cfi_def_cfa_offset 40
 ; SSE2-NEXT:    popq %r13
-; SSE2-NEXT:    .cfi_def_cfa_offset 32
 ; SSE2-NEXT:    popq %r14
-; SSE2-NEXT:    .cfi_def_cfa_offset 24
 ; SSE2-NEXT:    popq %r15
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
 ; SSE2-NEXT:    popq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_ext_v4i64:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pushq %rbp
-; SSE4-NEXT:    .cfi_def_cfa_offset 16
 ; SSE4-NEXT:    pushq %r15
-; SSE4-NEXT:    .cfi_def_cfa_offset 24
 ; SSE4-NEXT:    pushq %r14
-; SSE4-NEXT:    .cfi_def_cfa_offset 32
 ; SSE4-NEXT:    pushq %r13
-; SSE4-NEXT:    .cfi_def_cfa_offset 40
 ; SSE4-NEXT:    pushq %r12
-; SSE4-NEXT:    .cfi_def_cfa_offset 48
 ; SSE4-NEXT:    pushq %rbx
-; SSE4-NEXT:    .cfi_def_cfa_offset 56
-; SSE4-NEXT:    .cfi_offset %rbx, -56
-; SSE4-NEXT:    .cfi_offset %r12, -48
-; SSE4-NEXT:    .cfi_offset %r13, -40
-; SSE4-NEXT:    .cfi_offset %r14, -32
-; SSE4-NEXT:    .cfi_offset %r15, -24
-; SSE4-NEXT:    .cfi_offset %rbp, -16
 ; SSE4-NEXT:    pextrq $1, %xmm0, %r11
 ; SSE4-NEXT:    movq %r11, %r12
 ; SSE4-NEXT:    sarq $63, %r12
@@ -1466,39 +1436,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; SSE4-NEXT:    popq %rbx
-; SSE4-NEXT:    .cfi_def_cfa_offset 48
 ; SSE4-NEXT:    popq %r12
-; SSE4-NEXT:    .cfi_def_cfa_offset 40
 ; SSE4-NEXT:    popq %r13
-; SSE4-NEXT:    .cfi_def_cfa_offset 32
 ; SSE4-NEXT:    popq %r14
-; SSE4-NEXT:    .cfi_def_cfa_offset 24
 ; SSE4-NEXT:    popq %r15
-; SSE4-NEXT:    .cfi_def_cfa_offset 16
 ; SSE4-NEXT:    popq %rbp
-; SSE4-NEXT:    .cfi_def_cfa_offset 8
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_ext_v4i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; AVX1-NEXT:    pushq %r15
-; AVX1-NEXT:    .cfi_def_cfa_offset 24
 ; AVX1-NEXT:    pushq %r14
-; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    pushq %r13
-; AVX1-NEXT:    .cfi_def_cfa_offset 40
 ; AVX1-NEXT:    pushq %r12
-; AVX1-NEXT:    .cfi_def_cfa_offset 48
 ; AVX1-NEXT:    pushq %rbx
-; AVX1-NEXT:    .cfi_def_cfa_offset 56
-; AVX1-NEXT:    .cfi_offset %rbx, -56
-; AVX1-NEXT:    .cfi_offset %r12, -48
-; AVX1-NEXT:    .cfi_offset %r13, -40
-; AVX1-NEXT:    .cfi_offset %r14, -32
-; AVX1-NEXT:    .cfi_offset %r15, -24
-; AVX1-NEXT:    .cfi_offset %rbp, -16
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpextrq $1, %xmm2, %r11
 ; AVX1-NEXT:    movq %r11, %r12
@@ -1553,39 +1505,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    popq %rbx
-; AVX1-NEXT:    .cfi_def_cfa_offset 48
 ; AVX1-NEXT:    popq %r12
-; AVX1-NEXT:    .cfi_def_cfa_offset 40
 ; AVX1-NEXT:    popq %r13
-; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    popq %r14
-; AVX1-NEXT:    .cfi_def_cfa_offset 24
 ; AVX1-NEXT:    popq %r15
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; AVX1-NEXT:    popq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_ext_v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
 ; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    .cfi_def_cfa_offset 24
 ; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    .cfi_def_cfa_offset 32
 ; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    .cfi_def_cfa_offset 40
 ; AVX2-NEXT:    pushq %r12
-; AVX2-NEXT:    .cfi_def_cfa_offset 48
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    .cfi_def_cfa_offset 56
-; AVX2-NEXT:    .cfi_offset %rbx, -56
-; AVX2-NEXT:    .cfi_offset %r12, -48
-; AVX2-NEXT:    .cfi_offset %r13, -40
-; AVX2-NEXT:    .cfi_offset %r14, -32
-; AVX2-NEXT:    .cfi_offset %r15, -24
-; AVX2-NEXT:    .cfi_offset %rbp, -16
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX2-NEXT:    vpextrq $1, %xmm2, %r11
 ; AVX2-NEXT:    movq %r11, %r12
@@ -1640,39 +1574,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    .cfi_def_cfa_offset 48
 ; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    .cfi_def_cfa_offset 40
 ; AVX2-NEXT:    popq %r13
-; AVX2-NEXT:    .cfi_def_cfa_offset 32
 ; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    .cfi_def_cfa_offset 24
 ; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
 ; AVX2-NEXT:    popq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 8
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_ext_v4i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
 ; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    .cfi_def_cfa_offset 24
 ; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    .cfi_def_cfa_offset 32
 ; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    .cfi_def_cfa_offset 40
 ; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    .cfi_def_cfa_offset 48
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    .cfi_def_cfa_offset 56
-; AVX512-NEXT:    .cfi_offset %rbx, -56
-; AVX512-NEXT:    .cfi_offset %r12, -48
-; AVX512-NEXT:    .cfi_offset %r13, -40
-; AVX512-NEXT:    .cfi_offset %r14, -32
-; AVX512-NEXT:    .cfi_offset %r15, -24
-; AVX512-NEXT:    .cfi_offset %rbp, -16
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vpextrq $1, %xmm2, %r11
 ; AVX512-NEXT:    movq %r11, %r12
@@ -1727,17 +1643,11 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    .cfi_def_cfa_offset 48
 ; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    .cfi_def_cfa_offset 40
 ; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    .cfi_def_cfa_offset 32
 ; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    .cfi_def_cfa_offset 24
 ; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
 ; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512-NEXT:    retq
   %x0 = sext <4 x i64> %a0 to <4 x i128>
   %x1 = sext <4 x i64> %a1 to <4 x i128>
@@ -1752,7 +1662,7 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; 512-bit vectors
 ;
 
-define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v64i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm3, %xmm11
@@ -1864,7 +1774,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
   ret <64 x i8> %res
 }
 
-define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v64i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm3, %xmm8
@@ -2144,7 +2054,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
   ret <64 x i8> %res
 }
 
-define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v32i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm3, %xmm8
@@ -2220,7 +2130,7 @@ define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v32i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
@@ -2498,7 +2408,7 @@ define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
   ret <32 x i16> %res
 }
 
-define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v16i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm3, %xmm8
@@ -2574,7 +2484,7 @@ define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v16i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
@@ -2848,7 +2758,7 @@ define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
   ret <16 x i32> %res
 }
 
-define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_fixed_v8i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm3, %xmm8
@@ -2985,29 +2895,16 @@ define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v8i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pushq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
 ; SSE2-NEXT:    pushq %r15
-; SSE2-NEXT:    .cfi_def_cfa_offset 24
 ; SSE2-NEXT:    pushq %r14
-; SSE2-NEXT:    .cfi_def_cfa_offset 32
 ; SSE2-NEXT:    pushq %r13
-; SSE2-NEXT:    .cfi_def_cfa_offset 40
 ; SSE2-NEXT:    pushq %r12
-; SSE2-NEXT:    .cfi_def_cfa_offset 48
 ; SSE2-NEXT:    pushq %rbx
-; SSE2-NEXT:    .cfi_def_cfa_offset 56
 ; SSE2-NEXT:    pushq %rax
-; SSE2-NEXT:    .cfi_def_cfa_offset 64
-; SSE2-NEXT:    .cfi_offset %rbx, -56
-; SSE2-NEXT:    .cfi_offset %r12, -48
-; SSE2-NEXT:    .cfi_offset %r13, -40
-; SSE2-NEXT:    .cfi_offset %r14, -32
-; SSE2-NEXT:    .cfi_offset %r15, -24
-; SSE2-NEXT:    .cfi_offset %rbp, -16
 ; SSE2-NEXT:    movq %xmm0, %rax
 ; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE2-NEXT:    sarq $63, %rax
@@ -3137,43 +3034,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
 ; SSE2-NEXT:    addq $8, %rsp
-; SSE2-NEXT:    .cfi_def_cfa_offset 56
 ; SSE2-NEXT:    popq %rbx
-; SSE2-NEXT:    .cfi_def_cfa_offset 48
 ; SSE2-NEXT:    popq %r12
-; SSE2-NEXT:    .cfi_def_cfa_offset 40
 ; SSE2-NEXT:    popq %r13
-; SSE2-NEXT:    .cfi_def_cfa_offset 32
 ; SSE2-NEXT:    popq %r14
-; SSE2-NEXT:    .cfi_def_cfa_offset 24
 ; SSE2-NEXT:    popq %r15
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
 ; SSE2-NEXT:    popq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_ext_v8i64:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pushq %rbp
-; SSE4-NEXT:    .cfi_def_cfa_offset 16
 ; SSE4-NEXT:    pushq %r15
-; SSE4-NEXT:    .cfi_def_cfa_offset 24
 ; SSE4-NEXT:    pushq %r14
-; SSE4-NEXT:    .cfi_def_cfa_offset 32
 ; SSE4-NEXT:    pushq %r13
-; SSE4-NEXT:    .cfi_def_cfa_offset 40
 ; SSE4-NEXT:    pushq %r12
-; SSE4-NEXT:    .cfi_def_cfa_offset 48
 ; SSE4-NEXT:    pushq %rbx
-; SSE4-NEXT:    .cfi_def_cfa_offset 56
 ; SSE4-NEXT:    subq $16, %rsp
-; SSE4-NEXT:    .cfi_def_cfa_offset 72
-; SSE4-NEXT:    .cfi_offset %rbx, -56
-; SSE4-NEXT:    .cfi_offset %r12, -48
-; SSE4-NEXT:    .cfi_offset %r13, -40
-; SSE4-NEXT:    .cfi_offset %r14, -32
-; SSE4-NEXT:    .cfi_offset %r15, -24
-; SSE4-NEXT:    .cfi_offset %rbp, -16
 ; SSE4-NEXT:    pextrq $1, %xmm0, %rax
 ; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE4-NEXT:    sarq $63, %rax
@@ -3301,43 +3178,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
 ; SSE4-NEXT:    addq $16, %rsp
-; SSE4-NEXT:    .cfi_def_cfa_offset 56
 ; SSE4-NEXT:    popq %rbx
-; SSE4-NEXT:    .cfi_def_cfa_offset 48
 ; SSE4-NEXT:    popq %r12
-; SSE4-NEXT:    .cfi_def_cfa_offset 40
 ; SSE4-NEXT:    popq %r13
-; SSE4-NEXT:    .cfi_def_cfa_offset 32
 ; SSE4-NEXT:    popq %r14
-; SSE4-NEXT:    .cfi_def_cfa_offset 24
 ; SSE4-NEXT:    popq %r15
-; SSE4-NEXT:    .cfi_def_cfa_offset 16
 ; SSE4-NEXT:    popq %rbp
-; SSE4-NEXT:    .cfi_def_cfa_offset 8
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_ext_v8i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; AVX1-NEXT:    pushq %r15
-; AVX1-NEXT:    .cfi_def_cfa_offset 24
 ; AVX1-NEXT:    pushq %r14
-; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    pushq %r13
-; AVX1-NEXT:    .cfi_def_cfa_offset 40
 ; AVX1-NEXT:    pushq %r12
-; AVX1-NEXT:    .cfi_def_cfa_offset 48
 ; AVX1-NEXT:    pushq %rbx
-; AVX1-NEXT:    .cfi_def_cfa_offset 56
 ; AVX1-NEXT:    pushq %rax
-; AVX1-NEXT:    .cfi_def_cfa_offset 64
-; AVX1-NEXT:    .cfi_offset %rbx, -56
-; AVX1-NEXT:    .cfi_offset %r12, -48
-; AVX1-NEXT:    .cfi_offset %r13, -40
-; AVX1-NEXT:    .cfi_offset %r14, -32
-; AVX1-NEXT:    .cfi_offset %r15, -24
-; AVX1-NEXT:    .cfi_offset %rbp, -16
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -3465,43 +3322,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    addq $8, %rsp
-; AVX1-NEXT:    .cfi_def_cfa_offset 56
 ; AVX1-NEXT:    popq %rbx
-; AVX1-NEXT:    .cfi_def_cfa_offset 48
 ; AVX1-NEXT:    popq %r12
-; AVX1-NEXT:    .cfi_def_cfa_offset 40
 ; AVX1-NEXT:    popq %r13
-; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    popq %r14
-; AVX1-NEXT:    .cfi_def_cfa_offset 24
 ; AVX1-NEXT:    popq %r15
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; AVX1-NEXT:    popq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_ext_v8i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
 ; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    .cfi_def_cfa_offset 24
 ; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    .cfi_def_cfa_offset 32
 ; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    .cfi_def_cfa_offset 40
 ; AVX2-NEXT:    pushq %r12
-; AVX2-NEXT:    .cfi_def_cfa_offset 48
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    .cfi_def_cfa_offset 56
 ; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    .cfi_def_cfa_offset 64
-; AVX2-NEXT:    .cfi_offset %rbx, -56
-; AVX2-NEXT:    .cfi_offset %r12, -48
-; AVX2-NEXT:    .cfi_offset %r13, -40
-; AVX2-NEXT:    .cfi_offset %r14, -32
-; AVX2-NEXT:    .cfi_offset %r15, -24
-; AVX2-NEXT:    .cfi_offset %rbp, -16
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
 ; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -3629,43 +3466,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX2-NEXT:    addq $8, %rsp
-; AVX2-NEXT:    .cfi_def_cfa_offset 56
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    .cfi_def_cfa_offset 48
 ; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    .cfi_def_cfa_offset 40
 ; AVX2-NEXT:    popq %r13
-; AVX2-NEXT:    .cfi_def_cfa_offset 32
 ; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    .cfi_def_cfa_offset 24
 ; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
 ; AVX2-NEXT:    popq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 8
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_ext_v8i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
 ; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    .cfi_def_cfa_offset 24
 ; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    .cfi_def_cfa_offset 32
 ; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    .cfi_def_cfa_offset 40
 ; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    .cfi_def_cfa_offset 48
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    .cfi_def_cfa_offset 56
 ; AVX512-NEXT:    pushq %rax
-; AVX512-NEXT:    .cfi_def_cfa_offset 64
-; AVX512-NEXT:    .cfi_offset %rbx, -56
-; AVX512-NEXT:    .cfi_offset %r12, -48
-; AVX512-NEXT:    .cfi_offset %r13, -40
-; AVX512-NEXT:    .cfi_offset %r14, -32
-; AVX512-NEXT:    .cfi_offset %r15, -24
-; AVX512-NEXT:    .cfi_offset %rbp, -16
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
@@ -3796,19 +3613,12 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    addq $8, %rsp
-; AVX512-NEXT:    .cfi_def_cfa_offset 56
 ; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    .cfi_def_cfa_offset 48
 ; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    .cfi_def_cfa_offset 40
 ; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    .cfi_def_cfa_offset 32
 ; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    .cfi_def_cfa_offset 24
 ; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
 ; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512-NEXT:    retq
   %x0 = sext <8 x i64> %a0 to <8 x i128>
   %x1 = sext <8 x i64> %a1 to <8 x i128>
diff --git a/llvm/test/CodeGen/X86/avgceilu.ll b/llvm/test/CodeGen/X86/avgceilu.ll
index dee1a5a720f9..d34894cc0fbb 100644
--- a/llvm/test/CodeGen/X86/avgceilu.ll
+++ b/llvm/test/CodeGen/X86/avgceilu.ll
@@ -9,7 +9,7 @@
 ; 128-bit vectors
 ;
 
-define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v16i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pavgb %xmm1, %xmm0
@@ -26,7 +26,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
   ret <16 x i8> %res
 }
 
-define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
 ; SSE-LABEL: test_ext_v16i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pavgb %xmm1, %xmm0
@@ -45,7 +45,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
   ret <16 x i8> %res
 }
 
-define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v8i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pavgw %xmm1, %xmm0
@@ -62,7 +62,7 @@ define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
 ; SSE-LABEL: test_ext_v8i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pavgw %xmm1, %xmm0
@@ -81,7 +81,7 @@ define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
   ret <8 x i16> %res
 }
 
-define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v4i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -106,7 +106,7 @@ define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v4i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
@@ -195,7 +195,7 @@ define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 
-define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v2i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -220,7 +220,7 @@ define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
@@ -310,7 +310,7 @@ define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
 ; 256-bit vectors
 ;
 
-define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v32i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pavgb %xmm2, %xmm0
@@ -342,7 +342,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
   ret <32 x i8> %res
 }
 
-define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; SSE-LABEL: test_ext_v32i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pavgb %xmm2, %xmm0
@@ -376,7 +376,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
   ret <32 x i8> %res
 }
 
-define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v16i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pavgw %xmm2, %xmm0
@@ -408,7 +408,7 @@ define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; SSE-LABEL: test_ext_v16i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pavgw %xmm2, %xmm0
@@ -442,7 +442,7 @@ define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
   ret <16 x i16> %res
 }
 
-define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v8i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm1, %xmm4
@@ -494,7 +494,7 @@ define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v8i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
@@ -629,7 +629,7 @@ define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
   ret <8 x i32> %res
 }
 
-define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v4i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm1, %xmm4
@@ -681,7 +681,7 @@ define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v4i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
@@ -937,7 +937,7 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; 512-bit vectors
 ;
 
-define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v64i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pavgb %xmm4, %xmm0
@@ -977,7 +977,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
   ret <64 x i8> %res
 }
 
-define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; SSE-LABEL: test_ext_v64i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pavgb %xmm4, %xmm0
@@ -1019,7 +1019,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
   ret <64 x i8> %res
 }
 
-define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v32i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pavgw %xmm4, %xmm0
@@ -1059,7 +1059,7 @@ define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; SSE-LABEL: test_ext_v32i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pavgw %xmm4, %xmm0
@@ -1101,7 +1101,7 @@ define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
   ret <32 x i16> %res
 }
 
-define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v16i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm3, %xmm8
@@ -1177,7 +1177,7 @@ define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v16i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm2, %xmm8
@@ -1413,7 +1413,7 @@ define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
   ret <16 x i32> %res
 }
 
-define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v8i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm3, %xmm8
@@ -1489,27 +1489,15 @@ define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v8i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pushq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
 ; SSE2-NEXT:    pushq %r15
-; SSE2-NEXT:    .cfi_def_cfa_offset 24
 ; SSE2-NEXT:    pushq %r14
-; SSE2-NEXT:    .cfi_def_cfa_offset 32
 ; SSE2-NEXT:    pushq %r13
-; SSE2-NEXT:    .cfi_def_cfa_offset 40
 ; SSE2-NEXT:    pushq %r12
-; SSE2-NEXT:    .cfi_def_cfa_offset 48
 ; SSE2-NEXT:    pushq %rbx
-; SSE2-NEXT:    .cfi_def_cfa_offset 56
-; SSE2-NEXT:    .cfi_offset %rbx, -56
-; SSE2-NEXT:    .cfi_offset %r12, -48
-; SSE2-NEXT:    .cfi_offset %r13, -40
-; SSE2-NEXT:    .cfi_offset %r14, -32
-; SSE2-NEXT:    .cfi_offset %r15, -24
-; SSE2-NEXT:    .cfi_offset %rbp, -16
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm8, %rcx
 ; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1617,39 +1605,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
 ; SSE2-NEXT:    popq %rbx
-; SSE2-NEXT:    .cfi_def_cfa_offset 48
 ; SSE2-NEXT:    popq %r12
-; SSE2-NEXT:    .cfi_def_cfa_offset 40
 ; SSE2-NEXT:    popq %r13
-; SSE2-NEXT:    .cfi_def_cfa_offset 32
 ; SSE2-NEXT:    popq %r14
-; SSE2-NEXT:    .cfi_def_cfa_offset 24
 ; SSE2-NEXT:    popq %r15
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
 ; SSE2-NEXT:    popq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_ext_v8i64:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pushq %rbp
-; SSE4-NEXT:    .cfi_def_cfa_offset 16
 ; SSE4-NEXT:    pushq %r15
-; SSE4-NEXT:    .cfi_def_cfa_offset 24
 ; SSE4-NEXT:    pushq %r14
-; SSE4-NEXT:    .cfi_def_cfa_offset 32
 ; SSE4-NEXT:    pushq %r13
-; SSE4-NEXT:    .cfi_def_cfa_offset 40
 ; SSE4-NEXT:    pushq %r12
-; SSE4-NEXT:    .cfi_def_cfa_offset 48
 ; SSE4-NEXT:    pushq %rbx
-; SSE4-NEXT:    .cfi_def_cfa_offset 56
-; SSE4-NEXT:    .cfi_offset %rbx, -56
-; SSE4-NEXT:    .cfi_offset %r12, -48
-; SSE4-NEXT:    .cfi_offset %r13, -40
-; SSE4-NEXT:    .cfi_offset %r14, -32
-; SSE4-NEXT:    .cfi_offset %r15, -24
-; SSE4-NEXT:    .cfi_offset %rbp, -16
 ; SSE4-NEXT:    movq %xmm3, %rcx
 ; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE4-NEXT:    movq %xmm7, %rdx
@@ -1747,39 +1717,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
 ; SSE4-NEXT:    popq %rbx
-; SSE4-NEXT:    .cfi_def_cfa_offset 48
 ; SSE4-NEXT:    popq %r12
-; SSE4-NEXT:    .cfi_def_cfa_offset 40
 ; SSE4-NEXT:    popq %r13
-; SSE4-NEXT:    .cfi_def_cfa_offset 32
 ; SSE4-NEXT:    popq %r14
-; SSE4-NEXT:    .cfi_def_cfa_offset 24
 ; SSE4-NEXT:    popq %r15
-; SSE4-NEXT:    .cfi_def_cfa_offset 16
 ; SSE4-NEXT:    popq %rbp
-; SSE4-NEXT:    .cfi_def_cfa_offset 8
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_ext_v8i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; AVX1-NEXT:    pushq %r15
-; AVX1-NEXT:    .cfi_def_cfa_offset 24
 ; AVX1-NEXT:    pushq %r14
-; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    pushq %r13
-; AVX1-NEXT:    .cfi_def_cfa_offset 40
 ; AVX1-NEXT:    pushq %r12
-; AVX1-NEXT:    .cfi_def_cfa_offset 48
 ; AVX1-NEXT:    pushq %rbx
-; AVX1-NEXT:    .cfi_def_cfa_offset 56
-; AVX1-NEXT:    .cfi_offset %rbx, -56
-; AVX1-NEXT:    .cfi_offset %r12, -48
-; AVX1-NEXT:    .cfi_offset %r13, -40
-; AVX1-NEXT:    .cfi_offset %r14, -32
-; AVX1-NEXT:    .cfi_offset %r15, -24
-; AVX1-NEXT:    .cfi_offset %rbp, -16
 ; AVX1-NEXT:    vmovq %xmm1, %rcx
 ; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    vmovq %xmm3, %rdx
@@ -1885,39 +1837,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    popq %rbx
-; AVX1-NEXT:    .cfi_def_cfa_offset 48
 ; AVX1-NEXT:    popq %r12
-; AVX1-NEXT:    .cfi_def_cfa_offset 40
 ; AVX1-NEXT:    popq %r13
-; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    popq %r14
-; AVX1-NEXT:    .cfi_def_cfa_offset 24
 ; AVX1-NEXT:    popq %r15
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; AVX1-NEXT:    popq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_ext_v8i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
 ; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    .cfi_def_cfa_offset 24
 ; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    .cfi_def_cfa_offset 32
 ; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    .cfi_def_cfa_offset 40
 ; AVX2-NEXT:    pushq %r12
-; AVX2-NEXT:    .cfi_def_cfa_offset 48
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    .cfi_def_cfa_offset 56
-; AVX2-NEXT:    .cfi_offset %rbx, -56
-; AVX2-NEXT:    .cfi_offset %r12, -48
-; AVX2-NEXT:    .cfi_offset %r13, -40
-; AVX2-NEXT:    .cfi_offset %r14, -32
-; AVX2-NEXT:    .cfi_offset %r15, -24
-; AVX2-NEXT:    .cfi_offset %rbp, -16
 ; AVX2-NEXT:    vmovq %xmm1, %rcx
 ; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    vmovq %xmm3, %rdx
@@ -2023,39 +1957,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    .cfi_def_cfa_offset 48
 ; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    .cfi_def_cfa_offset 40
 ; AVX2-NEXT:    popq %r13
-; AVX2-NEXT:    .cfi_def_cfa_offset 32
 ; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    .cfi_def_cfa_offset 24
 ; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
 ; AVX2-NEXT:    popq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 8
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_ext_v8i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
 ; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    .cfi_def_cfa_offset 24
 ; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    .cfi_def_cfa_offset 32
 ; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    .cfi_def_cfa_offset 40
 ; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    .cfi_def_cfa_offset 48
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    .cfi_def_cfa_offset 56
-; AVX512-NEXT:    .cfi_offset %rbx, -56
-; AVX512-NEXT:    .cfi_offset %r12, -48
-; AVX512-NEXT:    .cfi_offset %r13, -40
-; AVX512-NEXT:    .cfi_offset %r14, -32
-; AVX512-NEXT:    .cfi_offset %r15, -24
-; AVX512-NEXT:    .cfi_offset %rbp, -16
 ; AVX512-NEXT:    vmovq %xmm0, %rcx
 ; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    vmovq %xmm1, %rdx
@@ -2164,17 +2080,11 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    .cfi_def_cfa_offset 48
 ; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    .cfi_def_cfa_offset 40
 ; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    .cfi_def_cfa_offset 32
 ; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    .cfi_def_cfa_offset 24
 ; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
 ; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512-NEXT:    retq
   %x0 = zext <8 x i64> %a0 to <8 x i128>
   %x1 = zext <8 x i64> %a1 to <8 x i128>
diff --git a/llvm/test/CodeGen/X86/avgfloors.ll b/llvm/test/CodeGen/X86/avgfloors.ll
index a3864ab4bb44..efee831a15c7 100644
--- a/llvm/test/CodeGen/X86/avgfloors.ll
+++ b/llvm/test/CodeGen/X86/avgfloors.ll
@@ -9,7 +9,7 @@
 ; 128-bit vectors
 ;
 
-define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v16i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -64,7 +64,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
   ret <16 x i8> %res
 }
 
-define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v16i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -150,7 +150,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
   ret <16 x i8> %res
 }
 
-define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v8i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -174,7 +174,7 @@ define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v8i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -259,7 +259,7 @@ define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
   ret <8 x i16> %res
 }
 
-define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v4i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -283,7 +283,7 @@ define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v4i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
@@ -365,7 +365,7 @@ define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 
-define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_fixed_v2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
@@ -425,7 +425,7 @@ define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
@@ -514,7 +514,7 @@ define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
 ; 256-bit vectors
 ;
 
-define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v32i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm1, %xmm4
@@ -587,7 +587,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
   ret <32 x i8> %res
 }
 
-define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v32i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
@@ -723,7 +723,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
   ret <32 x i8> %res
 }
 
-define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v16i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm1, %xmm4
@@ -773,7 +773,7 @@ define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v16i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
@@ -908,7 +908,7 @@ define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
   ret <16 x i16> %res
 }
 
-define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v8i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm1, %xmm4
@@ -958,7 +958,7 @@ define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v8i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
@@ -1089,7 +1089,7 @@ define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
   ret <8 x i32> %res
 }
 
-define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_fixed_v4i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
@@ -1173,27 +1173,15 @@ define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v4i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pushq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
 ; SSE2-NEXT:    pushq %r15
-; SSE2-NEXT:    .cfi_def_cfa_offset 24
 ; SSE2-NEXT:    pushq %r14
-; SSE2-NEXT:    .cfi_def_cfa_offset 32
 ; SSE2-NEXT:    pushq %r13
-; SSE2-NEXT:    .cfi_def_cfa_offset 40
 ; SSE2-NEXT:    pushq %r12
-; SSE2-NEXT:    .cfi_def_cfa_offset 48
 ; SSE2-NEXT:    pushq %rbx
-; SSE2-NEXT:    .cfi_def_cfa_offset 56
-; SSE2-NEXT:    .cfi_offset %rbx, -56
-; SSE2-NEXT:    .cfi_offset %r12, -48
-; SSE2-NEXT:    .cfi_offset %r13, -40
-; SSE2-NEXT:    .cfi_offset %r14, -32
-; SSE2-NEXT:    .cfi_offset %r15, -24
-; SSE2-NEXT:    .cfi_offset %rbp, -16
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm4, %rdx
 ; SSE2-NEXT:    movq %rdx, %r14
@@ -1241,39 +1229,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; SSE2-NEXT:    popq %rbx
-; SSE2-NEXT:    .cfi_def_cfa_offset 48
 ; SSE2-NEXT:    popq %r12
-; SSE2-NEXT:    .cfi_def_cfa_offset 40
 ; SSE2-NEXT:    popq %r13
-; SSE2-NEXT:    .cfi_def_cfa_offset 32
 ; SSE2-NEXT:    popq %r14
-; SSE2-NEXT:    .cfi_def_cfa_offset 24
 ; SSE2-NEXT:    popq %r15
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
 ; SSE2-NEXT:    popq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_ext_v4i64:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pushq %rbp
-; SSE4-NEXT:    .cfi_def_cfa_offset 16
 ; SSE4-NEXT:    pushq %r15
-; SSE4-NEXT:    .cfi_def_cfa_offset 24
 ; SSE4-NEXT:    pushq %r14
-; SSE4-NEXT:    .cfi_def_cfa_offset 32
 ; SSE4-NEXT:    pushq %r13
-; SSE4-NEXT:    .cfi_def_cfa_offset 40
 ; SSE4-NEXT:    pushq %r12
-; SSE4-NEXT:    .cfi_def_cfa_offset 48
 ; SSE4-NEXT:    pushq %rbx
-; SSE4-NEXT:    .cfi_def_cfa_offset 56
-; SSE4-NEXT:    .cfi_offset %rbx, -56
-; SSE4-NEXT:    .cfi_offset %r12, -48
-; SSE4-NEXT:    .cfi_offset %r13, -40
-; SSE4-NEXT:    .cfi_offset %r14, -32
-; SSE4-NEXT:    .cfi_offset %r15, -24
-; SSE4-NEXT:    .cfi_offset %rbp, -16
 ; SSE4-NEXT:    movq %xmm1, %rdi
 ; SSE4-NEXT:    movq %rdi, %r14
 ; SSE4-NEXT:    sarq $63, %r14
@@ -1317,39 +1287,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; SSE4-NEXT:    popq %rbx
-; SSE4-NEXT:    .cfi_def_cfa_offset 48
 ; SSE4-NEXT:    popq %r12
-; SSE4-NEXT:    .cfi_def_cfa_offset 40
 ; SSE4-NEXT:    popq %r13
-; SSE4-NEXT:    .cfi_def_cfa_offset 32
 ; SSE4-NEXT:    popq %r14
-; SSE4-NEXT:    .cfi_def_cfa_offset 24
 ; SSE4-NEXT:    popq %r15
-; SSE4-NEXT:    .cfi_def_cfa_offset 16
 ; SSE4-NEXT:    popq %rbp
-; SSE4-NEXT:    .cfi_def_cfa_offset 8
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_ext_v4i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; AVX1-NEXT:    pushq %r15
-; AVX1-NEXT:    .cfi_def_cfa_offset 24
 ; AVX1-NEXT:    pushq %r14
-; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    pushq %r13
-; AVX1-NEXT:    .cfi_def_cfa_offset 40
 ; AVX1-NEXT:    pushq %r12
-; AVX1-NEXT:    .cfi_def_cfa_offset 48
 ; AVX1-NEXT:    pushq %rbx
-; AVX1-NEXT:    .cfi_def_cfa_offset 56
-; AVX1-NEXT:    .cfi_offset %rbx, -56
-; AVX1-NEXT:    .cfi_offset %r12, -48
-; AVX1-NEXT:    .cfi_offset %r13, -40
-; AVX1-NEXT:    .cfi_offset %r14, -32
-; AVX1-NEXT:    .cfi_offset %r15, -24
-; AVX1-NEXT:    .cfi_offset %rbp, -16
 ; AVX1-NEXT:    vmovq %xmm0, %rdx
 ; AVX1-NEXT:    movq %rdx, %r14
 ; AVX1-NEXT:    sarq $63, %r14
@@ -1396,39 +1348,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    popq %rbx
-; AVX1-NEXT:    .cfi_def_cfa_offset 48
 ; AVX1-NEXT:    popq %r12
-; AVX1-NEXT:    .cfi_def_cfa_offset 40
 ; AVX1-NEXT:    popq %r13
-; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    popq %r14
-; AVX1-NEXT:    .cfi_def_cfa_offset 24
 ; AVX1-NEXT:    popq %r15
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; AVX1-NEXT:    popq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_ext_v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
 ; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    .cfi_def_cfa_offset 24
 ; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    .cfi_def_cfa_offset 32
 ; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    .cfi_def_cfa_offset 40
 ; AVX2-NEXT:    pushq %r12
-; AVX2-NEXT:    .cfi_def_cfa_offset 48
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    .cfi_def_cfa_offset 56
-; AVX2-NEXT:    .cfi_offset %rbx, -56
-; AVX2-NEXT:    .cfi_offset %r12, -48
-; AVX2-NEXT:    .cfi_offset %r13, -40
-; AVX2-NEXT:    .cfi_offset %r14, -32
-; AVX2-NEXT:    .cfi_offset %r15, -24
-; AVX2-NEXT:    .cfi_offset %rbp, -16
 ; AVX2-NEXT:    vmovq %xmm0, %rdx
 ; AVX2-NEXT:    movq %rdx, %r14
 ; AVX2-NEXT:    sarq $63, %r14
@@ -1475,39 +1409,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    .cfi_def_cfa_offset 48
 ; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    .cfi_def_cfa_offset 40
 ; AVX2-NEXT:    popq %r13
-; AVX2-NEXT:    .cfi_def_cfa_offset 32
 ; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    .cfi_def_cfa_offset 24
 ; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
 ; AVX2-NEXT:    popq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 8
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_ext_v4i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
 ; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    .cfi_def_cfa_offset 24
 ; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    .cfi_def_cfa_offset 32
 ; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    .cfi_def_cfa_offset 40
 ; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    .cfi_def_cfa_offset 48
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    .cfi_def_cfa_offset 56
-; AVX512-NEXT:    .cfi_offset %rbx, -56
-; AVX512-NEXT:    .cfi_offset %r12, -48
-; AVX512-NEXT:    .cfi_offset %r13, -40
-; AVX512-NEXT:    .cfi_offset %r14, -32
-; AVX512-NEXT:    .cfi_offset %r15, -24
-; AVX512-NEXT:    .cfi_offset %rbp, -16
 ; AVX512-NEXT:    vmovq %xmm0, %rdx
 ; AVX512-NEXT:    movq %rdx, %r14
 ; AVX512-NEXT:    sarq $63, %r14
@@ -1554,17 +1470,11 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    .cfi_def_cfa_offset 48
 ; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    .cfi_def_cfa_offset 40
 ; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    .cfi_def_cfa_offset 32
 ; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    .cfi_def_cfa_offset 24
 ; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
 ; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512-NEXT:    retq
   %x0 = sext <4 x i64> %a0 to <4 x i128>
   %x1 = sext <4 x i64> %a1 to <4 x i128>
@@ -1578,7 +1488,7 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; 512-bit vectors
 ;
 
-define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v64i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm3, %xmm10
@@ -1690,7 +1600,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
   ret <64 x i8> %res
 }
 
-define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v64i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
@@ -1934,7 +1844,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
   ret <64 x i8> %res
 }
 
-define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v32i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm3, %xmm8
@@ -2007,7 +1917,7 @@ define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v32i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm3, %xmm9
@@ -2251,7 +2161,7 @@ define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
   ret <32 x i16> %res
 }
 
-define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v16i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm3, %xmm8
@@ -2324,7 +2234,7 @@ define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v16i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm8, %xmm8
@@ -2561,7 +2471,7 @@ define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
   ret <16 x i32> %res
 }
 
-define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_fixed_v8i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm3, %xmm11
@@ -2698,29 +2608,16 @@ define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v8i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pushq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
 ; SSE2-NEXT:    pushq %r15
-; SSE2-NEXT:    .cfi_def_cfa_offset 24
 ; SSE2-NEXT:    pushq %r14
-; SSE2-NEXT:    .cfi_def_cfa_offset 32
 ; SSE2-NEXT:    pushq %r13
-; SSE2-NEXT:    .cfi_def_cfa_offset 40
 ; SSE2-NEXT:    pushq %r12
-; SSE2-NEXT:    .cfi_def_cfa_offset 48
 ; SSE2-NEXT:    pushq %rbx
-; SSE2-NEXT:    .cfi_def_cfa_offset 56
 ; SSE2-NEXT:    pushq %rax
-; SSE2-NEXT:    .cfi_def_cfa_offset 64
-; SSE2-NEXT:    .cfi_offset %rbx, -56
-; SSE2-NEXT:    .cfi_offset %r12, -48
-; SSE2-NEXT:    .cfi_offset %r13, -40
-; SSE2-NEXT:    .cfi_offset %r14, -32
-; SSE2-NEXT:    .cfi_offset %r15, -24
-; SSE2-NEXT:    .cfi_offset %rbp, -16
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm8, %rax
 ; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2832,43 +2729,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
 ; SSE2-NEXT:    addq $8, %rsp
-; SSE2-NEXT:    .cfi_def_cfa_offset 56
 ; SSE2-NEXT:    popq %rbx
-; SSE2-NEXT:    .cfi_def_cfa_offset 48
 ; SSE2-NEXT:    popq %r12
-; SSE2-NEXT:    .cfi_def_cfa_offset 40
 ; SSE2-NEXT:    popq %r13
-; SSE2-NEXT:    .cfi_def_cfa_offset 32
 ; SSE2-NEXT:    popq %r14
-; SSE2-NEXT:    .cfi_def_cfa_offset 24
 ; SSE2-NEXT:    popq %r15
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
 ; SSE2-NEXT:    popq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_ext_v8i64:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pushq %rbp
-; SSE4-NEXT:    .cfi_def_cfa_offset 16
 ; SSE4-NEXT:    pushq %r15
-; SSE4-NEXT:    .cfi_def_cfa_offset 24
 ; SSE4-NEXT:    pushq %r14
-; SSE4-NEXT:    .cfi_def_cfa_offset 32
 ; SSE4-NEXT:    pushq %r13
-; SSE4-NEXT:    .cfi_def_cfa_offset 40
 ; SSE4-NEXT:    pushq %r12
-; SSE4-NEXT:    .cfi_def_cfa_offset 48
 ; SSE4-NEXT:    pushq %rbx
-; SSE4-NEXT:    .cfi_def_cfa_offset 56
 ; SSE4-NEXT:    pushq %rax
-; SSE4-NEXT:    .cfi_def_cfa_offset 64
-; SSE4-NEXT:    .cfi_offset %rbx, -56
-; SSE4-NEXT:    .cfi_offset %r12, -48
-; SSE4-NEXT:    .cfi_offset %r13, -40
-; SSE4-NEXT:    .cfi_offset %r14, -32
-; SSE4-NEXT:    .cfi_offset %r15, -24
-; SSE4-NEXT:    .cfi_offset %rbp, -16
 ; SSE4-NEXT:    movq %xmm3, %rax
 ; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE4-NEXT:    movq %rax, %rcx
@@ -2972,43 +2849,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
 ; SSE4-NEXT:    addq $8, %rsp
-; SSE4-NEXT:    .cfi_def_cfa_offset 56
 ; SSE4-NEXT:    popq %rbx
-; SSE4-NEXT:    .cfi_def_cfa_offset 48
 ; SSE4-NEXT:    popq %r12
-; SSE4-NEXT:    .cfi_def_cfa_offset 40
 ; SSE4-NEXT:    popq %r13
-; SSE4-NEXT:    .cfi_def_cfa_offset 32
 ; SSE4-NEXT:    popq %r14
-; SSE4-NEXT:    .cfi_def_cfa_offset 24
 ; SSE4-NEXT:    popq %r15
-; SSE4-NEXT:    .cfi_def_cfa_offset 16
 ; SSE4-NEXT:    popq %rbp
-; SSE4-NEXT:    .cfi_def_cfa_offset 8
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_ext_v8i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; AVX1-NEXT:    pushq %r15
-; AVX1-NEXT:    .cfi_def_cfa_offset 24
 ; AVX1-NEXT:    pushq %r14
-; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    pushq %r13
-; AVX1-NEXT:    .cfi_def_cfa_offset 40
 ; AVX1-NEXT:    pushq %r12
-; AVX1-NEXT:    .cfi_def_cfa_offset 48
 ; AVX1-NEXT:    pushq %rbx
-; AVX1-NEXT:    .cfi_def_cfa_offset 56
 ; AVX1-NEXT:    pushq %rax
-; AVX1-NEXT:    .cfi_def_cfa_offset 64
-; AVX1-NEXT:    .cfi_offset %rbx, -56
-; AVX1-NEXT:    .cfi_offset %r12, -48
-; AVX1-NEXT:    .cfi_offset %r13, -40
-; AVX1-NEXT:    .cfi_offset %r14, -32
-; AVX1-NEXT:    .cfi_offset %r15, -24
-; AVX1-NEXT:    .cfi_offset %rbp, -16
 ; AVX1-NEXT:    vmovq %xmm1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movq %rax, %rcx
@@ -3118,43 +2975,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    addq $8, %rsp
-; AVX1-NEXT:    .cfi_def_cfa_offset 56
 ; AVX1-NEXT:    popq %rbx
-; AVX1-NEXT:    .cfi_def_cfa_offset 48
 ; AVX1-NEXT:    popq %r12
-; AVX1-NEXT:    .cfi_def_cfa_offset 40
 ; AVX1-NEXT:    popq %r13
-; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    popq %r14
-; AVX1-NEXT:    .cfi_def_cfa_offset 24
 ; AVX1-NEXT:    popq %r15
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; AVX1-NEXT:    popq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_ext_v8i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
 ; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    .cfi_def_cfa_offset 24
 ; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    .cfi_def_cfa_offset 32
 ; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    .cfi_def_cfa_offset 40
 ; AVX2-NEXT:    pushq %r12
-; AVX2-NEXT:    .cfi_def_cfa_offset 48
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    .cfi_def_cfa_offset 56
 ; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    .cfi_def_cfa_offset 64
-; AVX2-NEXT:    .cfi_offset %rbx, -56
-; AVX2-NEXT:    .cfi_offset %r12, -48
-; AVX2-NEXT:    .cfi_offset %r13, -40
-; AVX2-NEXT:    .cfi_offset %r14, -32
-; AVX2-NEXT:    .cfi_offset %r15, -24
-; AVX2-NEXT:    .cfi_offset %rbp, -16
 ; AVX2-NEXT:    vmovq %xmm1, %rax
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    movq %rax, %rcx
@@ -3264,43 +3101,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX2-NEXT:    addq $8, %rsp
-; AVX2-NEXT:    .cfi_def_cfa_offset 56
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    .cfi_def_cfa_offset 48
 ; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    .cfi_def_cfa_offset 40
 ; AVX2-NEXT:    popq %r13
-; AVX2-NEXT:    .cfi_def_cfa_offset 32
 ; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    .cfi_def_cfa_offset 24
 ; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
 ; AVX2-NEXT:    popq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 8
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_ext_v8i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
 ; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    .cfi_def_cfa_offset 24
 ; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    .cfi_def_cfa_offset 32
 ; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    .cfi_def_cfa_offset 40
 ; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    .cfi_def_cfa_offset 48
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    .cfi_def_cfa_offset 56
 ; AVX512-NEXT:    pushq %rax
-; AVX512-NEXT:    .cfi_def_cfa_offset 64
-; AVX512-NEXT:    .cfi_offset %rbx, -56
-; AVX512-NEXT:    .cfi_offset %r12, -48
-; AVX512-NEXT:    .cfi_offset %r13, -40
-; AVX512-NEXT:    .cfi_offset %r14, -32
-; AVX512-NEXT:    .cfi_offset %r15, -24
-; AVX512-NEXT:    .cfi_offset %rbp, -16
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    movq %rax, %rcx
@@ -3413,19 +3230,12 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    addq $8, %rsp
-; AVX512-NEXT:    .cfi_def_cfa_offset 56
 ; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    .cfi_def_cfa_offset 48
 ; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    .cfi_def_cfa_offset 40
 ; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    .cfi_def_cfa_offset 32
 ; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    .cfi_def_cfa_offset 24
 ; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
 ; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512-NEXT:    retq
   %x0 = sext <8 x i64> %a0 to <8 x i128>
   %x1 = sext <8 x i64> %a1 to <8 x i128>
diff --git a/llvm/test/CodeGen/X86/avgflooru.ll b/llvm/test/CodeGen/X86/avgflooru.ll
index e07c1f55991e..000457c5ab1e 100644
--- a/llvm/test/CodeGen/X86/avgflooru.ll
+++ b/llvm/test/CodeGen/X86/avgflooru.ll
@@ -9,7 +9,7 @@
 ; 128-bit vectors
 ;
 
-define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v16i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -53,7 +53,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
   ret <16 x i8> %res
 }
 
-define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v16i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
@@ -127,7 +127,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
   ret <16 x i8> %res
 }
 
-define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v8i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -151,7 +151,7 @@ define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
   ret <8 x i16> %res
 }
 
-define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v8i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
@@ -227,7 +227,7 @@ define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
   ret <8 x i16> %res
 }
 
-define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v4i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -251,7 +251,7 @@ define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v4i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
@@ -325,7 +325,7 @@ define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 
-define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v2i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
@@ -349,7 +349,7 @@ define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
   ret <2 x i64> %res
 }
 
-define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
@@ -458,7 +458,7 @@ define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
 ; 256-bit vectors
 ;
 
-define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v32i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm1, %xmm4
@@ -516,7 +516,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
   ret <32 x i8> %res
 }
 
-define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v32i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
@@ -627,7 +627,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
   ret <32 x i8> %res
 }
 
-define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v16i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm1, %xmm4
@@ -677,7 +677,7 @@ define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
   ret <16 x i16> %res
 }
 
-define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v16i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
@@ -792,7 +792,7 @@ define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
   ret <16 x i16> %res
 }
 
-define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v8i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm1, %xmm4
@@ -842,7 +842,7 @@ define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v8i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
@@ -954,7 +954,7 @@ define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
   ret <8 x i32> %res
 }
 
-define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v4i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm1, %xmm4
@@ -1004,7 +1004,7 @@ define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
   ret <4 x i64> %res
 }
 
-define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v4i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
@@ -1199,7 +1199,7 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; 512-bit vectors
 ;
 
-define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v64i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm3, %xmm9
@@ -1286,7 +1286,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
   ret <64 x i8> %res
 }
 
-define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v64i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm8, %xmm8
@@ -1481,7 +1481,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
   ret <64 x i8> %res
 }
 
-define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v32i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm3, %xmm8
@@ -1554,7 +1554,7 @@ define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
   ret <32 x i16> %res
 }
 
-define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v32i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm8, %xmm8
@@ -1757,7 +1757,7 @@ define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
   ret <32 x i16> %res
 }
 
-define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v16i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm3, %xmm8
@@ -1830,7 +1830,7 @@ define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v16i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm8, %xmm8
@@ -2027,7 +2027,7 @@ define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
   ret <16 x i32> %res
 }
 
-define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE-LABEL: test_fixed_v8i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm3, %xmm8
@@ -2100,27 +2100,15 @@ define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
   ret <8 x i64> %res
 }
 
-define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_ext_v8i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pushq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
 ; SSE2-NEXT:    pushq %r15
-; SSE2-NEXT:    .cfi_def_cfa_offset 24
 ; SSE2-NEXT:    pushq %r14
-; SSE2-NEXT:    .cfi_def_cfa_offset 32
 ; SSE2-NEXT:    pushq %r13
-; SSE2-NEXT:    .cfi_def_cfa_offset 40
 ; SSE2-NEXT:    pushq %r12
-; SSE2-NEXT:    .cfi_def_cfa_offset 48
 ; SSE2-NEXT:    pushq %rbx
-; SSE2-NEXT:    .cfi_def_cfa_offset 56
-; SSE2-NEXT:    .cfi_offset %rbx, -56
-; SSE2-NEXT:    .cfi_offset %r12, -48
-; SSE2-NEXT:    .cfi_offset %r13, -40
-; SSE2-NEXT:    .cfi_offset %r14, -32
-; SSE2-NEXT:    .cfi_offset %r15, -24
-; SSE2-NEXT:    .cfi_offset %rbp, -16
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm3, %rbx
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
@@ -2194,39 +2182,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
 ; SSE2-NEXT:    popq %rbx
-; SSE2-NEXT:    .cfi_def_cfa_offset 48
 ; SSE2-NEXT:    popq %r12
-; SSE2-NEXT:    .cfi_def_cfa_offset 40
 ; SSE2-NEXT:    popq %r13
-; SSE2-NEXT:    .cfi_def_cfa_offset 32
 ; SSE2-NEXT:    popq %r14
-; SSE2-NEXT:    .cfi_def_cfa_offset 24
 ; SSE2-NEXT:    popq %r15
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
 ; SSE2-NEXT:    popq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_ext_v8i64:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pushq %rbp
-; SSE4-NEXT:    .cfi_def_cfa_offset 16
 ; SSE4-NEXT:    pushq %r15
-; SSE4-NEXT:    .cfi_def_cfa_offset 24
 ; SSE4-NEXT:    pushq %r14
-; SSE4-NEXT:    .cfi_def_cfa_offset 32
 ; SSE4-NEXT:    pushq %r13
-; SSE4-NEXT:    .cfi_def_cfa_offset 40
 ; SSE4-NEXT:    pushq %r12
-; SSE4-NEXT:    .cfi_def_cfa_offset 48
 ; SSE4-NEXT:    pushq %rbx
-; SSE4-NEXT:    .cfi_def_cfa_offset 56
-; SSE4-NEXT:    .cfi_offset %rbx, -56
-; SSE4-NEXT:    .cfi_offset %r12, -48
-; SSE4-NEXT:    .cfi_offset %r13, -40
-; SSE4-NEXT:    .cfi_offset %r14, -32
-; SSE4-NEXT:    .cfi_offset %r15, -24
-; SSE4-NEXT:    .cfi_offset %rbp, -16
 ; SSE4-NEXT:    pextrq $1, %xmm3, %r14
 ; SSE4-NEXT:    movq %xmm2, %r13
 ; SSE4-NEXT:    pextrq $1, %xmm2, %rbp
@@ -2292,39 +2262,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
 ; SSE4-NEXT:    popq %rbx
-; SSE4-NEXT:    .cfi_def_cfa_offset 48
 ; SSE4-NEXT:    popq %r12
-; SSE4-NEXT:    .cfi_def_cfa_offset 40
 ; SSE4-NEXT:    popq %r13
-; SSE4-NEXT:    .cfi_def_cfa_offset 32
 ; SSE4-NEXT:    popq %r14
-; SSE4-NEXT:    .cfi_def_cfa_offset 24
 ; SSE4-NEXT:    popq %r15
-; SSE4-NEXT:    .cfi_def_cfa_offset 16
 ; SSE4-NEXT:    popq %rbp
-; SSE4-NEXT:    .cfi_def_cfa_offset 8
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_ext_v8i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; AVX1-NEXT:    pushq %r15
-; AVX1-NEXT:    .cfi_def_cfa_offset 24
 ; AVX1-NEXT:    pushq %r14
-; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    pushq %r13
-; AVX1-NEXT:    .cfi_def_cfa_offset 40
 ; AVX1-NEXT:    pushq %r12
-; AVX1-NEXT:    .cfi_def_cfa_offset 48
 ; AVX1-NEXT:    pushq %rbx
-; AVX1-NEXT:    .cfi_def_cfa_offset 56
-; AVX1-NEXT:    .cfi_offset %rbx, -56
-; AVX1-NEXT:    .cfi_offset %r12, -48
-; AVX1-NEXT:    .cfi_offset %r13, -40
-; AVX1-NEXT:    .cfi_offset %r14, -32
-; AVX1-NEXT:    .cfi_offset %r15, -24
-; AVX1-NEXT:    .cfi_offset %rbp, -16
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rbx
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
 ; AVX1-NEXT:    vmovq %xmm4, %r15
@@ -2396,39 +2348,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    popq %rbx
-; AVX1-NEXT:    .cfi_def_cfa_offset 48
 ; AVX1-NEXT:    popq %r12
-; AVX1-NEXT:    .cfi_def_cfa_offset 40
 ; AVX1-NEXT:    popq %r13
-; AVX1-NEXT:    .cfi_def_cfa_offset 32
 ; AVX1-NEXT:    popq %r14
-; AVX1-NEXT:    .cfi_def_cfa_offset 24
 ; AVX1-NEXT:    popq %r15
-; AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; AVX1-NEXT:    popq %rbp
-; AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_ext_v8i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
 ; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    .cfi_def_cfa_offset 24
 ; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    .cfi_def_cfa_offset 32
 ; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    .cfi_def_cfa_offset 40
 ; AVX2-NEXT:    pushq %r12
-; AVX2-NEXT:    .cfi_def_cfa_offset 48
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    .cfi_def_cfa_offset 56
-; AVX2-NEXT:    .cfi_offset %rbx, -56
-; AVX2-NEXT:    .cfi_offset %r12, -48
-; AVX2-NEXT:    .cfi_offset %r13, -40
-; AVX2-NEXT:    .cfi_offset %r14, -32
-; AVX2-NEXT:    .cfi_offset %r15, -24
-; AVX2-NEXT:    .cfi_offset %rbp, -16
 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rbx
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
 ; AVX2-NEXT:    vmovq %xmm4, %r15
@@ -2500,39 +2434,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    .cfi_def_cfa_offset 48
 ; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    .cfi_def_cfa_offset 40
 ; AVX2-NEXT:    popq %r13
-; AVX2-NEXT:    .cfi_def_cfa_offset 32
 ; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    .cfi_def_cfa_offset 24
 ; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
 ; AVX2-NEXT:    popq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 8
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_ext_v8i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
 ; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    .cfi_def_cfa_offset 24
 ; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    .cfi_def_cfa_offset 32
 ; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    .cfi_def_cfa_offset 40
 ; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    .cfi_def_cfa_offset 48
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    .cfi_def_cfa_offset 56
-; AVX512-NEXT:    .cfi_offset %rbx, -56
-; AVX512-NEXT:    .cfi_offset %r12, -48
-; AVX512-NEXT:    .cfi_offset %r13, -40
-; AVX512-NEXT:    .cfi_offset %r14, -32
-; AVX512-NEXT:    .cfi_offset %r15, -24
-; AVX512-NEXT:    .cfi_offset %rbp, -16
 ; AVX512-NEXT:    vpextrq $1, %xmm0, %r10
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX512-NEXT:    vpextrq $1, %xmm2, %r13
@@ -2607,17 +2523,11 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    .cfi_def_cfa_offset 48
 ; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    .cfi_def_cfa_offset 40
 ; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    .cfi_def_cfa_offset 32
 ; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    .cfi_def_cfa_offset 24
 ; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
 ; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512-NEXT:    retq
   %x0 = zext <8 x i64> %a0 to <8 x i128>
   %x1 = zext <8 x i64> %a1 to <8 x i128>
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
index 13d1265a249d..7e48b3719cf0 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
@@ -30,13 +30,13 @@ define <64 x i8> @add_v64i8_broadcasts(<64 x i8> %a0, i64 %a1, i8 %a2) {
 ; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
 ; AVX512F-NEXT:    vpternlogq $216, %zmm2, %zmm1, %zmm0
-; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm3
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm4
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm4
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpternlogq $226, %zmm4, %zmm2, %zmm0
 ; AVX512F-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
index 4988fc35b10e..33819c9e0102 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,EVEX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s --check-prefixes=CHECK,EVEX256
 
 ; 256-bit
 
@@ -236,3 +236,34 @@ define <8 x i16> @vpmullw128_test(<8 x i16> %i, <8 x i16> %j) {
   ret <8 x i16> %x
 }
 
+define i16 @PR90356(<16 x i1> %a) {
+; EVEX512-LABEL: PR90356:
+; EVEX512:       # %bb.0:
+; EVEX512-NEXT:    vpsllw $7, %xmm0, %xmm0
+; EVEX512-NEXT:    vpmovb2m %xmm0, %k1
+; EVEX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; EVEX512-NEXT:    movb $63, %al
+; EVEX512-NEXT:    kmovd %eax, %k1
+; EVEX512-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
+; EVEX512-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; EVEX512-NEXT:    kmovd %k0, %eax
+; EVEX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; EVEX512-NEXT:    vzeroupper
+; EVEX512-NEXT:    retq
+;
+; EVEX256-LABEL: PR90356:
+; EVEX256:       # %bb.0:
+; EVEX256-NEXT:    vpsllw $7, %xmm0, %xmm0
+; EVEX256-NEXT:    vpmovb2m %xmm0, %k0
+; EVEX256-NEXT:    vpmovm2w %k0, %ymm0
+; EVEX256-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; EVEX256-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; EVEX256-NEXT:    vpmovw2m %ymm0, %k0
+; EVEX256-NEXT:    kmovd %k0, %eax
+; EVEX256-NEXT:    # kill: def $ax killed $ax killed $eax
+; EVEX256-NEXT:    vzeroupper
+; EVEX256-NEXT:    retq
+  %1 = shufflevector <16 x i1> %a, <16 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>
+  %2 = bitcast <16 x i1> %1 to i16
+  ret i16 %2
+}
diff --git a/llvm/test/CodeGen/X86/cmp.ll b/llvm/test/CodeGen/X86/cmp.ll
index 30e52f063075..402da547613c 100644
--- a/llvm/test/CodeGen/X86/cmp.ll
+++ b/llvm/test/CodeGen/X86/cmp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -show-mc-encoding | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ndd -show-mc-encoding | FileCheck --check-prefix=NDD %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NO-NDD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ndd -show-mc-encoding | FileCheck --check-prefixes=CHECK,NDD %s
 
 @d = dso_local global i8 0, align 1
 @d64 = dso_local global i64 0
@@ -17,18 +17,6 @@ define i32 @test1(i32 %X, ptr %y) nounwind {
 ; CHECK-NEXT:  .LBB0_2: # %ReturnBlock
 ; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: test1:
-; NDD:       # %bb.0: # %entry
-; NDD-NEXT:    cmpl $0, (%rsi) # encoding: [0x83,0x3e,0x00]
-; NDD-NEXT:    je .LBB0_2 # encoding: [0x74,A]
-; NDD-NEXT:    # fixup A - offset: 1, value: .LBB0_2-1, kind: FK_PCRel_1
-; NDD-NEXT:  # %bb.1: # %cond_true
-; NDD-NEXT:    movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
-; NDD-NEXT:    retq # encoding: [0xc3]
-; NDD-NEXT:  .LBB0_2: # %ReturnBlock
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
   %tmp = load i32, ptr %y
   %tmp.upgrd.1 = icmp eq i32 %tmp, 0
@@ -54,19 +42,6 @@ define i32 @test2(i32 %X, ptr %y) nounwind {
 ; CHECK-NEXT:  .LBB1_2: # %ReturnBlock
 ; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: test2:
-; NDD:       # %bb.0: # %entry
-; NDD-NEXT:    testl $536870911, (%rsi) # encoding: [0xf7,0x06,0xff,0xff,0xff,0x1f]
-; NDD-NEXT:    # imm = 0x1FFFFFFF
-; NDD-NEXT:    je .LBB1_2 # encoding: [0x74,A]
-; NDD-NEXT:    # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1
-; NDD-NEXT:  # %bb.1: # %cond_true
-; NDD-NEXT:    movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
-; NDD-NEXT:    retq # encoding: [0xc3]
-; NDD-NEXT:  .LBB1_2: # %ReturnBlock
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
   %tmp = load i32, ptr %y
   %tmp1 = shl i32 %tmp, 3
@@ -92,18 +67,6 @@ define i8 @test2b(i8 %X, ptr %y) nounwind {
 ; CHECK-NEXT:  .LBB2_2: # %ReturnBlock
 ; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: test2b:
-; NDD:       # %bb.0: # %entry
-; NDD-NEXT:    testb $31, (%rsi) # encoding: [0xf6,0x06,0x1f]
-; NDD-NEXT:    je .LBB2_2 # encoding: [0x74,A]
-; NDD-NEXT:    # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1
-; NDD-NEXT:  # %bb.1: # %cond_true
-; NDD-NEXT:    movb $1, %al # encoding: [0xb0,0x01]
-; NDD-NEXT:    retq # encoding: [0xc3]
-; NDD-NEXT:  .LBB2_2: # %ReturnBlock
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
   %tmp = load i8, ptr %y
   %tmp1 = shl i8 %tmp, 3
@@ -124,13 +87,6 @@ define i64 @test3(i64 %x) nounwind {
 ; CHECK-NEXT:    testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
 ; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: test3:
-; NDD:       # %bb.0: # %entry
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
-; NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
   %t = icmp eq i64 %x, 0
   %r = zext i1 %t to i64
@@ -144,13 +100,6 @@ define i64 @test4(i64 %x) nounwind {
 ; CHECK-NEXT:    testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
 ; CHECK-NEXT:    setle %al # encoding: [0x0f,0x9e,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: test4:
-; NDD:       # %bb.0:
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
-; NDD-NEXT:    setle %al # encoding: [0x0f,0x9e,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
   %t = icmp slt i64 %x, 1
   %r = zext i1 %t to i64
   ret i64 %r
@@ -176,26 +125,6 @@ define i32 @test5(double %A) nounwind {
 ; CHECK-NEXT:    jmp foo@PLT # TAILCALL
 ; CHECK-NEXT:    # encoding: [0xeb,A]
 ; CHECK-NEXT:    # fixup A - offset: 1, value: foo@PLT-1, kind: FK_PCRel_1
-;
-; NDD-LABEL: test5:
-; NDD:       # %bb.0: # %entry
-; NDD-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # encoding: [0x66,0x0f,0x2e,0x05,A,A,A,A]
-; NDD-NEXT:    # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
-; NDD-NEXT:    ja .LBB5_3 # encoding: [0x77,A]
-; NDD-NEXT:    # fixup A - offset: 1, value: .LBB5_3-1, kind: FK_PCRel_1
-; NDD-NEXT:  # %bb.1: # %entry
-; NDD-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # encoding: [0x66,0x0f,0x2e,0x05,A,A,A,A]
-; NDD-NEXT:    # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
-; NDD-NEXT:    jb .LBB5_3 # encoding: [0x72,A]
-; NDD-NEXT:    # fixup A - offset: 1, value: .LBB5_3-1, kind: FK_PCRel_1
-; NDD-NEXT:  # %bb.2: # %bb12
-; NDD-NEXT:    movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00]
-; NDD-NEXT:    retq # encoding: [0xc3]
-; NDD-NEXT:  .LBB5_3: # %bb8
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    jmp foo@PLT # TAILCALL
-; NDD-NEXT:    # encoding: [0xeb,A]
-; NDD-NEXT:    # fixup A - offset: 1, value: foo@PLT-1, kind: FK_PCRel_1
 entry:
   %tmp2 = fcmp ogt double %A, 1.500000e+02
   %tmp5 = fcmp ult double %A, 7.500000e+01
@@ -224,18 +153,6 @@ define i32 @test6() nounwind align 2 {
 ; CHECK-NEXT:  .LBB6_1: # %T
 ; CHECK-NEXT:    movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: test6:
-; NDD:       # %bb.0: # %entry
-; NDD-NEXT:    cmpq $0, -{{[0-9]+}}(%rsp) # encoding: [0x48,0x83,0x7c,0x24,0xf8,0x00]
-; NDD-NEXT:    je .LBB6_1 # encoding: [0x74,A]
-; NDD-NEXT:    # fixup A - offset: 1, value: .LBB6_1-1, kind: FK_PCRel_1
-; NDD-NEXT:  # %bb.2: # %F
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
-; NDD-NEXT:  .LBB6_1: # %T
-; NDD-NEXT:    movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
-; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
   %A = alloca { i64, i64 }, align 8
   %B = getelementptr inbounds { i64, i64 }, ptr %A, i64 0, i32 1
@@ -251,12 +168,12 @@ F:
 }
 
 define i32 @test7(i64 %res) nounwind {
-; CHECK-LABEL: test7:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: test7:
+; NO-NDD:       # %bb.0: # %entry
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
+; NO-NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: test7:
 ; NDD:       # %bb.0: # %entry
@@ -271,13 +188,13 @@ entry:
 }
 
 define i32 @test8(i64 %res) nounwind {
-; CHECK-LABEL: test8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    cmpl $3, %edi # encoding: [0x83,0xff,0x03]
-; CHECK-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: test8:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    cmpl $3, %edi # encoding: [0x83,0xff,0x03]
+; NO-NDD-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: test8:
 ; NDD:       # %bb.0:
@@ -292,12 +209,12 @@ define i32 @test8(i64 %res) nounwind {
 }
 
 define i32 @test9(i64 %res) nounwind {
-; CHECK-LABEL: test9:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    shrq $33, %rdi # encoding: [0x48,0xc1,0xef,0x21]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: test9:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    shrq $33, %rdi # encoding: [0x48,0xc1,0xef,0x21]
+; NO-NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: test9:
 ; NDD:       # %bb.0:
@@ -311,12 +228,12 @@ define i32 @test9(i64 %res) nounwind {
 }
 
 define i32 @test10(i64 %res) nounwind {
-; CHECK-LABEL: test10:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
-; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: test10:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
+; NO-NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: test10:
 ; NDD:       # %bb.0:
@@ -330,13 +247,13 @@ define i32 @test10(i64 %res) nounwind {
 }
 
 define i32 @test11(i64 %l) nounwind {
-; CHECK-LABEL: test11:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrq $47, %rdi # encoding: [0x48,0xc1,0xef,0x2f]
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    cmpl $1, %edi # encoding: [0x83,0xff,0x01]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: test11:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    shrq $47, %rdi # encoding: [0x48,0xc1,0xef,0x2f]
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    cmpl $1, %edi # encoding: [0x83,0xff,0x01]
+; NO-NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: test11:
 ; NDD:       # %bb.0:
@@ -372,27 +289,6 @@ define i32 @test12() ssp uwtable {
 ; CHECK-NEXT:    popq %rcx # encoding: [0x59]
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: test12:
-; NDD:       # %bb.0: # %entry
-; NDD-NEXT:    pushq %rax # encoding: [0x50]
-; NDD-NEXT:    .cfi_def_cfa_offset 16
-; NDD-NEXT:    callq test12b@PLT # encoding: [0xe8,A,A,A,A]
-; NDD-NEXT:    # fixup A - offset: 1, value: test12b@PLT-4, kind: FK_PCRel_4
-; NDD-NEXT:    testb %al, %al # encoding: [0x84,0xc0]
-; NDD-NEXT:    je .LBB12_2 # encoding: [0x74,A]
-; NDD-NEXT:    # fixup A - offset: 1, value: .LBB12_2-1, kind: FK_PCRel_1
-; NDD-NEXT:  # %bb.1: # %T
-; NDD-NEXT:    movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
-; NDD-NEXT:    popq %rcx # encoding: [0x59]
-; NDD-NEXT:    .cfi_def_cfa_offset 8
-; NDD-NEXT:    retq # encoding: [0xc3]
-; NDD-NEXT:  .LBB12_2: # %F
-; NDD-NEXT:    .cfi_def_cfa_offset 16
-; NDD-NEXT:    movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00]
-; NDD-NEXT:    popq %rcx # encoding: [0x59]
-; NDD-NEXT:    .cfi_def_cfa_offset 8
-; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
   %tmp1 = call zeroext i1 @test12b()
   br i1 %tmp1, label %T, label %F
@@ -407,12 +303,12 @@ F:
 declare zeroext i1 @test12b()
 
 define i32 @test13(i32 %mask, i32 %base, i32 %intra) {
-; CHECK-LABEL: test13:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %esi, %eax # encoding: [0x89,0xf0]
-; CHECK-NEXT:    testb $8, %dil # encoding: [0x40,0xf6,0xc7,0x08]
-; CHECK-NEXT:    cmovnel %edx, %eax # encoding: [0x0f,0x45,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: test13:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    movl %esi, %eax # encoding: [0x89,0xf0]
+; NO-NDD-NEXT:    testb $8, %dil # encoding: [0x40,0xf6,0xc7,0x08]
+; NO-NDD-NEXT:    cmovnel %edx, %eax # encoding: [0x0f,0x45,0xc2]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: test13:
 ; NDD:       # %bb.0:
@@ -426,12 +322,12 @@ define i32 @test13(i32 %mask, i32 %base, i32 %intra) {
 }
 
 define i32 @test14(i32 %mask, i32 %base, i32 %intra) {
-; CHECK-LABEL: test14:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %esi, %eax # encoding: [0x89,0xf0]
-; CHECK-NEXT:    shrl $7, %edi # encoding: [0xc1,0xef,0x07]
-; CHECK-NEXT:    cmovnsl %edx, %eax # encoding: [0x0f,0x49,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: test14:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    movl %esi, %eax # encoding: [0x89,0xf0]
+; NO-NDD-NEXT:    shrl $7, %edi # encoding: [0xc1,0xef,0x07]
+; NO-NDD-NEXT:    cmovnsl %edx, %eax # encoding: [0x0f,0x49,0xc2]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: test14:
 ; NDD:       # %bb.0:
@@ -446,14 +342,14 @@ define i32 @test14(i32 %mask, i32 %base, i32 %intra) {
 
 ; PR19964
 define zeroext i1 @test15(i32 %bf.load, i32 %n) {
-; CHECK-LABEL: test15:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrl $16, %edi # encoding: [0xc1,0xef,0x10]
-; CHECK-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
-; CHECK-NEXT:    cmpl %esi, %edi # encoding: [0x39,0xf7]
-; CHECK-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
-; CHECK-NEXT:    orb %cl, %al # encoding: [0x08,0xc8]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: test15:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    shrl $16, %edi # encoding: [0xc1,0xef,0x10]
+; NO-NDD-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
+; NO-NDD-NEXT:    cmpl %esi, %edi # encoding: [0x39,0xf7]
+; NO-NDD-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
+; NO-NDD-NEXT:    orb %cl, %al # encoding: [0x08,0xc8]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: test15:
 ; NDD:       # %bb.0:
@@ -476,12 +372,6 @@ define i8 @signbit_i16(i16 signext %L) {
 ; CHECK-NEXT:    testw %di, %di # encoding: [0x66,0x85,0xff]
 ; CHECK-NEXT:    setns %al # encoding: [0x0f,0x99,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: signbit_i16:
-; NDD:       # %bb.0:
-; NDD-NEXT:    testw %di, %di # encoding: [0x66,0x85,0xff]
-; NDD-NEXT:    setns %al # encoding: [0x0f,0x99,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
   %lshr = lshr i16 %L, 15
   %trunc = trunc i16 %lshr to i8
   %not = xor i8 %trunc, 1
@@ -494,12 +384,6 @@ define i8 @signbit_i32(i32 %L) {
 ; CHECK-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
 ; CHECK-NEXT:    setns %al # encoding: [0x0f,0x99,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: signbit_i32:
-; NDD:       # %bb.0:
-; NDD-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
-; NDD-NEXT:    setns %al # encoding: [0x0f,0x99,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
   %lshr = lshr i32 %L, 31
   %trunc = trunc i32 %lshr to i8
   %not = xor i8 %trunc, 1
@@ -512,12 +396,6 @@ define i8 @signbit_i64(i64 %L) {
 ; CHECK-NEXT:    testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
 ; CHECK-NEXT:    setns %al # encoding: [0x0f,0x99,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: signbit_i64:
-; NDD:       # %bb.0:
-; NDD-NEXT:    testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
-; NDD-NEXT:    setns %al # encoding: [0x0f,0x99,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
   %lshr = lshr i64 %L, 63
   %trunc = trunc i64 %lshr to i8
   %not = xor i8 %trunc, 1
@@ -530,12 +408,6 @@ define zeroext i1 @signbit_i32_i1(i32 %L) {
 ; CHECK-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
 ; CHECK-NEXT:    setns %al # encoding: [0x0f,0x99,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: signbit_i32_i1:
-; NDD:       # %bb.0:
-; NDD-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
-; NDD-NEXT:    setns %al # encoding: [0x0f,0x99,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
   %lshr = lshr i32 %L, 31
   %trunc = trunc i32 %lshr to i1
   %not = xor i1 %trunc, true
@@ -544,20 +416,20 @@ define zeroext i1 @signbit_i32_i1(i32 %L) {
 
 ; This test failed due to incorrect handling of "shift + icmp" sequence
 define void @test20(i32 %bf.load, i8 %x1, ptr %b_addr) {
-; CHECK-LABEL: test20:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    testl $16777215, %edi # encoding: [0xf7,0xc7,0xff,0xff,0xff,0x00]
-; CHECK-NEXT:    # imm = 0xFFFFFF
-; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT:    movzbl %sil, %ecx # encoding: [0x40,0x0f,0xb6,0xce]
-; CHECK-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
-; CHECK-NEXT:    setne (%rdx) # encoding: [0x0f,0x95,0x02]
-; CHECK-NEXT:    testl $16777215, %edi # encoding: [0xf7,0xc7,0xff,0xff,0xff,0x00]
-; CHECK-NEXT:    # imm = 0xFFFFFF
-; CHECK-NEXT:    setne d(%rip) # encoding: [0x0f,0x95,0x05,A,A,A,A]
-; CHECK-NEXT:    # fixup A - offset: 3, value: d-4, kind: reloc_riprel_4byte
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: test20:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    testl $16777215, %edi # encoding: [0xf7,0xc7,0xff,0xff,0xff,0x00]
+; NO-NDD-NEXT:    # imm = 0xFFFFFF
+; NO-NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT:    movzbl %sil, %ecx # encoding: [0x40,0x0f,0xb6,0xce]
+; NO-NDD-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; NO-NDD-NEXT:    setne (%rdx) # encoding: [0x0f,0x95,0x02]
+; NO-NDD-NEXT:    testl $16777215, %edi # encoding: [0xf7,0xc7,0xff,0xff,0xff,0x00]
+; NO-NDD-NEXT:    # imm = 0xFFFFFF
+; NO-NDD-NEXT:    setne d(%rip) # encoding: [0x0f,0x95,0x05,A,A,A,A]
+; NO-NDD-NEXT:    # fixup A - offset: 3, value: d-4, kind: reloc_riprel_4byte
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: test20:
 ; NDD:       # %bb.0:
@@ -593,11 +465,6 @@ define i32 @highmask_i64_simplify(i64 %val) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: highmask_i64_simplify:
-; NDD:       # %bb.0:
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
   %and = and i64 %val, -2199023255552
   %cmp = icmp ult i64 %and, 0
   %ret = zext i1 %cmp to i32
@@ -605,12 +472,12 @@ define i32 @highmask_i64_simplify(i64 %val) {
 }
 
 define i32 @highmask_i64_mask64(i64 %val) {
-; CHECK-LABEL: highmask_i64_mask64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    shrq $41, %rdi # encoding: [0x48,0xc1,0xef,0x29]
-; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: highmask_i64_mask64:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    shrq $41, %rdi # encoding: [0x48,0xc1,0xef,0x29]
+; NO-NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: highmask_i64_mask64:
 ; NDD:       # %bb.0:
@@ -625,14 +492,14 @@ define i32 @highmask_i64_mask64(i64 %val) {
 }
 
 define i64 @highmask_i64_mask64_extra_use(i64 %val) nounwind {
-; CHECK-LABEL: highmask_i64_mask64_extra_use:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    movq %rdi, %rcx # encoding: [0x48,0x89,0xf9]
-; CHECK-NEXT:    shrq $41, %rcx # encoding: [0x48,0xc1,0xe9,0x29]
-; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT:    imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: highmask_i64_mask64_extra_use:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    movq %rdi, %rcx # encoding: [0x48,0x89,0xf9]
+; NO-NDD-NEXT:    shrq $41, %rcx # encoding: [0x48,0xc1,0xe9,0x29]
+; NO-NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT:    imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: highmask_i64_mask64_extra_use:
 ; NDD:       # %bb.0:
@@ -649,12 +516,12 @@ define i64 @highmask_i64_mask64_extra_use(i64 %val) nounwind {
 }
 
 define i32 @highmask_i64_mask32(i64 %val) {
-; CHECK-LABEL: highmask_i64_mask32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    shrq $20, %rdi # encoding: [0x48,0xc1,0xef,0x14]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: highmask_i64_mask32:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    shrq $20, %rdi # encoding: [0x48,0xc1,0xef,0x14]
+; NO-NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: highmask_i64_mask32:
 ; NDD:       # %bb.0:
@@ -669,14 +536,14 @@ define i32 @highmask_i64_mask32(i64 %val) {
 }
 
 define i64 @highmask_i64_mask32_extra_use(i64 %val) nounwind {
-; CHECK-LABEL: highmask_i64_mask32_extra_use:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    testq $-1048576, %rdi # encoding: [0x48,0xf7,0xc7,0x00,0x00,0xf0,0xff]
-; CHECK-NEXT:    # imm = 0xFFF00000
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: highmask_i64_mask32_extra_use:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    testq $-1048576, %rdi # encoding: [0x48,0xf7,0xc7,0x00,0x00,0xf0,0xff]
+; NO-NDD-NEXT:    # imm = 0xFFF00000
+; NO-NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT:    imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: highmask_i64_mask32_extra_use:
 ; NDD:       # %bb.0:
@@ -700,13 +567,6 @@ define i32 @highmask_i64_mask8(i64 %val) {
 ; CHECK-NEXT:    testq $-16, %rdi # encoding: [0x48,0xf7,0xc7,0xf0,0xff,0xff,0xff]
 ; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: highmask_i64_mask8:
-; NDD:       # %bb.0:
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    testq $-16, %rdi # encoding: [0x48,0xf7,0xc7,0xf0,0xff,0xff,0xff]
-; NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
   %and = and i64 %val, -16
   %cmp = icmp ne i64 %and, 0
   %ret = zext i1 %cmp to i32
@@ -714,12 +574,12 @@ define i32 @highmask_i64_mask8(i64 %val) {
 }
 
 define i32 @lowmask_i64_mask64(i64 %val) {
-; CHECK-LABEL: lowmask_i64_mask64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    shlq $16, %rdi # encoding: [0x48,0xc1,0xe7,0x10]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: lowmask_i64_mask64:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    shlq $16, %rdi # encoding: [0x48,0xc1,0xe7,0x10]
+; NO-NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: lowmask_i64_mask64:
 ; NDD:       # %bb.0:
@@ -734,14 +594,14 @@ define i32 @lowmask_i64_mask64(i64 %val) {
 }
 
 define i64 @lowmask_i64_mask64_extra_use(i64 %val) nounwind {
-; CHECK-LABEL: lowmask_i64_mask64_extra_use:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    movq %rdi, %rcx # encoding: [0x48,0x89,0xf9]
-; CHECK-NEXT:    shlq $16, %rcx # encoding: [0x48,0xc1,0xe1,0x10]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: lowmask_i64_mask64_extra_use:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    movq %rdi, %rcx # encoding: [0x48,0x89,0xf9]
+; NO-NDD-NEXT:    shlq $16, %rcx # encoding: [0x48,0xc1,0xe1,0x10]
+; NO-NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT:    imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: lowmask_i64_mask64_extra_use:
 ; NDD:       # %bb.0:
@@ -758,12 +618,12 @@ define i64 @lowmask_i64_mask64_extra_use(i64 %val) nounwind {
 }
 
 define i32 @lowmask_i64_mask32(i64 %val) {
-; CHECK-LABEL: lowmask_i64_mask32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    shlq $44, %rdi # encoding: [0x48,0xc1,0xe7,0x2c]
-; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: lowmask_i64_mask32:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    shlq $44, %rdi # encoding: [0x48,0xc1,0xe7,0x2c]
+; NO-NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: lowmask_i64_mask32:
 ; NDD:       # %bb.0:
@@ -778,14 +638,14 @@ define i32 @lowmask_i64_mask32(i64 %val) {
 }
 
 define i64 @lowmask_i64_mask32_extra_use(i64 %val) nounwind {
-; CHECK-LABEL: lowmask_i64_mask32_extra_use:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    testl $1048575, %edi # encoding: [0xf7,0xc7,0xff,0xff,0x0f,0x00]
-; CHECK-NEXT:    # imm = 0xFFFFF
-; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT:    imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: lowmask_i64_mask32_extra_use:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    testl $1048575, %edi # encoding: [0xf7,0xc7,0xff,0xff,0x0f,0x00]
+; NO-NDD-NEXT:    # imm = 0xFFFFF
+; NO-NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT:    imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: lowmask_i64_mask32_extra_use:
 ; NDD:       # %bb.0:
@@ -809,13 +669,6 @@ define i32 @lowmask_i64_mask8(i64 %val) {
 ; CHECK-NEXT:    testb $31, %dil # encoding: [0x40,0xf6,0xc7,0x1f]
 ; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: lowmask_i64_mask8:
-; NDD:       # %bb.0:
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    testb $31, %dil # encoding: [0x40,0xf6,0xc7,0x1f]
-; NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
   %and = and i64 %val, 31
   %cmp = icmp eq i64 %and, 0
   %ret = zext i1 %cmp to i32
@@ -830,14 +683,6 @@ define i32 @highmask_i32_mask32(i32 %val) {
 ; CHECK-NEXT:    # imm = 0xFFF00000
 ; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: highmask_i32_mask32:
-; NDD:       # %bb.0:
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    testl $-1048576, %edi # encoding: [0xf7,0xc7,0x00,0x00,0xf0,0xff]
-; NDD-NEXT:    # imm = 0xFFF00000
-; NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
   %and = and i32 %val, -1048576
   %cmp = icmp ne i32 %and, 0
   %ret = zext i1 %cmp to i32
@@ -851,13 +696,6 @@ define i32 @highmask_i32_mask8(i32 %val) {
 ; CHECK-NEXT:    testl $-16, %edi # encoding: [0xf7,0xc7,0xf0,0xff,0xff,0xff]
 ; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: highmask_i32_mask8:
-; NDD:       # %bb.0:
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    testl $-16, %edi # encoding: [0xf7,0xc7,0xf0,0xff,0xff,0xff]
-; NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
   %and = and i32 %val, -16
   %cmp = icmp eq i32 %and, 0
   %ret = zext i1 %cmp to i32
@@ -872,14 +710,6 @@ define i32 @lowmask_i32_mask32(i32 %val) {
 ; CHECK-NEXT:    # imm = 0xFFFFF
 ; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: lowmask_i32_mask32:
-; NDD:       # %bb.0:
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    testl $1048575, %edi # encoding: [0xf7,0xc7,0xff,0xff,0x0f,0x00]
-; NDD-NEXT:    # imm = 0xFFFFF
-; NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
   %and = and i32 %val, 1048575
   %cmp = icmp eq i32 %and, 0
   %ret = zext i1 %cmp to i32
@@ -893,13 +723,6 @@ define i32 @lowmask_i32_mask8(i32 %val) {
 ; CHECK-NEXT:    testb $31, %dil # encoding: [0x40,0xf6,0xc7,0x1f]
 ; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: lowmask_i32_mask8:
-; NDD:       # %bb.0:
-; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT:    testb $31, %dil # encoding: [0x40,0xf6,0xc7,0x1f]
-; NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
   %and = and i32 %val, 31
   %cmp = icmp ne i32 %and, 0
   %ret = zext i1 %cmp to i32
@@ -907,12 +730,12 @@ define i32 @lowmask_i32_mask8(i32 %val) {
 }
 
 define i1 @shifted_mask64_testb(i64 %a) {
-; CHECK-LABEL: shifted_mask64_testb:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrq $50, %rdi # encoding: [0x48,0xc1,0xef,0x32]
-; CHECK-NEXT:    testb %dil, %dil # encoding: [0x40,0x84,0xff]
-; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: shifted_mask64_testb:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    shrq $50, %rdi # encoding: [0x48,0xc1,0xef,0x32]
+; NO-NDD-NEXT:    testb %dil, %dil # encoding: [0x40,0x84,0xff]
+; NO-NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: shifted_mask64_testb:
 ; NDD:       # %bb.0:
@@ -926,12 +749,12 @@ define i1 @shifted_mask64_testb(i64 %a) {
 }
 
 define i1 @shifted_mask64_testw(i64 %a) {
-; CHECK-LABEL: shifted_mask64_testw:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrq $33, %rdi # encoding: [0x48,0xc1,0xef,0x21]
-; CHECK-NEXT:    testw %di, %di # encoding: [0x66,0x85,0xff]
-; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: shifted_mask64_testw:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    shrq $33, %rdi # encoding: [0x48,0xc1,0xef,0x21]
+; NO-NDD-NEXT:    testw %di, %di # encoding: [0x66,0x85,0xff]
+; NO-NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: shifted_mask64_testw:
 ; NDD:       # %bb.0:
@@ -945,12 +768,12 @@ define i1 @shifted_mask64_testw(i64 %a) {
 }
 
 define i1 @shifted_mask64_testl(i64 %a) {
-; CHECK-LABEL: shifted_mask64_testl:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrq $7, %rdi # encoding: [0x48,0xc1,0xef,0x07]
-; CHECK-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: shifted_mask64_testl:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    shrq $7, %rdi # encoding: [0x48,0xc1,0xef,0x07]
+; NO-NDD-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; NO-NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: shifted_mask64_testl:
 ; NDD:       # %bb.0:
@@ -964,15 +787,15 @@ define i1 @shifted_mask64_testl(i64 %a) {
 }
 
 define i1 @shifted_mask64_extra_use_const(i64 %a) {
-; CHECK-LABEL: shifted_mask64_extra_use_const:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03]
-; CHECK-NEXT:    # imm = 0x3FC000000000000
-; CHECK-NEXT:    testq %rcx, %rdi # encoding: [0x48,0x85,0xcf]
-; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
-; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: shifted_mask64_extra_use_const:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03]
+; NO-NDD-NEXT:    # imm = 0x3FC000000000000
+; NO-NDD-NEXT:    testq %rcx, %rdi # encoding: [0x48,0x85,0xcf]
+; NO-NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NO-NDD-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: shifted_mask64_extra_use_const:
 ; NDD:       # %bb.0:
@@ -990,15 +813,15 @@ define i1 @shifted_mask64_extra_use_const(i64 %a) {
 }
 
 define i1 @shifted_mask64_extra_use_and(i64 %a) {
-; CHECK-LABEL: shifted_mask64_extra_use_and:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03]
-; CHECK-NEXT:    # imm = 0x3FC000000000000
-; CHECK-NEXT:    andq %rdi, %rcx # encoding: [0x48,0x21,0xf9]
-; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
-; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: shifted_mask64_extra_use_and:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03]
+; NO-NDD-NEXT:    # imm = 0x3FC000000000000
+; NO-NDD-NEXT:    andq %rdi, %rcx # encoding: [0x48,0x21,0xf9]
+; NO-NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NO-NDD-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: shifted_mask64_extra_use_and:
 ; NDD:       # %bb.0:
@@ -1022,13 +845,6 @@ define i1 @shifted_mask32_testl_immediate(i64 %a) {
 ; CHECK-NEXT:    # imm = 0x3FC0000
 ; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: shifted_mask32_testl_immediate:
-; NDD:       # %bb.0:
-; NDD-NEXT:    testl $66846720, %edi # encoding: [0xf7,0xc7,0x00,0x00,0xfc,0x03]
-; NDD-NEXT:    # imm = 0x3FC0000
-; NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; NDD-NEXT:    retq # encoding: [0xc3]
   %v0 = and i64 %a, 66846720  ; 0xff << 18
   %v1 = icmp ne i64 %v0, 0
   ret i1 %v1
@@ -1044,16 +860,6 @@ define i1 @shifted_mask32_extra_use_const(i64 %a) {
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-8, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    # imm = 0x3FC0000
 ; CHECK-NEXT:    retq # encoding: [0xc3]
-;
-; NDD-LABEL: shifted_mask32_extra_use_const:
-; NDD:       # %bb.0:
-; NDD-NEXT:    testl $66846720, %edi # encoding: [0xf7,0xc7,0x00,0x00,0xfc,0x03]
-; NDD-NEXT:    # imm = 0x3FC0000
-; NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; NDD-NEXT:    movq $66846720, d64(%rip) # encoding: [0x48,0xc7,0x05,A,A,A,A,0x00,0x00,0xfc,0x03]
-; NDD-NEXT:    # fixup A - offset: 3, value: d64-8, kind: reloc_riprel_4byte
-; NDD-NEXT:    # imm = 0x3FC0000
-; NDD-NEXT:    retq # encoding: [0xc3]
   %v0 = and i64 %a, 66846720  ; 0xff << 18
   %v1 = icmp ne i64 %v0, 0
   store i64 66846720, ptr @d64
@@ -1061,14 +867,14 @@ define i1 @shifted_mask32_extra_use_const(i64 %a) {
 }
 
 define i1 @shifted_mask32_extra_use_and(i64 %a) {
-; CHECK-LABEL: shifted_mask32_extra_use_and:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andq $66846720, %rdi # encoding: [0x48,0x81,0xe7,0x00,0x00,0xfc,0x03]
-; CHECK-NEXT:    # imm = 0x3FC0000
-; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT:    movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
-; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: shifted_mask32_extra_use_and:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    andq $66846720, %rdi # encoding: [0x48,0x81,0xe7,0x00,0x00,0xfc,0x03]
+; NO-NDD-NEXT:    # imm = 0x3FC0000
+; NO-NDD-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT:    movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; NO-NDD-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: shifted_mask32_extra_use_and:
 ; NDD:       # %bb.0:
@@ -1085,14 +891,14 @@ define i1 @shifted_mask32_extra_use_and(i64 %a) {
 }
 
 define { i64, i64 } @pr39968(i64, i64, i32) {
-; CHECK-LABEL: pr39968:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    testb $64, %dl # encoding: [0xf6,0xc2,0x40]
-; CHECK-NEXT:    cmovneq %rdi, %rsi # encoding: [0x48,0x0f,0x45,0xf7]
-; CHECK-NEXT:    cmovneq %rdi, %rax # encoding: [0x48,0x0f,0x45,0xc7]
-; CHECK-NEXT:    movq %rsi, %rdx # encoding: [0x48,0x89,0xf2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: pr39968:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT:    testb $64, %dl # encoding: [0xf6,0xc2,0x40]
+; NO-NDD-NEXT:    cmovneq %rdi, %rsi # encoding: [0x48,0x0f,0x45,0xf7]
+; NO-NDD-NEXT:    cmovneq %rdi, %rax # encoding: [0x48,0x0f,0x45,0xc7]
+; NO-NDD-NEXT:    movq %rsi, %rdx # encoding: [0x48,0x89,0xf2]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: pr39968:
 ; NDD:       # %bb.0:
@@ -1124,18 +930,6 @@ define i32 @pr42189(i16 signext %c) {
 ; CHECK-NEXT:    jmp g@PLT # TAILCALL
 ; CHECK-NEXT:    # encoding: [0xeb,A]
 ; CHECK-NEXT:    # fixup A - offset: 1, value: g@PLT-1, kind: FK_PCRel_1
-;
-; NDD-LABEL: pr42189:
-; NDD:       # %bb.0: # %entry
-; NDD-NEXT:    cmpl $32767, %edi # encoding: [0x81,0xff,0xff,0x7f,0x00,0x00]
-; NDD-NEXT:    # imm = 0x7FFF
-; NDD-NEXT:    jne f@PLT # TAILCALL
-; NDD-NEXT:    # encoding: [0x75,A]
-; NDD-NEXT:    # fixup A - offset: 1, value: f@PLT-1, kind: FK_PCRel_1
-; NDD-NEXT:  # %bb.1: # %if.then
-; NDD-NEXT:    jmp g@PLT # TAILCALL
-; NDD-NEXT:    # encoding: [0xeb,A]
-; NDD-NEXT:    # fixup A - offset: 1, value: g@PLT-1, kind: FK_PCRel_1
 entry:
   %cmp = icmp eq i16 %c, 32767
   br i1 %cmp, label %if.then, label %if.end
@@ -1160,12 +954,12 @@ declare i32 @f()
 ; The store makes sure the chain result of the load is used which used to
 ; prevent the post isel peephole from catching this.
 define i1 @fold_test_and_with_chain(ptr %x, ptr %y, i32 %z) {
-; CHECK-LABEL: fold_test_and_with_chain:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    testl %edx, (%rdi) # encoding: [0x85,0x17]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    movl %edx, (%rsi) # encoding: [0x89,0x16]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; NO-NDD-LABEL: fold_test_and_with_chain:
+; NO-NDD:       # %bb.0:
+; NO-NDD-NEXT:    testl %edx, (%rdi) # encoding: [0x85,0x17]
+; NO-NDD-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT:    movl %edx, (%rsi) # encoding: [0x89,0x16]
+; NO-NDD-NEXT:    retq # encoding: [0xc3]
 ;
 ; NDD-LABEL: fold_test_and_with_chain:
 ; NDD:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/cmp16.ll b/llvm/test/CodeGen/X86/cmp16.ll
new file mode 100644
index 000000000000..760c8e404499
--- /dev/null
+++ b/llvm/test/CodeGen/X86/cmp16.ll
@@ -0,0 +1,734 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86,X86-GENERIC
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=X64,X64-GENERIC
+; RUN: llc < %s -mtriple=i686-- -mcpu=atom | FileCheck %s --check-prefixes=X86,X86-ATOM
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=atom | FileCheck %s --check-prefixes=X64,X64-ATOM
+
+define i1 @cmp16_reg_eq_reg(i16 %a0, i16 %a1) {
+; X86-GENERIC-LABEL: cmp16_reg_eq_reg:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    cmpw {{[0-9]+}}(%esp), %ax
+; X86-GENERIC-NEXT:    sete %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_reg_eq_reg:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    cmpw %si, %di
+; X64-GENERIC-NEXT:    sete %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_reg_eq_reg:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    cmpw {{[0-9]+}}(%esp), %ax
+; X86-ATOM-NEXT:    sete %al
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_reg_eq_reg:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw %si, %di
+; X64-ATOM-NEXT:    sete %al
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    retq
+  %cmp = icmp eq i16 %a0, %a1
+  ret i1 %cmp
+}
+define i1 @cmp16_reg_eq_imm8(i16 %a0) {
+; X86-GENERIC-LABEL: cmp16_reg_eq_imm8:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    cmpw $15, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    sete %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_reg_eq_imm8:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    cmpw $15, %di
+; X64-GENERIC-NEXT:    sete %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_reg_eq_imm8:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    cmpw $15, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    sete %al
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_reg_eq_imm8:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $15, %di
+; X64-ATOM-NEXT:    sete %al
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    retq
+  %cmp = icmp eq i16 %a0, 15
+  ret i1 %cmp
+}
+
+define i1 @cmp16_reg_eq_imm16(i16 %a0) {
+; X86-GENERIC-LABEL: cmp16_reg_eq_imm16:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    cmpl $1024, %eax # imm = 0x400
+; X86-GENERIC-NEXT:    sete %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_reg_eq_imm16:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movzwl %di, %eax
+; X64-GENERIC-NEXT:    cmpl $1024, %eax # imm = 0x400
+; X64-GENERIC-NEXT:    sete %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_reg_eq_imm16:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400
+; X86-ATOM-NEXT:    sete %al
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_reg_eq_imm16:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $1024, %di # imm = 0x400
+; X64-ATOM-NEXT:    sete %al
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    retq
+  %cmp = icmp eq i16 %a0, 1024
+  ret i1 %cmp
+}
+
+define i1 @cmp16_reg_eq_imm16_minsize(i16 %a0) minsize {
+; X86-LABEL: cmp16_reg_eq_imm16_minsize:
+; X86:       # %bb.0:
+; X86-NEXT:    cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmp16_reg_eq_imm16_minsize:
+; X64:       # %bb.0:
+; X64-NEXT:    cmpw $1024, %di # imm = 0x400
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %cmp = icmp eq i16 %a0, 1024
+  ret i1 %cmp
+}
+
+define i1 @cmp16_reg_eq_imm16_optsize(i16 %a0) optsize {
+; X86-GENERIC-LABEL: cmp16_reg_eq_imm16_optsize:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    cmpl $1024, %eax # imm = 0x400
+; X86-GENERIC-NEXT:    sete %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_reg_eq_imm16_optsize:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movzwl %di, %eax
+; X64-GENERIC-NEXT:    cmpl $1024, %eax # imm = 0x400
+; X64-GENERIC-NEXT:    sete %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_reg_eq_imm16_optsize:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400
+; X86-ATOM-NEXT:    sete %al
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_reg_eq_imm16_optsize:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $1024, %di # imm = 0x400
+; X64-ATOM-NEXT:    sete %al
+; X64-ATOM-NEXT:    retq
+  %cmp = icmp eq i16 %a0, 1024
+  ret i1 %cmp
+}
+
+define i1 @cmp16_reg_sgt_imm8(i16 %a0) {
+; X86-GENERIC-LABEL: cmp16_reg_sgt_imm8:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    cmpw $16, {{[0-9]+}}(%esp)
+; X86-GENERIC-NEXT:    setge %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_reg_sgt_imm8:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    cmpw $16, %di
+; X64-GENERIC-NEXT:    setge %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_reg_sgt_imm8:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    cmpw $16, {{[0-9]+}}(%esp)
+; X86-ATOM-NEXT:    setge %al
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_reg_sgt_imm8:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $16, %di
+; X64-ATOM-NEXT:    setge %al
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    retq
+  %cmp = icmp sgt i16 %a0, 15
+  ret i1 %cmp
+}
+
+define i1 @cmp16_reg_sgt_imm16(i16 %a0) {
+; X86-GENERIC-LABEL: cmp16_reg_sgt_imm16:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    cmpl $-1023, %eax # imm = 0xFC01
+; X86-GENERIC-NEXT:    setge %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_reg_sgt_imm16:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movswl %di, %eax
+; X64-GENERIC-NEXT:    cmpl $-1023, %eax # imm = 0xFC01
+; X64-GENERIC-NEXT:    setge %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_reg_sgt_imm16:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01
+; X86-ATOM-NEXT:    setge %al
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_reg_sgt_imm16:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $-1023, %di # imm = 0xFC01
+; X64-ATOM-NEXT:    setge %al
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    retq
+  %cmp = icmp sgt i16 %a0, -1024
+  ret i1 %cmp
+}
+
+define i1 @cmp16_reg_sgt_imm16_minsize(i16 %a0) minsize {
+; X86-LABEL: cmp16_reg_sgt_imm16_minsize:
+; X86:       # %bb.0:
+; X86-NEXT:    cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01
+; X86-NEXT:    setge %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmp16_reg_sgt_imm16_minsize:
+; X64:       # %bb.0:
+; X64-NEXT:    cmpw $-1023, %di # imm = 0xFC01
+; X64-NEXT:    setge %al
+; X64-NEXT:    retq
+  %cmp = icmp sgt i16 %a0, -1024
+  ret i1 %cmp
+}
+
+define i1 @cmp16_reg_sgt_imm16_optsize(i16 %a0) optsize {
+; X86-GENERIC-LABEL: cmp16_reg_sgt_imm16_optsize:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    cmpl $-1023, %eax # imm = 0xFC01
+; X86-GENERIC-NEXT:    setge %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_reg_sgt_imm16_optsize:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movswl %di, %eax
+; X64-GENERIC-NEXT:    cmpl $-1023, %eax # imm = 0xFC01
+; X64-GENERIC-NEXT:    setge %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_reg_sgt_imm16_optsize:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01
+; X86-ATOM-NEXT:    setge %al
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_reg_sgt_imm16_optsize:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $-1023, %di # imm = 0xFC01
+; X64-ATOM-NEXT:    setge %al
+; X64-ATOM-NEXT:    retq
+  %cmp = icmp sgt i16 %a0, -1024
+  ret i1 %cmp
+}
+
+define i1 @cmp16_reg_uge_imm16(i16 %a0) {
+; X86-GENERIC-LABEL: cmp16_reg_uge_imm16:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    cmpl $64512, %eax # imm = 0xFC00
+; X86-GENERIC-NEXT:    setae %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_reg_uge_imm16:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movzwl %di, %eax
+; X64-GENERIC-NEXT:    cmpl $64512, %eax # imm = 0xFC00
+; X64-GENERIC-NEXT:    setae %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_reg_uge_imm16:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00
+; X86-ATOM-NEXT:    setae %al
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_reg_uge_imm16:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $-1024, %di # imm = 0xFC00
+; X64-ATOM-NEXT:    setae %al
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    retq
+  %cmp = icmp uge i16 %a0, -1024
+  ret i1 %cmp
+}
+
+define i1 @cmp16_reg_uge_imm16_minsize(i16 %a0) minsize {
+; X86-LABEL: cmp16_reg_uge_imm16_minsize:
+; X86:       # %bb.0:
+; X86-NEXT:    cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00
+; X86-NEXT:    setae %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmp16_reg_uge_imm16_minsize:
+; X64:       # %bb.0:
+; X64-NEXT:    cmpw $-1024, %di # imm = 0xFC00
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
+  %cmp = icmp uge i16 %a0, -1024
+  ret i1 %cmp
+}
+
+define i1 @cmp16_reg_uge_imm16_optsize(i16 %a0) optsize {
+; X86-GENERIC-LABEL: cmp16_reg_uge_imm16_optsize:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    cmpl $64512, %eax # imm = 0xFC00
+; X86-GENERIC-NEXT:    setae %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_reg_uge_imm16_optsize:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movzwl %di, %eax
+; X64-GENERIC-NEXT:    cmpl $64512, %eax # imm = 0xFC00
+; X64-GENERIC-NEXT:    setae %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_reg_uge_imm16_optsize:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00
+; X86-ATOM-NEXT:    setae %al
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_reg_uge_imm16_optsize:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $-1024, %di # imm = 0xFC00
+; X64-ATOM-NEXT:    setae %al
+; X64-ATOM-NEXT:    retq
+  %cmp = icmp uge i16 %a0, -1024
+  ret i1 %cmp
+}
+
+define i1 @cmp16_load_ne_load(ptr %p0, ptr %p1) {
+; X86-GENERIC-LABEL: cmp16_load_ne_load:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-GENERIC-NEXT:    movzwl (%ecx), %ecx
+; X86-GENERIC-NEXT:    cmpw (%eax), %cx
+; X86-GENERIC-NEXT:    setne %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_load_ne_load:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movzwl (%rdi), %eax
+; X64-GENERIC-NEXT:    cmpw (%rsi), %ax
+; X64-GENERIC-NEXT:    setne %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_load_ne_load:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    movzwl (%ecx), %ecx
+; X86-ATOM-NEXT:    cmpw (%eax), %cx
+; X86-ATOM-NEXT:    setne %al
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_load_ne_load:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    movzwl (%rdi), %eax
+; X64-ATOM-NEXT:    cmpw (%rsi), %ax
+; X64-ATOM-NEXT:    setne %al
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    retq
+  %ld0 = load i16, ptr %p0
+  %ld1 = load i16, ptr %p1
+  %cmp = icmp ne i16 %ld0, %ld1
+  ret i1 %cmp
+}
+
+define i1 @cmp16_load_ne_imm8(ptr %p0) {
+; X86-GENERIC-LABEL: cmp16_load_ne_imm8:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    cmpw $15, (%eax)
+; X86-GENERIC-NEXT:    setne %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_load_ne_imm8:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    cmpw $15, (%rdi)
+; X64-GENERIC-NEXT:    setne %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_load_ne_imm8:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    cmpw $15, (%eax)
+; X86-ATOM-NEXT:    setne %al
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_load_ne_imm8:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $15, (%rdi)
+; X64-ATOM-NEXT:    setne %al
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    retq
+  %ld = load i16, ptr %p0
+  %cmp = icmp ne i16 %ld, 15
+  ret i1 %cmp
+}
+
+define i1 @cmp16_load_ne_imm16(ptr %p0) {
+; X86-GENERIC-LABEL: cmp16_load_ne_imm16:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    movzwl (%eax), %eax
+; X86-GENERIC-NEXT:    cmpl $512, %eax # imm = 0x200
+; X86-GENERIC-NEXT:    setne %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_load_ne_imm16:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movzwl (%rdi), %eax
+; X64-GENERIC-NEXT:    cmpl $512, %eax # imm = 0x200
+; X64-GENERIC-NEXT:    setne %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_load_ne_imm16:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    cmpw $512, (%eax) # imm = 0x200
+; X86-ATOM-NEXT:    setne %al
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_load_ne_imm16:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $512, (%rdi) # imm = 0x200
+; X64-ATOM-NEXT:    setne %al
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    retq
+  %ld = load i16, ptr %p0
+  %cmp = icmp ne i16 %ld, 512
+  ret i1 %cmp
+}
+
+define i1 @cmp16_load_slt_imm8(ptr %p0) {
+; X86-GENERIC-LABEL: cmp16_load_slt_imm8:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    cmpw $15, (%eax)
+; X86-GENERIC-NEXT:    setl %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_load_slt_imm8:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    cmpw $15, (%rdi)
+; X64-GENERIC-NEXT:    setl %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_load_slt_imm8:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    cmpw $15, (%eax)
+; X86-ATOM-NEXT:    setl %al
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_load_slt_imm8:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $15, (%rdi)
+; X64-ATOM-NEXT:    setl %al
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    retq
+  %ld = load i16, ptr %p0
+  %cmp = icmp slt i16 %ld, 15
+  ret i1 %cmp
+}
+
+define i1 @cmp16_load_slt_imm16(ptr %p0) {
+; X86-GENERIC-LABEL: cmp16_load_slt_imm16:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    movswl (%eax), %eax
+; X86-GENERIC-NEXT:    cmpl $512, %eax # imm = 0x200
+; X86-GENERIC-NEXT:    setl %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_load_slt_imm16:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movswl (%rdi), %eax
+; X64-GENERIC-NEXT:    cmpl $512, %eax # imm = 0x200
+; X64-GENERIC-NEXT:    setl %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_load_slt_imm16:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    cmpw $512, (%eax) # imm = 0x200
+; X86-ATOM-NEXT:    setl %al
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_load_slt_imm16:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $512, (%rdi) # imm = 0x200
+; X64-ATOM-NEXT:    setl %al
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    retq
+  %ld = load i16, ptr %p0
+  %cmp = icmp slt i16 %ld, 512
+  ret i1 %cmp
+}
+
+define i1 @cmp16_load_slt_imm16_minsize(ptr %p0) minsize {
+; X86-LABEL: cmp16_load_slt_imm16_minsize:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpw $512, (%eax) # imm = 0x200
+; X86-NEXT:    setl %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmp16_load_slt_imm16_minsize:
+; X64:       # %bb.0:
+; X64-NEXT:    cmpw $512, (%rdi) # imm = 0x200
+; X64-NEXT:    setl %al
+; X64-NEXT:    retq
+  %ld = load i16, ptr %p0
+  %cmp = icmp slt i16 %ld, 512
+  ret i1 %cmp
+}
+
+define i1 @cmp16_load_slt_imm16_optsize(ptr %p0) optsize {
+; X86-GENERIC-LABEL: cmp16_load_slt_imm16_optsize:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    movswl (%eax), %eax
+; X86-GENERIC-NEXT:    cmpl $512, %eax # imm = 0x200
+; X86-GENERIC-NEXT:    setl %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_load_slt_imm16_optsize:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movswl (%rdi), %eax
+; X64-GENERIC-NEXT:    cmpl $512, %eax # imm = 0x200
+; X64-GENERIC-NEXT:    setl %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_load_slt_imm16_optsize:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    cmpw $512, (%eax) # imm = 0x200
+; X86-ATOM-NEXT:    setl %al
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_load_slt_imm16_optsize:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $512, (%rdi) # imm = 0x200
+; X64-ATOM-NEXT:    setl %al
+; X64-ATOM-NEXT:    retq
+  %ld = load i16, ptr %p0
+  %cmp = icmp slt i16 %ld, 512
+  ret i1 %cmp
+}
+
+define i1 @cmp16_load_ule_imm8(ptr %p0) {
+; X86-GENERIC-LABEL: cmp16_load_ule_imm8:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    cmpw $16, (%eax)
+; X86-GENERIC-NEXT:    setb %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_load_ule_imm8:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    cmpw $16, (%rdi)
+; X64-GENERIC-NEXT:    setb %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_load_ule_imm8:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    cmpw $16, (%eax)
+; X86-ATOM-NEXT:    setb %al
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_load_ule_imm8:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $16, (%rdi)
+; X64-ATOM-NEXT:    setb %al
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    retq
+  %ld = load i16, ptr %p0
+  %cmp = icmp ule i16 %ld, 15
+  ret i1 %cmp
+}
+
+define i1 @cmp16_load_ule_imm16(ptr %p0) {
+; X86-GENERIC-LABEL: cmp16_load_ule_imm16:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    movzwl (%eax), %eax
+; X86-GENERIC-NEXT:    cmpl $513, %eax # imm = 0x201
+; X86-GENERIC-NEXT:    setb %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_load_ule_imm16:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movzwl (%rdi), %eax
+; X64-GENERIC-NEXT:    cmpl $513, %eax # imm = 0x201
+; X64-GENERIC-NEXT:    setb %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_load_ule_imm16:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    cmpw $513, (%eax) # imm = 0x201
+; X86-ATOM-NEXT:    setb %al
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    nop
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_load_ule_imm16:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $513, (%rdi) # imm = 0x201
+; X64-ATOM-NEXT:    setb %al
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    nop
+; X64-ATOM-NEXT:    retq
+  %ld = load i16, ptr %p0
+  %cmp = icmp ule i16 %ld, 512
+  ret i1 %cmp
+}
+
+define i1 @cmp16_load_ule_imm16_minsize(ptr %p0) minsize {
+; X86-LABEL: cmp16_load_ule_imm16_minsize:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpw $513, (%eax) # imm = 0x201
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmp16_load_ule_imm16_minsize:
+; X64:       # %bb.0:
+; X64-NEXT:    cmpw $513, (%rdi) # imm = 0x201
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+  %ld = load i16, ptr %p0
+  %cmp = icmp ule i16 %ld, 512
+  ret i1 %cmp
+}
+
+define i1 @cmp16_load_ule_imm16_optsize(ptr %p0) optsize {
+; X86-GENERIC-LABEL: cmp16_load_ule_imm16_optsize:
+; X86-GENERIC:       # %bb.0:
+; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    movzwl (%eax), %eax
+; X86-GENERIC-NEXT:    cmpl $513, %eax # imm = 0x201
+; X86-GENERIC-NEXT:    setb %al
+; X86-GENERIC-NEXT:    retl
+;
+; X64-GENERIC-LABEL: cmp16_load_ule_imm16_optsize:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movzwl (%rdi), %eax
+; X64-GENERIC-NEXT:    cmpl $513, %eax # imm = 0x201
+; X64-GENERIC-NEXT:    setb %al
+; X64-GENERIC-NEXT:    retq
+;
+; X86-ATOM-LABEL: cmp16_load_ule_imm16_optsize:
+; X86-ATOM:       # %bb.0:
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-ATOM-NEXT:    cmpw $513, (%eax) # imm = 0x201
+; X86-ATOM-NEXT:    setb %al
+; X86-ATOM-NEXT:    retl
+;
+; X64-ATOM-LABEL: cmp16_load_ule_imm16_optsize:
+; X64-ATOM:       # %bb.0:
+; X64-ATOM-NEXT:    cmpw $513, (%rdi) # imm = 0x201
+; X64-ATOM-NEXT:    setb %al
+; X64-ATOM-NEXT:    retq
+  %ld = load i16, ptr %p0
+  %cmp = icmp ule i16 %ld, 512
+  ret i1 %cmp
+}
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 8d2bb77a9e1a..5d7bf4a2c978 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -80,13 +80,13 @@ define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
 define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
 ; SSE-LABEL: combine_vec_mul_pow2c:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    paddq %xmm0, %xmm2
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    psllq $4, %xmm2
 ; SSE-NEXT:    psllq $2, %xmm1
 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    paddq %xmm0, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_mul_pow2c:
@@ -399,14 +399,12 @@ define i64 @combine_mul_self_demandedbits(i64 %x) {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    imulq %rdi, %rax
-; SSE-NEXT:    andq $-3, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_mul_self_demandedbits:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movq %rdi, %rax
 ; AVX-NEXT:    imulq %rdi, %rax
-; AVX-NEXT:    andq $-3, %rax
 ; AVX-NEXT:    retq
   %1 = mul i64 %x, %x
   %2 = and i64 %1, -3
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
new file mode 100644
index 000000000000..175d21a4f706
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -0,0 +1,862 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse    | FileCheck %s -check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s -check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx  | FileCheck %s -check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s -check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v4 | FileCheck %s -check-prefixes=AVX,AVX512
+
+; Verify that each of the following test cases is folded into a single
+; instruction which performs a blend operation.
+
+define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: test1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test1:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+
+
+define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test2:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+
+define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: test3:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test3:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test3:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+
+
+define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test4:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test4:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test5:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test5:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test5:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test6:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test6:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test6:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test7:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test7:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test7:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    retq
+  %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
+  %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+
+
+define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: test8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    retq
+  %and1 = and <2 x i64> %a, <i64 -1, i64 0>
+  %and2 = and <2 x i64> %b, <i64 0, i64 -1>
+  %or = or <2 x i64> %and1, %and2
+  ret <2 x i64> %or
+}
+
+
+define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test9:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test9:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test9:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    retq
+  %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
+  %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+
+
+define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: test10:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test10:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test10:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    retq
+  %and1 = and <2 x i64> %a, <i64 0, i64 -1>
+  %and2 = and <2 x i64> %b, <i64 -1, i64 0>
+  %or = or <2 x i64> %and1, %and2
+  ret <2 x i64> %or
+}
+
+
+define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test11:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test11:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test11:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    retq
+  %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
+  %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test12:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test12:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test12:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    retq
+  %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
+  %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+
+
+; Verify that the following test cases are folded into single shuffles.
+
+define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: test13:
+; SSE:       # %bb.0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test13:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+
+define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: test14:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test14:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+
+
+define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: test15:
+; SSE:       # %bb.0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[2,1]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test15:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,1],xmm0[2,1]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+
+define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: test16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+
+
+; Verify that the dag-combiner does not fold a OR of two shuffles into a single
+; shuffle instruction when the shuffle indexes are not compatible.
+
+define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: test17:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psllq $32, %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test17:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX-NEXT:    vmovq {{.*#+}} xmm1 = xmm1[0],zero
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test18:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    xorps %xmm3, %xmm3
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,0,1,1]
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT:    orps %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test18:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm2, %xmm2
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; SSE4-NEXT:    por %xmm0, %xmm2
+; SSE4-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test18:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test18:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test18:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test19:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,2]
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test19:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,3]
+; SSE4-NEXT:    pxor %xmm3, %xmm3
+; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    por %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test19:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
+; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
+; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test19:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
+; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX2-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test19:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[12,13,14,15]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9,10,11,8,9,10,11]
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+
+define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: test20:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test20:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+
+
+define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: test21:
+; SSE:       # %bb.0:
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test21:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test21:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test21:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vorpd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX512-NEXT:    retq
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+
+
+; Verify that the dag-combiner keeps the correct domain for float/double vectors
+; bitcast to use the mask-or blend combine.
+
+define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
+; SSE2-LABEL: test22:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test22:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test22:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    retq
+  %bc1 = bitcast <2 x double> %a0 to <2 x i64>
+  %bc2 = bitcast <2 x double> %a1 to <2 x i64>
+  %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
+  %and2 = and <2 x i64> %bc2, <i64 -1, i64 0>
+  %or = or <2 x i64> %and1, %and2
+  %bc3 = bitcast <2 x i64> %or to <2 x double>
+  ret <2 x double> %bc3
+}
+
+
+define <4 x float> @test23(<4 x float> %a0, <4 x float> %a1) {
+; SSE2-LABEL: test23:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test23:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test23:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX-NEXT:    retq
+  %bc1 = bitcast <4 x float> %a0 to <4 x i32>
+  %bc2 = bitcast <4 x float> %a1 to <4 x i32>
+  %and1 = and <4 x i32> %bc1, <i32 0, i32 -1, i32 -1, i32 0>
+  %and2 = and <4 x i32> %bc2, <i32 -1, i32 0, i32 0, i32 -1>
+  %or = or <4 x i32> %and1, %and2
+  %bc3 = bitcast <4 x i32> %or to <4 x float>
+  ret <4 x float> %bc3
+}
+
+
+define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
+; SSE2-LABEL: test24:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test24:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test24:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    retq
+  %bc1 = bitcast <4 x float> %a0 to <2 x i64>
+  %bc2 = bitcast <4 x float> %a1 to <2 x i64>
+  %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
+  %and2 = and <2 x i64> %bc2, <i64 -1, i64 0>
+  %or = or <2 x i64> %and1, %and2
+  %bc3 = bitcast <2 x i64> %or to <4 x float>
+  ret <4 x float> %bc3
+}
+
+
+define <4 x float> @test25(<4 x float> %a0) {
+; SSE2-LABEL: test25:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],mem[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test25:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test25:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test25:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test25:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX512-NEXT:    retq
+  %bc1 = bitcast <4 x float> %a0 to <4 x i32>
+  %bc2 = bitcast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0> to <4 x i32>
+  %and1 = and <4 x i32> %bc1, <i32 0, i32 -1, i32 -1, i32 0>
+  %and2 = and <4 x i32> %bc2, <i32 -1, i32 0, i32 0, i32 -1>
+  %or = or <4 x i32> %and1, %and2
+  %bc3 = bitcast <4 x i32> %or to <4 x float>
+  ret <4 x float> %bc3
+}
+
+
+; Verify that the DAGCombiner doesn't crash in the attempt to check if a shuffle
+; with illegal type has a legal mask. Method 'isShuffleMaskLegal' only knows how to
+; handle legal vector value types.
+define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
+; SSE2-LABEL: test_crash:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT:    andps %xmm2, %xmm1
+; SSE2-NEXT:    andnps %xmm0, %xmm2
+; SSE2-NEXT:    orps %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_crash:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_crash:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %or = or <4 x i8> %shuf1, %shuf2
+  ret <4 x i8> %or
+}
+
+; Verify that we can fold regardless of which operand is the zeroinitializer
+
+define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test2b:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test2b:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test2b:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test2c:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test2c:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test2c:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
+  %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test2d:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test2d:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test2d:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+; Make sure we can have an undef where an index pointing to the zero vector should be
+
+define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test2e:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test2e:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test2e:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 4, i32 2, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test2f:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test2f:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test2f:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 1, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+; (or (and X, c1), c2) -> (and (or X, c2), c1|c2) iff (c1 & c2) != 0
+
+define <2 x i64> @or_and_v2i64(<2 x i64> %a0) {
+; SSE-LABEL: or_and_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: or_and_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: or_and_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: or_and_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [7,7]
+; AVX512-NEXT:    vpternlogq $200, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %1 = and <2 x i64> %a0, <i64 7, i64 7>
+  %2 = or <2 x i64> %1, <i64 3, i64 3>
+  ret <2 x i64> %2
+}
+
+define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
+; SSE-LABEL: or_and_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: or_and_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: or_and_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: or_and_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,3,15,7]
+; AVX512-NEXT:    vpternlogd $200, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>
+  %2 = or <4 x i32> %1, <i32 3, i32 2, i32 15, i32 2>
+  ret <4 x i32> %2
+}
+
+; If all masked bits are going to be set, that's a constant fold.
+
+define <4 x i32> @or_and_v4i32_fold(<4 x i32> %a0) {
+; SSE-LABEL: or_and_v4i32_fold:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [3,3,3,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: or_and_v4i32_fold:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
+; AVX-NEXT:    retq
+  %1 = and <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
+  %2 = or <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %2
+}
diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll
index 5188de543f72..3b2102f46a29 100644
--- a/llvm/test/CodeGen/X86/combine-or.ll
+++ b/llvm/test/CodeGen/X86/combine-or.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefixes=CHECK,CHECK-LV
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -early-live-intervals | FileCheck %s -check-prefixes=CHECK,CHECK-LIS
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -early-live-intervals | FileCheck %s -check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v3 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2
 
 define i32 @or_self(i32 %x) {
 ; CHECK-LABEL: or_self:
@@ -19,472 +21,34 @@ define <4 x i32> @or_self_vec(<4 x i32> %x) {
   ret <4 x i32> %or
 }
 
-; Verify that each of the following test cases is folded into a single
-; instruction which performs a blend operation.
-
-define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
-  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
-  %or = or <2 x i64> %shuf1, %shuf2
-  ret <2 x i64> %or
-}
-
-
-define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-
-define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
-  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
-  %or = or <2 x i64> %shuf1, %shuf2
-  ret <2 x i64> %or
-}
-
-
-define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test4:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test7:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; CHECK-NEXT:    retq
-  %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
-  %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
-  %or = or <4 x i32> %and1, %and2
-  ret <4 x i32> %or
-}
-
-
-define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; CHECK-NEXT:    retq
-  %and1 = and <2 x i64> %a, <i64 -1, i64 0>
-  %and2 = and <2 x i64> %b, <i64 0, i64 -1>
-  %or = or <2 x i64> %and1, %and2
-  ret <2 x i64> %or
-}
-
-
-define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test9:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    retq
-  %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
-  %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
-  %or = or <4 x i32> %and1, %and2
-  ret <4 x i32> %or
-}
-
-
-define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test10:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    retq
-  %and1 = and <2 x i64> %a, <i64 0, i64 -1>
-  %and2 = and <2 x i64> %b, <i64 -1, i64 0>
-  %or = or <2 x i64> %and1, %and2
-  ret <2 x i64> %or
-}
-
-
-define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test11:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; CHECK-NEXT:    retq
-  %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
-  %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
-  %or = or <4 x i32> %and1, %and2
-  ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test12:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; CHECK-NEXT:    retq
-  %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
-  %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
-  %or = or <4 x i32> %and1, %and2
-  ret <4 x i32> %or
-}
-
-
-; Verify that the following test cases are folded into single shuffles.
-
-define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test13:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-
-define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test14:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
-  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
-  %or = or <2 x i64> %shuf1, %shuf2
-  ret <2 x i64> %or
-}
-
-
-define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test15:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[2,1]
-; CHECK-NEXT:    movaps %xmm1, %xmm0
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-
-define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT:    movaps %xmm1, %xmm0
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
-  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
-  %or = or <2 x i64> %shuf1, %shuf2
-  ret <2 x i64> %or
-}
-
-
-; Verify that the dag-combiner does not fold a OR of two shuffles into a single
-; shuffle instruction when the shuffle indexes are not compatible.
-
-define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test17:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    psllq $32, %xmm0
-; CHECK-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
-; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test18:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pxor %xmm2, %xmm2
-; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; CHECK-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
-; CHECK-NEXT:    por %xmm0, %xmm2
-; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test19:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,3]
-; CHECK-NEXT:    pxor %xmm3, %xmm3
-; CHECK-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
-; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
-; CHECK-NEXT:    por %xmm2, %xmm0
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-
-define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test20:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
-  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
-  %or = or <2 x i64> %shuf1, %shuf2
-  ret <2 x i64> %or
-}
-
-
-define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test21:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
-  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
-  %or = or <2 x i64> %shuf1, %shuf2
-  ret <2 x i64> %or
-}
-
-
-; Verify that the dag-combiner keeps the correct domain for float/double vectors
-; bitcast to use the mask-or blend combine.
-
-define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test22:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    retq
-  %bc1 = bitcast <2 x double> %a0 to <2 x i64>
-  %bc2 = bitcast <2 x double> %a1 to <2 x i64>
-  %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
-  %and2 = and <2 x i64> %bc2, <i64 -1, i64 0>
-  %or = or <2 x i64> %and1, %and2
-  %bc3 = bitcast <2 x i64> %or to <2 x double>
-  ret <2 x double> %bc3
-}
-
-
-define <4 x float> @test23(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test23:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
-; CHECK-NEXT:    retq
-  %bc1 = bitcast <4 x float> %a0 to <4 x i32>
-  %bc2 = bitcast <4 x float> %a1 to <4 x i32>
-  %and1 = and <4 x i32> %bc1, <i32 0, i32 -1, i32 -1, i32 0>
-  %and2 = and <4 x i32> %bc2, <i32 -1, i32 0, i32 0, i32 -1>
-  %or = or <4 x i32> %and1, %and2
-  %bc3 = bitcast <4 x i32> %or to <4 x float>
-  ret <4 x float> %bc3
-}
-
-
-define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test24:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    retq
-  %bc1 = bitcast <4 x float> %a0 to <2 x i64>
-  %bc2 = bitcast <4 x float> %a1 to <2 x i64>
-  %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
-  %and2 = and <2 x i64> %bc2, <i64 -1, i64 0>
-  %or = or <2 x i64> %and1, %and2
-  %bc3 = bitcast <2 x i64> %or to <4 x float>
-  ret <4 x float> %bc3
-}
-
-
-define <4 x float> @test25(<4 x float> %a0) {
-; CHECK-LABEL: test25:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3]
-; CHECK-NEXT:    retq
-  %bc1 = bitcast <4 x float> %a0 to <4 x i32>
-  %bc2 = bitcast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0> to <4 x i32>
-  %and1 = and <4 x i32> %bc1, <i32 0, i32 -1, i32 -1, i32 0>
-  %and2 = and <4 x i32> %bc2, <i32 -1, i32 0, i32 0, i32 -1>
-  %or = or <4 x i32> %and1, %and2
-  %bc3 = bitcast <4 x i32> %or to <4 x float>
-  ret <4 x float> %bc3
-}
-
-
-; Verify that the DAGCombiner doesn't crash in the attempt to check if a shuffle
-; with illegal type has a legal mask. Method 'isShuffleMaskLegal' only knows how to
-; handle legal vector value types.
-define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
-; CHECK-LABEL: test_crash:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
-  %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
-  %or = or <4 x i8> %shuf1, %shuf2
-  ret <4 x i8> %or
-}
-
-; Verify that we can fold regardless of which operand is the zeroinitializer
-
-define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test2b:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test2c:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
-  %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test2d:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
-  %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-; Make sure we can have an undef where an index pointing to the zero vector should be
-
-define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test2e:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 4, i32 2, i32 3>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 0, i32 1, i32 4, i32 4>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test2f:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    retq
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 4, i32 4, i32 2, i32 3>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 1, i32 4, i32 4>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-
-; (or (and X, c1), c2) -> (and (or X, c2), c1|c2) iff (c1 & c2) != 0
-
-define <2 x i64> @or_and_v2i64(<2 x i64> %a0) {
-; CHECK-LABEL: or_and_v2i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    retq
-  %1 = and <2 x i64> %a0, <i64 7, i64 7>
-  %2 = or <2 x i64> %1, <i64 3, i64 3>
-  ret <2 x i64> %2
-}
-
-define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
-; CHECK-LABEL: or_and_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    retq
-  %1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>
-  %2 = or <4 x i32> %1, <i32 3, i32 2, i32 15, i32 2>
-  ret <4 x i32> %2
-}
-
-; If all masked bits are going to be set, that's a constant fold.
-
-define <4 x i32> @or_and_v4i32_fold(<4 x i32> %a0) {
-; CHECK-LABEL: or_and_v4i32_fold:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [3,3,3,3]
-; CHECK-NEXT:    retq
-  %1 = and <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
-  %2 = or <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
-  ret <4 x i32> %2
-}
-
 ; fold (or x, c) -> c iff (x & ~c) == 0
 
 define <2 x i64> @or_zext_v2i32(<2 x i32> %a0) {
-; CHECK-LABEL: or_zext_v2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [4294967295,4294967295]
-; CHECK-NEXT:    retq
+; SSE-LABEL: or_zext_v2i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [4294967295,4294967295]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: or_zext_v2i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
+; AVX-NEXT:    # xmm0 = mem[0,0]
+; AVX-NEXT:    retq
   %1 = zext <2 x i32> %a0 to <2 x i64>
   %2 = or <2 x i64> %1, <i64 4294967295, i64 4294967295>
   ret <2 x i64> %2
 }
 
 define <4 x i32> @or_zext_v4i16(<4 x i16> %a0) {
-; CHECK-LABEL: or_zext_v4i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [65535,65535,65535,65535]
-; CHECK-NEXT:    retq
+; SSE-LABEL: or_zext_v4i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [65535,65535,65535,65535]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: or_zext_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [65535,65535,65535,65535]
+; AVX-NEXT:    retq
   %1 = zext <4 x i16> %a0 to <4 x i32>
   %2 = or <4 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535>
   ret <4 x i32> %2
@@ -522,12 +86,19 @@ define i64 @or_and_and_commute_i64(i64 %x, i64 %y) {
 }
 
 define <4 x i32> @or_and_and_v4i32(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-LABEL: or_and_and_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    orps %xmm1, %xmm0
-; CHECK-NEXT:    retq
+; SSE-LABEL: or_and_and_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    orps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: or_and_and_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %xy = or <4 x i32> %x, %y
   %mx = and <4 x i32> %x, <i32 2, i32 4, i32 8, i32 16>
   %mxy = and <4 x i32> %xy, <i32 1, i32 -1, i32 -5, i32 -25>
@@ -611,7 +182,106 @@ define i32 @or_and_multiuse_and_multiuse_i32(i32 %x, i32 %y) nounwind {
   ret i32 %r
 }
 
+define i64 @or_build_pair_not(i32 %a0, i32 %a1) {
+; CHECK-LABEL: or_build_pair_not:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT:    shlq $32, %rsi
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    orq %rsi, %rax
+; CHECK-NEXT:    notq %rax
+; CHECK-NEXT:    retq
+  %n0 = xor i32 %a0, -1
+  %n1 = xor i32 %a1, -1
+  %x0 = zext i32 %n0 to i64
+  %x1 = zext i32 %n1 to i64
+  %hi = shl i64 %x1, 32
+  %r = or i64 %hi, %x0
+  ret i64 %r
+}
+
+define i64 @PR89533(<64 x i8> %a0) {
+; SSE-LABEL: PR89533:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    xorl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm1
+; SSE-NEXT:    pmovmskb %xmm1, %ecx
+; SSE-NEXT:    notl %ecx
+; SSE-NEXT:    shll $16, %ecx
+; SSE-NEXT:    orl %eax, %ecx
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm2
+; SSE-NEXT:    pmovmskb %xmm2, %edx
+; SSE-NEXT:    xorl $65535, %edx # imm = 0xFFFF
+; SSE-NEXT:    pcmpeqb %xmm4, %xmm3
+; SSE-NEXT:    pmovmskb %xmm3, %eax
+; SSE-NEXT:    notl %eax
+; SSE-NEXT:    shll $16, %eax
+; SSE-NEXT:    orl %edx, %eax
+; SSE-NEXT:    shlq $32, %rax
+; SSE-NEXT:    orq %rcx, %rax
+; SSE-NEXT:    je .LBB11_2
+; SSE-NEXT:  # %bb.1: # %cond.false
+; SSE-NEXT:    rep bsfq %rax, %rax
+; SSE-NEXT:    retq
+; SSE-NEXT:  .LBB11_2: # %cond.end
+; SSE-NEXT:    movl $64, %eax
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: PR89533:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
+; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpmovmskb %xmm3, %eax
+; AVX1-NEXT:    xorl $65535, %eax # imm = 0xFFFF
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX1-NEXT:    notl %ecx
+; AVX1-NEXT:    shll $16, %ecx
+; AVX1-NEXT:    orl %eax, %ecx
+; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %edx
+; AVX1-NEXT:    xorl $65535, %edx # imm = 0xFFFF
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    shll $16, %eax
+; AVX1-NEXT:    orl %edx, %eax
+; AVX1-NEXT:    shlq $32, %rax
+; AVX1-NEXT:    orq %rcx, %rax
+; AVX1-NEXT:    je .LBB11_2
+; AVX1-NEXT:  # %bb.1: # %cond.false
+; AVX1-NEXT:    rep bsfq %rax, %rax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB11_2: # %cond.end
+; AVX1-NEXT:    movl $64, %eax
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: PR89533:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
+; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    shlq $32, %rcx
+; AVX2-NEXT:    orq %rax, %rcx
+; AVX2-NEXT:    notq %rcx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    tzcntq %rcx, %rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %cmp = icmp ne <64 x i8> %a0, <i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95>
+  %mask = bitcast <64 x i1> %cmp to i64
+  %tz = tail call i64 @llvm.cttz.i64(i64 %mask, i1 false)
+  ret i64 %tz
+}
+
 declare void @use_i32(i32)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-LIS: {{.*}}
-; CHECK-LV: {{.*}}
+
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index e12ca56023a7..33cc8e96f663 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -182,101 +182,101 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    sarl $31, %ebx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    xorl %ecx, %esi
 ; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    subl %eax, %esi
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    subl %edi, %ebp
-; X86-NEXT:    sbbl %edi, %ebx
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    subl %ebx, %ebp
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %edi, %ecx
+; X86-NEXT:    bsrl %ebx, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    addl $32, %ecx
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    bsrl %edi, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    bsrl %ebp, %ebp
 ; X86-NEXT:    xorl $31, %ebp
 ; X86-NEXT:    addl $32, %ebp
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %edx, %ebp
 ; X86-NEXT:    addl $64, %ebp
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    orl %esi, %ebx
 ; X86-NEXT:    cmovnel %ecx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    bsrl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    bsrl %eax, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    addl $32, %ecx
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    bsrl %edi, %esi
 ; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    bsrl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    addl $32, %edx
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    addl $64, %edx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
 ; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    subl %edx, %ebp
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $0, %eax
@@ -284,40 +284,40 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl $127, %ecx
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpl %ebp, %ecx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    cmovnel %esi, %edx
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    cmovnel %esi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    cmovnel %esi, %eax
-; X86-NEXT:    cmovel (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    jne .LBB4_8
 ; X86-NEXT:  # %bb.1: # %_udiv-special-cases
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    xorl $127, %ebx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    xorl $127, %edi
+; X86-NEXT:    orl %ebp, %edi
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl %edi, %ecx
 ; X86-NEXT:    je .LBB4_8
 ; X86-NEXT:  # %bb.2: # %udiv-bb1
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -332,234 +332,233 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $15, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %ebx
-; X86-NEXT:    movl 144(%esp,%ebx), %edx
-; X86-NEXT:    movl 148(%esp,%ebx), %edi
+; X86-NEXT:    movsbl %al, %edi
+; X86-NEXT:    movl 144(%esp,%edi), %edx
+; X86-NEXT:    movl 148(%esp,%edi), %esi
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %edx, %edi
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 140(%esp,%ebx), %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    shrl %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    movl 136(%esp,%ebx), %esi
+; X86-NEXT:    movl 140(%esp,%edi), %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    shrl %ebx
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    movl 136(%esp,%edi), %edx
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %esi, %eax
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    jae .LBB4_3
 ; X86-NEXT:  # %bb.6:
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB4_7
 ; X86-NEXT:  .LBB4_3: # %udiv-preheader
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movb %bl, %ch
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movb %dl, %ch
 ; X86-NEXT:    andb $7, %ch
-; X86-NEXT:    movb %bl, %cl
+; X86-NEXT:    movb %dl, %cl
 ; X86-NEXT:    shrb $3, %cl
 ; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movzbl %cl, %ebp
-; X86-NEXT:    movl 100(%esp,%ebp), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 96(%esp,%ebp), %ebx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %esi, %edx
-; X86-NEXT:    movl 88(%esp,%ebp), %ebp
-; X86-NEXT:    movl 92(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    notb %cl
-; X86-NEXT:    addl %ebx, %ebx
-; X86-NEXT:    shll %cl, %ebx
-; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    movzbl %cl, %edx
+; X86-NEXT:    movl 100(%esp,%edx), %esi
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl 96(%esp,%edx), %edi
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    shrdl %cl, %esi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 88(%esp,%edx), %ebx
+; X86-NEXT:    movl 92(%esp,%edx), %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    addl %edi, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    addl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB4_4: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ebx
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    shldl $1, %ebp, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebp, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    shldl $1, %ebp, %edx
-; X86-NEXT:    shldl $1, %edi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    shldl $1, %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shldl $1, %eax, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    orl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    sbbl %ebp, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl %edi, %esi
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    subl %ecx, %ebp
-; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    subl %ecx, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %eax
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl %edi, %ecx
 ; X86-NEXT:    jne .LBB4_4
 ; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %edx, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    shldl $1, %eax, %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    shldl $1, %ebx, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    shldl $1, %eax, %ebx
+; X86-NEXT:    orl %ecx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %esi, %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    addl %esi, %esi
-; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %edi, %esi
 ; X86-NEXT:  .LBB4_8: # %udiv-end
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    xorl %ecx, %edi
 ; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    xorl %ecx, %ebx
 ; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    xorl %ecx, %esi
 ; X86-NEXT:    subl %ecx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %ecx, %ebx
 ; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %esi, (%ebp)
-; X86-NEXT:    movl %eax, 4(%ebp)
-; X86-NEXT:    movl %edx, 8(%ebp)
-; X86-NEXT:    movl %edi, 12(%ebp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %esi, (%ecx)
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    movl %ebx, 8(%ecx)
+; X86-NEXT:    movl %edx, 12(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    imull %esi, %ecx
@@ -568,12 +567,12 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    subl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll
index 135494ac25f8..b2614c5fe049 100644
--- a/llvm/test/CodeGen/X86/fold-masked-merge.ll
+++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll
@@ -56,9 +56,7 @@ define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
 ; NOBMI-LABEL: masked_merge2:
 ; NOBMI:       # %bb.0:
 ; NOBMI-NEXT:    movl %esi, %eax
-; NOBMI-NEXT:    xorb %sil, %al
-; NOBMI-NEXT:    andb %dil, %al
-; NOBMI-NEXT:    xorb %sil, %al
+; NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: masked_merge2:
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index b212e9438e1b..c79da37988e4 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -459,8 +459,7 @@ define i32 @freeze_ashr(i32 %a0) nounwind {
 ; X64-LABEL: freeze_ashr:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    sarl $3, %eax
-; X64-NEXT:    sarl $3, %eax
+; X64-NEXT:    sarl $6, %eax
 ; X64-NEXT:    retq
   %x = ashr i32 %a0, 3
   %y = freeze i32 %x
@@ -531,30 +530,12 @@ define i32 @freeze_ashr_outofrange(i32 %a0) nounwind {
 define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
 ; X86-LABEL: freeze_ashr_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    psraw $1, %xmm2
-; X86-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    pandn %xmm2, %xmm3
-; X86-NEXT:    psraw $3, %xmm0
-; X86-NEXT:    pand %xmm1, %xmm0
-; X86-NEXT:    por %xmm3, %xmm0
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    psraw $3, %xmm2
-; X86-NEXT:    psraw $1, %xmm0
-; X86-NEXT:    pand %xmm1, %xmm0
-; X86-NEXT:    pandn %xmm2, %xmm1
-; X86-NEXT:    por %xmm1, %xmm0
+; X86-NEXT:    psraw $4, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_ashr_vec:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpsraw $1, %xmm0, %xmm1
-; X64-NEXT:    vpsraw $3, %xmm0, %xmm0
-; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X64-NEXT:    vpsraw $3, %xmm0, %xmm1
-; X64-NEXT:    vpsraw $1, %xmm0, %xmm0
-; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT:    vpsraw $4, %xmm0, %xmm0
 ; X64-NEXT:    retq
   %x = ashr <8 x i16> %a0, <i16 3, i16 1, i16 3, i16 1, i16 3, i16 1, i16 3, i16 1>
   %y = freeze <8 x i16> %x
@@ -592,8 +573,7 @@ define i32 @freeze_lshr(i32 %a0) nounwind {
 ; X64-LABEL: freeze_lshr:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl $2, %eax
-; X64-NEXT:    shrl %eax
+; X64-NEXT:    shrl $3, %eax
 ; X64-NEXT:    retq
   %x = lshr i32 %a0, 2
   %y = freeze i32 %x
@@ -664,30 +644,12 @@ define i32 @freeze_lshr_outofrange(i32 %a0) nounwind {
 define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
 ; X86-LABEL: freeze_lshr_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    psrlw $1, %xmm2
-; X86-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    pandn %xmm2, %xmm3
-; X86-NEXT:    psrlw $2, %xmm0
-; X86-NEXT:    pand %xmm1, %xmm0
-; X86-NEXT:    por %xmm3, %xmm0
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    psrlw $2, %xmm2
-; X86-NEXT:    psrlw $1, %xmm0
-; X86-NEXT:    pand %xmm1, %xmm0
-; X86-NEXT:    pandn %xmm2, %xmm1
-; X86-NEXT:    por %xmm1, %xmm0
+; X86-NEXT:    psrlw $3, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_lshr_vec:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpsrlw $1, %xmm0, %xmm1
-; X64-NEXT:    vpsrlw $2, %xmm0, %xmm0
-; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X64-NEXT:    vpsrlw $2, %xmm0, %xmm1
-; X64-NEXT:    vpsrlw $1, %xmm0, %xmm0
-; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; X64-NEXT:    retq
   %x = lshr <8 x i16> %a0, <i16 2, i16 1, i16 2, i16 1, i16 2, i16 1, i16 2, i16 1>
   %y = freeze <8 x i16> %x
diff --git a/llvm/test/CodeGen/X86/freeze-combine.ll b/llvm/test/CodeGen/X86/freeze-combine.ll
index b037a6d9a1b9..1cfb8627a4dd 100644
--- a/llvm/test/CodeGen/X86/freeze-combine.ll
+++ b/llvm/test/CodeGen/X86/freeze-combine.ll
@@ -3,9 +3,9 @@
 define i32 @const() {
   ; CHECK-LABEL: name: const
   ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK:   [[MOV32ri:%[0-9]+]]:gr32 = MOV32ri 1
-  ; CHECK:   $eax = COPY [[MOV32ri]]
-  ; CHECK:   RET 0, $eax
+  ; CHECK-NEXT:   [[MOV32ri:%[0-9]+]]:gr32 = MOV32ri 1
+  ; CHECK-NEXT:   $eax = COPY [[MOV32ri]]
+  ; CHECK-NEXT:   RET 0, $eax
   %y = freeze i32 1
   ret i32 %y
 }
@@ -13,11 +13,11 @@ define i32 @const() {
 define i32 @fold(i32 %x) {
   ; CHECK-LABEL: name: fold
   ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK:   liveins: $edi
-  ; CHECK:   [[COPY:%[0-9]+]]:gr32 = COPY $edi
-  ; CHECK:   [[COPY1:%[0-9]+]]:gr32 = COPY [[COPY]]
-  ; CHECK:   $eax = COPY [[COPY1]]
-  ; CHECK:   RET 0, $eax
+  ; CHECK-NEXT:   liveins: $edi
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr32 = COPY $edi
+  ; CHECK-NEXT:   $eax = COPY [[COPY]]
+  ; CHECK-NEXT:   RET 0, $eax
   %y = freeze i32 %x
   %z = freeze i32 %y
   ret i32 %z
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index ee7f4aea02c0..fe240286462e 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -672,3 +672,23 @@ define void @pr59677(i32 %x, ptr %out) nounwind {
   ret void
 }
 declare <4 x float> @llvm.sin.v4f32(<4 x float>)
+
+; Test that we can eliminate freeze by changing the BUILD_VECTOR to a splat
+; zero vector.
+define void @freeze_buildvector_not_simple_type(ptr %dst) nounwind {
+; X86-LABEL: freeze_buildvector_not_simple_type:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb $0, 4(%eax)
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    retl
+;
+; X64-LABEL: freeze_buildvector_not_simple_type:
+; X64:       # %bb.0:
+; X64-NEXT:    movb $0, 4(%rdi)
+; X64-NEXT:    movl $0, (%rdi)
+; X64-NEXT:    retq
+  %i0 = freeze <5 x i8> <i8 poison, i8 0, i8 0, i8 undef, i8 0>
+  store <5 x i8> %i0, ptr %dst
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index 0c341dc63a9e..afe0ebb9dcb4 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -522,17 +522,17 @@ declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
 define <16 x i8> @splatconstant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshr_v16i8:
 ; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    psrlw $7, %xmm1
 ; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
 ; GFNISSE-NEXT:    por %xmm1, %xmm0
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1OR2-LABEL: splatconstant_fshr_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
+; GFNIAVX1OR2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    vpsrlw $7, %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
index 7ab8300b269a..96aff5b2af31 100644
--- a/llvm/test/CodeGen/X86/gfni-rotates.ll
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -421,18 +421,18 @@ define <16 x i8> @splatconstant_rotr_v16i8(<16 x i8> %a) nounwind {
 ; GFNISSE-LABEL: splatconstant_rotr_v16i8:
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm1
-; GFNISSE-NEXT:    psrlw $7, %xmm1
-; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    paddb %xmm0, %xmm1
+; GFNISSE-NEXT:    psrlw $7, %xmm0
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; GFNISSE-NEXT:    por %xmm1, %xmm0
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1OR2-LABEL: splatconstant_rotr_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
-; GFNIAVX1OR2-NEXT:    vpsrlw $7, %xmm0, %xmm1
-; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
+; GFNIAVX1OR2-NEXT:    vpsrlw $7, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; GFNIAVX1OR2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_rotr_v16i8:
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index 39d02f9112f4..2f780e3c6fe1 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -676,12 +676,13 @@ define i32 @rotr_known_nonzero(i32 %xx, i32 %y) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
 ; X64-NEXT:    orl $256, %edi # imm = 0x100
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    rorl %cl, %edi
+; X64-NEXT:    rorl %cl, %eax
 ; X64-NEXT:    testl %edi, %edi
 ; X64-NEXT:    je .LBB22_1
 ; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %edi, %eax
+; X64-NEXT:    rep bsfl %eax, %eax
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB22_1:
 ; X64-NEXT:    movl $32, %eax
@@ -713,12 +714,13 @@ define i32 @rotr_maybe_zero(i32 %x, i32 %y) {
 ; X64-LABEL: rotr_maybe_zero:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    rorl %cl, %edi
+; X64-NEXT:    rorl %cl, %eax
 ; X64-NEXT:    testl %edi, %edi
 ; X64-NEXT:    je .LBB23_1
 ; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %edi, %eax
+; X64-NEXT:    rep bsfl %eax, %eax
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB23_1:
 ; X64-NEXT:    movl $32, %eax
@@ -773,12 +775,13 @@ define i32 @rotr_with_fshr_maybe_zero(i32 %x, i32 %y) {
 ; X64-LABEL: rotr_with_fshr_maybe_zero:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    rorl %cl, %edi
+; X64-NEXT:    rorl %cl, %eax
 ; X64-NEXT:    testl %edi, %edi
 ; X64-NEXT:    je .LBB25_1
 ; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %edi, %eax
+; X64-NEXT:    rep bsfl %eax, %eax
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB25_1:
 ; X64-NEXT:    movl $32, %eax
@@ -808,12 +811,13 @@ define i32 @rotl_known_nonzero(i32 %xx, i32 %y) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
 ; X64-NEXT:    orl $256, %edi # imm = 0x100
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    roll %cl, %edi
+; X64-NEXT:    roll %cl, %eax
 ; X64-NEXT:    testl %edi, %edi
 ; X64-NEXT:    je .LBB26_1
 ; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %edi, %eax
+; X64-NEXT:    rep bsfl %eax, %eax
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB26_1:
 ; X64-NEXT:    movl $32, %eax
@@ -845,12 +849,13 @@ define i32 @rotl_maybe_zero(i32 %x, i32 %y) {
 ; X64-LABEL: rotl_maybe_zero:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    roll %cl, %edi
+; X64-NEXT:    roll %cl, %eax
 ; X64-NEXT:    testl %edi, %edi
 ; X64-NEXT:    je .LBB27_1
 ; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %edi, %eax
+; X64-NEXT:    rep bsfl %eax, %eax
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB27_1:
 ; X64-NEXT:    movl $32, %eax
@@ -905,12 +910,13 @@ define i32 @rotl_with_fshl_maybe_zero(i32 %x, i32 %y) {
 ; X64-LABEL: rotl_with_fshl_maybe_zero:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    roll %cl, %edi
+; X64-NEXT:    roll %cl, %eax
 ; X64-NEXT:    testl %edi, %edi
 ; X64-NEXT:    je .LBB29_1
 ; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %edi, %eax
+; X64-NEXT:    rep bsfl %eax, %eax
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB29_1:
 ; X64-NEXT:    movl $32, %eax
diff --git a/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
index ae1320f8b086..200a8184d4bd 100644
--- a/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
 
 ; This tests codegen time inlining/optimization of memcmp
 ; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-minsize.ll b/llvm/test/CodeGen/X86/memcmp-minsize.ll
index 544d1c49f26b..9c20f3e0cdef 100644
--- a/llvm/test/CodeGen/X86/memcmp-minsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-minsize.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
 
 ; This tests codegen time inlining/optimization of memcmp
 ; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
index 762691151f4b..3db6ae8b76b2 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
 
 ; This tests codegen time inlining/optimization of memcmp
 ; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll
index c0c7b98d471c..edd61641ad2a 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
 
 ; This tests codegen time inlining/optimization of memcmp
 ; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
index cb45fd3ebb90..1c301da26bea 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
 
 ; This tests codegen time inlining/optimization of memcmp
 ; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
index 720344a22e43..1ee3317b9c96 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
 
 ; This tests codegen time inlining/optimization of memcmp
 ; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index d3cced3233ea..5a6375e08bca 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -893,27 +893,26 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm6
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE41-NEXT:    pand %xmm5, %xmm0
-; SSE41-NEXT:    por %xmm4, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    psubq %xmm1, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm3 = [1,1]
+; SSE41-NEXT:    por %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    psubq %xmm1, %xmm4
 ; SSE41-NEXT:    psubq %xmm2, %xmm1
-; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    psrlq $1, %xmm0
 ; SSE41-NEXT:    psrlq $33, %xmm1
-; SSE41-NEXT:    pmuludq %xmm6, %xmm1
-; SSE41-NEXT:    movdqa %xmm6, %xmm3
-; SSE41-NEXT:    psrlq $32, %xmm3
-; SSE41-NEXT:    pmuludq %xmm0, %xmm3
-; SSE41-NEXT:    paddq %xmm1, %xmm3
-; SSE41-NEXT:    psllq $32, %xmm3
-; SSE41-NEXT:    pmuludq %xmm6, %xmm0
+; SSE41-NEXT:    pmuludq %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psrlq $32, %xmm4
+; SSE41-NEXT:    pmuludq %xmm0, %xmm4
+; SSE41-NEXT:    paddq %xmm1, %xmm4
+; SSE41-NEXT:    psllq $32, %xmm4
+; SSE41-NEXT:    pmuludq %xmm3, %xmm0
 ; SSE41-NEXT:    paddq %xmm2, %xmm0
-; SSE41-NEXT:    paddq %xmm3, %xmm0
+; SSE41-NEXT:    paddq %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vec128_i64_signed_reg_reg:
@@ -1077,27 +1076,26 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; SSE41-NEXT:    pcmpeqd %xmm3, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT:    por %xmm3, %xmm6
-; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE41-NEXT:    pand %xmm5, %xmm0
-; SSE41-NEXT:    por %xmm4, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    psubq %xmm1, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT:    por %xmm3, %xmm0
+; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm3 = [1,1]
+; SSE41-NEXT:    por %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm4
+; SSE41-NEXT:    psubq %xmm1, %xmm4
 ; SSE41-NEXT:    psubq %xmm2, %xmm1
-; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    psrlq $1, %xmm0
 ; SSE41-NEXT:    psrlq $33, %xmm1
-; SSE41-NEXT:    pmuludq %xmm6, %xmm1
-; SSE41-NEXT:    movdqa %xmm6, %xmm3
-; SSE41-NEXT:    psrlq $32, %xmm3
-; SSE41-NEXT:    pmuludq %xmm0, %xmm3
-; SSE41-NEXT:    paddq %xmm1, %xmm3
-; SSE41-NEXT:    psllq $32, %xmm3
-; SSE41-NEXT:    pmuludq %xmm6, %xmm0
+; SSE41-NEXT:    pmuludq %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    psrlq $32, %xmm4
+; SSE41-NEXT:    pmuludq %xmm0, %xmm4
+; SSE41-NEXT:    paddq %xmm1, %xmm4
+; SSE41-NEXT:    psllq $32, %xmm4
+; SSE41-NEXT:    pmuludq %xmm3, %xmm0
 ; SSE41-NEXT:    paddq %xmm2, %xmm0
-; SSE41-NEXT:    paddq %xmm3, %xmm0
+; SSE41-NEXT:    paddq %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: vec128_i64_unsigned_reg_reg:
@@ -1993,14 +1991,14 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun
 ;
 ; AVX512VL-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg:
 ; AVX512VL-FALLBACK:       # %bb.0:
-; AVX512VL-FALLBACK-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT:    vpternlogq $15, %xmm2, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm2
+; AVX512VL-FALLBACK-NEXT:    vpminuw %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT:    vpternlogq $15, %xmm1, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT:    vpxor %xmm1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
 ;
@@ -2786,14 +2784,14 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
 ;
 ; AVX512VL-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg:
 ; AVX512VL-FALLBACK:       # %bb.0:
-; AVX512VL-FALLBACK-NEXT:    vpminub %xmm1, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT:    vpternlogq $15, %xmm2, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT:    vpmaxub %xmm1, %xmm0, %xmm2
+; AVX512VL-FALLBACK-NEXT:    vpminub %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT:    vpternlogq $15, %xmm1, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
+; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index cc08396ae8c7..e880a1acc9e8 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -1445,14 +1445,14 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
 ;
 ; AVX512VL-FALLBACK-LABEL: vec256_i16_unsigned_reg_reg:
 ; AVX512VL-FALLBACK:       # %bb.0:
-; AVX512VL-FALLBACK-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpternlogq $15, %ymm2, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm2, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpminuw %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpternlogq $15, %ymm1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpxor %ymm1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm1, %ymm2, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    retq
 ;
@@ -2210,14 +2210,14 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
 ;
 ; AVX512VL-FALLBACK-LABEL: vec256_i8_unsigned_reg_reg:
 ; AVX512VL-FALLBACK:       # %bb.0:
-; AVX512VL-FALLBACK-NEXT:    vpminub %ymm1, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpternlogq $15, %ymm2, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpminub %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpternlogq $15, %ymm1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm2, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index 2fdf6ef224ca..366dad1612b4 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -684,22 +684,21 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
 ; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
-; AVX512F-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpminsb %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT:    vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT:    vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
+; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT:    vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm1, %ymm6, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpternlogq $226, %zmm5, %zmm4, %zmm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
@@ -715,22 +714,21 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT:    vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm6, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
 ; AVX512VL-FALLBACK-NEXT:    vpternlogq $226, %zmm5, %zmm4, %zmm1
 ; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
@@ -772,20 +770,19 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
 ; AVX512F-NEXT:    vpminub %ymm1, %ymm0, %ymm6
 ; AVX512F-NEXT:    vpcmpeqb %ymm6, %ymm0, %ymm7
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512F-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsubb %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm4
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT:    vpandq %zmm6, %zmm4, %zmm4
-; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT:    vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
+; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT:    vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm1, %ymm6, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpternlogq $184, %zmm4, %zmm5, %zmm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
@@ -803,20 +800,19 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
 ; AVX512VL-FALLBACK-NEXT:    vpminub %ymm1, %ymm0, %ymm6
 ; AVX512VL-FALLBACK-NEXT:    vpcmpeqb %ymm6, %ymm0, %ymm7
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpand %ymm4, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm4
-; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT:    vpandq %zmm6, %zmm4, %zmm4
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm6, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
 ; AVX512VL-FALLBACK-NEXT:    vpternlogq $184, %zmm4, %zmm5, %zmm1
 ; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
diff --git a/llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll
index f0917be88744..2a5e834f0ac7 100644
--- a/llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll
@@ -23,7 +23,7 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) #0 {
 ; CHECK-NEXT:    packuswb %xmm2, %xmm0
 ; CHECK-NEXT:    retq
 
-  %res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
+  %res = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> %a)
   ret <16 x i8> %res
 }
 
@@ -34,7 +34,7 @@ define <8 x i16> @reverse_v8i16(<8 x i16> %a) #0 {
 ; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
 ; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
 ; CHECK-NEXT:    retq
-  %res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)
+  %res = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> %a)
   ret <8 x i16> %res
 }
 
@@ -43,7 +43,7 @@ define <4 x i32> @reverse_v4i32(<4 x i32> %a) #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
 ; CHECK-NEXT:    retq
-  %res = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %a)
+  %res = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %a)
   ret <4 x i32> %res
 }
 
@@ -52,7 +52,7 @@ define <2 x i64> @reverse_v2i64(<2 x i64> %a) #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    retq
-  %res = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> %a)
+  %res = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> %a)
   ret <2 x i64> %res
 }
 
@@ -61,7 +61,7 @@ define <4 x float> @reverse_v4f32(<4 x float> %a) #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
 ; CHECK-NEXT:    retq
-  %res = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> %a)
+  %res = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %a)
   ret <4 x float> %res
 }
 
@@ -70,7 +70,7 @@ define <2 x double> @reverse_v2f64(<2 x double> %a) #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    retq
-  %res = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> %a)
+  %res = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> %a)
   ret <2 x double> %res
 }
 
@@ -83,7 +83,7 @@ define <2 x i8> @reverse_v2i8(<2 x i8> %a) #0 {
 ; CHECK-NEXT:    psllw $8, %xmm0
 ; CHECK-NEXT:    por %xmm1, %xmm0
 ; CHECK-NEXT:    retq
-  %res = call <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
+  %res = call <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8> %a)
   ret <2 x i8> %res
 }
 
@@ -95,7 +95,7 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) #0 {
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,2,1,0]
 ; CHECK-NEXT:    movdqa %xmm2, %xmm0
 ; CHECK-NEXT:    retq
-  %res = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> %a)
+  %res = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> %a)
   ret <8 x i32> %res
 }
 
@@ -115,20 +115,20 @@ define <16 x float> @reverse_v16f32(<16 x float> %a) #0 {
 ; CHECK-NEXT:    movaps %xmm5, %xmm3
 ; CHECK-NEXT:    retq
 
-  %res = call <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float> %a)
+  %res = call <16 x float> @llvm.vector.reverse.v16f32(<16 x float> %a)
   ret <16 x float> %res
 }
 
 
-declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8>)
-declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
-declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
-declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
-declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
-declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
-declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
-declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
-declare <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float>)
-declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
+declare <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8>)
+declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+declare <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16>)
+declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
+declare <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32>)
+declare <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64>)
+declare <8 x half> @llvm.vector.reverse.v8f16(<8 x half>)
+declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)
+declare <16 x float> @llvm.vector.reverse.v16f32(<16 x float>)
+declare <2 x double> @llvm.vector.reverse.v2f64(<2 x double>)
 
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index 04aff9b7d2e5..ace78b38d53e 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -22,7 +22,7 @@ define void @f() nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $176, %esp
+; X86-NEXT:    subl $160, %esp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -47,55 +47,54 @@ define void @f() nounwind {
 ; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    jne .LBB0_1
 ; X86-NEXT:  # %bb.2: # %BB_udiv-special-cases
-; X86-NEXT:    bsrl %esi, %ecx
-; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:    addl $32, %ecx
+; X86-NEXT:    bsrl %esi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    addl $32, %eax
 ; X86-NEXT:    jmp .LBB0_3
 ; X86-NEXT:  .LBB0_1:
-; X86-NEXT:    bsrl %edi, %ecx
-; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:  .LBB0_3: # %BB_udiv-special-cases
-; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    testl %edx, %edx
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    jne .LBB0_4
 ; X86-NEXT:  # %bb.5: # %BB_udiv-special-cases
-; X86-NEXT:    addl $64, %ecx
+; X86-NEXT:    addl $64, %eax
 ; X86-NEXT:    jmp .LBB0_6
 ; X86-NEXT:  .LBB0_4:
-; X86-NEXT:    bsrl %edx, %ecx
-; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:    addl $32, %ecx
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    addl $32, %eax
 ; X86-NEXT:  .LBB0_6: # %BB_udiv-special-cases
-; X86-NEXT:    subl $62, %ecx
+; X86-NEXT:    subl $62, %eax
 ; X86-NEXT:    movl $0, %ebx
 ; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    addl $-66, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    addl $-66, %eax
 ; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    adcl $3, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movb $1, %al
-; X86-NEXT:    testb %al, %al
+; X86-NEXT:    adcl $3, %ecx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    testb %cl, %cl
 ; X86-NEXT:    jne .LBB0_11
 ; X86-NEXT:  # %bb.7: # %BB_udiv-special-cases
-; X86-NEXT:    andl $3, %edi
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    xorl $65, %eax
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    xorl $65, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    je .LBB0_11
 ; X86-NEXT:  # %bb.8: # %udiv-bb1
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl $1, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    andl $3, %ebx
 ; X86-NEXT:    movb $65, %cl
 ; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movb %cl, %ch
@@ -112,29 +111,31 @@ define void @f() nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 136(%esp,%eax), %edx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 120(%esp,%eax), %edi
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    shll %cl, %edi
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 128(%esp,%eax), %edi
-; X86-NEXT:    movl 132(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl 112(%esp,%eax), %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 116(%esp,%eax), %edx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    je .LBB0_11
 ; X86-NEXT:  # %bb.9: # %udiv-preheader
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    andl $3, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -149,20 +150,20 @@ define void @f() nounwind {
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $15, %al
-; X86-NEXT:    movzbl %al, %esi
-; X86-NEXT:    movl 80(%esp,%esi), %edx
-; X86-NEXT:    movl 84(%esp,%esi), %eax
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 64(%esp,%eax), %edi
+; X86-NEXT:    movl 68(%esp,%eax), %edx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, %edi
+; X86-NEXT:    shrl %cl, %esi
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 88(%esp,%esi), %esi
-; X86-NEXT:    addl %esi, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl 72(%esp,%eax), %ebx
+; X86-NEXT:    addl %ebx, %ebx
+; X86-NEXT:    shll %cl, %ebx
+; X86-NEXT:    orl %esi, %ebx
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
@@ -179,63 +180,62 @@ define void @f() nounwind {
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB0_10: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    shldl $1, %ebx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    andl $2, %eax
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    leal (%eax,%edx,2), %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ebx
+; X86-NEXT:    shldl $1, %ebx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    andl $2, %edx
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    leal (%edx,%ebx,2), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl $3, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %ecx, %ebx
-; X86-NEXT:    shll $30, %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    sarl $30, %eax
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    shrdl $1, %ebx, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl $1, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %esi
+; X86-NEXT:    shll $30, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sarl $30, %edx
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    shrdl $1, %esi, %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    subl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    subl %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    andl $3, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $3, %edi
-; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $3, %esi
+; X86-NEXT:    andl $3, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    jne .LBB0_10
 ; X86-NEXT:  .LBB0_11: # %udiv-end
 ; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll
index 782c84408f25..1b13cee628df 100644
--- a/llvm/test/CodeGen/X86/pr62286.ll
+++ b/llvm/test/CodeGen/X86/pr62286.ll
@@ -8,21 +8,20 @@ define i64 @PR62286(i32 %a) {
 ; SSE-LABEL: PR62286:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movd %edi, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,0]
-; SSE-NEXT:    paddd %xmm1, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,1,0]
+; SSE-NEXT:    paddd %xmm0, %xmm0
 ; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT:    pxor %xmm3, %xmm3
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0]
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT:    paddq %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE-NEXT:    paddq %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    paddq %xmm1, %xmm0
+; SSE-NEXT:    movq %xmm0, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: PR62286:
@@ -47,10 +46,10 @@ define i64 @PR62286(i32 %a) {
 ; AVX2-LABEL: PR62286:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; AVX2-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpaddd %xmm0, %xmm0, %xmm1
+; AVX2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr90668.ll b/llvm/test/CodeGen/X86/pr90668.ll
new file mode 100644
index 000000000000..e3aa638b850f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr90668.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define i64 @off(i8 signext %a) {
+; CHECK-LABEL: off:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addb $-128, %dil
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    shll $3, %eax
+; CHECK-NEXT:    retq
+entry:
+  %add = xor i8 %a, -128
+  %conv2 = zext i8 %add to i64
+  %mul = shl nuw nsw i64 %conv2, 3
+  ret i64 %mul
+}
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index a9f3e8b22fb6..785b97d8c240 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -14,7 +14,6 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP:       # %bb.0:
 ; ILP-NEXT:    movq %rdi, %rax
 ; ILP-NEXT:    leal (%rsi,%rsi), %ecx
-; ILP-NEXT:    addb $3, %cl
 ; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
@@ -23,6 +22,7 @@ define i256 @test1(i256 %a) nounwind {
 ; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
 ; ILP-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT:    addb $3, %cl
 ; ILP-NEXT:    movl %ecx, %edx
 ; ILP-NEXT:    shrb $3, %dl
 ; ILP-NEXT:    andb $7, %cl
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index 31297a06f809..a1cabb433d87 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -563,18 +563,20 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    subq $120, %rsp
 ; X64-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
-; X64-NEXT:    psllq $32, %xmm3
+; X64-NEXT:    pxor %xmm3, %xmm3
+; X64-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
 ; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
 ; X64-NEXT:    psrad $31, %xmm2
 ; X64-NEXT:    psrlq $31, %xmm3
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
 ; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT:    movq %xmm0, %rbx
-; X64-NEXT:    movq %rbx, %r13
-; X64-NEXT:    sarq $63, %r13
-; X64-NEXT:    shldq $31, %rbx, %r13
+; X64-NEXT:    movq %xmm0, %rbp
+; X64-NEXT:    movq %rbp, %r14
+; X64-NEXT:    sarq $63, %r14
+; X64-NEXT:    shldq $31, %rbp, %r14
+; X64-NEXT:    movq %rbp, %r15
+; X64-NEXT:    shlq $31, %r15
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X64-NEXT:    pxor %xmm0, %xmm0
 ; X64-NEXT:    pcmpgtd %xmm1, %xmm0
@@ -582,113 +584,113 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    movq %xmm1, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    sarq $63, %r15
-; X64-NEXT:    movq %rbx, %r12
-; X64-NEXT:    shlq $31, %r12
-; X64-NEXT:    movq %r12, %rdi
-; X64-NEXT:    movq %r13, %rsi
-; X64-NEXT:    movq %r15, %rcx
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    sarq $63, %rbx
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    movq %rbx, %rcx
 ; X64-NEXT:    callq __divti3@PLT
-; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    subq $1, %rbp
-; X64-NEXT:    sbbq $0, %r14
-; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r15d, %ebx
-; X64-NEXT:    movq %r12, %rdi
-; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    subq $1, %r13
+; X64-NEXT:    sbbq $0, %r12
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    movq %r14, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %r15, %rcx
+; X64-NEXT:    movq %rbx, %rcx
 ; X64-NEXT:    callq __modti3@PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
+; X64-NEXT:    shrq $63, %rbp
+; X64-NEXT:    xorl %ebp, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
 ; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    movl $4294967295, %edx # imm = 0xFFFFFFFF
-; X64-NEXT:    cmpq %rdx, %rbp
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    cmpq %rdx, %r13
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    sbbq $0, %rax
-; X64-NEXT:    cmovgeq %rcx, %r14
-; X64-NEXT:    cmovgeq %rdx, %rbp
+; X64-NEXT:    cmovgeq %rdx, %r13
+; X64-NEXT:    cmovgeq %rcx, %r12
 ; X64-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT:    cmpq %rbp, %rcx
+; X64-NEXT:    cmpq %r13, %rcx
 ; X64-NEXT:    movq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    sbbq %r14, %rax
-; X64-NEXT:    cmovgeq %rcx, %rbp
-; X64-NEXT:    movq %rbp, %xmm0
+; X64-NEXT:    sbbq %r12, %rax
+; X64-NEXT:    cmovgeq %rcx, %r13
+; X64-NEXT:    movq %r13, %xmm0
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,2,3]
-; X64-NEXT:    movq %xmm0, %rbx
-; X64-NEXT:    movq %rbx, %r13
-; X64-NEXT:    sarq $63, %r13
-; X64-NEXT:    shldq $31, %rbx, %r13
+; X64-NEXT:    movq %xmm0, %rbp
+; X64-NEXT:    movq %rbp, %r14
+; X64-NEXT:    sarq $63, %r14
+; X64-NEXT:    shldq $31, %rbp, %r14
+; X64-NEXT:    movq %rbp, %r15
+; X64-NEXT:    shlq $31, %r15
 ; X64-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,2,3]
 ; X64-NEXT:    movq %xmm0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    sarq $63, %r15
-; X64-NEXT:    movq %rbx, %r12
-; X64-NEXT:    shlq $31, %r12
-; X64-NEXT:    movq %r12, %rdi
-; X64-NEXT:    movq %r13, %rsi
-; X64-NEXT:    movq %r15, %rcx
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    sarq $63, %rbx
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    movq %rbx, %rcx
 ; X64-NEXT:    callq __divti3@PLT
-; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    subq $1, %rbp
-; X64-NEXT:    sbbq $0, %r14
-; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r15d, %ebx
-; X64-NEXT:    movq %r12, %rdi
-; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    subq $1, %r13
+; X64-NEXT:    sbbq $0, %r12
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    movq %r14, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %r15, %rcx
+; X64-NEXT:    movq %rbx, %rcx
 ; X64-NEXT:    callq __modti3@PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
+; X64-NEXT:    shrq $63, %rbp
+; X64-NEXT:    xorl %ebp, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
 ; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT:    cmpq %rcx, %rbp
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    cmpq %rcx, %r13
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    sbbq $0, %rax
+; X64-NEXT:    cmovgeq %rcx, %r13
 ; X64-NEXT:    movl $0, %eax
-; X64-NEXT:    cmovgeq %rax, %r14
-; X64-NEXT:    cmovgeq %rcx, %rbp
+; X64-NEXT:    cmovgeq %rax, %r12
 ; X64-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT:    cmpq %rbp, %rcx
+; X64-NEXT:    cmpq %r13, %rcx
 ; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    sbbq %r14, %rax
-; X64-NEXT:    cmovgeq %rcx, %rbp
-; X64-NEXT:    movq %rbp, %xmm0
+; X64-NEXT:    sbbq %r12, %rax
+; X64-NEXT:    cmovgeq %rcx, %r13
+; X64-NEXT:    movq %r13, %xmm0
 ; X64-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; X64-NEXT:    psrlq $1, %xmm1
 ; X64-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; X64-NEXT:    # xmm0 = mem[0,1,1,3]
-; X64-NEXT:    psllq $32, %xmm0
+; X64-NEXT:    pxor %xmm0, %xmm0
+; X64-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
 ; X64-NEXT:    psrad $31, %xmm1
 ; X64-NEXT:    psrlq $31, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT:    movq %xmm0, %rbx
-; X64-NEXT:    movq %rbx, %r13
-; X64-NEXT:    sarq $63, %r13
-; X64-NEXT:    shldq $31, %rbx, %r13
+; X64-NEXT:    movq %xmm0, %rbp
+; X64-NEXT:    movq %rbp, %r14
+; X64-NEXT:    sarq $63, %r14
+; X64-NEXT:    shldq $31, %rbp, %r14
+; X64-NEXT:    movq %rbp, %r15
+; X64-NEXT:    shlq $31, %r15
 ; X64-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X64-NEXT:    pxor %xmm1, %xmm1
 ; X64-NEXT:    pcmpgtd %xmm0, %xmm1
@@ -696,94 +698,92 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    movq %xmm0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    sarq $63, %r15
-; X64-NEXT:    movq %rbx, %r12
-; X64-NEXT:    shlq $31, %r12
-; X64-NEXT:    movq %r12, %rdi
-; X64-NEXT:    movq %r13, %rsi
-; X64-NEXT:    movq %r15, %rcx
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    sarq $63, %rbx
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    movq %rbx, %rcx
 ; X64-NEXT:    callq __divti3@PLT
-; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    subq $1, %rbp
-; X64-NEXT:    sbbq $0, %r14
-; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r15d, %ebx
-; X64-NEXT:    movq %r12, %rdi
-; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    subq $1, %r13
+; X64-NEXT:    sbbq $0, %r12
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    movq %r14, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %r15, %rcx
+; X64-NEXT:    movq %rbx, %rcx
 ; X64-NEXT:    callq __modti3@PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
+; X64-NEXT:    shrq $63, %rbp
+; X64-NEXT:    xorl %ebp, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
 ; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT:    cmpq %rcx, %rbp
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    cmpq %rcx, %r13
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    sbbq $0, %rax
+; X64-NEXT:    cmovgeq %rcx, %r13
 ; X64-NEXT:    movl $0, %eax
-; X64-NEXT:    cmovgeq %rax, %r14
-; X64-NEXT:    cmovgeq %rcx, %rbp
+; X64-NEXT:    cmovgeq %rax, %r12
 ; X64-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT:    cmpq %rbp, %rcx
+; X64-NEXT:    cmpq %r13, %rcx
 ; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    sbbq %r14, %rax
-; X64-NEXT:    cmovgeq %rcx, %rbp
-; X64-NEXT:    movq %rbp, %xmm0
+; X64-NEXT:    sbbq %r12, %rax
+; X64-NEXT:    cmovgeq %rcx, %r13
+; X64-NEXT:    movq %r13, %xmm0
 ; X64-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; X64-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,2,3]
-; X64-NEXT:    movq %xmm0, %rbx
-; X64-NEXT:    movq %rbx, %r13
-; X64-NEXT:    sarq $63, %r13
-; X64-NEXT:    shldq $31, %rbx, %r13
+; X64-NEXT:    movq %xmm0, %rbp
+; X64-NEXT:    movq %rbp, %r14
+; X64-NEXT:    sarq $63, %r14
+; X64-NEXT:    shldq $31, %rbp, %r14
+; X64-NEXT:    movq %rbp, %r15
+; X64-NEXT:    shlq $31, %r15
 ; X64-NEXT:    pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; X64-NEXT:    # xmm0 = mem[2,3,2,3]
 ; X64-NEXT:    movq %xmm0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    sarq $63, %r15
-; X64-NEXT:    movq %rbx, %r12
-; X64-NEXT:    shlq $31, %r12
-; X64-NEXT:    movq %r12, %rdi
-; X64-NEXT:    movq %r13, %rsi
-; X64-NEXT:    movq %r15, %rcx
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    sarq $63, %rbx
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    movq %rbx, %rcx
 ; X64-NEXT:    callq __divti3@PLT
-; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    subq $1, %rbp
-; X64-NEXT:    sbbq $0, %r14
-; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r15d, %ebx
-; X64-NEXT:    movq %r12, %rdi
-; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    subq $1, %r13
+; X64-NEXT:    sbbq $0, %r12
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    movq %r14, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %r15, %rcx
+; X64-NEXT:    movq %rbx, %rcx
 ; X64-NEXT:    callq __modti3@PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
+; X64-NEXT:    shrq $63, %rbp
+; X64-NEXT:    xorl %ebp, %ebx
 ; X64-NEXT:    testb %bl, %al
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
 ; X64-NEXT:    movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT:    cmpq %rcx, %rbp
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    cmpq %rcx, %r13
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    sbbq $0, %rax
+; X64-NEXT:    cmovgeq %rcx, %r13
 ; X64-NEXT:    movl $0, %eax
-; X64-NEXT:    cmovgeq %rax, %r14
-; X64-NEXT:    cmovgeq %rcx, %rbp
+; X64-NEXT:    cmovgeq %rax, %r12
 ; X64-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; X64-NEXT:    cmpq %rbp, %rax
-; X64-NEXT:    sbbq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    cmovgeq %rax, %rbp
-; X64-NEXT:    movq %rbp, %xmm1
+; X64-NEXT:    cmpq %r13, %rax
+; X64-NEXT:    sbbq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT:    cmovgeq %rax, %r13
+; X64-NEXT:    movq %r13, %xmm1
 ; X64-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X64-NEXT:    psrlq $1, %xmm0
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index 97c3c2040b29..a80d8d8cd01b 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -46,7 +46,6 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-NEXT:    movq 24(%rsi), %rcx
 ; CHECK-NEXT:    movq 32(%rsi), %rdx
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
-; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [1,1]
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [2,2]
 ; CHECK-NEXT:    .p2align 4, 0x90
@@ -54,39 +53,45 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
 ; CHECK-NEXT:    # Child Loop BB0_2 Depth 2
 ; CHECK-NEXT:    xorpd %xmm3, %xmm3
-; CHECK-NEXT:    movq $-1024, %rdi # imm = 0xFC00
+; CHECK-NEXT:    movq $-1024, %rsi # imm = 0xFC00
 ; CHECK-NEXT:    movdqa %xmm0, %xmm4
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_2: # %vector.body
 ; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    cmpq 1024(%rdx,%rdi), %rsi
-; CHECK-NEXT:    movq %rcx, %r8
-; CHECK-NEXT:    sbbq 1032(%rdx,%rdi), %r8
-; CHECK-NEXT:    setge %r8b
-; CHECK-NEXT:    movzbl %r8b, %r8d
-; CHECK-NEXT:    andl $1, %r8d
+; CHECK-NEXT:    movdqu 1024(%rdx,%rsi), %xmm5
+; CHECK-NEXT:    movdqu 1040(%rdx,%rsi), %xmm6
+; CHECK-NEXT:    movq %xmm5, %rdi
+; CHECK-NEXT:    movq %xmm6, %r8
+; CHECK-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; CHECK-NEXT:    movq %xmm5, %r9
+; CHECK-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
+; CHECK-NEXT:    movq %xmm5, %r10
 ; CHECK-NEXT:    negq %r8
-; CHECK-NEXT:    movq %r8, %xmm5
-; CHECK-NEXT:    cmpq 1040(%rdx,%rdi), %rsi
 ; CHECK-NEXT:    movq %rcx, %r8
-; CHECK-NEXT:    sbbq 1048(%rdx,%rdi), %r8
+; CHECK-NEXT:    sbbq %r10, %r8
 ; CHECK-NEXT:    setge %r8b
 ; CHECK-NEXT:    movzbl %r8b, %r8d
-; CHECK-NEXT:    andl $1, %r8d
 ; CHECK-NEXT:    negq %r8
-; CHECK-NEXT:    movq %r8, %xmm6
-; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; CHECK-NEXT:    movdqa %xmm1, %xmm6
-; CHECK-NEXT:    psllq %xmm4, %xmm6
+; CHECK-NEXT:    movq %r8, %xmm5
+; CHECK-NEXT:    negq %rdi
+; CHECK-NEXT:    movq %rcx, %rdi
+; CHECK-NEXT:    sbbq %r9, %rdi
+; CHECK-NEXT:    setge %dil
+; CHECK-NEXT:    movzbl %dil, %edi
+; CHECK-NEXT:    negq %rdi
+; CHECK-NEXT:    movq %rdi, %xmm6
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; CHECK-NEXT:    movdqa %xmm1, %xmm5
+; CHECK-NEXT:    psllq %xmm4, %xmm5
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
 ; CHECK-NEXT:    movdqa %xmm1, %xmm8
 ; CHECK-NEXT:    psllq %xmm7, %xmm8
-; CHECK-NEXT:    movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1]
-; CHECK-NEXT:    andpd %xmm5, %xmm8
+; CHECK-NEXT:    movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1]
+; CHECK-NEXT:    andpd %xmm6, %xmm8
 ; CHECK-NEXT:    orpd %xmm8, %xmm3
 ; CHECK-NEXT:    paddq %xmm2, %xmm4
-; CHECK-NEXT:    addq $32, %rdi
+; CHECK-NEXT:    addq $32, %rsi
 ; CHECK-NEXT:    jne .LBB0_2
 ; CHECK-NEXT:  # %bb.3: # %middle.block
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
@@ -101,7 +106,6 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-AVX2-NEXT:    movq 24(%rsi), %rcx
 ; CHECK-AVX2-NEXT:    movq 32(%rsi), %rdx
 ; CHECK-AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm0 = [0,1]
-; CHECK-AVX2-NEXT:    xorl %esi, %esi
 ; CHECK-AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [1,1]
 ; CHECK-AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [2,2]
 ; CHECK-AVX2-NEXT:    .p2align 4, 0x90
@@ -109,34 +113,40 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-AVX2-NEXT:    # =>This Loop Header: Depth=1
 ; CHECK-AVX2-NEXT:    # Child Loop BB0_2 Depth 2
 ; CHECK-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT:    movq $-1024, %rdi # imm = 0xFC00
+; CHECK-AVX2-NEXT:    movq $-1024, %rsi # imm = 0xFC00
 ; CHECK-AVX2-NEXT:    vmovdqa %xmm0, %xmm4
 ; CHECK-AVX2-NEXT:    .p2align 4, 0x90
 ; CHECK-AVX2-NEXT:  .LBB0_2: # %vector.body
 ; CHECK-AVX2-NEXT:    # Parent Loop BB0_1 Depth=1
 ; CHECK-AVX2-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-AVX2-NEXT:    cmpq 1024(%rdx,%rdi), %rsi
-; CHECK-AVX2-NEXT:    movq %rcx, %r8
-; CHECK-AVX2-NEXT:    sbbq 1032(%rdx,%rdi), %r8
+; CHECK-AVX2-NEXT:    vmovdqu 1024(%rdx,%rsi), %xmm5
+; CHECK-AVX2-NEXT:    vmovdqu 1040(%rdx,%rsi), %xmm6
+; CHECK-AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm5[0],xmm6[0]
+; CHECK-AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; CHECK-AVX2-NEXT:    vmovq %xmm5, %rdi
+; CHECK-AVX2-NEXT:    vpextrq $1, %xmm5, %r8
+; CHECK-AVX2-NEXT:    vmovq %xmm7, %r9
+; CHECK-AVX2-NEXT:    vpextrq $1, %xmm7, %r10
+; CHECK-AVX2-NEXT:    negq %r10
+; CHECK-AVX2-NEXT:    movq %rcx, %r10
+; CHECK-AVX2-NEXT:    sbbq %r8, %r10
 ; CHECK-AVX2-NEXT:    setge %r8b
 ; CHECK-AVX2-NEXT:    movzbl %r8b, %r8d
-; CHECK-AVX2-NEXT:    andl $1, %r8d
 ; CHECK-AVX2-NEXT:    negq %r8
 ; CHECK-AVX2-NEXT:    vmovq %r8, %xmm5
-; CHECK-AVX2-NEXT:    cmpq 1040(%rdx,%rdi), %rsi
+; CHECK-AVX2-NEXT:    negq %r9
 ; CHECK-AVX2-NEXT:    movq %rcx, %r8
-; CHECK-AVX2-NEXT:    sbbq 1048(%rdx,%rdi), %r8
-; CHECK-AVX2-NEXT:    setge %r8b
-; CHECK-AVX2-NEXT:    movzbl %r8b, %r8d
-; CHECK-AVX2-NEXT:    andl $1, %r8d
-; CHECK-AVX2-NEXT:    negq %r8
-; CHECK-AVX2-NEXT:    vmovq %r8, %xmm6
-; CHECK-AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; CHECK-AVX2-NEXT:    sbbq %rdi, %r8
+; CHECK-AVX2-NEXT:    setge %dil
+; CHECK-AVX2-NEXT:    movzbl %dil, %edi
+; CHECK-AVX2-NEXT:    negq %rdi
+; CHECK-AVX2-NEXT:    vmovq %rdi, %xmm6
+; CHECK-AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0]
 ; CHECK-AVX2-NEXT:    vpsllvq %xmm4, %xmm1, %xmm6
 ; CHECK-AVX2-NEXT:    vpand %xmm6, %xmm5, %xmm5
 ; CHECK-AVX2-NEXT:    vpor %xmm3, %xmm5, %xmm3
 ; CHECK-AVX2-NEXT:    vpaddq %xmm2, %xmm4, %xmm4
-; CHECK-AVX2-NEXT:    addq $32, %rdi
+; CHECK-AVX2-NEXT:    addq $32, %rsi
 ; CHECK-AVX2-NEXT:    jne .LBB0_2
 ; CHECK-AVX2-NEXT:  # %bb.3: # %middle.block
 ; CHECK-AVX2-NEXT:    # in Loop: Header=BB0_1 Depth=1
diff --git a/llvm/test/CodeGen/X86/subreg-to-reg-1.ll b/llvm/test/CodeGen/X86/subreg-to-reg-1.ll
index 8acdb6176f57..49bff9e075e5 100644
--- a/llvm/test/CodeGen/X86/subreg-to-reg-1.ll
+++ b/llvm/test/CodeGen/X86/subreg-to-reg-1.ll
@@ -1,14 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
 
-; CHECK:     {{leal	.*[)], %e.*}}
-; CHECK-NOT: {{leal	.*[)], %e.*}}
-
 ; Don't eliminate or coalesce away the explicit zero-extension!
 ; This is currently using an leal because of a 3-addressification detail,
 ; though this isn't necessary; The point of this test is to make sure
 ; a 32-bit add is used.
 
 define i64 @foo(i64 %a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    leal -1(%rdi), %eax
+; CHECK-NEXT:    incq %rax
+; CHECK-NEXT:    retq
   %b = add i64 %a, 4294967295
   %c = and i64 %b, 4294967295
   %d = add i64 %c, 1
diff --git a/llvm/test/CodeGen/X86/subreg-to-reg-3.ll b/llvm/test/CodeGen/X86/subreg-to-reg-3.ll
index db9d0d12c3d7..2bd5ca1716af 100644
--- a/llvm/test/CodeGen/X86/subreg-to-reg-3.ll
+++ b/llvm/test/CodeGen/X86/subreg-to-reg-3.ll
@@ -1,10 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
 
-; CHECK: imull
-
 ; Don't eliminate or coalesce away the explicit zero-extension!
 
 define i64 @foo(i64 %a) {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    imull $7823, %edi, %eax # imm = 0x1E8F
+; CHECK-NEXT:    incq %rax
+; CHECK-NEXT:    retq
   %b = mul i64 %a, 7823
   %c = and i64 %b, 4294967295
   %d = add i64 %c, 1
diff --git a/llvm/test/CodeGen/X86/subreg-to-reg-6.ll b/llvm/test/CodeGen/X86/subreg-to-reg-6.ll
index 7a6f78fac368..f0dc17b55661 100644
--- a/llvm/test/CodeGen/X86/subreg-to-reg-6.ll
+++ b/llvm/test/CodeGen/X86/subreg-to-reg-6.ll
@@ -1,6 +1,18 @@
-; RUN: llc < %s -mtriple=x86_64--
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
 
 define i64 @foo() nounwind {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $12, 0
+; CHECK-NEXT:    je .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %bb65
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_1: # %bb56
 entry:
 	%t0 = load i32, ptr null, align 8
 	switch i32 %t0, label %bb65 [
@@ -22,6 +34,14 @@ bb65:
 }
 
 define i64 @bar(i64 %t0) nounwind {
+; CHECK-LABEL: bar:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    negl %eax
+; CHECK-NEXT:    retq
 	call void asm "", "{cx}"(i64 0) nounwind
 	%t1 = sub i64 0, %t0
 	%t2 = and i64 %t1, 4294967295
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index e0f438eb7cc8..f1fd05565c47 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -2384,52 +2384,45 @@ define void @vec384_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
 ; SCALAR-LABEL: vec384_v3i8:
 ; SCALAR:       # %bb.0:
-; SCALAR-NEXT:    movl (%rdi), %ecx
-; SCALAR-NEXT:    movl %ecx, %eax
-; SCALAR-NEXT:    shrl $16, %eax
-; SCALAR-NEXT:    movl %ecx, %edi
-; SCALAR-NEXT:    shrl $8, %edi
+; SCALAR-NEXT:    movl (%rdi), %eax
+; SCALAR-NEXT:    movl %eax, %ecx
+; SCALAR-NEXT:    shrl $16, %ecx
 ; SCALAR-NEXT:    notb %cl
-; SCALAR-NEXT:    movzbl %cl, %r8d
-; SCALAR-NEXT:    notb %dil
-; SCALAR-NEXT:    movzbl %dil, %ecx
-; SCALAR-NEXT:    shll $8, %ecx
-; SCALAR-NEXT:    orl %r8d, %ecx
-; SCALAR-NEXT:    notb %al
-; SCALAR-NEXT:    movb %al, 2(%rsi)
-; SCALAR-NEXT:    movw %cx, (%rsi)
-; SCALAR-NEXT:    movb %al, 2(%rdx)
-; SCALAR-NEXT:    movw %cx, (%rdx)
-; SCALAR-NEXT:    movb %al, 6(%rdx)
-; SCALAR-NEXT:    movw %cx, 4(%rdx)
-; SCALAR-NEXT:    movb %al, 10(%rdx)
-; SCALAR-NEXT:    movw %cx, 8(%rdx)
-; SCALAR-NEXT:    movb %al, 14(%rdx)
-; SCALAR-NEXT:    movw %cx, 12(%rdx)
-; SCALAR-NEXT:    movb %al, 18(%rdx)
-; SCALAR-NEXT:    movw %cx, 16(%rdx)
-; SCALAR-NEXT:    movb %al, 22(%rdx)
-; SCALAR-NEXT:    movw %cx, 20(%rdx)
-; SCALAR-NEXT:    movb %al, 26(%rdx)
-; SCALAR-NEXT:    movw %cx, 24(%rdx)
-; SCALAR-NEXT:    movb %al, 30(%rdx)
-; SCALAR-NEXT:    movw %cx, 28(%rdx)
-; SCALAR-NEXT:    movb %al, 34(%rdx)
-; SCALAR-NEXT:    movw %cx, 32(%rdx)
-; SCALAR-NEXT:    movb %al, 38(%rdx)
-; SCALAR-NEXT:    movw %cx, 36(%rdx)
-; SCALAR-NEXT:    movb %al, 42(%rdx)
-; SCALAR-NEXT:    movw %cx, 40(%rdx)
-; SCALAR-NEXT:    movb %al, 46(%rdx)
-; SCALAR-NEXT:    movw %cx, 44(%rdx)
-; SCALAR-NEXT:    movb %al, 50(%rdx)
-; SCALAR-NEXT:    movw %cx, 48(%rdx)
-; SCALAR-NEXT:    movb %al, 54(%rdx)
-; SCALAR-NEXT:    movw %cx, 52(%rdx)
-; SCALAR-NEXT:    movb %al, 58(%rdx)
-; SCALAR-NEXT:    movw %cx, 56(%rdx)
-; SCALAR-NEXT:    movb %al, 62(%rdx)
-; SCALAR-NEXT:    movw %cx, 60(%rdx)
+; SCALAR-NEXT:    notl %eax
+; SCALAR-NEXT:    movw %ax, (%rsi)
+; SCALAR-NEXT:    movb %cl, 2(%rsi)
+; SCALAR-NEXT:    movb %cl, 2(%rdx)
+; SCALAR-NEXT:    movw %ax, (%rdx)
+; SCALAR-NEXT:    movb %cl, 6(%rdx)
+; SCALAR-NEXT:    movw %ax, 4(%rdx)
+; SCALAR-NEXT:    movb %cl, 10(%rdx)
+; SCALAR-NEXT:    movw %ax, 8(%rdx)
+; SCALAR-NEXT:    movb %cl, 14(%rdx)
+; SCALAR-NEXT:    movw %ax, 12(%rdx)
+; SCALAR-NEXT:    movb %cl, 18(%rdx)
+; SCALAR-NEXT:    movw %ax, 16(%rdx)
+; SCALAR-NEXT:    movb %cl, 22(%rdx)
+; SCALAR-NEXT:    movw %ax, 20(%rdx)
+; SCALAR-NEXT:    movb %cl, 26(%rdx)
+; SCALAR-NEXT:    movw %ax, 24(%rdx)
+; SCALAR-NEXT:    movb %cl, 30(%rdx)
+; SCALAR-NEXT:    movw %ax, 28(%rdx)
+; SCALAR-NEXT:    movb %cl, 34(%rdx)
+; SCALAR-NEXT:    movw %ax, 32(%rdx)
+; SCALAR-NEXT:    movb %cl, 38(%rdx)
+; SCALAR-NEXT:    movw %ax, 36(%rdx)
+; SCALAR-NEXT:    movb %cl, 42(%rdx)
+; SCALAR-NEXT:    movw %ax, 40(%rdx)
+; SCALAR-NEXT:    movb %cl, 46(%rdx)
+; SCALAR-NEXT:    movw %ax, 44(%rdx)
+; SCALAR-NEXT:    movb %cl, 50(%rdx)
+; SCALAR-NEXT:    movw %ax, 48(%rdx)
+; SCALAR-NEXT:    movb %cl, 54(%rdx)
+; SCALAR-NEXT:    movw %ax, 52(%rdx)
+; SCALAR-NEXT:    movb %cl, 58(%rdx)
+; SCALAR-NEXT:    movw %ax, 56(%rdx)
+; SCALAR-NEXT:    movb %cl, 62(%rdx)
+; SCALAR-NEXT:    movw %ax, 60(%rdx)
 ; SCALAR-NEXT:    retq
 ;
 ; SSE2-ONLY-LABEL: vec384_v3i8:
@@ -3060,12 +3053,7 @@ define void @vec384_v3i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR:       # %bb.0:
 ; SCALAR-NEXT:    movl 8(%rdi), %eax
 ; SCALAR-NEXT:    movq (%rdi), %rcx
-; SCALAR-NEXT:    movq %rcx, %rdi
-; SCALAR-NEXT:    shrq $32, %rdi
-; SCALAR-NEXT:    notl %edi
-; SCALAR-NEXT:    shlq $32, %rdi
-; SCALAR-NEXT:    notl %ecx
-; SCALAR-NEXT:    orq %rdi, %rcx
+; SCALAR-NEXT:    notq %rcx
 ; SCALAR-NEXT:    notl %eax
 ; SCALAR-NEXT:    movl %eax, 8(%rsi)
 ; SCALAR-NEXT:    movq %rcx, (%rsi)
@@ -3196,12 +3184,7 @@ define void @vec384_v3f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR:       # %bb.0:
 ; SCALAR-NEXT:    movl 8(%rdi), %eax
 ; SCALAR-NEXT:    movq (%rdi), %rcx
-; SCALAR-NEXT:    movq %rcx, %rdi
-; SCALAR-NEXT:    shrq $32, %rdi
-; SCALAR-NEXT:    notl %edi
-; SCALAR-NEXT:    shlq $32, %rdi
-; SCALAR-NEXT:    notl %ecx
-; SCALAR-NEXT:    orq %rdi, %rcx
+; SCALAR-NEXT:    notq %rcx
 ; SCALAR-NEXT:    notl %eax
 ; SCALAR-NEXT:    movl %eax, 8(%rsi)
 ; SCALAR-NEXT:    movq %rcx, (%rsi)
@@ -3794,56 +3777,29 @@ define void @vec384_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
 ; SCALAR-LABEL: vec384_v6i8:
 ; SCALAR:       # %bb.0:
-; SCALAR-NEXT:    movq (%rdi), %rdi
-; SCALAR-NEXT:    movq %rdi, %rax
-; SCALAR-NEXT:    shrq $40, %rax
-; SCALAR-NEXT:    movq %rdi, %rcx
+; SCALAR-NEXT:    movq (%rdi), %rax
+; SCALAR-NEXT:    movq %rax, %rcx
 ; SCALAR-NEXT:    shrq $32, %rcx
-; SCALAR-NEXT:    movl %edi, %r8d
-; SCALAR-NEXT:    shrl $24, %r8d
-; SCALAR-NEXT:    movl %edi, %r9d
-; SCALAR-NEXT:    shrl $16, %r9d
-; SCALAR-NEXT:    movl %edi, %r10d
-; SCALAR-NEXT:    shrl $8, %r10d
-; SCALAR-NEXT:    notb %dil
-; SCALAR-NEXT:    movzbl %dil, %edi
-; SCALAR-NEXT:    notb %r10b
-; SCALAR-NEXT:    movzbl %r10b, %r10d
-; SCALAR-NEXT:    shll $8, %r10d
-; SCALAR-NEXT:    orl %edi, %r10d
-; SCALAR-NEXT:    notb %r9b
-; SCALAR-NEXT:    movzbl %r9b, %edi
-; SCALAR-NEXT:    notb %r8b
-; SCALAR-NEXT:    movzbl %r8b, %r8d
-; SCALAR-NEXT:    shll $8, %r8d
-; SCALAR-NEXT:    orl %edi, %r8d
-; SCALAR-NEXT:    notb %cl
-; SCALAR-NEXT:    movzbl %cl, %ecx
-; SCALAR-NEXT:    notb %al
-; SCALAR-NEXT:    movzbl %al, %eax
-; SCALAR-NEXT:    shll $8, %eax
-; SCALAR-NEXT:    orl %ecx, %eax
-; SCALAR-NEXT:    movw %ax, 4(%rsi)
-; SCALAR-NEXT:    shll $16, %r8d
-; SCALAR-NEXT:    movzwl %r10w, %ecx
-; SCALAR-NEXT:    orl %r8d, %ecx
-; SCALAR-NEXT:    movl %ecx, (%rsi)
-; SCALAR-NEXT:    movw %ax, 4(%rdx)
-; SCALAR-NEXT:    movl %ecx, (%rdx)
-; SCALAR-NEXT:    movw %ax, 12(%rdx)
-; SCALAR-NEXT:    movl %ecx, 8(%rdx)
-; SCALAR-NEXT:    movw %ax, 20(%rdx)
-; SCALAR-NEXT:    movl %ecx, 16(%rdx)
-; SCALAR-NEXT:    movw %ax, 28(%rdx)
-; SCALAR-NEXT:    movl %ecx, 24(%rdx)
-; SCALAR-NEXT:    movw %ax, 36(%rdx)
-; SCALAR-NEXT:    movl %ecx, 32(%rdx)
-; SCALAR-NEXT:    movw %ax, 44(%rdx)
-; SCALAR-NEXT:    movl %ecx, 40(%rdx)
-; SCALAR-NEXT:    movw %ax, 52(%rdx)
-; SCALAR-NEXT:    movl %ecx, 48(%rdx)
-; SCALAR-NEXT:    movw %ax, 60(%rdx)
-; SCALAR-NEXT:    movl %ecx, 56(%rdx)
+; SCALAR-NEXT:    notl %ecx
+; SCALAR-NEXT:    notl %eax
+; SCALAR-NEXT:    movl %eax, (%rsi)
+; SCALAR-NEXT:    movw %cx, 4(%rsi)
+; SCALAR-NEXT:    movw %cx, 4(%rdx)
+; SCALAR-NEXT:    movl %eax, (%rdx)
+; SCALAR-NEXT:    movw %cx, 12(%rdx)
+; SCALAR-NEXT:    movl %eax, 8(%rdx)
+; SCALAR-NEXT:    movw %cx, 20(%rdx)
+; SCALAR-NEXT:    movl %eax, 16(%rdx)
+; SCALAR-NEXT:    movw %cx, 28(%rdx)
+; SCALAR-NEXT:    movl %eax, 24(%rdx)
+; SCALAR-NEXT:    movw %cx, 36(%rdx)
+; SCALAR-NEXT:    movl %eax, 32(%rdx)
+; SCALAR-NEXT:    movw %cx, 44(%rdx)
+; SCALAR-NEXT:    movl %eax, 40(%rdx)
+; SCALAR-NEXT:    movw %cx, 52(%rdx)
+; SCALAR-NEXT:    movl %eax, 48(%rdx)
+; SCALAR-NEXT:    movw %cx, 60(%rdx)
+; SCALAR-NEXT:    movl %eax, 56(%rdx)
 ; SCALAR-NEXT:    retq
 ;
 ; SSE2-ONLY-LABEL: vec384_v6i8:
@@ -4072,31 +4028,20 @@ define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
 define void @vec384_v6i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
 ; SCALAR-LABEL: vec384_v6i16:
 ; SCALAR:       # %bb.0:
-; SCALAR-NEXT:    movl 8(%rdi), %eax
-; SCALAR-NEXT:    movq (%rdi), %rcx
-; SCALAR-NEXT:    movq %rcx, %rdi
-; SCALAR-NEXT:    shrq $32, %rdi
-; SCALAR-NEXT:    movq %rcx, %r8
-; SCALAR-NEXT:    shrq $48, %r8
-; SCALAR-NEXT:    notl %r8d
-; SCALAR-NEXT:    shll $16, %r8d
-; SCALAR-NEXT:    notl %edi
-; SCALAR-NEXT:    movzwl %di, %edi
-; SCALAR-NEXT:    orl %r8d, %edi
+; SCALAR-NEXT:    movq (%rdi), %rax
+; SCALAR-NEXT:    movl 8(%rdi), %ecx
 ; SCALAR-NEXT:    notl %ecx
-; SCALAR-NEXT:    notl %eax
-; SCALAR-NEXT:    movl %eax, 8(%rsi)
-; SCALAR-NEXT:    shlq $32, %rdi
-; SCALAR-NEXT:    orq %rdi, %rcx
-; SCALAR-NEXT:    movq %rcx, (%rsi)
-; SCALAR-NEXT:    movl %eax, 8(%rdx)
-; SCALAR-NEXT:    movq %rcx, (%rdx)
-; SCALAR-NEXT:    movl %eax, 24(%rdx)
-; SCALAR-NEXT:    movq %rcx, 16(%rdx)
-; SCALAR-NEXT:    movl %eax, 40(%rdx)
-; SCALAR-NEXT:    movq %rcx, 32(%rdx)
-; SCALAR-NEXT:    movl %eax, 56(%rdx)
-; SCALAR-NEXT:    movq %rcx, 48(%rdx)
+; SCALAR-NEXT:    notq %rax
+; SCALAR-NEXT:    movq %rax, (%rsi)
+; SCALAR-NEXT:    movl %ecx, 8(%rsi)
+; SCALAR-NEXT:    movl %ecx, 8(%rdx)
+; SCALAR-NEXT:    movq %rax, (%rdx)
+; SCALAR-NEXT:    movl %ecx, 24(%rdx)
+; SCALAR-NEXT:    movq %rax, 16(%rdx)
+; SCALAR-NEXT:    movl %ecx, 40(%rdx)
+; SCALAR-NEXT:    movq %rax, 32(%rdx)
+; SCALAR-NEXT:    movl %ecx, 56(%rdx)
+; SCALAR-NEXT:    movq %rax, 48(%rdx)
 ; SCALAR-NEXT:    retq
 ;
 ; SSE2-ONLY-LABEL: vec384_v6i16:
@@ -4216,25 +4161,10 @@ define void @vec384_v6i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR:       # %bb.0:
 ; SCALAR-NEXT:    movq (%rdi), %rax
 ; SCALAR-NEXT:    movq 8(%rdi), %rcx
-; SCALAR-NEXT:    movq %rax, %r8
-; SCALAR-NEXT:    shrq $32, %r8
-; SCALAR-NEXT:    movq %rcx, %r9
-; SCALAR-NEXT:    shrq $32, %r9
 ; SCALAR-NEXT:    movq 16(%rdi), %rdi
-; SCALAR-NEXT:    movq %rdi, %r10
-; SCALAR-NEXT:    shrq $32, %r10
-; SCALAR-NEXT:    notl %r10d
-; SCALAR-NEXT:    shlq $32, %r10
-; SCALAR-NEXT:    notl %edi
-; SCALAR-NEXT:    orq %r10, %rdi
-; SCALAR-NEXT:    notl %r9d
-; SCALAR-NEXT:    shlq $32, %r9
-; SCALAR-NEXT:    notl %ecx
-; SCALAR-NEXT:    orq %r9, %rcx
-; SCALAR-NEXT:    notl %r8d
-; SCALAR-NEXT:    shlq $32, %r8
-; SCALAR-NEXT:    notl %eax
-; SCALAR-NEXT:    orq %r8, %rax
+; SCALAR-NEXT:    notq %rdi
+; SCALAR-NEXT:    notq %rcx
+; SCALAR-NEXT:    notq %rax
 ; SCALAR-NEXT:    movq %rax, (%rsi)
 ; SCALAR-NEXT:    movq %rcx, 8(%rsi)
 ; SCALAR-NEXT:    movq %rdi, 16(%rsi)
@@ -4303,25 +4233,10 @@ define void @vec384_v6f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR:       # %bb.0:
 ; SCALAR-NEXT:    movq (%rdi), %rax
 ; SCALAR-NEXT:    movq 8(%rdi), %rcx
-; SCALAR-NEXT:    movq %rax, %r8
-; SCALAR-NEXT:    shrq $32, %r8
-; SCALAR-NEXT:    movq %rcx, %r9
-; SCALAR-NEXT:    shrq $32, %r9
 ; SCALAR-NEXT:    movq 16(%rdi), %rdi
-; SCALAR-NEXT:    movq %rdi, %r10
-; SCALAR-NEXT:    shrq $32, %r10
-; SCALAR-NEXT:    notl %r10d
-; SCALAR-NEXT:    shlq $32, %r10
-; SCALAR-NEXT:    notl %edi
-; SCALAR-NEXT:    orq %r10, %rdi
-; SCALAR-NEXT:    notl %r9d
-; SCALAR-NEXT:    shlq $32, %r9
-; SCALAR-NEXT:    notl %ecx
-; SCALAR-NEXT:    orq %r9, %rcx
-; SCALAR-NEXT:    notl %r8d
-; SCALAR-NEXT:    shlq $32, %r8
-; SCALAR-NEXT:    notl %eax
-; SCALAR-NEXT:    orq %r8, %rax
+; SCALAR-NEXT:    notq %rdi
+; SCALAR-NEXT:    notq %rcx
+; SCALAR-NEXT:    notq %rax
 ; SCALAR-NEXT:    movq %rax, (%rsi)
 ; SCALAR-NEXT:    movq %rcx, 8(%rsi)
 ; SCALAR-NEXT:    movq %rdi, 16(%rsi)
@@ -4619,95 +4534,20 @@ define void @vec384_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 define void @vec384_v12i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
 ; SCALAR-LABEL: vec384_v12i8:
 ; SCALAR:       # %bb.0:
-; SCALAR-NEXT:    pushq %rbp
-; SCALAR-NEXT:    pushq %r15
-; SCALAR-NEXT:    pushq %r14
-; SCALAR-NEXT:    pushq %r12
-; SCALAR-NEXT:    pushq %rbx
-; SCALAR-NEXT:    movq (%rdi), %r9
-; SCALAR-NEXT:    movq 8(%rdi), %rcx
-; SCALAR-NEXT:    movl %ecx, %eax
-; SCALAR-NEXT:    shrl $8, %eax
-; SCALAR-NEXT:    movl %ecx, %edi
-; SCALAR-NEXT:    shrl $24, %edi
-; SCALAR-NEXT:    movl %ecx, %r8d
-; SCALAR-NEXT:    shrl $16, %r8d
-; SCALAR-NEXT:    movq %r9, %r10
-; SCALAR-NEXT:    shrq $40, %r10
-; SCALAR-NEXT:    movq %r9, %r11
-; SCALAR-NEXT:    shrq $32, %r11
-; SCALAR-NEXT:    movq %r9, %rbx
-; SCALAR-NEXT:    shrq $56, %rbx
-; SCALAR-NEXT:    movq %r9, %r14
-; SCALAR-NEXT:    shrq $48, %r14
-; SCALAR-NEXT:    movl %r9d, %ebp
-; SCALAR-NEXT:    shrl $8, %ebp
-; SCALAR-NEXT:    movl %r9d, %r15d
-; SCALAR-NEXT:    shrl $24, %r15d
-; SCALAR-NEXT:    movl %r9d, %r12d
-; SCALAR-NEXT:    shrl $16, %r12d
-; SCALAR-NEXT:    notb %r12b
-; SCALAR-NEXT:    movzbl %r12b, %r12d
-; SCALAR-NEXT:    notb %r15b
-; SCALAR-NEXT:    movzbl %r15b, %r15d
-; SCALAR-NEXT:    shll $8, %r15d
-; SCALAR-NEXT:    orl %r12d, %r15d
-; SCALAR-NEXT:    shll $16, %r15d
-; SCALAR-NEXT:    notb %r9b
-; SCALAR-NEXT:    movzbl %r9b, %r9d
-; SCALAR-NEXT:    notb %bpl
-; SCALAR-NEXT:    movzbl %bpl, %ebp
-; SCALAR-NEXT:    shll $8, %ebp
-; SCALAR-NEXT:    orl %r9d, %ebp
-; SCALAR-NEXT:    movzwl %bp, %r9d
-; SCALAR-NEXT:    orl %r15d, %r9d
-; SCALAR-NEXT:    notb %r14b
-; SCALAR-NEXT:    movzbl %r14b, %ebp
-; SCALAR-NEXT:    notb %bl
-; SCALAR-NEXT:    movzbl %bl, %ebx
-; SCALAR-NEXT:    shll $8, %ebx
-; SCALAR-NEXT:    orl %ebp, %ebx
-; SCALAR-NEXT:    shll $16, %ebx
-; SCALAR-NEXT:    notb %r11b
-; SCALAR-NEXT:    movzbl %r11b, %r11d
-; SCALAR-NEXT:    notb %r10b
-; SCALAR-NEXT:    movzbl %r10b, %r10d
-; SCALAR-NEXT:    shll $8, %r10d
-; SCALAR-NEXT:    orl %r11d, %r10d
-; SCALAR-NEXT:    movzwl %r10w, %r10d
-; SCALAR-NEXT:    orl %ebx, %r10d
-; SCALAR-NEXT:    notb %r8b
-; SCALAR-NEXT:    movzbl %r8b, %r8d
-; SCALAR-NEXT:    notb %dil
-; SCALAR-NEXT:    movzbl %dil, %edi
-; SCALAR-NEXT:    shll $8, %edi
-; SCALAR-NEXT:    orl %r8d, %edi
-; SCALAR-NEXT:    shll $16, %edi
-; SCALAR-NEXT:    notb %cl
-; SCALAR-NEXT:    movzbl %cl, %ecx
-; SCALAR-NEXT:    notb %al
-; SCALAR-NEXT:    movzbl %al, %eax
-; SCALAR-NEXT:    shll $8, %eax
-; SCALAR-NEXT:    orl %ecx, %eax
-; SCALAR-NEXT:    movzwl %ax, %eax
-; SCALAR-NEXT:    orl %edi, %eax
-; SCALAR-NEXT:    movl %eax, 8(%rsi)
-; SCALAR-NEXT:    shlq $32, %r10
-; SCALAR-NEXT:    orq %r10, %r9
-; SCALAR-NEXT:    movq %r9, (%rsi)
-; SCALAR-NEXT:    movl %eax, 8(%rdx)
-; SCALAR-NEXT:    movq %r9, (%rdx)
-; SCALAR-NEXT:    movl %eax, 24(%rdx)
-; SCALAR-NEXT:    movq %r9, 16(%rdx)
-; SCALAR-NEXT:    movl %eax, 40(%rdx)
-; SCALAR-NEXT:    movq %r9, 32(%rdx)
-; SCALAR-NEXT:    movl %eax, 56(%rdx)
-; SCALAR-NEXT:    movq %r9, 48(%rdx)
-; SCALAR-NEXT:    popq %rbx
-; SCALAR-NEXT:    popq %r12
-; SCALAR-NEXT:    popq %r14
-; SCALAR-NEXT:    popq %r15
-; SCALAR-NEXT:    popq %rbp
+; SCALAR-NEXT:    movq (%rdi), %rax
+; SCALAR-NEXT:    movl 8(%rdi), %ecx
+; SCALAR-NEXT:    notl %ecx
+; SCALAR-NEXT:    notq %rax
+; SCALAR-NEXT:    movq %rax, (%rsi)
+; SCALAR-NEXT:    movl %ecx, 8(%rsi)
+; SCALAR-NEXT:    movl %ecx, 8(%rdx)
+; SCALAR-NEXT:    movq %rax, (%rdx)
+; SCALAR-NEXT:    movl %ecx, 24(%rdx)
+; SCALAR-NEXT:    movq %rax, 16(%rdx)
+; SCALAR-NEXT:    movl %ecx, 40(%rdx)
+; SCALAR-NEXT:    movq %rax, 32(%rdx)
+; SCALAR-NEXT:    movl %ecx, 56(%rdx)
+; SCALAR-NEXT:    movq %rax, 48(%rdx)
 ; SCALAR-NEXT:    retq
 ;
 ; SSE2-ONLY-LABEL: vec384_v12i8:
@@ -4825,47 +4665,12 @@ define void @vec384_v12i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 define void @vec384_v12i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
 ; SCALAR-LABEL: vec384_v12i16:
 ; SCALAR:       # %bb.0:
-; SCALAR-NEXT:    pushq %r14
-; SCALAR-NEXT:    pushq %rbx
 ; SCALAR-NEXT:    movq (%rdi), %rax
 ; SCALAR-NEXT:    movq 8(%rdi), %rcx
-; SCALAR-NEXT:    movq %rax, %r8
-; SCALAR-NEXT:    shrq $32, %r8
-; SCALAR-NEXT:    movq %rax, %r9
-; SCALAR-NEXT:    shrq $48, %r9
-; SCALAR-NEXT:    movq %rcx, %r10
-; SCALAR-NEXT:    shrq $32, %r10
-; SCALAR-NEXT:    movq %rcx, %r11
-; SCALAR-NEXT:    shrq $48, %r11
 ; SCALAR-NEXT:    movq 16(%rdi), %rdi
-; SCALAR-NEXT:    movq %rdi, %rbx
-; SCALAR-NEXT:    shrq $32, %rbx
-; SCALAR-NEXT:    movq %rdi, %r14
-; SCALAR-NEXT:    shrq $48, %r14
-; SCALAR-NEXT:    notl %r14d
-; SCALAR-NEXT:    shll $16, %r14d
-; SCALAR-NEXT:    notl %ebx
-; SCALAR-NEXT:    movzwl %bx, %ebx
-; SCALAR-NEXT:    orl %r14d, %ebx
-; SCALAR-NEXT:    shlq $32, %rbx
-; SCALAR-NEXT:    notl %edi
-; SCALAR-NEXT:    orq %rbx, %rdi
-; SCALAR-NEXT:    notl %r11d
-; SCALAR-NEXT:    shll $16, %r11d
-; SCALAR-NEXT:    notl %r10d
-; SCALAR-NEXT:    movzwl %r10w, %r10d
-; SCALAR-NEXT:    orl %r11d, %r10d
-; SCALAR-NEXT:    shlq $32, %r10
-; SCALAR-NEXT:    notl %ecx
-; SCALAR-NEXT:    orq %r10, %rcx
-; SCALAR-NEXT:    notl %r9d
-; SCALAR-NEXT:    shll $16, %r9d
-; SCALAR-NEXT:    notl %r8d
-; SCALAR-NEXT:    movzwl %r8w, %r8d
-; SCALAR-NEXT:    orl %r9d, %r8d
-; SCALAR-NEXT:    shlq $32, %r8
-; SCALAR-NEXT:    notl %eax
-; SCALAR-NEXT:    orq %r8, %rax
+; SCALAR-NEXT:    notq %rdi
+; SCALAR-NEXT:    notq %rcx
+; SCALAR-NEXT:    notq %rax
 ; SCALAR-NEXT:    movq %rax, (%rsi)
 ; SCALAR-NEXT:    movq %rcx, 8(%rsi)
 ; SCALAR-NEXT:    movq %rdi, 16(%rsi)
@@ -4875,8 +4680,6 @@ define void @vec384_v12i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec
 ; SCALAR-NEXT:    movq %rdi, 48(%rdx)
 ; SCALAR-NEXT:    movq %rcx, 40(%rdx)
 ; SCALAR-NEXT:    movq %rax, 32(%rdx)
-; SCALAR-NEXT:    popq %rbx
-; SCALAR-NEXT:    popq %r14
 ; SCALAR-NEXT:    retq
 ;
 ; SSE2-LABEL: vec384_v12i16:
@@ -5125,144 +4928,9 @@ define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; SCALAR-NEXT:    movq (%rdi), %rax
 ; SCALAR-NEXT:    movq 8(%rdi), %rcx
 ; SCALAR-NEXT:    movq 16(%rdi), %rdi
-; SCALAR-NEXT:    movq %rdi, %r8
-; SCALAR-NEXT:    shrq $40, %r8
-; SCALAR-NEXT:    movq %rdi, %r9
-; SCALAR-NEXT:    shrq $56, %r9
-; SCALAR-NEXT:    movq %rdi, %r10
-; SCALAR-NEXT:    shrq $48, %r10
-; SCALAR-NEXT:    notb %r10b
-; SCALAR-NEXT:    movzbl %r10b, %r10d
-; SCALAR-NEXT:    notb %r9b
-; SCALAR-NEXT:    movzbl %r9b, %r9d
-; SCALAR-NEXT:    shll $8, %r9d
-; SCALAR-NEXT:    orl %r10d, %r9d
-; SCALAR-NEXT:    movq %rdi, %r10
-; SCALAR-NEXT:    shrq $32, %r10
-; SCALAR-NEXT:    notb %r10b
-; SCALAR-NEXT:    movzbl %r10b, %r10d
-; SCALAR-NEXT:    notb %r8b
-; SCALAR-NEXT:    movzbl %r8b, %r8d
-; SCALAR-NEXT:    shll $8, %r8d
-; SCALAR-NEXT:    orl %r10d, %r8d
-; SCALAR-NEXT:    movl %edi, %r10d
-; SCALAR-NEXT:    shrl $24, %r10d
-; SCALAR-NEXT:    shll $16, %r9d
-; SCALAR-NEXT:    movzwl %r8w, %r8d
-; SCALAR-NEXT:    orl %r9d, %r8d
-; SCALAR-NEXT:    movl %edi, %r9d
-; SCALAR-NEXT:    shrl $16, %r9d
-; SCALAR-NEXT:    notb %r9b
-; SCALAR-NEXT:    movzbl %r9b, %r9d
-; SCALAR-NEXT:    notb %r10b
-; SCALAR-NEXT:    movzbl %r10b, %r10d
-; SCALAR-NEXT:    shll $8, %r10d
-; SCALAR-NEXT:    orl %r9d, %r10d
-; SCALAR-NEXT:    movl %edi, %r9d
-; SCALAR-NEXT:    shrl $8, %r9d
-; SCALAR-NEXT:    notb %dil
-; SCALAR-NEXT:    movzbl %dil, %edi
-; SCALAR-NEXT:    notb %r9b
-; SCALAR-NEXT:    movzbl %r9b, %r11d
-; SCALAR-NEXT:    shll $8, %r11d
-; SCALAR-NEXT:    orl %edi, %r11d
-; SCALAR-NEXT:    movq %rcx, %r9
-; SCALAR-NEXT:    shrq $40, %r9
-; SCALAR-NEXT:    shll $16, %r10d
-; SCALAR-NEXT:    movzwl %r11w, %edi
-; SCALAR-NEXT:    orl %r10d, %edi
-; SCALAR-NEXT:    movq %rcx, %r10
-; SCALAR-NEXT:    shrq $56, %r10
-; SCALAR-NEXT:    shlq $32, %r8
-; SCALAR-NEXT:    orq %r8, %rdi
-; SCALAR-NEXT:    movq %rcx, %r8
-; SCALAR-NEXT:    shrq $48, %r8
-; SCALAR-NEXT:    notb %r8b
-; SCALAR-NEXT:    movzbl %r8b, %r8d
-; SCALAR-NEXT:    notb %r10b
-; SCALAR-NEXT:    movzbl %r10b, %r10d
-; SCALAR-NEXT:    shll $8, %r10d
-; SCALAR-NEXT:    orl %r8d, %r10d
-; SCALAR-NEXT:    movq %rcx, %r8
-; SCALAR-NEXT:    shrq $32, %r8
-; SCALAR-NEXT:    notb %r8b
-; SCALAR-NEXT:    movzbl %r8b, %r8d
-; SCALAR-NEXT:    notb %r9b
-; SCALAR-NEXT:    movzbl %r9b, %r9d
-; SCALAR-NEXT:    shll $8, %r9d
-; SCALAR-NEXT:    orl %r8d, %r9d
-; SCALAR-NEXT:    movl %ecx, %r11d
-; SCALAR-NEXT:    shrl $24, %r11d
-; SCALAR-NEXT:    shll $16, %r10d
-; SCALAR-NEXT:    movzwl %r9w, %r8d
-; SCALAR-NEXT:    orl %r10d, %r8d
-; SCALAR-NEXT:    movl %ecx, %r9d
-; SCALAR-NEXT:    shrl $16, %r9d
-; SCALAR-NEXT:    notb %r9b
-; SCALAR-NEXT:    movzbl %r9b, %r9d
-; SCALAR-NEXT:    notb %r11b
-; SCALAR-NEXT:    movzbl %r11b, %r10d
-; SCALAR-NEXT:    shll $8, %r10d
-; SCALAR-NEXT:    orl %r9d, %r10d
-; SCALAR-NEXT:    movl %ecx, %r9d
-; SCALAR-NEXT:    shrl $8, %r9d
-; SCALAR-NEXT:    notb %cl
-; SCALAR-NEXT:    movzbl %cl, %ecx
-; SCALAR-NEXT:    notb %r9b
-; SCALAR-NEXT:    movzbl %r9b, %r11d
-; SCALAR-NEXT:    shll $8, %r11d
-; SCALAR-NEXT:    orl %ecx, %r11d
-; SCALAR-NEXT:    movq %rax, %r9
-; SCALAR-NEXT:    shrq $40, %r9
-; SCALAR-NEXT:    shll $16, %r10d
-; SCALAR-NEXT:    movzwl %r11w, %ecx
-; SCALAR-NEXT:    orl %r10d, %ecx
-; SCALAR-NEXT:    movq %rax, %r10
-; SCALAR-NEXT:    shrq $56, %r10
-; SCALAR-NEXT:    shlq $32, %r8
-; SCALAR-NEXT:    orq %r8, %rcx
-; SCALAR-NEXT:    movq %rax, %r8
-; SCALAR-NEXT:    shrq $48, %r8
-; SCALAR-NEXT:    notb %r8b
-; SCALAR-NEXT:    movzbl %r8b, %r8d
-; SCALAR-NEXT:    notb %r10b
-; SCALAR-NEXT:    movzbl %r10b, %r10d
-; SCALAR-NEXT:    shll $8, %r10d
-; SCALAR-NEXT:    orl %r8d, %r10d
-; SCALAR-NEXT:    movq %rax, %r8
-; SCALAR-NEXT:    shrq $32, %r8
-; SCALAR-NEXT:    notb %r8b
-; SCALAR-NEXT:    movzbl %r8b, %r8d
-; SCALAR-NEXT:    notb %r9b
-; SCALAR-NEXT:    movzbl %r9b, %r9d
-; SCALAR-NEXT:    shll $8, %r9d
-; SCALAR-NEXT:    orl %r8d, %r9d
-; SCALAR-NEXT:    movl %eax, %r11d
-; SCALAR-NEXT:    shrl $24, %r11d
-; SCALAR-NEXT:    shll $16, %r10d
-; SCALAR-NEXT:    movzwl %r9w, %r8d
-; SCALAR-NEXT:    orl %r10d, %r8d
-; SCALAR-NEXT:    movl %eax, %r9d
-; SCALAR-NEXT:    shrl $16, %r9d
-; SCALAR-NEXT:    notb %r9b
-; SCALAR-NEXT:    movzbl %r9b, %r9d
-; SCALAR-NEXT:    notb %r11b
-; SCALAR-NEXT:    movzbl %r11b, %r10d
-; SCALAR-NEXT:    shll $8, %r10d
-; SCALAR-NEXT:    orl %r9d, %r10d
-; SCALAR-NEXT:    movl %eax, %r9d
-; SCALAR-NEXT:    shrl $8, %r9d
-; SCALAR-NEXT:    notb %al
-; SCALAR-NEXT:    movzbl %al, %eax
-; SCALAR-NEXT:    notb %r9b
-; SCALAR-NEXT:    movzbl %r9b, %r9d
-; SCALAR-NEXT:    shll $8, %r9d
-; SCALAR-NEXT:    orl %eax, %r9d
-; SCALAR-NEXT:    shll $16, %r10d
-; SCALAR-NEXT:    movzwl %r9w, %eax
-; SCALAR-NEXT:    orl %r10d, %eax
-; SCALAR-NEXT:    shlq $32, %r8
-; SCALAR-NEXT:    orq %r8, %rax
+; SCALAR-NEXT:    notq %rdi
+; SCALAR-NEXT:    notq %rcx
+; SCALAR-NEXT:    notq %rax
 ; SCALAR-NEXT:    movq %rax, (%rsi)
 ; SCALAR-NEXT:    movq %rcx, 8(%rsi)
 ; SCALAR-NEXT:    movq %rdi, 16(%rsi)
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index cee30f5fe5da..460c5fe11f82 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -1045,16 +1045,12 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: saddo_v4i1:
 ; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT:    vpslld $31, %xmm1, %xmm0
-; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k2
-; AVX512-NEXT:    kandw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    kshiftlw $12, %k2, %k0
-; AVX512-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index 64ed08104885..d06993da6365 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -1062,16 +1062,12 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: ssubo_v4i1:
 ; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
-; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1 {%k1}
-; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 950e943bd902..bac118095331 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1098,16 +1098,12 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: uaddo_v4i1:
 ; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT:    vpslld $31, %xmm1, %xmm0
-; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k2
-; AVX512-NEXT:    kandw %k1, %k0, %k1
-; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    kshiftlw $12, %k2, %k0
-; AVX512-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index 7de972770d8d..ab75ada72f25 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -1145,16 +1145,12 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: usubo_v4i1:
 ; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
-; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1 {%k1}
-; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll
index 78797b9acc2e..27aaad6353ed 100644
--- a/llvm/test/CodeGen/X86/vector-bo-select.ll
+++ b/llvm/test/CodeGen/X86/vector-bo-select.ll
@@ -3137,11 +3137,11 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef
 ; AVX512-LABEL: mul_v8i64_cast_cond:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    kmovw %edi, %k1
-; AVX512-NEXT:    vpsrlq $32, %zmm1, %zmm2
-; AVX512-NEXT:    vpmuludq %zmm2, %zmm0, %zmm2
-; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm3
-; AVX512-NEXT:    vpmuludq %zmm1, %zmm3, %zmm3
-; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm2
+; AVX512-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512-NEXT:    vpsrlq $32, %zmm1, %zmm3
+; AVX512-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
 ; AVX512-NEXT:    vpsllq $32, %zmm2, %zmm2
 ; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm1
 ; AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm0 {%k1}
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index b839452725a9..3aaa9268a8d8 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -58,12 +58,12 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ; SSE41-NEXT:    psrlq %xmm4, %xmm1
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    pandn %xmm3, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
 ; SSE41-NEXT:    paddq %xmm0, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    psllq %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllq %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; SSE41-NEXT:    psllq %xmm2, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -76,11 +76,11 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ; AVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
 ; AVX1-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -158,13 +158,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ; XOPAVX1-LABEL: var_funnnel_v2i64:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
-; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT:    vpshlq %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshlq %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -366,13 +366,13 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
 ; XOPAVX1-LABEL: var_funnnel_v4i32:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT:    vpshld %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; XOPAVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vpshld %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshld %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT:    vpshld %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -646,26 +646,26 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
 ; XOPAVX1-LABEL: var_funnnel_v8i16:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsubw %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT:    vpshlw %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; XOPAVX1-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: var_funnnel_v8i16:
 ; XOPAVX2:       # %bb.0:
 ; XOPAVX2-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; XOPAVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX2-NEXT:    vpsubw %xmm4, %xmm5, %xmm4
-; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; XOPAVX2-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
-; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    retq
 ;
@@ -995,26 +995,26 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
 ; XOPAVX1-LABEL: var_funnnel_v16i8:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; XOPAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; XOPAVX2-LABEL: var_funnnel_v16i8:
 ; XOPAVX2:       # %bb.0:
 ; XOPAVX2-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; XOPAVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX2-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
-; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; XOPAVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
-; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 7b6b0ea83c7e..fc65f759f5fb 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -486,22 +486,22 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
 ; XOPAVX2-LABEL: var_funnnel_v16i16:
 ; XOPAVX2:       # %bb.0:
 ; XOPAVX2-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; XOPAVX2-NEXT:    vpandn %ymm3, %ymm2, %ymm4
 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; XOPAVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; XOPAVX2-NEXT:    vpsubw %xmm5, %xmm6, %xmm5
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm7
-; XOPAVX2-NEXT:    vpshlw %xmm5, %xmm7, %xmm5
-; XOPAVX2-NEXT:    vpsubw %xmm4, %xmm6, %xmm4
-; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1
-; XOPAVX2-NEXT:    vpandn %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; XOPAVX2-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
-; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; XOPAVX2-NEXT:    vpshlw %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; XOPAVX2-NEXT:    vpshlw %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; XOPAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT:    vpsubw %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; XOPAVX2-NEXT:    vpshlw %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm4, %xmm2
+; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
 ; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT:    retq
   %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt)
diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
index 0426c48aecfc..a6067a960fc0 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
@@ -185,13 +185,13 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt)
 ; XOPAVX1-LABEL: var_funnnel_v2i32:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT:    vpshld %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
 ; XOPAVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vpshld %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshld %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT:    vpshld %xmm2, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index c54da38ef10c..75baba5f35f7 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -927,9 +927,9 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psllq $7, %xmm1
-; SSE2-NEXT:    paddq %xmm0, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT:    paddq %xmm0, %xmm1
+; SSE2-NEXT:    psllq $7, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_shift_v2i64:
@@ -975,9 +975,9 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
 ; X86-SSE-LABEL: constant_shift_v2i64:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE-NEXT:    psllq $7, %xmm1
-; X86-SSE-NEXT:    paddq %xmm0, %xmm0
-; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X86-SSE-NEXT:    paddq %xmm0, %xmm1
+; X86-SSE-NEXT:    psllq $7, %xmm0
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; X86-SSE-NEXT:    retl
   %shift = shl <2 x i64> %a, <i64 1, i64 7>
   ret <2 x i64> %shift
diff --git a/llvm/test/DebugInfo/Generic/gmlt.test b/llvm/test/DebugInfo/Generic/gmlt.test
index 3cb3d9c14761..e808c13a04ea 100644
--- a/llvm/test/DebugInfo/Generic/gmlt.test
+++ b/llvm/test/DebugInfo/Generic/gmlt.test
@@ -1,4 +1,4 @@
 ; RUN: %llc_dwarf -O0 -filetype=obj < %S/../Inputs/gmlt.ll | llvm-dwarfdump -v - | FileCheck %S/../Inputs/gmlt.ll
 
 ; There's a darwin specific test in X86/gmlt, so it's okay to XFAIL this here.
-; XFAIL: target={{.*}}-darwin{{.*}}
+; XFAIL: target={{.*}}-apple{{.*}}
diff --git a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll
index d6a1d8d6e136..d28836d56037 100644
--- a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll
+++ b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll
@@ -18,21 +18,21 @@
 ; RELOCS-BOTH:         Section ({{.*}}) .rela.debug_frame {
 ; RELOCS-NORL-NEXT:      0x1C R_LARCH_32 .debug_frame 0x0
 ; RELOCS-NORL-NEXT:      0x20 R_LARCH_64 .text 0x0
-; RELOCS-ENRL-NEXT:      0x1C R_LARCH_32 <null> 0x0
-; RELOCS-ENRL-NEXT:      0x20 R_LARCH_64 <null> 0x0
-; RELOCS-ENRL-NEXT:      0x28 R_LARCH_ADD64 <null> 0x0
-; RELOCS-ENRL-NEXT:      0x28 R_LARCH_SUB64 <null> 0x0
-; RELOCS-ENRL-NEXT:      0x3F R_LARCH_ADD6 <null> 0x0
-; RELOCS-ENRL-NEXT:      0x3F R_LARCH_SUB6 <null> 0x0
+; RELOCS-ENRL-NEXT:      0x1C R_LARCH_32 .L0  0x0
+; RELOCS-ENRL-NEXT:      0x20 R_LARCH_64 .L0  0x0
+; RELOCS-ENRL-NEXT:      0x28 R_LARCH_ADD64 .L0  0x0
+; RELOCS-ENRL-NEXT:      0x28 R_LARCH_SUB64 .L0  0x0
+; RELOCS-ENRL-NEXT:      0x3F R_LARCH_ADD6 .L0  0x0
+; RELOCS-ENRL-NEXT:      0x3F R_LARCH_SUB6 .L0  0x0
 ; RELOCS-BOTH-NEXT:    }
 ; RELOCS-BOTH:         Section ({{.*}}) .rela.debug_line {
 ; RELOCS-BOTH-NEXT:      0x22 R_LARCH_32 .debug_line_str 0x0
 ; RELOCS-BOTH-NEXT:      0x31 R_LARCH_32 .debug_line_str 0x2
 ; RELOCS-BOTH-NEXT:      0x46 R_LARCH_32 .debug_line_str 0x1B
 ; RELOCS-NORL-NEXT:      0x4F R_LARCH_64 .text 0x0
-; RELOCS-ENRL-NEXT:      0x4F R_LARCH_64 <null> 0x0
-; RELOCS-ENRL-NEXT:      0x5F R_LARCH_ADD16 <null> 0x0
-; RELOCS-ENRL-NEXT:      0x5F R_LARCH_SUB16 <null> 0x0
+; RELOCS-ENRL-NEXT:      0x4F R_LARCH_64 .L0  0x0
+; RELOCS-ENRL-NEXT:      0x5F R_LARCH_ADD16 .L0  0x0
+; RELOCS-ENRL-NEXT:      0x5F R_LARCH_SUB16 .L0  0x0
 ; RELOCS-BOTH-NEXT:    }
 ; RELOCS-BOTH-NEXT:  ]
 
diff --git a/llvm/test/DebugInfo/RISCV/dwarf-riscv-relocs.ll b/llvm/test/DebugInfo/RISCV/dwarf-riscv-relocs.ll
index e5de1713f4e0..99594b5e01e9 100644
--- a/llvm/test/DebugInfo/RISCV/dwarf-riscv-relocs.ll
+++ b/llvm/test/DebugInfo/RISCV/dwarf-riscv-relocs.ll
@@ -6,14 +6,14 @@
 
 ; Check that we actually have relocations, otherwise this is kind of pointless.
 ; READOBJ-RELOCS:  Section ({{.*}}) .rela.debug_info {
-; READOBJ-RELOCS:    0x1B R_RISCV_ADD32 <null> 0x0
-; READOBJ-RELOCS-NEXT:    0x1B R_RISCV_SUB32 <null> 0x0
+; READOBJ-RELOCS:    0x1B R_RISCV_ADD32 .L0  0x0
+; READOBJ-RELOCS-NEXT:    0x1B R_RISCV_SUB32 .L0  0x0
 ; READOBJ-RELOCS:  Section ({{.*}}) .rela.debug_frame {
-; READOBJ-RELOCS:    0x20 R_RISCV_ADD32 <null> 0x0
-; READOBJ-RELOCS-NEXT:    0x20 R_RISCV_SUB32 <null> 0x0
+; READOBJ-RELOCS:    0x20 R_RISCV_ADD32 .L0  0x0
+; READOBJ-RELOCS-NEXT:    0x20 R_RISCV_SUB32 .L0  0x0
 ; READOBJ-RELOCS:  Section ({{.*}}) .rela.debug_line {
-; READOBJ-RELOCS:    0x5A R_RISCV_ADD16 <null> 0x0
-; READOBJ-RELOCS-NEXT:    0x5A R_RISCV_SUB16 <null> 0x0
+; READOBJ-RELOCS:    0x5A R_RISCV_ADD16 .L0  0x0
+; READOBJ-RELOCS-NEXT:    0x5A R_RISCV_SUB16 .L0  0x0
 
 ; Check that we can print the source, even with relocations.
 ; OBJDUMP-SOURCE: Disassembly of section .text:
diff --git a/llvm/test/DebugInfo/RISCV/relax-debug-frame.ll b/llvm/test/DebugInfo/RISCV/relax-debug-frame.ll
index f655a7c0a7ef..ffef0ec23406 100644
--- a/llvm/test/DebugInfo/RISCV/relax-debug-frame.ll
+++ b/llvm/test/DebugInfo/RISCV/relax-debug-frame.ll
@@ -4,11 +4,11 @@
 ; RUN:     | FileCheck -check-prefix=RELAX-DWARFDUMP %s
 ;
 ; RELAX:      Section ({{.*}}) .rela.eh_frame {
-; RELAX-NEXT:   0x1C R_RISCV_32_PCREL <null> 0x0
-; RELAX-NEXT:   0x30 R_RISCV_32_PCREL <null> 0x0
-; RELAX-NEXT:   0x44 R_RISCV_32_PCREL <null> 0x0
-; RELAX-NEXT:   0x48 R_RISCV_ADD32 <null> 0x0
-; RELAX-NEXT:   0x48 R_RISCV_SUB32 <null> 0x0
+; RELAX-NEXT:   0x1C R_RISCV_32_PCREL .L0  0x0
+; RELAX-NEXT:   0x30 R_RISCV_32_PCREL .L0  0x0
+; RELAX-NEXT:   0x44 R_RISCV_32_PCREL .L0  0x0
+; RELAX-NEXT:   0x48 R_RISCV_ADD32 .L0  0x0
+; RELAX-NEXT:   0x48 R_RISCV_SUB32 .L0  0x0
 ; RELAX-NEXT:  }
 
 ; RELAX-DWARFDUMP-NOT: error: failed to compute relocation
diff --git a/llvm/test/DebugInfo/Symbolize/ELF/riscv-empty-name-symbol.s b/llvm/test/DebugInfo/Symbolize/ELF/riscv-temporary-symbol.s
index 1e0fa8a30618..0b54f104ab95 100644
--- a/llvm/test/DebugInfo/Symbolize/ELF/riscv-empty-name-symbol.s
+++ b/llvm/test/DebugInfo/Symbolize/ELF/riscv-temporary-symbol.s
@@ -1,10 +1,11 @@
 # REQUIRES: riscv-registered-target
-## Ignore empty name symbols.
+## Ignore .L0 symbols that are generated by LLVM integrated assembler and GNU
+## assembler for .debug_line/.eh_frame related assembler directives.
 
 # RUN: llvm-mc -filetype=obj -triple=riscv64 %s -o %t
 # RUN: llvm-readelf -s %t | FileCheck %s --check-prefix=SYM
 
-# SYM: 0000000000000004  0 NOTYPE LOCAL  DEFAULT [[#]] {{$}}
+# SYM: 0000000000000004  0 NOTYPE LOCAL  DEFAULT [[#]] .L0 {{$}}
 # SYM: 0000000000000000  0 NOTYPE GLOBAL DEFAULT [[#]] foo
 
 ## Make sure we test at an address larger than or equal to an empty name symbol.
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s b/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s
index a5038022dfe0..e7114e4d643c 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s
@@ -7,7 +7,7 @@
 # the section start and section end. So that by relocating these symbol, the section length
 # can be calculated.
 #
-# CHECK: Creating defined graph symbol for ELF symbol ""
+# CHECK: Creating defined graph symbol for ELF symbol ".L0 "
 # CHECK: Creating defined graph symbol for ELF symbol "main"
         .text
         .globl main
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail.ll
deleted file mode 100644
index a7321833d748..000000000000
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; Tests that instrumentation doesn't interfere with lowering (coro-split).
-; It should convert coro.resume followed by a suspend to a musttail call.
-
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-
-define void @f() #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-  %addr1 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-  call fastcc void %addr1(ptr null)
-
-  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-  switch i8 %suspend, label %exit [
-    i8 0, label %await.ready
-    i8 1, label %exit
-  ]
-await.ready:
-  %save2 = call token @llvm.coro.save(ptr null)
-  %addr2 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-  call fastcc void %addr2(ptr null)
-
-  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
-  switch i8 %suspend2, label %exit [
-    i8 0, label %exit
-    i8 1, label %exit
-  ]
-exit:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-}
-
-; Verify that in the initial function resume is not marked with musttail.
-; CHECK-LABEL: @f(
-; CHECK: %[[addr1:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-; CHECK-NOT: musttail call fastcc void %[[addr1]](ptr null)
-
-; Verify that in the resume part resume call is marked with musttail.
-; CHECK-LABEL: @f.resume(
-; CHECK: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-; CHECK: call void @llvm.instrprof
-; CHECK-NEXT: musttail call fastcc void %[[addr2]](ptr null)
-; CHECK-NEXT: ret void
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
-declare i1 @llvm.coro.alloc(token) #2
-declare i64 @llvm.coro.size.i64() #3
-declare ptr @llvm.coro.begin(token, ptr writeonly) #2
-declare token @llvm.coro.save(ptr) #2
-declare ptr @llvm.coro.frame() #3
-declare i8 @llvm.coro.suspend(token, i1) #2
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1
-declare i1 @llvm.coro.end(ptr, i1, token) #2
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1
-declare ptr @malloc(i64)
-
-attributes #0 = { presplitcoroutine }
-attributes #1 = { argmemonly nounwind readonly }
-attributes #2 = { nounwind }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail1.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail1.ll
deleted file mode 100644
index 6098dee9a580..000000000000
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail1.ll
+++ /dev/null
@@ -1,97 +0,0 @@
-; Tests that instrumentation doesn't interfere with lowering (coro-split).
-; It should convert coro.resume followed by a suspend to a musttail call.
-
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-
-define void @f() #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-  %addr1 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-  call fastcc void %addr1(ptr null)
-
-  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-  switch i8 %suspend, label %exit [
-    i8 0, label %await.suspend
-    i8 1, label %exit
-  ]
-await.suspend:
-  %save2 = call token @llvm.coro.save(ptr null)
-  %br0 = call i8 @switch_result()
-  switch i8 %br0, label %unreach [
-    i8 0, label %await.resume3
-    i8 1, label %await.resume1
-    i8 2, label %await.resume2
-  ]
-await.resume1:
-  %hdl = call ptr @g()
-  %addr2 = call ptr @llvm.coro.subfn.addr(ptr %hdl, i8 0)
-  call fastcc void %addr2(ptr %hdl)
-  br label %final.suspend
-await.resume2:
-  %hdl2 = call ptr @h()
-  %addr3 = call ptr @llvm.coro.subfn.addr(ptr %hdl2, i8 0)
-  call fastcc void %addr3(ptr %hdl2)
-  br label %final.suspend
-await.resume3:
-  %addr4 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-  call fastcc void %addr4(ptr null)
-  br label %final.suspend
-final.suspend:
-  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
-  switch i8 %suspend2, label %exit [
-    i8 0, label %pre.exit
-    i8 1, label %exit
-  ]
-pre.exit:
-  br label %exit
-exit:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-unreach:
-  unreachable
-}
-
-; Verify that in the initial function resume is not marked with musttail.
-; CHECK-LABEL: @f(
-; CHECK: %[[addr1:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-; CHECK-NOT: musttail call fastcc void %[[addr1]](ptr null)
-
-; Verify that in the resume part resume call is marked with musttail.
-; CHECK-LABEL: @f.resume(
-; CHECK: %[[hdl:.+]] = call ptr @g()
-; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl]], i8 0)
-; CHECK: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
-; CHECK-NEXT: ret void
-; CHECK: %[[hdl2:.+]] = call ptr @h()
-; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl2]], i8 0)
-; CHECK: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
-; CHECK-NEXT: ret void
-; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-; CHECK: musttail call fastcc void %[[addr4]](ptr null)
-; CHECK-NEXT: ret void
-
-
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
-declare i1 @llvm.coro.alloc(token) #2
-declare i64 @llvm.coro.size.i64() #3
-declare ptr @llvm.coro.begin(token, ptr writeonly) #2
-declare token @llvm.coro.save(ptr) #2
-declare ptr @llvm.coro.frame() #3
-declare i8 @llvm.coro.suspend(token, i1) #2
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1
-declare i1 @llvm.coro.end(ptr, i1, token) #2
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1
-declare ptr @malloc(i64)
-declare i8 @switch_result()
-declare ptr @g()
-declare ptr @h()
-
-attributes #0 = { presplitcoroutine }
-attributes #1 = { argmemonly nounwind readonly }
-attributes #2 = { nounwind }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail10.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail10.ll
deleted file mode 100644
index f43b10ebf42e..000000000000
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail10.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; Tests that instrumentation doesn't interfere with lowering (coro-split).
-; It should convert coro.resume followed by a suspend to a musttail call.
-
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-
-target triple = "wasm64-unknown-unknown"
-
-define void @f() #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-  %addr1 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-  call fastcc void %addr1(ptr null)
-
-  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-  switch i8 %suspend, label %exit [
-    i8 0, label %await.ready
-    i8 1, label %exit
-  ]
-await.ready:
-  %save2 = call token @llvm.coro.save(ptr null)
-  %addr2 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-  call fastcc void %addr2(ptr null)
-
-  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
-  switch i8 %suspend2, label %exit [
-    i8 0, label %exit
-    i8 1, label %exit
-  ]
-exit:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-}
-
-; CHECK: musttail call
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
-declare i1 @llvm.coro.alloc(token) #2
-declare i64 @llvm.coro.size.i64() #3
-declare ptr @llvm.coro.begin(token, ptr writeonly) #2
-declare token @llvm.coro.save(ptr) #2
-declare ptr @llvm.coro.frame() #3
-declare i8 @llvm.coro.suspend(token, i1) #2
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1
-declare i1 @llvm.coro.end(ptr, i1, token) #2
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1
-declare ptr @malloc(i64)
-
-attributes #0 = { presplitcoroutine "target-features"="+tail-call" }
-attributes #1 = { argmemonly nounwind readonly }
-attributes #2 = { nounwind }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail11.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail11.ll
deleted file mode 100644
index fc5bb9a1b20b..000000000000
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail11.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; Tests that instrumentation doesn't interfere with lowering (coro-split).
-; It should convert coro.resume followed by a suspend to a musttail call.
-
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-
-target triple = "wasm32-unknown-unknown"
-
-define void @f() #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-  %addr1 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-  call fastcc void %addr1(ptr null)
-
-  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-  switch i8 %suspend, label %exit [
-    i8 0, label %await.ready
-    i8 1, label %exit
-  ]
-await.ready:
-  %save2 = call token @llvm.coro.save(ptr null)
-  %addr2 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-  call fastcc void %addr2(ptr null)
-
-  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
-  switch i8 %suspend2, label %exit [
-    i8 0, label %exit
-    i8 1, label %exit
-  ]
-exit:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-}
-
-; CHECK: musttail call
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
-declare i1 @llvm.coro.alloc(token) #2
-declare i64 @llvm.coro.size.i64() #3
-declare ptr @llvm.coro.begin(token, ptr writeonly) #2
-declare token @llvm.coro.save(ptr) #2
-declare ptr @llvm.coro.frame() #3
-declare i8 @llvm.coro.suspend(token, i1) #2
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1
-declare i1 @llvm.coro.end(ptr, i1, token) #2
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1
-declare ptr @malloc(i64)
-
-attributes #0 = { presplitcoroutine "target-features"="+tail-call" }
-attributes #1 = { argmemonly nounwind readonly }
-attributes #2 = { nounwind }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail12.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail12.ll
deleted file mode 100644
index 634d0106a2e6..000000000000
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail12.ll
+++ /dev/null
@@ -1,85 +0,0 @@
-; Tests that instrumentation doesn't interfere with lowering (coro-split).
-; It should convert coro.resume followed by a suspend to a musttail call.
-
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-
-declare void @fakeresume1(ptr)
-declare void @print()
-
-define void @f(i1 %cond) #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-
-  %init_suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-  switch i8 %init_suspend, label %coro.end [
-    i8 0, label %await.ready
-    i8 1, label %coro.end
-  ]
-await.ready:
-  %save2 = call token @llvm.coro.save(ptr null)
-  br i1 %cond, label %then, label %else
-
-then:
-  call fastcc void @fakeresume1(ptr align 8 null)
-  br label %merge
-
-else:
-  br label %merge
-
-merge:
-  %v0 = phi i1 [0, %then], [1, %else]
-  br label %compare
-
-compare:
-  %cond.cmp = icmp eq i1 %v0, 0
-  br i1 %cond.cmp, label %ready, label %prepare
-
-prepare:
-  call void @print()
-  br label %ready
-
-ready:
-  %suspend = call i8 @llvm.coro.suspend(token %save2, i1 true)
-  %switch = icmp ult i8 %suspend, 2
-  br i1 %switch, label %cleanup, label %coro.end
-
-cleanup:
-  %free.handle = call ptr @llvm.coro.free(token %id, ptr %vFrame)
-  %.not = icmp eq ptr %free.handle, null
-  br i1 %.not, label %coro.end, label %coro.free
-
-coro.free:
-  call void @delete(ptr nonnull %free.handle) #2
-  br label %coro.end
-
-coro.end:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-}
-
-; CHECK-LABEL: @f.resume(
-; CHECK-NOT:      }
-; CHECK:          call void @print()
-
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
-declare i1 @llvm.coro.alloc(token) #2
-declare i64 @llvm.coro.size.i64() #3
-declare ptr @llvm.coro.begin(token, ptr writeonly) #2
-declare token @llvm.coro.save(ptr) #2
-declare ptr @llvm.coro.frame() #3
-declare i8 @llvm.coro.suspend(token, i1) #2
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1
-declare i1 @llvm.coro.end(ptr, i1, token) #2
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1
-declare ptr @malloc(i64)
-declare void @delete(ptr nonnull) #2
-
-attributes #0 = { presplitcoroutine }
-attributes #1 = { argmemonly nounwind readonly }
-attributes #2 = { nounwind }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail13.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail13.ll
deleted file mode 100644
index 2f9a14c90107..000000000000
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail13.ll
+++ /dev/null
@@ -1,76 +0,0 @@
-; Tests that instrumentation doesn't interfere with lowering (coro-split).
-; It should convert coro.resume followed by a suspend to a musttail call.
-
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-
-declare void @fakeresume1(ptr)
-declare void @may_throw(ptr)
-declare void @print()
-
-define void @f(i1 %cond) #0 personality i32 3 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-
-  %init_suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-  switch i8 %init_suspend, label %coro.end [
-    i8 0, label %await.ready
-    i8 1, label %coro.end
-  ]
-await.ready:
-  call fastcc void @fakeresume1(ptr align 8 null)
-  invoke void @may_throw(ptr null)
-    to label %ready unwind label %lpad
-
-ready:
-  %save2 = call token @llvm.coro.save(ptr null)
-  %suspend = call i8 @llvm.coro.suspend(token %save2, i1 true)
-  %switch = icmp ult i8 %suspend, 2
-  br i1 %switch, label %cleanup, label %coro.end
-
-cleanup:
-  %free.handle = call ptr @llvm.coro.free(token %id, ptr %vFrame)
-  %.not = icmp eq ptr %free.handle, null
-  br i1 %.not, label %coro.end, label %coro.free
-
-lpad:
-  %lpval = landingpad { ptr, i32 }
-     cleanup
-
-  %need.resume = call i1 @llvm.coro.end(ptr null, i1 true, token none)
-  resume { ptr, i32 } %lpval
-
-coro.free:
-  call void @delete(ptr nonnull %free.handle) #2
-  br label %coro.end
-
-coro.end:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-}
-
-; CHECK-LABEL: @f.resume(
-; CHECK-NOT:          musttail call fastcc void @fakeresume1(
-; CHECK:     }
-
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
-declare i1 @llvm.coro.alloc(token) #2
-declare i64 @llvm.coro.size.i64() #3
-declare ptr @llvm.coro.begin(token, ptr writeonly) #2
-declare token @llvm.coro.save(ptr) #2
-declare ptr @llvm.coro.frame() #3
-declare i8 @llvm.coro.suspend(token, i1) #2
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1
-declare i1 @llvm.coro.end(ptr, i1, token) #2
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1
-declare ptr @malloc(i64)
-declare void @delete(ptr nonnull) #2
-
-attributes #0 = { presplitcoroutine }
-attributes #1 = { argmemonly nounwind readonly }
-attributes #2 = { nounwind }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail2.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail2.ll
deleted file mode 100644
index 61b61a200e70..000000000000
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail2.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; Tests that instrumentation doesn't interfere with lowering (coro-split).
-; It should convert coro.resume followed by a suspend to a musttail call.
-
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-
-define void @fakeresume1(ptr)  {
-entry:
-  ret void;
-}
-
-define void @fakeresume2(ptr align 8)  {
-entry:
-  ret void;
-}
-
-define void @g() #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-  call fastcc void @fakeresume1(ptr null)
-
-  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-  switch i8 %suspend, label %exit [
-    i8 0, label %await.ready
-    i8 1, label %exit
-  ]
-await.ready:
-  %save2 = call token @llvm.coro.save(ptr null)
-  call fastcc void @fakeresume2(ptr align 8 null)
-
-  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
-  switch i8 %suspend2, label %exit [
-    i8 0, label %exit
-    i8 1, label %exit
-  ]
-exit:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-}
-
-; Verify that in the initial function resume is not marked with musttail.
-; CHECK-LABEL: @g(
-; CHECK-NOT: musttail call fastcc void @fakeresume1(ptr null)
-
-; Verify that in the resume part resume call is marked with musttail.
-; CHECK-LABEL: @g.resume(
-; CHECK: musttail call fastcc void @fakeresume2(ptr align 8 null)
-; CHECK-NEXT: ret void
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
-declare i1 @llvm.coro.alloc(token) #2
-declare i64 @llvm.coro.size.i64() #3
-declare ptr @llvm.coro.begin(token, ptr writeonly) #2
-declare token @llvm.coro.save(ptr) #2
-declare ptr @llvm.coro.frame() #3
-declare i8 @llvm.coro.suspend(token, i1) #2
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1
-declare i1 @llvm.coro.end(ptr, i1, token) #2
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1
-declare ptr @malloc(i64)
-
-attributes #0 = { presplitcoroutine }
-attributes #1 = { argmemonly nounwind readonly }
-attributes #2 = { nounwind }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail3.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail3.ll
deleted file mode 100644
index 82176b8085e6..000000000000
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail3.ll
+++ /dev/null
@@ -1,91 +0,0 @@
-; Tests that instrumentation doesn't interfere with lowering (coro-split).
-; It should convert coro.resume followed by a suspend to a musttail call.
-
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-
-define void @f() #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-  %addr1 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-  call fastcc void %addr1(ptr null)
-
-  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-  %cmp = icmp eq i8 %suspend, 0
-  br i1 %cmp, label %await.suspend, label %exit
-await.suspend:
-  %save2 = call token @llvm.coro.save(ptr null)
-  %br0 = call i8 @switch_result()
-  switch i8 %br0, label %unreach [
-    i8 0, label %await.resume3
-    i8 1, label %await.resume1
-    i8 2, label %await.resume2
-  ]
-await.resume1:
-  %hdl = call ptr @g()
-  %addr2 = call ptr @llvm.coro.subfn.addr(ptr %hdl, i8 0)
-  call fastcc void %addr2(ptr %hdl)
-  br label %final.suspend
-await.resume2:
-  %hdl2 = call ptr @h()
-  %addr3 = call ptr @llvm.coro.subfn.addr(ptr %hdl2, i8 0)
-  call fastcc void %addr3(ptr %hdl2)
-  br label %final.suspend
-await.resume3:
-  %addr4 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-  call fastcc void %addr4(ptr null)
-  br label %final.suspend
-final.suspend:
-  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
-  %cmp2 = icmp eq i8 %suspend2, 0
-  br i1 %cmp2, label %pre.exit, label %exit
-pre.exit:
-  br label %exit
-exit:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-unreach:
-  unreachable
-}
-
-; Verify that in the initial function resume is not marked with musttail.
-; CHECK-LABEL: @f(
-; CHECK: %[[addr1:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-; CHECK-NOT: musttail call fastcc void %[[addr1]](ptr null)
-
-; Verify that in the resume part resume call is marked with musttail.
-; CHECK-LABEL: @f.resume(
-; CHECK: %[[hdl:.+]] = call ptr @g()
-; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl]], i8 0)
-; CHECK: musttail call fastcc void %[[addr2]](ptr %[[hdl]])
-; CHECK-NEXT: ret void
-; CHECK: %[[hdl2:.+]] = call ptr @h()
-; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl2]], i8 0)
-; CHECK: musttail call fastcc void %[[addr3]](ptr %[[hdl2]])
-; CHECK-NEXT: ret void
-; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-; CHECK: musttail call fastcc void %[[addr4]](ptr null)
-; CHECK-NEXT: ret void
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
-declare i1 @llvm.coro.alloc(token) #2
-declare i64 @llvm.coro.size.i64() #3
-declare ptr @llvm.coro.begin(token, ptr writeonly) #2
-declare token @llvm.coro.save(ptr) #2
-declare ptr @llvm.coro.frame() #3
-declare i8 @llvm.coro.suspend(token, i1) #2
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1
-declare i1 @llvm.coro.end(ptr, i1, token) #2
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1
-declare ptr @malloc(i64)
-declare i8 @switch_result()
-declare ptr @g()
-declare ptr @h()
-
-attributes #0 = { presplitcoroutine }
-attributes #1 = { argmemonly nounwind readonly }
-attributes #2 = { nounwind }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail4.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail4.ll
deleted file mode 100644
index be70fc4b51f1..000000000000
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail4.ll
+++ /dev/null
@@ -1,66 +0,0 @@
-; Tests that instrumentation doesn't interfere with lowering (coro-split).
-; It should convert coro.resume followed by a suspend to a musttail call.
-
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-
-define void @fakeresume1(ptr)  {
-entry:
-  ret void;
-}
-
-define void @f() #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-
-  %init_suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-  switch i8 %init_suspend, label %coro.end [
-    i8 0, label %await.ready
-    i8 1, label %coro.end
-  ]
-await.ready:
-  %save2 = call token @llvm.coro.save(ptr null)
-
-  call fastcc void @fakeresume1(ptr align 8 null)
-  %suspend = call i8 @llvm.coro.suspend(token %save2, i1 true)
-  %switch = icmp ult i8 %suspend, 2
-  br i1 %switch, label %cleanup, label %coro.end
-
-cleanup:
-  %free.handle = call ptr @llvm.coro.free(token %id, ptr %vFrame)
-  %.not = icmp eq ptr %free.handle, null
-  br i1 %.not, label %coro.end, label %coro.free
-
-coro.free:
-  call void @delete(ptr nonnull %free.handle) #2
-  br label %coro.end
-
-coro.end:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-}
-
-; CHECK-LABEL: @f.resume(
-; CHECK:          musttail call fastcc void @fakeresume1(
-; CHECK-NEXT:     ret void
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
-declare i1 @llvm.coro.alloc(token) #2
-declare i64 @llvm.coro.size.i64() #3
-declare ptr @llvm.coro.begin(token, ptr writeonly) #2
-declare token @llvm.coro.save(ptr) #2
-declare ptr @llvm.coro.frame() #3
-declare i8 @llvm.coro.suspend(token, i1) #2
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1
-declare i1 @llvm.coro.end(ptr, i1, token) #2
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1
-declare ptr @malloc(i64)
-declare void @delete(ptr nonnull) #2
-
-attributes #0 = { presplitcoroutine }
-attributes #1 = { argmemonly nounwind readonly }
-attributes #2 = { nounwind }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail5.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail5.ll
deleted file mode 100644
index 3e5bddd8e131..000000000000
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail5.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; Tests that instrumentation doesn't interfere with lowering (coro-split).
-; It should convert coro.resume followed by a suspend to a musttail call.
-
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-
-declare void @fakeresume1(ptr align 8)
-
-define void @g() #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %alloc.var = alloca i8
-  call void @llvm.lifetime.start.p0(i64 1, ptr %alloc.var)
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-
-  switch i8 %suspend, label %exit [
-    i8 0, label %await.suspend
-    i8 1, label %exit
-  ]
-await.suspend:
-  %save2 = call token @llvm.coro.save(ptr null)
-  call fastcc void @fakeresume1(ptr align 8 null)
-  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
-  switch i8 %suspend2, label %exit [
-    i8 0, label %await.ready
-    i8 1, label %exit
-  ]
-await.ready:
-  call void @consume(ptr %alloc.var)
-  call void @llvm.lifetime.end.p0(i64 1, ptr %alloc.var)
-  br label %exit
-exit:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-}
-
-; Verify that in the resume part resume call is marked with musttail.
-; CHECK-LABEL: @g.resume(
-; CHECK:          musttail call fastcc void @fakeresume1(ptr align 8 null)
-; CHECK-NEXT:     ret void
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
-declare i1 @llvm.coro.alloc(token) #2
-declare i64 @llvm.coro.size.i64() #3
-declare ptr @llvm.coro.begin(token, ptr writeonly) #2
-declare token @llvm.coro.save(ptr) #2
-declare ptr @llvm.coro.frame() #3
-declare i8 @llvm.coro.suspend(token, i1) #2
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1
-declare i1 @llvm.coro.end(ptr, i1, token) #2
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1
-declare ptr @malloc(i64)
-declare void @consume(ptr)
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-
-attributes #0 = { presplitcoroutine }
-attributes #1 = { argmemonly nounwind readonly }
-attributes #2 = { nounwind }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail6.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail6.ll
deleted file mode 100644
index 4359d5305d4d..000000000000
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail6.ll
+++ /dev/null
@@ -1,112 +0,0 @@
-; Tests that instrumentation doesn't interfere with lowering (coro-split).
-; It should convert coro.resume followed by a suspend to a musttail call.
-
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-
-declare void @fakeresume1(ptr align 8)
-
-define void @g() #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %alloc.var = alloca i64
-  call void @llvm.lifetime.start.p0(i64 1, ptr %alloc.var)
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-
-  switch i8 %suspend, label %exit [
-    i8 0, label %await.suspend
-    i8 1, label %exit
-  ]
-await.suspend:
-  %save2 = call token @llvm.coro.save(ptr null)
-  call fastcc void @fakeresume1(ptr align 8 null)
-  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
-  switch i8 %suspend2, label %exit [
-    i8 0, label %await.ready
-    i8 1, label %exit
-  ]
-await.ready:
-  call void @consume(ptr %alloc.var)
-  call void @llvm.lifetime.end.p0(i64 1, ptr %alloc.var)
-  br label %exit
-exit:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-}
-
-; Verify that in the resume part resume call is marked with musttail.
-; CHECK-LABEL: @g.resume(
-; CHECK:      musttail call fastcc void @fakeresume1(ptr align 8 null)
-; CHECK-NEXT: ret void
-
-; It has a cleanup bb.
-define void @f() #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %alloc.var = alloca i64
-  call void @llvm.lifetime.start.p0(i64 1, ptr %alloc.var)
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-
-  switch i8 %suspend, label %exit [
-    i8 0, label %await.suspend
-    i8 1, label %exit
-  ]
-await.suspend:
-  %save2 = call token @llvm.coro.save(ptr null)
-  call fastcc void @fakeresume1(ptr align 8 null)
-  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
-  switch i8 %suspend2, label %exit [
-    i8 0, label %await.ready
-    i8 1, label %cleanup
-  ]
-await.ready:
-  call void @consume(ptr %alloc.var)
-  call void @llvm.lifetime.end.p0(i64 1, ptr %alloc.var)
-  br label %exit
-
-cleanup:
-  %free.handle = call ptr @llvm.coro.free(token %id, ptr %vFrame)
-  %.not = icmp eq ptr %free.handle, null
-  br i1 %.not, label %exit, label %coro.free
-
-coro.free:
-  call void @delete(ptr nonnull %free.handle) #2
-  br label %exit
-
-exit:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-}
-
-; Verify that in the resume part resume call is marked with musttail.
-; CHECK-LABEL: @f.resume(
-; CHECK:      musttail call fastcc void @fakeresume1(ptr align 8 null)
-; CHECK-NEXT: ret void
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
-declare i1 @llvm.coro.alloc(token) #2
-declare i64 @llvm.coro.size.i64() #3
-declare ptr @llvm.coro.begin(token, ptr writeonly) #2
-declare token @llvm.coro.save(ptr) #2
-declare ptr @llvm.coro.frame() #3
-declare i8 @llvm.coro.suspend(token, i1) #2
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1
-declare i1 @llvm.coro.end(ptr, i1, token) #2
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1
-declare ptr @malloc(i64)
-declare void @delete(ptr nonnull) #2
-declare void @consume(ptr)
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-
-attributes #0 = { presplitcoroutine }
-attributes #1 = { argmemonly nounwind readonly }
-attributes #2 = { nounwind }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail7.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail7.ll
deleted file mode 100644
index 2a14be0f9218..000000000000
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail7.ll
+++ /dev/null
@@ -1,115 +0,0 @@
-; Tests that instrumentation doesn't interfere with lowering (coro-split).
-; It should convert coro.resume followed by a suspend to a musttail call.
-
-; The difference between this and coro-split-musttail5.ll and coro-split-musttail5.ll
-; is that this contains dead instruction generated during the transformation,
-; which makes the optimization harder.
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-
-declare void @fakeresume1(ptr align 8)
-
-define void @g() #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %alloc.var = alloca i64
-  call void @llvm.lifetime.start.p0(i64 1, ptr %alloc.var)
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-
-  switch i8 %suspend, label %exit [
-    i8 0, label %await.suspend
-    i8 1, label %exit
-  ]
-await.suspend:
-  %save2 = call token @llvm.coro.save(ptr null)
-  call fastcc void @fakeresume1(ptr align 8 null)
-  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
-  switch i8 %suspend2, label %exit [
-    i8 0, label %await.ready
-    i8 1, label %exit
-  ]
-await.ready:
-  call void @consume(ptr %alloc.var)
-  call void @llvm.lifetime.end.p0(i64 1, ptr %alloc.var)
-  br label %exit
-exit:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-}
-
-; Verify that in the resume part resume call is marked with musttail.
-; CHECK-LABEL: @g.resume(
-; CHECK:         musttail call fastcc void @fakeresume1(ptr align 8 null)
-; CHECK-NEXT:    ret void
-
-; It has a cleanup bb.
-define void @f() #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %alloc.var = alloca i64
-  call void @llvm.lifetime.start.p0(i64 1, ptr %alloc.var)
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-
-  switch i8 %suspend, label %exit [
-    i8 0, label %await.suspend
-    i8 1, label %exit
-  ]
-await.suspend:
-  %save2 = call token @llvm.coro.save(ptr null)
-  call fastcc void @fakeresume1(ptr align 8 null)
-  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
-  switch i8 %suspend2, label %exit [
-    i8 0, label %await.ready
-    i8 1, label %cleanup
-  ]
-await.ready:
-  call void @consume(ptr %alloc.var)
-  call void @llvm.lifetime.end.p0(i64 1, ptr %alloc.var)
-  br label %exit
-
-cleanup:
-  %free.handle = call ptr @llvm.coro.free(token %id, ptr %vFrame)
-  %.not = icmp eq ptr %free.handle, null
-  br i1 %.not, label %exit, label %coro.free
-
-coro.free:
-  call void @delete(ptr nonnull %free.handle) #2
-  br label %exit
-
-exit:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-}
-
-; Verify that in the resume part resume call is marked with musttail.
-; CHECK-LABEL: @f.resume(
-; CHECK:         musttail call fastcc void @fakeresume1(ptr align 8 null)
-; CHECK-NEXT:    ret void
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
-declare i1 @llvm.coro.alloc(token) #2
-declare i64 @llvm.coro.size.i64() #3
-declare ptr @llvm.coro.begin(token, ptr writeonly) #2
-declare token @llvm.coro.save(ptr) #2
-declare ptr @llvm.coro.frame() #3
-declare i8 @llvm.coro.suspend(token, i1) #2
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1
-declare i1 @llvm.coro.end(ptr, i1, token) #2
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1
-declare ptr @malloc(i64)
-declare void @delete(ptr nonnull) #2
-declare void @consume(ptr)
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-
-attributes #0 = { presplitcoroutine }
-attributes #1 = { argmemonly nounwind readonly }
-attributes #2 = { nounwind }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
new file mode 100644
index 000000000000..39b2b6225d8b
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -0,0 +1,3617 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt %s -S -passes=msan 2>&1 | FileCheck %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test1(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to x86_mmx
+  %3 = bitcast <4 x i16> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test88(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test87(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test86(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test85(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test84(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test83(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test82(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test81(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test80(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test79(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test78(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test77(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test76(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP16:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP16]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
+; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP22]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test75(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP16:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP16]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <2 x i32> [[TMP20]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP19]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <2 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <2 x i1> [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP11]] to x86_mmx
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[TMP14]], x86_mmx [[TMP15]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
+; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP22]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test74(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP16:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP16]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
+; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP22]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) nounwind readnone
+
+define i64 @test73(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test73(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
+; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[TMP10]], i32 3)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) nounwind readnone
+
+define i64 @test72(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test72(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
+; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 3)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %3 = bitcast <4 x i16> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+define i64 @test72_2(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test72_2(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
+; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 0) nounwind
+  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %3 = bitcast <4 x i16> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
+
+define i64 @test71(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test71(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
+; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[TMP6]], i32 3)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to i64
+  ret i64 %2
+}
+
+declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) nounwind readnone
+
+define i64 @test70(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test70(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
+; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 3)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+define i64 @test70_2(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test70_2(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
+; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 0) nounwind
+  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone
+
+define i64 @test69(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test69(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
+; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[TMP10]], i32 3)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %3 = bitcast <4 x i16> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone
+
+define i64 @test68(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test68(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
+; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[TMP6]], i32 3)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to i64
+  ret i64 %2
+}
+
+declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) nounwind readnone
+
+define i64 @test67(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test67(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
+; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[TMP10]], i32 3)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to <2 x i32>
+  %3 = bitcast <2 x i32> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) nounwind readnone
+
+define i64 @test66(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test66(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
+; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 3)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 3) nounwind
+  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %3 = bitcast <4 x i16> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+define i64 @test66_2(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test66_2(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
+; CHECK-NEXT:    [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 0) nounwind
+  %2 = bitcast x86_mmx %1 to <4 x i16>
+  %3 = bitcast <4 x i16> %2 to <1 x i64>
+  %4 = extractelement <1 x i64> %3, i32 0
+  ret i64 %4
+}
+
+declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test65(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
+; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <1 x i64> [[TMP16]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP17]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test64(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
+; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <1 x i64> [[TMP16]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP17]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test63(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = sext i1 [[TMP9]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
+; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64
+; CHECK-NEXT:    store i64 [[TMP11]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP5]]
+;
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test62(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
+; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <1 x i64> [[TMP16]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP17]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test61(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
+; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <1 x i64> [[TMP16]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP17]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test60(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = sext i1 [[TMP9]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
+; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64
+; CHECK-NEXT:    store i64 [[TMP11]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP5]]
+;
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test59(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
+; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <1 x i64> [[TMP16]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP17]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test58(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
+; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <1 x i64> [[TMP16]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP17]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1.i = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test56(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test55(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test54(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test53(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test52(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+define i64 @test51(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test51(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test50(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test49(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP13:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP19]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64 [[TMP12]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP18]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test48(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test47(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test46(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test45(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+define i64 @test44(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test44(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
+; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var = bitcast i64 %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1 = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
+
+declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test43(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test42(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test41(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test40(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test39(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test38(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test37(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test36(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
+; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var = bitcast i64 %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1 = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test35(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test34(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test33(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test32(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP12]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = sext i1 [[TMP9]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP10]], 48
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    store i64 [[TMP11]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test31(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test30(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test29(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test28(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test27(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test26(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare void @llvm.x86.mmx.movnt.dq(ptr, x86_mmx) nounwind
+
+define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 {
+; CHECK-LABEL: define void @test25(
+; CHECK-SAME: ptr [[P:%.*]], <1 x i64> [[A:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    tail call void @llvm.x86.mmx.movnt.dq(ptr [[P]], x86_mmx [[MMX_VAR_I]]) #[[ATTR2]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var.i = bitcast i64 %0 to x86_mmx
+  tail call void @llvm.x86.mmx.movnt.dq(ptr %p, x86_mmx %mmx_var.i) nounwind
+  ret void
+}
+
+declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx) nounwind readnone
+
+define i32 @test24(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i32 @test24(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP6]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[MMX_VAR_I]]) #[[ATTR2]]
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %0 to x86_mmx
+  %1 = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %mmx_var.i) nounwind
+  ret i32 %1
+}
+
+declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind
+
+define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp #0 {
+; CHECK-LABEL: define void @test23(
+; CHECK-SAME: <1 x i64> [[D:%.*]], <1 x i64> [[N:%.*]], ptr [[P:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[N]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[D]] to <8 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP5]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK:       9:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       10:
+; CHECK-NEXT:    tail call void @llvm.x86.mmx.maskmovq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]], ptr [[P]]) #[[ATTR2]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = bitcast <1 x i64> %n to <8 x i8>
+  %1 = bitcast <1 x i64> %d to <8 x i8>
+  %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
+  %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
+  tail call void @llvm.x86.mmx.maskmovq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i, ptr %p) nounwind
+  ret void
+}
+
+declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test22(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP13]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
+  %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
+
+define i64 @test21(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test21(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP5]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+define i32 @test21_2(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i32 @test21_2(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i32 [[TMP5]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <2 x i32>
+  %5 = extractelement <2 x i32> %4, i32 0
+  ret i32 %5
+}
+
+declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test20(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP5]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to i64
+; CHECK-NEXT:    [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP9]] to i64
+; CHECK-NEXT:    [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
+  %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
+
+define <2 x double> @test19(<1 x i64> %a) #0 {
+; CHECK-LABEL: define <2 x double> @test19(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx [[TMP1]]) #[[ATTR5]]
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %1) nounwind readnone
+  ret <2 x double> %2
+}
+
+declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
+
+define i64 @test18(<2 x double> %a) #0 {
+; CHECK-LABEL: define i64 @test18(
+; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> [[A]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+entry:
+  %0 = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone
+  %1 = bitcast x86_mmx %0 to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to <1 x i64>
+  %3 = extractelement <1 x i64> %2, i32 0
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
+
+define i64 @test17(<2 x double> %a) #0 {
+; CHECK-LABEL: define i64 @test17(
+; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK:       2:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> [[A]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+entry:
+  %0 = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone
+  %1 = bitcast x86_mmx %0 to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to <1 x i64>
+  %3 = extractelement <1 x i64> %2, i32 0
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone
+
+define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test16(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP6:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT:    [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
+; CHECK-NEXT:    [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[_MSPROP1]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]], i8 16)
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+entry:
+  %0 = extractelement <1 x i64> %a, i32 0
+  %mmx_var = bitcast i64 %0 to x86_mmx
+  %1 = extractelement <1 x i64> %b, i32 0
+  %mmx_var1 = bitcast i64 %1 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %mmx_var, x86_mmx %mmx_var1, i8 16)
+  %3 = bitcast x86_mmx %2 to i64
+  ret i64 %3
+}
+
+declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone
+
+define i64 @test15(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test15(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx [[TMP1]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP6]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP10]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %1) nounwind readnone
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone
+
+define i64 @test14(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test14(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx [[TMP1]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP6]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP10]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <4 x i16>
+  %1 = bitcast <4 x i16> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %1) nounwind readnone
+  %3 = bitcast x86_mmx %2 to <4 x i16>
+  %4 = bitcast <4 x i16> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone
+
+define i64 @test13(<1 x i64> %a) #0 {
+; CHECK-LABEL: define i64 @test13(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx [[TMP1]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <8 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0
+; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP10]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a to <8 x i8>
+  %1 = bitcast <8 x i8> %0 to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %1) nounwind readnone
+  %3 = bitcast x86_mmx %2 to <8 x i8>
+  %4 = bitcast <8 x i8> %3 to <1 x i64>
+  %5 = extractelement <1 x i64> %4, i32 0
+  ret i64 %5
+}
+
+declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test12(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to x86_mmx
+  %3 = bitcast <2 x i32> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <2 x i32>
+  %6 = bitcast <2 x i32> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test11(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to x86_mmx
+  %3 = bitcast <4 x i16> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test10(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to x86_mmx
+  %3 = bitcast <8 x i8> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <8 x i8>
+  %6 = bitcast <8 x i8> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test9(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to x86_mmx
+  %3 = bitcast <8 x i8> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <8 x i8>
+  %6 = bitcast <8 x i8> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test8(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to x86_mmx
+  %3 = bitcast <4 x i16> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test7(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <8 x i8> [[TMP18]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP21]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i64 [[TMP14]] to <8 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP20]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <8 x i8>
+  %1 = bitcast <1 x i64> %a to <8 x i8>
+  %2 = bitcast <8 x i8> %1 to x86_mmx
+  %3 = bitcast <8 x i8> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <8 x i8>
+  %6 = bitcast <8 x i8> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test6(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to x86_mmx
+  %3 = bitcast <4 x i16> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test5(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to x86_mmx
+  %3 = bitcast <2 x i32> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <2 x i32>
+  %6 = bitcast <2 x i32> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test4(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to x86_mmx
+  %3 = bitcast <4 x i16> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test3(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <4 x i16>
+  %1 = bitcast <1 x i64> %a to <4 x i16>
+  %2 = bitcast <4 x i16> %1 to x86_mmx
+  %3 = bitcast <4 x i16> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  %6 = bitcast <4 x i16> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK-LABEL: define i64 @test2(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
+; CHECK-NEXT:    store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+entry:
+  %0 = bitcast <1 x i64> %b to <2 x i32>
+  %1 = bitcast <1 x i64> %a to <2 x i32>
+  %2 = bitcast <2 x i32> %1 to x86_mmx
+  %3 = bitcast <2 x i32> %0 to x86_mmx
+  %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %2, x86_mmx %3) nounwind readnone
+  %5 = bitcast x86_mmx %4 to <2 x i32>
+  %6 = bitcast <2 x i32> %5 to <1 x i64>
+  %7 = extractelement <1 x i64> %6, i32 0
+  ret i64 %7
+}
+
+define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind #0 {
+; ALL-LABEL: test89:
+; ALL:       # %bb.0:
+; ALL-NEXT:    cvtpi2ps %mm0, %xmm0
+; ALL-NEXT:    ret{{[l|q]}}
+; CHECK-LABEL: define <4 x float> @test89(
+; CHECK-SAME: <4 x float> [[A:%.*]], x86_mmx [[B:%.*]]) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[C:%.*]] = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> [[A]], x86_mmx [[B]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[C]]
+;
+  %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b)
+  ret <4 x float> %c
+}
+
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone
+
+define void @test90() #0 {
+; ALL-LABEL: test90:
+; ALL:       # %bb.0:
+; ALL-NEXT:    emms
+; ALL-NEXT:    ret{{[l|q]}}
+; CHECK-LABEL: define void @test90(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    call void @llvm.x86.mmx.emms()
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.mmx.emms()
+  ret void
+}
+
+declare void @llvm.x86.mmx.emms()
+
+define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind #0 {
+; CHECK-LABEL: define <1 x i64> @test_mm_insert_pi16(
+; CHECK-SAME: <1 x i64> [[A_COERCE:%.*]], i32 [[D:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i32 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK:       4:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       5:
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx [[TMP0]], i32 [[D]], i32 2)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to <1 x i64>
+; CHECK-NEXT:    store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <1 x i64> [[TMP2]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a.coerce to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx %0, i32 %d, i32 2)
+  %2 = bitcast x86_mmx %1 to <1 x i64>
+  ret <1 x i64> %2
+}
+
+declare x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx, i32, i32 immarg)
+
+define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind #0 {
+; CHECK-LABEL: define i32 @test_mm_extract_pi16(
+; CHECK-SAME: <1 x i64> [[A_COERCE:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK:       3:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx [[TMP0]], i32 2)
+; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+entry:
+  %0 = bitcast <1 x i64> %a.coerce to x86_mmx
+  %1 = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx %0, i32 2)
+  ret i32 %1
+}
+
+declare i32 @llvm.x86.mmx.pextr.w(x86_mmx, i32 immarg)
+
+attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vscale.ll b/llvm/test/Instrumentation/MemorySanitizer/vscale.ll
new file mode 100644
index 000000000000..e1a4a9b7aa68
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/vscale.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -S -msan-check-access-address=0 -passes="msan" 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test_load_store_i32(ptr %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @test_load_store_i32(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store <vscale x 4 x i32> [[_MSLD]], ptr [[TMP7]], align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP1]], ptr [[B]], align 16
+; CHECK-NEXT:    ret void
+;
+  %1 = load <vscale x 4 x i32>, ptr %a
+  store <vscale x 4 x i32> %1, ptr %b
+  ret void
+}
+
+define void @test_load_store_add_int(ptr %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @test_load_store_add_int(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 8 x i64>, ptr [[A]], align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <vscale x 8 x i64>, ptr [[TMP4]], align 64
+; CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 8 x i64>, ptr [[B]], align 64
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD1:%.*]] = load <vscale x 8 x i64>, ptr [[TMP8]], align 64
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <vscale x 8 x i64> [[_MSLD]], [[_MSLD1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 8 x i64> [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 87960930222080
+; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT:    store <vscale x 8 x i64> [[_MSLD1]], ptr [[TMP12]], align 64
+; CHECK-NEXT:    store <vscale x 8 x i64> [[TMP5]], ptr [[B]], align 64
+; CHECK-NEXT:    ret void
+;
+  %1 = load <vscale x 8 x i64>, ptr %a
+  %2 = load <vscale x 8 x i64>, ptr %b
+  %3 = add <vscale x 8 x i64> %1, %2
+  store <vscale x 8 x i64> %2, ptr %b
+  ret void
+}
+
+define void @test_load_store_float(ptr %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @test_load_store_float(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x float>, ptr [[A]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT:    store <vscale x 4 x i32> [[_MSLD]], ptr [[TMP7]], align 16
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP1]], ptr [[B]], align 16
+; CHECK-NEXT:    ret void
+;
+  %1 = load <vscale x 4 x float>, ptr %a
+  store <vscale x 4 x float> %1, ptr %b
+  ret void
+}
+
+define void @test_load_store_add_float(ptr %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @test_load_store_add_float(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x float>, ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 2 x float>, ptr [[B]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD1:%.*]] = load <vscale x 2 x i32>, ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <vscale x 2 x i32> [[_MSLD]], [[_MSLD1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <vscale x 2 x float> [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 87960930222080
+; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT:    store <vscale x 2 x i32> [[_MSLD1]], ptr [[TMP12]], align 8
+; CHECK-NEXT:    store <vscale x 2 x float> [[TMP5]], ptr [[B]], align 8
+; CHECK-NEXT:    ret void
+;
+  %1 = load <vscale x 2 x float>, ptr %a
+  %2 = load <vscale x 2 x float>, ptr %b
+  %3 = fadd <vscale x 2 x float> %1, %2
+  store <vscale x 2 x float> %2, ptr %b
+  ret void
+}
+
+define <vscale x 2 x float> @fn_ret(ptr %a) sanitize_memory {
+; CHECK-LABEL: define <vscale x 2 x float> @fn_ret(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x float>, ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    store <vscale x 2 x i32> [[_MSLD]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <vscale x 2 x float> [[TMP1]]
+;
+  %1 = load <vscale x 2 x float>, ptr %a
+  ret <vscale x 2 x float> %1
+}
+
+define void @test_ret(ptr %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @test_ret(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store i64 [[TMP1]], ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    store <vscale x 2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x float> @fn_ret(ptr [[A]])
+; CHECK-NEXT:    [[_MSRET:%.*]] = load <vscale x 2 x i32>, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    store <vscale x 2 x i32> [[_MSRET]], ptr [[TMP4]], align 8
+; CHECK-NEXT:    store <vscale x 2 x float> [[TMP5]], ptr [[B]], align 8
+; CHECK-NEXT:    ret void
+;
+  %1 = call <vscale x 2 x float> @fn_ret(ptr %a)
+  store <vscale x 2 x float> %1, ptr %b
+  ret void
+}
+
+define void @fn_param(<vscale x 2 x float> %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @fn_param(
+; CHECK-SAME: <vscale x 2 x float> [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
+; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT:    store <vscale x 2 x i32> zeroinitializer, ptr [[TMP3]], align 8
+; CHECK-NEXT:    store <vscale x 2 x float> [[A]], ptr [[B]], align 8
+; CHECK-NEXT:    ret void
+;
+  store <vscale x 2 x float> %a, ptr %b
+  ret void
+}
+
+define void @test_param(ptr %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @test_param(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x float>, ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    store i64 [[TMP1]], ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    call void @fn_param(<vscale x 2 x float> [[TMP2]], ptr [[B]])
+; CHECK-NEXT:    ret void
+;
+  %1 = load <vscale x 2 x float>, ptr %a
+  call void @fn_param(<vscale x 2 x float> %1, ptr %b)
+  ret void
+}
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/MC/AArch64/SME2/zip-diagnostics.s b/llvm/test/MC/AArch64/SME2/zip-diagnostics.s
index 6c80096ab486..fbe07a4fca5d 100644
--- a/llvm/test/MC/AArch64/SME2/zip-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2/zip-diagnostics.s
@@ -23,3 +23,12 @@ zip {z20.b-z23.b}, {z9.b-z12.b}
 // CHECK-NEXT: zip {z20.b-z23.b}, {z9.b-z12.b}
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 
+zip {z1.q-z2.q}, z0.q, z0.q
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types
+// CHECK-NEXT: zip {z1.q-z2.q}, z0.q, z0.q
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+zip {z1.q-z4.q}, z0.q, z0.q
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 4 consecutive SVE vectors, where the first vector is a multiple of 4 and with matching element types
+// CHECK-NEXT: zip {z1.q-z4.q}, z0.q, z0.q
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SME2p1/fadd-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fadd-diagnostics.s
index c13a1be05b1c..a18989880a34 100644
--- a/llvm/test/MC/AArch64/SME2p1/fadd-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fadd-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
 
 // --------------------------------------------------------------------------//
 // Out of range index offset
diff --git a/llvm/test/MC/AArch64/SME2p1/fadd.s b/llvm/test/MC/AArch64/SME2p1/fadd.s
index a8e64a63dbdb..bdb769093c83 100644
--- a/llvm/test/MC/AArch64/SME2p1/fadd.s
+++ b/llvm/test/MC/AArch64/SME2p1/fadd.s
@@ -1,300 +1,302 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fadd    za.h[w8, 0, vgx2], {z0.h, z1.h}  // 11000001-10100100-00011100-00000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x00,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41c00 <unknown>
 
 fadd    za.h[w8, 0], {z0.h - z1.h}  // 11000001-10100100-00011100-00000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x00,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41c00 <unknown>
 
 fadd    za.h[w10, 5, vgx2], {z10.h, z11.h}  // 11000001-10100100-01011101-01000101
 // CHECK-INST: fadd    za.h[w10, 5, vgx2], { z10.h, z11.h }
 // CHECK-ENCODING: [0x45,0x5d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a45d45 <unknown>
 
 fadd    za.h[w10, 5], {z10.h - z11.h}  // 11000001-10100100-01011101-01000101
 // CHECK-INST: fadd    za.h[w10, 5, vgx2], { z10.h, z11.h }
 // CHECK-ENCODING: [0x45,0x5d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a45d45 <unknown>
 
 fadd    za.h[w11, 7, vgx2], {z12.h, z13.h}  // 11000001-10100100-01111101-10000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x87,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a47d87 <unknown>
 
 fadd    za.h[w11, 7], {z12.h - z13.h}  // 11000001-10100100-01111101-10000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x87,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a47d87 <unknown>
 
 fadd    za.h[w11, 7, vgx2], {z30.h, z31.h}  // 11000001-10100100-01111111-11000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx2], { z30.h, z31.h }
 // CHECK-ENCODING: [0xc7,0x7f,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a47fc7 <unknown>
 
 fadd    za.h[w11, 7], {z30.h - z31.h}  // 11000001-10100100-01111111-11000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx2], { z30.h, z31.h }
 // CHECK-ENCODING: [0xc7,0x7f,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a47fc7 <unknown>
 
 fadd    za.h[w8, 5, vgx2], {z16.h, z17.h}  // 11000001-10100100-00011110-00000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx2], { z16.h, z17.h }
 // CHECK-ENCODING: [0x05,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41e05 <unknown>
 
 fadd    za.h[w8, 5], {z16.h - z17.h}  // 11000001-10100100-00011110-00000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx2], { z16.h, z17.h }
 // CHECK-ENCODING: [0x05,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41e05 <unknown>
 
 fadd    za.h[w8, 1, vgx2], {z0.h, z1.h}  // 11000001-10100100-00011100-00000001
 // CHECK-INST: fadd    za.h[w8, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x01,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41c01 <unknown>
 
 fadd    za.h[w8, 1], {z0.h - z1.h}  // 11000001-10100100-00011100-00000001
 // CHECK-INST: fadd    za.h[w8, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x01,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41c01 <unknown>
 
 fadd    za.h[w10, 0, vgx2], {z18.h, z19.h}  // 11000001-10100100-01011110, 01000000
 // CHECK-INST: fadd    za.h[w10, 0, vgx2], { z18.h, z19.h }
 // CHECK-ENCODING: [0x40,0x5e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a45e40 <unknown>
 
 fadd    za.h[w10, 0], {z18.h - z19.h}  // 11000001-10100100-01011110-01000000
 // CHECK-INST: fadd    za.h[w10, 0, vgx2], { z18.h, z19.h }
 // CHECK-ENCODING: [0x40,0x5e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a45e40 <unknown>
 
 fadd    za.h[w8, 0, vgx2], {z12.h, z13.h}  // 11000001-10100100-00011101-10000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x80,0x1d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41d80 <unknown>
 
 fadd    za.h[w8, 0], {z12.h - z13.h}  // 11000001-10100100-00011101-10000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x80,0x1d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41d80 <unknown>
 
 fadd    za.h[w10, 1, vgx2], {z0.h, z1.h}  // 11000001-10100100-01011100-00000001
 // CHECK-INST: fadd    za.h[w10, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x01,0x5c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a45c01 <unknown>
 
 fadd    za.h[w10, 1], {z0.h - z1.h}  // 11000001-10100100-01011100-00000001
 // CHECK-INST: fadd    za.h[w10, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x01,0x5c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a45c01 <unknown>
 
 fadd    za.h[w8, 5, vgx2], {z22.h, z23.h}  // 11000001-10100100-00011110, 11000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx2], { z22.h, z23.h }
 // CHECK-ENCODING: [0xc5,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41ec5 <unknown>
 
 fadd    za.h[w8, 5], {z22.h - z23.h}  // 11000001-10100100-00011110-11000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx2], { z22.h, z23.h }
 // CHECK-ENCODING: [0xc5,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41ec5 <unknown>
 
 fadd    za.h[w11, 2, vgx2], {z8.h, z9.h}  // 11000001-10100100-01111101-00000010
 // CHECK-INST: fadd    za.h[w11, 2, vgx2], { z8.h, z9.h }
 // CHECK-ENCODING: [0x02,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a47d02 <unknown>
 
 fadd    za.h[w11, 2], {z8.h - z9.h}  // 11000001-10100100-01111101-00000010
 // CHECK-INST: fadd    za.h[w11, 2, vgx2], { z8.h, z9.h }
 // CHECK-ENCODING: [0x02,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a47d02 <unknown>
 
 fadd    za.h[w9, 7, vgx2], {z12.h, z13.h}  // 11000001-10100100-00111101-10000111
 // CHECK-INST: fadd    za.h[w9, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x87,0x3d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a43d87 <unknown>
 
 fadd    za.h[w9, 7], {z12.h - z13.h}  // 11000001-10100100-00111101-10000111
 // CHECK-INST: fadd    za.h[w9, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x87,0x3d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a43d87 <unknown>
 
 fadd    za.h[w8, 0, vgx4], {z0.h - z3.h}  // 11000001-10100101-00011100-00000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x00,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51c00 <unknown>
 
 fadd    za.h[w8, 0], {z0.h - z3.h}  // 11000001-10100101-00011100-00000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x00,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51c00 <unknown>
 
 fadd    za.h[w10, 5, vgx4], {z8.h - z11.h}  // 11000001-10100101-01011101-00000101
 // CHECK-INST: fadd    za.h[w10, 5, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x05,0x5d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a55d05 <unknown>
 
 fadd    za.h[w10, 5], {z8.h - z11.h}  // 11000001-10100101-01011101-00000101
 // CHECK-INST: fadd    za.h[w10, 5, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x05,0x5d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a55d05 <unknown>
 
 fadd    za.h[w11, 7, vgx4], {z12.h - z15.h}  // 11000001-10100101-01111101-10000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x87,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a57d87 <unknown>
 
 fadd    za.h[w11, 7], {z12.h - z15.h}  // 11000001-10100101-01111101-10000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x87,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a57d87 <unknown>
 
 fadd    za.h[w11, 7, vgx4], {z28.h - z31.h}  // 11000001-10100101-01111111-10000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx4], { z28.h - z31.h }
 // CHECK-ENCODING: [0x87,0x7f,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a57f87 <unknown>
 
 fadd    za.h[w11, 7], {z28.h - z31.h}  // 11000001-10100101-01111111-10000111
 // CHECK-INST: fadd    za.h[w11, 7, vgx4], { z28.h - z31.h }
 // CHECK-ENCODING: [0x87,0x7f,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a57f87 <unknown>
 
 fadd    za.h[w8, 5, vgx4], {z16.h - z19.h}  // 11000001-10100101-00011110-00000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x05,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51e05 <unknown>
 
 fadd    za.h[w8, 5], {z16.h - z19.h}  // 11000001-10100101-00011110-00000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x05,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51e05 <unknown>
 
 fadd    za.h[w8, 1, vgx4], {z0.h - z3.h}  // 11000001-10100101-00011100-00000001
 // CHECK-INST: fadd    za.h[w8, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x01,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51c01 <unknown>
 
 fadd    za.h[w8, 1], {z0.h - z3.h}  // 11000001-10100101-00011100-00000001
 // CHECK-INST: fadd    za.h[w8, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x01,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51c01 <unknown>
 
 fadd    za.h[w10, 0, vgx4], {z16.h - z19.h}  // 11000001-10100101-01011110-00000000
 // CHECK-INST: fadd    za.h[w10, 0, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x00,0x5e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a55e00 <unknown>
 
 fadd    za.h[w10, 0], {z16.h - z19.h}  // 11000001-10100101-01011110-00000000
 // CHECK-INST: fadd    za.h[w10, 0, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x00,0x5e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a55e00 <unknown>
 
 fadd    za.h[w8, 0, vgx4], {z12.h - z15.h}  // 11000001-10100101-00011101-10000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x80,0x1d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51d80 <unknown>
 
 fadd    za.h[w8, 0], {z12.h - z15.h}  // 11000001-10100101-00011101-10000000
 // CHECK-INST: fadd    za.h[w8, 0, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x80,0x1d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51d80 <unknown>
 
 fadd    za.h[w10, 1, vgx4], {z0.h - z3.h}  // 11000001-10100101-01011100-00000001
 // CHECK-INST: fadd    za.h[w10, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x01,0x5c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a55c01 <unknown>
 
 fadd    za.h[w10, 1], {z0.h - z3.h}  // 11000001-10100101-01011100-00000001
 // CHECK-INST: fadd    za.h[w10, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x01,0x5c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a55c01 <unknown>
 
 fadd    za.h[w8, 5, vgx4], {z20.h - z23.h}  // 11000001-10100101-00011110-10000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx4], { z20.h - z23.h }
 // CHECK-ENCODING: [0x85,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51e85 <unknown>
 
 fadd    za.h[w8, 5], {z20.h - z23.h}  // 11000001-10100101-00011110-10000101
 // CHECK-INST: fadd    za.h[w8, 5, vgx4], { z20.h - z23.h }
 // CHECK-ENCODING: [0x85,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51e85 <unknown>
 
 fadd    za.h[w11, 2, vgx4], {z8.h - z11.h}  // 11000001-10100101-01111101-00000010
 // CHECK-INST: fadd    za.h[w11, 2, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x02,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a57d02 <unknown>
 
 fadd    za.h[w11, 2], {z8.h - z11.h}  // 11000001-10100101-01111101-00000010
 // CHECK-INST: fadd    za.h[w11, 2, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x02,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a57d02 <unknown>
 
 fadd    za.h[w9, 7, vgx4], {z12.h - z15.h}  // 11000001-10100101-00111101-10000111
 // CHECK-INST: fadd    za.h[w9, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x87,0x3d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a53d87 <unknown>
 
 fadd    za.h[w9, 7], {z12.h - z15.h}  // 11000001-10100101-00111101-10000111
 // CHECK-INST: fadd    za.h[w9, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x87,0x3d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a53d87 <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fcvt.s b/llvm/test/MC/AArch64/SME2p1/fcvt.s
index b5707bad0a24..2731055dedec 100644
--- a/llvm/test/MC/AArch64/SME2p1/fcvt.s
+++ b/llvm/test/MC/AArch64/SME2p1/fcvt.s
@@ -1,36 +1,36 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fcvt    {z0.s, z1.s}, z0.h  // 11000001-10100000-11100000-00000000
 // CHECK-INST: fcvt    { z0.s, z1.s }, z0.h
 // CHECK-ENCODING: [0x00,0xe0,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e000 <unknown>
 
 fcvt    {z20.s, z21.s}, z10.h  // 11000001-10100000-11100001-01010100
 // CHECK-INST: fcvt    { z20.s, z21.s }, z10.h
 // CHECK-ENCODING: [0x54,0xe1,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e154 <unknown>
 
 fcvt    {z22.s, z23.s}, z13.h  // 11000001-10100000-11100001-10110110
 // CHECK-INST: fcvt    { z22.s, z23.s }, z13.h
 // CHECK-ENCODING: [0xb6,0xe1,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e1b6 <unknown>
 
 fcvt    {z30.s, z31.s}, z31.h  // 11000001-10100000-11100011-11111110
 // CHECK-INST: fcvt    { z30.s, z31.s }, z31.h
 // CHECK-ENCODING: [0xfe,0xe3,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e3fe <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fcvtl-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fcvtl-diagnostics.s
index a723d2fc6f3a..ad3eaba7bdc2 100644
--- a/llvm/test/MC/AArch64/SME2p1/fcvtl-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fcvtl-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
 
 // --------------------------------------------------------------------------//
 // Invalid vector list
diff --git a/llvm/test/MC/AArch64/SME2p1/fcvtl.s b/llvm/test/MC/AArch64/SME2p1/fcvtl.s
index 31cf90d03796..6284915e4983 100644
--- a/llvm/test/MC/AArch64/SME2p1/fcvtl.s
+++ b/llvm/test/MC/AArch64/SME2p1/fcvtl.s
@@ -1,36 +1,36 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fcvtl   {z0.s, z1.s}, z0.h  // 11000001-10100000-11100000-00000001
 // CHECK-INST: fcvtl   { z0.s, z1.s }, z0.h
 // CHECK-ENCODING: [0x01,0xe0,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e001 <unknown>
 
 fcvtl   {z20.s, z21.s}, z10.h  // 11000001-10100000-11100001-01010101
 // CHECK-INST: fcvtl   { z20.s, z21.s }, z10.h
 // CHECK-ENCODING: [0x55,0xe1,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e155 <unknown>
 
 fcvtl   {z22.s, z23.s}, z13.h  // 11000001-10100000-11100001-10110111
 // CHECK-INST: fcvtl   { z22.s, z23.s }, z13.h
 // CHECK-ENCODING: [0xb7,0xe1,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e1b7 <unknown>
 
 fcvtl   {z30.s, z31.s}, z31.h  // 11000001-10100000-11100011-11111111
 // CHECK-INST: fcvtl   { z30.s, z31.s }, z31.h
 // CHECK-ENCODING: [0xff,0xe3,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0e3ff <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
index d32f795728a2..2f0dccb57c90 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
 
 // --------------------------------------------------------------------------//
 // Invalid vector list
diff --git a/llvm/test/MC/AArch64/SME2p1/fmla.s b/llvm/test/MC/AArch64/SME2p1/fmla.s
index 10529d81eed6..df9ac8076e56 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmla.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmla.s
@@ -1,877 +1,877 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fmla    za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h  // 11000001-00100000-00011100-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h
 // CHECK-ENCODING: [0x00,0x1c,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201c00 <unknown>
 
 fmla    za.h[w8, 0], {z0.h - z1.h}, z0.h  // 11000001-00100000-00011100-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h
 // CHECK-ENCODING: [0x00,0x1c,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201c00 <unknown>
 
 fmla    za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h  // 11000001-00100101-01011101-01000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h
 // CHECK-ENCODING: [0x45,0x5d,0x25,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1255d45 <unknown>
 
 fmla    za.h[w10, 5], {z10.h - z11.h}, z5.h  // 11000001-00100101-01011101-01000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h
 // CHECK-ENCODING: [0x45,0x5d,0x25,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1255d45 <unknown>
 
 fmla    za.h[w11, 7, vgx2], {z13.h, z14.h}, z8.h  // 11000001-00101000-01111101-10100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h
 // CHECK-ENCODING: [0xa7,0x7d,0x28,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1287da7 <unknown>
 
 fmla    za.h[w11, 7], {z13.h - z14.h}, z8.h  // 11000001-00101000-01111101-10100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h
 // CHECK-ENCODING: [0xa7,0x7d,0x28,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1287da7 <unknown>
 
 fmla    za.h[w11, 7, vgx2], {z31.h, z0.h}, z15.h  // 11000001-00101111-01111111-11100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h
 // CHECK-ENCODING: [0xe7,0x7f,0x2f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12f7fe7 <unknown>
 
 fmla    za.h[w11, 7], {z31.h - z0.h}, z15.h  // 11000001-00101111-01111111-11100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h
 // CHECK-ENCODING: [0xe7,0x7f,0x2f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12f7fe7 <unknown>
 
 fmla    za.h[w8, 5, vgx2], {z17.h, z18.h}, z0.h  // 11000001-00100000-00011110-00100101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h
 // CHECK-ENCODING: [0x25,0x1e,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201e25 <unknown>
 
 fmla    za.h[w8, 5], {z17.h - z18.h}, z0.h  // 11000001-00100000-00011110-00100101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h
 // CHECK-ENCODING: [0x25,0x1e,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201e25 <unknown>
 
 fmla    za.h[w8, 1, vgx2], {z1.h, z2.h}, z14.h  // 11000001-00101110-00011100-00100001
 // CHECK-INST: fmla    za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h
 // CHECK-ENCODING: [0x21,0x1c,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1c21 <unknown>
 
 fmla    za.h[w8, 1], {z1.h - z2.h}, z14.h  // 11000001-00101110-00011100-00100001
 // CHECK-INST: fmla    za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h
 // CHECK-ENCODING: [0x21,0x1c,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1c21 <unknown>
 
 fmla    za.h[w10, 0, vgx2], {z19.h, z20.h}, z4.h  // 11000001-00100100-01011110-01100000
 // CHECK-INST: fmla    za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h
 // CHECK-ENCODING: [0x60,0x5e,0x24,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1245e60 <unknown>
 
 fmla    za.h[w10, 0], {z19.h - z20.h}, z4.h  // 11000001-00100100-01011110-01100000
 // CHECK-INST: fmla    za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h
 // CHECK-ENCODING: [0x60,0x5e,0x24,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1245e60 <unknown>
 
 fmla    za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h  // 11000001-00100010-00011101-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h
 // CHECK-ENCODING: [0x80,0x1d,0x22,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1221d80 <unknown>
 
 fmla    za.h[w8, 0], {z12.h - z13.h}, z2.h  // 11000001-00100010-00011101-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h
 // CHECK-ENCODING: [0x80,0x1d,0x22,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1221d80 <unknown>
 
 fmla    za.h[w10, 1, vgx2], {z1.h, z2.h}, z10.h  // 11000001-00101010-01011100-00100001
 // CHECK-INST: fmla    za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h
 // CHECK-ENCODING: [0x21,0x5c,0x2a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12a5c21 <unknown>
 
 fmla    za.h[w10, 1], {z1.h - z2.h}, z10.h  // 11000001-00101010-01011100-00100001
 // CHECK-INST: fmla    za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h
 // CHECK-ENCODING: [0x21,0x5c,0x2a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12a5c21 <unknown>
 
 fmla    za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h  // 11000001-00101110-00011110-11000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h
 // CHECK-ENCODING: [0xc5,0x1e,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1ec5 <unknown>
 
 fmla    za.h[w8, 5], {z22.h - z23.h}, z14.h  // 11000001-00101110-00011110-11000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h
 // CHECK-ENCODING: [0xc5,0x1e,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1ec5 <unknown>
 
 fmla    za.h[w11, 2, vgx2], {z9.h, z10.h}, z1.h  // 11000001-00100001-01111101-00100010
 // CHECK-INST: fmla    za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h
 // CHECK-ENCODING: [0x22,0x7d,0x21,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1217d22 <unknown>
 
 fmla    za.h[w11, 2], {z9.h - z10.h}, z1.h  // 11000001-00100001-01111101-00100010
 // CHECK-INST: fmla    za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h
 // CHECK-ENCODING: [0x22,0x7d,0x21,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1217d22 <unknown>
 
 fmla    za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h  // 11000001-00101011-00111101-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h
 // CHECK-ENCODING: [0x87,0x3d,0x2b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12b3d87 <unknown>
 
 fmla    za.h[w9, 7], {z12.h - z13.h}, z11.h  // 11000001-00101011-00111101-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h
 // CHECK-ENCODING: [0x87,0x3d,0x2b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12b3d87 <unknown>
 
 fmla    za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h[0]  // 11000001-00010000-00010000-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
 // CHECK-ENCODING: [0x00,0x10,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101000 <unknown>
 
 fmla    za.h[w8, 0], {z0.h - z1.h}, z0.h[0]  // 11000001-00010000-00010000-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
 // CHECK-ENCODING: [0x00,0x10,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101000 <unknown>
 
 fmla    za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h[2]  // 11000001-00010101-01010101-01000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x45,0x55,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1155545 <unknown>
 
 fmla    za.h[w10, 5], {z10.h - z11.h}, z5.h[2]  // 11000001-00010101-01010101-01000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x45,0x55,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1155545 <unknown>
 
 fmla    za.h[w11, 7, vgx2], {z12.h, z13.h}, z8.h[6]  // 11000001-00011000-01111101-10000111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6]
 // CHECK-ENCODING: [0x87,0x7d,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1187d87 <unknown>
 
 fmla    za.h[w11, 7], {z12.h - z13.h}, z8.h[6]  // 11000001-00011000-01111101-10000111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6]
 // CHECK-ENCODING: [0x87,0x7d,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1187d87 <unknown>
 
 fmla    za.h[w11, 7, vgx2], {z30.h, z31.h}, z15.h[7]  // 11000001-00011111-01111111-11001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7]
 // CHECK-ENCODING: [0xcf,0x7f,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11f7fcf <unknown>
 
 fmla    za.h[w11, 7], {z30.h - z31.h}, z15.h[7]  // 11000001-00011111-01111111-11001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7]
 // CHECK-ENCODING: [0xcf,0x7f,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11f7fcf <unknown>
 
 fmla    za.h[w8, 5, vgx2], {z16.h, z17.h}, z0.h[6]  // 11000001-00010000-00011110-00000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6]
 // CHECK-ENCODING: [0x05,0x1e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101e05 <unknown>
 
 fmla    za.h[w8, 5], {z16.h - z17.h}, z0.h[6]  // 11000001-00010000-00011110-00000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6]
 // CHECK-ENCODING: [0x05,0x1e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101e05 <unknown>
 
 fmla    za.h[w8, 1, vgx2], {z0.h, z1.h}, z14.h[2]  // 11000001-00011110-00010100-00000001
 // CHECK-INST: fmla    za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2]
 // CHECK-ENCODING: [0x01,0x14,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1401 <unknown>
 
 fmla    za.h[w8, 1], {z0.h - z1.h}, z14.h[2]  // 11000001-00011110-00010100-00000001
 // CHECK-INST: fmla    za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2]
 // CHECK-ENCODING: [0x01,0x14,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1401 <unknown>
 
 fmla    za.h[w10, 0, vgx2], {z18.h, z19.h}, z4.h[3]  // 11000001-00010100-01010110-01001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x48,0x56,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1145648 <unknown>
 
 fmla    za.h[w10, 0], {z18.h - z19.h}, z4.h[3]  // 11000001-00010100-01010110-01001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x48,0x56,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1145648 <unknown>
 
 fmla    za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h[4]  // 11000001-00010010-00011001-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4]
 // CHECK-ENCODING: [0x80,0x19,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1121980 <unknown>
 
 fmla    za.h[w8, 0], {z12.h - z13.h}, z2.h[4]  // 11000001-00010010-00011001-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4]
 // CHECK-ENCODING: [0x80,0x19,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1121980 <unknown>
 
 fmla    za.h[w10, 1, vgx2], {z0.h, z1.h}, z10.h[4]  // 11000001-00011010-01011000-00000001
 // CHECK-INST: fmla    za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4]
 // CHECK-ENCODING: [0x01,0x58,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11a5801 <unknown>
 
 fmla    za.h[w10, 1], {z0.h - z1.h}, z10.h[4]  // 11000001-00011010-01011000-00000001
 // CHECK-INST: fmla    za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4]
 // CHECK-ENCODING: [0x01,0x58,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11a5801 <unknown>
 
 fmla    za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h[5]  // 11000001-00011110-00011010-11001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5]
 // CHECK-ENCODING: [0xcd,0x1a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1acd <unknown>
 
 fmla    za.h[w8, 5], {z22.h - z23.h}, z14.h[5]  // 11000001-00011110-00011010-11001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5]
 // CHECK-ENCODING: [0xcd,0x1a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1acd <unknown>
 
 fmla    za.h[w11, 2, vgx2], {z8.h, z9.h}, z1.h[2]  // 11000001-00010001-01110101-00000010
 // CHECK-INST: fmla    za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2]
 // CHECK-ENCODING: [0x02,0x75,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1117502 <unknown>
 
 fmla    za.h[w11, 2], {z8.h - z9.h}, z1.h[2]  // 11000001-00010001-01110101-00000010
 // CHECK-INST: fmla    za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2]
 // CHECK-ENCODING: [0x02,0x75,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1117502 <unknown>
 
 fmla    za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h[4]  // 11000001-00011011-00111001-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4]
 // CHECK-ENCODING: [0x87,0x39,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11b3987 <unknown>
 
 fmla    za.h[w9, 7], {z12.h - z13.h}, z11.h[4]  // 11000001-00011011-00111001-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4]
 // CHECK-ENCODING: [0x87,0x39,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11b3987 <unknown>
 
 fmla    za.h[w8, 0, vgx2], {z0.h, z1.h}, {z0.h, z1.h}  // 11000001-10100000-00010000-00001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x08,0x10,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a01008 <unknown>
 
 fmla    za.h[w8, 0], {z0.h - z1.h}, {z0.h - z1.h}  // 11000001-10100000-00010000-00001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x08,0x10,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a01008 <unknown>
 
 fmla    za.h[w10, 5, vgx2], {z10.h, z11.h}, {z20.h, z21.h}  // 11000001-10110100-01010001-01001101
 // CHECK-INST: fmla    za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x4d,0x51,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b4514d <unknown>
 
 fmla    za.h[w10, 5], {z10.h - z11.h}, {z20.h - z21.h}  // 11000001-10110100-01010001-01001101
 // CHECK-INST: fmla    za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x4d,0x51,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b4514d <unknown>
 
 fmla    za.h[w11, 7, vgx2], {z12.h, z13.h}, {z8.h, z9.h}  // 11000001-10101000-01110001-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h }
 // CHECK-ENCODING: [0x8f,0x71,0xa8,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a8718f <unknown>
 
 fmla    za.h[w11, 7], {z12.h - z13.h}, {z8.h - z9.h}  // 11000001-10101000-01110001-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h }
 // CHECK-ENCODING: [0x8f,0x71,0xa8,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a8718f <unknown>
 
 fmla    za.h[w11, 7, vgx2], {z30.h, z31.h}, {z30.h, z31.h}  // 11000001-10111110-01110011-11001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xcf,0x73,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be73cf <unknown>
 
 fmla    za.h[w11, 7], {z30.h - z31.h}, {z30.h - z31.h}  // 11000001-10111110-01110011-11001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xcf,0x73,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be73cf <unknown>
 
 fmla    za.h[w8, 5, vgx2], {z16.h, z17.h}, {z16.h, z17.h}  // 11000001-10110000-00010010-00001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h }
 // CHECK-ENCODING: [0x0d,0x12,0xb0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b0120d <unknown>
 
 fmla    za.h[w8, 5], {z16.h - z17.h}, {z16.h - z17.h}  // 11000001-10110000-00010010-00001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h }
 // CHECK-ENCODING: [0x0d,0x12,0xb0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b0120d <unknown>
 
 fmla    za.h[w8, 1, vgx2], {z0.h, z1.h}, {z30.h, z31.h}  // 11000001-10111110-00010000-00001001
 // CHECK-INST: fmla    za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0x09,0x10,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be1009 <unknown>
 
 fmla    za.h[w8, 1], {z0.h - z1.h}, {z30.h - z31.h}  // 11000001-10111110-00010000-00001001
 // CHECK-INST: fmla    za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0x09,0x10,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be1009 <unknown>
 
 fmla    za.h[w10, 0, vgx2], {z18.h, z19.h}, {z20.h, z21.h}  // 11000001-10110100-01010010-01001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x48,0x52,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b45248 <unknown>
 
 fmla    za.h[w10, 0], {z18.h - z19.h}, {z20.h - z21.h}  // 11000001-10110100-01010010-01001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x48,0x52,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b45248 <unknown>
 
 fmla    za.h[w8, 0, vgx2], {z12.h, z13.h}, {z2.h, z3.h}  // 11000001-10100010-00010001-10001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h }
 // CHECK-ENCODING: [0x88,0x11,0xa2,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a21188 <unknown>
 
 fmla    za.h[w8, 0], {z12.h - z13.h}, {z2.h - z3.h}  // 11000001-10100010-00010001-10001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h }
 // CHECK-ENCODING: [0x88,0x11,0xa2,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a21188 <unknown>
 
 fmla    za.h[w10, 1, vgx2], {z0.h, z1.h}, {z26.h, z27.h}  // 11000001-10111010-01010000-00001001
 // CHECK-INST: fmla    za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h }
 // CHECK-ENCODING: [0x09,0x50,0xba,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1ba5009 <unknown>
 
 fmla    za.h[w10, 1], {z0.h - z1.h}, {z26.h - z27.h}  // 11000001-10111010-01010000-00001001
 // CHECK-INST: fmla    za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h }
 // CHECK-ENCODING: [0x09,0x50,0xba,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1ba5009 <unknown>
 
 fmla    za.h[w8, 5, vgx2], {z22.h, z23.h}, {z30.h, z31.h}  // 11000001-10111110-00010010-11001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xcd,0x12,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be12cd <unknown>
 
 fmla    za.h[w8, 5], {z22.h - z23.h}, {z30.h - z31.h}  // 11000001-10111110-00010010-11001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xcd,0x12,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be12cd <unknown>
 
 fmla    za.h[w11, 2, vgx2], {z8.h, z9.h}, {z0.h, z1.h}  // 11000001-10100000-01110001-00001010
 // CHECK-INST: fmla    za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x0a,0x71,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0710a <unknown>
 
 fmla    za.h[w11, 2], {z8.h - z9.h}, {z0.h - z1.h}  // 11000001-10100000-01110001-00001010
 // CHECK-INST: fmla    za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x0a,0x71,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0710a <unknown>
 
 fmla    za.h[w9, 7, vgx2], {z12.h, z13.h}, {z10.h, z11.h}  // 11000001-10101010-00110001-10001111
 // CHECK-INST: fmla    za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h }
 // CHECK-ENCODING: [0x8f,0x31,0xaa,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1aa318f <unknown>
 
 fmla    za.h[w9, 7], {z12.h - z13.h}, {z10.h - z11.h}  // 11000001-10101010-00110001-10001111
 // CHECK-INST: fmla    za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h }
 // CHECK-ENCODING: [0x8f,0x31,0xaa,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1aa318f <unknown>
 
 
 fmla    za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h  // 11000001-00110000-00011100-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h
 // CHECK-ENCODING: [0x00,0x1c,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301c00 <unknown>
 
 fmla    za.h[w8, 0], {z0.h - z3.h}, z0.h  // 11000001-00110000-00011100-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h
 // CHECK-ENCODING: [0x00,0x1c,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301c00 <unknown>
 
 fmla    za.h[w10, 5, vgx4], {z10.h - z13.h}, z5.h  // 11000001-00110101-01011101-01000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h
 // CHECK-ENCODING: [0x45,0x5d,0x35,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1355d45 <unknown>
 
 fmla    za.h[w10, 5], {z10.h - z13.h}, z5.h  // 11000001-00110101-01011101-01000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h
 // CHECK-ENCODING: [0x45,0x5d,0x35,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1355d45 <unknown>
 
 fmla    za.h[w11, 7, vgx4], {z13.h - z16.h}, z8.h  // 11000001-00111000-01111101-10100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h
 // CHECK-ENCODING: [0xa7,0x7d,0x38,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1387da7 <unknown>
 
 fmla    za.h[w11, 7], {z13.h - z16.h}, z8.h  // 11000001-00111000-01111101-10100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h
 // CHECK-ENCODING: [0xa7,0x7d,0x38,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1387da7 <unknown>
 
 fmla    za.h[w11, 7, vgx4], {z31.h, z0.h, z1.h, z2.h}, z15.h  // 11000001-00111111-01111111-11100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h
 // CHECK-ENCODING: [0xe7,0x7f,0x3f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13f7fe7 <unknown>
 
 fmla    za.h[w11, 7], {z31.h, z0.h, z1.h, z2.h}, z15.h  // 11000001-00111111-01111111-11100111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h
 // CHECK-ENCODING: [0xe7,0x7f,0x3f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13f7fe7 <unknown>
 
 fmla    za.h[w8, 5, vgx4], {z17.h - z20.h}, z0.h  // 11000001-00110000-00011110-00100101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h
 // CHECK-ENCODING: [0x25,0x1e,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301e25 <unknown>
 
 fmla    za.h[w8, 5], {z17.h - z20.h}, z0.h  // 11000001-00110000-00011110-00100101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h
 // CHECK-ENCODING: [0x25,0x1e,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301e25 <unknown>
 
 fmla    za.h[w8, 1, vgx4], {z1.h - z4.h}, z14.h  // 11000001-00111110-00011100-00100001
 // CHECK-INST: fmla    za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h
 // CHECK-ENCODING: [0x21,0x1c,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1c21 <unknown>
 
 fmla    za.h[w8, 1], {z1.h - z4.h}, z14.h  // 11000001-00111110-00011100-00100001
 // CHECK-INST: fmla    za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h
 // CHECK-ENCODING: [0x21,0x1c,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1c21 <unknown>
 
 fmla    za.h[w10, 0, vgx4], {z19.h - z22.h}, z4.h  // 11000001-00110100-01011110-01100000
 // CHECK-INST: fmla    za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h
 // CHECK-ENCODING: [0x60,0x5e,0x34,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1345e60 <unknown>
 
 fmla    za.h[w10, 0], {z19.h - z22.h}, z4.h  // 11000001-00110100-01011110-01100000
 // CHECK-INST: fmla    za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h
 // CHECK-ENCODING: [0x60,0x5e,0x34,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1345e60 <unknown>
 
 fmla    za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h  // 11000001-00110010-00011101-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h
 // CHECK-ENCODING: [0x80,0x1d,0x32,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1321d80 <unknown>
 
 fmla    za.h[w8, 0], {z12.h - z15.h}, z2.h  // 11000001-00110010-00011101-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h
 // CHECK-ENCODING: [0x80,0x1d,0x32,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1321d80 <unknown>
 
 fmla    za.h[w10, 1, vgx4], {z1.h - z4.h}, z10.h  // 11000001-00111010-01011100-00100001
 // CHECK-INST: fmla    za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h
 // CHECK-ENCODING: [0x21,0x5c,0x3a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13a5c21 <unknown>
 
 fmla    za.h[w10, 1], {z1.h - z4.h}, z10.h  // 11000001-00111010-01011100-00100001
 // CHECK-INST: fmla    za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h
 // CHECK-ENCODING: [0x21,0x5c,0x3a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13a5c21 <unknown>
 
 fmla    za.h[w8, 5, vgx4], {z22.h - z25.h}, z14.h  // 11000001-00111110-00011110-11000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h
 // CHECK-ENCODING: [0xc5,0x1e,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1ec5 <unknown>
 
 fmla    za.h[w8, 5], {z22.h - z25.h}, z14.h  // 11000001-00111110-00011110-11000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h
 // CHECK-ENCODING: [0xc5,0x1e,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1ec5 <unknown>
 
 fmla    za.h[w11, 2, vgx4], {z9.h - z12.h}, z1.h  // 11000001-00110001-01111101-00100010
 // CHECK-INST: fmla    za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h
 // CHECK-ENCODING: [0x22,0x7d,0x31,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1317d22 <unknown>
 
 fmla    za.h[w11, 2], {z9.h - z12.h}, z1.h  // 11000001-00110001-01111101-00100010
 // CHECK-INST: fmla    za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h
 // CHECK-ENCODING: [0x22,0x7d,0x31,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1317d22 <unknown>
 
 fmla    za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h  // 11000001-00111011-00111101-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h
 // CHECK-ENCODING: [0x87,0x3d,0x3b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13b3d87 <unknown>
 
 fmla    za.h[w9, 7], {z12.h - z15.h}, z11.h  // 11000001-00111011-00111101-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h
 // CHECK-ENCODING: [0x87,0x3d,0x3b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13b3d87 <unknown>
 
 fmla    za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h[0]  // 11000001-00010000-10010000-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0]
 // CHECK-ENCODING: [0x00,0x90,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109000 <unknown>
 
 fmla    za.h[w8, 0], {z0.h - z3.h}, z0.h[0]  // 11000001-00010000-10010000-00000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0]
 // CHECK-ENCODING: [0x00,0x90,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109000 <unknown>
 
 fmla    za.h[w10, 5, vgx4], {z8.h - z11.h}, z5.h[2]  // 11000001-00010101-11010101-00000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x05,0xd5,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c115d505 <unknown>
 
 fmla    za.h[w10, 5], {z8.h - z11.h}, z5.h[2]  // 11000001-00010101-11010101-00000101
 // CHECK-INST: fmla    za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x05,0xd5,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c115d505 <unknown>
 
 fmla    za.h[w11, 7, vgx4], {z12.h - z15.h}, z8.h[6]  // 11000001-00011000-11111101-10000111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6]
 // CHECK-ENCODING: [0x87,0xfd,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c118fd87 <unknown>
 
 fmla    za.h[w11, 7], {z12.h - z15.h}, z8.h[6]  // 11000001-00011000-11111101-10000111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6]
 // CHECK-ENCODING: [0x87,0xfd,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c118fd87 <unknown>
 
 fmla    za.h[w11, 7, vgx4], {z28.h - z31.h}, z15.h[7]  // 11000001-00011111-11111111-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7]
 // CHECK-ENCODING: [0x8f,0xff,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11fff8f <unknown>
 
 fmla    za.h[w11, 7], {z28.h - z31.h}, z15.h[7]  // 11000001-00011111-11111111-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7]
 // CHECK-ENCODING: [0x8f,0xff,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11fff8f <unknown>
 
 fmla    za.h[w8, 5, vgx4], {z16.h - z19.h}, z0.h[6]  // 11000001-00010000-10011110-00000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6]
 // CHECK-ENCODING: [0x05,0x9e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109e05 <unknown>
 
 fmla    za.h[w8, 5], {z16.h - z19.h}, z0.h[6]  // 11000001-00010000-10011110-00000101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6]
 // CHECK-ENCODING: [0x05,0x9e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109e05 <unknown>
 
 fmla    za.h[w8, 1, vgx4], {z0.h - z3.h}, z14.h[2]  // 11000001-00011110-10010100-00000001
 // CHECK-INST: fmla    za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2]
 // CHECK-ENCODING: [0x01,0x94,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9401 <unknown>
 
 fmla    za.h[w8, 1], {z0.h - z3.h}, z14.h[2]  // 11000001-00011110-10010100-00000001
 // CHECK-INST: fmla    za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2]
 // CHECK-ENCODING: [0x01,0x94,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9401 <unknown>
 
 fmla    za.h[w10, 0, vgx4], {z16.h - z19.h}, z4.h[3]  // 11000001-00010100-11010110-00001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x08,0xd6,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c114d608 <unknown>
 
 fmla    za.h[w10, 0], {z16.h - z19.h}, z4.h[3]  // 11000001-00010100-11010110-00001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x08,0xd6,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c114d608 <unknown>
 
 fmla    za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h[4]  // 11000001-00010010-10011001-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4]
 // CHECK-ENCODING: [0x80,0x99,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1129980 <unknown>
 
 fmla    za.h[w8, 0], {z12.h - z15.h}, z2.h[4]  // 11000001-00010010-10011001-10000000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4]
 // CHECK-ENCODING: [0x80,0x99,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1129980 <unknown>
 
 fmla    za.h[w10, 1, vgx4], {z0.h - z3.h}, z10.h[4]  // 11000001-00011010-11011000-00000001
 // CHECK-INST: fmla    za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4]
 // CHECK-ENCODING: [0x01,0xd8,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11ad801 <unknown>
 
 fmla    za.h[w10, 1], {z0.h - z3.h}, z10.h[4]  // 11000001-00011010-11011000-00000001
 // CHECK-INST: fmla    za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4]
 // CHECK-ENCODING: [0x01,0xd8,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11ad801 <unknown>
 
 fmla    za.h[w8, 5, vgx4], {z20.h - z23.h}, z14.h[5]  // 11000001-00011110-10011010-10001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5]
 // CHECK-ENCODING: [0x8d,0x9a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9a8d <unknown>
 
 fmla    za.h[w8, 5], {z20.h - z23.h}, z14.h[5]  // 11000001-00011110-10011010-10001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5]
 // CHECK-ENCODING: [0x8d,0x9a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9a8d <unknown>
 
 fmla    za.h[w11, 2, vgx4], {z8.h - z11.h}, z1.h[2]  // 11000001-00010001-11110101-00000010
 // CHECK-INST: fmla    za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2]
 // CHECK-ENCODING: [0x02,0xf5,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c111f502 <unknown>
 
 fmla    za.h[w11, 2], {z8.h - z11.h}, z1.h[2]  // 11000001-00010001-11110101-00000010
 // CHECK-INST: fmla    za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2]
 // CHECK-ENCODING: [0x02,0xf5,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c111f502 <unknown>
 
 fmla    za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h[4]  // 11000001-00011011-10111001-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4]
 // CHECK-ENCODING: [0x87,0xb9,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11bb987 <unknown>
 
 fmla    za.h[w9, 7], {z12.h - z15.h}, z11.h[4]  // 11000001-00011011-10111001-10000111
 // CHECK-INST: fmla    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4]
 // CHECK-ENCODING: [0x87,0xb9,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11bb987 <unknown>
 
 fmla    za.h[w8, 0, vgx4], {z0.h - z3.h}, {z0.h - z3.h}  // 11000001-10100001-00010000-00001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x08,0x10,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11008 <unknown>
 
 fmla    za.h[w8, 0], {z0.h - z3.h}, {z0.h - z3.h}  // 11000001-10100001-00010000-00001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x08,0x10,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11008 <unknown>
 
 fmla    za.h[w10, 5, vgx4], {z8.h - z11.h}, {z20.h - z23.h}  // 11000001-10110101-01010001-00001101
 // CHECK-INST: fmla    za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x0d,0x51,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b5510d <unknown>
 
 fmla    za.h[w10, 5], {z8.h - z11.h}, {z20.h - z23.h}  // 11000001-10110101-01010001-00001101
 // CHECK-INST: fmla    za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x0d,0x51,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b5510d <unknown>
 
 fmla    za.h[w11, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-01110001-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x8f,0x71,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9718f <unknown>
 
 fmla    za.h[w11, 7], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-01110001-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x8f,0x71,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9718f <unknown>
 
 fmla    za.h[w11, 7, vgx4], {z28.h - z31.h}, {z28.h - z31.h}  // 11000001-10111101-01110011-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x8f,0x73,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd738f <unknown>
 
 fmla    za.h[w11, 7], {z28.h - z31.h}, {z28.h - z31.h}  // 11000001-10111101-01110011-10001111
 // CHECK-INST: fmla    za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x8f,0x73,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd738f <unknown>
 
 fmla    za.h[w8, 5, vgx4], {z16.h - z19.h}, {z16.h - z19.h}  // 11000001-10110001-00010010-00001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h }
 // CHECK-ENCODING: [0x0d,0x12,0xb1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b1120d <unknown>
 
 fmla    za.h[w8, 5], {z16.h - z19.h}, {z16.h - z19.h}  // 11000001-10110001-00010010-00001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h }
 // CHECK-ENCODING: [0x0d,0x12,0xb1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b1120d <unknown>
 
 fmla    za.h[w8, 1, vgx4], {z0.h - z3.h}, {z28.h - z31.h}  // 11000001-10111101-00010000-00001001
 // CHECK-INST: fmla    za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x09,0x10,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd1009 <unknown>
 
 fmla    za.h[w8, 1], {z0.h - z3.h}, {z28.h - z31.h}  // 11000001-10111101-00010000-00001001
 // CHECK-INST: fmla    za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x09,0x10,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd1009 <unknown>
 
 fmla    za.h[w10, 0, vgx4], {z16.h - z19.h}, {z20.h - z23.h}  // 11000001-10110101-01010010-00001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x08,0x52,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b55208 <unknown>
 
 fmla    za.h[w10, 0], {z16.h - z19.h}, {z20.h - z23.h}  // 11000001-10110101-01010010-00001000
 // CHECK-INST: fmla    za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x08,0x52,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b55208 <unknown>
 
 fmla    za.h[w8, 0, vgx4], {z12.h - z15.h}, {z0.h - z3.h}  // 11000001-10100001-00010001-10001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x88,0x11,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11188 <unknown>
 
 fmla    za.h[w8, 0], {z12.h - z15.h}, {z0.h - z3.h}  // 11000001-10100001-00010001-10001000
 // CHECK-INST: fmla    za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x88,0x11,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11188 <unknown>
 
 fmla    za.h[w10, 1, vgx4], {z0.h - z3.h}, {z24.h - z27.h}  // 11000001-10111001-01010000-00001001
 // CHECK-INST: fmla    za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h }
 // CHECK-ENCODING: [0x09,0x50,0xb9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b95009 <unknown>
 
 fmla    za.h[w10, 1], {z0.h - z3.h}, {z24.h - z27.h}  // 11000001-10111001-01010000-00001001
 // CHECK-INST: fmla    za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h }
 // CHECK-ENCODING: [0x09,0x50,0xb9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b95009 <unknown>
 
 fmla    za.h[w8, 5, vgx4], {z20.h - z23.h}, {z28.h - z31.h}  // 11000001-10111101-00010010-10001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x8d,0x12,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd128d <unknown>
 
 fmla    za.h[w8, 5], {z20.h - z23.h}, {z28.h - z31.h}  // 11000001-10111101-00010010-10001101
 // CHECK-INST: fmla    za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x8d,0x12,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd128d <unknown>
 
 fmla    za.h[w11, 2, vgx4], {z8.h - z11.h}, {z0.h - z3.h}  // 11000001-10100001-01110001-00001010
 // CHECK-INST: fmla    za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x0a,0x71,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a1710a <unknown>
 
 fmla    za.h[w11, 2], {z8.h - z11.h}, {z0.h - z3.h}  // 11000001-10100001-01110001-00001010
 // CHECK-INST: fmla    za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x0a,0x71,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a1710a <unknown>
 
 fmla    za.h[w9, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-00110001-10001111
 // CHECK-INST: fmla    za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x8f,0x31,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9318f <unknown>
 
 fmla    za.h[w9, 7], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-00110001-10001111
 // CHECK-INST: fmla    za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x8f,0x31,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9318f <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
index 2174e4202ba0..3ff09321e343 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
 
 // --------------------------------------------------------------------------//
 // Invalid vector list
diff --git a/llvm/test/MC/AArch64/SME2p1/fmls.s b/llvm/test/MC/AArch64/SME2p1/fmls.s
index 9bbb21869e37..67b1430240e8 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmls.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmls.s
@@ -1,878 +1,878 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d  --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fmls    za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h  // 11000001-00100000-00011100-00001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h
 // CHECK-ENCODING: [0x08,0x1c,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201c08 <unknown>
 
 fmls    za.h[w8, 0], {z0.h - z1.h}, z0.h  // 11000001-00100000-00011100-00001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h
 // CHECK-ENCODING: [0x08,0x1c,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201c08 <unknown>
 
 fmls    za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h  // 11000001-00100101-01011101-01001101
 // CHECK-INST: fmls    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h
 // CHECK-ENCODING: [0x4d,0x5d,0x25,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1255d4d <unknown>
 
 fmls    za.h[w10, 5], {z10.h - z11.h}, z5.h  // 11000001-00100101-01011101-01001101
 // CHECK-INST: fmls    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h
 // CHECK-ENCODING: [0x4d,0x5d,0x25,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1255d4d <unknown>
 
 fmls    za.h[w11, 7, vgx2], {z13.h, z14.h}, z8.h  // 11000001-00101000-01111101-10101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h
 // CHECK-ENCODING: [0xaf,0x7d,0x28,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1287daf <unknown>
 
 fmls    za.h[w11, 7], {z13.h - z14.h}, z8.h  // 11000001-00101000-01111101-10101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h
 // CHECK-ENCODING: [0xaf,0x7d,0x28,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1287daf <unknown>
 
 fmls    za.h[w11, 7, vgx2], {z31.h, z0.h}, z15.h  // 11000001-00101111-01111111-11101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h
 // CHECK-ENCODING: [0xef,0x7f,0x2f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12f7fef <unknown>
 
 fmls    za.h[w11, 7], {z31.h - z0.h}, z15.h  // 11000001-00101111-01111111-11101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h
 // CHECK-ENCODING: [0xef,0x7f,0x2f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12f7fef <unknown>
 
 fmls    za.h[w8, 5, vgx2], {z17.h, z18.h}, z0.h  // 11000001-00100000-00011110-00101101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h
 // CHECK-ENCODING: [0x2d,0x1e,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201e2d <unknown>
 
 fmls    za.h[w8, 5], {z17.h - z18.h}, z0.h  // 11000001-00100000-00011110-00101101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h
 // CHECK-ENCODING: [0x2d,0x1e,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1201e2d <unknown>
 
 fmls    za.h[w8, 1, vgx2], {z1.h, z2.h}, z14.h  // 11000001-00101110-00011100-00101001
 // CHECK-INST: fmls    za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h
 // CHECK-ENCODING: [0x29,0x1c,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1c29 <unknown>
 
 fmls    za.h[w8, 1], {z1.h - z2.h}, z14.h  // 11000001-00101110-00011100-00101001
 // CHECK-INST: fmls    za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h
 // CHECK-ENCODING: [0x29,0x1c,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1c29 <unknown>
 
 fmls    za.h[w10, 0, vgx2], {z19.h, z20.h}, z4.h  // 11000001-00100100-01011110-01101000
 // CHECK-INST: fmls    za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h
 // CHECK-ENCODING: [0x68,0x5e,0x24,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1245e68 <unknown>
 
 fmls    za.h[w10, 0], {z19.h - z20.h}, z4.h  // 11000001-00100100-01011110-01101000
 // CHECK-INST: fmls    za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h
 // CHECK-ENCODING: [0x68,0x5e,0x24,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1245e68 <unknown>
 
 fmls    za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h  // 11000001-00100010-00011101-10001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h
 // CHECK-ENCODING: [0x88,0x1d,0x22,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1221d88 <unknown>
 
 fmls    za.h[w8, 0], {z12.h - z13.h}, z2.h  // 11000001-00100010-00011101-10001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h
 // CHECK-ENCODING: [0x88,0x1d,0x22,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1221d88 <unknown>
 
 fmls    za.h[w10, 1, vgx2], {z1.h, z2.h}, z10.h  // 11000001-00101010-01011100-00101001
 // CHECK-INST: fmls    za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h
 // CHECK-ENCODING: [0x29,0x5c,0x2a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12a5c29 <unknown>
 
 fmls    za.h[w10, 1], {z1.h - z2.h}, z10.h  // 11000001-00101010-01011100-00101001
 // CHECK-INST: fmls    za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h
 // CHECK-ENCODING: [0x29,0x5c,0x2a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12a5c29 <unknown>
 
 fmls    za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h  // 11000001-00101110-00011110-11001101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h
 // CHECK-ENCODING: [0xcd,0x1e,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1ecd <unknown>
 
 fmls    za.h[w8, 5], {z22.h - z23.h}, z14.h  // 11000001-00101110-00011110-11001101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h
 // CHECK-ENCODING: [0xcd,0x1e,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12e1ecd <unknown>
 
 fmls    za.h[w11, 2, vgx2], {z9.h, z10.h}, z1.h  // 11000001-00100001-01111101-00101010
 // CHECK-INST: fmls    za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h
 // CHECK-ENCODING: [0x2a,0x7d,0x21,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1217d2a <unknown>
 
 fmls    za.h[w11, 2], {z9.h - z10.h}, z1.h  // 11000001-00100001-01111101-00101010
 // CHECK-INST: fmls    za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h
 // CHECK-ENCODING: [0x2a,0x7d,0x21,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1217d2a <unknown>
 
 fmls    za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h  // 11000001-00101011-00111101-10001111
 // CHECK-INST: fmls    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h
 // CHECK-ENCODING: [0x8f,0x3d,0x2b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12b3d8f <unknown>
 
 fmls    za.h[w9, 7], {z12.h - z13.h}, z11.h  // 11000001-00101011-00111101-10001111
 // CHECK-INST: fmls    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h
 // CHECK-ENCODING: [0x8f,0x3d,0x2b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c12b3d8f <unknown>
 
 
 fmls    za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h[0]  // 11000001-00010000-00010000-00010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
 // CHECK-ENCODING: [0x10,0x10,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101010 <unknown>
 
 fmls    za.h[w8, 0], {z0.h - z1.h}, z0.h[0]  // 11000001-00010000-00010000-00010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
 // CHECK-ENCODING: [0x10,0x10,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101010 <unknown>
 
 fmls    za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h[2]  // 11000001-00010101-01010101-01010101
 // CHECK-INST: fmls    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x55,0x55,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1155555 <unknown>
 
 fmls    za.h[w10, 5], {z10.h - z11.h}, z5.h[2]  // 11000001-00010101-01010101-01010101
 // CHECK-INST: fmls    za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x55,0x55,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1155555 <unknown>
 
 fmls    za.h[w11, 7, vgx2], {z12.h, z13.h}, z8.h[6]  // 11000001-00011000-01111101-10010111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6]
 // CHECK-ENCODING: [0x97,0x7d,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1187d97 <unknown>
 
 fmls    za.h[w11, 7], {z12.h - z13.h}, z8.h[6]  // 11000001-00011000-01111101-10010111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6]
 // CHECK-ENCODING: [0x97,0x7d,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1187d97 <unknown>
 
 fmls    za.h[w11, 7, vgx2], {z30.h, z31.h}, z15.h[7]  // 11000001-00011111-01111111-11011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7]
 // CHECK-ENCODING: [0xdf,0x7f,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11f7fdf <unknown>
 
 fmls    za.h[w11, 7], {z30.h - z31.h}, z15.h[7]  // 11000001-00011111-01111111-11011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7]
 // CHECK-ENCODING: [0xdf,0x7f,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11f7fdf <unknown>
 
 fmls    za.h[w8, 5, vgx2], {z16.h, z17.h}, z0.h[6]  // 11000001-00010000-00011110-00010101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6]
 // CHECK-ENCODING: [0x15,0x1e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101e15 <unknown>
 
 fmls    za.h[w8, 5], {z16.h - z17.h}, z0.h[6]  // 11000001-00010000-00011110-00010101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6]
 // CHECK-ENCODING: [0x15,0x1e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1101e15 <unknown>
 
 fmls    za.h[w8, 1, vgx2], {z0.h, z1.h}, z14.h[2]  // 11000001-00011110-00010100-00010001
 // CHECK-INST: fmls    za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2]
 // CHECK-ENCODING: [0x11,0x14,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1411 <unknown>
 
 fmls    za.h[w8, 1], {z0.h - z1.h}, z14.h[2]  // 11000001-00011110-00010100-00010001
 // CHECK-INST: fmls    za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2]
 // CHECK-ENCODING: [0x11,0x14,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1411 <unknown>
 
 fmls    za.h[w10, 0, vgx2], {z18.h, z19.h}, z4.h[3]  // 11000001-00010100-01010110-01011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x58,0x56,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1145658 <unknown>
 
 fmls    za.h[w10, 0], {z18.h - z19.h}, z4.h[3]  // 11000001-00010100-01010110-01011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x58,0x56,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1145658 <unknown>
 
 fmls    za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h[4]  // 11000001-00010010-00011001-10010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4]
 // CHECK-ENCODING: [0x90,0x19,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1121990 <unknown>
 
 fmls    za.h[w8, 0], {z12.h - z13.h}, z2.h[4]  // 11000001-00010010-00011001-10010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4]
 // CHECK-ENCODING: [0x90,0x19,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1121990 <unknown>
 
 fmls    za.h[w10, 1, vgx2], {z0.h, z1.h}, z10.h[4]  // 11000001-00011010-01011000-00010001
 // CHECK-INST: fmls    za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4]
 // CHECK-ENCODING: [0x11,0x58,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11a5811 <unknown>
 
 fmls    za.h[w10, 1], {z0.h - z1.h}, z10.h[4]  // 11000001-00011010-01011000-00010001
 // CHECK-INST: fmls    za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4]
 // CHECK-ENCODING: [0x11,0x58,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11a5811 <unknown>
 
 fmls    za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h[5]  // 11000001-00011110-00011010-11011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5]
 // CHECK-ENCODING: [0xdd,0x1a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1add <unknown>
 
 fmls    za.h[w8, 5], {z22.h - z23.h}, z14.h[5]  // 11000001-00011110-00011010-11011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5]
 // CHECK-ENCODING: [0xdd,0x1a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e1add <unknown>
 
 fmls    za.h[w11, 2, vgx2], {z8.h, z9.h}, z1.h[2]  // 11000001-00010001-01110101-00010010
 // CHECK-INST: fmls    za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2]
 // CHECK-ENCODING: [0x12,0x75,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1117512 <unknown>
 
 fmls    za.h[w11, 2], {z8.h - z9.h}, z1.h[2]  // 11000001-00010001-01110101-00010010
 // CHECK-INST: fmls    za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2]
 // CHECK-ENCODING: [0x12,0x75,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1117512 <unknown>
 
 fmls    za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h[4]  // 11000001-00011011-00111001-10010111
 // CHECK-INST: fmls    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4]
 // CHECK-ENCODING: [0x97,0x39,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11b3997 <unknown>
 
 fmls    za.h[w9, 7], {z12.h - z13.h}, z11.h[4]  // 11000001-00011011-00111001-10010111
 // CHECK-INST: fmls    za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4]
 // CHECK-ENCODING: [0x97,0x39,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11b3997 <unknown>
 
 
 fmls    za.h[w8, 0, vgx2], {z0.h, z1.h}, {z0.h, z1.h}  // 11000001-10100000-00010000-00011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x18,0x10,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a01018 <unknown>
 
 fmls    za.h[w8, 0], {z0.h - z1.h}, {z0.h - z1.h}  // 11000001-10100000-00010000-00011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x18,0x10,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a01018 <unknown>
 
 fmls    za.h[w10, 5, vgx2], {z10.h, z11.h}, {z20.h, z21.h}  // 11000001-10110100-01010001-01011101
 // CHECK-INST: fmls    za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x5d,0x51,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b4515d <unknown>
 
 fmls    za.h[w10, 5], {z10.h - z11.h}, {z20.h - z21.h}  // 11000001-10110100-01010001-01011101
 // CHECK-INST: fmls    za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x5d,0x51,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b4515d <unknown>
 
 fmls    za.h[w11, 7, vgx2], {z12.h, z13.h}, {z8.h, z9.h}  // 11000001-10101000-01110001-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h }
 // CHECK-ENCODING: [0x9f,0x71,0xa8,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a8719f <unknown>
 
 fmls    za.h[w11, 7], {z12.h - z13.h}, {z8.h - z9.h}  // 11000001-10101000-01110001-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h }
 // CHECK-ENCODING: [0x9f,0x71,0xa8,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a8719f <unknown>
 
 fmls    za.h[w11, 7, vgx2], {z30.h, z31.h}, {z30.h, z31.h}  // 11000001-10111110-01110011-11011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xdf,0x73,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be73df <unknown>
 
 fmls    za.h[w11, 7], {z30.h - z31.h}, {z30.h - z31.h}  // 11000001-10111110-01110011-11011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xdf,0x73,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be73df <unknown>
 
 fmls    za.h[w8, 5, vgx2], {z16.h, z17.h}, {z16.h, z17.h}  // 11000001-10110000-00010010-00011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h }
 // CHECK-ENCODING: [0x1d,0x12,0xb0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b0121d <unknown>
 
 fmls    za.h[w8, 5], {z16.h - z17.h}, {z16.h - z17.h}  // 11000001-10110000-00010010-00011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h }
 // CHECK-ENCODING: [0x1d,0x12,0xb0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b0121d <unknown>
 
 fmls    za.h[w8, 1, vgx2], {z0.h, z1.h}, {z30.h, z31.h}  // 11000001-10111110-00010000-00011001
 // CHECK-INST: fmls    za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0x19,0x10,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be1019 <unknown>
 
 fmls    za.h[w8, 1], {z0.h - z1.h}, {z30.h - z31.h}  // 11000001-10111110-00010000-00011001
 // CHECK-INST: fmls    za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0x19,0x10,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be1019 <unknown>
 
 fmls    za.h[w10, 0, vgx2], {z18.h, z19.h}, {z20.h, z21.h}  // 11000001-10110100-01010010-01011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x58,0x52,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b45258 <unknown>
 
 fmls    za.h[w10, 0], {z18.h - z19.h}, {z20.h - z21.h}  // 11000001-10110100-01010010-01011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h }
 // CHECK-ENCODING: [0x58,0x52,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b45258 <unknown>
 
 fmls    za.h[w8, 0, vgx2], {z12.h, z13.h}, {z2.h, z3.h}  // 11000001-10100010-00010001-10011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h }
 // CHECK-ENCODING: [0x98,0x11,0xa2,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a21198 <unknown>
 
 fmls    za.h[w8, 0], {z12.h - z13.h}, {z2.h - z3.h}  // 11000001-10100010-00010001-10011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h }
 // CHECK-ENCODING: [0x98,0x11,0xa2,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a21198 <unknown>
 
 fmls    za.h[w10, 1, vgx2], {z0.h, z1.h}, {z26.h, z27.h}  // 11000001-10111010-01010000-00011001
 // CHECK-INST: fmls    za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h }
 // CHECK-ENCODING: [0x19,0x50,0xba,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1ba5019 <unknown>
 
 fmls    za.h[w10, 1], {z0.h - z1.h}, {z26.h - z27.h}  // 11000001-10111010-01010000-00011001
 // CHECK-INST: fmls    za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h }
 // CHECK-ENCODING: [0x19,0x50,0xba,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1ba5019 <unknown>
 
 fmls    za.h[w8, 5, vgx2], {z22.h, z23.h}, {z30.h, z31.h}  // 11000001-10111110-00010010-11011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xdd,0x12,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be12dd <unknown>
 
 fmls    za.h[w8, 5], {z22.h - z23.h}, {z30.h - z31.h}  // 11000001-10111110-00010010-11011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h }
 // CHECK-ENCODING: [0xdd,0x12,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1be12dd <unknown>
 
 fmls    za.h[w11, 2, vgx2], {z8.h, z9.h}, {z0.h, z1.h}  // 11000001-10100000-01110001-00011010
 // CHECK-INST: fmls    za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x1a,0x71,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0711a <unknown>
 
 fmls    za.h[w11, 2], {z8.h - z9.h}, {z0.h - z1.h}  // 11000001-10100000-01110001-00011010
 // CHECK-INST: fmls    za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h }
 // CHECK-ENCODING: [0x1a,0x71,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a0711a <unknown>
 
 fmls    za.h[w9, 7, vgx2], {z12.h, z13.h}, {z10.h, z11.h}  // 11000001-10101010-00110001-10011111
 // CHECK-INST: fmls    za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h }
 // CHECK-ENCODING: [0x9f,0x31,0xaa,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1aa319f <unknown>
 
 fmls    za.h[w9, 7], {z12.h - z13.h}, {z10.h - z11.h}  // 11000001-10101010-00110001-10011111
 // CHECK-INST: fmls    za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h }
 // CHECK-ENCODING: [0x9f,0x31,0xaa,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1aa319f <unknown>
 
 fmls    za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h  // 11000001-00110000-00011100-00001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h
 // CHECK-ENCODING: [0x08,0x1c,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301c08 <unknown>
 
 fmls    za.h[w8, 0], {z0.h - z3.h}, z0.h  // 11000001-00110000-00011100-00001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h
 // CHECK-ENCODING: [0x08,0x1c,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301c08 <unknown>
 
 fmls    za.h[w10, 5, vgx4], {z10.h - z13.h}, z5.h  // 11000001-00110101-01011101-01001101
 // CHECK-INST: fmls    za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h
 // CHECK-ENCODING: [0x4d,0x5d,0x35,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1355d4d <unknown>
 
 fmls    za.h[w10, 5], {z10.h - z13.h}, z5.h  // 11000001-00110101-01011101-01001101
 // CHECK-INST: fmls    za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h
 // CHECK-ENCODING: [0x4d,0x5d,0x35,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1355d4d <unknown>
 
 fmls    za.h[w11, 7, vgx4], {z13.h - z16.h}, z8.h  // 11000001-00111000-01111101-10101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h
 // CHECK-ENCODING: [0xaf,0x7d,0x38,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1387daf <unknown>
 
 fmls    za.h[w11, 7], {z13.h - z16.h}, z8.h  // 11000001-00111000-01111101-10101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h
 // CHECK-ENCODING: [0xaf,0x7d,0x38,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1387daf <unknown>
 
 fmls    za.h[w11, 7, vgx4], {z31.h, z0.h, z1.h, z2.h}, z15.h  // 11000001-00111111-01111111-11101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h
 // CHECK-ENCODING: [0xef,0x7f,0x3f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13f7fef <unknown>
 
 fmls    za.h[w11, 7], {z31.h, z0.h, z1.h, z2.h}, z15.h  // 11000001-00111111-01111111-11101111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h
 // CHECK-ENCODING: [0xef,0x7f,0x3f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13f7fef <unknown>
 
 fmls    za.h[w8, 5, vgx4], {z17.h - z20.h}, z0.h  // 11000001-00110000-00011110-00101101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h
 // CHECK-ENCODING: [0x2d,0x1e,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301e2d <unknown>
 
 fmls    za.h[w8, 5], {z17.h - z20.h}, z0.h  // 11000001-00110000-00011110-00101101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h
 // CHECK-ENCODING: [0x2d,0x1e,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1301e2d <unknown>
 
 fmls    za.h[w8, 1, vgx4], {z1.h - z4.h}, z14.h  // 11000001-00111110-00011100-00101001
 // CHECK-INST: fmls    za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h
 // CHECK-ENCODING: [0x29,0x1c,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1c29 <unknown>
 
 fmls    za.h[w8, 1], {z1.h - z4.h}, z14.h  // 11000001-00111110-00011100-00101001
 // CHECK-INST: fmls    za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h
 // CHECK-ENCODING: [0x29,0x1c,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1c29 <unknown>
 
 fmls    za.h[w10, 0, vgx4], {z19.h - z22.h}, z4.h  // 11000001-00110100-01011110-01101000
 // CHECK-INST: fmls    za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h
 // CHECK-ENCODING: [0x68,0x5e,0x34,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1345e68 <unknown>
 
 fmls    za.h[w10, 0], {z19.h - z22.h}, z4.h  // 11000001-00110100-01011110-01101000
 // CHECK-INST: fmls    za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h
 // CHECK-ENCODING: [0x68,0x5e,0x34,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1345e68 <unknown>
 
 fmls    za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h  // 11000001-00110010-00011101-10001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h
 // CHECK-ENCODING: [0x88,0x1d,0x32,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1321d88 <unknown>
 
 fmls    za.h[w8, 0], {z12.h - z15.h}, z2.h  // 11000001-00110010-00011101-10001000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h
 // CHECK-ENCODING: [0x88,0x1d,0x32,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1321d88 <unknown>
 
 fmls    za.h[w10, 1, vgx4], {z1.h - z4.h}, z10.h  // 11000001-00111010-01011100-00101001
 // CHECK-INST: fmls    za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h
 // CHECK-ENCODING: [0x29,0x5c,0x3a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13a5c29 <unknown>
 
 fmls    za.h[w10, 1], {z1.h - z4.h}, z10.h  // 11000001-00111010-01011100-00101001
 // CHECK-INST: fmls    za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h
 // CHECK-ENCODING: [0x29,0x5c,0x3a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13a5c29 <unknown>
 
 fmls    za.h[w8, 5, vgx4], {z22.h - z25.h}, z14.h  // 11000001-00111110-00011110-11001101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h
 // CHECK-ENCODING: [0xcd,0x1e,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1ecd <unknown>
 
 fmls    za.h[w8, 5], {z22.h - z25.h}, z14.h  // 11000001-00111110-00011110-11001101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h
 // CHECK-ENCODING: [0xcd,0x1e,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13e1ecd <unknown>
 
 fmls    za.h[w11, 2, vgx4], {z9.h - z12.h}, z1.h  // 11000001-00110001-01111101-00101010
 // CHECK-INST: fmls    za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h
 // CHECK-ENCODING: [0x2a,0x7d,0x31,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1317d2a <unknown>
 
 fmls    za.h[w11, 2], {z9.h - z12.h}, z1.h  // 11000001-00110001-01111101-00101010
 // CHECK-INST: fmls    za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h
 // CHECK-ENCODING: [0x2a,0x7d,0x31,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1317d2a <unknown>
 
 fmls    za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h  // 11000001-00111011-00111101-10001111
 // CHECK-INST: fmls    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h
 // CHECK-ENCODING: [0x8f,0x3d,0x3b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13b3d8f <unknown>
 
 fmls    za.h[w9, 7], {z12.h - z15.h}, z11.h  // 11000001-00111011-00111101-10001111
 // CHECK-INST: fmls    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h
 // CHECK-ENCODING: [0x8f,0x3d,0x3b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c13b3d8f <unknown>
 
 fmls    za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h[0]  // 11000001-00010000-10010000-00010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0]
 // CHECK-ENCODING: [0x10,0x90,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109010 <unknown>
 
 fmls    za.h[w8, 0], {z0.h - z3.h}, z0.h[0]  // 11000001-00010000-10010000-00010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0]
 // CHECK-ENCODING: [0x10,0x90,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109010 <unknown>
 
 fmls    za.h[w10, 5, vgx4], {z8.h - z11.h}, z5.h[2]  // 11000001-00010101-11010101-00010101
 // CHECK-INST: fmls    za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x15,0xd5,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c115d515 <unknown>
 
 fmls    za.h[w10, 5], {z8.h - z11.h}, z5.h[2]  // 11000001-00010101-11010101-00010101
 // CHECK-INST: fmls    za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2]
 // CHECK-ENCODING: [0x15,0xd5,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c115d515 <unknown>
 
 fmls    za.h[w11, 7, vgx4], {z12.h - z15.h}, z8.h[6]  // 11000001-00011000-11111101-10010111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6]
 // CHECK-ENCODING: [0x97,0xfd,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c118fd97 <unknown>
 
 fmls    za.h[w11, 7], {z12.h - z15.h}, z8.h[6]  // 11000001-00011000-11111101-10010111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6]
 // CHECK-ENCODING: [0x97,0xfd,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c118fd97 <unknown>
 
 fmls    za.h[w11, 7, vgx4], {z28.h - z31.h}, z15.h[7]  // 11000001-00011111-11111111-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7]
 // CHECK-ENCODING: [0x9f,0xff,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11fff9f <unknown>
 
 fmls    za.h[w11, 7], {z28.h - z31.h}, z15.h[7]  // 11000001-00011111-11111111-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7]
 // CHECK-ENCODING: [0x9f,0xff,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11fff9f <unknown>
 
 fmls    za.h[w8, 5, vgx4], {z16.h - z19.h}, z0.h[6]  // 11000001-00010000-10011110-00010101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6]
 // CHECK-ENCODING: [0x15,0x9e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109e15 <unknown>
 
 fmls    za.h[w8, 5], {z16.h - z19.h}, z0.h[6]  // 11000001-00010000-10011110-00010101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6]
 // CHECK-ENCODING: [0x15,0x9e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1109e15 <unknown>
 
 fmls    za.h[w8, 1, vgx4], {z0.h - z3.h}, z14.h[2]  // 11000001-00011110-10010100-00010001
 // CHECK-INST: fmls    za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2]
 // CHECK-ENCODING: [0x11,0x94,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9411 <unknown>
 
 fmls    za.h[w8, 1], {z0.h - z3.h}, z14.h[2]  // 11000001-00011110-10010100-00010001
 // CHECK-INST: fmls    za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2]
 // CHECK-ENCODING: [0x11,0x94,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9411 <unknown>
 
 fmls    za.h[w10, 0, vgx4], {z16.h - z19.h}, z4.h[3]  // 11000001-00010100-11010110-00011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x18,0xd6,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c114d618 <unknown>
 
 fmls    za.h[w10, 0], {z16.h - z19.h}, z4.h[3]  // 11000001-00010100-11010110-00011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3]
 // CHECK-ENCODING: [0x18,0xd6,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c114d618 <unknown>
 
 fmls    za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h[4]  // 11000001-00010010-10011001-10010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4]
 // CHECK-ENCODING: [0x90,0x99,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1129990 <unknown>
 
 fmls    za.h[w8, 0], {z12.h - z15.h}, z2.h[4]  // 11000001-00010010-10011001-10010000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4]
 // CHECK-ENCODING: [0x90,0x99,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1129990 <unknown>
 
 fmls    za.h[w10, 1, vgx4], {z0.h - z3.h}, z10.h[4]  // 11000001-00011010-11011000-00010001
 // CHECK-INST: fmls    za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4]
 // CHECK-ENCODING: [0x11,0xd8,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11ad811 <unknown>
 
 fmls    za.h[w10, 1], {z0.h - z3.h}, z10.h[4]  // 11000001-00011010-11011000-00010001
 // CHECK-INST: fmls    za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4]
 // CHECK-ENCODING: [0x11,0xd8,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11ad811 <unknown>
 
 fmls    za.h[w8, 5, vgx4], {z20.h - z23.h}, z14.h[5]  // 11000001-00011110-10011010-10011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5]
 // CHECK-ENCODING: [0x9d,0x9a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9a9d <unknown>
 
 fmls    za.h[w8, 5], {z20.h - z23.h}, z14.h[5]  // 11000001-00011110-10011010-10011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5]
 // CHECK-ENCODING: [0x9d,0x9a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11e9a9d <unknown>
 
 fmls    za.h[w11, 2, vgx4], {z8.h - z11.h}, z1.h[2]  // 11000001-00010001-11110101-00010010
 // CHECK-INST: fmls    za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2]
 // CHECK-ENCODING: [0x12,0xf5,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c111f512 <unknown>
 
 fmls    za.h[w11, 2], {z8.h - z11.h}, z1.h[2]  // 11000001-00010001-11110101-00010010
 // CHECK-INST: fmls    za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2]
 // CHECK-ENCODING: [0x12,0xf5,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c111f512 <unknown>
 
 fmls    za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h[4]  // 11000001-00011011-10111001-10010111
 // CHECK-INST: fmls    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4]
 // CHECK-ENCODING: [0x97,0xb9,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11bb997 <unknown>
 
 fmls    za.h[w9, 7], {z12.h - z15.h}, z11.h[4]  // 11000001-00011011-10111001-10010111
 // CHECK-INST: fmls    za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4]
 // CHECK-ENCODING: [0x97,0xb9,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c11bb997 <unknown>
 
 fmls    za.h[w8, 0, vgx4], {z0.h - z3.h}, {z0.h - z3.h}  // 11000001-10100001-00010000-00011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x18,0x10,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11018 <unknown>
 
 fmls    za.h[w8, 0], {z0.h - z3.h}, {z0.h - z3.h}  // 11000001-10100001-00010000-00011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x18,0x10,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11018 <unknown>
 
 fmls    za.h[w10, 5, vgx4], {z8.h - z11.h}, {z20.h - z23.h}  // 11000001-10110101-01010001-00011101
 // CHECK-INST: fmls    za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x1d,0x51,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b5511d <unknown>
 
 fmls    za.h[w10, 5], {z8.h - z11.h}, {z20.h - z23.h}  // 11000001-10110101-01010001-00011101
 // CHECK-INST: fmls    za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x1d,0x51,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b5511d <unknown>
 
 fmls    za.h[w11, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-01110001-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x9f,0x71,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9719f <unknown>
 
 fmls    za.h[w11, 7], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-01110001-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x9f,0x71,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9719f <unknown>
 
 fmls    za.h[w11, 7, vgx4], {z28.h - z31.h}, {z28.h - z31.h}  // 11000001-10111101-01110011-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x9f,0x73,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd739f <unknown>
 
 fmls    za.h[w11, 7], {z28.h - z31.h}, {z28.h - z31.h}  // 11000001-10111101-01110011-10011111
 // CHECK-INST: fmls    za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x9f,0x73,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd739f <unknown>
 
 fmls    za.h[w8, 5, vgx4], {z16.h - z19.h}, {z16.h - z19.h}  // 11000001-10110001-00010010-00011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h }
 // CHECK-ENCODING: [0x1d,0x12,0xb1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b1121d <unknown>
 
 fmls    za.h[w8, 5], {z16.h - z19.h}, {z16.h - z19.h}  // 11000001-10110001-00010010-00011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h }
 // CHECK-ENCODING: [0x1d,0x12,0xb1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b1121d <unknown>
 
 fmls    za.h[w8, 1, vgx4], {z0.h - z3.h}, {z28.h - z31.h}  // 11000001-10111101-00010000-00011001
 // CHECK-INST: fmls    za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x19,0x10,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd1019 <unknown>
 
 fmls    za.h[w8, 1], {z0.h - z3.h}, {z28.h - z31.h}  // 11000001-10111101-00010000-00011001
 // CHECK-INST: fmls    za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x19,0x10,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd1019 <unknown>
 
 fmls    za.h[w10, 0, vgx4], {z16.h - z19.h}, {z20.h - z23.h}  // 11000001-10110101-01010010-00011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x18,0x52,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b55218 <unknown>
 
 fmls    za.h[w10, 0], {z16.h - z19.h}, {z20.h - z23.h}  // 11000001-10110101-01010010-00011000
 // CHECK-INST: fmls    za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
 // CHECK-ENCODING: [0x18,0x52,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b55218 <unknown>
 
 fmls    za.h[w8, 0, vgx4], {z12.h - z15.h}, {z0.h - z3.h}  // 11000001-10100001-00010001-10011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x98,0x11,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11198 <unknown>
 
 fmls    za.h[w8, 0], {z12.h - z15.h}, {z0.h - z3.h}  // 11000001-10100001-00010001-10011000
 // CHECK-INST: fmls    za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x98,0x11,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a11198 <unknown>
 
 fmls    za.h[w10, 1, vgx4], {z0.h - z3.h}, {z24.h - z27.h}  // 11000001-10111001-01010000-00011001
 // CHECK-INST: fmls    za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h }
 // CHECK-ENCODING: [0x19,0x50,0xb9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b95019 <unknown>
 
 fmls    za.h[w10, 1], {z0.h - z3.h}, {z24.h - z27.h}  // 11000001-10111001-01010000-00011001
 // CHECK-INST: fmls    za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h }
 // CHECK-ENCODING: [0x19,0x50,0xb9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1b95019 <unknown>
 
 fmls    za.h[w8, 5, vgx4], {z20.h - z23.h}, {z28.h - z31.h}  // 11000001-10111101-00010010-10011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x9d,0x12,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd129d <unknown>
 
 fmls    za.h[w8, 5], {z20.h - z23.h}, {z28.h - z31.h}  // 11000001-10111101-00010010-10011101
 // CHECK-INST: fmls    za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h }
 // CHECK-ENCODING: [0x9d,0x12,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1bd129d <unknown>
 
 fmls    za.h[w11, 2, vgx4], {z8.h - z11.h}, {z0.h - z3.h}  // 11000001-10100001-01110001-00011010
 // CHECK-INST: fmls    za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x1a,0x71,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a1711a <unknown>
 
 fmls    za.h[w11, 2], {z8.h - z11.h}, {z0.h - z3.h}  // 11000001-10100001-01110001-00011010
 // CHECK-INST: fmls    za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h }
 // CHECK-ENCODING: [0x1a,0x71,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a1711a <unknown>
 
 fmls    za.h[w9, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-00110001-10011111
 // CHECK-INST: fmls    za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x9f,0x31,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9319f <unknown>
 
 fmls    za.h[w9, 7], {z12.h - z15.h}, {z8.h - z11.h}  // 11000001-10101001-00110001-10011111
 // CHECK-INST: fmls    za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
 // CHECK-ENCODING: [0x9f,0x31,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: c1a9319f <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fmopa-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmopa-diagnostics.s
index def19a316c2a..1c561959c25e 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmopa-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmopa-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
 
 // --------------------------------------------------------------------------//
 // Invalid predicate register
diff --git a/llvm/test/MC/AArch64/SME2p1/fmopa.s b/llvm/test/MC/AArch64/SME2p1/fmopa.s
index e53d21244fde..0a586d3acc42 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmopa.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmopa.s
@@ -1,85 +1,85 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d  --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fmopa   za0.h, p0/m, p0/m, z0.h, z0.h  // 10000001-10000000-00000000-00001000
 // CHECK-INST: fmopa   za0.h, p0/m, p0/m, z0.h, z0.h
 // CHECK-ENCODING: [0x08,0x00,0x80,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81800008 <unknown>
 
 fmopa   za1.h, p5/m, p2/m, z10.h, z21.h  // 10000001-10010101-01010101-01001001
 // CHECK-INST: fmopa   za1.h, p5/m, p2/m, z10.h, z21.h
 // CHECK-ENCODING: [0x49,0x55,0x95,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81955549 <unknown>
 
 fmopa   za1.h, p3/m, p7/m, z13.h, z8.h  // 10000001-10001000-11101101-10101001
 // CHECK-INST: fmopa   za1.h, p3/m, p7/m, z13.h, z8.h
 // CHECK-ENCODING: [0xa9,0xed,0x88,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 8188eda9 <unknown>
 
 fmopa   za1.h, p7/m, p7/m, z31.h, z31.h  // 10000001-10011111-11111111-11101001
 // CHECK-INST: fmopa   za1.h, p7/m, p7/m, z31.h, z31.h
 // CHECK-ENCODING: [0xe9,0xff,0x9f,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819fffe9 <unknown>
 
 fmopa   za1.h, p3/m, p0/m, z17.h, z16.h  // 10000001-10010000-00001110-00101001
 // CHECK-INST: fmopa   za1.h, p3/m, p0/m, z17.h, z16.h
 // CHECK-ENCODING: [0x29,0x0e,0x90,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81900e29 <unknown>
 
 fmopa   za1.h, p1/m, p4/m, z1.h, z30.h  // 10000001-10011110-10000100-00101001
 // CHECK-INST: fmopa   za1.h, p1/m, p4/m, z1.h, z30.h
 // CHECK-ENCODING: [0x29,0x84,0x9e,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819e8429 <unknown>
 
 fmopa   za0.h, p5/m, p2/m, z19.h, z20.h  // 10000001-10010100-01010110-01101000
 // CHECK-INST: fmopa   za0.h, p5/m, p2/m, z19.h, z20.h
 // CHECK-ENCODING: [0x68,0x56,0x94,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81945668 <unknown>
 
 fmopa   za0.h, p6/m, p0/m, z12.h, z2.h  // 10000001-10000010-00011001-10001000
 // CHECK-INST: fmopa   za0.h, p6/m, p0/m, z12.h, z2.h
 // CHECK-ENCODING: [0x88,0x19,0x82,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81821988 <unknown>
 
 fmopa   za1.h, p2/m, p6/m, z1.h, z26.h  // 10000001-10011010-11001000-00101001
 // CHECK-INST: fmopa   za1.h, p2/m, p6/m, z1.h, z26.h
 // CHECK-ENCODING: [0x29,0xc8,0x9a,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819ac829 <unknown>
 
 fmopa   za1.h, p2/m, p0/m, z22.h, z30.h  // 10000001-10011110-00001010-11001001
 // CHECK-INST: fmopa   za1.h, p2/m, p0/m, z22.h, z30.h
 // CHECK-ENCODING: [0xc9,0x0a,0x9e,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819e0ac9 <unknown>
 
 fmopa   za0.h, p5/m, p7/m, z9.h, z1.h  // 10000001-10000001-11110101-00101000
 // CHECK-INST: fmopa   za0.h, p5/m, p7/m, z9.h, z1.h
 // CHECK-ENCODING: [0x28,0xf5,0x81,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 8181f528 <unknown>
 
 fmopa   za1.h, p2/m, p5/m, z12.h, z11.h  // 10000001-10001011-10101001-10001001
 // CHECK-INST: fmopa   za1.h, p2/m, p5/m, z12.h, z11.h
 // CHECK-ENCODING: [0x89,0xa9,0x8b,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 818ba989 <unknown>
 
diff --git a/llvm/test/MC/AArch64/SME2p1/fmops-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmops-diagnostics.s
index 75eea8113262..0ec227ae0e68 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmops-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmops-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
 
 // --------------------------------------------------------------------------//
 // Invalid predicate register
diff --git a/llvm/test/MC/AArch64/SME2p1/fmops.s b/llvm/test/MC/AArch64/SME2p1/fmops.s
index 325d4c125b60..597665d59150 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmops.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmops.s
@@ -1,84 +1,84 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 fmops   za0.h, p0/m, p0/m, z0.h, z0.h  // 10000001-10000000-00000000-00011000
 // CHECK-INST: fmops   za0.h, p0/m, p0/m, z0.h, z0.h
 // CHECK-ENCODING: [0x18,0x00,0x80,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81800018 <unknown>
 
 fmops   za1.h, p5/m, p2/m, z10.h, z21.h  // 10000001-10010101-01010101-01011001
 // CHECK-INST: fmops   za1.h, p5/m, p2/m, z10.h, z21.h
 // CHECK-ENCODING: [0x59,0x55,0x95,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81955559 <unknown>
 
 fmops   za1.h, p3/m, p7/m, z13.h, z8.h  // 10000001-10001000-11101101-10111001
 // CHECK-INST: fmops   za1.h, p3/m, p7/m, z13.h, z8.h
 // CHECK-ENCODING: [0xb9,0xed,0x88,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 8188edb9 <unknown>
 
 fmops   za1.h, p7/m, p7/m, z31.h, z31.h  // 10000001-10011111-11111111-11111001
 // CHECK-INST: fmops   za1.h, p7/m, p7/m, z31.h, z31.h
 // CHECK-ENCODING: [0xf9,0xff,0x9f,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819ffff9 <unknown>
 
 fmops   za1.h, p3/m, p0/m, z17.h, z16.h  // 10000001-10010000-00001110-00111001
 // CHECK-INST: fmops   za1.h, p3/m, p0/m, z17.h, z16.h
 // CHECK-ENCODING: [0x39,0x0e,0x90,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81900e39 <unknown>
 
 fmops   za1.h, p1/m, p4/m, z1.h, z30.h  // 10000001-10011110-10000100-00111001
 // CHECK-INST: fmops   za1.h, p1/m, p4/m, z1.h, z30.h
 // CHECK-ENCODING: [0x39,0x84,0x9e,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819e8439 <unknown>
 
 fmops   za0.h, p5/m, p2/m, z19.h, z20.h  // 10000001-10010100-01010110-01111000
 // CHECK-INST: fmops   za0.h, p5/m, p2/m, z19.h, z20.h
 // CHECK-ENCODING: [0x78,0x56,0x94,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81945678 <unknown>
 
 fmops   za0.h, p6/m, p0/m, z12.h, z2.h  // 10000001-10000010-00011001-10011000
 // CHECK-INST: fmops   za0.h, p6/m, p0/m, z12.h, z2.h
 // CHECK-ENCODING: [0x98,0x19,0x82,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 81821998 <unknown>
 
 fmops   za1.h, p2/m, p6/m, z1.h, z26.h  // 10000001-10011010-11001000-00111001
 // CHECK-INST: fmops   za1.h, p2/m, p6/m, z1.h, z26.h
 // CHECK-ENCODING: [0x39,0xc8,0x9a,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819ac839 <unknown>
 
 fmops   za1.h, p2/m, p0/m, z22.h, z30.h  // 10000001-10011110-00001010-11011001
 // CHECK-INST: fmops   za1.h, p2/m, p0/m, z22.h, z30.h
 // CHECK-ENCODING: [0xd9,0x0a,0x9e,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 819e0ad9 <unknown>
 
 fmops   za0.h, p5/m, p7/m, z9.h, z1.h  // 10000001-10000001-11110101-00111000
 // CHECK-INST: fmops   za0.h, p5/m, p7/m, z9.h, z1.h
 // CHECK-ENCODING: [0x38,0xf5,0x81,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 8181f538 <unknown>
 
 fmops   za1.h, p2/m, p5/m, z12.h, z11.h  // 10000001-10001011-10101001-10011001
 // CHECK-INST: fmops   za1.h, p2/m, p5/m, z12.h, z11.h
 // CHECK-ENCODING: [0x99,0xa9,0x8b,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
 // CHECK-UNKNOWN: 818ba999 <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fsub-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fsub-diagnostics.s
index 716427a2f725..60cef4260be9 100644
--- a/llvm/test/MC/AArch64/SME2p1/fsub-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fsub-diagnostics.s
@@ -1,5 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
-
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
 // --------------------------------------------------------------------------//
 // Out of range index offset
 
diff --git a/llvm/test/MC/AArch64/SME2p1/fsub.s b/llvm/test/MC/AArch64/SME2p1/fsub.s
index b3735d554765..66410008eb11 100644
--- a/llvm/test/MC/AArch64/SME2p1/fsub.s
+++ b/llvm/test/MC/AArch64/SME2p1/fsub.s
@@ -1,296 +1,298 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN:        | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
 // RUN:        | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
 // RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
 // RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 
 
 fsub    za.h[w8, 0], {z0.h - z1.h}  // 11000001-10100100-00011100-00001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x08,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41c08 <unknown>
 
 fsub    za.h[w10, 5, vgx2], {z10.h, z11.h}  // 11000001-10100100-01011101-01001101
 // CHECK-INST: fsub    za.h[w10, 5, vgx2], { z10.h, z11.h }
 // CHECK-ENCODING: [0x4d,0x5d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a45d4d <unknown>
 
 fsub    za.h[w10, 5], {z10.h - z11.h}  // 11000001-10100100-01011101-01001101
 // CHECK-INST: fsub    za.h[w10, 5, vgx2], { z10.h, z11.h }
 // CHECK-ENCODING: [0x4d,0x5d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a45d4d <unknown>
 
 fsub    za.h[w11, 7, vgx2], {z12.h, z13.h}  // 11000001-10100100-01111101-10001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x8f,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a47d8f <unknown>
 
 fsub    za.h[w11, 7], {z12.h - z13.h}  // 11000001-10100100-01111101-10001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x8f,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a47d8f <unknown>
 
 fsub    za.h[w11, 7, vgx2], {z30.h, z31.h}  // 11000001-10100100-01111111-11001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx2], { z30.h, z31.h }
 // CHECK-ENCODING: [0xcf,0x7f,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a47fcf <unknown>
 
 fsub    za.h[w11, 7], {z30.h - z31.h}  // 11000001-10100100-01111111-11001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx2], { z30.h, z31.h }
 // CHECK-ENCODING: [0xcf,0x7f,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a47fcf <unknown>
 
 fsub    za.h[w8, 5, vgx2], {z16.h, z17.h}  // 11000001-10100100-00011110-00001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx2], { z16.h, z17.h }
 // CHECK-ENCODING: [0x0d,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41e0d <unknown>
 
 fsub    za.h[w8, 5], {z16.h - z17.h}  // 11000001-10100100-00011110-00001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx2], { z16.h, z17.h }
 // CHECK-ENCODING: [0x0d,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41e0d <unknown>
 
 fsub    za.h[w8, 1, vgx2], {z0.h, z1.h}  // 11000001-10100100-00011100-00001001
 // CHECK-INST: fsub    za.h[w8, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x09,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41c09 <unknown>
 
 fsub    za.h[w8, 1], {z0.h - z1.h}  // 11000001-10100100-00011100-00001001
 // CHECK-INST: fsub    za.h[w8, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x09,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41c09 <unknown>
 
 fsub    za.h[w10, 0, vgx2], {z18.h, z19.h}  // 11000001-10100100-01011110, 01001000
 // CHECK-INST: fsub    za.h[w10, 0, vgx2], { z18.h, z19.h }
 // CHECK-ENCODING: [0x48,0x5e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a45e48 <unknown>
 
 fsub    za.h[w10, 0], {z18.h - z19.h}  // 11000001-10100100-01011110-01001000
 // CHECK-INST: fsub    za.h[w10, 0, vgx2], { z18.h, z19.h }
 // CHECK-ENCODING: [0x48,0x5e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a45e48 <unknown>
 
 fsub    za.h[w8, 0, vgx2], {z12.h, z13.h}  // 11000001-10100100-00011101-10001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x88,0x1d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41d88 <unknown>
 
 fsub    za.h[w8, 0], {z12.h - z13.h}  // 11000001-10100100-00011101-10001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x88,0x1d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41d88 <unknown>
 
 fsub    za.h[w10, 1, vgx2], {z0.h, z1.h}  // 11000001-10100100-01011100-00001001
 // CHECK-INST: fsub    za.h[w10, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x09,0x5c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a45c09 <unknown>
 
 fsub    za.h[w10, 1], {z0.h - z1.h}  // 11000001-10100100-01011100-00001001
 // CHECK-INST: fsub    za.h[w10, 1, vgx2], { z0.h, z1.h }
 // CHECK-ENCODING: [0x09,0x5c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a45c09 <unknown>
 
 fsub    za.h[w8, 5, vgx2], {z22.h, z23.h}  // 11000001-10100100-00011110, 11001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx2], { z22.h, z23.h }
 // CHECK-ENCODING: [0xcd,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41ecd <unknown>
 
 fsub    za.h[w8, 5], {z22.h - z23.h}  // 11000001-10100100-00011110-11001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx2], { z22.h, z23.h }
 // CHECK-ENCODING: [0xcd,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a41ecd <unknown>
 
 fsub    za.h[w11, 2, vgx2], {z8.h, z9.h}  // 11000001-10100100-01111101-00001010
 // CHECK-INST: fsub    za.h[w11, 2, vgx2], { z8.h, z9.h }
 // CHECK-ENCODING: [0x0a,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a47d0a <unknown>
 
 fsub    za.h[w11, 2], {z8.h - z9.h}  // 11000001-10100100-01111101-00001010
 // CHECK-INST: fsub    za.h[w11, 2, vgx2], { z8.h, z9.h }
 // CHECK-ENCODING: [0x0a,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a47d0a <unknown>
 
 fsub    za.h[w9, 7, vgx2], {z12.h, z13.h}  // 11000001-10100100-00111101-10001111
 // CHECK-INST: fsub    za.h[w9, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x8f,0x3d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a43d8f <unknown>
 
 fsub    za.h[w9, 7], {z12.h - z13.h}  // 11000001-10100100-00111101-10001111
 // CHECK-INST: fsub    za.h[w9, 7, vgx2], { z12.h, z13.h }
 // CHECK-ENCODING: [0x8f,0x3d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a43d8f <unknown>
 
 
 fsub    za.h[w8, 0, vgx4], {z0.h - z3.h}  // 11000001-10100101-00011100-00001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x08,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51c08 <unknown>
 
 fsub    za.h[w8, 0], {z0.h - z3.h}  // 11000001-10100101-00011100-00001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x08,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51c08 <unknown>
 
 fsub    za.h[w10, 5, vgx4], {z8.h - z11.h}  // 11000001-10100101-01011101-00001101
 // CHECK-INST: fsub    za.h[w10, 5, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x0d,0x5d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a55d0d <unknown>
 
 fsub    za.h[w10, 5], {z8.h - z11.h}  // 11000001-10100101-01011101-00001101
 // CHECK-INST: fsub    za.h[w10, 5, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x0d,0x5d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a55d0d <unknown>
 
 fsub    za.h[w11, 7, vgx4], {z12.h - z15.h}  // 11000001-10100101-01111101-10001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x8f,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a57d8f <unknown>
 
 fsub    za.h[w11, 7], {z12.h - z15.h}  // 11000001-10100101-01111101-10001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x8f,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a57d8f <unknown>
 
 fsub    za.h[w11, 7, vgx4], {z28.h - z31.h}  // 11000001-10100101-01111111-10001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx4], { z28.h - z31.h }
 // CHECK-ENCODING: [0x8f,0x7f,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a57f8f <unknown>
 
 fsub    za.h[w11, 7], {z28.h - z31.h}  // 11000001-10100101-01111111-10001111
 // CHECK-INST: fsub    za.h[w11, 7, vgx4], { z28.h - z31.h }
 // CHECK-ENCODING: [0x8f,0x7f,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a57f8f <unknown>
 
 fsub    za.h[w8, 5, vgx4], {z16.h - z19.h}  // 11000001-10100101-00011110-00001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x0d,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51e0d <unknown>
 
 fsub    za.h[w8, 5], {z16.h - z19.h}  // 11000001-10100101-00011110-00001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x0d,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51e0d <unknown>
 
 fsub    za.h[w8, 1, vgx4], {z0.h - z3.h}  // 11000001-10100101-00011100-00001001
 // CHECK-INST: fsub    za.h[w8, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x09,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51c09 <unknown>
 
 fsub    za.h[w8, 1], {z0.h - z3.h}  // 11000001-10100101-00011100-00001001
 // CHECK-INST: fsub    za.h[w8, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x09,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51c09 <unknown>
 
 fsub    za.h[w10, 0, vgx4], {z16.h - z19.h}  // 11000001-10100101-01011110-00001000
 // CHECK-INST: fsub    za.h[w10, 0, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x08,0x5e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a55e08 <unknown>
 
 fsub    za.h[w10, 0], {z16.h - z19.h}  // 11000001-10100101-01011110-00001000
 // CHECK-INST: fsub    za.h[w10, 0, vgx4], { z16.h - z19.h }
 // CHECK-ENCODING: [0x08,0x5e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a55e08 <unknown>
 
 fsub    za.h[w8, 0, vgx4], {z12.h - z15.h}  // 11000001-10100101-00011101-10001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x88,0x1d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51d88 <unknown>
 
 fsub    za.h[w8, 0], {z12.h - z15.h}  // 11000001-10100101-00011101-10001000
 // CHECK-INST: fsub    za.h[w8, 0, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x88,0x1d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51d88 <unknown>
 
 fsub    za.h[w10, 1, vgx4], {z0.h - z3.h}  // 11000001-10100101-01011100-00001001
 // CHECK-INST: fsub    za.h[w10, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x09,0x5c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a55c09 <unknown>
 
 fsub    za.h[w10, 1], {z0.h - z3.h}  // 11000001-10100101-01011100-00001001
 // CHECK-INST: fsub    za.h[w10, 1, vgx4], { z0.h - z3.h }
 // CHECK-ENCODING: [0x09,0x5c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a55c09 <unknown>
 
 fsub    za.h[w8, 5, vgx4], {z20.h - z23.h}  // 11000001-10100101-00011110-10001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx4], { z20.h - z23.h }
 // CHECK-ENCODING: [0x8d,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51e8d <unknown>
 
 fsub    za.h[w8, 5], {z20.h - z23.h}  // 11000001-10100101-00011110-10001101
 // CHECK-INST: fsub    za.h[w8, 5, vgx4], { z20.h - z23.h }
 // CHECK-ENCODING: [0x8d,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a51e8d <unknown>
 
 fsub    za.h[w11, 2, vgx4], {z8.h - z11.h}  // 11000001-10100101-01111101-00001010
 // CHECK-INST: fsub    za.h[w11, 2, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x0a,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a57d0a <unknown>
 
 fsub    za.h[w11, 2], {z8.h - z11.h}  // 11000001-10100101-01111101-00001010
 // CHECK-INST: fsub    za.h[w11, 2, vgx4], { z8.h - z11.h }
 // CHECK-ENCODING: [0x0a,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a57d0a <unknown>
 
 fsub    za.h[w9, 7, vgx4], {z12.h - z15.h}  // 11000001-10100101-00111101-10001111
 // CHECK-INST: fsub    za.h[w9, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x8f,0x3d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a53d8f <unknown>
 
 fsub    za.h[w9, 7], {z12.h - z15.h}  // 11000001-10100101-00111101-10001111
 // CHECK-INST: fsub    za.h[w9, 7, vgx4], { z12.h - z15.h }
 // CHECK-ENCODING: [0x8f,0x3d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
 // CHECK-UNKNOWN: c1a53d8f <unknown>
diff --git a/llvm/test/MC/AMDGPU/ds-err.s b/llvm/test/MC/AMDGPU/ds-err.s
index 2d25fdf5e302..c31f4c759395 100644
--- a/llvm/test/MC/AMDGPU/ds-err.s
+++ b/llvm/test/MC/AMDGPU/ds-err.s
@@ -18,19 +18,19 @@ ds_write2_b32 v2, v4, v6 offset0:4 offset0:8
 ds_write2_b32 v2, v4, v6 offset1:4 offset1:8
 
 // offset0 too big
-// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid offset0 value.
 ds_write2_b32 v2, v4, v6 offset0:1000000000
 
 // offset0 too big
-// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid offset0 value.
 ds_write2_b32 v2, v4, v6 offset0:0x100
 
 // offset1 too big
-// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid offset1 value.
 ds_write2_b32 v2, v4, v6 offset1:1000000000
 
 // offset1 too big
-// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid offset1 value.
 ds_write2_b32 v2, v4, v6 offset1:0x100
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
index 3ec31626be5b..7f99afe01925 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
@@ -22,13 +22,13 @@ s_delay_alu instid0(VALU_DEP_1) | SALU_CYCLE_1)
 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: expected a left parenthesis
 
 lds_direct_load v15 wait_vdst:16
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid wait_vdst value.
 
 lds_direct_load v15 wait_vdst
 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
 v_interp_p10_f32 v0, v1, v2, v3 wait_exp:8
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid wait_exp value.
 
 v_interp_p2_f32 v0, -v1, v2, v3 wait_exp
 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
index be9edc3e019e..b0854881d428 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
@@ -1126,6 +1126,18 @@ v_cvt_sr_fp8_f32 v10, s2, v5
 v_cvt_sr_fp8_f32 v5, -|v255|, v4
 // GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20]
 
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:0
+// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3             ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1
+// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1  ; encoding: [0x01,0x20,0x6b,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2
+// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2  ; encoding: [0x01,0x40,0x6b,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3
+// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3  ; encoding: [0x01,0x60,0x6b,0xd7,0x02,0x07,0x02,0x00]
+
 v_cvt_sr_bf8_f32 v1, v2, v3
 // GFX12: encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
 
@@ -1135,6 +1147,18 @@ v_cvt_sr_bf8_f32 v10, s2, v5
 v_cvt_sr_bf8_f32 v5, -|v255|, v4
 // GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]
 
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:0
+// GFX12: v_cvt_sr_bf8_f32 v1, v2, v3             ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1
+// GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1  ; encoding: [0x01,0x20,0x6c,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2
+// GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2  ; encoding: [0x01,0x40,0x6c,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3
+// GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3  ; encoding: [0x01,0x60,0x6c,0xd7,0x02,0x07,0x02,0x00]
+
 v_cvt_pk_i16_f32 v5, v1, v2
 // GFX12: encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
index d0e309adce41..16cd8d5aa5e9 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
@@ -1192,6 +1192,18 @@ v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:
 v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1
 // GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:0 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
 v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
 // GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
 
@@ -1219,6 +1231,18 @@ v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:
 v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1
 // GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:0 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
 v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
 // GFX12: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
index 25b13ac62e4a..d6ef14cff5fa 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
@@ -698,6 +698,18 @@ v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0]
 // GFX12: encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
 
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x20,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x40,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x60,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
 v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
@@ -710,6 +722,18 @@ v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0]
 // GFX12: encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
 
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x20,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x40,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x60,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
 v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s
index 55c5fcabea73..a9dd290ea67d 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s
@@ -101,3 +101,8 @@ v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 1]
 // GFX12: error: invalid op_sel operand
 // GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 1]
 // GFX12-NEXT:{{^}}                                ^
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:4
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid byte_sel value.
+// GFX12-NEXT:{{^}}v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:4
+// GFX12-NEXT:{{^}}                            ^
diff --git a/llvm/test/MC/AMDGPU/pal-msgpack.s b/llvm/test/MC/AMDGPU/pal-msgpack.s
index 886cc8b0538b..03c6c547af8a 100644
--- a/llvm/test/MC/AMDGPU/pal-msgpack.s
+++ b/llvm/test/MC/AMDGPU/pal-msgpack.s
@@ -14,10 +14,10 @@ amdpal.pipelines:
       - 0x123456789abcdef0
       - 0xfedcba9876543210
     .registers:      
-      0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0
-      0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
-      0xa1b3 (SPI_PS_INPUT_ENA): 0x1
-      0xa1b4 (SPI_PS_INPUT_ADDR): 0x1
+      '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0
+      '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
+      '0xa1b3 (SPI_PS_INPUT_ENA)': 0x1
+      '0xa1b4 (SPI_PS_INPUT_ADDR)': 0x1
 ...
 	.end_amdgpu_pal_metadata
 
@@ -34,10 +34,10 @@ amdpal.pipelines:
 // ASM:       - 0x123456789abcdef0
 // ASM:       - 0xfedcba9876543210
 // ASM:     .registers:      
-// ASM:       0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0
-// ASM:       0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
-// ASM:       0xa1b3 (SPI_PS_INPUT_ENA): 0x1
-// ASM:       0xa1b4 (SPI_PS_INPUT_ADDR): 0x1
+// ASM:       '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0
+// ASM:       '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
+// ASM:       '0xa1b3 (SPI_PS_INPUT_ENA)': 0x1
+// ASM:       '0xa1b4 (SPI_PS_INPUT_ADDR)': 0x1
 // ASM: ...
 // ASM: 	.end_amdgpu_pal_metadata
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
index 6acaa8152720..2c911777ef97 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
@@ -1020,6 +1020,15 @@
 # GFX12: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20]
 0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20
 
+# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1  ; encoding: [0x01,0x20,0x6b,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x20,0x6b,0xd7,0x02,0x07,0x02,0x00
+
+# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2  ; encoding: [0x01,0x40,0x6b,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x40,0x6b,0xd7,0x02,0x07,0x02,0x00
+
+# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3  ; encoding: [0x01,0x60,0x6b,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x60,0x6b,0xd7,0x02,0x07,0x02,0x00
+
 # GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
 0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00
 
@@ -1029,6 +1038,15 @@
 # GFX12: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]
 0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20
 
+# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1  ; encoding: [0x01,0x20,0x6c,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x20,0x6c,0xd7,0x02,0x07,0x02,0x00
+
+# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2  ; encoding: [0x01,0x40,0x6c,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x40,0x6c,0xd7,0x02,0x07,0x02,0x00
+
+# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3  ; encoding: [0x01,0x60,0x6c,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x60,0x6c,0xd7,0x02,0x07,0x02,0x00
+
 # GFX12: v_cvt_pk_i16_f32 v5, v1, v2             ; encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00]
 0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
index 0771e6449b62..f9b6c1b73ddc 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
@@ -945,6 +945,15 @@
 # GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed
 
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x20,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x40,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x60,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
 # GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
 0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed
 
@@ -972,6 +981,15 @@
 # GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
 0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed
 
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x20,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x40,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x60,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
 # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
index a836adafb31e..eedc6d491087 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
@@ -570,6 +570,15 @@
 # GFX12: v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
 0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
 
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x20,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x20,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x40,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x40,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x60,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x60,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
 # GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
@@ -582,6 +591,15 @@
 # GFX12: v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
 0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
 
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x20,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x20,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x40,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x40,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x60,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x60,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
 # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
 0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 
diff --git a/llvm/test/MC/Disassembler/M68k/data.txt b/llvm/test/MC/Disassembler/M68k/data.txt
index 8e2fb3f13560..3951ea677f11 100644
--- a/llvm/test/MC/Disassembler/M68k/data.txt
+++ b/llvm/test/MC/Disassembler/M68k/data.txt
@@ -36,6 +36,12 @@
 # CHECK: move.l (64,%sp,%a0), %d0
 0x20 0x37 0x88 0x40
 
+# CHECK: move.b #234, %d2
+0x14 0x3c 0x00 0xea
+
+# CHECK: moveq #100, %d2
+0x74 0x64
+
 # CHECK: move.l $f0000000, %a5
 0x2a 0x79 0xf0 0x00 0x00 0x00
 
diff --git a/llvm/test/MC/ELF/RISCV/gen-dwarf.s b/llvm/test/MC/ELF/RISCV/gen-dwarf.s
index 342ed1cc0e7e..34d02f5da44f 100644
--- a/llvm/test/MC/ELF/RISCV/gen-dwarf.s
+++ b/llvm/test/MC/ELF/RISCV/gen-dwarf.s
@@ -40,28 +40,28 @@
 # CHECK-NEXT: 0x00000020: [DW_RLE_end_of_list ]
 
 # RELOC:      Section ([[#]]) .rela.eh_frame {
-# RELOC-NEXT:   0x1C R_RISCV_32_PCREL <null> 0x0
-# RELOC-NEXT:   0x20 R_RISCV_ADD32 <null> 0x0
-# RELOC-NEXT:   0x20 R_RISCV_SUB32 <null> 0x0
-# RELOC-NEXT:   0x25 R_RISCV_SET6 <null> 0x0
-# RELOC-NEXT:   0x25 R_RISCV_SUB6 <null> 0x0
-# RELOC-NEXT:   0x34 R_RISCV_32_PCREL <null> 0x0
+# RELOC-NEXT:   0x1C R_RISCV_32_PCREL .L0  0x0
+# RELOC-NEXT:   0x20 R_RISCV_ADD32 .L0  0x0
+# RELOC-NEXT:   0x20 R_RISCV_SUB32 .L0  0x0
+# RELOC-NEXT:   0x25 R_RISCV_SET6 .L0  0x0
+# RELOC-NEXT:   0x25 R_RISCV_SUB6 .L0  0x0
+# RELOC-NEXT:   0x34 R_RISCV_32_PCREL .L0  0x0
 # RELOC-NEXT: }
 
 # RELOC:      Section ([[#]]) .rela.debug_rnglists {
 # RELOC-NEXT:   0xD R_RISCV_64 .text.foo 0x0
-# RELOC-NEXT:   0x15 R_RISCV_SET_ULEB128 <null> 0x0
+# RELOC-NEXT:   0x15 R_RISCV_SET_ULEB128 .L0  0x0
 # RELOC-NEXT:   0x15 R_RISCV_SUB_ULEB128 .text.foo 0x0
 # RELOC-NEXT:   0x17 R_RISCV_64 .text.bar 0x0
 # RELOC-NEXT: }
 
 # RELOC:      Section ([[#]]) .rela.debug_line {
-# RELOC:        R_RISCV_ADD16 <null> 0x0
-# RELOC-NEXT:   R_RISCV_SUB16 <null> 0x0
-# RELOC-NEXT:   R_RISCV_ADD16 <null> 0x0
-# RELOC-NEXT:   R_RISCV_SUB16 <null> 0x0
-# RELOC-NEXT:   R_RISCV_ADD16 <null> 0x0
-# RELOC-NEXT:   R_RISCV_SUB16 <null> 0x0
+# RELOC:        R_RISCV_ADD16 .L0  0x0
+# RELOC-NEXT:   R_RISCV_SUB16 .L0  0x0
+# RELOC-NEXT:   R_RISCV_ADD16 .L0  0x0
+# RELOC-NEXT:   R_RISCV_SUB16 .L0  0x0
+# RELOC-NEXT:   R_RISCV_ADD16 .L0  0x0
+# RELOC-NEXT:   R_RISCV_SUB16 .L0  0x0
 # RELOC:      }
 
 # RELOC:      Hex dump of section '.eh_frame':
diff --git a/llvm/test/MC/LoongArch/Macros/macros-la-bad.s b/llvm/test/MC/LoongArch/Macros/macros-la-bad.s
index 03c6355e40b0..29c9745e4ad8 100644
--- a/llvm/test/MC/LoongArch/Macros/macros-la-bad.s
+++ b/llvm/test/MC/LoongArch/Macros/macros-la-bad.s
@@ -11,3 +11,6 @@ la.abs $a0, $a1, sym
 
 la.pcrel $a0, $a0, sym
 # CHECK: :[[#@LINE-1]]:11: error: $rd must be different from $rj
+
+la.tls.desc $a1, sym
+# CHECK: :[[#@LINE-1]]:14: error: $rd must be $r4
diff --git a/llvm/test/MC/LoongArch/Macros/macros-la.s b/llvm/test/MC/LoongArch/Macros/macros-la.s
index 1a1d12d7d7df..5c572c8e75a0 100644
--- a/llvm/test/MC/LoongArch/Macros/macros-la.s
+++ b/llvm/test/MC/LoongArch/Macros/macros-la.s
@@ -3,6 +3,8 @@
 # RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC
 # RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax
 # RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX
+# RUN: llvm-mc --triple=loongarch64 --defsym ABS=1 --mattr=+la-global-with-abs \
+# RUN:     %s | FileCheck %s --check-prefix=ABS
 
 # RELOC:      Relocations [
 # RELOC-NEXT:   Section ({{.*}}) .rela.text {
@@ -124,5 +126,48 @@ la.tls.gd $a0, $a1, sym_gd_large
 # RELOC-NEXT: R_LARCH_GOT64_PC_LO20 sym_gd_large 0x0
 # RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_gd_large 0x0
 
+la.tls.desc $a0, sym_desc
+# CHECK-NEXT: pcalau12i $a0, %desc_pc_hi20(sym_desc)
+# CHECK-NEXT: addi.d $a0, $a0, %desc_pc_lo12(sym_desc)
+# CHECK-NEXT: ld.d $ra, $a0, %desc_ld(sym_desc)
+# CHECK-NEXT: jirl $ra, $ra, %desc_call(sym_desc)
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_TLS_DESC_PC_HI20 sym_desc 0x0
+# RELOC-NEXT: R_LARCH_TLS_DESC_PC_LO12 sym_desc 0x0
+# RELOC-NEXT: R_LARCH_TLS_DESC_LD sym_desc 0x0
+# RELOC-NEXT: R_LARCH_TLS_DESC_CALL sym_desc 0x0
+
+la.tls.desc $a0, $a1, sym_desc_large
+# CHECK-NEXT: pcalau12i $a0, %desc_pc_hi20(sym_desc_large)
+# CHECK-NEXT: addi.d $a1, $zero, %desc_pc_lo12(sym_desc_large)
+# CHECK-NEXT: lu32i.d $a1, %desc64_pc_lo20(sym_desc_large)
+# CHECK-NEXT: lu52i.d $a1, $a1, %desc64_pc_hi12(sym_desc_large)
+# CHECK-NEXT: add.d $a0, $a0, $a1
+# CHECK-NEXT: ld.d $ra, $a0, %desc_ld(sym_desc_large)
+# CHECK-NEXT: jirl $ra, $ra, %desc_call(sym_desc_large)
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_TLS_DESC_PC_HI20 sym_desc_large 0x0
+# RELOC-NEXT: R_LARCH_TLS_DESC_PC_LO12 sym_desc_large 0x0
+# RELOC-NEXT: R_LARCH_TLS_DESC64_PC_LO20 sym_desc_large 0x0
+# RELOC-NEXT: R_LARCH_TLS_DESC64_PC_HI12 sym_desc_large 0x0
+# RELOC-NEXT: R_LARCH_TLS_DESC_LD sym_desc_large 0x0
+# RELOC-NEXT: R_LARCH_TLS_DESC_CALL sym_desc_large 0x0
+
+
 # RELOC-NEXT:   }
 # RELOC-NEXT: ]
+
+#############################################################
+## with feature: +la-global-with-abs
+#############################################################
+.ifdef ABS
+
+la.tls.desc $a0, sym_desc
+# ABS:      lu12i.w $a0, %desc_hi20(sym_desc)
+# ABS-NEXT: ori $a0, $a0, %desc_lo12(sym_desc)
+# ABS-NEXT: lu32i.d $a0, %desc64_lo20(sym_desc)
+# ABS-NEXT: lu52i.d $a0, $a0, %desc64_hi12(sym_desc)
+# ABS-NEXT: ld.d $ra, $a0, %desc_ld(sym_desc)
+# ABS-NEXT: jirl $ra, $ra, %desc_call(sym_desc)
+
+.endif
diff --git a/llvm/test/MC/LoongArch/Misc/tls-symbols.s b/llvm/test/MC/LoongArch/Misc/tls-symbols.s
index 2f91cbe004d2..340fea29ed94 100644
--- a/llvm/test/MC/LoongArch/Misc/tls-symbols.s
+++ b/llvm/test/MC/LoongArch/Misc/tls-symbols.s
@@ -77,3 +77,25 @@ lu12i.w $a1, %le_hi20(le)
 # CHECK-NEXT:   Other: 0
 # CHECK-NEXT:   Section: Undefined
 # CHECK-NEXT: }
+
+pcalau12i $a1, %desc_pc_hi20(desc_pc)
+# CHECK-NEXT: Symbol {
+# CHECK-NEXT:   Name: desc_pc
+# CHECK-NEXT:   Value: 0x0
+# CHECK-NEXT:   Size: 0
+# CHECK-NEXT:   Binding: Global
+# CHECK-NEXT:   Type: TLS
+# CHECK-NEXT:   Other: 0
+# CHECK-NEXT:   Section: Undefined
+# CHECK-NEXT: }
+
+lu12i.w $a1, %desc_hi20(desc_abs)
+# CHECK-NEXT: Symbol {
+# CHECK-NEXT:   Name: desc_abs
+# CHECK-NEXT:   Value: 0x0
+# CHECK-NEXT:   Size: 0
+# CHECK-NEXT:   Binding: Global
+# CHECK-NEXT:   Type: TLS
+# CHECK-NEXT:   Other: 0
+# CHECK-NEXT:   Section: Undefined
+# CHECK-NEXT: }
diff --git a/llvm/test/MC/LoongArch/Relocations/relocations.s b/llvm/test/MC/LoongArch/Relocations/relocations.s
index bec71e103893..87df59978c6e 100644
--- a/llvm/test/MC/LoongArch/Relocations/relocations.s
+++ b/llvm/test/MC/LoongArch/Relocations/relocations.s
@@ -223,3 +223,53 @@ pcaddu18i $t1, %call36(foo)
 # RELOC: R_LARCH_CALL36 foo 0x0
 # INSTR: pcaddu18i $t1, %call36(foo)
 # FIXUP: fixup A - offset: 0, value: %call36(foo), kind: FK_NONE
+
+pcalau12i $t1, %desc_pc_hi20(foo)
+# RELOC: R_LARCH_TLS_DESC_PC_HI20 foo 0x0
+# INSTR: pcalau12i $t1, %desc_pc_hi20(foo)
+# FIXUP: fixup A - offset: 0, value: %desc_pc_hi20(foo), kind: FK_NONE
+
+addi.d $t1, $t1, %desc_pc_lo12(foo)
+# RELOC: R_LARCH_TLS_DESC_PC_LO12 foo 0x0
+# INSTR: addi.d $t1, $t1, %desc_pc_lo12(foo)
+# FIXUP: fixup A - offset: 0, value: %desc_pc_lo12(foo), kind: FK_NONE
+
+lu32i.d $t1, %desc64_pc_lo20(foo)
+# RELOC: R_LARCH_TLS_DESC64_PC_LO20 foo 0x0
+# INSTR: lu32i.d $t1, %desc64_pc_lo20(foo)
+# FIXUP: fixup A - offset: 0, value: %desc64_pc_lo20(foo), kind: FK_NONE
+
+lu52i.d $t1, $t1, %desc64_pc_hi12(foo)
+# RELOC: R_LARCH_TLS_DESC64_PC_HI12 foo 0x0
+# INSTR: lu52i.d $t1, $t1, %desc64_pc_hi12(foo)
+# FIXUP: fixup A - offset: 0, value: %desc64_pc_hi12(foo), kind: FK_NONE
+
+ld.d $ra, $t1, %desc_ld(foo)
+# RELOC: R_LARCH_TLS_DESC_LD foo 0x0
+# INSTR: ld.d $ra, $t1, %desc_ld(foo)
+# FIXUP: fixup A - offset: 0, value: %desc_ld(foo), kind: FK_NONE
+
+jirl $ra, $ra, %desc_call(foo)
+# RELOC: R_LARCH_TLS_DESC_CALL foo 0x0
+# INSTR: jirl $ra, $ra, %desc_call(foo)
+# FIXUP: fixup A - offset: 0, value: %desc_call(foo), kind: FK_NONE
+
+lu12i.w $t1, %desc_hi20(foo)
+# RELOC: R_LARCH_TLS_DESC_HI20 foo 0x0
+# INSTR: lu12i.w $t1, %desc_hi20(foo)
+# FIXUP: fixup A - offset: 0, value: %desc_hi20(foo), kind: FK_NONE
+
+ori $t1, $t1, %desc_lo12(foo)
+# RELOC: R_LARCH_TLS_DESC_LO12 foo 0x0
+# INSTR: ori $t1, $t1, %desc_lo12(foo)
+# FIXUP: fixup A - offset: 0, value: %desc_lo12(foo), kind: FK_NONE
+
+lu32i.d $t1, %desc64_lo20(foo)
+# RELOC: R_LARCH_TLS_DESC64_LO20 foo 0x0
+# INSTR: lu32i.d $t1, %desc64_lo20(foo)
+# FIXUP: fixup A - offset: 0, value: %desc64_lo20(foo), kind: FK_NONE
+
+lu52i.d $t1, $t1, %desc64_hi12(foo)
+# RELOC: R_LARCH_TLS_DESC64_HI12 foo 0x0
+# INSTR: lu52i.d $t1, $t1, %desc64_hi12(foo)
+# FIXUP: fixup A - offset: 0, value: %desc64_hi12(foo), kind: FK_NONE
diff --git a/llvm/test/MC/M68k/Data/Classes/MxMove_RI.s b/llvm/test/MC/M68k/Data/Classes/MxMove_RI.s
index 091367a68256..2081924d7b17 100644
--- a/llvm/test/MC/M68k/Data/Classes/MxMove_RI.s
+++ b/llvm/test/MC/M68k/Data/Classes/MxMove_RI.s
@@ -9,3 +9,6 @@ move.l	#42, %a1
 ; CHECK:      move.l  #-1, %a1
 ; CHECK-SAME: encoding: [0x22,0x7c,0xff,0xff,0xff,0xff]
 move.l	#-1, %a1
+; CHECK:      moveq  #-17, %d3
+; CHECK-SAME: encoding: [0x76,0xef]
+moveq	#-17, %d3
diff --git a/llvm/test/MC/RISCV/XTHeadVdot-valid.s b/llvm/test/MC/RISCV/XTHeadVdot-valid.s
index 2e00bd1cac3e..ab411dfac730 100644
--- a/llvm/test/MC/RISCV/XTHeadVdot-valid.s
+++ b/llvm/test/MC/RISCV/XTHeadVdot-valid.s
@@ -12,82 +12,82 @@ th.vmaqau.vv v8, v20, v4, v0.t
 # CHECK-INST: th.vmaqau.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x0b,0x64,0x4a,0x88]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 4a 88 <unknown>
+# CHECK-UNKNOWN: 884a640b <unknown>
 
 th.vmaqau.vv v8, v20, v4
 # CHECK-INST: th.vmaqau.vv v8, v20, v4
 # CHECK-ENCODING: [0x0b,0x64,0x4a,0x8a]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 4a 8a <unknown>
+# CHECK-UNKNOWN: 8a4a640b <unknown>
 
 th.vmaqau.vx v8, a0, v4, v0.t
 # CHECK-INST: th.vmaqau.vx v8, a0, v4, v0.t
 # CHECK-ENCODING: [0x0b,0x64,0x45,0x8c]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 8c <unknown>
+# CHECK-UNKNOWN: 8c45640b <unknown>
 
 th.vmaqau.vx v8, a0, v4
 # CHECK-INST: th.vmaqau.vx v8, a0, v4
 # CHECK-ENCODING: [0x0b,0x64,0x45,0x8e]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 8e <unknown>
+# CHECK-UNKNOWN: 8e45640b <unknown>
 
 th.vmaqa.vv v8, v20, v4, v0.t
 # CHECK-INST: th.vmaqa.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x0b,0x64,0x4a,0x80]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 4a 80 <unknown>
+# CHECK-UNKNOWN: 804a640b <unknown>
 
 th.vmaqa.vv v8, v20, v4
 # CHECK-INST: th.vmaqa.vv v8, v20, v4
 # CHECK-ENCODING: [0x0b,0x64,0x4a,0x82]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 4a 82 <unknown>
+# CHECK-UNKNOWN: 824a640b <unknown>
 
 th.vmaqa.vx v8, a0, v4, v0.t
 # CHECK-INST: th.vmaqa.vx v8, a0, v4, v0.t
 # CHECK-ENCODING: [0x0b,0x64,0x45,0x84]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 84 <unknown>
+# CHECK-UNKNOWN: 8445640b <unknown>
 
 th.vmaqa.vx v8, a0, v4
 # CHECK-INST: th.vmaqa.vx v8, a0, v4
 # CHECK-ENCODING: [0x0b,0x64,0x45,0x86]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 86 <unknown>
+# CHECK-UNKNOWN: 8645640b <unknown>
 
 th.vmaqasu.vv v8, v20, v4, v0.t
 # CHECK-INST: th.vmaqasu.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x0b,0x64,0x4a,0x90]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 4a 90 <unknown>
+# CHECK-UNKNOWN: 904a640b <unknown>
 
 th.vmaqasu.vv v8, v20, v4
 # CHECK-INST: th.vmaqasu.vv v8, v20, v4
 # CHECK-ENCODING: [0x0b,0x64,0x4a,0x92]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 4a 92 <unknown>
+# CHECK-UNKNOWN: 924a640b <unknown>
 
 th.vmaqasu.vx v8, a0, v4, v0.t
 # CHECK-INST: th.vmaqasu.vx v8, a0, v4, v0.t
 # CHECK-ENCODING: [0x0b,0x64,0x45,0x94]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 94 <unknown>
+# CHECK-UNKNOWN: 9445640b <unknown>
 
 th.vmaqasu.vx v8, a0, v4
 # CHECK-INST: th.vmaqasu.vx v8, a0, v4
 # CHECK-ENCODING: [0x0b,0x64,0x45,0x96]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 96 <unknown>
+# CHECK-UNKNOWN: 9645640b <unknown>
 
 th.vmaqaus.vx v8, a0, v4, v0.t
 # CHECK-INST: th.vmaqaus.vx v8, a0, v4, v0.t
 # CHECK-ENCODING: [0x0b,0x64,0x45,0x9c]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 9c <unknown>
+# CHECK-UNKNOWN: 9c45640b <unknown>
 
 th.vmaqaus.vx v8, a0, v4
 # CHECK-INST: th.vmaqaus.vx v8, a0, v4
 # CHECK-ENCODING: [0x0b,0x64,0x45,0x9e]
 # CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 9e <unknown>
+# CHECK-UNKNOWN: 9e45640b <unknown>
diff --git a/llvm/test/MC/RISCV/align.s b/llvm/test/MC/RISCV/align.s
index 2eb7186d0de9..32cc071b613c 100644
--- a/llvm/test/MC/RISCV/align.s
+++ b/llvm/test/MC/RISCV/align.s
@@ -98,11 +98,11 @@ test:
 # The behavior is the same as GNU assembler.
 	.p2align 4, 1
 # RELAX-RELOC-NOT: R_RISCV_ALIGN - 0xC
-# RELAX-INST:  01 01
-# RELAX-INST:  01 01
+# RELAX-INST:  0101
+# RELAX-INST:  0101
 # C-OR-ZCA-EXT-RELAX-RELOC-NOT: R_RISCV_ALIGN - 0xE
-# C-OR-ZCA-EXT-RELAX-INST:  01 01
-# C-EXT-INST:  01 01
+# C-OR-ZCA-EXT-RELAX-INST:  0101
+# C-EXT-INST:  0101
 	ret
 # NORELAX-RELOC-NOT: R_RISCV
 # C-OR-ZCA-EXT-NORELAX-RELOC-NOT: R_RISCV
diff --git a/llvm/test/MC/RISCV/cfi-advance.s b/llvm/test/MC/RISCV/cfi-advance.s
index c4af390be757..b99af38f553a 100644
--- a/llvm/test/MC/RISCV/cfi-advance.s
+++ b/llvm/test/MC/RISCV/cfi-advance.s
@@ -1,13 +1,27 @@
 # RUN: llvm-mc -filetype=obj -triple riscv32 %s -o %t.o  
-# RUN: llvm-readobj -r %t.o | FileCheck -check-prefix=CHECK %s
+# RUN: llvm-readelf -sr %t.o | FileCheck %s
 # RUN: llvm-dwarfdump --debug-frame %t.o 2>&1 \
 # RUN:     | FileCheck -check-prefix=CHECK-DWARFDUMP %s
 
-# CHECK:      .rela.eh_frame {
-# CHECK-NEXT:   0x1C R_RISCV_32_PCREL <null> 0x0
-# CHECK-NEXT:   0x35 R_RISCV_SET6 <null> 0x0
-# CHECK-NEXT:   0x35 R_RISCV_SUB6 <null> 0x0
-# CHECK-NEXT: }
+
+# CHECK:      Relocation section '.rela.text1' at offset {{.*}} contains 1 entries:
+# CHECK-NEXT:  Offset     Info    Type                Sym. Value  Symbol's Name + Addend
+# CHECK-NEXT: 00000000  00000313 R_RISCV_CALL_PLT       00000004   .L0 + 0
+# CHECK-EMPTY:
+# CHECK-NEXT: Relocation section '.rela.eh_frame' at offset {{.*}} contains 3 entries:
+# CHECK:       Offset     Info    Type                Sym. Value  Symbol's Name + Addend
+# CHECK-NEXT: 0000001c  00000139 R_RISCV_32_PCREL       00000000   .L0 + 0
+# CHECK-NEXT: 00000035  00000b35 R_RISCV_SET6           00010178   .L0 + 0
+# CHECK-NEXT: 00000035  00000934 R_RISCV_SUB6           0001016e   .L0 + 0
+# CHECK-EMPTY:
+# CHECK:      Symbol table '.symtab' contains 15 entries:
+# CHECK-NEXT:    Num:    Value  Size Type    Bind   Vis       Ndx Name
+# CHECK-NEXT:      0: 00000000     0 NOTYPE  LOCAL  DEFAULT   UND
+# CHECK-NEXT:      1: 00000000     0 NOTYPE  LOCAL  DEFAULT     2 .L0 {{$}}
+# CHECK:           3: 00000004     0 NOTYPE  LOCAL  DEFAULT     2 .L0{{$}}
+# CHECK:           9: 0001016e     0 NOTYPE  LOCAL  DEFAULT     2 .L0 {{$}}
+# CHECK:          11: 00010178     0 NOTYPE  LOCAL  DEFAULT     2 .L0 {{$}}
+
 # CHECK-DWARFDUMP: DW_CFA_advance_loc1: 104
 # CHECK-DWARFDUMP-NEXT: DW_CFA_def_cfa_offset: +8
 # CHECK-DWARFDUMP-NEXT: DW_CFA_advance_loc2: 259
@@ -23,6 +37,9 @@
 test:
         .cfi_startproc
         nop
+## This looks similar to fake label names ".L0 ". Even if this is ".L0 ",
+## the assembler will not conflate it with fake labels.
+.L0:
         .zero 100, 0x90
         .cfi_def_cfa_offset 8
         nop
@@ -36,3 +53,6 @@ test:
         .cfi_def_cfa_offset 8
         nop
         .cfi_endproc
+
+.section .text1,"ax"
+call .L0
diff --git a/llvm/test/MC/RISCV/compress-cjal.s b/llvm/test/MC/RISCV/compress-cjal.s
index 31b9c30c2b01..d55586b005c7 100644
--- a/llvm/test/MC/RISCV/compress-cjal.s
+++ b/llvm/test/MC/RISCV/compress-cjal.s
@@ -11,7 +11,7 @@
 
 # c.jal is an rv32 only instruction.
 jal ra, 2046
-# CHECK-BYTES: fd 2f
+# CHECK-BYTES: 2ffd
 # CHECK-ALIASOBJ: jal 0x7fe
 # CHECK-ALIAS: jal 2046
 # CHECK-INST: c.jal 2046
diff --git a/llvm/test/MC/RISCV/compress-rv32d.s b/llvm/test/MC/RISCV/compress-rv32d.s
index bebc78ef8690..c41a08892862 100644
--- a/llvm/test/MC/RISCV/compress-rv32d.s
+++ b/llvm/test/MC/RISCV/compress-rv32d.s
@@ -43,22 +43,22 @@
 # Tests double precision floating point instructions available in rv32 and in rv64.
 
 fld ft0, 64(sp)
-# CHECK-BYTES: 06 20
+# CHECK-BYTES: 2006
 # CHECK-ALIAS: fld ft0, 64(sp)
 # CHECK-INST: c.fldsp ft0, 64(sp)
 # CHECK: # encoding:  [0x06,0x20]
 fsd ft0, 64(sp)
-# CHECK-BYTES: 82 a0
+# CHECK-BYTES: a082
 # CHECK-ALIAS: fsd ft0, 64(sp)
 # CHECK-INST: c.fsdsp ft0, 64(sp)
 # CHECK: # encoding:  [0x82,0xa0]
 fld fs0, 248(s0)
-# CHECK-BYTES: 60 3c
+# CHECK-BYTES: 3c60
 # CHECK-ALIAS: fld fs0, 248(s0)
 # CHECK-INST: c.fld fs0, 248(s0)
 # CHECK: # encoding:  [0x60,0x3c]
 fsd fs0, 248(s0)
-# CHECK-BYTES: 60 bc
+# CHECK-BYTES: bc60
 # CHECK-ALIAS: fsd fs0, 248(s0)
 # CHECK-INST: c.fsd fs0, 248(s0)
 # CHECK: # encoding:  [0x60,0xbc]
diff --git a/llvm/test/MC/RISCV/compress-rv32f.s b/llvm/test/MC/RISCV/compress-rv32f.s
index 3f0c69fb9893..afe15c598bb6 100644
--- a/llvm/test/MC/RISCV/compress-rv32f.s
+++ b/llvm/test/MC/RISCV/compress-rv32f.s
@@ -21,22 +21,22 @@
 
 # Instructions that are 32 bit only.
 flw ft0, 124(sp)
-# CHECK-BYTES: 76 70
+# CHECK-BYTES: 7076
 # CHECK-ALIAS: flw     ft0, 124(sp)
 # CHECK-INST: c.flwsp ft0, 124(sp)
 # CHECK:  # encoding: [0x76,0x70]
 fsw ft0, 124(sp)
-# CHECK-BYTES: 82 fe
+# CHECK-BYTES: fe82
 # CHECK-ALIAS: fsw ft0, 124(sp)
 # CHECK-INST: c.fswsp ft0, 124(sp)
 # CHECK:  # encoding: [0x82,0xfe]
 flw fs0, 124(s0)
-# CHECK-BYTES: 60 7c
+# CHECK-BYTES: 7c60
 # CHECK-ALIAS: flw fs0, 124(s0)
 # CHECK-INST: c.flw fs0, 124(s0)
 # CHECK:  # encoding:  [0x60,0x7c]
 fsw fs0, 124(s0)
-# CHECK-BYTES: 60 fc
+# CHECK-BYTES: fc60
 # CHECK-ALIAS: fsw fs0, 124(s0)
 # CHECK-INST: c.fsw fs0, 124(s0)
 # CHECK:  # encoding:  [0x60,0xfc]
diff --git a/llvm/test/MC/RISCV/compress-rv32i.s b/llvm/test/MC/RISCV/compress-rv32i.s
index b4fd72a0f81c..a75bea32ac0c 100644
--- a/llvm/test/MC/RISCV/compress-rv32i.s
+++ b/llvm/test/MC/RISCV/compress-rv32i.s
@@ -20,121 +20,121 @@
 # RUN:   | llvm-objdump  --triple=riscv64 --mattr=+c --no-print-imm-hex -d -M no-aliases - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-INST,CHECK-INSTOBJ64 %s
 
-# CHECK-BYTES: 2e 85
+# CHECK-BYTES: 852e
 # CHECK-ALIAS: mv a0, a1
 # CHECK-INST: c.mv a0, a1
 # CHECK: # encoding:  [0x2e,0x85]
 addi a0, a1, 0
 
-# CHECK-BYTES: e0 1f
+# CHECK-BYTES: 1fe0
 # CHECK-ALIAS: addi s0, sp, 1020
 # CHECK-INST: c.addi4spn s0, sp, 1020
 # CHECK: # encoding:  [0xe0,0x1f]
 addi s0, sp, 1020
 
-# CHECK-BYTES: e0 5f
+# CHECK-BYTES: 5fe0
 # CHECK-ALIAS: lw s0, 124(a5)
 # CHECK-INST: c.lw s0, 124(a5)
 # CHECK: # encoding: [0xe0,0x5f]
 lw s0, 124(a5)
 
-# CHECK-BYTES: e0 df
+# CHECK-BYTES: dfe0
 # CHECK-ALIAS: sw s0, 124(a5)
 # CHECK-INST: c.sw s0, 124(a5)
 # CHECK: # encoding: [0xe0,0xdf]
 sw s0, 124(a5)
 
-# CHECK-BYTES: 01 00
+# CHECK-BYTES: 0001
 # CHECK-ALIAS: nop
 # CHECK-INST: c.nop
 # CHECK: # encoding: [0x01,0x00]
 nop
 
-# CHECK-BYTES: 81 10
+# CHECK-BYTES: 1081
 # CHECK-ALIAS: addi ra, ra, -32
 # CHECK-INST: c.addi ra, -32
 # CHECK: # encoding:  [0x81,0x10]
 addi ra, ra, -32
 
-# CHECK-BYTES: 85 50
+# CHECK-BYTES: 5085
 # CHECK-ALIAS: li ra, -31
 # CHECK-INST: c.li ra, -31
 # CHECK: # encoding: [0x85,0x50]
 li ra, -31
 
-# CHECK-BYTES: 39 71
+# CHECK-BYTES: 7139
 # CHECK-ALIAS: addi sp, sp, -64
 # CHECK-INST: c.addi16sp sp, -64
 # CHECK:  # encoding: [0x39,0x71]
 addi sp, sp, -64
 
-# CHECK-BYTES: fd 61
+# CHECK-BYTES: 61fd
 # CHECK-ALIAS: lui gp, 31
 # CHECK-INST: c.lui gp, 31
 # CHECK: # encoding:  [0xfd,0x61]
 lui gp, 31
 
-# CHECK-BYTES: 7d 80
+# CHECK-BYTES: 807d
 # CHECK-ALIAS: srli s0, s0, 31
 # CHECK-INST: c.srli s0, 31
 # CHECK: # encoding:  [0x7d,0x80]
 srli s0, s0, 31
 
-# CHECK-BYTES: 7d 84
+# CHECK-BYTES: 847d
 # CHECK-ALIAS: srai s0, s0, 31
 # CHECK-INST: c.srai s0, 31
 # CHECK: # encoding: [0x7d,0x84]
 srai s0, s0, 31
 
-# CHECK-BYTES: 7d 88
+# CHECK-BYTES: 887d
 # CHECK-ALIAS: andi s0, s0, 31
 # CHECK-INST: c.andi s0, 31
 # CHECK: # encoding: [0x7d,0x88]
 andi s0, s0, 31
 
-# CHECK-BYTES: 1d 8c
+# CHECK-BYTES: 8c1d
 # CHECK-ALIAS: sub s0, s0, a5
 # CHECK-INST: c.sub s0, a5
 # CHECK: # encoding: [0x1d,0x8c]
 sub s0, s0, a5
 
-# CHECK-BYTES: 3d 8c
+# CHECK-BYTES: 8c3d
 # CHECK-ALIAS: xor s0, s0, a5
 # CHECK-INST: c.xor s0, a5
 # CHECK: # encoding: [0x3d,0x8c]
 xor s0, s0, a5
 
-# CHECK-BYTES: 3d 8c
+# CHECK-BYTES: 8c3d
 # CHECK-ALIAS: xor s0, s0, a5
 # CHECK-INST: c.xor s0, a5
 # CHECK: # encoding: [0x3d,0x8c]
 xor s0, a5, s0
 
-# CHECK-BYTES: 5d 8c
+# CHECK-BYTES: 8c5d
 # CHECK-ALIAS: or s0, s0, a5
 # CHECK-INST: c.or s0, a5
 # CHECK: # encoding:  [0x5d,0x8c]
 or s0, s0, a5
 
-# CHECK-BYTES: 45 8c
+# CHECK-BYTES: 8c45
 # CHECK-ALIAS: or s0, s0, s1
 # CHECK-INST: c.or s0, s1
 # CHECK:  # encoding: [0x45,0x8c]
 or  s0, s1, s0
 
-# CHECK-BYTES: 7d 8c
+# CHECK-BYTES: 8c7d
 # CHECK-ALIAS: and s0, s0, a5
 # CHECK-INST: c.and s0, a5
 # CHECK: # encoding: [0x7d,0x8c]
 and s0, s0, a5
 
-# CHECK-BYTES: 7d 8c
+# CHECK-BYTES: 8c7d
 # CHECK-ALIAS: and s0, s0, a5
 # CHECK-INST: c.and s0, a5
 # CHECK: # encoding: [0x7d,0x8c]
 and s0, a5, s0
 
-# CHECK-BYTES: 01 b0
+# CHECK-BYTES: b001
 # CHECK-ALIASASM: j -2048
 # CHECK-ALIASOBJ32: j 0xfffff826
 # CHECK-ALIASOBJ64: j 0xfffffffffffff826
@@ -144,7 +144,7 @@ and s0, a5, s0
 # CHECK:  # encoding: [0x01,0xb0]
 jal zero, -2048
 
-# CHECK-BYTES: 01 d0
+# CHECK-BYTES: d001
 # CHECK-ALIASASM: beqz s0, -256
 # CHECK-ALIASOBJ32: beqz s0, 0xffffff28
 # CHECK-ALIASOBJ64: beqz s0, 0xffffffffffffff28
@@ -154,7 +154,7 @@ jal zero, -2048
 # CHECK: # encoding: [0x01,0xd0]
 beq s0, zero, -256
 
-# CHECK-BYTES: 01 d0
+# CHECK-BYTES: d001
 # CHECK-ALIASASM: beqz s0, -256
 # CHECK-ALIASOBJ32: beqz s0, 0xffffff2a
 # CHECK-ALIASOBJ64: beqz s0, 0xffffffffffffff2a
@@ -164,7 +164,7 @@ beq s0, zero, -256
 # CHECK: # encoding: [0x01,0xd0]
 beq zero, s0, -256
 
-# CHECK-BYTES: 7d ec
+# CHECK-BYTES: ec7d
 # CHECK-ALIASASM: bnez s0, 254
 # CHECK-ALIASOBJ32: bnez s0, 0x12a
 # CHECK-ALIASOBJ64: bnez s0, 0x12a
@@ -174,7 +174,7 @@ beq zero, s0, -256
 # CHECK: # encoding: [0x7d,0xec]
 bne s0, zero, 254
 
-# CHECK-BYTES: 7d ec
+# CHECK-BYTES: ec7d
 # CHECK-ALIASASM: bnez s0, 254
 # CHECK-ALIASOBJ32: bnez s0, 0x12c
 # CHECK-ALIASOBJ64: bnez s0, 0x12c
@@ -184,67 +184,67 @@ bne s0, zero, 254
 # CHECK: # encoding: [0x7d,0xec]
 bne zero, s0, 254
 
-# CHECK-BYTES: 7e 04
+# CHECK-BYTES: 047e
 # CHECK-ALIAS: slli s0, s0, 31
 # CHECK-INST: c.slli s0, 31
 # CHECK: # encoding:  [0x7e,0x04]
 slli s0, s0, 31
 
-# CHECK-BYTES: fe 50
+# CHECK-BYTES: 50fe
 # CHECK-ALIAS: lw ra, 252(sp)
 # CHECK-INST: c.lwsp  ra, 252(sp)
 # CHECK: # encoding:  [0xfe,0x50]
 lw ra, 252(sp)
 
-# CHECK-BYTES: 82 80
+# CHECK-BYTES: 8082
 # CHECK-ALIAS: ret
 # CHECK-INST: c.jr ra
 # CHECK: # encoding:  [0x82,0x80]
 jalr zero, 0(ra)
 
-# CHECK-BYTES: 92 80
+# CHECK-BYTES: 8092
 # CHECK-ALIAS: mv ra, tp
 # CHECK-INST: c.mv ra, tp
 # CHECK:  # encoding: [0x92,0x80]
 add ra, zero, tp
 
-# CHECK-BYTES: 92 80
+# CHECK-BYTES: 8092
 # CHECK-ALIAS: mv ra, tp
 # CHECK-INST: c.mv ra, tp
 # CHECK:  # encoding: [0x92,0x80]
 add ra, tp, zero
 
-# CHECK-BYTES: 02 90
+# CHECK-BYTES: 9002
 # CHECK-ALIAS: ebreak
 # CHECK-INST: c.ebreak
 # CHECK: # encoding: [0x02,0x90]
 ebreak
 
-# CHECK-BYTES: 02 94
+# CHECK-BYTES: 9402
 # CHECK-ALIAS: jalr s0
 # CHECK-INST: c.jalr s0
 # CHECK: # encoding: [0x02,0x94]
 jalr ra, 0(s0)
 
-# CHECK-BYTES: 3e 94
+# CHECK-BYTES: 943e
 # CHECK-ALIAS: add s0, s0, a5
 # CHECK-INST: c.add s0, a5
 # CHECK: # encoding:  [0x3e,0x94]
 add s0, a5, s0
 
-# CHECK-BYTES: 3e 94
+# CHECK-BYTES: 943e
 # CHECK-ALIAS: add s0, s0, a5
 # CHECK-INST: c.add s0, a5
 # CHECK: # encoding:  [0x3e,0x94]
 add s0, s0, a5
 
-# CHECK-BYTES: 82 df
+# CHECK-BYTES: df82
 # CHECK-ALIAS: sw zero, 252(sp)
 # CHECK-INST: c.swsp zero, 252(sp)
 # CHECK: # encoding: [0x82,0xdf]
 sw zero, 252(sp)
 
-# CHECK-BYTES: 00 00
+# CHECK-BYTES: 0000
 # CHECK-ALIAS: unimp
 # CHECK-INST: c.unimp
 # CHECK: # encoding: [0x00,0x00]
diff --git a/llvm/test/MC/RISCV/compress-rv64i.s b/llvm/test/MC/RISCV/compress-rv64i.s
index 55d24f0d41c0..ab5b24307cd1 100644
--- a/llvm/test/MC/RISCV/compress-rv64i.s
+++ b/llvm/test/MC/RISCV/compress-rv64i.s
@@ -11,49 +11,49 @@
 
 # Tests compressed instructions available in rv64 and not in rv32.
 
-# CHECK-BYTES: e0 7f
+# CHECK-BYTES: 7fe0
 # CHECK-ALIAS: ld s0, 248(a5)
 # CHECK-INST: c.ld s0, 248(a5)
 # CHECK: # encoding: [0xe0,0x7f]
 ld s0, 248(a5)
 
-# CHECK-BYTES: a0 e3
+# CHECK-BYTES: e3a0
 # CHECK-ALIAS: sd s0, 64(a5)
 # CHECK-INST: c.sd s0, 64(a5)
 # CHECK: # encoding: [0xa0,0xe3]
 sd s0, 64(a5)
 
-# CHECK-BYTES: 7d 22
+# CHECK-BYTES: 227d
 # CHEACK-ALIAS: addiw tp, tp, 31
 # CHECK-INST: c.addiw  tp, 31
 # CHECK: # encoding: [0x7d,0x22]
 addiw tp, tp, 31
 
-# CHECK-BYTES: 1d 9c
+# CHECK-BYTES: 9c1d
 # CHEACK-ALIAS: subw s0, s0, a5
 # CHECK-INST: c.subw s0, a5
 # CHECK: # encoding:  [0x1d,0x9c]
 subw s0, s0, a5
 
-# CHECK-BYTES: 3d 9c
+# CHECK-BYTES: 9c3d
 # CHECK-ALIAS: addw s0, s0, a5
 # CHECK-INST: c.addw s0, a5
 # CHECK: # encoding: [0x3d,0x9c]
 addw s0, s0, a5
 
-# CHECK-BYTES: 3d 9c
+# CHECK-BYTES: 9c3d
 # CHECK-ALIAS: addw s0, s0, a5
 # CHECK-INST: c.addw s0, a5
 # CHECK: # encoding: [0x3d,0x9c]
 addw s0, a5, s0
 
-# CHECK-BYTES: ee 70
+# CHECK-BYTES: 70ee
 # CHECK-ALIAS: ld ra, 248(sp)
 # CHECK-INST: c.ldsp ra, 248(sp)
 # CHECK: # encoding:  [0xee,0x70]
 ld ra, 248(sp)
 
-# CHECK-BYTES: a2 e0
+# CHECK-BYTES: e0a2
 # CHECK-ALIAS: sd s0, 64(sp)
 # CHECK-INST: c.sdsp s0, 64(sp)
 # CHECK: # encoding: [0xa2,0xe0]
diff --git a/llvm/test/MC/RISCV/fde-reloc.s b/llvm/test/MC/RISCV/fde-reloc.s
index 1db8929e0747..81ec426c8b61 100644
--- a/llvm/test/MC/RISCV/fde-reloc.s
+++ b/llvm/test/MC/RISCV/fde-reloc.s
@@ -12,7 +12,7 @@ func:
 	.cfi_endproc
 
 # CHECK:   Section (4) .rela.eh_frame {
-# CHECK-NEXT:   0x1C R_RISCV_32_PCREL <null> 0x0
+# CHECK-NEXT:   0x1C R_RISCV_32_PCREL .L0  0x0
 # CHECK-NEXT: }
 # CHECK:      Hex dump of section '.eh_frame':
 # CHECK-NEXT: 0x00000000 10000000 00000000 017a5200 017c0101
diff --git a/llvm/test/MC/RISCV/invalid-attribute.s b/llvm/test/MC/RISCV/invalid-attribute.s
index 1d732af83cda..2989e80b269a 100644
--- a/llvm/test/MC/RISCV/invalid-attribute.s
+++ b/llvm/test/MC/RISCV/invalid-attribute.s
@@ -11,7 +11,7 @@
 # CHECK: [[@LINE-1]]:12: error: attribute name not recognised: unknown
 
 .attribute arch, "foo"
-# CHECK: [[@LINE-1]]:18: error: invalid arch name 'foo', string must begin with rv32{i,e,g} or rv64{i,e,g}
+# CHECK: [[@LINE-1]]:18: error: invalid arch name 'foo', string must begin with rv32{i,e,g}, rv64{i,e,g}, or a supported profile name{{$}}
 
 .attribute arch, "rv32i2p1_y2p0"
 # CHECK: [[@LINE-1]]:18: error: invalid arch name 'rv32i2p1_y2p0', invalid standard user-level extension 'y'
diff --git a/llvm/test/MC/RISCV/large-instructions.s b/llvm/test/MC/RISCV/large-instructions.s
new file mode 100644
index 000000000000..b50dbde17d38
--- /dev/null
+++ b/llvm/test/MC/RISCV/large-instructions.s
@@ -0,0 +1,29 @@
+# RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
+# RUN:     | llvm-objdump -d - | FileCheck %s
+
+# CHECK: 011f 4523 8967 <unknown>
+.byte 0x1f, 0x01, 0x23, 0x45, 0x67, 0x89
+
+# CHECK: 4523013f cdab8967 <unknown>
+.byte 0x3f, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd
+
+# CHECK: 007f 4523 8967 cdab feef <unknown>
+.byte 0x7f, 0x00, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe
+
+# CHECK: 4523107f cdab8967 badcfeef <unknown>
+.byte 0x7f, 0x10, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba
+
+# CHECK: 207f 4523 8967 cdab feef badc 7698 <unknown>
+.byte 0x7f, 0x20, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76
+
+# CHECK: 4523307f cdab8967 badcfeef 32547698 <unknown>
+.byte 0x7f, 0x30, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32
+
+# CHECK: 407f 4523 8967 cdab feef badc 7698 3254 1210 <unknown>
+.byte 0x7f, 0x40, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12
+
+# CHECK: 4523507f cdab8967 badcfeef 32547698 56341210 <unknown>
+.byte 0x7f, 0x50, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12, 0x34, 0x56
+
+# CHECK: 607f 4523 8967 cdab feef badc 7698 3254 1210 5634 9a78 <unknown>
+.byte 0x7f, 0x60, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12, 0x34, 0x56, 0x78, 0x9a
diff --git a/llvm/test/MC/RISCV/nop-slide.s b/llvm/test/MC/RISCV/nop-slide.s
index f280d6e521e3..4dc888b3ba77 100644
--- a/llvm/test/MC/RISCV/nop-slide.s
+++ b/llvm/test/MC/RISCV/nop-slide.s
@@ -10,18 +10,18 @@
 auipc a0, 0
 
 # CHECK-RVC-NORELAX: 0000000000000000 <.text>:
-# CHECK-RVC-NORELAX-NEXT: 0: 00 00        	unimp
-# CHECK-RVC-NORELAX-NEXT: 2: 01 00        	nop
-# CHECK-RVC-NORELAX-NEXT: 4: 17 05 00 00  	auipc	a0, 0x0
+# CHECK-RVC-NORELAX-NEXT: 0: 0000      	unimp
+# CHECK-RVC-NORELAX-NEXT: 2: 0001      	nop
+# CHECK-RVC-NORELAX-NEXT: 4: 00000517  	auipc	a0, 0x0
 
 # CHECK-RVC-RELAX: 0000000000000000 <.text>:
-# CHECK-RVC-RELAX-NEXT:   0: 01 00        	nop
-# CHECK-RVC-RELAX-NEXT:   2: 00 01        	addi	s0, sp, 0x80
-# CHECK-RVC-RELAX-NEXT:   4: 00 17        	addi	s0, sp, 0x3a0
-# CHECK-RVC-RELAX-NEXT:   6: 05 00        	c.nop	0x1
-# CHECK-RVC-RELAX-NEXT:   8: 00           	<unknown>
+# CHECK-RVC-RELAX-NEXT:   0: 0001      	nop
+# CHECK-RVC-RELAX-NEXT:   2: 0100      	addi	s0, sp, 0x80
+# CHECK-RVC-RELAX-NEXT:   4: 1700      	addi	s0, sp, 0x3a0
+# CHECK-RVC-RELAX-NEXT:   6: 0005      	c.nop	0x1
+# CHECK-RVC-RELAX-NEXT:   8: 00        	<unknown>
 
 # CHECK: 0000000000000000 <.text>:
-# CHECK-NEXT: 0: 00 00        	<unknown>
-# CHECK-NEXT: 2: 00 00        	<unknown>
-# CHECK-NEXT: 4: 17 05 00 00  	auipc	a0, 0x0
+# CHECK-NEXT: 0: 0000      	<unknown>
+# CHECK-NEXT: 2: 0000      	<unknown>
+# CHECK-NEXT: 4: 00000517  	auipc	a0, 0x0
diff --git a/llvm/test/MC/RISCV/option-pushpop.s b/llvm/test/MC/RISCV/option-pushpop.s
index c830d16e590b..9c61b5dab5f3 100644
--- a/llvm/test/MC/RISCV/option-pushpop.s
+++ b/llvm/test/MC/RISCV/option-pushpop.s
@@ -25,7 +25,7 @@
 call foo
 
 # CHECK-INST: addi s0, sp, 1020
-# CHECK-BYTES: 13 04 c1 3f
+# CHECK-BYTES: 3fc10413
 # CHECK-ALIAS: addi s0, sp, 1020
 addi s0, sp, 1020
 
@@ -45,14 +45,14 @@ call bar
 .option rvc
 # CHECK-INST: .option rvc
 # CHECK-INST: c.addi4spn s0, sp, 1020
-# CHECK-BYTES: e0 1f
+# CHECK-BYTES: 1fe0
 # CHECK-ALIAS: addi s0, sp, 1020
 addi s0, sp, 1020
 
 .option pop     # Pop relax=true, rvc=false
 # CHECK-INST: .option pop
 # CHECK-INST: addi s0, sp, 1020
-# CHECK-BYTES: 13 04 c1 3f
+# CHECK-BYTES: 3fc10413
 # CHECK-ALIAS: addi s0, sp, 1020
 addi s0, sp, 1020
 
@@ -69,7 +69,7 @@ call bar
 call baz
 
 # CHECK-INST: addi s0, sp, 1020
-# CHECK-BYTES: 13 04 c1 3f
+# CHECK-BYTES: 3fc10413
 # CHECK-ALIAS: addi s0, sp, 1020
 addi s0, sp, 1020
 
diff --git a/llvm/test/MC/RISCV/option-rvc.s b/llvm/test/MC/RISCV/option-rvc.s
index 00c8ea167bcd..894fbab562d7 100644
--- a/llvm/test/MC/RISCV/option-rvc.s
+++ b/llvm/test/MC/RISCV/option-rvc.s
@@ -20,13 +20,13 @@
 # RUN:   | llvm-objdump  --triple=riscv64 --mattr=+c --no-print-imm-hex -d -M no-aliases - \
 # RUN:   | FileCheck -check-prefixes=CHECK-BYTES,CHECK-INST %s
 
-# CHECK-BYTES: 13 85 05 00
+# CHECK-BYTES: 00058513
 # CHECK-ALIAS: mv a0, a1
 # CHECK-INST: addi a0, a1, 0
 # CHECK: # encoding:  [0x13,0x85,0x05,0x00]
 addi a0, a1, 0
 
-# CHECK-BYTES: 13 04 c1 3f
+# CHECK-BYTES: 3fc10413
 # CHECK-ALIAS: addi s0, sp, 1020
 # CHECK-INST: addi s0, sp, 1020
 # CHECK: # encoding:  [0x13,0x04,0xc1,0x3f]
@@ -35,13 +35,13 @@ addi s0, sp, 1020
 
 # CHECK: .option rvc
 .option rvc
-# CHECK-BYTES: 2e 85
+# CHECK-BYTES: 852e
 # CHECK-ALIAS: mv a0, a1
 # CHECK-INST: c.mv a0, a1
 # CHECK: # encoding:  [0x2e,0x85]
 addi a0, a1, 0
 
-# CHECK-BYTES: e0 1f
+# CHECK-BYTES: 1fe0
 # CHECK-ALIAS: addi s0, sp, 1020
 # CHECK-INST: c.addi4spn s0, sp, 1020
 # CHECK: # encoding:  [0xe0,0x1f]
@@ -49,13 +49,13 @@ addi s0, sp, 1020
 
 # CHECK: .option norvc
 .option norvc
-# CHECK-BYTES: 13 85 05 00
+# CHECK-BYTES: 00058513
 # CHECK-ALIAS: mv a0, a1
 # CHECK-INST: addi a0, a1, 0
 # CHECK: # encoding:  [0x13,0x85,0x05,0x00]
 addi a0, a1, 0
 
-# CHECK-BYTES: 13 04 c1 3f
+# CHECK-BYTES: 3fc10413
 # CHECK-ALIAS: addi s0, sp, 1020
 # CHECK-INST: addi s0, sp, 1020
 # CHECK: # encoding:  [0x13,0x04,0xc1,0x3f]
@@ -63,13 +63,13 @@ addi s0, sp, 1020
 
 # CHECK: .option rvc
 .option rvc
-# CHECK-BYTES: 2e 85
+# CHECK-BYTES: 852e
 # CHECK-ALIAS: mv a0, a1
 # CHECK-INST: c.mv a0, a1
 # CHECK: # encoding:  [0x2e,0x85]
 addi a0, a1, 0
 
-# CHECK-BYTES: e0 1f
+# CHECK-BYTES: 1fe0
 # CHECK-ALIAS: addi s0, sp, 1020
 # CHECK-INST: c.addi4spn s0, sp, 1020
 # CHECK: # encoding:  [0xe0,0x1f]
@@ -77,13 +77,13 @@ addi s0, sp, 1020
 
 # CHECK: .option norvc
 .option norvc
-# CHECK-BYTES: 13 85 05 00
+# CHECK-BYTES: 00058513
 # CHECK-ALIAS: mv a0, a1
 # CHECK-INST: addi a0, a1, 0
 # CHECK: # encoding:  [0x13,0x85,0x05,0x00]
 addi a0, a1, 0
 
-# CHECK-BYTES: 13 04 c1 3f
+# CHECK-BYTES: 3fc10413
 # CHECK-ALIAS: addi s0, sp, 1020
 # CHECK-INST: addi s0, sp, 1020
 # CHECK: # encoding:  [0x13,0x04,0xc1,0x3f]
diff --git a/llvm/test/MC/RISCV/rv32e-invalid.s b/llvm/test/MC/RISCV/rv32e-invalid.s
index 9c19d3f40bcf..95dc156f250a 100644
--- a/llvm/test/MC/RISCV/rv32e-invalid.s
+++ b/llvm/test/MC/RISCV/rv32e-invalid.s
@@ -11,100 +11,100 @@
 # are rejected for RV32E/RV64E, when both assembling and disassembling.
 
 
-# CHECK-DIS: 37 18 00 00 <unknown>
+# CHECK-DIS: 00001837 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x16, 1
-# CHECK-DIS: b7 28 00 00 <unknown>
+# CHECK-DIS: 000028b7 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x17, 2
-# CHECK-DIS: 37 39 00 00 <unknown>
+# CHECK-DIS: 00003937 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x18, 3
-# CHECK-DIS: b7 49 00 00 <unknown>
+# CHECK-DIS: 000049b7 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x19, 4
-# CHECK-DIS: 37 5a 00 00 <unknown>
+# CHECK-DIS: 00005a37 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x20, 5
-# CHECK-DIS: b7 6a 00 00 <unknown>
+# CHECK-DIS: 00006ab7 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x21, 6
-# CHECK-DIS: 37 7b 00 00 <unknown>
+# CHECK-DIS: 00007b37 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x22, 7
-# CHECK-DIS: b7 8b 00 00 <unknown>
+# CHECK-DIS: 00008bb7 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x23, 8
-# CHECK-DIS: 37 9c 00 00 <unknown>
+# CHECK-DIS: 00009c37 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x24, 9
-# CHECK-DIS: b7 ac 00 00 <unknown>
+# CHECK-DIS: 0000acb7 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x25, 10
-# CHECK-DIS: 37 bd 00 00 <unknown>
+# CHECK-DIS: 0000bd37 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x26, 11
-# CHECK-DIS: b7 cd 00 00 <unknown>
+# CHECK-DIS: 0000cdb7 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x27, 12
-# CHECK-DIS: 37 de 00 00 <unknown>
+# CHECK-DIS: 0000de37 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x28, 13
-# CHECK-DIS: b7 ee 00 00 <unknown>
+# CHECK-DIS: 0000eeb7 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x29, 14
-# CHECK-DIS: 37 ff 00 00 <unknown>
+# CHECK-DIS: 0000ff37 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x30, 15
-# CHECK-DIS: b7 0f 01 00 <unknown>
+# CHECK-DIS: 00010fb7 <unknown>
 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
 lui x31, 16
 
-# CHECK-DIS: 17 18 01 00 <unknown>
+# CHECK-DIS: 00011817 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc a6, 17
-# CHECK-DIS: 97 28 01 00 <unknown>
+# CHECK-DIS: 00012897 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc a7, 18
-# CHECK-DIS: 17 39 01 00 <unknown>
+# CHECK-DIS: 00013917 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc s2, 19
-# CHECK-DIS: 97 49 01 00 <unknown>
+# CHECK-DIS: 00014997 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc s3, 20
-# CHECK-DIS: 17 5a 01 00 <unknown>
+# CHECK-DIS: 00015a17 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc s4, 21
-# CHECK-DIS: 97 6a 01 00 <unknown>
+# CHECK-DIS: 00016a97 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc s5, 22
-# CHECK-DIS: 17 7b 01 00 <unknown>
+# CHECK-DIS: 00017b17 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc s6, 23
-# CHECK-DIS: 97 8b 01 00 <unknown>
+# CHECK-DIS: 00018b97 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc s7, 24
-# CHECK-DIS: 17 9c 01 00 <unknown>
+# CHECK-DIS: 00019c17 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc s8, 25
-# CHECK-DIS: 97 ac 01 00 <unknown>
+# CHECK-DIS: 0001ac97 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc s9, 26
-# CHECK-DIS: 17 bd 01 00 <unknown>
+# CHECK-DIS: 0001bd17 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc s10, 27
-# CHECK-DIS: 97 cd 01 00 <unknown>
+# CHECK-DIS: 0001cd97 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc s11, 28
-# CHECK-DIS: 17 de 01 00 <unknown>
+# CHECK-DIS: 0001de17 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc t3, 29
-# CHECK-DIS: 97 ee 01 00 <unknown>
+# CHECK-DIS: 0001ee97 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc t4, 30
-# CHECK-DIS: 17 ff 01 00 <unknown>
+# CHECK-DIS: 0001ff17 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc t5, 31
-# CHECK-DIS: 97 0f 02 00 <unknown>
+# CHECK-DIS: 00020f97 <unknown>
 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
 auipc t6, 32
diff --git a/llvm/test/MC/RISCV/rvv/add.s b/llvm/test/MC/RISCV/rvv/add.s
index 89cef5dc0a4c..ebfe50f2d958 100644
--- a/llvm/test/MC/RISCV/rvv/add.s
+++ b/llvm/test/MC/RISCV/rvv/add.s
@@ -12,352 +12,352 @@ vadd.vv v8, v4, v20, v0.t
 # CHECK-INST: vadd.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 00 <unknown>
+# CHECK-UNKNOWN: 004a0457 <unknown>
 
 vadd.vv v8, v4, v20
 # CHECK-INST: vadd.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 02 <unknown>
+# CHECK-UNKNOWN: 024a0457 <unknown>
 
 vadd.vx v8, v4, a0, v0.t
 # CHECK-INST: vadd.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 00 <unknown>
+# CHECK-UNKNOWN: 00454457 <unknown>
 
 vadd.vx v8, v4, a0
 # CHECK-INST: vadd.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 02 <unknown>
+# CHECK-UNKNOWN: 02454457 <unknown>
 
 vadd.vi v8, v4, 15, v0.t
 # CHECK-INST: vadd.vi v8, v4, 15, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 00 <unknown>
+# CHECK-UNKNOWN: 0047b457 <unknown>
 
 vadd.vi v8, v4, 15
 # CHECK-INST: vadd.vi v8, v4, 15
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 02 <unknown>
+# CHECK-UNKNOWN: 0247b457 <unknown>
 
 vwaddu.vv v8, v4, v20, v0.t
 # CHECK-INST: vwaddu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a c0 <unknown>
+# CHECK-UNKNOWN: c04a2457 <unknown>
 
 vwaddu.vv v8, v4, v20
 # CHECK-INST: vwaddu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a c2 <unknown>
+# CHECK-UNKNOWN: c24a2457 <unknown>
 
 vwaddu.vx v8, v4, a0, v0.t
 # CHECK-INST: vwaddu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 c0 <unknown>
+# CHECK-UNKNOWN: c0456457 <unknown>
 
 vwaddu.vx v8, v4, a0
 # CHECK-INST: vwaddu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 c2 <unknown>
+# CHECK-UNKNOWN: c2456457 <unknown>
 
 vwadd.vv v8, v4, v20, v0.t
 # CHECK-INST: vwadd.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xc4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a c4 <unknown>
+# CHECK-UNKNOWN: c44a2457 <unknown>
 
 vwadd.vv v8, v4, v20
 # CHECK-INST: vwadd.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xc6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a c6 <unknown>
+# CHECK-UNKNOWN: c64a2457 <unknown>
 
 vwadd.vx v8, v4, a0, v0.t
 # CHECK-INST: vwadd.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xc4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 c4 <unknown>
+# CHECK-UNKNOWN: c4456457 <unknown>
 
 vwadd.vx v8, v4, a0
 # CHECK-INST: vwadd.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0xc6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 c6 <unknown>
+# CHECK-UNKNOWN: c6456457 <unknown>
 
 vwaddu.wv v8, v4, v20, v0.t
 # CHECK-INST: vwaddu.wv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xd0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a d0 <unknown>
+# CHECK-UNKNOWN: d04a2457 <unknown>
 
 vwaddu.wv v8, v4, v20
 # CHECK-INST: vwaddu.wv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xd2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a d2 <unknown>
+# CHECK-UNKNOWN: d24a2457 <unknown>
 
 vwaddu.wx v8, v4, a0, v0.t
 # CHECK-INST: vwaddu.wx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xd0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 d0 <unknown>
+# CHECK-UNKNOWN: d0456457 <unknown>
 
 vwaddu.wx v8, v4, a0
 # CHECK-INST: vwaddu.wx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0xd2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 d2 <unknown>
+# CHECK-UNKNOWN: d2456457 <unknown>
 
 vwadd.wv v8, v4, v20, v0.t
 # CHECK-INST: vwadd.wv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xd4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a d4 <unknown>
+# CHECK-UNKNOWN: d44a2457 <unknown>
 
 vwadd.wv v8, v4, v20
 # CHECK-INST: vwadd.wv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xd6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a d6 <unknown>
+# CHECK-UNKNOWN: d64a2457 <unknown>
 
 vwadd.wx v8, v4, a0, v0.t
 # CHECK-INST: vwadd.wx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xd4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 d4 <unknown>
+# CHECK-UNKNOWN: d4456457 <unknown>
 
 vwadd.wx v8, v4, a0
 # CHECK-INST: vwadd.wx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0xd6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 d6 <unknown>
+# CHECK-UNKNOWN: d6456457 <unknown>
 
 vadc.vvm v8, v4, v20, v0
 # CHECK-INST: vadc.vvm v8, v4, v20, v0
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 40 <unknown>
+# CHECK-UNKNOWN: 404a0457 <unknown>
 
 vadc.vvm v4, v4, v20, v0
 # CHECK-INST: vadc.vvm v4, v4, v20, v0
 # CHECK-ENCODING: [0x57,0x02,0x4a,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 02 4a 40 <unknown>
+# CHECK-UNKNOWN: 404a0257 <unknown>
 
 vadc.vvm v8, v4, v8, v0
 # CHECK-INST: vadc.vvm v8, v4, v8, v0
 # CHECK-ENCODING: [0x57,0x04,0x44,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 44 40 <unknown>
+# CHECK-UNKNOWN: 40440457 <unknown>
 
 vadc.vxm v8, v4, a0, v0
 # CHECK-INST: vadc.vxm v8, v4, a0, v0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 40 <unknown>
+# CHECK-UNKNOWN: 40454457 <unknown>
 
 vadc.vim v8, v4, 15, v0
 # CHECK-INST: vadc.vim v8, v4, 15, v0
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 40 <unknown>
+# CHECK-UNKNOWN: 4047b457 <unknown>
 
 vmadc.vvm v8, v4, v20, v0
 # CHECK-INST: vmadc.vvm v8, v4, v20, v0
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x44]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 44 <unknown>
+# CHECK-UNKNOWN: 444a0457 <unknown>
 
 vmadc.vvm v4, v4, v20, v0
 # CHECK-INST: vmadc.vvm v4, v4, v20, v0
 # CHECK-ENCODING: [0x57,0x02,0x4a,0x44]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 02 4a 44 <unknown>
+# CHECK-UNKNOWN: 444a0257 <unknown>
 
 vmadc.vvm v8, v4, v8, v0
 # CHECK-INST: vmadc.vvm v8, v4, v8, v0
 # CHECK-ENCODING: [0x57,0x04,0x44,0x44]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 44 44 <unknown>
+# CHECK-UNKNOWN: 44440457 <unknown>
 
 vmadc.vxm v8, v4, a0, v0
 # CHECK-INST: vmadc.vxm v8, v4, a0, v0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x44]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 44 <unknown>
+# CHECK-UNKNOWN: 44454457 <unknown>
 
 vmadc.vim v8, v4, 15, v0
 # CHECK-INST: vmadc.vim v8, v4, 15, v0
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x44]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 44 <unknown>
+# CHECK-UNKNOWN: 4447b457 <unknown>
 
 vmadc.vv v8, v4, v20
 # CHECK-INST: vmadc.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x46]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 46 <unknown>
+# CHECK-UNKNOWN: 464a0457 <unknown>
 
 vmadc.vx v8, v4, a0
 # CHECK-INST: vmadc.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x46]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 46 <unknown>
+# CHECK-UNKNOWN: 46454457 <unknown>
 
 vmadc.vi v8, v4, 15
 # CHECK-INST: vmadc.vi v8, v4, 15
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x46]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 46 <unknown>
+# CHECK-UNKNOWN: 4647b457 <unknown>
 
 vsaddu.vv v8, v4, v20, v0.t
 # CHECK-INST: vsaddu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 80 <unknown>
+# CHECK-UNKNOWN: 804a0457 <unknown>
 
 vsaddu.vv v8, v4, v20
 # CHECK-INST: vsaddu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x82]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 82 <unknown>
+# CHECK-UNKNOWN: 824a0457 <unknown>
 
 vsaddu.vx v8, v4, a0, v0.t
 # CHECK-INST: vsaddu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 80 <unknown>
+# CHECK-UNKNOWN: 80454457 <unknown>
 
 vsaddu.vx v8, v4, a0
 # CHECK-INST: vsaddu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x82]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 82 <unknown>
+# CHECK-UNKNOWN: 82454457 <unknown>
 
 vsaddu.vi v8, v4, 15, v0.t
 # CHECK-INST: vsaddu.vi v8, v4, 15, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 80 <unknown>
+# CHECK-UNKNOWN: 8047b457 <unknown>
 
 vsaddu.vi v8, v4, 15
 # CHECK-INST: vsaddu.vi v8, v4, 15
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x82]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 82 <unknown>
+# CHECK-UNKNOWN: 8247b457 <unknown>
 
 vsadd.vv v8, v4, v20, v0.t
 # CHECK-INST: vsadd.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x84]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 84 <unknown>
+# CHECK-UNKNOWN: 844a0457 <unknown>
 
 vsadd.vv v8, v4, v20
 # CHECK-INST: vsadd.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x86]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 86 <unknown>
+# CHECK-UNKNOWN: 864a0457 <unknown>
 
 vsadd.vx v8, v4, a0, v0.t
 # CHECK-INST: vsadd.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x84]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 84 <unknown>
+# CHECK-UNKNOWN: 84454457 <unknown>
 
 vsadd.vx v8, v4, a0
 # CHECK-INST: vsadd.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x86]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 86 <unknown>
+# CHECK-UNKNOWN: 86454457 <unknown>
 
 vsadd.vi v8, v4, 15, v0.t
 # CHECK-INST: vsadd.vi v8, v4, 15, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x84]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 84 <unknown>
+# CHECK-UNKNOWN: 8447b457 <unknown>
 
 vsadd.vi v8, v4, 15
 # CHECK-INST: vsadd.vi v8, v4, 15
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x86]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 86 <unknown>
+# CHECK-UNKNOWN: 8647b457 <unknown>
 
 vaadd.vv v8, v4, v20, v0.t
 # CHECK-INST: vaadd.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x24]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 24 <unknown>
+# CHECK-UNKNOWN: 244a2457 <unknown>
 
 vaadd.vv v8, v4, v20
 # CHECK-INST: vaadd.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x26]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 26 <unknown>
+# CHECK-UNKNOWN: 264a2457 <unknown>
 
 vaadd.vx v8, v4, a0, v0.t
 # CHECK-INST: vaadd.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x24]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 24 <unknown>
+# CHECK-UNKNOWN: 24456457 <unknown>
 
 vaadd.vx v8, v4, a0
 # CHECK-INST: vaadd.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x26]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 26 <unknown>
+# CHECK-UNKNOWN: 26456457 <unknown>
 
 vaaddu.vv v8, v4, v20, v0.t
 # CHECK-INST: vaaddu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x20]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 20 <unknown>
+# CHECK-UNKNOWN: 204a2457 <unknown>
 
 vaaddu.vv v8, v4, v20
 # CHECK-INST: vaaddu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 22 <unknown>
+# CHECK-UNKNOWN: 224a2457 <unknown>
 
 vaaddu.vx v8, v4, a0, v0.t
 # CHECK-INST: vaaddu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x20]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 20 <unknown>
+# CHECK-UNKNOWN: 20456457 <unknown>
 
 vaaddu.vx v8, v4, a0
 # CHECK-INST: vaaddu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 22 <unknown>
+# CHECK-UNKNOWN: 22456457 <unknown>
 
 vwcvt.x.x.v v8, v4, v0.t
 # CHECK-INST: vwcvt.x.x.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x40,0xc4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 40 c4 <unknown>
+# CHECK-UNKNOWN: c4406457 <unknown>
 
 vwcvt.x.x.v v8, v4
 # CHECK-INST: vwcvt.x.x.v v8, v4
 # CHECK-ENCODING: [0x57,0x64,0x40,0xc6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 40 c6 <unknown>
+# CHECK-UNKNOWN: c6406457 <unknown>
 
 vwcvtu.x.x.v v8, v4, v0.t
 # CHECK-INST: vwcvtu.x.x.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x40,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 40 c0 <unknown>
+# CHECK-UNKNOWN: c0406457 <unknown>
 
 vwcvtu.x.x.v v8, v4
 # CHECK-INST: vwcvtu.x.x.v v8, v4
 # CHECK-ENCODING: [0x57,0x64,0x40,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 40 c2 <unknown>
+# CHECK-UNKNOWN: c2406457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/and.s b/llvm/test/MC/RISCV/rvv/and.s
index 894263fe0152..b1182c175a5d 100644
--- a/llvm/test/MC/RISCV/rvv/and.s
+++ b/llvm/test/MC/RISCV/rvv/and.s
@@ -12,34 +12,34 @@ vand.vv v8, v4, v20, v0.t
 # CHECK-INST: vand.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x24]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 24 <unknown>
+# CHECK-UNKNOWN: 244a0457 <unknown>
 
 vand.vv v8, v4, v20
 # CHECK-INST: vand.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x26]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 26 <unknown>
+# CHECK-UNKNOWN: 264a0457 <unknown>
 
 vand.vx v8, v4, a0, v0.t
 # CHECK-INST: vand.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x24]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 24 <unknown>
+# CHECK-UNKNOWN: 24454457 <unknown>
 
 vand.vx v8, v4, a0
 # CHECK-INST: vand.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x26]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 26 <unknown>
+# CHECK-UNKNOWN: 26454457 <unknown>
 
 vand.vi v8, v4, 15, v0.t
 # CHECK-INST: vand.vi v8, v4, 15, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x24]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 24 <unknown>
+# CHECK-UNKNOWN: 2447b457 <unknown>
 
 vand.vi v8, v4, 15
 # CHECK-INST: vand.vi v8, v4, 15
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x26]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 26 <unknown>
+# CHECK-UNKNOWN: 2647b457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/clip.s b/llvm/test/MC/RISCV/rvv/clip.s
index f4fb2c576b30..70c23d8ec0b1 100644
--- a/llvm/test/MC/RISCV/rvv/clip.s
+++ b/llvm/test/MC/RISCV/rvv/clip.s
@@ -12,70 +12,70 @@ vnclipu.wv v8, v4, v20, v0.t
 # CHECK-INST: vnclipu.wv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xb8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a b8 <unknown>
+# CHECK-UNKNOWN: b84a0457 <unknown>
 
 vnclipu.wv v8, v4, v20
 # CHECK-INST: vnclipu.wv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xba]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a ba <unknown>
+# CHECK-UNKNOWN: ba4a0457 <unknown>
 
 vnclipu.wx v8, v4, a0, v0.t
 # CHECK-INST: vnclipu.wx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0xb8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 b8 <unknown>
+# CHECK-UNKNOWN: b8454457 <unknown>
 
 vnclipu.wx v8, v4, a0
 # CHECK-INST: vnclipu.wx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0xba]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 ba <unknown>
+# CHECK-UNKNOWN: ba454457 <unknown>
 
 vnclipu.wi v8, v4, 31, v0.t
 # CHECK-INST: vnclipu.wi v8, v4, 31, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xb8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f b8 <unknown>
+# CHECK-UNKNOWN: b84fb457 <unknown>
 
 vnclipu.wi v8, v4, 31
 # CHECK-INST: vnclipu.wi v8, v4, 31
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xba]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f ba <unknown>
+# CHECK-UNKNOWN: ba4fb457 <unknown>
 
 vnclip.wv v8, v4, v20, v0.t
 # CHECK-INST: vnclip.wv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xbc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a bc <unknown>
+# CHECK-UNKNOWN: bc4a0457 <unknown>
 
 vnclip.wv v8, v4, v20
 # CHECK-INST: vnclip.wv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xbe]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a be <unknown>
+# CHECK-UNKNOWN: be4a0457 <unknown>
 
 vnclip.wx v8, v4, a0, v0.t
 # CHECK-INST: vnclip.wx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0xbc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 bc <unknown>
+# CHECK-UNKNOWN: bc454457 <unknown>
 
 vnclip.wx v8, v4, a0
 # CHECK-INST: vnclip.wx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0xbe]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 be <unknown>
+# CHECK-UNKNOWN: be454457 <unknown>
 
 vnclip.wi v8, v4, 31, v0.t
 # CHECK-INST: vnclip.wi v8, v4, 31, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xbc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f bc <unknown>
+# CHECK-UNKNOWN: bc4fb457 <unknown>
 
 vnclip.wi v8, v4, 31
 # CHECK-INST: vnclip.wi v8, v4, 31
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xbe]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f be <unknown>
+# CHECK-UNKNOWN: be4fb457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/compare.s b/llvm/test/MC/RISCV/rvv/compare.s
index fe7c1144a3c0..b1b9518a1d5a 100644
--- a/llvm/test/MC/RISCV/rvv/compare.s
+++ b/llvm/test/MC/RISCV/rvv/compare.s
@@ -12,367 +12,367 @@ vmslt.vv v0, v4, v20, v0.t
 # CHECK-INST: vmslt.vv v0, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x00,0x4a,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 00 4a 6c <unknown>
+# CHECK-UNKNOWN: 6c4a0057 <unknown>
 
 vmseq.vv v8, v4, v20, v0.t
 # CHECK-INST: vmseq.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x60]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 60 <unknown>
+# CHECK-UNKNOWN: 604a0457 <unknown>
 
 vmseq.vv v8, v4, v20
 # CHECK-INST: vmseq.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 62 <unknown>
+# CHECK-UNKNOWN: 624a0457 <unknown>
 
 vmseq.vx v8, v4, a0, v0.t
 # CHECK-INST: vmseq.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x60]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 60 <unknown>
+# CHECK-UNKNOWN: 60454457 <unknown>
 
 vmseq.vx v8, v4, a0
 # CHECK-INST: vmseq.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 62 <unknown>
+# CHECK-UNKNOWN: 62454457 <unknown>
 
 vmseq.vi v8, v4, 15, v0.t
 # CHECK-INST: vmseq.vi v8, v4, 0xf, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x60]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 60 <unknown>
+# CHECK-UNKNOWN: 6047b457 <unknown>
 
 vmseq.vi v8, v4, 15
 # CHECK-INST: vmseq.vi v8, v4, 0xf
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 62 <unknown>
+# CHECK-UNKNOWN: 6247b457 <unknown>
 
 vmsne.vv v8, v4, v20, v0.t
 # CHECK-INST: vmsne.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x64]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 64 <unknown>
+# CHECK-UNKNOWN: 644a0457 <unknown>
 
 vmsne.vv v8, v4, v20
 # CHECK-INST: vmsne.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 66 <unknown>
+# CHECK-UNKNOWN: 664a0457 <unknown>
 
 vmsne.vx v8, v4, a0, v0.t
 # CHECK-INST: vmsne.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x64]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 64 <unknown>
+# CHECK-UNKNOWN: 64454457 <unknown>
 
 vmsne.vx v8, v4, a0
 # CHECK-INST: vmsne.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 66 <unknown>
+# CHECK-UNKNOWN: 66454457 <unknown>
 
 vmsne.vi v8, v4, 15, v0.t
 # CHECK-INST: vmsne.vi v8, v4, 0xf, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x64]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 64 <unknown>
+# CHECK-UNKNOWN: 6447b457 <unknown>
 
 vmsne.vi v8, v4, 15
 # CHECK-INST: vmsne.vi v8, v4, 0xf
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 66 <unknown>
+# CHECK-UNKNOWN: 6647b457 <unknown>
 
 vmsltu.vv v8, v4, v20, v0.t
 # CHECK-INST: vmsltu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x68]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 68 <unknown>
+# CHECK-UNKNOWN: 684a0457 <unknown>
 
 vmsltu.vv v8, v4, v20
 # CHECK-INST: vmsltu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 6a <unknown>
+# CHECK-UNKNOWN: 6a4a0457 <unknown>
 
 vmsltu.vx v8, v4, a0, v0.t
 # CHECK-INST: vmsltu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x68]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 68 <unknown>
+# CHECK-UNKNOWN: 68454457 <unknown>
 
 vmsltu.vx v8, v4, a0
 # CHECK-INST: vmsltu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 6a <unknown>
+# CHECK-UNKNOWN: 6a454457 <unknown>
 
 vmslt.vv v8, v4, v20, v0.t
 # CHECK-INST: vmslt.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 6c <unknown>
+# CHECK-UNKNOWN: 6c4a0457 <unknown>
 
 vmslt.vv v8, v4, v20
 # CHECK-INST: vmslt.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 6e <unknown>
+# CHECK-UNKNOWN: 6e4a0457 <unknown>
 
 vmslt.vx v8, v4, a0, v0.t
 # CHECK-INST: vmslt.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 6c <unknown>
+# CHECK-UNKNOWN: 6c454457 <unknown>
 
 vmslt.vx v8, v4, a0
 # CHECK-INST: vmslt.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 6e <unknown>
+# CHECK-UNKNOWN: 6e454457 <unknown>
 
 vmsleu.vv v8, v4, v20, v0.t
 # CHECK-INST: vmsleu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x70]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 70 <unknown>
+# CHECK-UNKNOWN: 704a0457 <unknown>
 
 vmsleu.vv v8, v4, v20
 # CHECK-INST: vmsleu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x72]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 72 <unknown>
+# CHECK-UNKNOWN: 724a0457 <unknown>
 
 vmsleu.vx v8, v4, a0, v0.t
 # CHECK-INST: vmsleu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x70]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 70 <unknown>
+# CHECK-UNKNOWN: 70454457 <unknown>
 
 vmsleu.vx v8, v4, a0
 # CHECK-INST: vmsleu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x72]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 72 <unknown>
+# CHECK-UNKNOWN: 72454457 <unknown>
 
 vmsleu.vi v8, v4, 15, v0.t
 # CHECK-INST: vmsleu.vi v8, v4, 0xf, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x70]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 70 <unknown>
+# CHECK-UNKNOWN: 7047b457 <unknown>
 
 vmsleu.vi v8, v4, 15
 # CHECK-INST: vmsleu.vi v8, v4, 0xf
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x72]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 72 <unknown>
+# CHECK-UNKNOWN: 7247b457 <unknown>
 
 vmsle.vv v8, v4, v20, v0.t
 # CHECK-INST: vmsle.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x74]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 74 <unknown>
+# CHECK-UNKNOWN: 744a0457 <unknown>
 
 vmsle.vv v8, v4, v20
 # CHECK-INST: vmsle.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x76]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 76 <unknown>
+# CHECK-UNKNOWN: 764a0457 <unknown>
 
 vmsle.vx v8, v4, a0, v0.t
 # CHECK-INST: vmsle.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x74]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 74 <unknown>
+# CHECK-UNKNOWN: 74454457 <unknown>
 
 vmsle.vx v8, v4, a0
 # CHECK-INST: vmsle.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x76]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 76 <unknown>
+# CHECK-UNKNOWN: 76454457 <unknown>
 
 vmsle.vi v8, v4, 15, v0.t
 # CHECK-INST: vmsle.vi v8, v4, 0xf, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x74]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 74 <unknown>
+# CHECK-UNKNOWN: 7447b457 <unknown>
 
 vmsle.vi v8, v4, 15
 # CHECK-INST: vmsle.vi v8, v4, 0xf
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x76]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 76 <unknown>
+# CHECK-UNKNOWN: 7647b457 <unknown>
 
 vmsgtu.vx v8, v4, a0, v0.t
 # CHECK-INST: vmsgtu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x78]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 78 <unknown>
+# CHECK-UNKNOWN: 78454457 <unknown>
 
 vmsgtu.vx v8, v4, a0
 # CHECK-INST: vmsgtu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x7a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 7a <unknown>
+# CHECK-UNKNOWN: 7a454457 <unknown>
 
 vmsgtu.vi v8, v4, 15, v0.t
 # CHECK-INST: vmsgtu.vi v8, v4, 0xf, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x78]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 78 <unknown>
+# CHECK-UNKNOWN: 7847b457 <unknown>
 
 vmsgtu.vi v8, v4, 15
 # CHECK-INST: vmsgtu.vi v8, v4, 0xf
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x7a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 7a <unknown>
+# CHECK-UNKNOWN: 7a47b457 <unknown>
 
 vmsgt.vx v8, v4, a0, v0.t
 # CHECK-INST: vmsgt.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x7c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 7c <unknown>
+# CHECK-UNKNOWN: 7c454457 <unknown>
 
 vmsgt.vx v8, v4, a0
 # CHECK-INST: vmsgt.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x7e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 7e <unknown>
+# CHECK-UNKNOWN: 7e454457 <unknown>
 
 vmsgt.vi v8, v4, 15, v0.t
 # CHECK-INST: vmsgt.vi v8, v4, 0xf, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x7c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 7c <unknown>
+# CHECK-UNKNOWN: 7c47b457 <unknown>
 
 vmsgt.vi v8, v4, 15
 # CHECK-INST: vmsgt.vi v8, v4, 0xf
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x7e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 7e <unknown>
+# CHECK-UNKNOWN: 7e47b457 <unknown>
 
 vmsgtu.vv v8, v20, v4, v0.t
 # CHECK-INST: vmsltu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x68]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 68 <unknown>
+# CHECK-UNKNOWN: 684a0457 <unknown>
 
 vmsgtu.vv v8, v20, v4
 # CHECK-INST: vmsltu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 6a <unknown>
+# CHECK-UNKNOWN: 6a4a0457 <unknown>
 
 vmsgt.vv v8, v20, v4, v0.t
 # CHECK-INST: vmslt.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 6c <unknown>
+# CHECK-UNKNOWN: 6c4a0457 <unknown>
 
 vmsgt.vv v8, v20, v4
 # CHECK-INST: vmslt.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 6e <unknown>
+# CHECK-UNKNOWN: 6e4a0457 <unknown>
 
 vmsgeu.vv v8, v20, v4, v0.t
 # CHECK-INST: vmsleu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x70]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 70 <unknown>
+# CHECK-UNKNOWN: 704a0457 <unknown>
 
 vmsgeu.vv v8, v20, v4
 # CHECK-INST: vmsleu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x72]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 72 <unknown>
+# CHECK-UNKNOWN: 724a0457 <unknown>
 
 vmsge.vv v8, v20, v4, v0.t
 # CHECK-INST: vmsle.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x74]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 74 <unknown>
+# CHECK-UNKNOWN: 744a0457 <unknown>
 
 vmsge.vv v8, v20, v4
 # CHECK-INST: vmsle.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x76]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 76 <unknown>
+# CHECK-UNKNOWN: 764a0457 <unknown>
 
 vmsltu.vi v8, v4, 16, v0.t
 # CHECK-INST: vmsleu.vi v8, v4, 0xf, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x70]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 70 <unknown>
+# CHECK-UNKNOWN: 7047b457 <unknown>
 
 vmsltu.vi v8, v4, 16
 # CHECK-INST: vmsleu.vi v8, v4, 0xf
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x72]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 72 <unknown>
+# CHECK-UNKNOWN: 7247b457 <unknown>
 
 vmsltu.vi v8, v4, 0, v0.t
 # CHECK-INST: vmsne.vv v8, v4, v4, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x42,0x64]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 42 64 <unknown>
+# CHECK-UNKNOWN: 64420457 <unknown>
 
 vmsltu.vi v8, v4, 0
 # CHECK-INST: vmsne.vv v8, v4, v4
 # CHECK-ENCODING: [0x57,0x04,0x42,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 42 66 <unknown>
+# CHECK-UNKNOWN: 66420457 <unknown>
 
 vmslt.vi v8, v4, 16, v0.t
 # CHECK-INST: vmsle.vi v8, v4, 0xf, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x74]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 74 <unknown>
+# CHECK-UNKNOWN: 7447b457 <unknown>
 
 vmslt.vi v8, v4, 16
 # CHECK-INST: vmsle.vi v8, v4, 0xf
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x76]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 76 <unknown>
+# CHECK-UNKNOWN: 7647b457 <unknown>
 
 vmsgeu.vi v8, v4, 16, v0.t
 # CHECK-INST: vmsgtu.vi v8, v4, 0xf, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x78]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 78 <unknown>
+# CHECK-UNKNOWN: 7847b457 <unknown>
 
 vmsgeu.vi v8, v4, 16
 # CHECK-INST: vmsgtu.vi v8, v4, 0xf
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x7a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 7a <unknown>
+# CHECK-UNKNOWN: 7a47b457 <unknown>
 
 vmsgeu.vi v8, v4, 0, v0.t
 # CHECK-INST: vmseq.vv v8, v4, v4, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x42,0x60]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 42 60 <unknown>
+# CHECK-UNKNOWN: 60420457 <unknown>
 
 vmsgeu.vi v8, v4, 0
 # CHECK-INST: vmseq.vv v8, v4, v4
 # CHECK-ENCODING: [0x57,0x04,0x42,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 42 62 <unknown>
+# CHECK-UNKNOWN: 62420457 <unknown>
 
 vmsge.vi v8, v4, 16, v0.t
 # CHECK-INST: vmsgt.vi v8, v4, 0xf, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x7c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 7c <unknown>
+# CHECK-UNKNOWN: 7c47b457 <unknown>
 
 vmsge.vi v8, v4, 16
 # CHECK-INST: vmsgt.vi v8, v4, 0xf
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x7e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 7e <unknown>
+# CHECK-UNKNOWN: 7e47b457 <unknown>
 
 vmsgeu.vx v8, v4, a0
 # CHECK-INST: vmsltu.vx v8, v4, a0
@@ -380,8 +380,8 @@ vmsgeu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x6a]
 # CHECK-ENCODING: [0x57,0x24,0x84,0x76]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 6a <unknown>
-# CHECK-UNKNOWN: 57 24 84 76 <unknown>
+# CHECK-UNKNOWN: 6a454457 <unknown>
+# CHECK-UNKNOWN: 76842457 <unknown>
 
 vmsge.vx v0, v4, a0
 # CHECK-INST: vmslt.vx v0, v4, a0
@@ -389,8 +389,8 @@ vmsge.vx v0, v4, a0
 # CHECK-ENCODING: [0x57,0x40,0x45,0x6e]
 # CHECK-ENCODING: [0x57,0x20,0x00,0x76]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 40 45 6e <unknown>
-# CHECK-UNKNOWN: 57 20 00 76 <unknown>
+# CHECK-UNKNOWN: 6e454057 <unknown>
+# CHECK-UNKNOWN: 76002057 <unknown>
 
 vmsge.vx v8, v4, a0
 # CHECK-INST: vmslt.vx v8, v4, a0
@@ -398,8 +398,8 @@ vmsge.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x6e]
 # CHECK-ENCODING: [0x57,0x24,0x84,0x76]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 6e <unknown>
-# CHECK-UNKNOWN: 57 24 84 76 <unknown>
+# CHECK-UNKNOWN: 6e454457 <unknown>
+# CHECK-UNKNOWN: 76842457 <unknown>
 
 vmsgeu.vx v8, v4, a0, v0.t
 # CHECK-INST: vmsltu.vx v8, v4, a0, v0.t
@@ -407,8 +407,8 @@ vmsgeu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x68]
 # CHECK-ENCODING: [0x57,0x24,0x80,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 68 <unknown>
-# CHECK-UNKNOWN: 57 24 80 6e <unknown>
+# CHECK-UNKNOWN: 68454457 <unknown>
+# CHECK-UNKNOWN: 6e802457 <unknown>
 
 vmsge.vx v8, v4, a0, v0.t
 # CHECK-INST: vmslt.vx v8, v4, a0, v0.t
@@ -416,8 +416,8 @@ vmsge.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x6c]
 # CHECK-ENCODING: [0x57,0x24,0x80,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 6c <unknown>
-# CHECK-UNKNOWN: 57 24 80 6e <unknown>
+# CHECK-UNKNOWN: 6c454457 <unknown>
+# CHECK-UNKNOWN: 6e802457 <unknown>
 
 vmsgeu.vx v0, v4, a0, v0.t, v2
 # CHECK-INST: vmsltu.vx v2, v4, a0
@@ -425,8 +425,8 @@ vmsgeu.vx v0, v4, a0, v0.t, v2
 # CHECK-ENCODING: [0x57,0x41,0x45,0x6a]
 # CHECK-ENCODING: [0x57,0x20,0x01,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 41 45 6a <unknown>
-# CHECK-UNKNOWN: 57 20 01 62 <unknown>
+# CHECK-UNKNOWN: 6a454157 <unknown>
+# CHECK-UNKNOWN: 62012057 <unknown>
 
 vmsge.vx v0, v4, a0, v0.t, v2
 # CHECK-INST: vmslt.vx v2, v4, a0
@@ -434,8 +434,8 @@ vmsge.vx v0, v4, a0, v0.t, v2
 # CHECK-ENCODING: [0x57,0x41,0x45,0x6e]
 # CHECK-ENCODING: [0x57,0x20,0x01,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 41 45 6e <unknown>
-# CHECK-UNKNOWN: 57 20 01 62 <unknown>
+# CHECK-UNKNOWN: 6e454157 <unknown>
+# CHECK-UNKNOWN: 62012057 <unknown>
 
 vmsgeu.vx v9, v4, a0, v0.t, v2
 # CHECK-INST: vmsltu.vx v2, v4, a0
@@ -447,10 +447,10 @@ vmsgeu.vx v9, v4, a0, v0.t, v2
 # CHECK-ENCODING: [0xd7,0x24,0x90,0x62]
 # CHECK-ENCODING: [0xd7,0xa4,0x24,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 41 45 6a <unknown>
-# CHECK-UNKNOWN: 57 21 01 62 <unknown>
-# CHECK-UNKNOWN: d7 24 90 62 <unknown>
-# CHECK-UNKNOWN: d7 a4 24 6a <unknown>
+# CHECK-UNKNOWN: 6a454157 <unknown>
+# CHECK-UNKNOWN: 62012157 <unknown>
+# CHECK-UNKNOWN: 629024d7 <unknown>
+# CHECK-UNKNOWN: 6a24a4d7 <unknown>
 
 vmsge.vx v8, v4, a0, v0.t, v2
 # CHECK-INST: vmslt.vx v2, v4, a0
@@ -462,7 +462,7 @@ vmsge.vx v8, v4, a0, v0.t, v2
 # CHECK-ENCODING: [0x57,0x24,0x80,0x62]
 # CHECK-ENCODING: [0x57,0x24,0x24,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 41 45 6e <unknown>
-# CHECK-UNKNOWN: 57 21 01 62 <unknown>
-# CHECK-UNKNOWN: 57 24 80 62 <unknown>
-# CHECK-UNKNOWN: 57 24 24 6a <unknown>
+# CHECK-UNKNOWN: 6e454157 <unknown>
+# CHECK-UNKNOWN: 62012157 <unknown>
+# CHECK-UNKNOWN: 62802457 <unknown>
+# CHECK-UNKNOWN: 6a242457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/convert.s b/llvm/test/MC/RISCV/rvv/convert.s
index 28c0a0fa837e..269c86cef473 100644
--- a/llvm/test/MC/RISCV/rvv/convert.s
+++ b/llvm/test/MC/RISCV/rvv/convert.s
@@ -15,256 +15,256 @@ vfcvt.xu.f.v v8, v4, v0.t
 # CHECK-INST: vfcvt.xu.f.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x40,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 40 48 <unknown>
+# CHECK-UNKNOWN: 48401457 <unknown>
 
 vfcvt.xu.f.v v8, v4
 # CHECK-INST: vfcvt.xu.f.v v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x40,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 40 4a <unknown>
+# CHECK-UNKNOWN: 4a401457 <unknown>
 
 vfcvt.x.f.v v8, v4, v0.t
 # CHECK-INST: vfcvt.x.f.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x40,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 40 48 <unknown>
+# CHECK-UNKNOWN: 48409457 <unknown>
 
 vfcvt.x.f.v v8, v4
 # CHECK-INST: vfcvt.x.f.v v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x40,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 40 4a <unknown>
+# CHECK-UNKNOWN: 4a409457 <unknown>
 
 vfcvt.f.xu.v v8, v4, v0.t
 # CHECK-INST: vfcvt.f.xu.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x41,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 41 48 <unknown>
+# CHECK-UNKNOWN: 48411457 <unknown>
 
 vfcvt.f.xu.v v8, v4
 # CHECK-INST: vfcvt.f.xu.v v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x41,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 41 4a <unknown>
+# CHECK-UNKNOWN: 4a411457 <unknown>
 
 vfcvt.f.x.v v8, v4, v0.t
 # CHECK-INST: vfcvt.f.x.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x41,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 41 48 <unknown>
+# CHECK-UNKNOWN: 48419457 <unknown>
 
 vfcvt.f.x.v v8, v4
 # CHECK-INST: vfcvt.f.x.v v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x41,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 41 4a <unknown>
+# CHECK-UNKNOWN: 4a419457 <unknown>
 
 vfcvt.rtz.xu.f.v v8, v4, v0.t
 # CHECK-INST: vfcvt.rtz.xu.f.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x43,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 43 48 <unknown>
+# CHECK-UNKNOWN: 48431457 <unknown>
 
 vfcvt.rtz.xu.f.v v8, v4
 # CHECK-INST: vfcvt.rtz.xu.f.v v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x43,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 43 4a <unknown>
+# CHECK-UNKNOWN: 4a431457 <unknown>
 
 vfcvt.rtz.x.f.v v8, v4, v0.t
 # CHECK-INST: vfcvt.rtz.x.f.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x43,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 43 48 <unknown>
+# CHECK-UNKNOWN: 48439457 <unknown>
 
 vfcvt.rtz.x.f.v v8, v4
 # CHECK-INST: vfcvt.rtz.x.f.v v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x43,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 43 4a <unknown>
+# CHECK-UNKNOWN: 4a439457 <unknown>
 
 vfwcvt.xu.f.v v8, v4, v0.t
 # CHECK-INST: vfwcvt.xu.f.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x44,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 44 48 <unknown>
+# CHECK-UNKNOWN: 48441457 <unknown>
 
 vfwcvt.xu.f.v v8, v4
 # CHECK-INST: vfwcvt.xu.f.v v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x44,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 44 4a <unknown>
+# CHECK-UNKNOWN: 4a441457 <unknown>
 
 vfwcvt.x.f.v v8, v4, v0.t
 # CHECK-INST: vfwcvt.x.f.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x44,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 44 48 <unknown>
+# CHECK-UNKNOWN: 48449457 <unknown>
 
 vfwcvt.x.f.v v8, v4
 # CHECK-INST: vfwcvt.x.f.v v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x44,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 44 4a <unknown>
+# CHECK-UNKNOWN: 4a449457 <unknown>
 
 vfwcvt.f.xu.v v8, v4, v0.t
 # CHECK-INST: vfwcvt.f.xu.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x45,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 45 48 <unknown>
+# CHECK-UNKNOWN: 48451457 <unknown>
 
 vfwcvt.f.xu.v v8, v4
 # CHECK-INST: vfwcvt.f.xu.v v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x45,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 45 4a <unknown>
+# CHECK-UNKNOWN: 4a451457 <unknown>
 
 vfwcvt.f.x.v v8, v4, v0.t
 # CHECK-INST: vfwcvt.f.x.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x45,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 45 48 <unknown>
+# CHECK-UNKNOWN: 48459457 <unknown>
 
 vfwcvt.f.x.v v8, v4
 # CHECK-INST: vfwcvt.f.x.v v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x45,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 45 4a <unknown>
+# CHECK-UNKNOWN: 4a459457 <unknown>
 
 vfwcvt.f.f.v v8, v4, v0.t
 # CHECK-INST: vfwcvt.f.f.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x46,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 46 48 <unknown>
+# CHECK-UNKNOWN: 48461457 <unknown>
 
 vfwcvt.f.f.v v8, v4
 # CHECK-INST: vfwcvt.f.f.v v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x46,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 46 4a <unknown>
+# CHECK-UNKNOWN: 4a461457 <unknown>
 
 vfwcvt.rtz.xu.f.v v8, v4, v0.t
 # CHECK-INST: vfwcvt.rtz.xu.f.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x47,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 47 48 <unknown>
+# CHECK-UNKNOWN: 48471457 <unknown>
 
 vfwcvt.rtz.xu.f.v v8, v4
 # CHECK-INST: vfwcvt.rtz.xu.f.v v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x47,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 47 4a <unknown>
+# CHECK-UNKNOWN: 4a471457 <unknown>
 
 vfwcvt.rtz.x.f.v v8, v4, v0.t
 # CHECK-INST: vfwcvt.rtz.x.f.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x47,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 47 48 <unknown>
+# CHECK-UNKNOWN: 48479457 <unknown>
 
 vfwcvt.rtz.x.f.v v8, v4
 # CHECK-INST: vfwcvt.rtz.x.f.v v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x47,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 47 4a <unknown>
+# CHECK-UNKNOWN: 4a479457 <unknown>
 
 vfncvt.xu.f.w v8, v4, v0.t
 # CHECK-INST: vfncvt.xu.f.w v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x48,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 48 48 <unknown>
+# CHECK-UNKNOWN: 48481457 <unknown>
 
 vfncvt.xu.f.w v4, v4, v0.t
 # CHECK-INST: vfncvt.xu.f.w v4, v4, v0.t
 # CHECK-ENCODING: [0x57,0x12,0x48,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 12 48 48 <unknown>
+# CHECK-UNKNOWN: 48481257 <unknown>
 
 vfncvt.xu.f.w v8, v4
 # CHECK-INST: vfncvt.xu.f.w v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x48,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 48 4a <unknown>
+# CHECK-UNKNOWN: 4a481457 <unknown>
 
 vfncvt.x.f.w v8, v4, v0.t
 # CHECK-INST: vfncvt.x.f.w v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x48,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 48 48 <unknown>
+# CHECK-UNKNOWN: 48489457 <unknown>
 
 vfncvt.x.f.w v8, v4
 # CHECK-INST: vfncvt.x.f.w v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x48,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 48 4a <unknown>
+# CHECK-UNKNOWN: 4a489457 <unknown>
 
 vfncvt.f.xu.w v8, v4, v0.t
 # CHECK-INST: vfncvt.f.xu.w v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x49,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 49 48 <unknown>
+# CHECK-UNKNOWN: 48491457 <unknown>
 
 vfncvt.f.xu.w v8, v4
 # CHECK-INST: vfncvt.f.xu.w v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x49,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 49 4a <unknown>
+# CHECK-UNKNOWN: 4a491457 <unknown>
 
 vfncvt.f.x.w v8, v4, v0.t
 # CHECK-INST: vfncvt.f.x.w v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x49,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 49 48 <unknown>
+# CHECK-UNKNOWN: 48499457 <unknown>
 
 vfncvt.f.x.w v8, v4
 # CHECK-INST: vfncvt.f.x.w v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x49,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 49 4a <unknown>
+# CHECK-UNKNOWN: 4a499457 <unknown>
 
 vfncvt.f.f.w v8, v4, v0.t
 # CHECK-INST: vfncvt.f.f.w v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 48 <unknown>
+# CHECK-UNKNOWN: 484a1457 <unknown>
 
 vfncvt.f.f.w v8, v4
 # CHECK-INST: vfncvt.f.f.w v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 4a <unknown>
+# CHECK-UNKNOWN: 4a4a1457 <unknown>
 
 vfncvt.rod.f.f.w v8, v4, v0.t
 # CHECK-INST: vfncvt.rod.f.f.w v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x4a,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 4a 48 <unknown>
+# CHECK-UNKNOWN: 484a9457 <unknown>
 
 vfncvt.rod.f.f.w v8, v4
 # CHECK-INST: vfncvt.rod.f.f.w v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x4a,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 4a 4a <unknown>
+# CHECK-UNKNOWN: 4a4a9457 <unknown>
 
 vfncvt.rtz.xu.f.w v8, v4, v0.t
 # CHECK-INST: vfncvt.rtz.xu.f.w v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4b,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4b 48 <unknown>
+# CHECK-UNKNOWN: 484b1457 <unknown>
 
 vfncvt.rtz.xu.f.w v8, v4
 # CHECK-INST: vfncvt.rtz.xu.f.w v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x4b,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4b 4a <unknown>
+# CHECK-UNKNOWN: 4a4b1457 <unknown>
 
 vfncvt.rtz.x.f.w v8, v4, v0.t
 # CHECK-INST: vfncvt.rtz.x.f.w v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x4b,0x48]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 4b 48 <unknown>
+# CHECK-UNKNOWN: 484b9457 <unknown>
 
 vfncvt.rtz.x.f.w v8, v4
 # CHECK-INST: vfncvt.rtz.x.f.w v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x4b,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 4b 4a <unknown>
+# CHECK-UNKNOWN: 4a4b9457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/div.s b/llvm/test/MC/RISCV/rvv/div.s
index 229124c671c6..aca04375f61e 100644
--- a/llvm/test/MC/RISCV/rvv/div.s
+++ b/llvm/test/MC/RISCV/rvv/div.s
@@ -12,94 +12,94 @@ vdivu.vv v8, v4, v20, v0.t
 # CHECK-INST: vdivu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 80 <unknown>
+# CHECK-UNKNOWN: 804a2457 <unknown>
 
 vdivu.vv v8, v4, v20
 # CHECK-INST: vdivu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x82]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 82 <unknown>
+# CHECK-UNKNOWN: 824a2457 <unknown>
 
 vdivu.vx v8, v4, a0, v0.t
 # CHECK-INST: vdivu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 80 <unknown>
+# CHECK-UNKNOWN: 80456457 <unknown>
 
 vdivu.vx v8, v4, a0
 # CHECK-INST: vdivu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x82]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 82 <unknown>
+# CHECK-UNKNOWN: 82456457 <unknown>
 
 vdiv.vv v8, v4, v20, v0.t
 # CHECK-INST: vdiv.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x84]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 84 <unknown>
+# CHECK-UNKNOWN: 844a2457 <unknown>
 
 vdiv.vv v8, v4, v20
 # CHECK-INST: vdiv.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x86]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 86 <unknown>
+# CHECK-UNKNOWN: 864a2457 <unknown>
 
 vdiv.vx v8, v4, a0, v0.t
 # CHECK-INST: vdiv.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x84]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 84 <unknown>
+# CHECK-UNKNOWN: 84456457 <unknown>
 
 vdiv.vx v8, v4, a0
 # CHECK-INST: vdiv.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x86]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 86 <unknown>
+# CHECK-UNKNOWN: 86456457 <unknown>
 
 vremu.vv v8, v4, v20, v0.t
 # CHECK-INST: vremu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x88]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 88 <unknown>
+# CHECK-UNKNOWN: 884a2457 <unknown>
 
 vremu.vv v8, v4, v20
 # CHECK-INST: vremu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x8a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 8a <unknown>
+# CHECK-UNKNOWN: 8a4a2457 <unknown>
 
 vremu.vx v8, v4, a0, v0.t
 # CHECK-INST: vremu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x88]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 88 <unknown>
+# CHECK-UNKNOWN: 88456457 <unknown>
 
 vremu.vx v8, v4, a0
 # CHECK-INST: vremu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x8a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 8a <unknown>
+# CHECK-UNKNOWN: 8a456457 <unknown>
 
 vrem.vv v8, v4, v20, v0.t
 # CHECK-INST: vrem.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x8c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 8c <unknown>
+# CHECK-UNKNOWN: 8c4a2457 <unknown>
 
 vrem.vv v8, v4, v20
 # CHECK-INST: vrem.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x8e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 8e <unknown>
+# CHECK-UNKNOWN: 8e4a2457 <unknown>
 
 vrem.vx v8, v4, a0, v0.t
 # CHECK-INST: vrem.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x8c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 8c <unknown>
+# CHECK-UNKNOWN: 8c456457 <unknown>
 
 vrem.vx v8, v4, a0
 # CHECK-INST: vrem.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x8e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 8e <unknown>
+# CHECK-UNKNOWN: 8e456457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/ext.s b/llvm/test/MC/RISCV/rvv/ext.s
index 80cadc096104..3bf1351d77a7 100644
--- a/llvm/test/MC/RISCV/rvv/ext.s
+++ b/llvm/test/MC/RISCV/rvv/ext.s
@@ -12,70 +12,70 @@ vzext.vf2 v8, v4, v0.t
 # CHECK-INST: vzext.vf2 v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x43,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 43 48 <unknown>
+# CHECK-UNKNOWN: 48432457 <unknown>
 
 vzext.vf2 v8, v4
 # CHECK-INST: vzext.vf2 v8, v4
 # CHECK-ENCODING: [0x57,0x24,0x43,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 43 4a <unknown>
+# CHECK-UNKNOWN: 4a432457 <unknown>
 
 vsext.vf2 v8, v4, v0.t
 # CHECK-INST: vsext.vf2 v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0xa4,0x43,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 43 48 <unknown>
+# CHECK-UNKNOWN: 4843a457 <unknown>
 
 vsext.vf2 v8, v4
 # CHECK-INST: vsext.vf2 v8, v4
 # CHECK-ENCODING: [0x57,0xa4,0x43,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 43 4a <unknown>
+# CHECK-UNKNOWN: 4a43a457 <unknown>
 
 vzext.vf4 v8, v4, v0.t
 # CHECK-INST: vzext.vf4 v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x42,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 42 48 <unknown>
+# CHECK-UNKNOWN: 48422457 <unknown>
 
 vzext.vf4 v8, v4
 # CHECK-INST: vzext.vf4 v8, v4
 # CHECK-ENCODING: [0x57,0x24,0x42,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 42 4a <unknown>
+# CHECK-UNKNOWN: 4a422457 <unknown>
 
 vsext.vf4 v8, v4, v0.t
 # CHECK-INST: vsext.vf4 v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0xa4,0x42,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 42 48 <unknown>
+# CHECK-UNKNOWN: 4842a457 <unknown>
 
 vsext.vf4 v8, v4
 # CHECK-INST: vsext.vf4 v8, v4
 # CHECK-ENCODING: [0x57,0xa4,0x42,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 42 4a <unknown>
+# CHECK-UNKNOWN: 4a42a457 <unknown>
 
 vzext.vf8 v8, v4, v0.t
 # CHECK-INST: vzext.vf8 v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x41,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 41 48 <unknown>
+# CHECK-UNKNOWN: 48412457 <unknown>
 
 vzext.vf8 v8, v4
 # CHECK-INST: vzext.vf8 v8, v4
 # CHECK-ENCODING: [0x57,0x24,0x41,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 41 4a <unknown>
+# CHECK-UNKNOWN: 4a412457 <unknown>
 
 vsext.vf8 v8, v4, v0.t
 # CHECK-INST: vsext.vf8 v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0xa4,0x41,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 41 48 <unknown>
+# CHECK-UNKNOWN: 4841a457 <unknown>
 
 vsext.vf8 v8, v4
 # CHECK-INST: vsext.vf8 v8, v4
 # CHECK-ENCODING: [0x57,0xa4,0x41,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 41 4a <unknown>
+# CHECK-UNKNOWN: 4a41a457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fadd.s b/llvm/test/MC/RISCV/rvv/fadd.s
index 60ffaf62ca6b..890b2c0ad68b 100644
--- a/llvm/test/MC/RISCV/rvv/fadd.s
+++ b/llvm/test/MC/RISCV/rvv/fadd.s
@@ -15,70 +15,70 @@ vfadd.vv v8, v4, v20, v0.t
 # CHECK-INST: vfadd.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x00]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 00 <unknown>
+# CHECK-UNKNOWN: 004a1457 <unknown>
 
 vfadd.vv v8, v4, v20
 # CHECK-INST: vfadd.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x02]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 02 <unknown>
+# CHECK-UNKNOWN: 024a1457 <unknown>
 
 vfadd.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfadd.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x00]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 00 <unknown>
+# CHECK-UNKNOWN: 00455457 <unknown>
 
 vfadd.vf v8, v4, fa0
 # CHECK-INST: vfadd.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x02]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 02 <unknown>
+# CHECK-UNKNOWN: 02455457 <unknown>
 
 vfwadd.vv v8, v4, v20, v0.t
 # CHECK-INST: vfwadd.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a c0 <unknown>
+# CHECK-UNKNOWN: c04a1457 <unknown>
 
 vfwadd.vv v8, v4, v20
 # CHECK-INST: vfwadd.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a c2 <unknown>
+# CHECK-UNKNOWN: c24a1457 <unknown>
 
 vfwadd.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfwadd.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 c0 <unknown>
+# CHECK-UNKNOWN: c0455457 <unknown>
 
 vfwadd.vf v8, v4, fa0
 # CHECK-INST: vfwadd.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 c2 <unknown>
+# CHECK-UNKNOWN: c2455457 <unknown>
 
 vfwadd.wv v8, v4, v20, v0.t
 # CHECK-INST: vfwadd.wv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xd0]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a d0 <unknown>
+# CHECK-UNKNOWN: d04a1457 <unknown>
 
 vfwadd.wv v8, v4, v20
 # CHECK-INST: vfwadd.wv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xd2]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a d2 <unknown>
+# CHECK-UNKNOWN: d24a1457 <unknown>
 
 vfwadd.wf v8, v4, fa0, v0.t
 # CHECK-INST: vfwadd.wf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xd0]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 d0 <unknown>
+# CHECK-UNKNOWN: d0455457 <unknown>
 
 vfwadd.wf v8, v4, fa0
 # CHECK-INST: vfwadd.wf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0xd2]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 d2 <unknown>
+# CHECK-UNKNOWN: d2455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fcompare.s b/llvm/test/MC/RISCV/rvv/fcompare.s
index 11dd7e05467b..3903bbdab650 100644
--- a/llvm/test/MC/RISCV/rvv/fcompare.s
+++ b/llvm/test/MC/RISCV/rvv/fcompare.s
@@ -15,148 +15,148 @@ vmfeq.vv v8, v4, v20, v0.t
 # CHECK-INST: vmfeq.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x60]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 60 <unknown>
+# CHECK-UNKNOWN: 604a1457 <unknown>
 
 vmfeq.vv v8, v4, v20
 # CHECK-INST: vmfeq.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x62]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 62 <unknown>
+# CHECK-UNKNOWN: 624a1457 <unknown>
 
 vmfeq.vf v8, v4, fa0, v0.t
 # CHECK-INST: vmfeq.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x60]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 60 <unknown>
+# CHECK-UNKNOWN: 60455457 <unknown>
 
 vmfeq.vf v8, v4, fa0
 # CHECK-INST: vmfeq.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x62]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 62 <unknown>
+# CHECK-UNKNOWN: 62455457 <unknown>
 
 vmfne.vv v8, v4, v20, v0.t
 # CHECK-INST: vmfne.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x70]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 70 <unknown>
+# CHECK-UNKNOWN: 704a1457 <unknown>
 
 vmfne.vv v8, v4, v20
 # CHECK-INST: vmfne.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x72]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 72 <unknown>
+# CHECK-UNKNOWN: 724a1457 <unknown>
 
 vmfne.vf v8, v4, fa0, v0.t
 # CHECK-INST: vmfne.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x70]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 70 <unknown>
+# CHECK-UNKNOWN: 70455457 <unknown>
 
 vmfne.vf v8, v4, fa0
 # CHECK-INST: vmfne.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x72]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 72 <unknown>
+# CHECK-UNKNOWN: 72455457 <unknown>
 
 vmflt.vv v8, v4, v20, v0.t
 # CHECK-INST: vmflt.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 6c <unknown>
+# CHECK-UNKNOWN: 6c4a1457 <unknown>
 
 vmflt.vv v8, v4, v20
 # CHECK-INST: vmflt.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 6e <unknown>
+# CHECK-UNKNOWN: 6e4a1457 <unknown>
 
 vmflt.vf v8, v4, fa0, v0.t
 # CHECK-INST: vmflt.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 6c <unknown>
+# CHECK-UNKNOWN: 6c455457 <unknown>
 
 vmflt.vf v8, v4, fa0
 # CHECK-INST: vmflt.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 6e <unknown>
+# CHECK-UNKNOWN: 6e455457 <unknown>
 
 vmfle.vv v8, v4, v20, v0.t
 # CHECK-INST: vmfle.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x64]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 64 <unknown>
+# CHECK-UNKNOWN: 644a1457 <unknown>
 
 vmfle.vv v8, v4, v20
 # CHECK-INST: vmfle.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x66]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 66 <unknown>
+# CHECK-UNKNOWN: 664a1457 <unknown>
 
 vmfle.vf v8, v4, fa0, v0.t
 # CHECK-INST: vmfle.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x64]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 64 <unknown>
+# CHECK-UNKNOWN: 64455457 <unknown>
 
 vmfle.vf v8, v4, fa0
 # CHECK-INST: vmfle.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x66]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 66 <unknown>
+# CHECK-UNKNOWN: 66455457 <unknown>
 
 vmfgt.vf v8, v4, fa0, v0.t
 # CHECK-INST: vmfgt.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x74]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 74 <unknown>
+# CHECK-UNKNOWN: 74455457 <unknown>
 
 vmfgt.vf v8, v4, fa0
 # CHECK-INST: vmfgt.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x76]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 76 <unknown>
+# CHECK-UNKNOWN: 76455457 <unknown>
 
 vmfge.vf v8, v4, fa0, v0.t
 # CHECK-INST: vmfge.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x7c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 7c <unknown>
+# CHECK-UNKNOWN: 7c455457 <unknown>
 
 vmfge.vf v8, v4, fa0
 # CHECK-INST: vmfge.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x7e]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 7e <unknown>
+# CHECK-UNKNOWN: 7e455457 <unknown>
 
 vmfgt.vv v8, v20, v4, v0.t
 # CHECK-INST: vmflt.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 6c <unknown>
+# CHECK-UNKNOWN: 6c4a1457 <unknown>
 
 vmfgt.vv v8, v20, v4
 # CHECK-INST: vmflt.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 6e <unknown>
+# CHECK-UNKNOWN: 6e4a1457 <unknown>
 
 vmfge.vv v8, v20, v4, v0.t
 # CHECK-INST: vmfle.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x64]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 64 <unknown>
+# CHECK-UNKNOWN: 644a1457 <unknown>
 
 vmfge.vv v8, v20, v4
 # CHECK-INST: vmfle.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x66]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 66 <unknown>
+# CHECK-UNKNOWN: 664a1457 <unknown>
 
 vmfeq.vv v0, v4, v20, v0.t
 # CHECK-INST: vmfeq.vv v0, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x10,0x4a,0x60]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 10 4a 60 <unknown>
+# CHECK-UNKNOWN: 604a1057 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fdiv.s b/llvm/test/MC/RISCV/rvv/fdiv.s
index 7eb048f3cd58..aa3aae5841a2 100644
--- a/llvm/test/MC/RISCV/rvv/fdiv.s
+++ b/llvm/test/MC/RISCV/rvv/fdiv.s
@@ -15,34 +15,34 @@ vfdiv.vv v8, v4, v20, v0.t
 # CHECK-INST: vfdiv.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x80]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 80 <unknown>
+# CHECK-UNKNOWN: 804a1457 <unknown>
 
 vfdiv.vv v8, v4, v20
 # CHECK-INST: vfdiv.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x82]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 82 <unknown>
+# CHECK-UNKNOWN: 824a1457 <unknown>
 
 vfdiv.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfdiv.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x80]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 80 <unknown>
+# CHECK-UNKNOWN: 80455457 <unknown>
 
 vfdiv.vf v8, v4, fa0
 # CHECK-INST: vfdiv.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x82]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 82 <unknown>
+# CHECK-UNKNOWN: 82455457 <unknown>
 
 vfrdiv.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfrdiv.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x84]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 84 <unknown>
+# CHECK-UNKNOWN: 84455457 <unknown>
 
 vfrdiv.vf v8, v4, fa0
 # CHECK-INST: vfrdiv.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x86]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 86 <unknown>
+# CHECK-UNKNOWN: 86455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fmacc.s b/llvm/test/MC/RISCV/rvv/fmacc.s
index 129455901b93..8ca43da80961 100644
--- a/llvm/test/MC/RISCV/rvv/fmacc.s
+++ b/llvm/test/MC/RISCV/rvv/fmacc.s
@@ -15,286 +15,286 @@ vfmacc.vv v8, v20, v4, v0.t
 # CHECK-INST: vfmacc.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xb0]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a b0 <unknown>
+# CHECK-UNKNOWN: b04a1457 <unknown>
 
 vfmacc.vv v8, v20, v4
 # CHECK-INST: vfmacc.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xb2]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a b2 <unknown>
+# CHECK-UNKNOWN: b24a1457 <unknown>
 
 vfmacc.vf v8, fa0, v4, v0.t
 # CHECK-INST: vfmacc.vf v8, fa0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xb0]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 b0 <unknown>
+# CHECK-UNKNOWN: b0455457 <unknown>
 
 vfmacc.vf v8, fa0, v4
 # CHECK-INST: vfmacc.vf v8, fa0, v4
 # CHECK-ENCODING: [0x57,0x54,0x45,0xb2]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 b2 <unknown>
+# CHECK-UNKNOWN: b2455457 <unknown>
 
 vfnmacc.vv v8, v20, v4, v0.t
 # CHECK-INST: vfnmacc.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xb4]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a b4 <unknown>
+# CHECK-UNKNOWN: b44a1457 <unknown>
 
 vfnmacc.vv v8, v20, v4
 # CHECK-INST: vfnmacc.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xb6]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a b6 <unknown>
+# CHECK-UNKNOWN: b64a1457 <unknown>
 
 vfnmacc.vf v8, fa0, v4, v0.t
 # CHECK-INST: vfnmacc.vf v8, fa0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xb4]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 b4 <unknown>
+# CHECK-UNKNOWN: b4455457 <unknown>
 
 vfnmacc.vf v8, fa0, v4
 # CHECK-INST: vfnmacc.vf v8, fa0, v4
 # CHECK-ENCODING: [0x57,0x54,0x45,0xb6]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 b6 <unknown>
+# CHECK-UNKNOWN: b6455457 <unknown>
 
 vfmsac.vv v8, v20, v4, v0.t
 # CHECK-INST: vfmsac.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xb8]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a b8 <unknown>
+# CHECK-UNKNOWN: b84a1457 <unknown>
 
 vfmsac.vv v8, v20, v4
 # CHECK-INST: vfmsac.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xba]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a ba <unknown>
+# CHECK-UNKNOWN: ba4a1457 <unknown>
 
 vfmsac.vf v8, fa0, v4, v0.t
 # CHECK-INST: vfmsac.vf v8, fa0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xb8]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 b8 <unknown>
+# CHECK-UNKNOWN: b8455457 <unknown>
 
 vfmsac.vf v8, fa0, v4
 # CHECK-INST: vfmsac.vf v8, fa0, v4
 # CHECK-ENCODING: [0x57,0x54,0x45,0xba]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 ba <unknown>
+# CHECK-UNKNOWN: ba455457 <unknown>
 
 vfnmsac.vv v8, v20, v4, v0.t
 # CHECK-INST: vfnmsac.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xbc]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a bc <unknown>
+# CHECK-UNKNOWN: bc4a1457 <unknown>
 
 vfnmsac.vv v8, v20, v4
 # CHECK-INST: vfnmsac.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xbe]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a be <unknown>
+# CHECK-UNKNOWN: be4a1457 <unknown>
 
 vfnmsac.vf v8, fa0, v4, v0.t
 # CHECK-INST: vfnmsac.vf v8, fa0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xbc]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 bc <unknown>
+# CHECK-UNKNOWN: bc455457 <unknown>
 
 vfnmsac.vf v8, fa0, v4
 # CHECK-INST: vfnmsac.vf v8, fa0, v4
 # CHECK-ENCODING: [0x57,0x54,0x45,0xbe]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 be <unknown>
+# CHECK-UNKNOWN: be455457 <unknown>
 
 vfmadd.vv v8, v20, v4, v0.t
 # CHECK-INST: vfmadd.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xa0]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a a0 <unknown>
+# CHECK-UNKNOWN: a04a1457 <unknown>
 
 vfmadd.vv v8, v20, v4
 # CHECK-INST: vfmadd.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xa2]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a a2 <unknown>
+# CHECK-UNKNOWN: a24a1457 <unknown>
 
 vfmadd.vf v8, fa0, v4, v0.t
 # CHECK-INST: vfmadd.vf v8, fa0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xa0]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 a0 <unknown>
+# CHECK-UNKNOWN: a0455457 <unknown>
 
 vfmadd.vf v8, fa0, v4
 # CHECK-INST: vfmadd.vf v8, fa0, v4
 # CHECK-ENCODING: [0x57,0x54,0x45,0xa2]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 a2 <unknown>
+# CHECK-UNKNOWN: a2455457 <unknown>
 
 vfnmadd.vv v8, v20, v4, v0.t
 # CHECK-INST: vfnmadd.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a a4 <unknown>
+# CHECK-UNKNOWN: a44a1457 <unknown>
 
 vfnmadd.vv v8, v20, v4
 # CHECK-INST: vfnmadd.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a a6 <unknown>
+# CHECK-UNKNOWN: a64a1457 <unknown>
 
 vfnmadd.vf v8, fa0, v4, v0.t
 # CHECK-INST: vfnmadd.vf v8, fa0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 a4 <unknown>
+# CHECK-UNKNOWN: a4455457 <unknown>
 
 vfnmadd.vf v8, fa0, v4
 # CHECK-INST: vfnmadd.vf v8, fa0, v4
 # CHECK-ENCODING: [0x57,0x54,0x45,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 a6 <unknown>
+# CHECK-UNKNOWN: a6455457 <unknown>
 
 vfmsub.vv v8, v20, v4, v0.t
 # CHECK-INST: vfmsub.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xa8]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a a8 <unknown>
+# CHECK-UNKNOWN: a84a1457 <unknown>
 
 vfmsub.vv v8, v20, v4
 # CHECK-INST: vfmsub.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xaa]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a aa <unknown>
+# CHECK-UNKNOWN: aa4a1457 <unknown>
 
 vfmsub.vf v8, fa0, v4, v0.t
 # CHECK-INST: vfmsub.vf v8, fa0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xa8]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 a8 <unknown>
+# CHECK-UNKNOWN: a8455457 <unknown>
 
 vfmsub.vf v8, fa0, v4
 # CHECK-INST: vfmsub.vf v8, fa0, v4
 # CHECK-ENCODING: [0x57,0x54,0x45,0xaa]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 aa <unknown>
+# CHECK-UNKNOWN: aa455457 <unknown>
 
 vfnmsub.vv v8, v20, v4, v0.t
 # CHECK-INST: vfnmsub.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xac]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a ac <unknown>
+# CHECK-UNKNOWN: ac4a1457 <unknown>
 
 vfnmsub.vv v8, v20, v4
 # CHECK-INST: vfnmsub.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xae]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a ae <unknown>
+# CHECK-UNKNOWN: ae4a1457 <unknown>
 
 vfnmsub.vf v8, fa0, v4, v0.t
 # CHECK-INST: vfnmsub.vf v8, fa0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xac]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 ac <unknown>
+# CHECK-UNKNOWN: ac455457 <unknown>
 
 vfnmsub.vf v8, fa0, v4
 # CHECK-INST: vfnmsub.vf v8, fa0, v4
 # CHECK-ENCODING: [0x57,0x54,0x45,0xae]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 ae <unknown>
+# CHECK-UNKNOWN: ae455457 <unknown>
 
 vfwmacc.vv v8, v20, v4, v0.t
 # CHECK-INST: vfwmacc.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xf0]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a f0 <unknown>
+# CHECK-UNKNOWN: f04a1457 <unknown>
 
 vfwmacc.vv v8, v20, v4
 # CHECK-INST: vfwmacc.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xf2]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a f2 <unknown>
+# CHECK-UNKNOWN: f24a1457 <unknown>
 
 vfwmacc.vf v8, fa0, v4, v0.t
 # CHECK-INST: vfwmacc.vf v8, fa0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xf0]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 f0 <unknown>
+# CHECK-UNKNOWN: f0455457 <unknown>
 
 vfwmacc.vf v8, fa0, v4
 # CHECK-INST: vfwmacc.vf v8, fa0, v4
 # CHECK-ENCODING: [0x57,0x54,0x45,0xf2]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 f2 <unknown>
+# CHECK-UNKNOWN: f2455457 <unknown>
 
 vfwnmacc.vv v8, v20, v4, v0.t
 # CHECK-INST: vfwnmacc.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xf4]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a f4 <unknown>
+# CHECK-UNKNOWN: f44a1457 <unknown>
 
 vfwnmacc.vv v8, v20, v4
 # CHECK-INST: vfwnmacc.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xf6]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a f6 <unknown>
+# CHECK-UNKNOWN: f64a1457 <unknown>
 
 vfwnmacc.vf v8, fa0, v4, v0.t
 # CHECK-INST: vfwnmacc.vf v8, fa0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xf4]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 f4 <unknown>
+# CHECK-UNKNOWN: f4455457 <unknown>
 
 vfwnmacc.vf v8, fa0, v4
 # CHECK-INST: vfwnmacc.vf v8, fa0, v4
 # CHECK-ENCODING: [0x57,0x54,0x45,0xf6]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 f6 <unknown>
+# CHECK-UNKNOWN: f6455457 <unknown>
 
 vfwmsac.vv v8, v20, v4, v0.t
 # CHECK-INST: vfwmsac.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xf8]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a f8 <unknown>
+# CHECK-UNKNOWN: f84a1457 <unknown>
 
 vfwmsac.vv v8, v20, v4
 # CHECK-INST: vfwmsac.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xfa]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a fa <unknown>
+# CHECK-UNKNOWN: fa4a1457 <unknown>
 
 vfwmsac.vf v8, fa0, v4, v0.t
 # CHECK-INST: vfwmsac.vf v8, fa0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xf8]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 f8 <unknown>
+# CHECK-UNKNOWN: f8455457 <unknown>
 
 vfwmsac.vf v8, fa0, v4
 # CHECK-INST: vfwmsac.vf v8, fa0, v4
 # CHECK-ENCODING: [0x57,0x54,0x45,0xfa]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 fa <unknown>
+# CHECK-UNKNOWN: fa455457 <unknown>
 
 vfwnmsac.vv v8, v20, v4, v0.t
 # CHECK-INST: vfwnmsac.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xfc]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a fc <unknown>
+# CHECK-UNKNOWN: fc4a1457 <unknown>
 
 vfwnmsac.vv v8, v20, v4
 # CHECK-INST: vfwnmsac.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xfe]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a fe <unknown>
+# CHECK-UNKNOWN: fe4a1457 <unknown>
 
 vfwnmsac.vf v8, fa0, v4, v0.t
 # CHECK-INST: vfwnmsac.vf v8, fa0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xfc]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 fc <unknown>
+# CHECK-UNKNOWN: fc455457 <unknown>
 
 vfwnmsac.vf v8, fa0, v4
 # CHECK-INST: vfwnmsac.vf v8, fa0, v4
 # CHECK-ENCODING: [0x57,0x54,0x45,0xfe]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 fe <unknown>
+# CHECK-UNKNOWN: fe455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fminmax.s b/llvm/test/MC/RISCV/rvv/fminmax.s
index c8aab38e1fc8..f7e85ed31c33 100644
--- a/llvm/test/MC/RISCV/rvv/fminmax.s
+++ b/llvm/test/MC/RISCV/rvv/fminmax.s
@@ -15,46 +15,46 @@ vfmin.vv v8, v4, v20, v0.t
 # CHECK-INST: vfmin.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x10]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 10 <unknown>
+# CHECK-UNKNOWN: 104a1457 <unknown>
 
 vfmin.vv v8, v4, v20
 # CHECK-INST: vfmin.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x12]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 12 <unknown>
+# CHECK-UNKNOWN: 124a1457 <unknown>
 
 vfmin.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfmin.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x10]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 10 <unknown>
+# CHECK-UNKNOWN: 10455457 <unknown>
 
 vfmin.vf v8, v4, fa0
 # CHECK-INST: vfmin.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x12]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 12 <unknown>
+# CHECK-UNKNOWN: 12455457 <unknown>
 
 vfmax.vv v8, v4, v20, v0.t
 # CHECK-INST: vfmax.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x18]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 18 <unknown>
+# CHECK-UNKNOWN: 184a1457 <unknown>
 
 vfmax.vv v8, v4, v20
 # CHECK-INST: vfmax.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x1a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 1a <unknown>
+# CHECK-UNKNOWN: 1a4a1457 <unknown>
 
 vfmax.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfmax.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x18]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 18 <unknown>
+# CHECK-UNKNOWN: 18455457 <unknown>
 
 vfmax.vf v8, v4, fa0
 # CHECK-INST: vfmax.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x1a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 1a <unknown>
+# CHECK-UNKNOWN: 1a455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fmul.s b/llvm/test/MC/RISCV/rvv/fmul.s
index 42c37932043c..9cd6e5287750 100644
--- a/llvm/test/MC/RISCV/rvv/fmul.s
+++ b/llvm/test/MC/RISCV/rvv/fmul.s
@@ -15,46 +15,46 @@ vfmul.vv v8, v4, v20, v0.t
 # CHECK-INST: vfmul.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x90]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 90 <unknown>
+# CHECK-UNKNOWN: 904a1457 <unknown>
 
 vfmul.vv v8, v4, v20
 # CHECK-INST: vfmul.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x92]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 92 <unknown>
+# CHECK-UNKNOWN: 924a1457 <unknown>
 
 vfmul.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfmul.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x90]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 90 <unknown>
+# CHECK-UNKNOWN: 90455457 <unknown>
 
 vfmul.vf v8, v4, fa0
 # CHECK-INST: vfmul.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x92]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 92 <unknown>
+# CHECK-UNKNOWN: 92455457 <unknown>
 
 vfwmul.vv v8, v4, v20, v0.t
 # CHECK-INST: vfwmul.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xe0]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a e0 <unknown>
+# CHECK-UNKNOWN: e04a1457 <unknown>
 
 vfwmul.vv v8, v4, v20
 # CHECK-INST: vfwmul.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a e2 <unknown>
+# CHECK-UNKNOWN: e24a1457 <unknown>
 
 vfwmul.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfwmul.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xe0]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 e0 <unknown>
+# CHECK-UNKNOWN: e0455457 <unknown>
 
 vfwmul.vf v8, v4, fa0
 # CHECK-INST: vfwmul.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 e2 <unknown>
+# CHECK-UNKNOWN: e2455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fmv.s b/llvm/test/MC/RISCV/rvv/fmv.s
index a5b814f130fb..2534b5171b3e 100644
--- a/llvm/test/MC/RISCV/rvv/fmv.s
+++ b/llvm/test/MC/RISCV/rvv/fmv.s
@@ -15,16 +15,16 @@ vfmv.v.f v8, fa0
 # CHECK-INST: vfmv.v.f v8, fa0
 # CHECK-ENCODING: [0x57,0x54,0x05,0x5e]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 05 5e <unknown>
+# CHECK-UNKNOWN: 5e055457 <unknown>
 
 vfmv.f.s fa0, v4
 # CHECK-INST: vfmv.f.s fa0, v4
 # CHECK-ENCODING: [0x57,0x15,0x40,0x42]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 15 40 42 <unknown>
+# CHECK-UNKNOWN: 42401557 <unknown>
 
 vfmv.s.f v8, fa0
 # CHECK-INST: vfmv.s.f v8, fa0
 # CHECK-ENCODING: [0x57,0x54,0x05,0x42]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 05 42 <unknown>
+# CHECK-UNKNOWN: 42055457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fothers.s b/llvm/test/MC/RISCV/rvv/fothers.s
index 451f6ca39ccc..997115f96bd9 100644
--- a/llvm/test/MC/RISCV/rvv/fothers.s
+++ b/llvm/test/MC/RISCV/rvv/fothers.s
@@ -13,76 +13,76 @@ vfsqrt.v v8, v4, v0.t
 # CHECK-INST: vfsqrt.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x40,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 40 4c <unknown>
+# CHECK-UNKNOWN: 4c401457 <unknown>
 
 vfsqrt.v v8, v4
 # CHECK-INST: vfsqrt.v v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x40,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 40 4e <unknown>
+# CHECK-UNKNOWN: 4e401457 <unknown>
 
 vfrsqrt7.v v8, v4, v0.t
 # CHECK-INST: vfrsqrt7.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x42,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 42 4c <unknown>
+# CHECK-UNKNOWN: 4c421457 <unknown>
 
 vfrsqrt7.v v8, v4
 # CHECK-INST: vfrsqrt7.v v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x42,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 42 4e <unknown>
+# CHECK-UNKNOWN: 4e421457 <unknown>
 
 vfrec7.v v8, v4, v0.t
 # CHECK-INST: vfrec7.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x42,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 42 4c <unknown>
+# CHECK-UNKNOWN: 4c429457 <unknown>
 
 vfrec7.v v8, v4
 # CHECK-INST: vfrec7.v v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x42,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 42 4e <unknown>
+# CHECK-UNKNOWN: 4e429457 <unknown>
 
 vfclass.v v8, v4, v0.t
 # CHECK-INST: vfclass.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x48,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 48 4c <unknown>
+# CHECK-UNKNOWN: 4c481457 <unknown>
 
 vfclass.v v8, v4
 # CHECK-INST: vfclass.v v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x48,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 48 4e <unknown>
+# CHECK-UNKNOWN: 4e481457 <unknown>
 
 vfmerge.vfm v8, v4, fa0, v0
 # CHECK-INST: vfmerge.vfm v8, v4, fa0, v0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x5c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 5c <unknown>
+# CHECK-UNKNOWN: 5c455457 <unknown>
 
 vfslide1up.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfslide1up.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x38]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 38 <unknown>
+# CHECK-UNKNOWN: 38455457 <unknown>
 
 vfslide1up.vf v8, v4, fa0
 # CHECK-INST: vfslide1up.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x3a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 3a <unknown>
+# CHECK-UNKNOWN: 3a455457 <unknown>
 
 vfslide1down.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfslide1down.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x3c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 3c <unknown>
+# CHECK-UNKNOWN: 3c455457 <unknown>
 
 vfslide1down.vf v8, v4, fa0
 # CHECK-INST: vfslide1down.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x3e]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 3e <unknown>
+# CHECK-UNKNOWN: 3e455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/freduction.s b/llvm/test/MC/RISCV/rvv/freduction.s
index fca64372b15f..12326942e6e8 100644
--- a/llvm/test/MC/RISCV/rvv/freduction.s
+++ b/llvm/test/MC/RISCV/rvv/freduction.s
@@ -15,76 +15,76 @@ vfredosum.vs v8, v4, v20, v0.t
 # CHECK-INST: vfredosum.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 0c <unknown>
+# CHECK-UNKNOWN: 0c4a1457 <unknown>
 
 vfredosum.vs v8, v4, v20
 # CHECK-INST: vfredosum.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x0e]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 0e <unknown>
+# CHECK-UNKNOWN: 0e4a1457 <unknown>
 
 vfredusum.vs v8, v4, v20, v0.t
 # CHECK-INST: vfredusum.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x04]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 04 <unknown>
+# CHECK-UNKNOWN: 044a1457 <unknown>
 
 vfredusum.vs v8, v4, v20
 # CHECK-INST: vfredusum.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x06]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 06 <unknown>
+# CHECK-UNKNOWN: 064a1457 <unknown>
 
 vfredmax.vs v8, v4, v20, v0.t
 # CHECK-INST: vfredmax.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x1c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 1c <unknown>
+# CHECK-UNKNOWN: 1c4a1457 <unknown>
 
 vfredmax.vs v8, v4, v20
 # CHECK-INST: vfredmax.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x1e]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 1e <unknown>
+# CHECK-UNKNOWN: 1e4a1457 <unknown>
 
 vfredmin.vs v8, v4, v20, v0.t
 # CHECK-INST: vfredmin.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x14]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 14 <unknown>
+# CHECK-UNKNOWN: 144a1457 <unknown>
 
 vfredmin.vs v8, v4, v20
 # CHECK-INST: vfredmin.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x16]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 16 <unknown>
+# CHECK-UNKNOWN: 164a1457 <unknown>
 
 vfwredosum.vs v8, v4, v20, v0.t
 # CHECK-INST: vfwredosum.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xcc]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a cc <unknown>
+# CHECK-UNKNOWN: cc4a1457 <unknown>
 
 vfwredosum.vs v8, v4, v20
 # CHECK-INST: vfwredosum.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xce]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a ce <unknown>
+# CHECK-UNKNOWN: ce4a1457 <unknown>
 
 vfwredusum.vs v8, v4, v20, v0.t
 # CHECK-INST: vfwredusum.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xc4]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a c4 <unknown>
+# CHECK-UNKNOWN: c44a1457 <unknown>
 
 vfwredusum.vs v8, v4, v20
 # CHECK-INST: vfwredusum.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xc6]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a c6 <unknown>
+# CHECK-UNKNOWN: c64a1457 <unknown>
 
 vfredosum.vs v0, v4, v20, v0.t
 # CHECK-INST: vfredosum.vs v0, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x10,0x4a,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 10 4a 0c <unknown>
+# CHECK-UNKNOWN: 0c4a1057 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fsub.s b/llvm/test/MC/RISCV/rvv/fsub.s
index a8f2bc6260b9..62ff2e744c78 100644
--- a/llvm/test/MC/RISCV/rvv/fsub.s
+++ b/llvm/test/MC/RISCV/rvv/fsub.s
@@ -15,82 +15,82 @@ vfsub.vv v8, v4, v20, v0.t
 # CHECK-INST: vfsub.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x08]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 08 <unknown>
+# CHECK-UNKNOWN: 084a1457 <unknown>
 
 vfsub.vv v8, v4, v20
 # CHECK-INST: vfsub.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x0a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 0a <unknown>
+# CHECK-UNKNOWN: 0a4a1457 <unknown>
 
 vfsub.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfsub.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x08]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 08 <unknown>
+# CHECK-UNKNOWN: 08455457 <unknown>
 
 vfsub.vf v8, v4, fa0
 # CHECK-INST: vfsub.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x0a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 0a <unknown>
+# CHECK-UNKNOWN: 0a455457 <unknown>
 
 vfrsub.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfrsub.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x9c]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 9c <unknown>
+# CHECK-UNKNOWN: 9c455457 <unknown>
 
 vfrsub.vf v8, v4, fa0
 # CHECK-INST: vfrsub.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x9e]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 9e <unknown>
+# CHECK-UNKNOWN: 9e455457 <unknown>
 
 vfwsub.vv v8, v4, v20, v0.t
 # CHECK-INST: vfwsub.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xc8]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a c8 <unknown>
+# CHECK-UNKNOWN: c84a1457 <unknown>
 
 vfwsub.vv v8, v4, v20
 # CHECK-INST: vfwsub.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xca]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a ca <unknown>
+# CHECK-UNKNOWN: ca4a1457 <unknown>
 
 vfwsub.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfwsub.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xc8]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 c8 <unknown>
+# CHECK-UNKNOWN: c8455457 <unknown>
 
 vfwsub.vf v8, v4, fa0
 # CHECK-INST: vfwsub.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0xca]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 ca <unknown>
+# CHECK-UNKNOWN: ca455457 <unknown>
 
 vfwsub.wv v8, v4, v20, v0.t
 # CHECK-INST: vfwsub.wv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xd8]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a d8 <unknown>
+# CHECK-UNKNOWN: d84a1457 <unknown>
 
 vfwsub.wv v8, v4, v20
 # CHECK-INST: vfwsub.wv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xda]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a da <unknown>
+# CHECK-UNKNOWN: da4a1457 <unknown>
 
 vfwsub.wf v8, v4, fa0, v0.t
 # CHECK-INST: vfwsub.wf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xd8]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 d8 <unknown>
+# CHECK-UNKNOWN: d8455457 <unknown>
 
 vfwsub.wf v8, v4, fa0
 # CHECK-INST: vfwsub.wf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0xda]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 da <unknown>
+# CHECK-UNKNOWN: da455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/load.s b/llvm/test/MC/RISCV/rvv/load.s
index 23357df88d3f..3c251a3a8d75 100644
--- a/llvm/test/MC/RISCV/rvv/load.s
+++ b/llvm/test/MC/RISCV/rvv/load.s
@@ -12,382 +12,382 @@ vlm.v v0, (a0)
 # CHECK-INST: vlm.v v0, (a0)
 # CHECK-ENCODING: [0x07,0x00,0xb5,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 00 b5 02 <unknown>
+# CHECK-UNKNOWN: 02b50007 <unknown>
 
 vlm.v v8, (a0)
 # CHECK-INST: vlm.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 02 <unknown>
+# CHECK-UNKNOWN: 02b50407 <unknown>
 
 vle8.v v8, (a0), v0.t
 # CHECK-INST: vle8.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 00 <unknown>
+# CHECK-UNKNOWN: 00050407 <unknown>
 
 vle8.v v8, (a0)
 # CHECK-INST: vle8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 02 <unknown>
+# CHECK-UNKNOWN: 02050407 <unknown>
 
 vle16.v v8, (a0), v0.t
 # CHECK-INST: vle16.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 00 <unknown>
+# CHECK-UNKNOWN: 00055407 <unknown>
 
 vle16.v v8, (a0)
 # CHECK-INST: vle16.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 02 <unknown>
+# CHECK-UNKNOWN: 02055407 <unknown>
 
 vle32.v v8, (a0), v0.t
 # CHECK-INST: vle32.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 00 <unknown>
+# CHECK-UNKNOWN: 00056407 <unknown>
 
 vle32.v v8, (a0)
 # CHECK-INST: vle32.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 02 <unknown>
+# CHECK-UNKNOWN: 02056407 <unknown>
 
 vle64.v v8, (a0), v0.t
 # CHECK-INST: vle64.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 00 <unknown>
+# CHECK-UNKNOWN: 00057407 <unknown>
 
 vle64.v v8, (a0)
 # CHECK-INST: vle64.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 02 <unknown>
+# CHECK-UNKNOWN: 02057407 <unknown>
 
 vle8ff.v v8, (a0), v0.t
 # CHECK-INST: vle8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x01]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 01 <unknown>
+# CHECK-UNKNOWN: 01050407 <unknown>
 
 vle8ff.v v8, (a0)
 # CHECK-INST: vle8ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0x03]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 03 <unknown>
+# CHECK-UNKNOWN: 03050407 <unknown>
 
 vle16ff.v v8, (a0), v0.t
 # CHECK-INST: vle16ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0x01]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 01 <unknown>
+# CHECK-UNKNOWN: 01055407 <unknown>
 
 vle16ff.v v8, (a0)
 # CHECK-INST: vle16ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0x03]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 03 <unknown>
+# CHECK-UNKNOWN: 03055407 <unknown>
 
 vle32ff.v v8, (a0), v0.t
 # CHECK-INST: vle32ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0x01]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 01 <unknown>
+# CHECK-UNKNOWN: 01056407 <unknown>
 
 vle32ff.v v8, (a0)
 # CHECK-INST: vle32ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0x03]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 03 <unknown>
+# CHECK-UNKNOWN: 03056407 <unknown>
 
 vle64ff.v v8, (a0), v0.t
 # CHECK-INST: vle64ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0x01]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 01 <unknown>
+# CHECK-UNKNOWN: 01057407 <unknown>
 
 vle64ff.v v8, (a0)
 # CHECK-INST: vle64ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0x03]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 03 <unknown>
+# CHECK-UNKNOWN: 03057407 <unknown>
 
 vlse8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlse8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x08]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 08 <unknown>
+# CHECK-UNKNOWN: 08b50407 <unknown>
 
 vlse8.v v8, (a0), a1
 # CHECK-INST: vlse8.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x0a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab50407 <unknown>
 
 vlse16.v v8, (a0), a1, v0.t
 # CHECK-INST: vlse16.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x54,0xb5,0x08]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 08 <unknown>
+# CHECK-UNKNOWN: 08b55407 <unknown>
 
 vlse16.v v8, (a0), a1
 # CHECK-INST: vlse16.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x54,0xb5,0x0a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab55407 <unknown>
 
 vlse32.v v8, (a0), a1, v0.t
 # CHECK-INST: vlse32.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x64,0xb5,0x08]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 08 <unknown>
+# CHECK-UNKNOWN: 08b56407 <unknown>
 
 vlse32.v v8, (a0), a1
 # CHECK-INST: vlse32.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x64,0xb5,0x0a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab56407 <unknown>
 
 vlse64.v v8, (a0), a1, v0.t
 # CHECK-INST: vlse64.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x74,0xb5,0x08]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 08 <unknown>
+# CHECK-UNKNOWN: 08b57407 <unknown>
 
 vlse64.v v8, (a0), a1
 # CHECK-INST: vlse64.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x74,0xb5,0x0a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab57407 <unknown>
 
 vluxei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x04]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 04 <unknown>
+# CHECK-UNKNOWN: 04450407 <unknown>
 
 vluxei8.v v8, (a0), v4
 # CHECK-INST: vluxei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0x06]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 06 <unknown>
+# CHECK-UNKNOWN: 06450407 <unknown>
 
 vluxei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0x04]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 04 <unknown>
+# CHECK-UNKNOWN: 04455407 <unknown>
 
 vluxei16.v v8, (a0), v4
 # CHECK-INST: vluxei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0x06]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 06 <unknown>
+# CHECK-UNKNOWN: 06455407 <unknown>
 
 vluxei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0x04]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 04 <unknown>
+# CHECK-UNKNOWN: 04456407 <unknown>
 
 vluxei32.v v8, (a0), v4
 # CHECK-INST: vluxei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0x06]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 06 <unknown>
+# CHECK-UNKNOWN: 06456407 <unknown>
 
 vluxei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0x04]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 04 <unknown>
+# CHECK-UNKNOWN: 04457407 <unknown>
 
 vluxei64.v v8, (a0), v4
 # CHECK-INST: vluxei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0x06]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 06 <unknown>
+# CHECK-UNKNOWN: 06457407 <unknown>
 
 vloxei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 0c <unknown>
+# CHECK-UNKNOWN: 0c450407 <unknown>
 
 vloxei8.v v8, (a0), v4
 # CHECK-INST: vloxei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0x0e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 0e <unknown>
+# CHECK-UNKNOWN: 0e450407 <unknown>
 
 vloxei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 0c <unknown>
+# CHECK-UNKNOWN: 0c455407 <unknown>
 
 vloxei16.v v8, (a0), v4
 # CHECK-INST: vloxei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0x0e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 0e <unknown>
+# CHECK-UNKNOWN: 0e455407 <unknown>
 
 vloxei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 0c <unknown>
+# CHECK-UNKNOWN: 0c456407 <unknown>
 
 vloxei32.v v8, (a0), v4
 # CHECK-INST: vloxei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0x0e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 0e <unknown>
+# CHECK-UNKNOWN: 0e456407 <unknown>
 
 vloxei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 0c <unknown>
+# CHECK-UNKNOWN: 0c457407 <unknown>
 
 vloxei64.v v8, (a0), v4
 # CHECK-INST: vloxei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0x0e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 0e <unknown>
+# CHECK-UNKNOWN: 0e457407 <unknown>
 
 vl1re8.v v8, (a0)
 # CHECK-INST: vl1re8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x85,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 85 02 <unknown>
+# CHECK-UNKNOWN: 02850407 <unknown>
 
 vl1re16.v v8, (a0)
 # CHECK-INST: vl1re16.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x85,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 85 02 <unknown>
+# CHECK-UNKNOWN: 02855407 <unknown>
 
 vl1re32.v v8, (a0)
 # CHECK-INST: vl1re32.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x85,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 85 02 <unknown>
+# CHECK-UNKNOWN: 02856407 <unknown>
 
 vl1re64.v v8, (a0)
 # CHECK-INST: vl1re64.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x85,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 85 02 <unknown>
+# CHECK-UNKNOWN: 02857407 <unknown>
 
 vl2re8.v v8, (a0)
 # CHECK-INST: vl2re8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x85,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 85 22 <unknown>
+# CHECK-UNKNOWN: 22850407 <unknown>
 
 vl2re16.v v8, (a0)
 # CHECK-INST: vl2re16.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x85,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 85 22 <unknown>
+# CHECK-UNKNOWN: 22855407 <unknown>
 
 vl2re32.v v8, (a0)
 # CHECK-INST: vl2re32.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x85,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 85 22 <unknown>
+# CHECK-UNKNOWN: 22856407 <unknown>
 
 vl2re64.v v8, (a0)
 # CHECK-INST: vl2re64.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x85,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 85 22 <unknown>
+# CHECK-UNKNOWN: 22857407 <unknown>
 
 vl4re8.v v8, (a0)
 # CHECK-INST: vl4re8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x85,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 85 62 <unknown>
+# CHECK-UNKNOWN: 62850407 <unknown>
 
 vl4re16.v v8, (a0)
 # CHECK-INST: vl4re16.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x85,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 85 62 <unknown>
+# CHECK-UNKNOWN: 62855407 <unknown>
 
 vl4re32.v v8, (a0)
 # CHECK-INST: vl4re32.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x85,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 85 62 <unknown>
+# CHECK-UNKNOWN: 62856407 <unknown>
 
 vl4re64.v v8, (a0)
 # CHECK-INST: vl4re64.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x85,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 85 62 <unknown>
+# CHECK-UNKNOWN: 62857407 <unknown>
 
 vl8re8.v v8, (a0)
 # CHECK-INST: vl8re8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x85,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 85 e2 <unknown>
+# CHECK-UNKNOWN: e2850407 <unknown>
 
 vl8re16.v v8, (a0)
 # CHECK-INST: vl8re16.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x85,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 85 e2 <unknown>
+# CHECK-UNKNOWN: e2855407 <unknown>
 
 vl8re32.v v8, (a0)
 # CHECK-INST: vl8re32.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x85,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 85 e2 <unknown>
+# CHECK-UNKNOWN: e2856407 <unknown>
 
 vl8re64.v v8, (a0)
 # CHECK-INST: vl8re64.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x85,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 85 e2 <unknown>
+# CHECK-UNKNOWN: e2857407 <unknown>
 
 vlm.v v0, 0(a0)
 # CHECK-INST: vlm.v v0, (a0)
 # CHECK-ENCODING: [0x07,0x00,0xb5,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 00 b5 02 <unknown>
+# CHECK-UNKNOWN: 02b50007 <unknown>
 
 vle8.v v8, 0(a0)
 # CHECK-INST: vle8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 02 <unknown>
+# CHECK-UNKNOWN: 02050407 <unknown>
 
 vle8ff.v v8, 0(a0), v0.t
 # CHECK-INST: vle8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x01]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 01 <unknown>
+# CHECK-UNKNOWN: 01050407 <unknown>
 
 vlse16.v v8, 0(a0), a1, v0.t
 # CHECK-INST: vlse16.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x54,0xb5,0x08]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 08 <unknown>
+# CHECK-UNKNOWN: 08b55407 <unknown>
 
 vluxei32.v v8, 0(a0), v4
 # CHECK-INST: vluxei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0x06]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 06 <unknown>
+# CHECK-UNKNOWN: 06456407 <unknown>
 
 vloxei64.v v8, 0(a0), v4
 # CHECK-INST: vloxei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0x0e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 0e <unknown>
+# CHECK-UNKNOWN: 0e457407 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/macc.s b/llvm/test/MC/RISCV/rvv/macc.s
index 0662620b455a..ffdc2d646929 100644
--- a/llvm/test/MC/RISCV/rvv/macc.s
+++ b/llvm/test/MC/RISCV/rvv/macc.s
@@ -12,178 +12,178 @@ vmacc.vv v8, v20, v4, v0.t
 # CHECK-INST: vmacc.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xb4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a b4 <unknown>
+# CHECK-UNKNOWN: b44a2457 <unknown>
 
 vmacc.vv v8, v20, v4
 # CHECK-INST: vmacc.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xb6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a b6 <unknown>
+# CHECK-UNKNOWN: b64a2457 <unknown>
 
 vmacc.vx v8, a0, v4, v0.t
 # CHECK-INST: vmacc.vx v8, a0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xb4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 b4 <unknown>
+# CHECK-UNKNOWN: b4456457 <unknown>
 
 vmacc.vx v8, a0, v4
 # CHECK-INST: vmacc.vx v8, a0, v4
 # CHECK-ENCODING: [0x57,0x64,0x45,0xb6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 b6 <unknown>
+# CHECK-UNKNOWN: b6456457 <unknown>
 
 vnmsac.vv v8, v20, v4, v0.t
 # CHECK-INST: vnmsac.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xbc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a bc <unknown>
+# CHECK-UNKNOWN: bc4a2457 <unknown>
 
 vnmsac.vv v8, v20, v4
 # CHECK-INST: vnmsac.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xbe]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a be <unknown>
+# CHECK-UNKNOWN: be4a2457 <unknown>
 
 vnmsac.vx v8, a0, v4, v0.t
 # CHECK-INST: vnmsac.vx v8, a0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xbc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 bc <unknown>
+# CHECK-UNKNOWN: bc456457 <unknown>
 
 vnmsac.vx v8, a0, v4
 # CHECK-INST: vnmsac.vx v8, a0, v4
 # CHECK-ENCODING: [0x57,0x64,0x45,0xbe]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 be <unknown>
+# CHECK-UNKNOWN: be456457 <unknown>
 
 vmadd.vv v8, v20, v4, v0.t
 # CHECK-INST: vmadd.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a a4 <unknown>
+# CHECK-UNKNOWN: a44a2457 <unknown>
 
 vmadd.vv v8, v20, v4
 # CHECK-INST: vmadd.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a a6 <unknown>
+# CHECK-UNKNOWN: a64a2457 <unknown>
 
 vmadd.vx v8, a0, v4, v0.t
 # CHECK-INST: vmadd.vx v8, a0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 a4 <unknown>
+# CHECK-UNKNOWN: a4456457 <unknown>
 
 vmadd.vx v8, a0, v4
 # CHECK-INST: vmadd.vx v8, a0, v4
 # CHECK-ENCODING: [0x57,0x64,0x45,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 a6 <unknown>
+# CHECK-UNKNOWN: a6456457 <unknown>
 
 vnmsub.vv v8, v20, v4, v0.t
 # CHECK-INST: vnmsub.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xac]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ac <unknown>
+# CHECK-UNKNOWN: ac4a2457 <unknown>
 
 vnmsub.vv v8, v20, v4
 # CHECK-INST: vnmsub.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xae]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ae <unknown>
+# CHECK-UNKNOWN: ae4a2457 <unknown>
 
 vnmsub.vx v8, a0, v4, v0.t
 # CHECK-INST: vnmsub.vx v8, a0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xac]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ac <unknown>
+# CHECK-UNKNOWN: ac456457 <unknown>
 
 vnmsub.vx v8, a0, v4
 # CHECK-INST: vnmsub.vx v8, a0, v4
 # CHECK-ENCODING: [0x57,0x64,0x45,0xae]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ae <unknown>
+# CHECK-UNKNOWN: ae456457 <unknown>
 
 vwmaccu.vv v8, v20, v4, v0.t
 # CHECK-INST: vwmaccu.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xf0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a f0 <unknown>
+# CHECK-UNKNOWN: f04a2457 <unknown>
 
 vwmaccu.vv v8, v20, v4
 # CHECK-INST: vwmaccu.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xf2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a f2 <unknown>
+# CHECK-UNKNOWN: f24a2457 <unknown>
 
 vwmaccu.vx v8, a0, v4, v0.t
 # CHECK-INST: vwmaccu.vx v8, a0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xf0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 f0 <unknown>
+# CHECK-UNKNOWN: f0456457 <unknown>
 
 vwmaccu.vx v8, a0, v4
 # CHECK-INST: vwmaccu.vx v8, a0, v4
 # CHECK-ENCODING: [0x57,0x64,0x45,0xf2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 f2 <unknown>
+# CHECK-UNKNOWN: f2456457 <unknown>
 
 vwmacc.vv v8, v20, v4, v0.t
 # CHECK-INST: vwmacc.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xf4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a f4 <unknown>
+# CHECK-UNKNOWN: f44a2457 <unknown>
 
 vwmacc.vv v8, v20, v4
 # CHECK-INST: vwmacc.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xf6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a f6 <unknown>
+# CHECK-UNKNOWN: f64a2457 <unknown>
 
 vwmacc.vx v8, a0, v4, v0.t
 # CHECK-INST: vwmacc.vx v8, a0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xf4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 f4 <unknown>
+# CHECK-UNKNOWN: f4456457 <unknown>
 
 vwmacc.vx v8, a0, v4
 # CHECK-INST: vwmacc.vx v8, a0, v4
 # CHECK-ENCODING: [0x57,0x64,0x45,0xf6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 f6 <unknown>
+# CHECK-UNKNOWN: f6456457 <unknown>
 
 vwmaccsu.vv v8, v20, v4, v0.t
 # CHECK-INST: vwmaccsu.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xfc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a fc <unknown>
+# CHECK-UNKNOWN: fc4a2457 <unknown>
 
 vwmaccsu.vv v8, v20, v4
 # CHECK-INST: vwmaccsu.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xfe]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a fe <unknown>
+# CHECK-UNKNOWN: fe4a2457 <unknown>
 
 vwmaccsu.vx v8, a0, v4, v0.t
 # CHECK-INST: vwmaccsu.vx v8, a0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xfc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 fc <unknown>
+# CHECK-UNKNOWN: fc456457 <unknown>
 
 vwmaccsu.vx v8, a0, v4
 # CHECK-INST: vwmaccsu.vx v8, a0, v4
 # CHECK-ENCODING: [0x57,0x64,0x45,0xfe]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 fe <unknown>
+# CHECK-UNKNOWN: fe456457 <unknown>
 
 vwmaccus.vx v8, a0, v4, v0.t
 # CHECK-INST: vwmaccus.vx v8, a0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xf8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 f8 <unknown>
+# CHECK-UNKNOWN: f8456457 <unknown>
 
 vwmaccus.vx v8, a0, v4
 # CHECK-INST: vwmaccus.vx v8, a0, v4
 # CHECK-ENCODING: [0x57,0x64,0x45,0xfa]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 fa <unknown>
+# CHECK-UNKNOWN: fa456457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/mask.s b/llvm/test/MC/RISCV/rvv/mask.s
index c0691b97f1d9..c0dd44b6fc67 100644
--- a/llvm/test/MC/RISCV/rvv/mask.s
+++ b/llvm/test/MC/RISCV/rvv/mask.s
@@ -12,154 +12,154 @@ vmand.mm v8, v4, v20
 # CHECK-INST: vmand.mm v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 66 <unknown>
+# CHECK-UNKNOWN: 664a2457 <unknown>
 
 vmnand.mm v8, v4, v20
 # CHECK-INST: vmnand.mm v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x76]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 76 <unknown>
+# CHECK-UNKNOWN: 764a2457 <unknown>
 
 vmandn.mm v8, v4, v20
 # CHECK-INST: vmandn.mm v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 62 <unknown>
+# CHECK-UNKNOWN: 624a2457 <unknown>
 
 vmxor.mm v8, v4, v20
 # CHECK-INST: vmxor.mm v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 6e <unknown>
+# CHECK-UNKNOWN: 6e4a2457 <unknown>
 
 vmor.mm v8, v4, v20
 # CHECK-INST: vmor.mm v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 6a <unknown>
+# CHECK-UNKNOWN: 6a4a2457 <unknown>
 
 vmnor.mm v8, v4, v20
 # CHECK-INST: vmnor.mm v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x7a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 7a <unknown>
+# CHECK-UNKNOWN: 7a4a2457 <unknown>
 
 vmorn.mm v8, v4, v20
 # CHECK-INST: vmorn.mm v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x72]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 72 <unknown>
+# CHECK-UNKNOWN: 724a2457 <unknown>
 
 vmxnor.mm v8, v4, v20
 # CHECK-INST: vmxnor.mm v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x7e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 7e <unknown>
+# CHECK-UNKNOWN: 7e4a2457 <unknown>
 
 vcpop.m a2, v4, v0.t
 # CHECK-INST: vcpop.m a2, v4, v0.t
 # CHECK-ENCODING: [0x57,0x26,0x48,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 26 48 40 <unknown>
+# CHECK-UNKNOWN: 40482657 <unknown>
 
 vcpop.m a2, v4
 # CHECK-INST: vcpop.m a2, v4
 # CHECK-ENCODING: [0x57,0x26,0x48,0x42]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 26 48 42 <unknown>
+# CHECK-UNKNOWN: 42482657 <unknown>
 
 vfirst.m a2, v4, v0.t
 # CHECK-INST: vfirst.m a2, v4, v0.t
 # CHECK-ENCODING: [0x57,0xa6,0x48,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a6 48 40 <unknown>
+# CHECK-UNKNOWN: 4048a657 <unknown>
 
 vfirst.m a2, v4
 # CHECK-INST: vfirst.m a2, v4
 # CHECK-ENCODING: [0x57,0xa6,0x48,0x42]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a6 48 42 <unknown>
+# CHECK-UNKNOWN: 4248a657 <unknown>
 
 vmsbf.m v8, v4, v0.t
 # CHECK-INST: vmsbf.m v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0xa4,0x40,0x50]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 40 50 <unknown>
+# CHECK-UNKNOWN: 5040a457 <unknown>
 
 vmsbf.m v8, v4
 # CHECK-INST: vmsbf.m v8, v4
 # CHECK-ENCODING: [0x57,0xa4,0x40,0x52]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 40 52 <unknown>
+# CHECK-UNKNOWN: 5240a457 <unknown>
 
 vmsif.m v8, v4, v0.t
 # CHECK-INST: vmsif.m v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0xa4,0x41,0x50]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 41 50 <unknown>
+# CHECK-UNKNOWN: 5041a457 <unknown>
 
 vmsif.m v8, v4
 # CHECK-INST: vmsif.m v8, v4
 # CHECK-ENCODING: [0x57,0xa4,0x41,0x52]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 41 52 <unknown>
+# CHECK-UNKNOWN: 5241a457 <unknown>
 
 vmsof.m v8, v4, v0.t
 # CHECK-INST: vmsof.m v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x41,0x50]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 41 50 <unknown>
+# CHECK-UNKNOWN: 50412457 <unknown>
 
 vmsof.m v8, v4
 # CHECK-INST: vmsof.m v8, v4
 # CHECK-ENCODING: [0x57,0x24,0x41,0x52]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 41 52 <unknown>
+# CHECK-UNKNOWN: 52412457 <unknown>
 
 viota.m v8, v4, v0.t
 # CHECK-INST: viota.m v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x48,0x50]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 48 50 <unknown>
+# CHECK-UNKNOWN: 50482457 <unknown>
 
 viota.m v8, v4
 # CHECK-INST: viota.m v8, v4
 # CHECK-ENCODING: [0x57,0x24,0x48,0x52]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 48 52 <unknown>
+# CHECK-UNKNOWN: 52482457 <unknown>
 
 vid.v v8, v0.t
 # CHECK-INST: vid.v v8, v0.t
 # CHECK-ENCODING: [0x57,0xa4,0x08,0x50]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 08 50 <unknown>
+# CHECK-UNKNOWN: 5008a457 <unknown>
 
 vid.v v8
 # CHECK-INST: vid.v v8
 # CHECK-ENCODING: [0x57,0xa4,0x08,0x52]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 08 52 <unknown>
+# CHECK-UNKNOWN: 5208a457 <unknown>
 
 vmmv.m v8, v4
 # CHECK-INST: vmmv.m v8, v4
 # CHECK-ENCODING: [0x57,0x24,0x42,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 42 66 <unknown>
+# CHECK-UNKNOWN: 66422457 <unknown>
 
 vmclr.m v8
 # CHECK-INST: vmclr.m v8
 # CHECK-ENCODING: [0x57,0x24,0x84,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 84 6e <unknown>
+# CHECK-UNKNOWN: 6e842457 <unknown>
 
 vmset.m v8
 # CHECK-INST: vmset.m v8
 # CHECK-ENCODING: [0x57,0x24,0x84,0x7e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 84 7e <unknown>
+# CHECK-UNKNOWN: 7e842457 <unknown>
 
 vmnot.m v8, v4
 # CHECK-INST: vmnot.m v8, v4
 # CHECK-ENCODING: [0x57,0x24,0x42,0x76]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 42 76 <unknown>
+# CHECK-UNKNOWN: 76422457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/minmax.s b/llvm/test/MC/RISCV/rvv/minmax.s
index 4eaf897b84c4..70fe040f07f9 100644
--- a/llvm/test/MC/RISCV/rvv/minmax.s
+++ b/llvm/test/MC/RISCV/rvv/minmax.s
@@ -12,94 +12,94 @@ vminu.vv v8, v4, v20, v0.t
 # CHECK-INST: vminu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x10]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 10 <unknown>
+# CHECK-UNKNOWN: 104a0457 <unknown>
 
 vminu.vv v8, v4, v20
 # CHECK-INST: vminu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x12]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 12 <unknown>
+# CHECK-UNKNOWN: 124a0457 <unknown>
 
 vminu.vx v8, v4, a0, v0.t
 # CHECK-INST: vminu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x10]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 10 <unknown>
+# CHECK-UNKNOWN: 10454457 <unknown>
 
 vminu.vx v8, v4, a0
 # CHECK-INST: vminu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x12]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 12 <unknown>
+# CHECK-UNKNOWN: 12454457 <unknown>
 
 vmin.vv v8, v4, v20, v0.t
 # CHECK-INST: vmin.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x14]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 14 <unknown>
+# CHECK-UNKNOWN: 144a0457 <unknown>
 
 vmin.vv v8, v4, v20
 # CHECK-INST: vmin.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x16]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 16 <unknown>
+# CHECK-UNKNOWN: 164a0457 <unknown>
 
 vmin.vx v8, v4, a0, v0.t
 # CHECK-INST: vmin.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x14]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 14 <unknown>
+# CHECK-UNKNOWN: 14454457 <unknown>
 
 vmin.vx v8, v4, a0
 # CHECK-INST: vmin.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x16]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 16 <unknown>
+# CHECK-UNKNOWN: 16454457 <unknown>
 
 vmaxu.vv v8, v4, v20, v0.t
 # CHECK-INST: vmaxu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x18]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 18 <unknown>
+# CHECK-UNKNOWN: 184a0457 <unknown>
 
 vmaxu.vv v8, v4, v20
 # CHECK-INST: vmaxu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x1a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 1a <unknown>
+# CHECK-UNKNOWN: 1a4a0457 <unknown>
 
 vmaxu.vx v8, v4, a0, v0.t
 # CHECK-INST: vmaxu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x18]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 18 <unknown>
+# CHECK-UNKNOWN: 18454457 <unknown>
 
 vmaxu.vx v8, v4, a0
 # CHECK-INST: vmaxu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x1a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 1a <unknown>
+# CHECK-UNKNOWN: 1a454457 <unknown>
 
 vmax.vv v8, v4, v20, v0.t
 # CHECK-INST: vmax.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x1c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 1c <unknown>
+# CHECK-UNKNOWN: 1c4a0457 <unknown>
 
 vmax.vv v8, v4, v20
 # CHECK-INST: vmax.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x1e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 1e <unknown>
+# CHECK-UNKNOWN: 1e4a0457 <unknown>
 
 vmax.vx v8, v4, a0, v0.t
 # CHECK-INST: vmax.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x1c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 1c <unknown>
+# CHECK-UNKNOWN: 1c454457 <unknown>
 
 vmax.vx v8, v4, a0
 # CHECK-INST: vmax.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x1e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 1e <unknown>
+# CHECK-UNKNOWN: 1e454457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/mul.s b/llvm/test/MC/RISCV/rvv/mul.s
index 9f7d6182d80f..2782ea683f9c 100644
--- a/llvm/test/MC/RISCV/rvv/mul.s
+++ b/llvm/test/MC/RISCV/rvv/mul.s
@@ -12,190 +12,190 @@ vmul.vv v8, v4, v20, v0.t
 # CHECK-INST: vmul.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x94]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 94 <unknown>
+# CHECK-UNKNOWN: 944a2457 <unknown>
 
 vmul.vv v8, v4, v20
 # CHECK-INST: vmul.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x96]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 96 <unknown>
+# CHECK-UNKNOWN: 964a2457 <unknown>
 
 vmul.vx v8, v4, a0, v0.t
 # CHECK-INST: vmul.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x94]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 94 <unknown>
+# CHECK-UNKNOWN: 94456457 <unknown>
 
 vmul.vx v8, v4, a0
 # CHECK-INST: vmul.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x96]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 96 <unknown>
+# CHECK-UNKNOWN: 96456457 <unknown>
 
 vmulh.vv v8, v4, v20, v0.t
 # CHECK-INST: vmulh.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x9c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 9c <unknown>
+# CHECK-UNKNOWN: 9c4a2457 <unknown>
 
 vmulh.vv v8, v4, v20
 # CHECK-INST: vmulh.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x9e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 9e <unknown>
+# CHECK-UNKNOWN: 9e4a2457 <unknown>
 
 vmulh.vx v8, v4, a0, v0.t
 # CHECK-INST: vmulh.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x9c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 9c <unknown>
+# CHECK-UNKNOWN: 9c456457 <unknown>
 
 vmulh.vx v8, v4, a0
 # CHECK-INST: vmulh.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x9e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 9e <unknown>
+# CHECK-UNKNOWN: 9e456457 <unknown>
 
 vmulhu.vv v8, v4, v20, v0.t
 # CHECK-INST: vmulhu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x90]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 90 <unknown>
+# CHECK-UNKNOWN: 904a2457 <unknown>
 
 vmulhu.vv v8, v4, v20
 # CHECK-INST: vmulhu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x92]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 92 <unknown>
+# CHECK-UNKNOWN: 924a2457 <unknown>
 
 vmulhu.vx v8, v4, a0, v0.t
 # CHECK-INST: vmulhu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x90]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 90 <unknown>
+# CHECK-UNKNOWN: 90456457 <unknown>
 
 vmulhu.vx v8, v4, a0
 # CHECK-INST: vmulhu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x92]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 92 <unknown>
+# CHECK-UNKNOWN: 92456457 <unknown>
 
 vmulhsu.vv v8, v4, v20, v0.t
 # CHECK-INST: vmulhsu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x98]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 98 <unknown>
+# CHECK-UNKNOWN: 984a2457 <unknown>
 
 vmulhsu.vv v8, v4, v20
 # CHECK-INST: vmulhsu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x9a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 9a <unknown>
+# CHECK-UNKNOWN: 9a4a2457 <unknown>
 
 vmulhsu.vx v8, v4, a0, v0.t
 # CHECK-INST: vmulhsu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x98]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 98 <unknown>
+# CHECK-UNKNOWN: 98456457 <unknown>
 
 vmulhsu.vx v8, v4, a0
 # CHECK-INST: vmulhsu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x9a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 9a <unknown>
+# CHECK-UNKNOWN: 9a456457 <unknown>
 
 vwmul.vv v8, v4, v20, v0.t
 # CHECK-INST: vwmul.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xec]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ec <unknown>
+# CHECK-UNKNOWN: ec4a2457 <unknown>
 
 vwmul.vv v8, v4, v20
 # CHECK-INST: vwmul.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xee]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ee <unknown>
+# CHECK-UNKNOWN: ee4a2457 <unknown>
 
 vwmul.vx v8, v4, a0, v0.t
 # CHECK-INST: vwmul.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xec]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ec <unknown>
+# CHECK-UNKNOWN: ec456457 <unknown>
 
 vwmul.vx v8, v4, a0
 # CHECK-INST: vwmul.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0xee]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ee <unknown>
+# CHECK-UNKNOWN: ee456457 <unknown>
 
 vwmulu.vv v8, v4, v20, v0.t
 # CHECK-INST: vwmulu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xe0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a e0 <unknown>
+# CHECK-UNKNOWN: e04a2457 <unknown>
 
 vwmulu.vv v8, v4, v20
 # CHECK-INST: vwmulu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a e2 <unknown>
+# CHECK-UNKNOWN: e24a2457 <unknown>
 
 vwmulu.vx v8, v4, a0, v0.t
 # CHECK-INST: vwmulu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xe0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 e0 <unknown>
+# CHECK-UNKNOWN: e0456457 <unknown>
 
 vwmulu.vx v8, v4, a0
 # CHECK-INST: vwmulu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 e2 <unknown>
+# CHECK-UNKNOWN: e2456457 <unknown>
 
 vwmulsu.vv v8, v4, v20, v0.t
 # CHECK-INST: vwmulsu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xe8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a e8 <unknown>
+# CHECK-UNKNOWN: e84a2457 <unknown>
 
 vwmulsu.vv v8, v4, v20
 # CHECK-INST: vwmulsu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xea]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ea <unknown>
+# CHECK-UNKNOWN: ea4a2457 <unknown>
 
 vwmulsu.vx v8, v4, a0, v0.t
 # CHECK-INST: vwmulsu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xe8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 e8 <unknown>
+# CHECK-UNKNOWN: e8456457 <unknown>
 
 vwmulsu.vx v8, v4, a0
 # CHECK-INST: vwmulsu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0xea]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ea <unknown>
+# CHECK-UNKNOWN: ea456457 <unknown>
 
 vsmul.vv v8, v4, v20, v0.t
 # CHECK-INST: vsmul.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x9c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 9c <unknown>
+# CHECK-UNKNOWN: 9c4a0457 <unknown>
 
 vsmul.vv v8, v4, v20
 # CHECK-INST: vsmul.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x9e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 9e <unknown>
+# CHECK-UNKNOWN: 9e4a0457 <unknown>
 
 vsmul.vx v8, v4, a0, v0.t
 # CHECK-INST: vsmul.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x9c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 9c <unknown>
+# CHECK-UNKNOWN: 9c454457 <unknown>
 
 vsmul.vx v8, v4, a0
 # CHECK-INST: vsmul.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x9e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 9e <unknown>
+# CHECK-UNKNOWN: 9e454457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/mv.s b/llvm/test/MC/RISCV/rvv/mv.s
index cd02aa94d18e..f96e14932ead 100644
--- a/llvm/test/MC/RISCV/rvv/mv.s
+++ b/llvm/test/MC/RISCV/rvv/mv.s
@@ -12,52 +12,52 @@ vmv.v.v v8, v20
 # CHECK-INST: vmv.v.v v8, v20
 # CHECK-ENCODING: [0x57,0x04,0x0a,0x5e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 0a 5e <unknown>
+# CHECK-UNKNOWN: 5e0a0457 <unknown>
 
 vmv.v.x v8, a0
 # CHECK-INST: vmv.v.x v8, a0
 # CHECK-ENCODING: [0x57,0x44,0x05,0x5e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 05 5e <unknown>
+# CHECK-UNKNOWN: 5e054457 <unknown>
 
 vmv.v.i v8, 15
 # CHECK-INST: vmv.v.i v8, 15
 # CHECK-ENCODING: [0x57,0xb4,0x07,0x5e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 07 5e <unknown>
+# CHECK-UNKNOWN: 5e07b457 <unknown>
 
 vmv.x.s a2, v4
 # CHECK-INST: vmv.x.s a2, v4
 # CHECK-ENCODING: [0x57,0x26,0x40,0x42]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 26 40 42 <unknown>
+# CHECK-UNKNOWN: 42402657 <unknown>
 
 vmv.s.x v8, a0
 # CHECK-INST: vmv.s.x v8, a0
 # CHECK-ENCODING: [0x57,0x64,0x05,0x42]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 05 42 <unknown>
+# CHECK-UNKNOWN: 42056457 <unknown>
 
 vmv1r.v v8, v4
 # CHECK-INST: vmv1r.v v8, v4
 # CHECK-ENCODING: [0x57,0x34,0x40,0x9e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 34 40 9e <unknown>
+# CHECK-UNKNOWN: 9e403457 <unknown>
 
 vmv2r.v v8, v4
 # CHECK-INST: vmv2r.v v8, v4
 # CHECK-ENCODING: [0x57,0xb4,0x40,0x9e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 40 9e <unknown>
+# CHECK-UNKNOWN: 9e40b457 <unknown>
 
 vmv4r.v v8, v4
 # CHECK-INST: vmv4r.v v8, v4
 # CHECK-ENCODING: [0x57,0xb4,0x41,0x9e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 41 9e <unknown>
+# CHECK-UNKNOWN: 9e41b457 <unknown>
 
 vmv8r.v v8, v24
 # CHECK-INST: vmv8r.v v8, v24
 # CHECK-ENCODING: [0x57,0xb4,0x83,0x9f]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 83 9f <unknown>
+# CHECK-UNKNOWN: 9f83b457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/or.s b/llvm/test/MC/RISCV/rvv/or.s
index ef281fe80bd4..306d7ae81442 100644
--- a/llvm/test/MC/RISCV/rvv/or.s
+++ b/llvm/test/MC/RISCV/rvv/or.s
@@ -12,34 +12,34 @@ vor.vv v8, v4, v20, v0.t
 # CHECK-INST: vor.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 28 <unknown>
+# CHECK-UNKNOWN: 284a0457 <unknown>
 
 vor.vv v8, v4, v20
 # CHECK-INST: vor.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 2a <unknown>
+# CHECK-UNKNOWN: 2a4a0457 <unknown>
 
 vor.vx v8, v4, a0, v0.t
 # CHECK-INST: vor.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 28 <unknown>
+# CHECK-UNKNOWN: 28454457 <unknown>
 
 vor.vx v8, v4, a0
 # CHECK-INST: vor.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 2a <unknown>
+# CHECK-UNKNOWN: 2a454457 <unknown>
 
 vor.vi v8, v4, 15, v0.t
 # CHECK-INST: vor.vi v8, v4, 15, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 28 <unknown>
+# CHECK-UNKNOWN: 2847b457 <unknown>
 
 vor.vi v8, v4, 15
 # CHECK-INST: vor.vi v8, v4, 15
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 2a <unknown>
+# CHECK-UNKNOWN: 2a47b457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/others.s b/llvm/test/MC/RISCV/rvv/others.s
index d1845e0bb238..cc16a8774b82 100644
--- a/llvm/test/MC/RISCV/rvv/others.s
+++ b/llvm/test/MC/RISCV/rvv/others.s
@@ -12,142 +12,142 @@ vmerge.vvm v8, v4, v20, v0
 # CHECK-INST: vmerge.vvm v8, v4, v20, v0
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x5c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 5c <unknown>
+# CHECK-UNKNOWN: 5c4a0457 <unknown>
 
 vmerge.vxm v8, v4, a0, v0
 # CHECK-INST: vmerge.vxm v8, v4, a0, v0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x5c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 5c <unknown>
+# CHECK-UNKNOWN: 5c454457 <unknown>
 
 vmerge.vim v8, v4, 15, v0
 # CHECK-INST: vmerge.vim v8, v4, 15, v0
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x5c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 5c <unknown>
+# CHECK-UNKNOWN: 5c47b457 <unknown>
 
 vslideup.vx v8, v4, a0, v0.t
 # CHECK-INST: vslideup.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x38]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 38 <unknown>
+# CHECK-UNKNOWN: 38454457 <unknown>
 
 vslideup.vx v8, v4, a0
 # CHECK-INST: vslideup.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x3a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 3a <unknown>
+# CHECK-UNKNOWN: 3a454457 <unknown>
 
 vslideup.vi v8, v4, 31, v0.t
 # CHECK-INST: vslideup.vi v8, v4, 31, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0x38]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 38 <unknown>
+# CHECK-UNKNOWN: 384fb457 <unknown>
 
 vslideup.vi v8, v4, 31
 # CHECK-INST: vslideup.vi v8, v4, 31
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0x3a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 3a <unknown>
+# CHECK-UNKNOWN: 3a4fb457 <unknown>
 
 vslidedown.vx v8, v4, a0, v0.t
 # CHECK-INST: vslidedown.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x3c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 3c <unknown>
+# CHECK-UNKNOWN: 3c454457 <unknown>
 
 vslidedown.vx v8, v4, a0
 # CHECK-INST: vslidedown.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x3e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 3e <unknown>
+# CHECK-UNKNOWN: 3e454457 <unknown>
 
 vslidedown.vi v8, v4, 31, v0.t
 # CHECK-INST: vslidedown.vi v8, v4, 31, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0x3c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 3c <unknown>
+# CHECK-UNKNOWN: 3c4fb457 <unknown>
 
 vslidedown.vi v8, v4, 31
 # CHECK-INST: vslidedown.vi v8, v4, 31
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0x3e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 3e <unknown>
+# CHECK-UNKNOWN: 3e4fb457 <unknown>
 
 vslide1up.vx v8, v4, a0, v0.t
 # CHECK-INST: vslide1up.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x38]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 38 <unknown>
+# CHECK-UNKNOWN: 38456457 <unknown>
 
 vslide1up.vx v8, v4, a0
 # CHECK-INST: vslide1up.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x3a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 3a <unknown>
+# CHECK-UNKNOWN: 3a456457 <unknown>
 
 vslide1down.vx v8, v4, a0, v0.t
 # CHECK-INST: vslide1down.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x3c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 3c <unknown>
+# CHECK-UNKNOWN: 3c456457 <unknown>
 
 vslide1down.vx v8, v4, a0
 # CHECK-INST: vslide1down.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x3e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 3e <unknown>
+# CHECK-UNKNOWN: 3e456457 <unknown>
 
 vrgather.vv v8, v4, v20, v0.t
 # CHECK-INST: vrgather.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x30]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 30 <unknown>
+# CHECK-UNKNOWN: 304a0457 <unknown>
 
 vrgather.vv v8, v4, v20
 # CHECK-INST: vrgather.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x32]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 32 <unknown>
+# CHECK-UNKNOWN: 324a0457 <unknown>
 
 vrgather.vx v8, v4, a0, v0.t
 # CHECK-INST: vrgather.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x30]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 30 <unknown>
+# CHECK-UNKNOWN: 30454457 <unknown>
 
 vrgather.vx v8, v4, a0
 # CHECK-INST: vrgather.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x32]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 32 <unknown>
+# CHECK-UNKNOWN: 32454457 <unknown>
 
 vrgather.vi v8, v4, 31, v0.t
 # CHECK-INST: vrgather.vi v8, v4, 31, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0x30]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 30 <unknown>
+# CHECK-UNKNOWN: 304fb457 <unknown>
 
 vrgather.vi v8, v4, 31
 # CHECK-INST: vrgather.vi v8, v4, 31
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0x32]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 32 <unknown>
+# CHECK-UNKNOWN: 324fb457 <unknown>
 
 vrgatherei16.vv v8, v4, v20, v0.t
 # CHECK-INST: vrgatherei16.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x38]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 38 <unknown>
+# CHECK-UNKNOWN: 384a0457 <unknown>
 
 vrgatherei16.vv v8, v4, v20
 # CHECK-INST: vrgatherei16.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x3a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 3a <unknown>
+# CHECK-UNKNOWN: 3a4a0457 <unknown>
 
 vcompress.vm v8, v4, v20
 # CHECK-INST: vcompress.vm v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x5e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 5e <unknown>
+# CHECK-UNKNOWN: 5e4a2457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/reduction.s b/llvm/test/MC/RISCV/rvv/reduction.s
index 2172589b7c38..006f54d9b545 100644
--- a/llvm/test/MC/RISCV/rvv/reduction.s
+++ b/llvm/test/MC/RISCV/rvv/reduction.s
@@ -12,124 +12,124 @@ vredsum.vs v8, v4, v20, v0.t
 # CHECK-INST: vredsum.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 00 <unknown>
+# CHECK-UNKNOWN: 004a2457 <unknown>
 
 vredsum.vs v8, v4, v20
 # CHECK-INST: vredsum.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 02 <unknown>
+# CHECK-UNKNOWN: 024a2457 <unknown>
 
 vredmaxu.vs v8, v4, v20, v0.t
 # CHECK-INST: vredmaxu.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x18]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 18 <unknown>
+# CHECK-UNKNOWN: 184a2457 <unknown>
 
 vredmaxu.vs v8, v4, v20
 # CHECK-INST: vredmaxu.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x1a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 1a <unknown>
+# CHECK-UNKNOWN: 1a4a2457 <unknown>
 
 vredmax.vs v8, v4, v20, v0.t
 # CHECK-INST: vredmax.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x1c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 1c <unknown>
+# CHECK-UNKNOWN: 1c4a2457 <unknown>
 
 vredmax.vs v8, v4, v20
 # CHECK-INST: vredmax.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x1e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 1e <unknown>
+# CHECK-UNKNOWN: 1e4a2457 <unknown>
 
 vredminu.vs v8, v4, v20, v0.t
 # CHECK-INST: vredminu.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x10]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 10 <unknown>
+# CHECK-UNKNOWN: 104a2457 <unknown>
 
 vredminu.vs v8, v4, v20
 # CHECK-INST: vredminu.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x12]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 12 <unknown>
+# CHECK-UNKNOWN: 124a2457 <unknown>
 
 vredmin.vs v8, v4, v20, v0.t
 # CHECK-INST: vredmin.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x14]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 14 <unknown>
+# CHECK-UNKNOWN: 144a2457 <unknown>
 
 vredmin.vs v8, v4, v20
 # CHECK-INST: vredmin.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x16]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 16 <unknown>
+# CHECK-UNKNOWN: 164a2457 <unknown>
 
 vredand.vs v8, v4, v20, v0.t
 # CHECK-INST: vredand.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x04]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 04 <unknown>
+# CHECK-UNKNOWN: 044a2457 <unknown>
 
 vredand.vs v8, v4, v20
 # CHECK-INST: vredand.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x06]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 06 <unknown>
+# CHECK-UNKNOWN: 064a2457 <unknown>
 
 vredor.vs v8, v4, v20, v0.t
 # CHECK-INST: vredor.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x08]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 08 <unknown>
+# CHECK-UNKNOWN: 084a2457 <unknown>
 
 vredor.vs v8, v4, v20
 # CHECK-INST: vredor.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x0a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 0a <unknown>
+# CHECK-UNKNOWN: 0a4a2457 <unknown>
 
 vredxor.vs v8, v4, v20, v0.t
 # CHECK-INST: vredxor.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 0c <unknown>
+# CHECK-UNKNOWN: 0c4a2457 <unknown>
 
 vredxor.vs v8, v4, v20
 # CHECK-INST: vredxor.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x0e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 0e <unknown>
+# CHECK-UNKNOWN: 0e4a2457 <unknown>
 
 vwredsumu.vs v8, v4, v20, v0.t
 # CHECK-INST: vwredsumu.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a c0 <unknown>
+# CHECK-UNKNOWN: c04a0457 <unknown>
 
 vwredsumu.vs v8, v4, v20
 # CHECK-INST: vwredsumu.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a c2 <unknown>
+# CHECK-UNKNOWN: c24a0457 <unknown>
 
 vwredsum.vs v8, v4, v20, v0.t
 # CHECK-INST: vwredsum.vs v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xc4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a c4 <unknown>
+# CHECK-UNKNOWN: c44a0457 <unknown>
 
 vwredsum.vs v8, v4, v20
 # CHECK-INST: vwredsum.vs v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xc6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a c6 <unknown>
+# CHECK-UNKNOWN: c64a0457 <unknown>
 
 vredsum.vs v0, v4, v20, v0.t
 # CHECK-INST: vredsum.vs v0, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x20,0x4a,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 20 4a 00 <unknown>
+# CHECK-UNKNOWN: 004a2057 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/shift.s b/llvm/test/MC/RISCV/rvv/shift.s
index 8a2e82f3c8fe..017e12499dc2 100644
--- a/llvm/test/MC/RISCV/rvv/shift.s
+++ b/llvm/test/MC/RISCV/rvv/shift.s
@@ -12,256 +12,256 @@ vsll.vv v8, v4, v20, v0.t
 # CHECK-INST: vsll.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x94]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 94 <unknown>
+# CHECK-UNKNOWN: 944a0457 <unknown>
 
 vsll.vv v8, v4, v20
 # CHECK-INST: vsll.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x96]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 96 <unknown>
+# CHECK-UNKNOWN: 964a0457 <unknown>
 
 vsll.vx v8, v4, a0, v0.t
 # CHECK-INST: vsll.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x94]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 94 <unknown>
+# CHECK-UNKNOWN: 94454457 <unknown>
 
 vsll.vx v8, v4, a0
 # CHECK-INST: vsll.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x96]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 96 <unknown>
+# CHECK-UNKNOWN: 96454457 <unknown>
 
 vsll.vi v8, v4, 31, v0.t
 # CHECK-INST: vsll.vi v8, v4, 31, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0x94]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 94 <unknown>
+# CHECK-UNKNOWN: 944fb457 <unknown>
 
 vsll.vi v8, v4, 31
 # CHECK-INST: vsll.vi v8, v4, 31
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0x96]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 96 <unknown>
+# CHECK-UNKNOWN: 964fb457 <unknown>
 
 vsrl.vv v8, v4, v20, v0.t
 # CHECK-INST: vsrl.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xa0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a a0 <unknown>
+# CHECK-UNKNOWN: a04a0457 <unknown>
 
 vsrl.vv v8, v4, v20
 # CHECK-INST: vsrl.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xa2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a a2 <unknown>
+# CHECK-UNKNOWN: a24a0457 <unknown>
 
 vsrl.vx v8, v4, a0, v0.t
 # CHECK-INST: vsrl.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0xa0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 a0 <unknown>
+# CHECK-UNKNOWN: a0454457 <unknown>
 
 vsrl.vx v8, v4, a0
 # CHECK-INST: vsrl.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0xa2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 a2 <unknown>
+# CHECK-UNKNOWN: a2454457 <unknown>
 
 vsrl.vi v8, v4, 31, v0.t
 # CHECK-INST: vsrl.vi v8, v4, 31, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xa0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f a0 <unknown>
+# CHECK-UNKNOWN: a04fb457 <unknown>
 
 vsrl.vi v8, v4, 31
 # CHECK-INST: vsrl.vi v8, v4, 31
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xa2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f a2 <unknown>
+# CHECK-UNKNOWN: a24fb457 <unknown>
 
 vsra.vv v8, v4, v20, v0.t
 # CHECK-INST: vsra.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a a4 <unknown>
+# CHECK-UNKNOWN: a44a0457 <unknown>
 
 vsra.vv v8, v4, v20
 # CHECK-INST: vsra.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a a6 <unknown>
+# CHECK-UNKNOWN: a64a0457 <unknown>
 
 vsra.vx v8, v4, a0, v0.t
 # CHECK-INST: vsra.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 a4 <unknown>
+# CHECK-UNKNOWN: a4454457 <unknown>
 
 vsra.vx v8, v4, a0
 # CHECK-INST: vsra.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 a6 <unknown>
+# CHECK-UNKNOWN: a6454457 <unknown>
 
 vsra.vi v8, v4, 31, v0.t
 # CHECK-INST: vsra.vi v8, v4, 31, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f a4 <unknown>
+# CHECK-UNKNOWN: a44fb457 <unknown>
 
 vsra.vi v8, v4, 31
 # CHECK-INST: vsra.vi v8, v4, 31
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f a6 <unknown>
+# CHECK-UNKNOWN: a64fb457 <unknown>
 
 vnsrl.wv v8, v4, v20, v0.t
 # CHECK-INST: vnsrl.wv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xb0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a b0 <unknown>
+# CHECK-UNKNOWN: b04a0457 <unknown>
 
 vnsrl.wv v4, v4, v20, v0.t
 # CHECK-INST: vnsrl.wv v4, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x02,0x4a,0xb0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 02 4a b0 <unknown>
+# CHECK-UNKNOWN: b04a0257 <unknown>
 
 vnsrl.wv v8, v4, v20
 # CHECK-INST: vnsrl.wv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xb2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a b2 <unknown>
+# CHECK-UNKNOWN: b24a0457 <unknown>
 
 vnsrl.wx v8, v4, a0, v0.t
 # CHECK-INST: vnsrl.wx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0xb0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 b0 <unknown>
+# CHECK-UNKNOWN: b0454457 <unknown>
 
 vnsrl.wx v8, v4, a0
 # CHECK-INST: vnsrl.wx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0xb2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 b2 <unknown>
+# CHECK-UNKNOWN: b2454457 <unknown>
 
 vnsrl.wi v8, v4, 31, v0.t
 # CHECK-INST: vnsrl.wi v8, v4, 31, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xb0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f b0 <unknown>
+# CHECK-UNKNOWN: b04fb457 <unknown>
 
 vnsrl.wi v8, v4, 31
 # CHECK-INST: vnsrl.wi v8, v4, 31
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xb2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f b2 <unknown>
+# CHECK-UNKNOWN: b24fb457 <unknown>
 
 vnsra.wv v8, v4, v20, v0.t
 # CHECK-INST: vnsra.wv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xb4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a b4 <unknown>
+# CHECK-UNKNOWN: b44a0457 <unknown>
 
 vnsra.wv v8, v4, v20
 # CHECK-INST: vnsra.wv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xb6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a b6 <unknown>
+# CHECK-UNKNOWN: b64a0457 <unknown>
 
 vnsra.wx v8, v4, a0, v0.t
 # CHECK-INST: vnsra.wx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0xb4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 b4 <unknown>
+# CHECK-UNKNOWN: b4454457 <unknown>
 
 vnsra.wx v8, v4, a0
 # CHECK-INST: vnsra.wx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0xb6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 b6 <unknown>
+# CHECK-UNKNOWN: b6454457 <unknown>
 
 vnsra.wi v8, v4, 31, v0.t
 # CHECK-INST: vnsra.wi v8, v4, 31, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xb4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f b4 <unknown>
+# CHECK-UNKNOWN: b44fb457 <unknown>
 
 vnsra.wi v8, v4, 31
 # CHECK-INST: vnsra.wi v8, v4, 31
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xb6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f b6 <unknown>
+# CHECK-UNKNOWN: b64fb457 <unknown>
 
 vssrl.vv v8, v4, v20, v0.t
 # CHECK-INST: vssrl.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xa8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a a8 <unknown>
+# CHECK-UNKNOWN: a84a0457 <unknown>
 
 vssrl.vv v8, v4, v20
 # CHECK-INST: vssrl.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xaa]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a aa <unknown>
+# CHECK-UNKNOWN: aa4a0457 <unknown>
 
 vssrl.vx v8, v4, a0, v0.t
 # CHECK-INST: vssrl.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0xa8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 a8 <unknown>
+# CHECK-UNKNOWN: a8454457 <unknown>
 
 vssrl.vx v8, v4, a0
 # CHECK-INST: vssrl.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0xaa]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 aa <unknown>
+# CHECK-UNKNOWN: aa454457 <unknown>
 
 vssrl.vi v8, v4, 31, v0.t
 # CHECK-INST: vssrl.vi v8, v4, 31, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xa8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f a8 <unknown>
+# CHECK-UNKNOWN: a84fb457 <unknown>
 
 vssrl.vi v8, v4, 31
 # CHECK-INST: vssrl.vi v8, v4, 31
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xaa]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f aa <unknown>
+# CHECK-UNKNOWN: aa4fb457 <unknown>
 
 vssra.vv v8, v4, v20, v0.t
 # CHECK-INST: vssra.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xac]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a ac <unknown>
+# CHECK-UNKNOWN: ac4a0457 <unknown>
 
 vssra.vv v8, v4, v20
 # CHECK-INST: vssra.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0xae]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a ae <unknown>
+# CHECK-UNKNOWN: ae4a0457 <unknown>
 
 vssra.vx v8, v4, a0, v0.t
 # CHECK-INST: vssra.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0xac]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 ac <unknown>
+# CHECK-UNKNOWN: ac454457 <unknown>
 
 vssra.vx v8, v4, a0
 # CHECK-INST: vssra.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0xae]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 ae <unknown>
+# CHECK-UNKNOWN: ae454457 <unknown>
 
 vssra.vi v8, v4, 31, v0.t
 # CHECK-INST: vssra.vi v8, v4, 31, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xac]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f ac <unknown>
+# CHECK-UNKNOWN: ac4fb457 <unknown>
 
 vssra.vi v8, v4, 31
 # CHECK-INST: vssra.vi v8, v4, 31
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0xae]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f ae <unknown>
+# CHECK-UNKNOWN: ae4fb457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/sign-injection.s b/llvm/test/MC/RISCV/rvv/sign-injection.s
index 96d37c42d208..23e9be868a42 100644
--- a/llvm/test/MC/RISCV/rvv/sign-injection.s
+++ b/llvm/test/MC/RISCV/rvv/sign-injection.s
@@ -15,70 +15,70 @@ vfsgnj.vv v8, v4, v20, v0.t
 # CHECK-INST: vfsgnj.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x20]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 20 <unknown>
+# CHECK-UNKNOWN: 204a1457 <unknown>
 
 vfsgnj.vv v8, v4, v20
 # CHECK-INST: vfsgnj.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x22]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 22 <unknown>
+# CHECK-UNKNOWN: 224a1457 <unknown>
 
 vfsgnj.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfsgnj.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x20]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 20 <unknown>
+# CHECK-UNKNOWN: 20455457 <unknown>
 
 vfsgnj.vf v8, v4, fa0
 # CHECK-INST: vfsgnj.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x22]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 22 <unknown>
+# CHECK-UNKNOWN: 22455457 <unknown>
 
 vfsgnjn.vv v8, v4, v20, v0.t
 # CHECK-INST: vfsgnjn.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x24]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 24 <unknown>
+# CHECK-UNKNOWN: 244a1457 <unknown>
 
 vfsgnjn.vv v8, v4, v20
 # CHECK-INST: vfsgnjn.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x26]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 26 <unknown>
+# CHECK-UNKNOWN: 264a1457 <unknown>
 
 vfsgnjn.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfsgnjn.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x24]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 24 <unknown>
+# CHECK-UNKNOWN: 24455457 <unknown>
 
 vfsgnjn.vf v8, v4, fa0
 # CHECK-INST: vfsgnjn.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x26]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 26 <unknown>
+# CHECK-UNKNOWN: 26455457 <unknown>
 
 vfsgnjx.vv v8, v4, v20, v0.t
 # CHECK-INST: vfsgnjx.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x28]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 28 <unknown>
+# CHECK-UNKNOWN: 284a1457 <unknown>
 
 vfsgnjx.vv v8, v4, v20
 # CHECK-INST: vfsgnjx.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x14,0x4a,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 2a <unknown>
+# CHECK-UNKNOWN: 2a4a1457 <unknown>
 
 vfsgnjx.vf v8, v4, fa0, v0.t
 # CHECK-INST: vfsgnjx.vf v8, v4, fa0, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0x28]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 28 <unknown>
+# CHECK-UNKNOWN: 28455457 <unknown>
 
 vfsgnjx.vf v8, v4, fa0
 # CHECK-INST: vfsgnjx.vf v8, v4, fa0
 # CHECK-ENCODING: [0x57,0x54,0x45,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 2a <unknown>
+# CHECK-UNKNOWN: 2a455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/snippet.s b/llvm/test/MC/RISCV/rvv/snippet.s
index c032e468dede..c3e57e193f84 100644
--- a/llvm/test/MC/RISCV/rvv/snippet.s
+++ b/llvm/test/MC/RISCV/rvv/snippet.s
@@ -6,27 +6,27 @@
 
 loop:
     vsetvli a3, a0, e16,m4,ta,ma  # vtype = 16-bit integer vectors
-# CHECK-INST: d7 76 a5 0c    vsetvli a3, a0, e16, m4, ta, ma
+# CHECK-INST: 0ca576d7 vsetvli a3, a0, e16, m4, ta, ma
     vle16.v v4, (a1)              # Get 16b vector
-# CHECK-INST: 07 d2 05 02    vle16.v   v4, (a1)
+# CHECK-INST: 0205d207 vle16.v v4, (a1)
     slli t1, a3, 1                # Multiply length by two bytes/element
-# CHECK-INST: 13 93 16 00    slli    t1, a3, 0x1
+# CHECK-INST: 00169313 slli t1, a3, 0x1
     add a1, a1, t1                # Bump pointer
-# CHECK-INST: b3 85 65 00    add     a1, a1, t1
+# CHECK-INST: 006585b3 add a1, a1, t1
     vwmul.vx v8, v4, x10          # 32b in <v8--v15>
-# CHECK-INST: 57 64 45 ee    vwmul.vx        v8, v4, a0
+# CHECK-INST: ee456457 vwmul.vx v8, v4, a0
 
     vsetvli x0, a0, e32,m8,ta,ma  # Operate on 32b values
-# CHECK-INST: 57 70 35 0d    vsetvli zero, a0, e32, m8, ta, ma
+# CHECK-INST: 0d357057 vsetvli zero, a0, e32, m8, ta, ma
     vsrl.vi v8, v8, 3
-# CHECK-INST: 57 b4 81 a2    vsrl.vi v8, v8, 0x3
+# CHECK-INST: a281b457 vsrl.vi v8, v8, 0x3
     vse32.v v8, (a2)              # Store vector of 32b
-# CHECK-INST: 27 64 06 02    vse32.v   v8, (a2)
+# CHECK-INST: 02066427 vse32.v v8, (a2)
     slli t1, a3, 2                # Multiply length by four bytes/element
-# CHECK-INST: 13 93 26 00    slli    t1, a3, 0x2
+# CHECK-INST: 00269313 slli t1, a3, 0x2
     add a2, a2, t1                # Bump pointer
-# CHECK-INST: 33 06 66 00    add     a2, a2, t1
+# CHECK-INST: 00660633 add a2, a2, t1
     sub a0, a0, a3                # Decrement count
-# CHECK-INST: 33 05 d5 40    sub     a0, a0, a3
+# CHECK-INST: 40d50533 sub a0, a0, a3
     bnez a0, loop                 # Any more?
-# CHECK-INST: e3 1a 05 fc    bnez    a0, 0x0
+# CHECK-INST: fc051ae3 bnez a0, 0x0
diff --git a/llvm/test/MC/RISCV/rvv/store.s b/llvm/test/MC/RISCV/rvv/store.s
index a38f19f266fa..c6a34705fa4a 100644
--- a/llvm/test/MC/RISCV/rvv/store.s
+++ b/llvm/test/MC/RISCV/rvv/store.s
@@ -12,250 +12,250 @@ vsm.v v24, (a0)
 # CHECK-INST: vsm.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 02 <unknown>
+# CHECK-UNKNOWN: 02b50c27 <unknown>
 
 vse8.v v24, (a0), v0.t
 # CHECK-INST: vse8.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x05,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 00 <unknown>
+# CHECK-UNKNOWN: 00050c27 <unknown>
 
 vse8.v v24, (a0)
 # CHECK-INST: vse8.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0x05,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 02 <unknown>
+# CHECK-UNKNOWN: 02050c27 <unknown>
 
 vse16.v v24, (a0), v0.t
 # CHECK-INST: vse16.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x05,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 00 <unknown>
+# CHECK-UNKNOWN: 00055c27 <unknown>
 
 vse16.v v24, (a0)
 # CHECK-INST: vse16.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x5c,0x05,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 02 <unknown>
+# CHECK-UNKNOWN: 02055c27 <unknown>
 
 vse32.v v24, (a0), v0.t
 # CHECK-INST: vse32.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x05,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 00 <unknown>
+# CHECK-UNKNOWN: 00056c27 <unknown>
 
 vse32.v v24, (a0)
 # CHECK-INST: vse32.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x6c,0x05,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 02 <unknown>
+# CHECK-UNKNOWN: 02056c27 <unknown>
 
 vse64.v v24, (a0), v0.t
 # CHECK-INST: vse64.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x05,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 00 <unknown>
+# CHECK-UNKNOWN: 00057c27 <unknown>
 
 vse64.v v24, (a0)
 # CHECK-INST: vse64.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x7c,0x05,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 02 <unknown>
+# CHECK-UNKNOWN: 02057c27 <unknown>
 
 vsse8.v v24, (a0), a1, v0.t
 # CHECK-INST: vsse8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x08]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 08 <unknown>
+# CHECK-UNKNOWN: 08b50c27 <unknown>
 
 vsse8.v v24, (a0), a1
 # CHECK-INST: vsse8.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x0a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab50c27 <unknown>
 
 vsse16.v v24, (a0), a1, v0.t
 # CHECK-INST: vsse16.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0x08]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 08 <unknown>
+# CHECK-UNKNOWN: 08b55c27 <unknown>
 
 vsse16.v v24, (a0), a1
 # CHECK-INST: vsse16.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0x0a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab55c27 <unknown>
 
 vsse32.v v24, (a0), a1, v0.t
 # CHECK-INST: vsse32.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0x08]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 08 <unknown>
+# CHECK-UNKNOWN: 08b56c27 <unknown>
 
 vsse32.v v24, (a0), a1
 # CHECK-INST: vsse32.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0x0a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab56c27 <unknown>
 
 vsse64.v v24, (a0), a1, v0.t
 # CHECK-INST: vsse64.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0x08]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 08 <unknown>
+# CHECK-UNKNOWN: 08b57c27 <unknown>
 
 vsse64.v v24, (a0), a1
 # CHECK-INST: vsse64.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0x0a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab57c27 <unknown>
 
 vsuxei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x04]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 04 <unknown>
+# CHECK-UNKNOWN: 04450c27 <unknown>
 
 vsuxei8.v v24, (a0), v4
 # CHECK-INST: vsuxei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x06]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 06 <unknown>
+# CHECK-UNKNOWN: 06450c27 <unknown>
 
 vsuxei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x04]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 04 <unknown>
+# CHECK-UNKNOWN: 04455c27 <unknown>
 
 vsuxei16.v v24, (a0), v4
 # CHECK-INST: vsuxei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x06]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 06 <unknown>
+# CHECK-UNKNOWN: 06455c27 <unknown>
 
 vsuxei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x04]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 04 <unknown>
+# CHECK-UNKNOWN: 04456c27 <unknown>
 
 vsuxei32.v v24, (a0), v4
 # CHECK-INST: vsuxei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x06]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 06 <unknown>
+# CHECK-UNKNOWN: 06456c27 <unknown>
 
 vsuxei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x04]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 04 <unknown>
+# CHECK-UNKNOWN: 04457c27 <unknown>
 
 vsuxei64.v v24, (a0), v4
 # CHECK-INST: vsuxei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x06]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 06 <unknown>
+# CHECK-UNKNOWN: 06457c27 <unknown>
 
 vsoxei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 0c <unknown>
+# CHECK-UNKNOWN: 0c450c27 <unknown>
 
 vsoxei8.v v24, (a0), v4
 # CHECK-INST: vsoxei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x0e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 0e <unknown>
+# CHECK-UNKNOWN: 0e450c27 <unknown>
 
 vsoxei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 0c <unknown>
+# CHECK-UNKNOWN: 0c455c27 <unknown>
 
 vsoxei16.v v24, (a0), v4
 # CHECK-INST: vsoxei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x0e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 0e <unknown>
+# CHECK-UNKNOWN: 0e455c27 <unknown>
 
 vsoxei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 0c <unknown>
+# CHECK-UNKNOWN: 0c456c27 <unknown>
 
 vsoxei32.v v24, (a0), v4
 # CHECK-INST: vsoxei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x0e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 0e <unknown>
+# CHECK-UNKNOWN: 0e456c27 <unknown>
 
 vsoxei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 0c <unknown>
+# CHECK-UNKNOWN: 0c457c27 <unknown>
 
 vsoxei64.v v24, (a0), v4
 # CHECK-INST: vsoxei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x0e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 0e <unknown>
+# CHECK-UNKNOWN: 0e457c27 <unknown>
 
 vs1r.v v24, (a0)
 # CHECK-INST: vs1r.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0x85,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 85 02 <unknown>
+# CHECK-UNKNOWN: 02850c27 <unknown>
 
 vs2r.v v24, (a0)
 # CHECK-INST: vs2r.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0x85,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 85 22 <unknown>
+# CHECK-UNKNOWN: 22850c27 <unknown>
 
 vs4r.v v24, (a0)
 # CHECK-INST: vs4r.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0x85,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 85 62 <unknown>
+# CHECK-UNKNOWN: 62850c27 <unknown>
 
 vs8r.v v24, (a0)
 # CHECK-INST: vs8r.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0x85,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 85 e2 <unknown>
+# CHECK-UNKNOWN: e2850c27 <unknown>
 
 vsm.v v24, 0(a0)
 # CHECK-INST: vsm.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 02 <unknown>
+# CHECK-UNKNOWN: 02b50c27 <unknown>
 
 vse8.v v24, 0(a0), v0.t
 # CHECK-INST: vse8.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x05,0x00]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 00 <unknown>
+# CHECK-UNKNOWN: 00050c27 <unknown>
 
 vsse16.v v24, 0(a0), a1, v0.t
 # CHECK-INST: vsse16.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0x08]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 08 <unknown>
+# CHECK-UNKNOWN: 08b55c27 <unknown>
 
 vsuxei8.v v24, 0(a0), v4, v0.t
 # CHECK-INST: vsuxei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x04]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 04 <unknown>
+# CHECK-UNKNOWN: 04450c27 <unknown>
 
 vsoxei32.v v24, 0(a0), v4, v0.t
 # CHECK-INST: vsoxei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 0c <unknown>
+# CHECK-UNKNOWN: 0c456c27 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/sub.s b/llvm/test/MC/RISCV/rvv/sub.s
index 3cc75ae730e7..6a637d9207ee 100644
--- a/llvm/test/MC/RISCV/rvv/sub.s
+++ b/llvm/test/MC/RISCV/rvv/sub.s
@@ -12,298 +12,298 @@ vsub.vv v8, v4, v20, v0.t
 # CHECK-INST: vsub.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x08]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 08 <unknown>
+# CHECK-UNKNOWN: 084a0457 <unknown>
 
 vsub.vv v8, v4, v20
 # CHECK-INST: vsub.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x0a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 0a <unknown>
+# CHECK-UNKNOWN: 0a4a0457 <unknown>
 
 vsub.vx v8, v4, a0, v0.t
 # CHECK-INST: vsub.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x08]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 08 <unknown>
+# CHECK-UNKNOWN: 08454457 <unknown>
 
 vsub.vx v8, v4, a0
 # CHECK-INST: vsub.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x0a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 0a <unknown>
+# CHECK-UNKNOWN: 0a454457 <unknown>
 
 vrsub.vx v8, v4, a0, v0.t
 # CHECK-INST: vrsub.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 0c <unknown>
+# CHECK-UNKNOWN: 0c454457 <unknown>
 
 vrsub.vx v8, v4, a0
 # CHECK-INST: vrsub.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x0e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 0e <unknown>
+# CHECK-UNKNOWN: 0e454457 <unknown>
 
 vrsub.vi v8, v4, 15, v0.t
 # CHECK-INST: vrsub.vi v8, v4, 15, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x0c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 0c <unknown>
+# CHECK-UNKNOWN: 0c47b457 <unknown>
 
 vrsub.vi v8, v4, 15
 # CHECK-INST: vrsub.vi v8, v4, 15
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x0e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 0e <unknown>
+# CHECK-UNKNOWN: 0e47b457 <unknown>
 
 vwsubu.vv v8, v4, v20, v0.t
 # CHECK-INST: vwsubu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xc8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a c8 <unknown>
+# CHECK-UNKNOWN: c84a2457 <unknown>
 
 vwsubu.vv v8, v4, v20
 # CHECK-INST: vwsubu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xca]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ca <unknown>
+# CHECK-UNKNOWN: ca4a2457 <unknown>
 
 vwsubu.vx v8, v4, a0, v0.t
 # CHECK-INST: vwsubu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xc8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 c8 <unknown>
+# CHECK-UNKNOWN: c8456457 <unknown>
 
 vwsubu.vx v8, v4, a0
 # CHECK-INST: vwsubu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0xca]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ca <unknown>
+# CHECK-UNKNOWN: ca456457 <unknown>
 
 vwsub.vv v8, v4, v20, v0.t
 # CHECK-INST: vwsub.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xcc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a cc <unknown>
+# CHECK-UNKNOWN: cc4a2457 <unknown>
 
 vwsub.vv v8, v4, v20
 # CHECK-INST: vwsub.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xce]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ce <unknown>
+# CHECK-UNKNOWN: ce4a2457 <unknown>
 
 vwsub.vx v8, v4, a0, v0.t
 # CHECK-INST: vwsub.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xcc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 cc <unknown>
+# CHECK-UNKNOWN: cc456457 <unknown>
 
 vwsub.vx v8, v4, a0
 # CHECK-INST: vwsub.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0xce]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ce <unknown>
+# CHECK-UNKNOWN: ce456457 <unknown>
 
 vwsubu.wv v8, v4, v20, v0.t
 # CHECK-INST: vwsubu.wv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xd8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a d8 <unknown>
+# CHECK-UNKNOWN: d84a2457 <unknown>
 
 vwsubu.wv v8, v4, v20
 # CHECK-INST: vwsubu.wv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xda]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a da <unknown>
+# CHECK-UNKNOWN: da4a2457 <unknown>
 
 vwsubu.wx v8, v4, a0, v0.t
 # CHECK-INST: vwsubu.wx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xd8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 d8 <unknown>
+# CHECK-UNKNOWN: d8456457 <unknown>
 
 vwsubu.wx v8, v4, a0
 # CHECK-INST: vwsubu.wx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0xda]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 da <unknown>
+# CHECK-UNKNOWN: da456457 <unknown>
 
 vwsub.wv v8, v4, v20, v0.t
 # CHECK-INST: vwsub.wv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xdc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a dc <unknown>
+# CHECK-UNKNOWN: dc4a2457 <unknown>
 
 vwsub.wv v8, v4, v20
 # CHECK-INST: vwsub.wv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0xde]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a de <unknown>
+# CHECK-UNKNOWN: de4a2457 <unknown>
 
 vwsub.wx v8, v4, a0, v0.t
 # CHECK-INST: vwsub.wx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0xdc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 dc <unknown>
+# CHECK-UNKNOWN: dc456457 <unknown>
 
 vwsub.wx v8, v4, a0
 # CHECK-INST: vwsub.wx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0xde]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 de <unknown>
+# CHECK-UNKNOWN: de456457 <unknown>
 
 vsbc.vvm v8, v4, v20, v0
 # CHECK-INST: vsbc.vvm v8, v4, v20, v0
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 48 <unknown>
+# CHECK-UNKNOWN: 484a0457 <unknown>
 
 vsbc.vvm v4, v4, v20, v0
 # CHECK-INST: vsbc.vvm v4, v4, v20, v0
 # CHECK-ENCODING: [0x57,0x02,0x4a,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 02 4a 48 <unknown>
+# CHECK-UNKNOWN: 484a0257 <unknown>
 
 vsbc.vvm v8, v4, v8, v0
 # CHECK-INST: vsbc.vvm v8, v4, v8, v0
 # CHECK-ENCODING: [0x57,0x04,0x44,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 44 48 <unknown>
+# CHECK-UNKNOWN: 48440457 <unknown>
 
 vsbc.vxm v8, v4, a0, v0
 # CHECK-INST: vsbc.vxm v8, v4, a0, v0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 48 <unknown>
+# CHECK-UNKNOWN: 48454457 <unknown>
 
 vmsbc.vvm v8, v4, v20, v0
 # CHECK-INST: vmsbc.vvm v8, v4, v20, v0
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 4c <unknown>
+# CHECK-UNKNOWN: 4c4a0457 <unknown>
 
 vmsbc.vvm v4, v4, v20, v0
 # CHECK-INST: vmsbc.vvm v4, v4, v20, v0
 # CHECK-ENCODING: [0x57,0x02,0x4a,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 02 4a 4c <unknown>
+# CHECK-UNKNOWN: 4c4a0257 <unknown>
 
 vmsbc.vvm v8, v4, v8, v0
 # CHECK-INST: vmsbc.vvm v8, v4, v8, v0
 # CHECK-ENCODING: [0x57,0x04,0x44,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 44 4c <unknown>
+# CHECK-UNKNOWN: 4c440457 <unknown>
 
 vmsbc.vxm v8, v4, a0, v0
 # CHECK-INST: vmsbc.vxm v8, v4, a0, v0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 4c <unknown>
+# CHECK-UNKNOWN: 4c454457 <unknown>
 
 vmsbc.vv v8, v4, v20
 # CHECK-INST: vmsbc.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 4e <unknown>
+# CHECK-UNKNOWN: 4e4a0457 <unknown>
 
 vmsbc.vx v8, v4, a0
 # CHECK-INST: vmsbc.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 4e <unknown>
+# CHECK-UNKNOWN: 4e454457 <unknown>
 
 vssubu.vv v8, v4, v20, v0.t
 # CHECK-INST: vssubu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x88]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 88 <unknown>
+# CHECK-UNKNOWN: 884a0457 <unknown>
 
 vssubu.vv v8, v4, v20
 # CHECK-INST: vssubu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x8a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 8a <unknown>
+# CHECK-UNKNOWN: 8a4a0457 <unknown>
 
 vssubu.vx v8, v4, a0, v0.t
 # CHECK-INST: vssubu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x88]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 88 <unknown>
+# CHECK-UNKNOWN: 88454457 <unknown>
 
 vssubu.vx v8, v4, a0
 # CHECK-INST: vssubu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x8a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 8a <unknown>
+# CHECK-UNKNOWN: 8a454457 <unknown>
 
 vssub.vv v8, v4, v20, v0.t
 # CHECK-INST: vssub.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x8c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 8c <unknown>
+# CHECK-UNKNOWN: 8c4a0457 <unknown>
 
 vssub.vv v8, v4, v20
 # CHECK-INST: vssub.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x8e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 8e <unknown>
+# CHECK-UNKNOWN: 8e4a0457 <unknown>
 
 vssub.vx v8, v4, a0, v0.t
 # CHECK-INST: vssub.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x8c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 8c <unknown>
+# CHECK-UNKNOWN: 8c454457 <unknown>
 
 vssub.vx v8, v4, a0
 # CHECK-INST: vssub.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x8e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 8e <unknown>
+# CHECK-UNKNOWN: 8e454457 <unknown>
 
 vasub.vv v8, v4, v20, v0.t
 # CHECK-INST: vasub.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 2c <unknown>
+# CHECK-UNKNOWN: 2c4a2457 <unknown>
 
 vasub.vv v8, v4, v20
 # CHECK-INST: vasub.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 2e <unknown>
+# CHECK-UNKNOWN: 2e4a2457 <unknown>
 
 vasub.vx v8, v4, a0, v0.t
 # CHECK-INST: vasub.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 2c <unknown>
+# CHECK-UNKNOWN: 2c456457 <unknown>
 
 vasub.vx v8, v4, a0
 # CHECK-INST: vasub.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 2e <unknown>
+# CHECK-UNKNOWN: 2e456457 <unknown>
 
 vasubu.vv v8, v4, v20, v0.t
 # CHECK-INST: vasubu.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 28 <unknown>
+# CHECK-UNKNOWN: 284a2457 <unknown>
 
 vasubu.vv v8, v4, v20
 # CHECK-INST: vasubu.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x24,0x4a,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 2a <unknown>
+# CHECK-UNKNOWN: 2a4a2457 <unknown>
 
 vasubu.vx v8, v4, a0, v0.t
 # CHECK-INST: vasubu.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x64,0x45,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 28 <unknown>
+# CHECK-UNKNOWN: 28456457 <unknown>
 
 vasubu.vx v8, v4, a0
 # CHECK-INST: vasubu.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x64,0x45,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 2a <unknown>
+# CHECK-UNKNOWN: 2a456457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/vsetvl.s b/llvm/test/MC/RISCV/rvv/vsetvl.s
index 69a48d24f190..c9197d8917a4 100644
--- a/llvm/test/MC/RISCV/rvv/vsetvl.s
+++ b/llvm/test/MC/RISCV/rvv/vsetvl.s
@@ -13,149 +13,149 @@ vsetvli a2, a0, 0x224
 # CHECK-INST: vsetvli a2, a0, 548
 # CHECK-ENCODING: [0x57,0x76,0x45,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 45 22 <unknown>
+# CHECK-UNKNOWN: 22457657 <unknown>
 
 vsetvli a2, a0, 0xd0
 # CHECK-INST: vsetvli a2, a0, e32, m1, ta, ma
 # CHECK-ENCODING: [0x57,0x76,0x05,0x0d]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 0d <unknown>
+# CHECK-UNKNOWN: 0d057657 <unknown>
 
 vsetvli a2, a0, 0xd1
 # CHECK-INST: vsetvli a2, a0, e32, m2, ta, ma
 # CHECK-ENCODING: [0x57,0x76,0x15,0x0d]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 15 0d <unknown>
+# CHECK-UNKNOWN: 0d157657 <unknown>
 
 vsetvli a2, a0, 0x50
 # CHECK-INST: vsetvli a2, a0, e32, m1, ta, mu
 # CHECK-ENCODING: [0x57,0x76,0x05,0x05]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 05 <unknown>
+# CHECK-UNKNOWN: 05057657 <unknown>
 
 vsetvli a2, a0, 0x90
 # CHECK-INST: vsetvli a2, a0, e32, m1, tu, ma
 # CHECK-ENCODING: [0x57,0x76,0x05,0x09]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 09 <unknown>
+# CHECK-UNKNOWN: 09057657 <unknown>
 
 vsetvli a2, a0, 144
 # CHECK-INST: vsetvli a2, a0, e32, m1, tu, ma
 # CHECK-ENCODING: [0x57,0x76,0x05,0x09]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 09 <unknown>
+# CHECK-UNKNOWN: 09057657 <unknown>
 
 vsetvli a2, a0, e32, m1, ta, ma
 # CHECK-INST: vsetvli a2, a0, e32,  m1,  ta,  ma
 # CHECK-ENCODING: [0x57,0x76,0x05,0x0d]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 0d <unknown>
+# CHECK-UNKNOWN: 0d057657 <unknown>
 
 vsetvli a2, a0, e32, m2, ta, ma
 # CHECK-INST: vsetvli a2, a0, e32, m2, ta, ma
 # CHECK-ENCODING: [0x57,0x76,0x15,0x0d]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 15 0d <unknown>
+# CHECK-UNKNOWN: 0d157657 <unknown>
 
 vsetvli a2, a0, e32, m4, ta, ma
 # CHECK-INST: vsetvli a2, a0, e32, m4, ta, ma
 # CHECK-ENCODING: [0x57,0x76,0x25,0x0d]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 25 0d <unknown>
+# CHECK-UNKNOWN: 0d257657 <unknown>
 
 vsetvli a2, a0, e32, m8, ta, ma
 # CHECK-INST: vsetvli a2, a0, e32, m8, ta, ma
 # CHECK-ENCODING: [0x57,0x76,0x35,0x0d]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 35 0d <unknown>
+# CHECK-UNKNOWN: 0d357657 <unknown>
 
 vsetvli a2, a0, e32, mf2, ta, ma
 # CHECK-INST: vsetvli a2, a0, e32, mf2, ta, ma
 # CHECK-ENCODING: [0x57,0x76,0x75,0x0d]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 75 0d <unknown>
+# CHECK-UNKNOWN: 0d757657 <unknown>
 
 vsetvli a2, a0, e32, mf4, ta, ma
 # CHECK-INST: vsetvli a2, a0, e32, mf4, ta, ma
 # CHECK-ENCODING: [0x57,0x76,0x65,0x0d]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 65 0d <unknown>
+# CHECK-UNKNOWN: 0d657657 <unknown>
 
 vsetvli a2, a0, e32, mf8, ta, ma
 # CHECK-INST: vsetvli a2, a0, e32, mf8, ta, ma
 # CHECK-ENCODING: [0x57,0x76,0x55,0x0d]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 55 0d <unknown>
+# CHECK-UNKNOWN: 0d557657 <unknown>
 
 vsetvli a2, a0, e32, m1, ta, ma
 # CHECK-INST: vsetvli a2, a0, e32, m1, ta, ma
 # CHECK-ENCODING: [0x57,0x76,0x05,0x0d]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 0d <unknown>
+# CHECK-UNKNOWN: 0d057657 <unknown>
 
 vsetvli a2, a0, e32, m1, tu, ma
 # CHECK-INST: vsetvli a2, a0, e32, m1, tu, ma
 # CHECK-ENCODING: [0x57,0x76,0x05,0x09]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 09 <unknown>
+# CHECK-UNKNOWN: 09057657 <unknown>
 
 vsetvli a2, a0, e32, m1, ta, mu
 # CHECK-INST: vsetvli a2, a0, e32, m1, ta, mu
 # CHECK-ENCODING: [0x57,0x76,0x05,0x05]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 05 <unknown>
+# CHECK-UNKNOWN: 05057657 <unknown>
 
 vsetvli a2, a0, e32, m1, tu, mu
 # CHECK-INST: vsetvli a2, a0, e32, m1
 # CHECK-ENCODING: [0x57,0x76,0x05,0x01]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 01 <unknown>
+# CHECK-UNKNOWN: 01057657 <unknown>
 
 vsetvl a2, a0, a1
 # CHECK-INST: vsetvl a2, a0, a1
 # CHECK-ENCODING: [0x57,0x76,0xb5,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 b5 80 <unknown>
+# CHECK-UNKNOWN: 80b57657 <unknown>
 
 # reserved filed: vlmul[2:0]=4, vsew[2:0]=0b1xx, non-zero bits 8/9/10.
 vsetivli a2, 0, 0x224
 # CHECK-INST: vsetivli a2, 0, 548
 # CHECK-ENCODING: [0x57,0x76,0x40,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 40 e2 <unknown>
+# CHECK-UNKNOWN: e2407657 <unknown>
 
 vsetivli a2, 0, 0xd0
 # CHECK-INST: vsetivli a2, 0, e32, m1, ta, ma
 # CHECK-ENCODING: [0x57,0x76,0x00,0xcd]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 00 cd <unknown>
+# CHECK-UNKNOWN: cd007657 <unknown>
 
 vsetivli a2, 15, 0xd0
 # CHECK-INST: vsetivli a2, 15, e32, m1, ta, ma
 # CHECK-ENCODING: [0x57,0xf6,0x07,0xcd]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 f6 07 cd <unknown>
+# CHECK-UNKNOWN: cd07f657 <unknown>
 
 vsetivli a2, 15, 208
 # CHECK-INST: vsetivli a2, 15, e32, m1, ta, ma
 # CHECK-ENCODING: [0x57,0xf6,0x07,0xcd]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 f6 07 cd <unknown>
+# CHECK-UNKNOWN: cd07f657 <unknown>
 
 vsetivli a2, 0, e32, m1, ta, ma
 # CHECK-INST: vsetivli a2, 0, e32, m1, ta, ma
 # CHECK-ENCODING: [0x57,0x76,0x00,0xcd]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 00 cd <unknown>
+# CHECK-UNKNOWN: cd007657 <unknown>
 
 vsetivli a2, 15, e32, m1, ta, ma
 # CHECK-INST: vsetivli a2, 15, e32, m1, ta, ma
 # CHECK-ENCODING: [0x57,0xf6,0x07,0xcd]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 f6 07 cd <unknown>
+# CHECK-UNKNOWN: cd07f657 <unknown>
 
 vsetivli a2, 31, e32, m1, ta, ma
 # CHECK-INST: vsetivli a2, 31, e32, m1, ta, ma
 # CHECK-ENCODING: [0x57,0xf6,0x0f,0xcd]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 f6 0f cd <unknown>
+# CHECK-UNKNOWN: cd0ff657 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/xor.s b/llvm/test/MC/RISCV/rvv/xor.s
index 5ea0f694e0d6..572388ed2267 100644
--- a/llvm/test/MC/RISCV/rvv/xor.s
+++ b/llvm/test/MC/RISCV/rvv/xor.s
@@ -12,46 +12,46 @@ vxor.vv v8, v4, v20, v0.t
 # CHECK-INST: vxor.vv v8, v4, v20, v0.t
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 2c <unknown>
+# CHECK-UNKNOWN: 2c4a0457 <unknown>
 
 vxor.vv v8, v4, v20
 # CHECK-INST: vxor.vv v8, v4, v20
 # CHECK-ENCODING: [0x57,0x04,0x4a,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 2e <unknown>
+# CHECK-UNKNOWN: 2e4a0457 <unknown>
 
 vxor.vx v8, v4, a0, v0.t
 # CHECK-INST: vxor.vx v8, v4, a0, v0.t
 # CHECK-ENCODING: [0x57,0x44,0x45,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 2c <unknown>
+# CHECK-UNKNOWN: 2c454457 <unknown>
 
 vxor.vx v8, v4, a0
 # CHECK-INST: vxor.vx v8, v4, a0
 # CHECK-ENCODING: [0x57,0x44,0x45,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 2e <unknown>
+# CHECK-UNKNOWN: 2e454457 <unknown>
 
 vxor.vi v8, v4, 15, v0.t
 # CHECK-INST: vxor.vi v8, v4, 15, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 2c <unknown>
+# CHECK-UNKNOWN: 2c47b457 <unknown>
 
 vxor.vi v8, v4, 15
 # CHECK-INST: vxor.vi v8, v4, 15
 # CHECK-ENCODING: [0x57,0xb4,0x47,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 2e <unknown>
+# CHECK-UNKNOWN: 2e47b457 <unknown>
 
 vnot.v v8, v4, v0.t
 # CHECK-INST: vnot.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 2c <unknown>
+# CHECK-UNKNOWN: 2c4fb457 <unknown>
 
 vnot.v v8, v4
 # CHECK-INST: vnot.v v8, v4
 # CHECK-ENCODING: [0x57,0xb4,0x4f,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 2e <unknown>
+# CHECK-UNKNOWN: 2e4fb457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/xsfvcp.s b/llvm/test/MC/RISCV/rvv/xsfvcp.s
index a137311f575a..4298bf7a7b7a 100644
--- a/llvm/test/MC/RISCV/rvv/xsfvcp.s
+++ b/llvm/test/MC/RISCV/rvv/xsfvcp.s
@@ -21,166 +21,166 @@ sf.vc.x 0x3, 0xf, 0x1f, a1
 # CHECK-INST: sf.vc.x 3, 15, 31, a1
 # CHECK-ENCODING: [0xdb,0xcf,0xf5,0x0e]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: db cf f5 0e <unknown>
+# CHECK-UNKNOWN: 0ef5cfdb <unknown>
 
 sf.vc.i 0x3, 0xf, 0x1f, 15
 # CHECK-INST: sf.vc.i 3, 15, 31, 15
 # CHECK-ENCODING: [0xdb,0xbf,0xf7,0x0e]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: db bf f7 0e <unknown>
+# CHECK-UNKNOWN: 0ef7bfdb <unknown>
 
 sf.vc.vv 0x3, 0x1f, v2, v1
 # CHECK-INST: sf.vc.vv 3, 31, v2, v1
 # CHECK-ENCODING: [0xdb,0x8f,0x20,0x2e]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: db 8f 20 2e <unknown>
+# CHECK-UNKNOWN: 2e208fdb <unknown>
 
 sf.vc.xv 0x3, 0x1f, v2, a1
 # CHECK-INST: sf.vc.xv 3, 31, v2, a1
 # CHECK-ENCODING: [0xdb,0xcf,0x25,0x2e]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: db cf 25 2e <unknown>
+# CHECK-UNKNOWN: 2e25cfdb <unknown>
 
 sf.vc.iv 0x3, 0x1f, v2, 15
 # CHECK-INST: sf.vc.iv 3, 31, v2, 15
 # CHECK-ENCODING: [0xdb,0xbf,0x27,0x2e]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: db bf 27 2e <unknown>
+# CHECK-UNKNOWN: 2e27bfdb <unknown>
 
 sf.vc.fv 0x1, 0x1f, v2, fa1
 # CHECK-INST: sf.vc.fv 1, 31, v2, fa1
 # CHECK-ENCODING: [0xdb,0xdf,0x25,0x2e]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: db df 25 2e <unknown>
+# CHECK-UNKNOWN: 2e25dfdb <unknown>
 
 sf.vc.vvv 0x3, v0, v2, v1
 # CHECK-INST: sf.vc.vvv 3, v0, v2, v1
 # CHECK-ENCODING: [0x5b,0x80,0x20,0xae]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b 80 20 ae <unknown>
+# CHECK-UNKNOWN: ae20805b <unknown>
 
 sf.vc.xvv 0x3, v0, v2, a1
 # CHECK-INST: sf.vc.xvv 3, v0, v2, a1
 # CHECK-ENCODING: [0x5b,0xc0,0x25,0xae]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b c0 25 ae <unknown>
+# CHECK-UNKNOWN: ae25c05b <unknown>
 
 sf.vc.ivv 0x3, v0, v2, 15
 # CHECK-INST: sf.vc.ivv 3, v0, v2, 15
 # CHECK-ENCODING: [0x5b,0xb0,0x27,0xae]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b b0 27 ae <unknown>
+# CHECK-UNKNOWN: ae27b05b <unknown>
 
 sf.vc.fvv 0x1, v0, v2, fa1
 # CHECK-INST: sf.vc.fvv 1, v0, v2, fa1
 # CHECK-ENCODING: [0x5b,0xd0,0x25,0xae]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b d0 25 ae <unknown>
+# CHECK-UNKNOWN: ae25d05b <unknown>
 
 sf.vc.vvw 0x3, v0, v2, v1
 # CHECK-INST: sf.vc.vvw 3, v0, v2, v1
 # CHECK-ENCODING: [0x5b,0x80,0x20,0xfe]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b 80 20 fe <unknown>
+# CHECK-UNKNOWN: fe20805b <unknown>
 
 sf.vc.xvw 0x3, v0, v2, a1
 # CHECK-INST: sf.vc.xvw 3, v0, v2, a1
 # CHECK-ENCODING: [0x5b,0xc0,0x25,0xfe]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b c0 25 fe <unknown>
+# CHECK-UNKNOWN: fe25c05b <unknown>
 
 sf.vc.ivw 0x3, v0, v2, 15
 # CHECK-INST: sf.vc.ivw 3, v0, v2, 15
 # CHECK-ENCODING: [0x5b,0xb0,0x27,0xfe]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b b0 27 fe <unknown>
+# CHECK-UNKNOWN: fe27b05b <unknown>
 
 sf.vc.fvw 0x1, v0, v2, fa1
 # CHECK-INST: sf.vc.fvw 1, v0, v2, fa1
 # CHECK-ENCODING: [0x5b,0xd0,0x25,0xfe]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b d0 25 fe <unknown>
+# CHECK-UNKNOWN: fe25d05b <unknown>
 
 sf.vc.v.x 0x3, 0xf, v0, a1
 # CHECK-INST: sf.vc.v.x 3, 15, v0, a1
 # CHECK-ENCODING: [0x5b,0xc0,0xf5,0x0c]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b c0 f5 0c <unknown>
+# CHECK-UNKNOWN: 0cf5c05b <unknown>
 
 sf.vc.v.i 0x3, 0xf, v0, 15
 # CHECK-INST: sf.vc.v.i 3, 15, v0, 15
 # CHECK-ENCODING: [0x5b,0xb0,0xf7,0x0c]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b b0 f7 0c <unknown>
+# CHECK-UNKNOWN: 0cf7b05b <unknown>
 
 sf.vc.v.vv 0x3, v0, v2, v1
 # CHECK-INST: sf.vc.v.vv 3, v0, v2, v1
 # CHECK-ENCODING: [0x5b,0x80,0x20,0x2c]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b 80 20 2c <unknown>
+# CHECK-UNKNOWN: 2c20805b <unknown>
 
 sf.vc.v.xv 0x3, v0, v2, a1
 # CHECK-INST: sf.vc.v.xv 3, v0, v2, a1
 # CHECK-ENCODING: [0x5b,0xc0,0x25,0x2c]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b c0 25 2c <unknown>
+# CHECK-UNKNOWN: 2c25c05b <unknown>
 
 sf.vc.v.iv 0x3, v0, v2, 15
 # CHECK-INST: sf.vc.v.iv 3, v0, v2, 15
 # CHECK-ENCODING: [0x5b,0xb0,0x27,0x2c]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b b0 27 2c <unknown>
+# CHECK-UNKNOWN: 2c27b05b <unknown>
 
 sf.vc.v.fv 0x1, v0, v2, fa1
 # CHECK-INST: sf.vc.v.fv 1, v0, v2, fa1
 # CHECK-ENCODING: [0x5b,0xd0,0x25,0x2c]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b d0 25 2c <unknown>
+# CHECK-UNKNOWN: 2c25d05b <unknown>
 
 sf.vc.v.vvv 0x3, v0, v2, v1
 # CHECK-INST: sf.vc.v.vvv 3, v0, v2, v1
 # CHECK-ENCODING: [0x5b,0x80,0x20,0xac]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b 80 20 ac <unknown>
+# CHECK-UNKNOWN: ac20805b <unknown>
 
 sf.vc.v.xvv 0x3, v0, v2, a1
 # CHECK-INST: sf.vc.v.xvv 3, v0, v2, a1
 # CHECK-ENCODING: [0x5b,0xc0,0x25,0xac]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b c0 25 ac <unknown>
+# CHECK-UNKNOWN: ac25c05b <unknown>
 
 sf.vc.v.ivv 0x3, v0, v2, 15
 # CHECK-INST: sf.vc.v.ivv 3, v0, v2, 15
 # CHECK-ENCODING: [0x5b,0xb0,0x27,0xac]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b b0 27 ac <unknown>
+# CHECK-UNKNOWN: ac27b05b <unknown>
 
 sf.vc.v.fvv 0x1, v0, v2, fa1
 # CHECK-INST: sf.vc.v.fvv 1, v0, v2, fa1
 # CHECK-ENCODING: [0x5b,0xd0,0x25,0xac]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b d0 25 ac <unknown>
+# CHECK-UNKNOWN: ac25d05b <unknown>
 
 sf.vc.v.vvw 0x3, v0, v2, v1
 # CHECK-INST: sf.vc.v.vvw 3, v0, v2, v1
 # CHECK-ENCODING: [0x5b,0x80,0x20,0xfc]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b 80 20 fc <unknown>
+# CHECK-UNKNOWN: fc20805b <unknown>
 
 sf.vc.v.xvw 0x3, v0, v2, a1
 # CHECK-INST: sf.vc.v.xvw 3, v0, v2, a1
 # CHECK-ENCODING: [0x5b,0xc0,0x25,0xfc]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b c0 25 fc <unknown>
+# CHECK-UNKNOWN: fc25c05b <unknown>
 
 sf.vc.v.ivw 0x3, v0, v2, 15
 # CHECK-INST: sf.vc.v.ivw 3, v0, v2, 15
 # CHECK-ENCODING: [0x5b,0xb0,0x27,0xfc]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b b0 27 fc <unknown>
+# CHECK-UNKNOWN: fc27b05b <unknown>
 
 sf.vc.v.fvw 0x1, v0, v2, fa1
 # CHECK-INST: sf.vc.v.fvw 1, v0, v2, fa1
 # CHECK-ENCODING: [0x5b,0xd0,0x25,0xfc]
 # CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b d0 25 fc <unknown>
+# CHECK-UNKNOWN: fc25d05b <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/xsfvfnrclip.s b/llvm/test/MC/RISCV/rvv/xsfvfnrclip.s
index d8b184659ac4..7508d44bc916 100644
--- a/llvm/test/MC/RISCV/rvv/xsfvfnrclip.s
+++ b/llvm/test/MC/RISCV/rvv/xsfvfnrclip.s
@@ -12,22 +12,22 @@ sf.vfnrclip.xu.f.qf v4, v8, fa2
 # CHECK-INST: sf.vfnrclip.xu.f.qf v4, v8, fa2
 # CHECK-ENCODING: [0x5b,0x52,0x86,0x8a]
 # CHECK-ERROR: instruction requires the following: 'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)
-# CHECK-UNKNOWN: 5b 52 86 8a <unknown>
+# CHECK-UNKNOWN: 8a86525b <unknown>
 
 sf.vfnrclip.xu.f.qf v4, v8, fa2, v0.t
 # CHECK-INST: sf.vfnrclip.xu.f.qf v4, v8, fa2
 # CHECK-ENCODING: [0x5b,0x52,0x86,0x88]
 # CHECK-ERROR: instruction requires the following: 'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)
-# CHECK-UNKNOWN: 5b 52 86 88 <unknown>
+# CHECK-UNKNOWN: 8886525b <unknown>
 
 sf.vfnrclip.x.f.qf v4, v8, fa2
 # CHECK-INST: sf.vfnrclip.x.f.qf v4, v8, fa2
 # CHECK-ENCODING: [0x5b,0x52,0x86,0x8e]
 # CHECK-ERROR: instruction requires the following: 'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)
-# CHECK-UNKNOWN: 5b 52 86 8e <unknown>
+# CHECK-UNKNOWN: 8e86525b <unknown>
 
 sf.vfnrclip.x.f.qf v4, v8, fa2, v0.t
 # CHECK-INST: sf.vfnrclip.x.f.qf v4, v8, fa2
 # CHECK-ENCODING: [0x5b,0x52,0x86,0x8c]
 # CHECK-ERROR: instruction requires the following: 'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)
-# CHECK-UNKNOWN: 5b 52 86 8c <unknown>
+# CHECK-UNKNOWN: 8c86525b <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/xsfvfwmacc.s b/llvm/test/MC/RISCV/rvv/xsfvfwmacc.s
index ba054fff2bd8..a9843c350fc8 100644
--- a/llvm/test/MC/RISCV/rvv/xsfvfwmacc.s
+++ b/llvm/test/MC/RISCV/rvv/xsfvfwmacc.s
@@ -12,4 +12,4 @@ sf.vfwmacc.4x4x4 v8, v4, v20
 # CHECK-INST: sf.vfwmacc.4x4x4 v8, v4, v20
 # CHECK-ENCODING: [0x5b,0x14,0x42,0xf3]
 # CHECK-ERROR: instruction requires the following: 'XSfvfwmaccqqq' (SiFive Matrix Multiply Accumulate Instruction and 4-by-4))
-# CHECK-UNKNOWN: 5b 14 42 f3 <unknown>
+# CHECK-UNKNOWN: f342145b <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/xsfvqmacc.s b/llvm/test/MC/RISCV/rvv/xsfvqmacc.s
index ba19f2184486..81703c847d74 100644
--- a/llvm/test/MC/RISCV/rvv/xsfvqmacc.s
+++ b/llvm/test/MC/RISCV/rvv/xsfvqmacc.s
@@ -12,46 +12,46 @@ sf.vqmaccu.2x8x2 v8, v4, v20
 # CHECK-INST: sf.vqmaccu.2x8x2 v8, v4, v20
 # CHECK-ENCODING: [0x5b,0x24,0x42,0xb3]
 # CHECK-ERROR: instruction requires the following: 'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2))
-# CHECK-UNKNOWN: 5b 24 42 b3 <unknown>
+# CHECK-UNKNOWN: b342245b <unknown>
 
 sf.vqmacc.2x8x2 v8, v4, v20
 # CHECK-INST: sf.vqmacc.2x8x2 v8, v4, v20
 # CHECK-ENCODING: [0x5b,0x24,0x42,0xb7]
 # CHECK-ERROR: instruction requires the following: 'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2))
-# CHECK-UNKNOWN: 5b 24 42 b7 <unknown>
+# CHECK-UNKNOWN: b742245b <unknown>
 
 sf.vqmaccus.2x8x2 v8, v4, v20
 # CHECK-INST: sf.vqmaccus.2x8x2 v8, v4, v20
 # CHECK-ENCODING: [0x5b,0x24,0x42,0xbb]
 # CHECK-ERROR: instruction requires the following: 'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2))
-# CHECK-UNKNOWN: 5b 24 42 bb <unknown>
+# CHECK-UNKNOWN: bb42245b <unknown>
 
 sf.vqmaccsu.2x8x2 v8, v4, v20
 # CHECK-INST: sf.vqmaccsu.2x8x2 v8, v4, v20
 # CHECK-ENCODING: [0x5b,0x24,0x42,0xbf]
 # CHECK-ERROR: instruction requires the following: 'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2))
-# CHECK-UNKNOWN: 5b 24 42 bf <unknown>
+# CHECK-UNKNOWN: bf42245b <unknown>
 
 sf.vqmaccu.4x8x4 v8, v4, v20
 # CHECK-INST: sf.vqmaccu.4x8x4 v8, v4, v20
 # CHECK-ENCODING: [0x5b,0x24,0x42,0xf3]
 # CHECK-ERROR: instruction requires the following: 'XSfvqmaccqoq' (SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4))
-# CHECK-UNKNOWN: 5b 24 42 f3 <unknown>
+# CHECK-UNKNOWN: f342245b <unknown>
 
 sf.vqmacc.4x8x4 v8, v4, v20
 # CHECK-INST: sf.vqmacc.4x8x4 v8, v4, v20
 # CHECK-ENCODING: [0x5b,0x24,0x42,0xf7]
 # CHECK-ERROR: instruction requires the following: 'XSfvqmaccqoq' (SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4))
-# CHECK-UNKNOWN: 5b 24 42 f7 <unknown>
+# CHECK-UNKNOWN: f742245b <unknown>
 
 sf.vqmaccus.4x8x4 v8, v4, v20
 # CHECK-INST: sf.vqmaccus.4x8x4 v8, v4, v20
 # CHECK-ENCODING: [0x5b,0x24,0x42,0xfb]
 # CHECK-ERROR: instruction requires the following: 'XSfvqmaccqoq' (SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4))
-# CHECK-UNKNOWN: 5b 24 42 fb <unknown>
+# CHECK-UNKNOWN: fb42245b <unknown>
 
 sf.vqmaccsu.4x8x4 v8, v4, v20
 # CHECK-INST: sf.vqmaccsu.4x8x4 v8, v4, v20
 # CHECK-ENCODING: [0x5b,0x24,0x42,0xff]
 # CHECK-ERROR: instruction requires the following: 'XSfvqmaccqoq' (SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4))
-# CHECK-UNKNOWN: 5b 24 42 ff <unknown>
+# CHECK-UNKNOWN: ff42245b <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvbb.s b/llvm/test/MC/RISCV/rvv/zvbb.s
index 04e5ad6e03f7..d9d1f6f42d32 100644
--- a/llvm/test/MC/RISCV/rvv/zvbb.s
+++ b/llvm/test/MC/RISCV/rvv/zvbb.s
@@ -12,40 +12,40 @@ vbrev.v v10, v9, v0.t
 # CHECK-INST: vbrev.v v10, v9, v0.t
 # CHECK-ENCODING: [0x57,0x25,0x95,0x48]
 # CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 25 95 48   <unknown>
+# CHECK-UNKNOWN: 48952557 <unknown>
 
 vclz.v v10, v9, v0.t
 # CHECK-INST: vclz.v v10, v9, v0.t
 # CHECK-ENCODING: [0x57,0x25,0x96,0x48]
 # CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 25 96 48   <unknown>
+# CHECK-UNKNOWN: 48962557 <unknown>
 
 vcpop.v v10, v9, v0.t
 # CHECK-INST: vcpop.v v10, v9, v0.t
 # CHECK-ENCODING: [0x57,0x25,0x97,0x48]
 # CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 25 97 48   <unknown>
+# CHECK-UNKNOWN: 48972557 <unknown>
 
 vctz.v v10, v9, v0.t
 # CHECK-INST: vctz.v v10, v9, v0.t
 # CHECK-ENCODING: [0x57,0xa5,0x96,0x48]
 # CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 a5 96 48   <unknown>
+# CHECK-UNKNOWN: 4896a557 <unknown>
 
 vwsll.vv v10, v9, v8, v0.t
 # CHECK-INST: vwsll.vv v10, v9, v8, v0.t
 # CHECK-ENCODING: [0x57,0x05,0x94,0xd4]
 # CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 05 94 d4   <unknown>
+# CHECK-UNKNOWN: d4940557 <unknown>
 
 vwsll.vx v10, v9, a0, v0.t
 # CHECK-INST: vwsll.vx v10, v9, a0, v0.t
 # CHECK-ENCODING: [0x57,0x45,0x95,0xd4]
 # CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 45 95 d4   <unknown>
+# CHECK-UNKNOWN: d4954557 <unknown>
 
 vwsll.vi v10, v9, 29, v0.t
 # CHECK-INST: vwsll.vi v10, v9, 29, v0.t
 # CHECK-ENCODING: [0x57,0xb5,0x9e,0xd4]
 # CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 b5 9e d4   <unknown>
+# CHECK-UNKNOWN: d49eb557 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvbc.s b/llvm/test/MC/RISCV/rvv/zvbc.s
index b32349a2db13..0eb02d153b79 100644
--- a/llvm/test/MC/RISCV/rvv/zvbc.s
+++ b/llvm/test/MC/RISCV/rvv/zvbc.s
@@ -12,22 +12,22 @@ vclmul.vv v10, v9, v8
 # CHECK-INST: vclmul.vv v10, v9, v8
 # CHECK-ENCODING: [0x57,0x25,0x94,0x32]
 # CHECK-ERROR: instruction requires the following: 'Zvbc' (Vector Carryless Multiplication){{$}}
-# CHECK-UNKNOWN: 57 25 94 32   <unknown>
+# CHECK-UNKNOWN: 32942557 <unknown>
 
 vclmul.vx v10, v9, a0
 # CHECK-INST: vclmul.vx v10, v9, a0
 # CHECK-ENCODING: [0x57,0x65,0x95,0x32]
 # CHECK-ERROR: instruction requires the following: 'Zvbc' (Vector Carryless Multiplication){{$}}
-# CHECK-UNKNOWN: 57 65 95 32   <unknown>
+# CHECK-UNKNOWN: 32956557 <unknown>
 
 vclmulh.vv v10, v9, v8
 # CHECK-INST: vclmulh.vv v10, v9, v8
 # CHECK-ENCODING: [0x57,0x25,0x94,0x36]
 # CHECK-ERROR: instruction requires the following: 'Zvbc' (Vector Carryless Multiplication){{$}}
-# CHECK-UNKNOWN: 57 25 94 36   <unknown>
+# CHECK-UNKNOWN: 36942557 <unknown>
 
 vclmulh.vx v10, v9, a0
 # CHECK-INST: vclmulh.vx v10, v9, a0
 # CHECK-ENCODING: [0x57,0x65,0x95,0x36]
 # CHECK-ERROR: instruction requires the following: 'Zvbc' (Vector Carryless Multiplication){{$}}
-# CHECK-UNKNOWN: 57 65 95 36   <unknown>
+# CHECK-UNKNOWN: 36956557 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvfbfmin.s b/llvm/test/MC/RISCV/rvv/zvfbfmin.s
index 1cbe027ef26c..7965c2482b00 100644
--- a/llvm/test/MC/RISCV/rvv/zvfbfmin.s
+++ b/llvm/test/MC/RISCV/rvv/zvfbfmin.s
@@ -20,23 +20,23 @@
 # CHECK-INST: vfncvtbf16.f.f.w v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x4e,0x48]
 # CHECK-ERROR: instruction requires the following: 'Zvfbfmin' (Vector BF16 Converts){{$}}
-# CHECK-UNKNOWN: 57 94 4e 48 <unknown>
+# CHECK-UNKNOWN: 484e9457 <unknown>
 vfncvtbf16.f.f.w v8, v4, v0.t
 
 # CHECK-INST: vfncvtbf16.f.f.w v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x4e,0x4a]
 # CHECK-ERROR: instruction requires the following: 'Zvfbfmin' (Vector BF16 Converts){{$}}
-# CHECK-UNKNOWN: 57 94 4e 4a <unknown>
+# CHECK-UNKNOWN: 4a4e9457 <unknown>
 vfncvtbf16.f.f.w v8, v4
 
 # CHECK-INST: vfwcvtbf16.f.f.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x46,0x48]
 # CHECK-ERROR: instruction requires the following: 'Zvfbfmin' (Vector BF16 Converts){{$}}
-# CHECK-UNKNOWN: 57 94 46 48 <unknown>
+# CHECK-UNKNOWN: 48469457 <unknown>
 vfwcvtbf16.f.f.v v8, v4, v0.t
 
 # CHECK-INST: vfwcvtbf16.f.f.v v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x46,0x4a]
 # CHECK-ERROR: instruction requires the following: 'Zvfbfmin' (Vector BF16 Converts){{$}}
-# CHECK-UNKNOWN: 57 94 46 4a <unknown>
+# CHECK-UNKNOWN: 4a469457 <unknown>
 vfwcvtbf16.f.f.v v8, v4
diff --git a/llvm/test/MC/RISCV/rvv/zvfbfwma.s b/llvm/test/MC/RISCV/rvv/zvfbfwma.s
index 5a30d9f19ab6..330dee58d836 100644
--- a/llvm/test/MC/RISCV/rvv/zvfbfwma.s
+++ b/llvm/test/MC/RISCV/rvv/zvfbfwma.s
@@ -20,25 +20,25 @@
 # CHECK-INST: vfwmaccbf16.vv v8, v20, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xec]
 # CHECK-ERROR: instruction requires the following: 'Zvfbfwma' (Vector BF16 widening mul-add){{$}}
-# CHECK-UNKNOWN: 57 14 4a ec <unknown>
+# CHECK-UNKNOWN: ec4a1457 <unknown>
 vfwmaccbf16.vv v8, v20, v4, v0.t
 
 # CHECK-INST: vfwmaccbf16.vv v8, v20, v4
 # CHECK-ENCODING: [0x57,0x14,0x4a,0xee]
 # CHECK-ERROR: instruction requires the following: 'Zvfbfwma' (Vector BF16 widening mul-add){{$}}
-# CHECK-UNKNOWN: 57 14 4a ee <unknown>
+# CHECK-UNKNOWN: ee4a1457 <unknown>
 vfwmaccbf16.vv v8, v20, v4
 
 # CHECK-INST: vfwmaccbf16.vf v8, fa0, v4, v0.t
 # CHECK-ENCODING: [0x57,0x54,0x45,0xec]
 # CHECK-ERROR: instruction requires the following: 'Zvfbfwma' (Vector BF16 widening mul-add){{$}}
-# CHECK-UNKNOWN: 57 54 45 ec <unknown>
+# CHECK-UNKNOWN: ec455457 <unknown>
 vfwmaccbf16.vf v8, fa0, v4, v0.t
 
 # CHECK-INST: vfwmaccbf16.vf v8, fa0, v4
 # CHECK-ENCODING: [0x57,0x54,0x45,0xee]
 # CHECK-ERROR: instruction requires the following: 'Zvfbfwma' (Vector BF16 widening mul-add){{$}}
-# CHECK-UNKNOWN: 57 54 45 ee <unknown>
+# CHECK-UNKNOWN: ee455457 <unknown>
 vfwmaccbf16.vf v8, fa0, v4
 
 # Check scalar half FP load/store/move included in this extension.
@@ -46,23 +46,23 @@ vfwmaccbf16.vf v8, fa0, v4
 # CHECK-INST: flh ft0, 12(a0)
 # CHECK-ENCODING: [0x07,0x10,0xc5,0x00]
 # CHECK-ERROR: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) or 'Zfhmin' (Half-Precision Floating-Point Minimal) or 'Zfbfmin' (Scalar BF16 Converts){{$}}
-# CHECK-UNKNOWN: 07 10 c5 00 <unknown>
+# CHECK-UNKNOWN: 00c51007 <unknown>
 flh f0, 12(a0)
 
 # CHECK-INST: fsh ft6, 2047(s4)
 # CHECK-ENCODING: [0xa7,0x1f,0x6a,0x7e]
 # CHECK-ERROR: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) or 'Zfhmin' (Half-Precision Floating-Point Minimal) or 'Zfbfmin' (Scalar BF16 Converts){{$}}
-# CHECK-UNKNOWN: a7 1f 6a 7e <unknown>
+# CHECK-UNKNOWN: 7e6a1fa7 <unknown>
 fsh f6, 2047(s4)
 
 # CHECK-INST: fmv.x.h a2, fs7
 # CHECK-ENCODING: [0x53,0x86,0x0b,0xe4]
 # CHECK-ERROR: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) or 'Zfhmin' (Half-Precision Floating-Point Minimal) or 'Zfbfmin' (Scalar BF16 Converts){{$}}
-# CHECK-UNKNOWN: 53 86 0b e4 <unknown>
+# CHECK-UNKNOWN: e40b8653 <unknown>
 fmv.x.h a2, fs7
 
 # CHECK-INST: fmv.h.x ft1, a6
 # CHECK-ENCODING: [0xd3,0x00,0x08,0xf4]
 # CHECK-ERROR: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) or 'Zfhmin' (Half-Precision Floating-Point Minimal) or 'Zfbfmin' (Scalar BF16 Converts){{$}}
-# CHECK-UNKNOWN: d3 00 08 f4 <unknown>
+# CHECK-UNKNOWN: f40800d3 <unknown>
 fmv.h.x ft1, a6
diff --git a/llvm/test/MC/RISCV/rvv/zvkb.s b/llvm/test/MC/RISCV/rvv/zvkb.s
index ae2dec18d33c..1833ba860c90 100644
--- a/llvm/test/MC/RISCV/rvv/zvkb.s
+++ b/llvm/test/MC/RISCV/rvv/zvkb.s
@@ -12,52 +12,52 @@ vandn.vv v10, v9, v8, v0.t
 # CHECK-INST: vandn.vv v10, v9, v8, v0.t
 # CHECK-ENCODING: [0x57,0x05,0x94,0x04]
 # CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 05 94 04   <unknown>
+# CHECK-UNKNOWN: 04940557 <unknown>
 
 vandn.vx v10, v9, a0, v0.t
 # CHECK-INST: vandn.vx v10, v9, a0, v0.t
 # CHECK-ENCODING: [0x57,0x45,0x95,0x04]
 # CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 45 95 04   <unknown>
+# CHECK-UNKNOWN: 04954557 <unknown>
 
 vbrev8.v v10, v9, v0.t
 # CHECK-INST: vbrev8.v v10, v9, v0.t
 # CHECK-ENCODING: [0x57,0x25,0x94,0x48]
 # CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 25 94 48   <unknown>
+# CHECK-UNKNOWN: 48942557 <unknown>
 
 vrev8.v v10, v9, v0.t
 # CHECK-INST: vrev8.v v10, v9, v0.t
 # CHECK-ENCODING: [0x57,0xa5,0x94,0x48]
 # CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 a5 94 48   <unknown>
+# CHECK-UNKNOWN: 4894a557 <unknown>
 
 vrol.vv v10, v9, v8, v0.t
 # CHECK-INST: vrol.vv v10, v9, v8, v0.t
 # CHECK-ENCODING: [0x57,0x05,0x94,0x54]
 # CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 05 94 54   <unknown>
+# CHECK-UNKNOWN: 54940557 <unknown>
 
 vrol.vx v10, v9, a0, v0.t
 # CHECK-INST: vrol.vx v10, v9, a0, v0.t
 # CHECK-ENCODING: [0x57,0x45,0x95,0x54]
 # CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 45 95 54   <unknown>
+# CHECK-UNKNOWN: 54954557 <unknown>
 
 vror.vv v10, v9, v8, v0.t
 # CHECK-INST: vror.vv v10, v9, v8, v0.t
 # CHECK-ENCODING: [0x57,0x05,0x94,0x50]
 # CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 05 94 50   <unknown>
+# CHECK-UNKNOWN: 50940557 <unknown>
 
 vror.vx v10, v9, a0, v0.t
 # CHECK-INST: vror.vx v10, v9, a0, v0.t
 # CHECK-ENCODING: [0x57,0x45,0x95,0x50]
 # CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 45 95 50   <unknown>
+# CHECK-UNKNOWN: 50954557 <unknown>
 
 vror.vi v10, v9, 33, v0.t
 # CHECK-INST: vror.vi v10, v9, 33, v0.t
 # CHECK-ENCODING: [0x57,0xb5,0x90,0x54]
 # CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 b5 90 54   <unknown>
+# CHECK-UNKNOWN: 5490b557 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvkg.s b/llvm/test/MC/RISCV/rvv/zvkg.s
index f2016bc116b6..48b84659e0ae 100644
--- a/llvm/test/MC/RISCV/rvv/zvkg.s
+++ b/llvm/test/MC/RISCV/rvv/zvkg.s
@@ -12,10 +12,10 @@ vghsh.vv v10, v9, v8
 # CHECK-INST: vghsh.vv v10, v9, v8
 # CHECK-ENCODING: [0x77,0x25,0x94,0xb2]
 # CHECK-ERROR: instruction requires the following: 'Zvkg' (Vector GCM instructions for Cryptography){{$}}
-# CHECK-UNKNOWN: 77 25 94 b2   <unknown>
+# CHECK-UNKNOWN: b2942577 <unknown>
 
 vgmul.vv v10, v9
 # CHECK-INST: vgmul.vv v10, v9
 # CHECK-ENCODING: [0x77,0xa5,0x98,0xa2]
 # CHECK-ERROR: instruction requires the following: 'Zvkg' (Vector GCM instructions for Cryptography){{$}}
-# CHECK-UNKNOWN: 77 a5 98 a2   <unknown>
+# CHECK-UNKNOWN: a298a577 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvkned.s b/llvm/test/MC/RISCV/rvv/zvkned.s
index e51a9cc562f1..bee3d74ee88d 100644
--- a/llvm/test/MC/RISCV/rvv/zvkned.s
+++ b/llvm/test/MC/RISCV/rvv/zvkned.s
@@ -12,76 +12,76 @@ vaesdf.vv v10, v9
 # CHECK-INST: vaesdf.vv v10, v9
 # CHECK-ENCODING: [0x77,0xa5,0x90,0xa2]
 # CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 90 a2   <unknown>
+# CHECK-UNKNOWN: a290a577 <unknown>
 
 vaesdf.vs v10, v9
 # CHECK-INST: vaesdf.vs v10, v9
 # CHECK-ENCODING: [0x77,0xa5,0x90,0xa6]
 # CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 90 a6   <unknown>
+# CHECK-UNKNOWN: a690a577 <unknown>
 
 vaesef.vv v10, v9
 # CHECK-INST: vaesef.vv v10, v9
 # CHECK-ENCODING: [0x77,0xa5,0x91,0xa2]
 # CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 91 a2   <unknown>
+# CHECK-UNKNOWN: a291a577 <unknown>
                        
 vaesef.vs v10, v9
 # CHECK-INST: vaesef.vs v10, v9
 # CHECK-ENCODING: [0x77,0xa5,0x91,0xa6]
 # CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 91 a6   <unknown>
+# CHECK-UNKNOWN: a691a577 <unknown>
 
 vaesdm.vv v10, v9
 # CHECK-INST: vaesdm.vv v10, v9
 # CHECK-ENCODING: [0x77,0x25,0x90,0xa2]
 # CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 25 90 a2   <unknown>
+# CHECK-UNKNOWN: a2902577 <unknown>
                        
 vaesdm.vs v10, v9
 # CHECK-INST: vaesdm.vs v10, v9
 # CHECK-ENCODING: [0x77,0x25,0x90,0xa6]
 # CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 25 90 a6   <unknown>
+# CHECK-UNKNOWN: a6902577 <unknown>
 
 vaesem.vv v10, v9
 # CHECK-INST: vaesem.vv v10, v9
 # CHECK-ENCODING: [0x77,0x25,0x91,0xa2]
 # CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 25 91 a2   <unknown>
+# CHECK-UNKNOWN: a2912577 <unknown>
                        
 vaesem.vs v10, v9
 # CHECK-INST: vaesem.vs v10, v9
 # CHECK-ENCODING: [0x77,0x25,0x91,0xa6]
 # CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 25 91 a6   <unknown>
+# CHECK-UNKNOWN: a6912577 <unknown>
 
 vaeskf1.vi v10, v9, 1
 # CHECK-INST: vaeskf1.vi v10, v9, 1
 # CHECK-ENCODING: [0x77,0xa5,0x90,0x8a]
 # CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 90 8a   <unknown>
+# CHECK-UNKNOWN: 8a90a577 <unknown>
 
 vaeskf1.vi v10, v9, 31
 # CHECK-INST: vaeskf1.vi v10, v9, 31
 # CHECK-ENCODING: [0x77,0xa5,0x9f,0x8a]
 # CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 9f 8a   <unknown>
+# CHECK-UNKNOWN: 8a9fa577 <unknown>
 
 vaeskf2.vi v10, v9, 2
 # CHECK-INST: vaeskf2.vi v10, v9, 2
 # CHECK-ENCODING: [0x77,0x25,0x91,0xaa]
 # CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 25 91 aa   <unknown>
+# CHECK-UNKNOWN: aa912577 <unknown>
 
 vaeskf2.vi v10, v9, 31
 # CHECK-INST: vaeskf2.vi v10, v9, 31
 # CHECK-ENCODING: [0x77,0xa5,0x9f,0xaa]
 # CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 9f aa   <unknown>
+# CHECK-UNKNOWN: aa9fa577 <unknown>
 
 vaesz.vs v10, v9
 # CHECK-INST: vaesz.vs v10, v9
 # CHECK-ENCODING: [0x77,0xa5,0x93,0xa6]
 # CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 93 a6   <unknown>
+# CHECK-UNKNOWN: a693a577 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvknh.s b/llvm/test/MC/RISCV/rvv/zvknh.s
index aa8033a5f217..b16b9081f7e6 100644
--- a/llvm/test/MC/RISCV/rvv/zvknh.s
+++ b/llvm/test/MC/RISCV/rvv/zvknh.s
@@ -18,17 +18,17 @@
 vsha2ms.vv v10, v9, v8
 # CHECK-INST: vsha2ms.vv v10, v9, v8
 # CHECK-ENCODING: [0x77,0x25,0x94,0xb6]
-# CHECK-UNKNOWN: 77 25 94 b6   <unknown>
+# CHECK-UNKNOWN: b6942577 <unknown>
 # CHECK-ERROR: instruction requires the following: 'Zvknha' or 'Zvknhb' (Vector SHA-2){{$}}
 
 vsha2ch.vv v10, v9, v8
 # CHECK-INST: vsha2ch.vv v10, v9, v8
 # CHECK-ENCODING: [0x77,0x25,0x94,0xba]
-# CHECK-UNKNOWN: 77 25 94 ba   <unknown>
+# CHECK-UNKNOWN: ba942577 <unknown>
 # CHECK-ERROR: instruction requires the following: 'Zvknha' or 'Zvknhb' (Vector SHA-2){{$}}
 
 vsha2cl.vv v10, v9, v8
 # CHECK-INST: vsha2cl.vv v10, v9, v8
 # CHECK-ENCODING: [0x77,0x25,0x94,0xbe]
-# CHECK-UNKNOWN: 77 25 94 be   <unknown>
+# CHECK-UNKNOWN: be942577 <unknown>
 # CHECK-ERROR: instruction requires the following: 'Zvknha' or 'Zvknhb' (Vector SHA-2){{$}}
diff --git a/llvm/test/MC/RISCV/rvv/zvksed.s b/llvm/test/MC/RISCV/rvv/zvksed.s
index 87c9713f8c65..f7a0949272ff 100644
--- a/llvm/test/MC/RISCV/rvv/zvksed.s
+++ b/llvm/test/MC/RISCV/rvv/zvksed.s
@@ -12,22 +12,22 @@ vsm4k.vi v10, v9, 7
 # CHECK-INST: vsm4k.vi v10, v9, 7
 # CHECK-ENCODING: [0x77,0xa5,0x93,0x86]
 # CHECK-ERROR: instruction requires the following: 'Zvksed' (SM4 Block Cipher Instructions){{$}}
-# CHECK-UNKNOWN: 77 a5 93 86   <unknown>
+# CHECK-UNKNOWN: 8693a577 <unknown>
 
 vsm4k.vi v10, v9, 31
 # CHECK-INST: vsm4k.vi v10, v9, 31
 # CHECK-ENCODING: [0x77,0xa5,0x9f,0x86]
 # CHECK-ERROR: instruction requires the following: 'Zvksed' (SM4 Block Cipher Instructions){{$}}
-# CHECK-UNKNOWN: 77 a5 9f 86   <unknown>
+# CHECK-UNKNOWN: 869fa577 <unknown>
 
 vsm4r.vv v10, v9
 # CHECK-INST: vsm4r.vv v10, v9
 # CHECK-ENCODING: [0x77,0x25,0x98,0xa2]
 # CHECK-ERROR: instruction requires the following: 'Zvksed' (SM4 Block Cipher Instructions){{$}}
-# CHECK-UNKNOWN: 77 25 98 a2   <unknown>
+# CHECK-UNKNOWN: a2982577 <unknown>
 
 vsm4r.vs v10, v9
 # CHECK-INST: vsm4r.vs v10, v9
 # CHECK-ENCODING: [0x77,0x25,0x98,0xa6]
 # CHECK-ERROR: instruction requires the following: 'Zvksed' (SM4 Block Cipher Instructions){{$}}
-# CHECK-UNKNOWN: 77 25 98 a6   <unknown>
+# CHECK-UNKNOWN: a6982577 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvksh.s b/llvm/test/MC/RISCV/rvv/zvksh.s
index 06251ff6efe5..ef1c654b4605 100644
--- a/llvm/test/MC/RISCV/rvv/zvksh.s
+++ b/llvm/test/MC/RISCV/rvv/zvksh.s
@@ -12,17 +12,17 @@ vsm3c.vi v10, v9, 7
 # CHECK-INST: vsm3c.vi v10, v9, 7
 # CHECK-ENCODING: [0x77,0xa5,0x93,0xae]
 # CHECK-ERROR: instruction requires the following: 'Zvksh' (SM3 Hash Function Instructions){{$}}
-# CHECK-UNKNOWN: 77 a5 93 ae   <unknown>
+# CHECK-UNKNOWN: ae93a577 <unknown>
 
 vsm3me.vv v10, v9, v8
 # CHECK-INST: vsm3me.vv v10, v9, v8
 # CHECK-ENCODING: [0x77,0x25,0x94,0x82]
 # CHECK-ERROR: instruction requires the following: 'Zvksh' (SM3 Hash Function Instructions){{$}}
-# CHECK-UNKNOWN: 77 25 94 82   <unknown>
+# CHECK-UNKNOWN: 82942577 <unknown>
 
 # vs1 is allowed to overlap, but not vs2.
 vsm3me.vv v10, v9, v10
 # CHECK-INST: vsm3me.vv v10, v9, v10
 # CHECK-ENCODING: [0x77,0x25,0x95,0x82]
 # CHECK-ERROR: instruction requires the following: 'Zvksh' (SM3 Hash Function Instructions){{$}}
-# CHECK-UNKNOWN: 77 25 95 82   <unknown>
+# CHECK-UNKNOWN: 82952577 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvlsseg.s b/llvm/test/MC/RISCV/rvv/zvlsseg.s
index 9a83ea9f8721..65089e2261be 100644
--- a/llvm/test/MC/RISCV/rvv/zvlsseg.s
+++ b/llvm/test/MC/RISCV/rvv/zvlsseg.s
@@ -13,3076 +13,3076 @@ vlseg2e8.v v8, (a0), v0.t
 # CHECK-INST: vlseg2e8.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x20]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 20 <unknown>
+# CHECK-UNKNOWN: 20050407 <unknown>
 
 vlseg2e8.v v8, (a0)
 # CHECK-INST: vlseg2e8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 22 <unknown>
+# CHECK-UNKNOWN: 22050407 <unknown>
 
 vlseg2e16.v v8, (a0), v0.t
 # CHECK-INST: vlseg2e16.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0x20]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 20 <unknown>
+# CHECK-UNKNOWN: 20055407 <unknown>
 
 vlseg2e16.v v8, (a0)
 # CHECK-INST: vlseg2e16.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 22 <unknown>
+# CHECK-UNKNOWN: 22055407 <unknown>
 
 vlseg2e32.v v8, (a0), v0.t
 # CHECK-INST: vlseg2e32.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0x20]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 20 <unknown>
+# CHECK-UNKNOWN: 20056407 <unknown>
 
 vlseg2e32.v v8, (a0)
 # CHECK-INST: vlseg2e32.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 22 <unknown>
+# CHECK-UNKNOWN: 22056407 <unknown>
 
 vlseg2e64.v v8, (a0), v0.t
 # CHECK-INST: vlseg2e64.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0x20]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 20 <unknown>
+# CHECK-UNKNOWN: 20057407 <unknown>
 
 vlseg2e64.v v8, (a0)
 # CHECK-INST: vlseg2e64.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 22 <unknown>
+# CHECK-UNKNOWN: 22057407 <unknown>
 
 vlseg2e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg2e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x21]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 21 <unknown>
+# CHECK-UNKNOWN: 21050407 <unknown>
 
 vlseg2e8ff.v v8, (a0)
 # CHECK-INST: vlseg2e8ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0x23]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 23 <unknown>
+# CHECK-UNKNOWN: 23050407 <unknown>
 
 vlseg2e16ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg2e16ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0x21]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 21 <unknown>
+# CHECK-UNKNOWN: 21055407 <unknown>
 
 vlseg2e16ff.v v8, (a0)
 # CHECK-INST: vlseg2e16ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0x23]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 23 <unknown>
+# CHECK-UNKNOWN: 23055407 <unknown>
 
 vlseg2e32ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg2e32ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0x21]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 21 <unknown>
+# CHECK-UNKNOWN: 21056407 <unknown>
 
 vlseg2e32ff.v v8, (a0)
 # CHECK-INST: vlseg2e32ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0x23]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 23 <unknown>
+# CHECK-UNKNOWN: 23056407 <unknown>
 
 vlseg2e64ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg2e64ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0x21]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 21 <unknown>
+# CHECK-UNKNOWN: 21057407 <unknown>
 
 vlseg2e64ff.v v8, (a0)
 # CHECK-INST: vlseg2e64ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0x23]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 23 <unknown>
+# CHECK-UNKNOWN: 23057407 <unknown>
 
 vlsseg2e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg2e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 28 <unknown>
+# CHECK-UNKNOWN: 28b50407 <unknown>
 
 vlsseg2e8.v v8, (a0), a1
 # CHECK-INST: vlsseg2e8.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab50407 <unknown>
 
 vlsseg2e16.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg2e16.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x54,0xb5,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 28 <unknown>
+# CHECK-UNKNOWN: 28b55407 <unknown>
 
 vlsseg2e16.v v8, (a0), a1
 # CHECK-INST: vlsseg2e16.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x54,0xb5,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab55407 <unknown>
 
 vlsseg2e32.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg2e32.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x64,0xb5,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 28 <unknown>
+# CHECK-UNKNOWN: 28b56407 <unknown>
 
 vlsseg2e32.v v8, (a0), a1
 # CHECK-INST: vlsseg2e32.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x64,0xb5,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab56407 <unknown>
 
 vlsseg2e64.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg2e64.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x74,0xb5,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 28 <unknown>
+# CHECK-UNKNOWN: 28b57407 <unknown>
 
 vlsseg2e64.v v8, (a0), a1
 # CHECK-INST: vlsseg2e64.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x74,0xb5,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab57407 <unknown>
 
 vluxseg2ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg2ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x24]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 24 <unknown>
+# CHECK-UNKNOWN: 24450407 <unknown>
 
 vluxseg2ei8.v v8, (a0), v4
 # CHECK-INST: vluxseg2ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0x26]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 26 <unknown>
+# CHECK-UNKNOWN: 26450407 <unknown>
 
 vluxseg2ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg2ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0x24]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 24 <unknown>
+# CHECK-UNKNOWN: 24455407 <unknown>
 
 vluxseg2ei16.v v8, (a0), v4
 # CHECK-INST: vluxseg2ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0x26]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 26 <unknown>
+# CHECK-UNKNOWN: 26455407 <unknown>
 
 vluxseg2ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg2ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0x24]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 24 <unknown>
+# CHECK-UNKNOWN: 24456407 <unknown>
 
 vluxseg2ei32.v v8, (a0), v4
 # CHECK-INST: vluxseg2ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0x26]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 26 <unknown>
+# CHECK-UNKNOWN: 26456407 <unknown>
 
 vluxseg2ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg2ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0x24]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 24 <unknown>
+# CHECK-UNKNOWN: 24457407 <unknown>
 
 vluxseg2ei64.v v8, (a0), v4
 # CHECK-INST: vluxseg2ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0x26]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 26 <unknown>
+# CHECK-UNKNOWN: 26457407 <unknown>
 
 vloxseg2ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg2ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 2c <unknown>
+# CHECK-UNKNOWN: 2c450407 <unknown>
 
 vloxseg2ei8.v v8, (a0), v4
 # CHECK-INST: vloxseg2ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 2e <unknown>
+# CHECK-UNKNOWN: 2e450407 <unknown>
 
 vloxseg2ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg2ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 2c <unknown>
+# CHECK-UNKNOWN: 2c455407 <unknown>
 
 vloxseg2ei16.v v8, (a0), v4
 # CHECK-INST: vloxseg2ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 2e <unknown>
+# CHECK-UNKNOWN: 2e455407 <unknown>
 
 vloxseg2ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg2ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 2c <unknown>
+# CHECK-UNKNOWN: 2c456407 <unknown>
 
 vloxseg2ei32.v v8, (a0), v4
 # CHECK-INST: vloxseg2ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 2e <unknown>
+# CHECK-UNKNOWN: 2e456407 <unknown>
 
 vloxseg2ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg2ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 2c <unknown>
+# CHECK-UNKNOWN: 2c457407 <unknown>
 
 vloxseg2ei64.v v8, (a0), v4
 # CHECK-INST: vloxseg2ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 2e <unknown>
+# CHECK-UNKNOWN: 2e457407 <unknown>
 
 vlseg3e8.v v8, (a0), v0.t
 # CHECK-INST: vlseg3e8.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 40 <unknown>
+# CHECK-UNKNOWN: 40050407 <unknown>
 
 vlseg3e8.v v8, (a0)
 # CHECK-INST: vlseg3e8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0x42]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 42 <unknown>
+# CHECK-UNKNOWN: 42050407 <unknown>
 
 vlseg3e16.v v8, (a0), v0.t
 # CHECK-INST: vlseg3e16.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 40 <unknown>
+# CHECK-UNKNOWN: 40055407 <unknown>
 
 vlseg3e16.v v8, (a0)
 # CHECK-INST: vlseg3e16.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0x42]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 42 <unknown>
+# CHECK-UNKNOWN: 42055407 <unknown>
 
 vlseg3e32.v v8, (a0), v0.t
 # CHECK-INST: vlseg3e32.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 40 <unknown>
+# CHECK-UNKNOWN: 40056407 <unknown>
 
 vlseg3e32.v v8, (a0)
 # CHECK-INST: vlseg3e32.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0x42]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 42 <unknown>
+# CHECK-UNKNOWN: 42056407 <unknown>
 
 vlseg3e64.v v8, (a0), v0.t
 # CHECK-INST: vlseg3e64.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 40 <unknown>
+# CHECK-UNKNOWN: 40057407 <unknown>
 
 vlseg3e64.v v8, (a0)
 # CHECK-INST: vlseg3e64.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0x42]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 42 <unknown>
+# CHECK-UNKNOWN: 42057407 <unknown>
 
 vlseg3e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg3e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x41]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 41 <unknown>
+# CHECK-UNKNOWN: 41050407 <unknown>
 
 vlseg3e8ff.v v8, (a0)
 # CHECK-INST: vlseg3e8ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0x43]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 43 <unknown>
+# CHECK-UNKNOWN: 43050407 <unknown>
 
 vlseg3e16ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg3e16ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0x41]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 41 <unknown>
+# CHECK-UNKNOWN: 41055407 <unknown>
 
 vlseg3e16ff.v v8, (a0)
 # CHECK-INST: vlseg3e16ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0x43]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 43 <unknown>
+# CHECK-UNKNOWN: 43055407 <unknown>
 
 vlseg3e32ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg3e32ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0x41]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 41 <unknown>
+# CHECK-UNKNOWN: 41056407 <unknown>
 
 vlseg3e32ff.v v8, (a0)
 # CHECK-INST: vlseg3e32ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0x43]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 43 <unknown>
+# CHECK-UNKNOWN: 43056407 <unknown>
 
 vlseg3e64ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg3e64ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0x41]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 41 <unknown>
+# CHECK-UNKNOWN: 41057407 <unknown>
 
 vlseg3e64ff.v v8, (a0)
 # CHECK-INST: vlseg3e64ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0x43]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 43 <unknown>
+# CHECK-UNKNOWN: 43057407 <unknown>
 
 vlsseg3e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg3e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 48 <unknown>
+# CHECK-UNKNOWN: 48b50407 <unknown>
 
 vlsseg3e8.v v8, (a0), a1
 # CHECK-INST: vlsseg3e8.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab50407 <unknown>
 
 vlsseg3e16.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg3e16.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x54,0xb5,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 48 <unknown>
+# CHECK-UNKNOWN: 48b55407 <unknown>
 
 vlsseg3e16.v v8, (a0), a1
 # CHECK-INST: vlsseg3e16.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x54,0xb5,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab55407 <unknown>
 
 vlsseg3e32.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg3e32.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x64,0xb5,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 48 <unknown>
+# CHECK-UNKNOWN: 48b56407 <unknown>
 
 vlsseg3e32.v v8, (a0), a1
 # CHECK-INST: vlsseg3e32.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x64,0xb5,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab56407 <unknown>
 
 vlsseg3e64.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg3e64.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x74,0xb5,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 48 <unknown>
+# CHECK-UNKNOWN: 48b57407 <unknown>
 
 vlsseg3e64.v v8, (a0), a1
 # CHECK-INST: vlsseg3e64.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x74,0xb5,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab57407 <unknown>
 
 vluxseg3ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg3ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x44]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 44 <unknown>
+# CHECK-UNKNOWN: 44450407 <unknown>
 
 vluxseg3ei8.v v8, (a0), v4
 # CHECK-INST: vluxseg3ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0x46]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 46 <unknown>
+# CHECK-UNKNOWN: 46450407 <unknown>
 
 vluxseg3ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg3ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0x44]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 44 <unknown>
+# CHECK-UNKNOWN: 44455407 <unknown>
 
 vluxseg3ei16.v v8, (a0), v4
 # CHECK-INST: vluxseg3ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0x46]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 46 <unknown>
+# CHECK-UNKNOWN: 46455407 <unknown>
 
 vluxseg3ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg3ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0x44]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 44 <unknown>
+# CHECK-UNKNOWN: 44456407 <unknown>
 
 vluxseg3ei32.v v8, (a0), v4
 # CHECK-INST: vluxseg3ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0x46]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 46 <unknown>
+# CHECK-UNKNOWN: 46456407 <unknown>
 
 vluxseg3ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg3ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0x44]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 44 <unknown>
+# CHECK-UNKNOWN: 44457407 <unknown>
 
 vluxseg3ei64.v v8, (a0), v4
 # CHECK-INST: vluxseg3ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0x46]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 46 <unknown>
+# CHECK-UNKNOWN: 46457407 <unknown>
 
 vloxseg3ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg3ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 4c <unknown>
+# CHECK-UNKNOWN: 4c450407 <unknown>
 
 vloxseg3ei8.v v8, (a0), v4
 # CHECK-INST: vloxseg3ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 4e <unknown>
+# CHECK-UNKNOWN: 4e450407 <unknown>
 
 vloxseg3ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg3ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 4c <unknown>
+# CHECK-UNKNOWN: 4c455407 <unknown>
 
 vloxseg3ei16.v v8, (a0), v4
 # CHECK-INST: vloxseg3ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 4e <unknown>
+# CHECK-UNKNOWN: 4e455407 <unknown>
 
 vloxseg3ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg3ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 4c <unknown>
+# CHECK-UNKNOWN: 4c456407 <unknown>
 
 vloxseg3ei32.v v8, (a0), v4
 # CHECK-INST: vloxseg3ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 4e <unknown>
+# CHECK-UNKNOWN: 4e456407 <unknown>
 
 vloxseg3ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg3ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 4c <unknown>
+# CHECK-UNKNOWN: 4c457407 <unknown>
 
 vloxseg3ei64.v v8, (a0), v4
 # CHECK-INST: vloxseg3ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 4e <unknown>
+# CHECK-UNKNOWN: 4e457407 <unknown>
 
 vlseg4e8.v v8, (a0), v0.t
 # CHECK-INST: vlseg4e8.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x60]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 60 <unknown>
+# CHECK-UNKNOWN: 60050407 <unknown>
 
 vlseg4e8.v v8, (a0)
 # CHECK-INST: vlseg4e8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 62 <unknown>
+# CHECK-UNKNOWN: 62050407 <unknown>
 
 vlseg4e16.v v8, (a0), v0.t
 # CHECK-INST: vlseg4e16.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0x60]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 60 <unknown>
+# CHECK-UNKNOWN: 60055407 <unknown>
 
 vlseg4e16.v v8, (a0)
 # CHECK-INST: vlseg4e16.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 62 <unknown>
+# CHECK-UNKNOWN: 62055407 <unknown>
 
 vlseg4e32.v v8, (a0), v0.t
 # CHECK-INST: vlseg4e32.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0x60]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 60 <unknown>
+# CHECK-UNKNOWN: 60056407 <unknown>
 
 vlseg4e32.v v8, (a0)
 # CHECK-INST: vlseg4e32.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 62 <unknown>
+# CHECK-UNKNOWN: 62056407 <unknown>
 
 vlseg4e64.v v8, (a0), v0.t
 # CHECK-INST: vlseg4e64.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0x60]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 60 <unknown>
+# CHECK-UNKNOWN: 60057407 <unknown>
 
 vlseg4e64.v v8, (a0)
 # CHECK-INST: vlseg4e64.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 62 <unknown>
+# CHECK-UNKNOWN: 62057407 <unknown>
 
 vlseg4e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg4e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x61]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 61 <unknown>
+# CHECK-UNKNOWN: 61050407 <unknown>
 
 vlseg4e8ff.v v8, (a0)
 # CHECK-INST: vlseg4e8ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0x63]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 63 <unknown>
+# CHECK-UNKNOWN: 63050407 <unknown>
 
 vlseg4e16ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg4e16ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0x61]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 61 <unknown>
+# CHECK-UNKNOWN: 61055407 <unknown>
 
 vlseg4e16ff.v v8, (a0)
 # CHECK-INST: vlseg4e16ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0x63]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 63 <unknown>
+# CHECK-UNKNOWN: 63055407 <unknown>
 
 vlseg4e32ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg4e32ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0x61]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 61 <unknown>
+# CHECK-UNKNOWN: 61056407 <unknown>
 
 vlseg4e32ff.v v8, (a0)
 # CHECK-INST: vlseg4e32ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0x63]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 63 <unknown>
+# CHECK-UNKNOWN: 63056407 <unknown>
 
 vlseg4e64ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg4e64ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0x61]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 61 <unknown>
+# CHECK-UNKNOWN: 61057407 <unknown>
 
 vlseg4e64ff.v v8, (a0)
 # CHECK-INST: vlseg4e64ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0x63]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 63 <unknown>
+# CHECK-UNKNOWN: 63057407 <unknown>
 
 vlsseg4e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg4e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x68]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 68 <unknown>
+# CHECK-UNKNOWN: 68b50407 <unknown>
 
 vlsseg4e8.v v8, (a0), a1
 # CHECK-INST: vlsseg4e8.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab50407 <unknown>
 
 vlsseg4e16.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg4e16.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x54,0xb5,0x68]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 68 <unknown>
+# CHECK-UNKNOWN: 68b55407 <unknown>
 
 vlsseg4e16.v v8, (a0), a1
 # CHECK-INST: vlsseg4e16.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x54,0xb5,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab55407 <unknown>
 
 vlsseg4e32.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg4e32.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x64,0xb5,0x68]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 68 <unknown>
+# CHECK-UNKNOWN: 68b56407 <unknown>
 
 vlsseg4e32.v v8, (a0), a1
 # CHECK-INST: vlsseg4e32.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x64,0xb5,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab56407 <unknown>
 
 vlsseg4e64.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg4e64.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x74,0xb5,0x68]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 68 <unknown>
+# CHECK-UNKNOWN: 68b57407 <unknown>
 
 vlsseg4e64.v v8, (a0), a1
 # CHECK-INST: vlsseg4e64.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x74,0xb5,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab57407 <unknown>
 
 vluxseg4ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg4ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x64]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 64 <unknown>
+# CHECK-UNKNOWN: 64450407 <unknown>
 
 vluxseg4ei8.v v8, (a0), v4
 # CHECK-INST: vluxseg4ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 66 <unknown>
+# CHECK-UNKNOWN: 66450407 <unknown>
 
 vluxseg4ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg4ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0x64]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 64 <unknown>
+# CHECK-UNKNOWN: 64455407 <unknown>
 
 vluxseg4ei16.v v8, (a0), v4
 # CHECK-INST: vluxseg4ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 66 <unknown>
+# CHECK-UNKNOWN: 66455407 <unknown>
 
 vluxseg4ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg4ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0x64]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 64 <unknown>
+# CHECK-UNKNOWN: 64456407 <unknown>
 
 vluxseg4ei32.v v8, (a0), v4
 # CHECK-INST: vluxseg4ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 66 <unknown>
+# CHECK-UNKNOWN: 66456407 <unknown>
 
 vluxseg4ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg4ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0x64]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 64 <unknown>
+# CHECK-UNKNOWN: 64457407 <unknown>
 
 vluxseg4ei64.v v8, (a0), v4
 # CHECK-INST: vluxseg4ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 66 <unknown>
+# CHECK-UNKNOWN: 66457407 <unknown>
 
 vloxseg4ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg4ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 6c <unknown>
+# CHECK-UNKNOWN: 6c450407 <unknown>
 
 vloxseg4ei8.v v8, (a0), v4
 # CHECK-INST: vloxseg4ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 6e <unknown>
+# CHECK-UNKNOWN: 6e450407 <unknown>
 
 vloxseg4ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg4ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 6c <unknown>
+# CHECK-UNKNOWN: 6c455407 <unknown>
 
 vloxseg4ei16.v v8, (a0), v4
 # CHECK-INST: vloxseg4ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 6e <unknown>
+# CHECK-UNKNOWN: 6e455407 <unknown>
 
 vloxseg4ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg4ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 6c <unknown>
+# CHECK-UNKNOWN: 6c456407 <unknown>
 
 vloxseg4ei32.v v8, (a0), v4
 # CHECK-INST: vloxseg4ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 6e <unknown>
+# CHECK-UNKNOWN: 6e456407 <unknown>
 
 vloxseg4ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg4ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 6c <unknown>
+# CHECK-UNKNOWN: 6c457407 <unknown>
 
 vloxseg4ei64.v v8, (a0), v4
 # CHECK-INST: vloxseg4ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 6e <unknown>
+# CHECK-UNKNOWN: 6e457407 <unknown>
 
 vlseg5e8.v v8, (a0), v0.t
 # CHECK-INST: vlseg5e8.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 80 <unknown>
+# CHECK-UNKNOWN: 80050407 <unknown>
 
 vlseg5e8.v v8, (a0)
 # CHECK-INST: vlseg5e8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0x82]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 82 <unknown>
+# CHECK-UNKNOWN: 82050407 <unknown>
 
 vlseg5e16.v v8, (a0), v0.t
 # CHECK-INST: vlseg5e16.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 80 <unknown>
+# CHECK-UNKNOWN: 80055407 <unknown>
 
 vlseg5e16.v v8, (a0)
 # CHECK-INST: vlseg5e16.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0x82]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 82 <unknown>
+# CHECK-UNKNOWN: 82055407 <unknown>
 
 vlseg5e32.v v8, (a0), v0.t
 # CHECK-INST: vlseg5e32.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 80 <unknown>
+# CHECK-UNKNOWN: 80056407 <unknown>
 
 vlseg5e32.v v8, (a0)
 # CHECK-INST: vlseg5e32.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0x82]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 82 <unknown>
+# CHECK-UNKNOWN: 82056407 <unknown>
 
 vlseg5e64.v v8, (a0), v0.t
 # CHECK-INST: vlseg5e64.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 80 <unknown>
+# CHECK-UNKNOWN: 80057407 <unknown>
 
 vlseg5e64.v v8, (a0)
 # CHECK-INST: vlseg5e64.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0x82]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 82 <unknown>
+# CHECK-UNKNOWN: 82057407 <unknown>
 
 vlseg5e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg5e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x81]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 81 <unknown>
+# CHECK-UNKNOWN: 81050407 <unknown>
 
 vlseg5e8ff.v v8, (a0)
 # CHECK-INST: vlseg5e8ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0x83]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 83 <unknown>
+# CHECK-UNKNOWN: 83050407 <unknown>
 
 vlseg5e16ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg5e16ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0x81]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 81 <unknown>
+# CHECK-UNKNOWN: 81055407 <unknown>
 
 vlseg5e16ff.v v8, (a0)
 # CHECK-INST: vlseg5e16ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0x83]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 83 <unknown>
+# CHECK-UNKNOWN: 83055407 <unknown>
 
 vlseg5e32ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg5e32ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0x81]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 81 <unknown>
+# CHECK-UNKNOWN: 81056407 <unknown>
 
 vlseg5e32ff.v v8, (a0)
 # CHECK-INST: vlseg5e32ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0x83]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 83 <unknown>
+# CHECK-UNKNOWN: 83056407 <unknown>
 
 vlseg5e64ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg5e64ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0x81]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 81 <unknown>
+# CHECK-UNKNOWN: 81057407 <unknown>
 
 vlseg5e64ff.v v8, (a0)
 # CHECK-INST: vlseg5e64ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0x83]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 83 <unknown>
+# CHECK-UNKNOWN: 83057407 <unknown>
 
 vlsseg5e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg5e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x88]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 88 <unknown>
+# CHECK-UNKNOWN: 88b50407 <unknown>
 
 vlsseg5e8.v v8, (a0), a1
 # CHECK-INST: vlsseg5e8.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x8a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab50407 <unknown>
 
 vlsseg5e16.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg5e16.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x54,0xb5,0x88]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 88 <unknown>
+# CHECK-UNKNOWN: 88b55407 <unknown>
 
 vlsseg5e16.v v8, (a0), a1
 # CHECK-INST: vlsseg5e16.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x54,0xb5,0x8a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab55407 <unknown>
 
 vlsseg5e32.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg5e32.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x64,0xb5,0x88]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 88 <unknown>
+# CHECK-UNKNOWN: 88b56407 <unknown>
 
 vlsseg5e32.v v8, (a0), a1
 # CHECK-INST: vlsseg5e32.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x64,0xb5,0x8a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab56407 <unknown>
 
 vlsseg5e64.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg5e64.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x74,0xb5,0x88]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 88 <unknown>
+# CHECK-UNKNOWN: 88b57407 <unknown>
 
 vlsseg5e64.v v8, (a0), a1
 # CHECK-INST: vlsseg5e64.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x74,0xb5,0x8a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab57407 <unknown>
 
 vluxseg5ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg5ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x84]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 84 <unknown>
+# CHECK-UNKNOWN: 84450407 <unknown>
 
 vluxseg5ei8.v v8, (a0), v4
 # CHECK-INST: vluxseg5ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0x86]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 86 <unknown>
+# CHECK-UNKNOWN: 86450407 <unknown>
 
 vluxseg5ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg5ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0x84]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 84 <unknown>
+# CHECK-UNKNOWN: 84455407 <unknown>
 
 vluxseg5ei16.v v8, (a0), v4
 # CHECK-INST: vluxseg5ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0x86]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 86 <unknown>
+# CHECK-UNKNOWN: 86455407 <unknown>
 
 vluxseg5ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg5ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0x84]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 84 <unknown>
+# CHECK-UNKNOWN: 84456407 <unknown>
 
 vluxseg5ei32.v v8, (a0), v4
 # CHECK-INST: vluxseg5ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0x86]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 86 <unknown>
+# CHECK-UNKNOWN: 86456407 <unknown>
 
 vluxseg5ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg5ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0x84]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 84 <unknown>
+# CHECK-UNKNOWN: 84457407 <unknown>
 
 vluxseg5ei64.v v8, (a0), v4
 # CHECK-INST: vluxseg5ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0x86]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 86 <unknown>
+# CHECK-UNKNOWN: 86457407 <unknown>
 
 vloxseg5ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg5ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x8c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 8c <unknown>
+# CHECK-UNKNOWN: 8c450407 <unknown>
 
 vloxseg5ei8.v v8, (a0), v4
 # CHECK-INST: vloxseg5ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0x8e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 8e <unknown>
+# CHECK-UNKNOWN: 8e450407 <unknown>
 
 vloxseg5ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg5ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0x8c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 8c <unknown>
+# CHECK-UNKNOWN: 8c455407 <unknown>
 
 vloxseg5ei16.v v8, (a0), v4
 # CHECK-INST: vloxseg5ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0x8e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 8e <unknown>
+# CHECK-UNKNOWN: 8e455407 <unknown>
 
 vloxseg5ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg5ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0x8c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 8c <unknown>
+# CHECK-UNKNOWN: 8c456407 <unknown>
 
 vloxseg5ei32.v v8, (a0), v4
 # CHECK-INST: vloxseg5ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0x8e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 8e <unknown>
+# CHECK-UNKNOWN: 8e456407 <unknown>
 
 vloxseg5ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg5ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0x8c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 8c <unknown>
+# CHECK-UNKNOWN: 8c457407 <unknown>
 
 vloxseg5ei64.v v8, (a0), v4
 # CHECK-INST: vloxseg5ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0x8e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 8e <unknown>
+# CHECK-UNKNOWN: 8e457407 <unknown>
 
 vlseg6e8.v v8, (a0), v0.t
 # CHECK-INST: vlseg6e8.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0xa0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 a0 <unknown>
+# CHECK-UNKNOWN: a0050407 <unknown>
 
 vlseg6e8.v v8, (a0)
 # CHECK-INST: vlseg6e8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0xa2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 a2 <unknown>
+# CHECK-UNKNOWN: a2050407 <unknown>
 
 vlseg6e16.v v8, (a0), v0.t
 # CHECK-INST: vlseg6e16.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0xa0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 a0 <unknown>
+# CHECK-UNKNOWN: a0055407 <unknown>
 
 vlseg6e16.v v8, (a0)
 # CHECK-INST: vlseg6e16.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0xa2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 a2 <unknown>
+# CHECK-UNKNOWN: a2055407 <unknown>
 
 vlseg6e32.v v8, (a0), v0.t
 # CHECK-INST: vlseg6e32.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0xa0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 a0 <unknown>
+# CHECK-UNKNOWN: a0056407 <unknown>
 
 vlseg6e32.v v8, (a0)
 # CHECK-INST: vlseg6e32.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0xa2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 a2 <unknown>
+# CHECK-UNKNOWN: a2056407 <unknown>
 
 vlseg6e64.v v8, (a0), v0.t
 # CHECK-INST: vlseg6e64.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0xa0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 a0 <unknown>
+# CHECK-UNKNOWN: a0057407 <unknown>
 
 vlseg6e64.v v8, (a0)
 # CHECK-INST: vlseg6e64.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0xa2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 a2 <unknown>
+# CHECK-UNKNOWN: a2057407 <unknown>
 
 vlseg6e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg6e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0xa1]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 a1 <unknown>
+# CHECK-UNKNOWN: a1050407 <unknown>
 
 vlseg6e8ff.v v8, (a0)
 # CHECK-INST: vlseg6e8ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0xa3]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 a3 <unknown>
+# CHECK-UNKNOWN: a3050407 <unknown>
 
 vlseg6e16ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg6e16ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0xa1]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 a1 <unknown>
+# CHECK-UNKNOWN: a1055407 <unknown>
 
 vlseg6e16ff.v v8, (a0)
 # CHECK-INST: vlseg6e16ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0xa3]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 a3 <unknown>
+# CHECK-UNKNOWN: a3055407 <unknown>
 
 vlseg6e32ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg6e32ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0xa1]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 a1 <unknown>
+# CHECK-UNKNOWN: a1056407 <unknown>
 
 vlseg6e32ff.v v8, (a0)
 # CHECK-INST: vlseg6e32ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0xa3]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 a3 <unknown>
+# CHECK-UNKNOWN: a3056407 <unknown>
 
 vlseg6e64ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg6e64ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0xa1]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 a1 <unknown>
+# CHECK-UNKNOWN: a1057407 <unknown>
 
 vlseg6e64ff.v v8, (a0)
 # CHECK-INST: vlseg6e64ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0xa3]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 a3 <unknown>
+# CHECK-UNKNOWN: a3057407 <unknown>
 
 vlsseg6e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg6e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0xa8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b50407 <unknown>
 
 vlsseg6e8.v v8, (a0), a1
 # CHECK-INST: vlsseg6e8.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x04,0xb5,0xaa]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 aa <unknown>
+# CHECK-UNKNOWN: aab50407 <unknown>
 
 vlsseg6e16.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg6e16.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x54,0xb5,0xa8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b55407 <unknown>
 
 vlsseg6e16.v v8, (a0), a1
 # CHECK-INST: vlsseg6e16.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x54,0xb5,0xaa]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 aa <unknown>
+# CHECK-UNKNOWN: aab55407 <unknown>
 
 vlsseg6e32.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg6e32.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x64,0xb5,0xa8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b56407 <unknown>
 
 vlsseg6e32.v v8, (a0), a1
 # CHECK-INST: vlsseg6e32.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x64,0xb5,0xaa]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 aa <unknown>
+# CHECK-UNKNOWN: aab56407 <unknown>
 
 vlsseg6e64.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg6e64.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x74,0xb5,0xa8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b57407 <unknown>
 
 vlsseg6e64.v v8, (a0), a1
 # CHECK-INST: vlsseg6e64.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x74,0xb5,0xaa]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 aa <unknown>
+# CHECK-UNKNOWN: aab57407 <unknown>
 
 vluxseg6ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg6ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 a4 <unknown>
+# CHECK-UNKNOWN: a4450407 <unknown>
 
 vluxseg6ei8.v v8, (a0), v4
 # CHECK-INST: vluxseg6ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 a6 <unknown>
+# CHECK-UNKNOWN: a6450407 <unknown>
 
 vluxseg6ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg6ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 a4 <unknown>
+# CHECK-UNKNOWN: a4455407 <unknown>
 
 vluxseg6ei16.v v8, (a0), v4
 # CHECK-INST: vluxseg6ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 a6 <unknown>
+# CHECK-UNKNOWN: a6455407 <unknown>
 
 vluxseg6ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg6ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 a4 <unknown>
+# CHECK-UNKNOWN: a4456407 <unknown>
 
 vluxseg6ei32.v v8, (a0), v4
 # CHECK-INST: vluxseg6ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 a6 <unknown>
+# CHECK-UNKNOWN: a6456407 <unknown>
 
 vluxseg6ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg6ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 a4 <unknown>
+# CHECK-UNKNOWN: a4457407 <unknown>
 
 vluxseg6ei64.v v8, (a0), v4
 # CHECK-INST: vluxseg6ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 a6 <unknown>
+# CHECK-UNKNOWN: a6457407 <unknown>
 
 vloxseg6ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg6ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0xac]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 ac <unknown>
+# CHECK-UNKNOWN: ac450407 <unknown>
 
 vloxseg6ei8.v v8, (a0), v4
 # CHECK-INST: vloxseg6ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0xae]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 ae <unknown>
+# CHECK-UNKNOWN: ae450407 <unknown>
 
 vloxseg6ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg6ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0xac]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 ac <unknown>
+# CHECK-UNKNOWN: ac455407 <unknown>
 
 vloxseg6ei16.v v8, (a0), v4
 # CHECK-INST: vloxseg6ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0xae]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 ae <unknown>
+# CHECK-UNKNOWN: ae455407 <unknown>
 
 vloxseg6ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg6ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0xac]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 ac <unknown>
+# CHECK-UNKNOWN: ac456407 <unknown>
 
 vloxseg6ei32.v v8, (a0), v4
 # CHECK-INST: vloxseg6ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0xae]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 ae <unknown>
+# CHECK-UNKNOWN: ae456407 <unknown>
 
 vloxseg6ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg6ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0xac]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 ac <unknown>
+# CHECK-UNKNOWN: ac457407 <unknown>
 
 vloxseg6ei64.v v8, (a0), v4
 # CHECK-INST: vloxseg6ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0xae]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 ae <unknown>
+# CHECK-UNKNOWN: ae457407 <unknown>
 
 vlseg7e8.v v8, (a0), v0.t
 # CHECK-INST: vlseg7e8.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 c0 <unknown>
+# CHECK-UNKNOWN: c0050407 <unknown>
 
 vlseg7e8.v v8, (a0)
 # CHECK-INST: vlseg7e8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 c2 <unknown>
+# CHECK-UNKNOWN: c2050407 <unknown>
 
 vlseg7e16.v v8, (a0), v0.t
 # CHECK-INST: vlseg7e16.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 c0 <unknown>
+# CHECK-UNKNOWN: c0055407 <unknown>
 
 vlseg7e16.v v8, (a0)
 # CHECK-INST: vlseg7e16.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 c2 <unknown>
+# CHECK-UNKNOWN: c2055407 <unknown>
 
 vlseg7e32.v v8, (a0), v0.t
 # CHECK-INST: vlseg7e32.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 c0 <unknown>
+# CHECK-UNKNOWN: c0056407 <unknown>
 
 vlseg7e32.v v8, (a0)
 # CHECK-INST: vlseg7e32.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 c2 <unknown>
+# CHECK-UNKNOWN: c2056407 <unknown>
 
 vlseg7e64.v v8, (a0), v0.t
 # CHECK-INST: vlseg7e64.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 c0 <unknown>
+# CHECK-UNKNOWN: c0057407 <unknown>
 
 vlseg7e64.v v8, (a0)
 # CHECK-INST: vlseg7e64.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 c2 <unknown>
+# CHECK-UNKNOWN: c2057407 <unknown>
 
 vlseg7e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg7e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0xc1]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 c1 <unknown>
+# CHECK-UNKNOWN: c1050407 <unknown>
 
 vlseg7e8ff.v v8, (a0)
 # CHECK-INST: vlseg7e8ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0xc3]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 c3 <unknown>
+# CHECK-UNKNOWN: c3050407 <unknown>
 
 vlseg7e16ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg7e16ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0xc1]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 c1 <unknown>
+# CHECK-UNKNOWN: c1055407 <unknown>
 
 vlseg7e16ff.v v8, (a0)
 # CHECK-INST: vlseg7e16ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0xc3]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 c3 <unknown>
+# CHECK-UNKNOWN: c3055407 <unknown>
 
 vlseg7e32ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg7e32ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0xc1]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 c1 <unknown>
+# CHECK-UNKNOWN: c1056407 <unknown>
 
 vlseg7e32ff.v v8, (a0)
 # CHECK-INST: vlseg7e32ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0xc3]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 c3 <unknown>
+# CHECK-UNKNOWN: c3056407 <unknown>
 
 vlseg7e64ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg7e64ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0xc1]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 c1 <unknown>
+# CHECK-UNKNOWN: c1057407 <unknown>
 
 vlseg7e64ff.v v8, (a0)
 # CHECK-INST: vlseg7e64ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0xc3]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 c3 <unknown>
+# CHECK-UNKNOWN: c3057407 <unknown>
 
 vlsseg7e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg7e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0xc8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b50407 <unknown>
 
 vlsseg7e8.v v8, (a0), a1
 # CHECK-INST: vlsseg7e8.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x04,0xb5,0xca]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 ca <unknown>
+# CHECK-UNKNOWN: cab50407 <unknown>
 
 vlsseg7e16.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg7e16.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x54,0xb5,0xc8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b55407 <unknown>
 
 vlsseg7e16.v v8, (a0), a1
 # CHECK-INST: vlsseg7e16.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x54,0xb5,0xca]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 ca <unknown>
+# CHECK-UNKNOWN: cab55407 <unknown>
 
 vlsseg7e32.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg7e32.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x64,0xb5,0xc8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b56407 <unknown>
 
 vlsseg7e32.v v8, (a0), a1
 # CHECK-INST: vlsseg7e32.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x64,0xb5,0xca]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 ca <unknown>
+# CHECK-UNKNOWN: cab56407 <unknown>
 
 vlsseg7e64.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg7e64.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x74,0xb5,0xc8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b57407 <unknown>
 
 vlsseg7e64.v v8, (a0), a1
 # CHECK-INST: vlsseg7e64.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x74,0xb5,0xca]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 ca <unknown>
+# CHECK-UNKNOWN: cab57407 <unknown>
 
 vluxseg7ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg7ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0xc4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 c4 <unknown>
+# CHECK-UNKNOWN: c4450407 <unknown>
 
 vluxseg7ei8.v v8, (a0), v4
 # CHECK-INST: vluxseg7ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0xc6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 c6 <unknown>
+# CHECK-UNKNOWN: c6450407 <unknown>
 
 vluxseg7ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg7ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0xc4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 c4 <unknown>
+# CHECK-UNKNOWN: c4455407 <unknown>
 
 vluxseg7ei16.v v8, (a0), v4
 # CHECK-INST: vluxseg7ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0xc6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 c6 <unknown>
+# CHECK-UNKNOWN: c6455407 <unknown>
 
 vluxseg7ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg7ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0xc4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 c4 <unknown>
+# CHECK-UNKNOWN: c4456407 <unknown>
 
 vluxseg7ei32.v v8, (a0), v4
 # CHECK-INST: vluxseg7ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0xc6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 c6 <unknown>
+# CHECK-UNKNOWN: c6456407 <unknown>
 
 vluxseg7ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg7ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0xc4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 c4 <unknown>
+# CHECK-UNKNOWN: c4457407 <unknown>
 
 vluxseg7ei64.v v8, (a0), v4
 # CHECK-INST: vluxseg7ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0xc6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 c6 <unknown>
+# CHECK-UNKNOWN: c6457407 <unknown>
 
 vloxseg7ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg7ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0xcc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 cc <unknown>
+# CHECK-UNKNOWN: cc450407 <unknown>
 
 vloxseg7ei8.v v8, (a0), v4
 # CHECK-INST: vloxseg7ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0xce]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 ce <unknown>
+# CHECK-UNKNOWN: ce450407 <unknown>
 
 vloxseg7ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg7ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0xcc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 cc <unknown>
+# CHECK-UNKNOWN: cc455407 <unknown>
 
 vloxseg7ei16.v v8, (a0), v4
 # CHECK-INST: vloxseg7ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0xce]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 ce <unknown>
+# CHECK-UNKNOWN: ce455407 <unknown>
 
 vloxseg7ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg7ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0xcc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 cc <unknown>
+# CHECK-UNKNOWN: cc456407 <unknown>
 
 vloxseg7ei32.v v8, (a0), v4
 # CHECK-INST: vloxseg7ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0xce]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 ce <unknown>
+# CHECK-UNKNOWN: ce456407 <unknown>
 
 vloxseg7ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg7ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0xcc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 cc <unknown>
+# CHECK-UNKNOWN: cc457407 <unknown>
 
 vloxseg7ei64.v v8, (a0), v4
 # CHECK-INST: vloxseg7ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0xce]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 ce <unknown>
+# CHECK-UNKNOWN: ce457407 <unknown>
 
 vlseg8e8.v v8, (a0), v0.t
 # CHECK-INST: vlseg8e8.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0xe0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 e0 <unknown>
+# CHECK-UNKNOWN: e0050407 <unknown>
 
 vlseg8e8.v v8, (a0)
 # CHECK-INST: vlseg8e8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 e2 <unknown>
+# CHECK-UNKNOWN: e2050407 <unknown>
 
 vlseg8e16.v v8, (a0), v0.t
 # CHECK-INST: vlseg8e16.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0xe0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 e0 <unknown>
+# CHECK-UNKNOWN: e0055407 <unknown>
 
 vlseg8e16.v v8, (a0)
 # CHECK-INST: vlseg8e16.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 e2 <unknown>
+# CHECK-UNKNOWN: e2055407 <unknown>
 
 vlseg8e32.v v8, (a0), v0.t
 # CHECK-INST: vlseg8e32.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0xe0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 e0 <unknown>
+# CHECK-UNKNOWN: e0056407 <unknown>
 
 vlseg8e32.v v8, (a0)
 # CHECK-INST: vlseg8e32.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 e2 <unknown>
+# CHECK-UNKNOWN: e2056407 <unknown>
 
 vlseg8e64.v v8, (a0), v0.t
 # CHECK-INST: vlseg8e64.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0xe0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 e0 <unknown>
+# CHECK-UNKNOWN: e0057407 <unknown>
 
 vlseg8e64.v v8, (a0)
 # CHECK-INST: vlseg8e64.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 e2 <unknown>
+# CHECK-UNKNOWN: e2057407 <unknown>
 
 vlseg8e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg8e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0xe1]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 e1 <unknown>
+# CHECK-UNKNOWN: e1050407 <unknown>
 
 vlseg8e8ff.v v8, (a0)
 # CHECK-INST: vlseg8e8ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x05,0xe3]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 e3 <unknown>
+# CHECK-UNKNOWN: e3050407 <unknown>
 
 vlseg8e16ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg8e16ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x54,0x05,0xe1]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 e1 <unknown>
+# CHECK-UNKNOWN: e1055407 <unknown>
 
 vlseg8e16ff.v v8, (a0)
 # CHECK-INST: vlseg8e16ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0xe3]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 e3 <unknown>
+# CHECK-UNKNOWN: e3055407 <unknown>
 
 vlseg8e32ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg8e32ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x64,0x05,0xe1]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 e1 <unknown>
+# CHECK-UNKNOWN: e1056407 <unknown>
 
 vlseg8e32ff.v v8, (a0)
 # CHECK-INST: vlseg8e32ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x64,0x05,0xe3]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 e3 <unknown>
+# CHECK-UNKNOWN: e3056407 <unknown>
 
 vlseg8e64ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg8e64ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x74,0x05,0xe1]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 e1 <unknown>
+# CHECK-UNKNOWN: e1057407 <unknown>
 
 vlseg8e64ff.v v8, (a0)
 # CHECK-INST: vlseg8e64ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x05,0xe3]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 e3 <unknown>
+# CHECK-UNKNOWN: e3057407 <unknown>
 
 vlsseg8e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg8e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0xe8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b50407 <unknown>
 
 vlsseg8e8.v v8, (a0), a1
 # CHECK-INST: vlsseg8e8.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x04,0xb5,0xea]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 ea <unknown>
+# CHECK-UNKNOWN: eab50407 <unknown>
 
 vlsseg8e16.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg8e16.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x54,0xb5,0xe8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b55407 <unknown>
 
 vlsseg8e16.v v8, (a0), a1
 # CHECK-INST: vlsseg8e16.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x54,0xb5,0xea]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 ea <unknown>
+# CHECK-UNKNOWN: eab55407 <unknown>
 
 vlsseg8e32.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg8e32.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x64,0xb5,0xe8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b56407 <unknown>
 
 vlsseg8e32.v v8, (a0), a1
 # CHECK-INST: vlsseg8e32.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x64,0xb5,0xea]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 ea <unknown>
+# CHECK-UNKNOWN: eab56407 <unknown>
 
 vlsseg8e64.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg8e64.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x74,0xb5,0xe8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b57407 <unknown>
 
 vlsseg8e64.v v8, (a0), a1
 # CHECK-INST: vlsseg8e64.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x74,0xb5,0xea]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 ea <unknown>
+# CHECK-UNKNOWN: eab57407 <unknown>
 
 vluxseg8ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg8ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0xe4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 e4 <unknown>
+# CHECK-UNKNOWN: e4450407 <unknown>
 
 vluxseg8ei8.v v8, (a0), v4
 # CHECK-INST: vluxseg8ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0xe6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 e6 <unknown>
+# CHECK-UNKNOWN: e6450407 <unknown>
 
 vluxseg8ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg8ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0xe4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 e4 <unknown>
+# CHECK-UNKNOWN: e4455407 <unknown>
 
 vluxseg8ei16.v v8, (a0), v4
 # CHECK-INST: vluxseg8ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0xe6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 e6 <unknown>
+# CHECK-UNKNOWN: e6455407 <unknown>
 
 vluxseg8ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg8ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0xe4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 e4 <unknown>
+# CHECK-UNKNOWN: e4456407 <unknown>
 
 vluxseg8ei32.v v8, (a0), v4
 # CHECK-INST: vluxseg8ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0xe6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 e6 <unknown>
+# CHECK-UNKNOWN: e6456407 <unknown>
 
 vluxseg8ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg8ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0xe4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 e4 <unknown>
+# CHECK-UNKNOWN: e4457407 <unknown>
 
 vluxseg8ei64.v v8, (a0), v4
 # CHECK-INST: vluxseg8ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0xe6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 e6 <unknown>
+# CHECK-UNKNOWN: e6457407 <unknown>
 
 vloxseg8ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg8ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0xec]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 ec <unknown>
+# CHECK-UNKNOWN: ec450407 <unknown>
 
 vloxseg8ei8.v v8, (a0), v4
 # CHECK-INST: vloxseg8ei8.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x04,0x45,0xee]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 ee <unknown>
+# CHECK-UNKNOWN: ee450407 <unknown>
 
 vloxseg8ei16.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg8ei16.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x54,0x45,0xec]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 ec <unknown>
+# CHECK-UNKNOWN: ec455407 <unknown>
 
 vloxseg8ei16.v v8, (a0), v4
 # CHECK-INST: vloxseg8ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0xee]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 ee <unknown>
+# CHECK-UNKNOWN: ee455407 <unknown>
 
 vloxseg8ei32.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg8ei32.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x64,0x45,0xec]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 ec <unknown>
+# CHECK-UNKNOWN: ec456407 <unknown>
 
 vloxseg8ei32.v v8, (a0), v4
 # CHECK-INST: vloxseg8ei32.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x64,0x45,0xee]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 ee <unknown>
+# CHECK-UNKNOWN: ee456407 <unknown>
 
 vloxseg8ei64.v v8, (a0), v4, v0.t
 # CHECK-INST: vloxseg8ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0xec]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 ec <unknown>
+# CHECK-UNKNOWN: ec457407 <unknown>
 
 vloxseg8ei64.v v8, (a0), v4
 # CHECK-INST: vloxseg8ei64.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x74,0x45,0xee]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 ee <unknown>
+# CHECK-UNKNOWN: ee457407 <unknown>
 
 vsseg2e8.v v24, (a0), v0.t
 # CHECK-INST: vsseg2e8.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x05,0x20]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 20 <unknown>
+# CHECK-UNKNOWN: 20050c27 <unknown>
 
 vsseg2e8.v v24, (a0)
 # CHECK-INST: vsseg2e8.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0x05,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 22 <unknown>
+# CHECK-UNKNOWN: 22050c27 <unknown>
 
 vsseg2e16.v v24, (a0), v0.t
 # CHECK-INST: vsseg2e16.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x05,0x20]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 20 <unknown>
+# CHECK-UNKNOWN: 20055c27 <unknown>
 
 vsseg2e16.v v24, (a0)
 # CHECK-INST: vsseg2e16.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x5c,0x05,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 22 <unknown>
+# CHECK-UNKNOWN: 22055c27 <unknown>
 
 vsseg2e32.v v24, (a0), v0.t
 # CHECK-INST: vsseg2e32.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x05,0x20]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 20 <unknown>
+# CHECK-UNKNOWN: 20056c27 <unknown>
 
 vsseg2e32.v v24, (a0)
 # CHECK-INST: vsseg2e32.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x6c,0x05,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 22 <unknown>
+# CHECK-UNKNOWN: 22056c27 <unknown>
 
 vsseg2e64.v v24, (a0), v0.t
 # CHECK-INST: vsseg2e64.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x05,0x20]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 20 <unknown>
+# CHECK-UNKNOWN: 20057c27 <unknown>
 
 vsseg2e64.v v24, (a0)
 # CHECK-INST: vsseg2e64.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x7c,0x05,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 22 <unknown>
+# CHECK-UNKNOWN: 22057c27 <unknown>
 
 vssseg2e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg2e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 28 <unknown>
+# CHECK-UNKNOWN: 28b50c27 <unknown>
 
 vssseg2e8.v v24, (a0), a1
 # CHECK-INST: vssseg2e8.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab50c27 <unknown>
 
 vssseg2e16.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg2e16.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 28 <unknown>
+# CHECK-UNKNOWN: 28b55c27 <unknown>
 
 vssseg2e16.v v24, (a0), a1
 # CHECK-INST: vssseg2e16.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab55c27 <unknown>
 
 vssseg2e32.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg2e32.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 28 <unknown>
+# CHECK-UNKNOWN: 28b56c27 <unknown>
 
 vssseg2e32.v v24, (a0), a1
 # CHECK-INST: vssseg2e32.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab56c27 <unknown>
 
 vssseg2e64.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg2e64.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 28 <unknown>
+# CHECK-UNKNOWN: 28b57c27 <unknown>
 
 vssseg2e64.v v24, (a0), a1
 # CHECK-INST: vssseg2e64.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab57c27 <unknown>
 
 vsuxseg2ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg2ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x24]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 24 <unknown>
+# CHECK-UNKNOWN: 24450c27 <unknown>
 
 vsuxseg2ei8.v v24, (a0), v4
 # CHECK-INST: vsuxseg2ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x26]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 26 <unknown>
+# CHECK-UNKNOWN: 26450c27 <unknown>
 
 vsuxseg2ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg2ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x24]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 24 <unknown>
+# CHECK-UNKNOWN: 24455c27 <unknown>
 
 vsuxseg2ei16.v v24, (a0), v4
 # CHECK-INST: vsuxseg2ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x26]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 26 <unknown>
+# CHECK-UNKNOWN: 26455c27 <unknown>
 
 vsuxseg2ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg2ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x24]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 24 <unknown>
+# CHECK-UNKNOWN: 24456c27 <unknown>
 
 vsuxseg2ei32.v v24, (a0), v4
 # CHECK-INST: vsuxseg2ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x26]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 26 <unknown>
+# CHECK-UNKNOWN: 26456c27 <unknown>
 
 vsuxseg2ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg2ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x24]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 24 <unknown>
+# CHECK-UNKNOWN: 24457c27 <unknown>
 
 vsuxseg2ei64.v v24, (a0), v4
 # CHECK-INST: vsuxseg2ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x26]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 26 <unknown>
+# CHECK-UNKNOWN: 26457c27 <unknown>
 
 vsoxseg2ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg2ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 2c <unknown>
+# CHECK-UNKNOWN: 2c450c27 <unknown>
 
 vsoxseg2ei8.v v24, (a0), v4
 # CHECK-INST: vsoxseg2ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 2e <unknown>
+# CHECK-UNKNOWN: 2e450c27 <unknown>
 
 vsoxseg2ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg2ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 2c <unknown>
+# CHECK-UNKNOWN: 2c455c27 <unknown>
 
 vsoxseg2ei16.v v24, (a0), v4
 # CHECK-INST: vsoxseg2ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 2e <unknown>
+# CHECK-UNKNOWN: 2e455c27 <unknown>
 
 vsoxseg2ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg2ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 2c <unknown>
+# CHECK-UNKNOWN: 2c456c27 <unknown>
 
 vsoxseg2ei32.v v24, (a0), v4
 # CHECK-INST: vsoxseg2ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 2e <unknown>
+# CHECK-UNKNOWN: 2e456c27 <unknown>
 
 vsoxseg2ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg2ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x2c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 2c <unknown>
+# CHECK-UNKNOWN: 2c457c27 <unknown>
 
 vsoxseg2ei64.v v24, (a0), v4
 # CHECK-INST: vsoxseg2ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x2e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 2e <unknown>
+# CHECK-UNKNOWN: 2e457c27 <unknown>
 
 vsseg3e8.v v24, (a0), v0.t
 # CHECK-INST: vsseg3e8.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x05,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 40 <unknown>
+# CHECK-UNKNOWN: 40050c27 <unknown>
 
 vsseg3e8.v v24, (a0)
 # CHECK-INST: vsseg3e8.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0x05,0x42]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 42 <unknown>
+# CHECK-UNKNOWN: 42050c27 <unknown>
 
 vsseg3e16.v v24, (a0), v0.t
 # CHECK-INST: vsseg3e16.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x05,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 40 <unknown>
+# CHECK-UNKNOWN: 40055c27 <unknown>
 
 vsseg3e16.v v24, (a0)
 # CHECK-INST: vsseg3e16.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x5c,0x05,0x42]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 42 <unknown>
+# CHECK-UNKNOWN: 42055c27 <unknown>
 
 vsseg3e32.v v24, (a0), v0.t
 # CHECK-INST: vsseg3e32.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x05,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 40 <unknown>
+# CHECK-UNKNOWN: 40056c27 <unknown>
 
 vsseg3e32.v v24, (a0)
 # CHECK-INST: vsseg3e32.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x6c,0x05,0x42]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 42 <unknown>
+# CHECK-UNKNOWN: 42056c27 <unknown>
 
 vsseg3e64.v v24, (a0), v0.t
 # CHECK-INST: vsseg3e64.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x05,0x40]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 40 <unknown>
+# CHECK-UNKNOWN: 40057c27 <unknown>
 
 vsseg3e64.v v24, (a0)
 # CHECK-INST: vsseg3e64.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x7c,0x05,0x42]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 42 <unknown>
+# CHECK-UNKNOWN: 42057c27 <unknown>
 
 vssseg3e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg3e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 48 <unknown>
+# CHECK-UNKNOWN: 48b50c27 <unknown>
 
 vssseg3e8.v v24, (a0), a1
 # CHECK-INST: vssseg3e8.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab50c27 <unknown>
 
 vssseg3e16.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg3e16.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 48 <unknown>
+# CHECK-UNKNOWN: 48b55c27 <unknown>
 
 vssseg3e16.v v24, (a0), a1
 # CHECK-INST: vssseg3e16.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab55c27 <unknown>
 
 vssseg3e32.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg3e32.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 48 <unknown>
+# CHECK-UNKNOWN: 48b56c27 <unknown>
 
 vssseg3e32.v v24, (a0), a1
 # CHECK-INST: vssseg3e32.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab56c27 <unknown>
 
 vssseg3e64.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg3e64.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0x48]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 48 <unknown>
+# CHECK-UNKNOWN: 48b57c27 <unknown>
 
 vssseg3e64.v v24, (a0), a1
 # CHECK-INST: vssseg3e64.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0x4a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab57c27 <unknown>
 
 vsuxseg3ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg3ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x44]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 44 <unknown>
+# CHECK-UNKNOWN: 44450c27 <unknown>
 
 vsuxseg3ei8.v v24, (a0), v4
 # CHECK-INST: vsuxseg3ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x46]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 46 <unknown>
+# CHECK-UNKNOWN: 46450c27 <unknown>
 
 vsuxseg3ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg3ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x44]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 44 <unknown>
+# CHECK-UNKNOWN: 44455c27 <unknown>
 
 vsuxseg3ei16.v v24, (a0), v4
 # CHECK-INST: vsuxseg3ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x46]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 46 <unknown>
+# CHECK-UNKNOWN: 46455c27 <unknown>
 
 vsuxseg3ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg3ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x44]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 44 <unknown>
+# CHECK-UNKNOWN: 44456c27 <unknown>
 
 vsuxseg3ei32.v v24, (a0), v4
 # CHECK-INST: vsuxseg3ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x46]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 46 <unknown>
+# CHECK-UNKNOWN: 46456c27 <unknown>
 
 vsuxseg3ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg3ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x44]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 44 <unknown>
+# CHECK-UNKNOWN: 44457c27 <unknown>
 
 vsuxseg3ei64.v v24, (a0), v4
 # CHECK-INST: vsuxseg3ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x46]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 46 <unknown>
+# CHECK-UNKNOWN: 46457c27 <unknown>
 
 vsoxseg3ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg3ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 4c <unknown>
+# CHECK-UNKNOWN: 4c450c27 <unknown>
 
 vsoxseg3ei8.v v24, (a0), v4
 # CHECK-INST: vsoxseg3ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 4e <unknown>
+# CHECK-UNKNOWN: 4e450c27 <unknown>
 
 vsoxseg3ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg3ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 4c <unknown>
+# CHECK-UNKNOWN: 4c455c27 <unknown>
 
 vsoxseg3ei16.v v24, (a0), v4
 # CHECK-INST: vsoxseg3ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 4e <unknown>
+# CHECK-UNKNOWN: 4e455c27 <unknown>
 
 vsoxseg3ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg3ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 4c <unknown>
+# CHECK-UNKNOWN: 4c456c27 <unknown>
 
 vsoxseg3ei32.v v24, (a0), v4
 # CHECK-INST: vsoxseg3ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 4e <unknown>
+# CHECK-UNKNOWN: 4e456c27 <unknown>
 
 vsoxseg3ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg3ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x4c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 4c <unknown>
+# CHECK-UNKNOWN: 4c457c27 <unknown>
 
 vsoxseg3ei64.v v24, (a0), v4
 # CHECK-INST: vsoxseg3ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x4e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 4e <unknown>
+# CHECK-UNKNOWN: 4e457c27 <unknown>
 
 vsseg4e8.v v24, (a0), v0.t
 # CHECK-INST: vsseg4e8.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x05,0x60]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 60 <unknown>
+# CHECK-UNKNOWN: 60050c27 <unknown>
 
 vsseg4e8.v v24, (a0)
 # CHECK-INST: vsseg4e8.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0x05,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 62 <unknown>
+# CHECK-UNKNOWN: 62050c27 <unknown>
 
 vsseg4e16.v v24, (a0), v0.t
 # CHECK-INST: vsseg4e16.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x05,0x60]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 60 <unknown>
+# CHECK-UNKNOWN: 60055c27 <unknown>
 
 vsseg4e16.v v24, (a0)
 # CHECK-INST: vsseg4e16.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x5c,0x05,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 62 <unknown>
+# CHECK-UNKNOWN: 62055c27 <unknown>
 
 vsseg4e32.v v24, (a0), v0.t
 # CHECK-INST: vsseg4e32.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x05,0x60]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 60 <unknown>
+# CHECK-UNKNOWN: 60056c27 <unknown>
 
 vsseg4e32.v v24, (a0)
 # CHECK-INST: vsseg4e32.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x6c,0x05,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 62 <unknown>
+# CHECK-UNKNOWN: 62056c27 <unknown>
 
 vsseg4e64.v v24, (a0), v0.t
 # CHECK-INST: vsseg4e64.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x05,0x60]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 60 <unknown>
+# CHECK-UNKNOWN: 60057c27 <unknown>
 
 vsseg4e64.v v24, (a0)
 # CHECK-INST: vsseg4e64.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x7c,0x05,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 62 <unknown>
+# CHECK-UNKNOWN: 62057c27 <unknown>
 
 vssseg4e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg4e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x68]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 68 <unknown>
+# CHECK-UNKNOWN: 68b50c27 <unknown>
 
 vssseg4e8.v v24, (a0), a1
 # CHECK-INST: vssseg4e8.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab50c27 <unknown>
 
 vssseg4e16.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg4e16.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0x68]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 68 <unknown>
+# CHECK-UNKNOWN: 68b55c27 <unknown>
 
 vssseg4e16.v v24, (a0), a1
 # CHECK-INST: vssseg4e16.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab55c27 <unknown>
 
 vssseg4e32.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg4e32.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0x68]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 68 <unknown>
+# CHECK-UNKNOWN: 68b56c27 <unknown>
 
 vssseg4e32.v v24, (a0), a1
 # CHECK-INST: vssseg4e32.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab56c27 <unknown>
 
 vssseg4e64.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg4e64.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0x68]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 68 <unknown>
+# CHECK-UNKNOWN: 68b57c27 <unknown>
 
 vssseg4e64.v v24, (a0), a1
 # CHECK-INST: vssseg4e64.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0x6a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab57c27 <unknown>
 
 vsuxseg4ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg4ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x64]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 64 <unknown>
+# CHECK-UNKNOWN: 64450c27 <unknown>
 
 vsuxseg4ei8.v v24, (a0), v4
 # CHECK-INST: vsuxseg4ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 66 <unknown>
+# CHECK-UNKNOWN: 66450c27 <unknown>
 
 vsuxseg4ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg4ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x64]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 64 <unknown>
+# CHECK-UNKNOWN: 64455c27 <unknown>
 
 vsuxseg4ei16.v v24, (a0), v4
 # CHECK-INST: vsuxseg4ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 66 <unknown>
+# CHECK-UNKNOWN: 66455c27 <unknown>
 
 vsuxseg4ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg4ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x64]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 64 <unknown>
+# CHECK-UNKNOWN: 64456c27 <unknown>
 
 vsuxseg4ei32.v v24, (a0), v4
 # CHECK-INST: vsuxseg4ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 66 <unknown>
+# CHECK-UNKNOWN: 66456c27 <unknown>
 
 vsuxseg4ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg4ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x64]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 64 <unknown>
+# CHECK-UNKNOWN: 64457c27 <unknown>
 
 vsuxseg4ei64.v v24, (a0), v4
 # CHECK-INST: vsuxseg4ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x66]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 66 <unknown>
+# CHECK-UNKNOWN: 66457c27 <unknown>
 
 vsoxseg4ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg4ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 6c <unknown>
+# CHECK-UNKNOWN: 6c450c27 <unknown>
 
 vsoxseg4ei8.v v24, (a0), v4
 # CHECK-INST: vsoxseg4ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 6e <unknown>
+# CHECK-UNKNOWN: 6e450c27 <unknown>
 
 vsoxseg4ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg4ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 6c <unknown>
+# CHECK-UNKNOWN: 6c455c27 <unknown>
 
 vsoxseg4ei16.v v24, (a0), v4
 # CHECK-INST: vsoxseg4ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 6e <unknown>
+# CHECK-UNKNOWN: 6e455c27 <unknown>
 
 vsoxseg4ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg4ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 6c <unknown>
+# CHECK-UNKNOWN: 6c456c27 <unknown>
 
 vsoxseg4ei32.v v24, (a0), v4
 # CHECK-INST: vsoxseg4ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 6e <unknown>
+# CHECK-UNKNOWN: 6e456c27 <unknown>
 
 vsoxseg4ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg4ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 6c <unknown>
+# CHECK-UNKNOWN: 6c457c27 <unknown>
 
 vsoxseg4ei64.v v24, (a0), v4
 # CHECK-INST: vsoxseg4ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x6e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 6e <unknown>
+# CHECK-UNKNOWN: 6e457c27 <unknown>
 
 vsseg5e8.v v24, (a0), v0.t
 # CHECK-INST: vsseg5e8.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x05,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 80 <unknown>
+# CHECK-UNKNOWN: 80050c27 <unknown>
 
 vsseg5e8.v v24, (a0)
 # CHECK-INST: vsseg5e8.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0x05,0x82]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 82 <unknown>
+# CHECK-UNKNOWN: 82050c27 <unknown>
 
 vsseg5e16.v v24, (a0), v0.t
 # CHECK-INST: vsseg5e16.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x05,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 80 <unknown>
+# CHECK-UNKNOWN: 80055c27 <unknown>
 
 vsseg5e16.v v24, (a0)
 # CHECK-INST: vsseg5e16.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x5c,0x05,0x82]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 82 <unknown>
+# CHECK-UNKNOWN: 82055c27 <unknown>
 
 vsseg5e32.v v24, (a0), v0.t
 # CHECK-INST: vsseg5e32.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x05,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 80 <unknown>
+# CHECK-UNKNOWN: 80056c27 <unknown>
 
 vsseg5e32.v v24, (a0)
 # CHECK-INST: vsseg5e32.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x6c,0x05,0x82]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 82 <unknown>
+# CHECK-UNKNOWN: 82056c27 <unknown>
 
 vsseg5e64.v v24, (a0), v0.t
 # CHECK-INST: vsseg5e64.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x05,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 80 <unknown>
+# CHECK-UNKNOWN: 80057c27 <unknown>
 
 vsseg5e64.v v24, (a0)
 # CHECK-INST: vsseg5e64.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x7c,0x05,0x82]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 82 <unknown>
+# CHECK-UNKNOWN: 82057c27 <unknown>
 
 vssseg5e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg5e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x88]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 88 <unknown>
+# CHECK-UNKNOWN: 88b50c27 <unknown>
 
 vssseg5e8.v v24, (a0), a1
 # CHECK-INST: vssseg5e8.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x8a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab50c27 <unknown>
 
 vssseg5e16.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg5e16.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0x88]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 88 <unknown>
+# CHECK-UNKNOWN: 88b55c27 <unknown>
 
 vssseg5e16.v v24, (a0), a1
 # CHECK-INST: vssseg5e16.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0x8a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab55c27 <unknown>
 
 vssseg5e32.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg5e32.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0x88]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 88 <unknown>
+# CHECK-UNKNOWN: 88b56c27 <unknown>
 
 vssseg5e32.v v24, (a0), a1
 # CHECK-INST: vssseg5e32.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0x8a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab56c27 <unknown>
 
 vssseg5e64.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg5e64.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0x88]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 88 <unknown>
+# CHECK-UNKNOWN: 88b57c27 <unknown>
 
 vssseg5e64.v v24, (a0), a1
 # CHECK-INST: vssseg5e64.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0x8a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab57c27 <unknown>
 
 vsuxseg5ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg5ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x84]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 84 <unknown>
+# CHECK-UNKNOWN: 84450c27 <unknown>
 
 vsuxseg5ei8.v v24, (a0), v4
 # CHECK-INST: vsuxseg5ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x86]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 86 <unknown>
+# CHECK-UNKNOWN: 86450c27 <unknown>
 
 vsuxseg5ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg5ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x84]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 84 <unknown>
+# CHECK-UNKNOWN: 84455c27 <unknown>
 
 vsuxseg5ei16.v v24, (a0), v4
 # CHECK-INST: vsuxseg5ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x86]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 86 <unknown>
+# CHECK-UNKNOWN: 86455c27 <unknown>
 
 vsuxseg5ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg5ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x84]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 84 <unknown>
+# CHECK-UNKNOWN: 84456c27 <unknown>
 
 vsuxseg5ei32.v v24, (a0), v4
 # CHECK-INST: vsuxseg5ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x86]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 86 <unknown>
+# CHECK-UNKNOWN: 86456c27 <unknown>
 
 vsuxseg5ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg5ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x84]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 84 <unknown>
+# CHECK-UNKNOWN: 84457c27 <unknown>
 
 vsuxseg5ei64.v v24, (a0), v4
 # CHECK-INST: vsuxseg5ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x86]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 86 <unknown>
+# CHECK-UNKNOWN: 86457c27 <unknown>
 
 vsoxseg5ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg5ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x8c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 8c <unknown>
+# CHECK-UNKNOWN: 8c450c27 <unknown>
 
 vsoxseg5ei8.v v24, (a0), v4
 # CHECK-INST: vsoxseg5ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x8e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 8e <unknown>
+# CHECK-UNKNOWN: 8e450c27 <unknown>
 
 vsoxseg5ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg5ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x8c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 8c <unknown>
+# CHECK-UNKNOWN: 8c455c27 <unknown>
 
 vsoxseg5ei16.v v24, (a0), v4
 # CHECK-INST: vsoxseg5ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0x8e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 8e <unknown>
+# CHECK-UNKNOWN: 8e455c27 <unknown>
 
 vsoxseg5ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg5ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x8c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 8c <unknown>
+# CHECK-UNKNOWN: 8c456c27 <unknown>
 
 vsoxseg5ei32.v v24, (a0), v4
 # CHECK-INST: vsoxseg5ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0x8e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 8e <unknown>
+# CHECK-UNKNOWN: 8e456c27 <unknown>
 
 vsoxseg5ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg5ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x8c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 8c <unknown>
+# CHECK-UNKNOWN: 8c457c27 <unknown>
 
 vsoxseg5ei64.v v24, (a0), v4
 # CHECK-INST: vsoxseg5ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0x8e]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 8e <unknown>
+# CHECK-UNKNOWN: 8e457c27 <unknown>
 
 vsseg6e8.v v24, (a0), v0.t
 # CHECK-INST: vsseg6e8.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x05,0xa0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 a0 <unknown>
+# CHECK-UNKNOWN: a0050c27 <unknown>
 
 vsseg6e8.v v24, (a0)
 # CHECK-INST: vsseg6e8.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0x05,0xa2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 a2 <unknown>
+# CHECK-UNKNOWN: a2050c27 <unknown>
 
 vsseg6e16.v v24, (a0), v0.t
 # CHECK-INST: vsseg6e16.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x05,0xa0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 a0 <unknown>
+# CHECK-UNKNOWN: a0055c27 <unknown>
 
 vsseg6e16.v v24, (a0)
 # CHECK-INST: vsseg6e16.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x5c,0x05,0xa2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 a2 <unknown>
+# CHECK-UNKNOWN: a2055c27 <unknown>
 
 vsseg6e32.v v24, (a0), v0.t
 # CHECK-INST: vsseg6e32.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x05,0xa0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 a0 <unknown>
+# CHECK-UNKNOWN: a0056c27 <unknown>
 
 vsseg6e32.v v24, (a0)
 # CHECK-INST: vsseg6e32.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x6c,0x05,0xa2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 a2 <unknown>
+# CHECK-UNKNOWN: a2056c27 <unknown>
 
 vsseg6e64.v v24, (a0), v0.t
 # CHECK-INST: vsseg6e64.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x05,0xa0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 a0 <unknown>
+# CHECK-UNKNOWN: a0057c27 <unknown>
 
 vsseg6e64.v v24, (a0)
 # CHECK-INST: vsseg6e64.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x7c,0x05,0xa2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 a2 <unknown>
+# CHECK-UNKNOWN: a2057c27 <unknown>
 
 vssseg6e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg6e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0xa8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b50c27 <unknown>
 
 vssseg6e8.v v24, (a0), a1
 # CHECK-INST: vssseg6e8.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0xaa]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 aa <unknown>
+# CHECK-UNKNOWN: aab50c27 <unknown>
 
 vssseg6e16.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg6e16.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0xa8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b55c27 <unknown>
 
 vssseg6e16.v v24, (a0), a1
 # CHECK-INST: vssseg6e16.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0xaa]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 aa <unknown>
+# CHECK-UNKNOWN: aab55c27 <unknown>
 
 vssseg6e32.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg6e32.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0xa8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b56c27 <unknown>
 
 vssseg6e32.v v24, (a0), a1
 # CHECK-INST: vssseg6e32.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0xaa]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 aa <unknown>
+# CHECK-UNKNOWN: aab56c27 <unknown>
 
 vssseg6e64.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg6e64.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0xa8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b57c27 <unknown>
 
 vssseg6e64.v v24, (a0), a1
 # CHECK-INST: vssseg6e64.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0xaa]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 aa <unknown>
+# CHECK-UNKNOWN: aab57c27 <unknown>
 
 vsuxseg6ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg6ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 a4 <unknown>
+# CHECK-UNKNOWN: a4450c27 <unknown>
 
 vsuxseg6ei8.v v24, (a0), v4
 # CHECK-INST: vsuxseg6ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 a6 <unknown>
+# CHECK-UNKNOWN: a6450c27 <unknown>
 
 vsuxseg6ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg6ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 a4 <unknown>
+# CHECK-UNKNOWN: a4455c27 <unknown>
 
 vsuxseg6ei16.v v24, (a0), v4
 # CHECK-INST: vsuxseg6ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 a6 <unknown>
+# CHECK-UNKNOWN: a6455c27 <unknown>
 
 vsuxseg6ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg6ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 a4 <unknown>
+# CHECK-UNKNOWN: a4456c27 <unknown>
 
 vsuxseg6ei32.v v24, (a0), v4
 # CHECK-INST: vsuxseg6ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 a6 <unknown>
+# CHECK-UNKNOWN: a6456c27 <unknown>
 
 vsuxseg6ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg6ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 a4 <unknown>
+# CHECK-UNKNOWN: a4457c27 <unknown>
 
 vsuxseg6ei64.v v24, (a0), v4
 # CHECK-INST: vsuxseg6ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0xa6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 a6 <unknown>
+# CHECK-UNKNOWN: a6457c27 <unknown>
 
 vsoxseg6ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg6ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xac]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 ac <unknown>
+# CHECK-UNKNOWN: ac450c27 <unknown>
 
 vsoxseg6ei8.v v24, (a0), v4
 # CHECK-INST: vsoxseg6ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xae]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 ae <unknown>
+# CHECK-UNKNOWN: ae450c27 <unknown>
 
 vsoxseg6ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg6ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0xac]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 ac <unknown>
+# CHECK-UNKNOWN: ac455c27 <unknown>
 
 vsoxseg6ei16.v v24, (a0), v4
 # CHECK-INST: vsoxseg6ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0xae]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 ae <unknown>
+# CHECK-UNKNOWN: ae455c27 <unknown>
 
 vsoxseg6ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg6ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0xac]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 ac <unknown>
+# CHECK-UNKNOWN: ac456c27 <unknown>
 
 vsoxseg6ei32.v v24, (a0), v4
 # CHECK-INST: vsoxseg6ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0xae]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 ae <unknown>
+# CHECK-UNKNOWN: ae456c27 <unknown>
 
 vsoxseg6ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg6ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0xac]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 ac <unknown>
+# CHECK-UNKNOWN: ac457c27 <unknown>
 
 vsoxseg6ei64.v v24, (a0), v4
 # CHECK-INST: vsoxseg6ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0xae]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 ae <unknown>
+# CHECK-UNKNOWN: ae457c27 <unknown>
 
 vsseg7e8.v v24, (a0), v0.t
 # CHECK-INST: vsseg7e8.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x05,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 c0 <unknown>
+# CHECK-UNKNOWN: c0050c27 <unknown>
 
 vsseg7e8.v v24, (a0)
 # CHECK-INST: vsseg7e8.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0x05,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 c2 <unknown>
+# CHECK-UNKNOWN: c2050c27 <unknown>
 
 vsseg7e16.v v24, (a0), v0.t
 # CHECK-INST: vsseg7e16.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x05,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 c0 <unknown>
+# CHECK-UNKNOWN: c0055c27 <unknown>
 
 vsseg7e16.v v24, (a0)
 # CHECK-INST: vsseg7e16.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x5c,0x05,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 c2 <unknown>
+# CHECK-UNKNOWN: c2055c27 <unknown>
 
 vsseg7e32.v v24, (a0), v0.t
 # CHECK-INST: vsseg7e32.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x05,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 c0 <unknown>
+# CHECK-UNKNOWN: c0056c27 <unknown>
 
 vsseg7e32.v v24, (a0)
 # CHECK-INST: vsseg7e32.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x6c,0x05,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 c2 <unknown>
+# CHECK-UNKNOWN: c2056c27 <unknown>
 
 vsseg7e64.v v24, (a0), v0.t
 # CHECK-INST: vsseg7e64.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x05,0xc0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 c0 <unknown>
+# CHECK-UNKNOWN: c0057c27 <unknown>
 
 vsseg7e64.v v24, (a0)
 # CHECK-INST: vsseg7e64.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x7c,0x05,0xc2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 c2 <unknown>
+# CHECK-UNKNOWN: c2057c27 <unknown>
 
 vssseg7e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg7e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0xc8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b50c27 <unknown>
 
 vssseg7e8.v v24, (a0), a1
 # CHECK-INST: vssseg7e8.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0xca]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 ca <unknown>
+# CHECK-UNKNOWN: cab50c27 <unknown>
 
 vssseg7e16.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg7e16.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0xc8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b55c27 <unknown>
 
 vssseg7e16.v v24, (a0), a1
 # CHECK-INST: vssseg7e16.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0xca]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 ca <unknown>
+# CHECK-UNKNOWN: cab55c27 <unknown>
 
 vssseg7e32.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg7e32.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0xc8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b56c27 <unknown>
 
 vssseg7e32.v v24, (a0), a1
 # CHECK-INST: vssseg7e32.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0xca]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 ca <unknown>
+# CHECK-UNKNOWN: cab56c27 <unknown>
 
 vssseg7e64.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg7e64.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0xc8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b57c27 <unknown>
 
 vssseg7e64.v v24, (a0), a1
 # CHECK-INST: vssseg7e64.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0xca]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 ca <unknown>
+# CHECK-UNKNOWN: cab57c27 <unknown>
 
 vsuxseg7ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg7ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xc4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 c4 <unknown>
+# CHECK-UNKNOWN: c4450c27 <unknown>
 
 vsuxseg7ei8.v v24, (a0), v4
 # CHECK-INST: vsuxseg7ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xc6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 c6 <unknown>
+# CHECK-UNKNOWN: c6450c27 <unknown>
 
 vsuxseg7ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg7ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0xc4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 c4 <unknown>
+# CHECK-UNKNOWN: c4455c27 <unknown>
 
 vsuxseg7ei16.v v24, (a0), v4
 # CHECK-INST: vsuxseg7ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0xc6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 c6 <unknown>
+# CHECK-UNKNOWN: c6455c27 <unknown>
 
 vsuxseg7ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg7ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0xc4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 c4 <unknown>
+# CHECK-UNKNOWN: c4456c27 <unknown>
 
 vsuxseg7ei32.v v24, (a0), v4
 # CHECK-INST: vsuxseg7ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0xc6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 c6 <unknown>
+# CHECK-UNKNOWN: c6456c27 <unknown>
 
 vsuxseg7ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg7ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0xc4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 c4 <unknown>
+# CHECK-UNKNOWN: c4457c27 <unknown>
 
 vsuxseg7ei64.v v24, (a0), v4
 # CHECK-INST: vsuxseg7ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0xc6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 c6 <unknown>
+# CHECK-UNKNOWN: c6457c27 <unknown>
 
 vsoxseg7ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg7ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xcc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 cc <unknown>
+# CHECK-UNKNOWN: cc450c27 <unknown>
 
 vsoxseg7ei8.v v24, (a0), v4
 # CHECK-INST: vsoxseg7ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xce]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 ce <unknown>
+# CHECK-UNKNOWN: ce450c27 <unknown>
 
 vsoxseg7ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg7ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0xcc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 cc <unknown>
+# CHECK-UNKNOWN: cc455c27 <unknown>
 
 vsoxseg7ei16.v v24, (a0), v4
 # CHECK-INST: vsoxseg7ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0xce]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 ce <unknown>
+# CHECK-UNKNOWN: ce455c27 <unknown>
 
 vsoxseg7ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg7ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0xcc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 cc <unknown>
+# CHECK-UNKNOWN: cc456c27 <unknown>
 
 vsoxseg7ei32.v v24, (a0), v4
 # CHECK-INST: vsoxseg7ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0xce]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 ce <unknown>
+# CHECK-UNKNOWN: ce456c27 <unknown>
 
 vsoxseg7ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg7ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0xcc]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 cc <unknown>
+# CHECK-UNKNOWN: cc457c27 <unknown>
 
 vsoxseg7ei64.v v24, (a0), v4
 # CHECK-INST: vsoxseg7ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0xce]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 ce <unknown>
+# CHECK-UNKNOWN: ce457c27 <unknown>
 
 vsseg8e8.v v24, (a0), v0.t
 # CHECK-INST: vsseg8e8.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x05,0xe0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 e0 <unknown>
+# CHECK-UNKNOWN: e0050c27 <unknown>
 
 vsseg8e8.v v24, (a0)
 # CHECK-INST: vsseg8e8.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x0c,0x05,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 e2 <unknown>
+# CHECK-UNKNOWN: e2050c27 <unknown>
 
 vsseg8e16.v v24, (a0), v0.t
 # CHECK-INST: vsseg8e16.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x05,0xe0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 e0 <unknown>
+# CHECK-UNKNOWN: e0055c27 <unknown>
 
 vsseg8e16.v v24, (a0)
 # CHECK-INST: vsseg8e16.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x5c,0x05,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 e2 <unknown>
+# CHECK-UNKNOWN: e2055c27 <unknown>
 
 vsseg8e32.v v24, (a0), v0.t
 # CHECK-INST: vsseg8e32.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x05,0xe0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 e0 <unknown>
+# CHECK-UNKNOWN: e0056c27 <unknown>
 
 vsseg8e32.v v24, (a0)
 # CHECK-INST: vsseg8e32.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x6c,0x05,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 e2 <unknown>
+# CHECK-UNKNOWN: e2056c27 <unknown>
 
 vsseg8e64.v v24, (a0), v0.t
 # CHECK-INST: vsseg8e64.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x05,0xe0]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 e0 <unknown>
+# CHECK-UNKNOWN: e0057c27 <unknown>
 
 vsseg8e64.v v24, (a0)
 # CHECK-INST: vsseg8e64.v v24, (a0)
 # CHECK-ENCODING: [0x27,0x7c,0x05,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 e2 <unknown>
+# CHECK-UNKNOWN: e2057c27 <unknown>
 
 vssseg8e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg8e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0xe8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b50c27 <unknown>
 
 vssseg8e8.v v24, (a0), a1
 # CHECK-INST: vssseg8e8.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0xea]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 ea <unknown>
+# CHECK-UNKNOWN: eab50c27 <unknown>
 
 vssseg8e16.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg8e16.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0xe8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b55c27 <unknown>
 
 vssseg8e16.v v24, (a0), a1
 # CHECK-INST: vssseg8e16.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x5c,0xb5,0xea]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 ea <unknown>
+# CHECK-UNKNOWN: eab55c27 <unknown>
 
 vssseg8e32.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg8e32.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0xe8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b56c27 <unknown>
 
 vssseg8e32.v v24, (a0), a1
 # CHECK-INST: vssseg8e32.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x6c,0xb5,0xea]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 ea <unknown>
+# CHECK-UNKNOWN: eab56c27 <unknown>
 
 vssseg8e64.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg8e64.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0xe8]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b57c27 <unknown>
 
 vssseg8e64.v v24, (a0), a1
 # CHECK-INST: vssseg8e64.v v24, (a0), a1
 # CHECK-ENCODING: [0x27,0x7c,0xb5,0xea]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 ea <unknown>
+# CHECK-UNKNOWN: eab57c27 <unknown>
 
 vsuxseg8ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg8ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xe4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 e4 <unknown>
+# CHECK-UNKNOWN: e4450c27 <unknown>
 
 vsuxseg8ei8.v v24, (a0), v4
 # CHECK-INST: vsuxseg8ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xe6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 e6 <unknown>
+# CHECK-UNKNOWN: e6450c27 <unknown>
 
 vsuxseg8ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg8ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0xe4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 e4 <unknown>
+# CHECK-UNKNOWN: e4455c27 <unknown>
 
 vsuxseg8ei16.v v24, (a0), v4
 # CHECK-INST: vsuxseg8ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0xe6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 e6 <unknown>
+# CHECK-UNKNOWN: e6455c27 <unknown>
 
 vsuxseg8ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg8ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0xe4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 e4 <unknown>
+# CHECK-UNKNOWN: e4456c27 <unknown>
 
 vsuxseg8ei32.v v24, (a0), v4
 # CHECK-INST: vsuxseg8ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0xe6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 e6 <unknown>
+# CHECK-UNKNOWN: e6456c27 <unknown>
 
 vsuxseg8ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg8ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0xe4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 e4 <unknown>
+# CHECK-UNKNOWN: e4457c27 <unknown>
 
 vsuxseg8ei64.v v24, (a0), v4
 # CHECK-INST: vsuxseg8ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0xe6]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 e6 <unknown>
+# CHECK-UNKNOWN: e6457c27 <unknown>
 
 vsoxseg8ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg8ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xec]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 ec <unknown>
+# CHECK-UNKNOWN: ec450c27 <unknown>
 
 vsoxseg8ei8.v v24, (a0), v4
 # CHECK-INST: vsoxseg8ei8.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xee]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 ee <unknown>
+# CHECK-UNKNOWN: ee450c27 <unknown>
 
 vsoxseg8ei16.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg8ei16.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x5c,0x45,0xec]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 ec <unknown>
+# CHECK-UNKNOWN: ec455c27 <unknown>
 
 vsoxseg8ei16.v v24, (a0), v4
 # CHECK-INST: vsoxseg8ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0xee]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 ee <unknown>
+# CHECK-UNKNOWN: ee455c27 <unknown>
 
 vsoxseg8ei32.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg8ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0xec]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 ec <unknown>
+# CHECK-UNKNOWN: ec456c27 <unknown>
 
 vsoxseg8ei32.v v24, (a0), v4
 # CHECK-INST: vsoxseg8ei32.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x6c,0x45,0xee]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 ee <unknown>
+# CHECK-UNKNOWN: ee456c27 <unknown>
 
 vsoxseg8ei64.v v24, (a0), v4, v0.t
 # CHECK-INST: vsoxseg8ei64.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x7c,0x45,0xec]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 ec <unknown>
+# CHECK-UNKNOWN: ec457c27 <unknown>
 
 vsoxseg8ei64.v v24, (a0), v4
 # CHECK-INST: vsoxseg8ei64.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x7c,0x45,0xee]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 ee <unknown>
+# CHECK-UNKNOWN: ee457c27 <unknown>
 
 vlseg2e8.v v8, 0(a0), v0.t
 # CHECK-INST: vlseg2e8.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x20]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 20 <unknown>
+# CHECK-UNKNOWN: 20050407 <unknown>
 
 vlseg2e16ff.v v8, 0(a0)
 # CHECK-INST: vlseg2e16ff.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x54,0x05,0x23]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 23 <unknown>
+# CHECK-UNKNOWN: 23055407 <unknown>
 
 vlsseg2e8.v v8, 0(a0), a1
 # CHECK-INST: vlsseg2e8.v v8, (a0), a1
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x2a]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab50407 <unknown>
 
 vluxseg3ei16.v v8, 0(a0), v4
 # CHECK-INST: vluxseg3ei16.v v8, (a0), v4
 # CHECK-ENCODING: [0x07,0x54,0x45,0x46]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 46 <unknown>
+# CHECK-UNKNOWN: 46455407 <unknown>
 
 vloxseg4ei64.v v8, 0(a0), v4, v0.t
 # CHECK-INST: vloxseg4ei64.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x74,0x45,0x6c]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 6c <unknown>
+# CHECK-UNKNOWN: 6c457407 <unknown>
 
 vsseg5e32.v v24, 0(a0), v0.t
 # CHECK-INST: vsseg5e32.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x05,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 80 <unknown>
+# CHECK-UNKNOWN: 80056c27 <unknown>
 
 vssseg2e8.v v24, 0(a0), a1, v0.t
 # CHECK-INST: vssseg2e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x28]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 28 <unknown>
+# CHECK-UNKNOWN: 28b50c27 <unknown>
 
 vsoxseg7ei16.v v24, 0(a0), v4
 # CHECK-INST: vsoxseg7ei16.v v24, (a0), v4
 # CHECK-ENCODING: [0x27,0x5c,0x45,0xce]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 ce <unknown>
+# CHECK-UNKNOWN: ce455c27 <unknown>
 
 vsuxseg6ei32.v v24, 0(a0), v4, v0.t
 # CHECK-INST: vsuxseg6ei32.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x6c,0x45,0xa4]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 a4 <unknown>
+# CHECK-UNKNOWN: a4456c27 <unknown>
diff --git a/llvm/test/MC/RISCV/scoped-relaxation.s b/llvm/test/MC/RISCV/scoped-relaxation.s
index 0b797ee5aca5..56394fd80532 100644
--- a/llvm/test/MC/RISCV/scoped-relaxation.s
+++ b/llvm/test/MC/RISCV/scoped-relaxation.s
@@ -9,7 +9,7 @@
 .dword function - .
 
 # CHECK: 0x0 R_RISCV_ADD64 function 0x0
-# CHECK-NEXT: 0x0 R_RISCV_SUB64 <null> 0x0
+# CHECK-NEXT: 0x0 R_RISCV_SUB64 .L0  0x0
 
 # Relaxed reference, this will resolve to a pair of `RISCV_ADD64` and
 # `RISCV_SUB64` relocation.
@@ -19,7 +19,7 @@
 .option pop
 
 # CHECK: 0x8 R_RISCV_ADD64 function 0x0
-# CHECK-NEXT: 0x8 R_RISCV_SUB64 <null> 0x0
+# CHECK-NEXT: 0x8 R_RISCV_SUB64 .L0  0x0
 
 # Unrelaxed reference, this will resolve to a pair of `RISCV_ADD64` and
 # `RISCV_SUB64` relocation due to relaxation being sticky to the file.
@@ -29,6 +29,6 @@
 .option pop
 
 # CHECK: 0x10 R_RISCV_ADD64 function 0x0
-# CHECK-NEXT: 0x10 R_RISCV_SUB64 <null> 0x0
+# CHECK-NEXT: 0x10 R_RISCV_SUB64 .L0  0x0
 
 # CHECK: }
diff --git a/llvm/test/MC/WebAssembly/global-ctor-dtor.ll b/llvm/test/MC/WebAssembly/global-ctor-dtor.ll
index bc1be7931349..f1ec71da1ebb 100644
--- a/llvm/test/MC/WebAssembly/global-ctor-dtor.ll
+++ b/llvm/test/MC/WebAssembly/global-ctor-dtor.ll
@@ -80,29 +80,29 @@ declare void @func3()
 ; CHECK-NEXT:         Offset:          0x1D
 ; CHECK-NEXT:       - Type:            R_WASM_FUNCTION_INDEX_LEB
 ; CHECK-NEXT:         Index:           6
-; CHECK-NEXT:         Offset:          0x2C
+; CHECK-NEXT:         Offset:          0x2B
 ; CHECK-NEXT:       - Type:            R_WASM_TABLE_INDEX_SLEB
 ; CHECK-NEXT:         Index:           5
-; CHECK-NEXT:         Offset:          0x37
+; CHECK-NEXT:         Offset:          0x36
 ; CHECK-NEXT:       - Type:            R_WASM_MEMORY_ADDR_SLEB
 ; CHECK-NEXT:         Index:           3
-; CHECK-NEXT:         Offset:          0x3F
+; CHECK-NEXT:         Offset:          0x3E
 ; CHECK-NEXT:       - Type:            R_WASM_FUNCTION_INDEX_LEB
 ; CHECK-NEXT:         Index:           4
-; CHECK-NEXT:         Offset:          0x45
+; CHECK-NEXT:         Offset:          0x44
 ; CHECK-NEXT:     Functions:
 ; CHECK-NEXT:       - Index:           5
 ; CHECK-NEXT:         Locals:
 ; CHECK-NEXT:         Body:            1080808080000B
 ; CHECK-NEXT:       - Index:           6
 ; CHECK-NEXT:         Locals:
-; CHECK-NEXT:         Body:            02404181808080004100418080808000108180808000450D0000000B0B
+; CHECK-NEXT:         Body:            02404181808080004100418080808000108180808000450D00000B0B
 ; CHECK-NEXT:       - Index:           7
 ; CHECK-NEXT:         Locals:
 ; CHECK-NEXT:         Body:            1082808080000B
 ; CHECK-NEXT:       - Index:           8
 ; CHECK-NEXT:         Locals:
-; CHECK-NEXT:         Body:            02404182808080004100418080808000108180808000450D0000000B0B
+; CHECK-NEXT:         Body:            02404182808080004100418080808000108180808000450D00000B0B
 ; CHECK-NEXT:   - Type:            DATA
 ; CHECK-NEXT:     Segments:
 ; CHECK-NEXT:       - SectionOffset:   6
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 6486639e07b4..e2fd74306f80 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -183,12 +183,10 @@
 ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
 ; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
 ; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
 ; CHECK-O-NEXT: Running pass: GlobalOptPass
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
 ; CHECK-EXT: Running pass: {{.*}}::Bye
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index 09f9f0f48bad..13a63bbe4d9c 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -182,12 +182,10 @@
 ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
 ; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
 ; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
 ; CHECK-O-NEXT: Running pass: GlobalOptPass
 ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis on bar
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 47bdbfd2d357..3130da86fa99 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -147,12 +147,10 @@
 ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
 ; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
 ; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
 ; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
 ; CHECK-O-NEXT: Running pass: GlobalOptPass
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
 ; CHECK-O-NEXT: Running pass: AnnotationRemarksPass on foo
diff --git a/llvm/test/TableGen/GlobalISelEmitter-frameindex.td b/llvm/test/TableGen/GlobalISelEmitter-frameindex.td
new file mode 100644
index 000000000000..232691465bb3
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelEmitter-frameindex.td
@@ -0,0 +1,29 @@
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - < %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+include "GlobalISelEmitterCommon.td"
+
+def ADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2), []>;
+
+//===- Test a simple pattern with frame index operands. ----------------------===//
+//
+// CHECK:       GIM_Try, /*On fail goto*//*Label [[LABEL_NUM:[0-9]+]]*/ GIMT_Encode4([[LABEL:[0-9]+]]),
+// CHECK-NEXT:    GIM_CheckNumOperands, /*MI*/0, /*Expected*/2,
+// CHECK-NEXT:    GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_FRAME_INDEX),
+// CHECK-NEXT:    // MIs[0] DstI[dst]
+// CHECK-NEXT:    GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_p0s32,
+// CHECK-NEXT:    GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// CHECK-NEXT:    // MIs[0] fi
+// CHECK-NEXT:    // No operand predicates
+// CHECK-NEXT:    // (frameindex:{ *:[i32] }):$fi  =>  (ADD:{ *:[i32] } (tframeindex:{ *:[i32] }):$fi, 0:{ *:[i32] })
+// CHECK-NEXT:    GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::ADD),
+// CHECK-NEXT:    GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
+// CHECK-NEXT:    GIR_RootToRootCopy, /*OpIdx*/1, // fi
+// CHECK-NEXT:    GIR_AddImm8, /*InsnID*/0, /*Imm*/0,
+// CHECK-NEXT:    GIR_RootConstrainSelectedInstOperands,
+// CHECK-NEXT:    // GIR_Coverage, 0,
+// CHECK-NEXT:    GIR_EraseRootFromParent_Done,
+// CHECK-NEXT:  // Label [[LABEL_NUM]]: @[[LABEL]]
+// CHECK-NEXT:  GIM_Reject,
+
+def : Pat<(p0 frameindex:$fi), (ADD tframeindex:$fi, 0)>;
diff --git a/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td b/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td
index 7c9df02ebd87..fc8abc6fbc54 100644
--- a/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td
+++ b/llvm/test/TableGen/GlobalISelEmitterSkippedPatterns.td
@@ -1,4 +1,6 @@
 // RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I %p/../../include %s -I %p/Common -o /dev/null 2>&1 | FileCheck %s
+// RUN: llvm-tblgen -warn-on-skipped-patterns -gen-global-isel -I %p/../../include %s -I %p/Common -o /dev/null -DIGNORE 2>&1 | FileCheck --allow-empty --check-prefix=IGNORED %s
+
 include "llvm/Target/Target.td"
 include "GlobalISelEmitterCommon.td"
 
@@ -23,6 +25,10 @@ def INSN : I<(outs GPR32:$dst), (ins GPR32:$src1, complex:$src2), []>;
 
 //===- Bail out when we define a variable twice wrt complex suboperands. -===//
 
+#ifdef IGNORE
+let GISelShouldIgnore = 1 in
+#endif
+// IGNORED-NOT: warning: Skipped pattern: Error: {{.*}}
 // CHECK: warning: Skipped pattern: Error: Complex suboperand x referenced by different operands: complex_rr:x:y and complex_rr:x:z.
 def : Pat<(add (complex_rr GPR32:$x, GPR32:$y),
                (complex_rr GPR32:$x, GPR32:$z)),
diff --git a/llvm/test/TableGen/riscv-target-def.td b/llvm/test/TableGen/riscv-target-def.td
index ab589b31192f..7f3d9bdb278c 100644
--- a/llvm/test/TableGen/riscv-target-def.td
+++ b/llvm/test/TableGen/riscv-target-def.td
@@ -2,8 +2,9 @@
 
 include "llvm/Target/Target.td"
 
-class RISCVExtension<string name, int major, int minor, string fieldname,
-                     string desc, list<SubtargetFeature> implies = [],
+class RISCVExtension<string name, int major, int minor, string desc,
+                     list<SubtargetFeature> implies = [],
+                     string fieldname = !subst("Feature", "Has", NAME),
                      string value = "true">
     : SubtargetFeature<name, fieldname, value, desc, implies> {
   int MajorVersion = major;
@@ -11,18 +12,36 @@ class RISCVExtension<string name, int major, int minor, string fieldname,
   bit Experimental = false;
 }
 
+class RISCVExperimentalExtension<string name, int major, int minor, string desc,
+                                 list<RISCVExtension> implies = [],
+                                 string fieldname = !subst("Feature", "Has", NAME),
+                                 string value = "true">
+    : RISCVExtension<"experimental-"#name, major, minor, desc, implies,
+                     fieldname, value> {
+  let Experimental = true;
+}
+
 def FeatureStdExtI
-    : RISCVExtension<"i", 2, 1, "HasStdExtI",
+    : RISCVExtension<"i", 2, 1,
                      "'I' (Base Integer Instruction Set)">;
 
 def FeatureStdExtZicsr
-    : RISCVExtension<"zicsr", 2, 0, "HasStdExtZicsr",
+    : RISCVExtension<"zicsr", 2, 0,
                      "'zicsr' (CSRs)">;
 
 def FeatureStdExtZifencei
-    : RISCVExtension<"zifencei", 2, 0, "HasStdExtZifencei",
+    : RISCVExtension<"zifencei", 2, 0,
                      "'Zifencei' (fence.i)">;
 
+def FeatureStdExtF
+    : RISCVExtension<"f", 2, 2,
+                     "'F' (Single-Precision Floating-Point)",
+                     [FeatureStdExtZicsr]>;
+
+def FeatureStdExtZidummy
+    : RISCVExperimentalExtension<"zidummy", 0, 1,
+                                 "Dummy">;
+
 def Feature32Bit
     : SubtargetFeature<"32bit", "IsRV32", "true", "Implements RV32">;
 def Feature64Bit
@@ -32,6 +51,15 @@ def Feature64Bit
 def FeatureDummy
     : SubtargetFeature<"dummy", "Dummy", "true", "Dummy">;
 
+class RISCVProfile<string name, list<SubtargetFeature> features>
+    : SubtargetFeature<name, "Is" # NAME, "true",
+                       "RISC-V " # name # " profile", features>;
+
+def RVI20U32 : RISCVProfile<"rvi20u32", [Feature32Bit, FeatureStdExtI]>;
+def RVI20U64 : RISCVProfile<"rvi20u64", [Feature64Bit, FeatureStdExtI]>;
+def ProfileDummy : RISCVProfile<"dummy", [Feature64Bit, FeatureStdExtI,
+                                          FeatureStdExtF, FeatureStdExtZidummy]>;
+
 class RISCVProcessorModel<string n,
                           SchedMachineModel m,
                           list<SubtargetFeature> f,
@@ -64,6 +92,7 @@ def ROCKET_RV32 : RISCVProcessorModel<"rocket-rv32",
                                        FeatureStdExtI,
                                        FeatureStdExtZifencei,
                                        FeatureStdExtZicsr,
+                                       FeatureStdExtZidummy,
                                        FeatureDummy]>;
 def ROCKET_RV64 : RISCVProcessorModel<"rocket-rv64",
                                       NoSchedModel,
@@ -71,26 +100,63 @@ def ROCKET_RV64 : RISCVProcessorModel<"rocket-rv64",
                                        FeatureStdExtI,
                                        FeatureStdExtZifencei,
                                        FeatureStdExtZicsr,
+                                       FeatureStdExtZidummy,
                                        FeatureDummy]>;
 def ROCKET : RISCVTuneProcessorModel<"rocket",
                                      NoSchedModel>;
 
-// CHECK: #ifndef PROC
-// CHECK: #define PROC(ENUM, NAME, DEFAULT_MARCH, FAST_UNALIGNED_ACCESS)
-// CHECK: #endif
+// CHECK:      #ifdef GET_SUPPORTED_EXTENSIONS
+// CHECK-NEXT: #undef GET_SUPPORTED_EXTENSIONS
+
+// CHECK:      static const RISCVSupportedExtension SupportedExtensions[] = {
+// CHECK-NEXT:     {"f", {2, 2}},
+// CHECK-NEXT:     {"i", {2, 1}},
+// CHECK-NEXT:     {"zicsr", {2, 0}},
+// CHECK-NEXT:     {"zifencei", {2, 0}},
+// CHECK-NEXT: };
+
+// CHECK:      static const RISCVSupportedExtension SupportedExperimentalExtensions[] = {
+// CHECK-NEXT:     {"zidummy", {0, 1}},
+// CHECK-NEXT: };
+
+// CHECK:      #endif // GET_SUPPORTED_EXTENSIONS
+
+// CHECK:      #ifdef GET_IMPLIED_EXTENSIONS
+// CHECK-NEXT: #undef GET_IMPLIED_EXTENSIONS
+
+// CHECK:      static constexpr ImpliedExtsEntry ImpliedExts[] = {
+// CHECK-NEXT:       { {"f"}, "zicsr"},
+// CHECK-NEXT: };
+
+// CHECK:      #endif // GET_IMPLIED_EXTENSIONS
+
+// CHECK:      #ifdef GET_SUPPORTED_PROFILES
+// CHECK-NEXT: #undef GET_SUPPORTED_PROFILES
+
+// CHECK:      static constexpr RISCVProfile SupportedProfiles[] = {
+// CHECK-NEXT:     {"dummy","rv64i2p1_f2p2_zidummy0p1"},
+// CHECK-NEXT:     {"rvi20u32","rv32i2p1"},
+// CHECK-NEXT:     {"rvi20u64","rv64i2p1"},
+// CHECK-NEXT: };
+
+// CHECK:      #endif // GET_SUPPORTED_PROFILES
+
+// CHECK:      #ifndef PROC
+// CHECK-NEXT: #define PROC(ENUM, NAME, DEFAULT_MARCH, FAST_UNALIGNED_ACCESS)
+// CHECK-NEXT: #endif
 
-// CHECK: PROC(GENERIC_RV32, {"generic-rv32"}, {"rv32i2p1"}, 0)
-// CHECK: PROC(GENERIC_RV64, {"generic-rv64"}, {"rv64i2p1"}, 0)
-// CHECK: PROC(ROCKET_RV32, {"rocket-rv32"}, {"rv32i2p1_zicsr2p0_zifencei2p0"}, 0)
-// CHECK: PROC(ROCKET_RV64, {"rocket-rv64"}, {"rv64i2p1_zicsr2p0_zifencei2p0"}, 0)
+// CHECK:      PROC(GENERIC_RV32, {"generic-rv32"}, {"rv32i2p1"}, 0)
+// CHECK-NEXT: PROC(GENERIC_RV64, {"generic-rv64"}, {"rv64i2p1"}, 0)
+// CHECK-NEXT: PROC(ROCKET_RV32, {"rocket-rv32"}, {"rv32i2p1_zicsr2p0_zidummy0p1_zifencei2p0"}, 0)
+// CHECK-NEXT: PROC(ROCKET_RV64, {"rocket-rv64"}, {"rv64i2p1_zicsr2p0_zidummy0p1_zifencei2p0"}, 0)
 
 // CHECK: #undef PROC
 
-// CHECK: #ifndef TUNE_PROC
-// CHECK: #define TUNE_PROC(ENUM, NAME)
-// CHECK: #endif
+// CHECK:      #ifndef TUNE_PROC
+// CHECK-NEXT: #define TUNE_PROC(ENUM, NAME)
+// CHECK-NEXT: #endif
 
 // CHECK: TUNE_PROC(GENERIC, "generic")
-// CHECK: TUNE_PROC(ROCKET, "rocket")
+// CHECK-NEXT: TUNE_PROC(ROCKET, "rocket")
 
 // CHECK: #undef TUNE_PROC
diff --git a/llvm/test/TableGen/simplify-patfrag.td b/llvm/test/TableGen/simplify-patfrag.td
index 904c29696a6e..fbb6f97f2863 100644
--- a/llvm/test/TableGen/simplify-patfrag.td
+++ b/llvm/test/TableGen/simplify-patfrag.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen -gen-dag-isel -I %p/../../include %s 2>&1 | FileCheck %s
+// RUN: llvm-tblgen -gen-dag-isel -I %p/../../include -DIGNORE %s 2>&1 | FileCheck %s
 
 include "llvm/Target/Target.td"
 
@@ -29,6 +30,10 @@ def anyconvert : PatFrags<(ops node:$src),
                           [(bitconvert node:$src),
                            (specialconvert node:$src)]>;
 
+#ifdef IGNORE
+// Ensure ShouldIgnore does not disable records in dag isel emitter
+let GISelShouldIgnore = 1 in
+#endif
 // And a rule that matches that PatFrag and turns it into i2f
 def : Pat<(f32 (anyconvert (i32 GPR:$val))), (i2f GPR:$val)>;
 
diff --git a/llvm/test/ThinLTO/X86/distributed_indexes.ll b/llvm/test/ThinLTO/X86/distributed_indexes.ll
index 50724e466e30..4f2662b1b34e 100644
--- a/llvm/test/ThinLTO/X86/distributed_indexes.ll
+++ b/llvm/test/ThinLTO/X86/distributed_indexes.ll
@@ -16,11 +16,11 @@
 ; BACKEND1-NEXT: </MODULE_STRTAB_BLOCK
 ; BACKEND1-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1-NEXT: <VERSION
-; BACKEND1-DAG: <VALUE_GUID op0={{.*}}
-; BACKEND1-DAG: <VALUE_GUID op0={{.*}}
-; BACKEND1-DAG: <VALUE_GUID op0={{.*}}
-; BACKEND1-DAG: <VALUE_GUID op0={{.*}}
-; BACKEND1-DAG: <VALUE_GUID op0={{.*}}
+; BACKEND1-DAG: <VALUE_GUID {{.*}} op0={{.*}}
+; BACKEND1-DAG: <VALUE_GUID {{.*}} op0={{.*}}
+; BACKEND1-DAG: <VALUE_GUID {{.*}} op0={{.*}}
+; BACKEND1-DAG: <VALUE_GUID {{.*}} op0={{.*}}
+; BACKEND1-DAG: <VALUE_GUID {{.*}} op0={{.*}}
 ; BACKEND1-NEXT: <COMBINED_PROFILE {{.*}} op1=0
 ; BACKEND1-NEXT: <COMBINED_PROFILE {{.*}} op1=0
 ; BACKEND1-NEXT: <COMBINED_PROFILE {{.*}} op1=1
@@ -34,9 +34,9 @@
 ; BACKEND2-NEXT: </MODULE_STRTAB_BLOCK
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
-; BACKEND2-DAG: <VALUE_GUID op0={{.*}}
-; BACKEND2-DAG: <VALUE_GUID op0={{.*}}
-; BACKEND2-DAG: <VALUE_GUID op0={{.*}}
+; BACKEND2-DAG: <VALUE_GUID {{.*}} op0={{.*}}
+; BACKEND2-DAG: <VALUE_GUID {{.*}} op0={{.*}}
+; BACKEND2-DAG: <VALUE_GUID {{.*}} op0={{.*}}
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: <COMBINED_ALIAS
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-load.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-load.ll
new file mode 100644
index 000000000000..b1497aefe9b9
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-load.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=atomic-expand %s | FileCheck %s
+
+; Make sure atomic loads are not bitcasted and lose metadata
+
+define float @load_atomic_f32_global_system(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define float @load_atomic_f32_global_system(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic float, ptr addrspace(1) [[PTR]] seq_cst, align 4, !some.unknown.md [[META0:![0-9]+]]
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %ld = load atomic float, ptr addrspace(1) %ptr seq_cst, align 4, !some.unknown.md !0
+  ret float %ld
+}
+
+define float @load_atomic_f32_global_agent(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define float @load_atomic_f32_global_agent(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic float, ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %ld = load atomic float, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 4, !some.unknown.md !0
+  ret float %ld
+}
+
+define float @load_atomic_f32_local(ptr addrspace(3) %ptr) {
+; CHECK-LABEL: define float @load_atomic_f32_local(
+; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic float, ptr addrspace(3) [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %ld = load atomic float, ptr addrspace(3) %ptr seq_cst, align 4, !some.unknown.md !0
+  ret float %ld
+}
+
+define float @load_atomic_f32_flat_system(ptr %ptr) {
+; CHECK-LABEL: define float @load_atomic_f32_flat_system(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic float, ptr [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %ld = load atomic float, ptr %ptr seq_cst, align 4, !some.unknown.md !0
+  ret float %ld
+}
+
+define float @load_atomic_f32_flat_agent(ptr %ptr) {
+; CHECK-LABEL: define float @load_atomic_f32_flat_agent(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic float, ptr [[PTR]] syncscope("agent") seq_cst, align 4, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %ld = load atomic float, ptr %ptr syncscope("agent") seq_cst, align 4, !some.unknown.md !0
+  ret float %ld
+}
+
+define half @load_atomic_f16_global_system(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define half @load_atomic_f16_global_system(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic half, ptr addrspace(1) [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret half [[TMP2]]
+;
+  %ld = load atomic half, ptr addrspace(1) %ptr seq_cst, align 4, !some.unknown.md !0
+  ret half %ld
+}
+
+define half @load_atomic_f16_global_agent(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define half @load_atomic_f16_global_agent(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic half, ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret half [[TMP2]]
+;
+  %ld = load atomic half, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 4, !some.unknown.md !0
+  ret half %ld
+}
+
+define half @load_atomic_f16_local(ptr addrspace(3) %ptr) {
+; CHECK-LABEL: define half @load_atomic_f16_local(
+; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic half, ptr addrspace(3) [[PTR]] seq_cst, align 2, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret half [[TMP2]]
+;
+  %ld = load atomic half, ptr addrspace(3) %ptr seq_cst, align 2, !some.unknown.md !0
+  ret half %ld
+}
+
+define bfloat @load_atomic_bf16_global_system(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define bfloat @load_atomic_bf16_global_system(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic bfloat, ptr addrspace(1) [[PTR]] seq_cst, align 2, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret bfloat [[TMP2]]
+;
+  %ld = load atomic bfloat, ptr addrspace(1) %ptr seq_cst, align 2, !some.unknown.md !0
+  ret bfloat %ld
+}
+
+define bfloat @load_atomic_bf16_global_agent(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define bfloat @load_atomic_bf16_global_agent(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic bfloat, ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 2, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret bfloat [[TMP2]]
+;
+  %ld = load atomic bfloat, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 2, !some.unknown.md !0
+  ret bfloat %ld
+}
+
+define bfloat @load_atomic_bf16_local(ptr addrspace(3) %ptr) {
+; CHECK-LABEL: define bfloat @load_atomic_bf16_local(
+; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic bfloat, ptr addrspace(3) [[PTR]] seq_cst, align 2, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret bfloat [[TMP2]]
+;
+  %ld = load atomic bfloat, ptr addrspace(3) %ptr seq_cst, align 2, !some.unknown.md !0
+  ret bfloat %ld
+}
+
+define bfloat @load_atomic_bf16_flat(ptr %ptr) {
+; CHECK-LABEL: define bfloat @load_atomic_bf16_flat(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic bfloat, ptr [[PTR]] seq_cst, align 2, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret bfloat [[TMP2]]
+;
+  %ld = load atomic bfloat, ptr %ptr seq_cst, align 2, !some.unknown.md !0
+  ret bfloat %ld
+}
+
+define double @load_atomic_f64_global_system(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define double @load_atomic_f64_global_system(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic double, ptr addrspace(1) [[PTR]] seq_cst, align 8, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret double [[TMP2]]
+;
+  %ld = load atomic double, ptr addrspace(1) %ptr seq_cst, align 8, !some.unknown.md !0
+  ret double %ld
+}
+
+define double @load_atomic_f64_global_agent(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define double @load_atomic_f64_global_agent(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic double, ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 8, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret double [[TMP2]]
+;
+  %ld = load atomic double, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 8, !some.unknown.md !0
+  ret double %ld
+}
+
+define double @load_atomic_f64_local(ptr addrspace(3) %ptr) {
+; CHECK-LABEL: define double @load_atomic_f64_local(
+; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic double, ptr addrspace(3) [[PTR]] seq_cst, align 8, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret double [[TMP2]]
+;
+  %ld = load atomic double, ptr addrspace(3) %ptr seq_cst, align 8, !some.unknown.md !0
+  ret double %ld
+}
+
+define double @load_atomic_f64_flat_system(ptr %ptr) {
+; CHECK-LABEL: define double @load_atomic_f64_flat_system(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic double, ptr [[PTR]] seq_cst, align 8, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret double [[TMP2]]
+;
+  %ld = load atomic double, ptr %ptr seq_cst, align 8, !some.unknown.md !0
+  ret double %ld
+}
+
+define double @load_atomic_f64_flat_agent(ptr %ptr) {
+; CHECK-LABEL: define double @load_atomic_f64_flat_agent(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load atomic double, ptr [[PTR]] syncscope("agent") seq_cst, align 8, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret double [[TMP2]]
+;
+  %ld = load atomic double, ptr %ptr syncscope("agent") seq_cst, align 8, !some.unknown.md !0
+  ret double %ld
+}
+
+!0 = !{}
+
+
+;.
+; CHECK: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-store.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-store.ll
new file mode 100644
index 000000000000..db0c3a20e62f
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-store.ll
@@ -0,0 +1,179 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=atomic-expand %s | FileCheck %s
+
+define void @store_atomic_f32_global_system(float %val, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @store_atomic_f32_global_system(
+; CHECK-SAME: float [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[VAL]] to i32
+; CHECK-NEXT:    store atomic i32 [[TMP1]], ptr addrspace(1) [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic float %val, ptr addrspace(1) %ptr seq_cst, align 4, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_f32_global_agent(float %val, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @store_atomic_f32_global_agent(
+; CHECK-SAME: float [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[VAL]] to i32
+; CHECK-NEXT:    store atomic i32 [[TMP1]], ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic float %val, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 4, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_f32_local(float %val, ptr addrspace(3) %ptr) {
+; CHECK-LABEL: define void @store_atomic_f32_local(
+; CHECK-SAME: float [[VAL:%.*]], ptr addrspace(3) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[VAL]] to i32
+; CHECK-NEXT:    store atomic i32 [[TMP1]], ptr addrspace(3) [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic float %val, ptr addrspace(3) %ptr seq_cst, align 4, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_f32_flat(float %val, ptr %ptr) {
+; CHECK-LABEL: define void @store_atomic_f32_flat(
+; CHECK-SAME: float [[VAL:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[VAL]] to i32
+; CHECK-NEXT:    store atomic i32 [[TMP1]], ptr [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic float %val, ptr %ptr seq_cst, align 4, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_f16_global_system(half %val, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @store_atomic_f16_global_system(
+; CHECK-SAME: half [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VAL]] to i16
+; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr addrspace(1) [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic half %val, ptr addrspace(1) %ptr seq_cst, align 4, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_f16_global_agent(half %val, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @store_atomic_f16_global_agent(
+; CHECK-SAME: half [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VAL]] to i16
+; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic half %val, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 4, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_f16_local(half %val, ptr addrspace(3) %ptr) {
+; CHECK-LABEL: define void @store_atomic_f16_local(
+; CHECK-SAME: half [[VAL:%.*]], ptr addrspace(3) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VAL]] to i16
+; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr addrspace(3) [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic half %val, ptr addrspace(3) %ptr seq_cst, align 4, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_f16_flat(half %val, ptr %ptr) {
+; CHECK-LABEL: define void @store_atomic_f16_flat(
+; CHECK-SAME: half [[VAL:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VAL]] to i16
+; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic half %val, ptr %ptr seq_cst, align 4, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_bf16_global_system(bfloat %val, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @store_atomic_bf16_global_system(
+; CHECK-SAME: bfloat [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast bfloat [[VAL]] to i16
+; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr addrspace(1) [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic bfloat %val, ptr addrspace(1) %ptr seq_cst, align 4, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_bf16_global_agent(bfloat %val, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @store_atomic_bf16_global_agent(
+; CHECK-SAME: bfloat [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast bfloat [[VAL]] to i16
+; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic bfloat %val, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 4, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_bf16_local(bfloat %val, ptr addrspace(3) %ptr) {
+; CHECK-LABEL: define void @store_atomic_bf16_local(
+; CHECK-SAME: bfloat [[VAL:%.*]], ptr addrspace(3) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast bfloat [[VAL]] to i16
+; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr addrspace(3) [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic bfloat %val, ptr addrspace(3) %ptr seq_cst, align 4, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_bf16_flat(bfloat %val, ptr %ptr) {
+; CHECK-LABEL: define void @store_atomic_bf16_flat(
+; CHECK-SAME: bfloat [[VAL:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast bfloat [[VAL]] to i16
+; CHECK-NEXT:    store atomic i16 [[TMP1]], ptr [[PTR]] seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic bfloat %val, ptr %ptr seq_cst, align 4, !some.unknown.md !0
+  ret void
+}
+define void @store_atomic_f64_global_system(double %val, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @store_atomic_f64_global_system(
+; CHECK-SAME: double [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double [[VAL]] to i64
+; CHECK-NEXT:    store atomic i64 [[TMP1]], ptr addrspace(1) [[PTR]] seq_cst, align 8
+; CHECK-NEXT:    ret void
+;
+  store atomic double %val, ptr addrspace(1) %ptr seq_cst, align 8, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_f64_global_agent(double %val, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @store_atomic_f64_global_agent(
+; CHECK-SAME: double [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double [[VAL]] to i64
+; CHECK-NEXT:    store atomic i64 [[TMP1]], ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 8
+; CHECK-NEXT:    ret void
+;
+  store atomic double %val, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 8, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_f64_local(double %val, ptr addrspace(3) %ptr) {
+; CHECK-LABEL: define void @store_atomic_f64_local(
+; CHECK-SAME: double [[VAL:%.*]], ptr addrspace(3) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double [[VAL]] to i64
+; CHECK-NEXT:    store atomic i64 [[TMP1]], ptr addrspace(3) [[PTR]] seq_cst, align 8
+; CHECK-NEXT:    ret void
+;
+  store atomic double %val, ptr addrspace(3) %ptr seq_cst, align 8, !some.unknown.md !0
+  ret void
+}
+
+define void @store_atomic_f64_flat(double %val, ptr %ptr) {
+; CHECK-LABEL: define void @store_atomic_f64_flat(
+; CHECK-SAME: double [[VAL:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double [[VAL]] to i64
+; CHECK-NEXT:    store atomic i64 [[TMP1]], ptr [[PTR]] seq_cst, align 8
+; CHECK-NEXT:    ret void
+;
+  store atomic double %val, ptr %ptr seq_cst, align 8, !some.unknown.md !0
+  ret void
+}
+
+!0 = !{}
diff --git a/llvm/test/Transforms/CallSiteSplitting/callsite-split-debug.ll b/llvm/test/Transforms/CallSiteSplitting/callsite-split-debug.ll
index 8f10dcb30d7b..68c906d616c9 100644
--- a/llvm/test/Transforms/CallSiteSplitting/callsite-split-debug.ll
+++ b/llvm/test/Transforms/CallSiteSplitting/callsite-split-debug.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -passes=callsite-splitting -o - < %s | FileCheck %s
+; RUN: opt -S -passes=callsite-splitting -o - < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DEBUG
 ; RUN: opt -S -strip-debug -passes=callsite-splitting -o - < %s | FileCheck %s
 
 define internal i16 @bar(i16 %p1, i16 %p2) {
@@ -8,6 +8,9 @@ define internal i16 @bar(i16 %p1, i16 %p2) {
 
 define i16 @foo(i16 %in) {
 bb0:
+  %a = alloca i16, align 4, !DIAssignID !12
+  call void @llvm.dbg.assign(metadata i1 undef, metadata !11, metadata !DIExpression(), metadata !12, metadata ptr %a, metadata !DIExpression()), !dbg !8
+  store i16 7, ptr %a, align 4, !DIAssignID !13
   br label %bb1
 
 bb1:
@@ -20,13 +23,21 @@ bb2:
 CallsiteBB:
   %1 = phi i16 [ 0, %bb1 ], [ 1, %bb2 ]
   %c = phi i16 [ 2, %bb1 ], [ 3, %bb2 ]
+  %p = phi ptr [ %a, %bb1 ], [ %a, %bb2 ]
+  call void @llvm.dbg.value(metadata i16 %1, metadata !7, metadata !DIExpression()), !dbg !8
   call void @llvm.dbg.value(metadata i16 %c, metadata !7, metadata !DIExpression()), !dbg !8
+  call void @llvm.dbg.value(metadata !DIArgList(i16 %1, i16 %c), metadata !7, metadata !DIExpression()), !dbg !8
+  call void @llvm.dbg.value(metadata !DIArgList(i16 %c, i16 %c), metadata !7, metadata !DIExpression()), !dbg !8
+  call void @llvm.dbg.assign(metadata i16 %1, metadata !11, metadata !DIExpression(), metadata !13, metadata ptr %a, metadata !DIExpression()), !dbg !8
+  call void @llvm.dbg.assign(metadata i16 %c, metadata !11, metadata !DIExpression(), metadata !13, metadata ptr %a, metadata !DIExpression()), !dbg !8
+  call void @llvm.dbg.assign(metadata i16 %1, metadata !11, metadata !DIExpression(), metadata !13, metadata ptr %p, metadata !DIExpression()), !dbg !8
   %2 = call i16 @bar(i16 %1, i16 5)
   ret i16 %2
 }
 
 ; Function Attrs: nounwind readnone speculatable
 declare void @llvm.dbg.value(metadata, metadata, metadata) #0
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
 
 attributes #0 = { nounwind readnone speculatable }
 
@@ -43,14 +54,37 @@ attributes #0 = { nounwind readnone speculatable }
 !6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 4, unit: !0)
 !7 = !DILocalVariable(name: "c", scope: !6, line: 5, type: !5)
 !8 = !DILocation(line: 5, column: 7, scope: !6)
+!11 = !DILocalVariable(name: "a", scope: !6, line: 6, type: !5)
+!12 = distinct !DIAssignID()
+!13 = distinct !DIAssignID()
 
 ; The optimization should trigger even in the presence of the dbg.value in
 ; CallSiteBB.
 
 ; CHECK-LABEL: @foo
 ; CHECK-LABEL: bb1.split:
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata i16 0, metadata ![[DBG_1:[0-9]+]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata i16 2, metadata ![[DBG_1]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata !DIArgList(i16 0, i16 2), {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata !DIArgList(i16 2, i16 2), {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.assign(metadata i16 0, metadata ![[DBG_2:[0-9]+]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.assign(metadata i16 2, metadata ![[DBG_2]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.assign(metadata i16 0, metadata ![[DBG_2]], metadata !DIExpression(), metadata ![[ID_1:[0-9]+]], metadata ptr %a, {{.*}}
 ; CHECK: [[TMP1:%[0-9]+]] = call i16 @bar(i16 0, i16 5)
+
 ; CHECK-LABEL: bb2.split:
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata i16 1, metadata ![[DBG_1]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata i16 3, metadata ![[DBG_1]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata !DIArgList(i16 1, i16 3), {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata !DIArgList(i16 3, i16 3), {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.assign(metadata i16 1, metadata ![[DBG_2]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.assign(metadata i16 3, metadata ![[DBG_2]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.assign(metadata i16 1, metadata ![[DBG_2]], metadata !DIExpression(), metadata ![[ID_1:[0-9]+]], metadata ptr %a, {{.*}}
 ; CHECK: [[TMP2:%[0-9]+]] = call i16 @bar(i16 1, i16 5)
+
 ; CHECK-LABEL: CallsiteBB
 ; CHECK: %phi.call = phi i16 [ [[TMP2]], %bb2.split ], [ [[TMP1]], %bb1.split
+
+; CHECK-DEBUG-DAG: ![[DBG_1]] = !DILocalVariable(name: "c"{{.*}})
+; CHECK-DEBUG-DAG: ![[DBG_2]] = !DILocalVariable(name: "a"{{.*}})
+; CHECK-DEBUG-DAG: ![[ID_1]] = distinct !DIAssignID()
diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll
index ff5cef7e781f..25dfb3c53a07 100644
--- a/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll
@@ -211,6 +211,29 @@ else:
   ret i32 %l
 }
 
+define i32 @sub10_else_drop_nuw(i32 %a) {
+; CHECK-LABEL: @sub10_else_drop_nuw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[L:%.*]] = sub i32 [[A:%.*]], 10
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[L]], 0
+; CHECK-NEXT:    br i1 [[TMP0]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    ret i32 0
+; CHECK:       else:
+; CHECK-NEXT:    ret i32 [[L]]
+;
+entry:
+  %c = icmp eq i32 %a, 10
+  br i1 %c, label %then, label %else
+
+then:
+  ret i32 0
+
+else:
+  %l = sub nuw i32 %a, 10
+  ret i32 %l
+}
+
 define i32 @subm10_then(i32 %a) {
 ; CHECK-LABEL: @subm10_then(
 ; CHECK-NEXT:  entry:
diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll
new file mode 100644
index 000000000000..a6909d149134
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -codegenprepare -S -mtriple=riscv64 < %s | FileCheck %s
+
+define i8 @hoist_add(i8 %x) {
+; CHECK-LABEL: define i8 @hoist_add(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INC:%.*]] = add i8 [[X]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i8 [[INC]], 0
+; CHECK-NEXT:    br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i8 [[RETVAL]]
+;
+entry:
+  %cmp = icmp eq i8 %x, -1
+  br i1 %cmp, label %exit, label %if.then
+
+if.then:
+  %inc = add nuw nsw i8 %x, 1
+  br label %exit
+
+exit:
+  %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ]
+  ret i8 %retval
+}
+
+define i8 @hoist_lshr(i8 %x) {
+; CHECK-LABEL: define i8 @hoist_lshr(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INC:%.*]] = lshr i8 [[X]], 3
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i8 [[INC]], 0
+; CHECK-NEXT:    br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i8 [[RETVAL]]
+;
+entry:
+  %cmp = icmp ult i8 %x, 8
+  br i1 %cmp, label %exit, label %if.then
+
+if.then:
+  %inc = lshr exact i8 %x, 3
+  br label %exit
+
+exit:
+  %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ]
+  ret i8 %retval
+}
+
+define i8 @nomove_add(i8 %x) {
+; CHECK-LABEL: define i8 @nomove_add(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INC:%.*]] = add i8 [[X]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i8 [[INC]], 0
+; CHECK-NEXT:    br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i8 [[RETVAL]]
+;
+entry:
+  %inc = add nuw nsw i8 %x, 1
+  %cmp = icmp eq i8 %x, -1
+  br i1 %cmp, label %exit, label %if.then
+
+if.then:
+  br label %exit
+
+exit:
+  %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ]
+  ret i8 %retval
+}
diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/noop-copy-sink.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/noop-copy-sink.ll
new file mode 100644
index 000000000000..55cde6c1431f
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/noop-copy-sink.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=riscv64 %s \
+; RUN:   | FileCheck --check-prefixes=CHECK %s
+
+define i16 @sink_trunc1(i64 %a) {
+; CHECK-LABEL: @sink_trunc1(
+; CHECK-NEXT:  fnend:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[A:%.*]] to i16
+; CHECK-NEXT:    ret i16 [[TMP0]]
+;
+  %trunc = trunc i64 %a to i16
+  br label %fnend
+
+fnend:
+  ret i16 %trunc
+}
+
+; The flags on the original trunc should be preserved.
+define i16 @sink_trunc2(i64 %a) {
+; CHECK-LABEL: @sink_trunc2(
+; CHECK-NEXT:  fnend:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc nuw nsw i64 [[A:%.*]] to i16
+; CHECK-NEXT:    ret i16 [[TMP0]]
+;
+  %trunc = trunc nuw nsw i64 %a to i16
+  br label %fnend
+
+fnend:
+  ret i16 %trunc
+}
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll
index cdd58b2a084f..3e91b79c10f7 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll
@@ -1,9 +1,12 @@
 ; Tests that we would convert coro.resume to a musttail call if the target is
-; Wasm64 with tail-call support.
-; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; Wasm64 or Wasm32 with tail-call support.
+; REQUIRES: webassembly-registered-target
 
-target triple = "wasm64-unknown-unknown"
+; RUN: opt -mtriple=wasm64-unknown-unknown < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt -mtriple=wasm64-unknown-unknown < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+
+; RUN: opt -mtriple=wasm32-unknown-unknown < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+; RUN: opt -mtriple=wasm32-unknown-unknown < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
 
 define void @f() #0 {
 entry:
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail11.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail11.ll
deleted file mode 100644
index da5d868280e9..000000000000
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail11.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; Tests that we would convert coro.resume to a musttail call if the target is
-; Wasm32 with tail-call support.
-; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
-
-target triple = "wasm32-unknown-unknown"
-
-define void @f() #0 {
-entry:
-  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
-  %alloc = call ptr @malloc(i64 16) #3
-  %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
-
-  %save = call token @llvm.coro.save(ptr null)
-  %addr1 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-  call fastcc void %addr1(ptr null)
-
-  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
-  switch i8 %suspend, label %exit [
-    i8 0, label %await.ready
-    i8 1, label %exit
-  ]
-await.ready:
-  %save2 = call token @llvm.coro.save(ptr null)
-  %addr2 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0)
-  call fastcc void %addr2(ptr null)
-
-  %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
-  switch i8 %suspend2, label %exit [
-    i8 0, label %exit
-    i8 1, label %exit
-  ]
-exit:
-  call i1 @llvm.coro.end(ptr null, i1 false, token none)
-  ret void
-}
-
-; CHECK: musttail call
-
-declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
-declare i1 @llvm.coro.alloc(token) #2
-declare i64 @llvm.coro.size.i64() #3
-declare ptr @llvm.coro.begin(token, ptr writeonly) #2
-declare token @llvm.coro.save(ptr) #2
-declare ptr @llvm.coro.frame() #3
-declare i8 @llvm.coro.suspend(token, i1) #2
-declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1
-declare i1 @llvm.coro.end(ptr, i1, token) #2
-declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1
-declare ptr @malloc(i64)
-
-attributes #0 = { presplitcoroutine "target-features"="+tail-call" }
-attributes #1 = { argmemonly nounwind readonly }
-attributes #2 = { nounwind }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Transforms/FunctionSpecialization/discover-transitive-phis.ll b/llvm/test/Transforms/FunctionSpecialization/discover-transitive-phis.ll
index b4c24715037b..d0095231a30f 100644
--- a/llvm/test/Transforms/FunctionSpecialization/discover-transitive-phis.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/discover-transitive-phis.ll
@@ -1,22 +1,22 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ;
 ; RUN: opt -passes="ipsccp<func-spec>" -funcspec-min-function-size=20 -funcspec-for-literal-constant -S < %s | FileCheck %s --check-prefix=FUNCSPEC
 ; RUN: opt -passes="ipsccp<func-spec>" -funcspec-min-function-size=20 -funcspec-for-literal-constant -funcspec-max-discovery-iterations=16 -S < %s | FileCheck %s --check-prefix=NOFUNCSPEC
 
 define i64 @bar(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10) {
-; FUNCSPEC-LABEL: define i64 @bar(
+; FUNCSPEC-LABEL: define range(i64 4, 13) i64 @bar(
 ; FUNCSPEC-SAME: i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]], i1 [[C4:%.*]], i1 [[C5:%.*]], i1 [[C6:%.*]], i1 [[C7:%.*]], i1 [[C8:%.*]], i1 [[C9:%.*]], i1 [[C10:%.*]]) {
 ; FUNCSPEC-NEXT:  entry:
-; FUNCSPEC-NEXT:    [[F1:%.*]] = call i64 @foo.specialized.1(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0:![0-9]+]]
-; FUNCSPEC-NEXT:    [[F2:%.*]] = call i64 @foo.specialized.2(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG1:![0-9]+]]
+; FUNCSPEC-NEXT:    [[F1:%.*]] = call i64 @foo.specialized.1(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]])
+; FUNCSPEC-NEXT:    [[F2:%.*]] = call i64 @foo.specialized.2(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]])
 ; FUNCSPEC-NEXT:    [[ADD:%.*]] = add nuw nsw i64 [[F1]], [[F2]]
 ; FUNCSPEC-NEXT:    ret i64 [[ADD]]
 ;
-; NOFUNCSPEC-LABEL: define i64 @bar(
+; NOFUNCSPEC-LABEL: define range(i64 4, 13) i64 @bar(
 ; NOFUNCSPEC-SAME: i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]], i1 [[C4:%.*]], i1 [[C5:%.*]], i1 [[C6:%.*]], i1 [[C7:%.*]], i1 [[C8:%.*]], i1 [[C9:%.*]], i1 [[C10:%.*]]) {
 ; NOFUNCSPEC-NEXT:  entry:
-; NOFUNCSPEC-NEXT:    [[F1:%.*]] = call i64 @foo(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0:![0-9]+]]
-; NOFUNCSPEC-NEXT:    [[F2:%.*]] = call i64 @foo(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0]]
+; NOFUNCSPEC-NEXT:    [[F1:%.*]] = call i64 @foo(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]])
+; NOFUNCSPEC-NEXT:    [[F2:%.*]] = call i64 @foo(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]])
 ; NOFUNCSPEC-NEXT:    [[ADD:%.*]] = add nuw nsw i64 [[F1]], [[F2]]
 ; NOFUNCSPEC-NEXT:    ret i64 [[ADD]]
 ;
@@ -28,6 +28,50 @@ entry:
 }
 
 define internal i64 @foo(i64 %n, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10) {
+; NOFUNCSPEC-LABEL: define internal range(i64 2, 7) i64 @foo(
+; NOFUNCSPEC-SAME: i64 [[N:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]], i1 [[C4:%.*]], i1 [[C5:%.*]], i1 [[C6:%.*]], i1 [[C7:%.*]], i1 [[C8:%.*]], i1 [[C9:%.*]], i1 [[C10:%.*]]) {
+; NOFUNCSPEC-NEXT:  entry:
+; NOFUNCSPEC-NEXT:    br i1 [[C1]], label [[L1:%.*]], label [[L9:%.*]]
+; NOFUNCSPEC:       l1:
+; NOFUNCSPEC-NEXT:    [[PHI1:%.*]] = phi i64 [ [[N]], [[ENTRY:%.*]] ], [ [[PHI2:%.*]], [[L2:%.*]] ]
+; NOFUNCSPEC-NEXT:    [[ADD:%.*]] = add nuw nsw i64 [[PHI1]], 1
+; NOFUNCSPEC-NEXT:    br i1 [[C2]], label [[L1_5:%.*]], label [[EXIT:%.*]]
+; NOFUNCSPEC:       l1_5:
+; NOFUNCSPEC-NEXT:    br i1 [[C3]], label [[L1_75:%.*]], label [[L6:%.*]]
+; NOFUNCSPEC:       l1_75:
+; NOFUNCSPEC-NEXT:    br i1 [[C4]], label [[L2]], label [[L3:%.*]]
+; NOFUNCSPEC:       l2:
+; NOFUNCSPEC-NEXT:    [[PHI2]] = phi i64 [ [[PHI1]], [[L1_75]] ], [ [[PHI3:%.*]], [[L3]] ]
+; NOFUNCSPEC-NEXT:    br label [[L1]]
+; NOFUNCSPEC:       l3:
+; NOFUNCSPEC-NEXT:    [[PHI3]] = phi i64 [ [[PHI1]], [[L1_75]] ], [ [[PHI4:%.*]], [[L4:%.*]] ]
+; NOFUNCSPEC-NEXT:    br label [[L2]]
+; NOFUNCSPEC:       l4:
+; NOFUNCSPEC-NEXT:    [[PHI4]] = phi i64 [ [[PHI5:%.*]], [[L5:%.*]] ], [ [[PHI6:%.*]], [[L6]] ]
+; NOFUNCSPEC-NEXT:    br i1 [[C5]], label [[L3]], label [[L6]]
+; NOFUNCSPEC:       l5:
+; NOFUNCSPEC-NEXT:    [[PHI5]] = phi i64 [ [[PHI6]], [[L6_5:%.*]] ], [ [[PHI7:%.*]], [[L7:%.*]] ]
+; NOFUNCSPEC-NEXT:    br label [[L4]]
+; NOFUNCSPEC:       l6:
+; NOFUNCSPEC-NEXT:    [[PHI6]] = phi i64 [ [[PHI4]], [[L4]] ], [ [[PHI1]], [[L1_5]] ]
+; NOFUNCSPEC-NEXT:    br i1 [[C6]], label [[L4]], label [[L6_5]]
+; NOFUNCSPEC:       l6_5:
+; NOFUNCSPEC-NEXT:    br i1 [[C7]], label [[L5]], label [[L8:%.*]]
+; NOFUNCSPEC:       l7:
+; NOFUNCSPEC-NEXT:    [[PHI7]] = phi i64 [ [[PHI9:%.*]], [[L9]] ], [ [[PHI8:%.*]], [[L8]] ]
+; NOFUNCSPEC-NEXT:    br i1 [[C8]], label [[L5]], label [[L8]]
+; NOFUNCSPEC:       l8:
+; NOFUNCSPEC-NEXT:    [[PHI8]] = phi i64 [ [[PHI6]], [[L6_5]] ], [ [[PHI7]], [[L7]] ]
+; NOFUNCSPEC-NEXT:    br i1 [[C9]], label [[L7]], label [[L9]]
+; NOFUNCSPEC:       l9:
+; NOFUNCSPEC-NEXT:    [[PHI9]] = phi i64 [ [[N]], [[ENTRY]] ], [ [[PHI8]], [[L8]] ]
+; NOFUNCSPEC-NEXT:    [[SUB:%.*]] = sub nuw nsw i64 [[PHI9]], 1
+; NOFUNCSPEC-NEXT:    [[MUL:%.*]] = mul nuw nsw i64 [[SUB]], 2
+; NOFUNCSPEC-NEXT:    br i1 [[C10]], label [[L7]], label [[EXIT]]
+; NOFUNCSPEC:       exit:
+; NOFUNCSPEC-NEXT:    [[RES:%.*]] = phi i64 [ 2, [[L1]] ], [ [[MUL]], [[L9]] ]
+; NOFUNCSPEC-NEXT:    ret i64 [[RES]]
+;
 entry:
   br i1 %c1, label %l1, label %l9
 
diff --git a/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll b/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll
index b9481baae60b..a576d9aa32e1 100644
--- a/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll
@@ -49,10 +49,10 @@ entry:
 ; Check if specialisation on the address of a non-const global variable
 ; is not allowed, then it is not performed.
 
-; NO-GLOBALS-LABEL: define internal i32 @g()
+; NO-GLOBALS-LABEL: define internal range(i32 -2147483646, -2147483648) i32 @g()
 ; NO-GLOBALS: call i32 @f(ptr @G)
 
-; NO-GLOBALS-LABEL: define i32 @h0(ptr %p)
+; NO-GLOBALS-LABEL: define range(i32 -2147483646, -2147483648) i32 @h0(ptr %p)
 ; NO-GLOBALS:call i32 @g()
 
 ; NO-GLOBALS-LABEL: define i32 @h1()
@@ -64,10 +64,10 @@ entry:
 ; Check if specialisation on the address of a non-const global variable
 ; is allowed, then it is performed where possible.
 
-; GLOBALS-LABEL: define internal i32 @g()
+; GLOBALS-LABEL: define internal range(i32 -2147483646, -2147483648) i32 @g()
 ; GLOBALS: call i32 @f.specialized.2()
 
-; GLOBALS-LABEL: define i32 @h0(ptr %p)
+; GLOBALS-LABEL: define range(i32 -2147483646, -2147483648) i32 @h0(ptr %p)
 ; GLOBALS: call i32 @g()
 
 ; GLOBALS-LABEL: define i32 @h1()
diff --git a/llvm/test/Transforms/FunctionSpecialization/literal-const.ll b/llvm/test/Transforms/FunctionSpecialization/literal-const.ll
index f107ffe0ec7e..3eae3dc261fb 100644
--- a/llvm/test/Transforms/FunctionSpecialization/literal-const.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/literal-const.ll
@@ -71,10 +71,10 @@ entry:
 ; CHECK-LIT-LABEL: define i32 @f1
 ; CHECK-LIT: call i32 @neg.specialized.[[#B:]]
 
-; CHECK-LIT-LABEL: define i32 @g0
+; CHECK-LIT-LABEL: define range(i32 -2147483647, -2147483648) i32 @g0
 ; CHECK-LIT: call i32 @add.specialized.[[#C:]]
 
-; CHECK-LIT-LABEL: define i32 @g1
+; CHECK-LIT-LABEL: define range(i32 -2147483647, -2147483648) i32 @g1
 ; CHECK-LIT: call i32 @add.specialized.[[#D:]]
 
 ; CHECK-LIT-LABEL: define float @h0
diff --git a/llvm/test/Transforms/GlobalOpt/atexit-dtor.ll b/llvm/test/Transforms/GlobalOpt/atexit-dtor.ll
new file mode 100644
index 000000000000..bf78fd3d3ef4
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/atexit-dtor.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -S -passes=globalopt | FileCheck %s
+
+declare dso_local i32 @atexit(ptr)
+
+define dso_local void @empty_atexit_handler() {
+; CHECK-LABEL: define dso_local void @empty_atexit_handler() local_unnamed_addr {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; Check that `atexit` is removed if the handler is empty.
+; Check that a removed `atexit` call returns `0` which is the value that denotes success.
+define dso_local noundef i32 @register_atexit_handler() {
+; CHECK-LABEL: define dso_local noundef i32 @register_atexit_handler() local_unnamed_addr {
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @atexit(ptr @empty_atexit_handler)
+  ret i32 %1
+}
+
+declare dso_local void @declared_atexit_handler()
+
+; Check that an atexit handler with only a declaration is not removed.
+define dso_local noundef i32 @register_declared_atexit_handler() {
+; CHECK-LABEL: define dso_local noundef i32 @register_declared_atexit_handler() local_unnamed_addr {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @atexit(ptr @declared_atexit_handler)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @atexit(ptr @declared_atexit_handler)
+  ret i32 %1
+}
+
+declare dso_local void @external_exit_func()
+
+define dso_local void @nonempty_atexit_handler() {
+; CHECK-LABEL: define dso_local void @nonempty_atexit_handler() {
+; CHECK-NEXT:    call void @external_exit_func()
+; CHECK-NEXT:    ret void
+;
+  call void @external_exit_func()
+  ret void
+}
+
+; Check that an atexit handler that consists of any instructions other than `ret` is considered nonempty and not removed.
+define dso_local noundef i32 @register_nonempty_atexit_handler() {
+; CHECK-LABEL: define dso_local noundef i32 @register_nonempty_atexit_handler() local_unnamed_addr {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @atexit(ptr @nonempty_atexit_handler)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @atexit(ptr @nonempty_atexit_handler)
+  ret i32 %1
+}
diff --git a/llvm/test/Transforms/GlobalOpt/basictest.ll b/llvm/test/Transforms/GlobalOpt/basictest.ll
index 6d7fcdd96dfd..72d38a1e8845 100644
--- a/llvm/test/Transforms/GlobalOpt/basictest.ll
+++ b/llvm/test/Transforms/GlobalOpt/basictest.ll
@@ -1,9 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=globalopt -S | FileCheck %s
 
-; CHECK-NOT: global
 @X = internal global i32 4              ; <ptr> [#uses=1]
 
 define i32 @foo() {
-        %V = load i32, ptr @X               ; <i32> [#uses=1]
-        ret i32 %V
+; CHECK-LABEL: define i32 @foo() local_unnamed_addr {
+; CHECK-NEXT:    ret i32 4
+;
+  %V = load i32, ptr @X               ; <i32> [#uses=1]
+  ret i32 %V
+}
+
+@X_tls = internal thread_local global i32 13
+
+define i32 @bar() {
+; CHECK-LABEL: define i32 @bar() local_unnamed_addr {
+; CHECK-NEXT:    ret i32 13
+;
+  %p = call ptr @llvm.threadlocal.address(ptr @X_tls)
+  %v = load i32, ptr %p
+  ret i32 %v
 }
diff --git a/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll b/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll
index ca844f63937c..f82942e73d92 100644
--- a/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll
+++ b/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll
@@ -72,11 +72,12 @@ entry:
 }
 
 @threadlocalptr = global ptr null, align 4
-; CHECK: @threadlocalptr = global ptr null, align 4
+; CHECK: @threadlocalptr = local_unnamed_addr global ptr null, align 4
 @threadlocalvar = external thread_local global i32
 define internal void @test5() {
 entry:
-  store ptr @threadlocalvar, ptr @threadlocalptr, align 4
+  %p = call ptr @llvm.threadlocal.address(ptr @threadlocalvar)
+  store ptr %p, ptr @threadlocalptr, align 4
   ret void
 }
 
diff --git a/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll b/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll
index 7b845070bbd0..2b7ceb4169f3 100644
--- a/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll
+++ b/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll
@@ -39,12 +39,14 @@ define i32 @dom_arg(i32 %a) {
 
 define ptr @dom_thread_local_global() {
 ; CHECK-LABEL: @dom_thread_local_global(
-; CHECK-NEXT:    store ptr @tl, ptr @g3, align 8
+; CHECK-NEXT:    [[P:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tl)
+; CHECK-NEXT:    store ptr [[P]], ptr @g3, align 8
 ; CHECK-NEXT:    call void @b()
 ; CHECK-NEXT:    [[R:%.*]] = load ptr, ptr @g3, align 8
 ; CHECK-NEXT:    ret ptr [[R]]
 ;
-  store ptr @tl, ptr @g3
+  %p = call ptr @llvm.threadlocal.address(ptr @tl)
+  store ptr %p, ptr @g3
   call void @b()
   %r = load ptr, ptr @g3
   ret ptr %r
diff --git a/llvm/test/Transforms/GlobalOpt/tls.ll b/llvm/test/Transforms/GlobalOpt/tls.ll
index 6ba003ff30b2..2cc2ea4e366e 100644
--- a/llvm/test/Transforms/GlobalOpt/tls.ll
+++ b/llvm/test/Transforms/GlobalOpt/tls.ll
@@ -15,14 +15,16 @@ declare void @start_thread(ptr)
 define i32 @f() {
 entry:
   ; Set @ip to point to x[1] for thread 1.
-  store ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), ptr @ip, align 8
+  %p = call ptr @llvm.threadlocal.address(ptr @x)
+  %addr = getelementptr inbounds [100 x i32], ptr %p, i64 0, i64 1
+  store ptr %addr, ptr @ip, align 8
 
   ; Run g on a new thread.
   tail call void @start_thread(ptr @g) nounwind
   tail call void @wait() nounwind
 
   ; Reset x[1] for thread 1.
-  store i32 0, ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), align 4
+  store i32 0, ptr %addr, align 4
 
   ; Read the value of @ip, which now points at x[1] for thread 2.
   %0 = load ptr, ptr @ip, align 8
@@ -39,10 +41,12 @@ entry:
 define internal void @g() nounwind uwtable {
 entry:
   ; Set @ip to point to x[1] for thread 2.
-  store ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), ptr @ip, align 8
+  %p = call ptr @llvm.threadlocal.address(ptr @x)
+  %addr = getelementptr inbounds [100 x i32], ptr %p, i64 0, i64 1
+  store ptr %addr, ptr @ip, align 8
 
   ; Store 50 in x[1] for thread 2.
-  store i32 50, ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), align 4
+  store i32 50, ptr %addr, align 4
 
   tail call void @signal() nounwind
   ret void
diff --git a/llvm/test/Transforms/IRCE/pr89959.ll b/llvm/test/Transforms/IRCE/pr89959.ll
new file mode 100644
index 000000000000..dc7c0dfbc57a
--- /dev/null
+++ b/llvm/test/Transforms/IRCE/pr89959.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=irce -S < %s 2>&1 | FileCheck %s
+
+; Make sure we don't crash.
+define void @pr89959() {
+; CHECK-LABEL: define void @pr89959() {
+; CHECK-NEXT:  top:
+; CHECK-NEXT:    br label [[L3:%.*]]
+; CHECK:       L3:
+; CHECK-NEXT:    [[VALUE_PHI:%.*]] = phi ptr [ null, [[TOP:%.*]] ], [ [[TMP0:%.*]], [[L13:%.*]] ]
+; CHECK-NEXT:    [[TMP0]] = getelementptr i8, ptr [[VALUE_PHI]], i64 8
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp ule ptr [[VALUE_PHI]], null
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[L13]], label [[L15:%.*]]
+; CHECK:       L13:
+; CHECK-NEXT:    br label [[L3]]
+; CHECK:       L15:
+; CHECK-NEXT:    ret void
+;
+top:
+  br label %L3
+
+L3:
+  %value_phi = phi ptr [ null, %top ], [ %0, %L13 ]
+  %0 = getelementptr i8, ptr %value_phi, i64 8
+  %.not = icmp ule ptr %value_phi, null
+  br i1 %.not, label %L13, label %L15
+
+L13:
+  br label %L3
+
+L15:
+  ret void
+}
diff --git a/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll b/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll
index f02d03688f03..4a9c576f0271 100644
--- a/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll
+++ b/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll
@@ -38,6 +38,8 @@ store_ptr_in_gvar:                                ; preds = %entry
 
 check_pointers_are_equal:                         ; preds = %store_ptr_in_gvar, %entry
   %phi = phi ptr [ %ptr, %store_ptr_in_gvar ], [ @other_g_var, %entry ]
+; FIXME: While inlining, the following is miscompiled to i1 false,
+; as %ptr in the phi-node is not taken into account.
   %.not1 = icmp eq ptr %phi, %ptr
   br i1 %.not1, label %return, label %abort
 
@@ -62,13 +64,9 @@ define i32 @main() {
 ; CHECK-NEXT:    br label [[CHECK_POINTERS_ARE_EQUAL_I]]
 ; CHECK:       check_pointers_are_equal.i:
 ; CHECK-NEXT:    [[PHI_I:%.*]] = phi ptr [ [[G_VAR]], [[STORE_PTR_IN_GVAR_I]] ], [ @other_g_var, [[TMP0:%.*]] ]
-; CHECK-NEXT:    [[DOTNOT1_I:%.*]] = icmp eq ptr [[PHI_I]], [[G_VAR]]
-; CHECK-NEXT:    br i1 [[DOTNOT1_I]], label [[CALLEE_EXIT:%.*]], label [[ABORT_I:%.*]]
-; CHECK:       abort.i:
 ; CHECK-NEXT:    call void @abort()
 ; CHECK-NEXT:    unreachable
 ; CHECK:       callee.exit:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr [[G_VAR]])
 ; CHECK-NEXT:    ret i32 0
 ;
   call void @callee(ptr noundef byval(%struct.a) align 8 @g_var)
diff --git a/llvm/test/Transforms/Inline/prof-update-sample-alwaysinline.ll b/llvm/test/Transforms/Inline/prof-update-sample-alwaysinline.ll
index 8af4d89663a4..d6b771e2629d 100644
--- a/llvm/test/Transforms/Inline/prof-update-sample-alwaysinline.ll
+++ b/llvm/test/Transforms/Inline/prof-update-sample-alwaysinline.ll
@@ -53,6 +53,7 @@ define void @caller() {
 !18 = !{!"VP", i32 0, i64 140, i64 111, i64 80, i64 222, i64 40, i64 333, i64 20}
 attributes #0 = { alwaysinline }
 ; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 600}
+; CHECK: ![[COUNT_CALLEE1]] = !{!"branch_weights", i32 2000}
 ; CHECK: ![[COUNT_CALLEE]] = !{!"branch_weights", i32 1200}
 ; CHECK: ![[COUNT_IND_CALLEE]] = !{!"VP", i32 0, i64 84, i64 111, i64 48, i64 222, i64 24, i64 333, i64 12}
 ; CHECK: ![[COUNT_CALLER]] = !{!"branch_weights", i32 800}
diff --git a/llvm/test/Transforms/Inline/prof-update-sample.ll b/llvm/test/Transforms/Inline/prof-update-sample.ll
index e09b859b6981..6cdd70e84e0c 100644
--- a/llvm/test/Transforms/Inline/prof-update-sample.ll
+++ b/llvm/test/Transforms/Inline/prof-update-sample.ll
@@ -52,6 +52,7 @@ define void @caller() {
 !17 = !{!"branch_weights", i32 400}
 !18 = !{!"VP", i32 0, i64 140, i64 111, i64 80, i64 222, i64 40, i64 333, i64 20}
 ; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 600}
+; CHECK: ![[COUNT_CALLEE1]] = !{!"branch_weights", i32 2000}
 ; CHECK: ![[COUNT_CALLEE]] = !{!"branch_weights", i32 1200}
 ; CHECK: ![[COUNT_IND_CALLEE]] = !{!"VP", i32 0, i64 84, i64 111, i64 48, i64 222, i64 24, i64 333, i64 12}
 ; CHECK: ![[COUNT_CALLER]] = !{!"branch_weights", i32 800}
diff --git a/llvm/test/Transforms/InstCombine/array.ll b/llvm/test/Transforms/InstCombine/array.ll
index 236821d8ba4c..4f4ae17bebc5 100644
--- a/llvm/test/Transforms/InstCombine/array.ll
+++ b/llvm/test/Transforms/InstCombine/array.ll
@@ -108,3 +108,163 @@ entry:
   store i32 %b, ptr %gep
   ret void
 }
+
+define ptr @gep_inbounds_add_nsw_nonneg(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_inbounds_add_nsw_nonneg(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:    [[A_NNEG:%.*]] = icmp sgt i64 [[A]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT:    [[B_NNEG:%.*]] = icmp sgt i64 [[B]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[B_NNEG]])
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %a.nneg = icmp sgt i64 %a, -1
+  call void @llvm.assume(i1 %a.nneg)
+  %b.nneg = icmp sgt i64 %b, -1
+  call void @llvm.assume(i1 %b.nneg)
+  %add = add nsw i64 %a, %b
+  %gep = getelementptr inbounds i32, ptr %ptr, i64 %add
+  ret ptr %gep
+}
+
+define ptr @gep_inbounds_add_nsw_not_nonneg1(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_inbounds_add_nsw_not_nonneg1(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:    [[A_NNEG:%.*]] = icmp sgt i64 [[A]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %a.nneg = icmp sgt i64 %a, -1
+  call void @llvm.assume(i1 %a.nneg)
+  %add = add nsw i64 %a, %b
+  %gep = getelementptr inbounds i32, ptr %ptr, i64 %add
+  ret ptr %gep
+}
+
+define ptr @gep_inbounds_add_nsw_not_nonneg2(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_inbounds_add_nsw_not_nonneg2(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:    [[B_NNEG:%.*]] = icmp sgt i64 [[B]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[B_NNEG]])
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %b.nneg = icmp sgt i64 %b, -1
+  call void @llvm.assume(i1 %b.nneg)
+  %add = add nsw i64 %a, %b
+  %gep = getelementptr inbounds i32, ptr %ptr, i64 %add
+  ret ptr %gep
+}
+
+define ptr @gep_not_inbounds_add_nsw_nonneg(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_not_inbounds_add_nsw_nonneg(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:    [[A_NNEG:%.*]] = icmp sgt i64 [[A]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT:    [[B_NNEG:%.*]] = icmp sgt i64 [[B]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[B_NNEG]])
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %a.nneg = icmp sgt i64 %a, -1
+  call void @llvm.assume(i1 %a.nneg)
+  %b.nneg = icmp sgt i64 %b, -1
+  call void @llvm.assume(i1 %b.nneg)
+  %add = add nsw i64 %a, %b
+  %gep = getelementptr i32, ptr %ptr, i64 %add
+  ret ptr %gep
+}
+
+define ptr @gep_inbounds_add_not_nsw_nonneg(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_inbounds_add_not_nsw_nonneg(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:    [[A_NNEG:%.*]] = icmp sgt i64 [[A]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT:    [[B_NNEG:%.*]] = icmp sgt i64 [[B]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[B_NNEG]])
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %a.nneg = icmp sgt i64 %a, -1
+  call void @llvm.assume(i1 %a.nneg)
+  %b.nneg = icmp sgt i64 %b, -1
+  call void @llvm.assume(i1 %b.nneg)
+  %add = add i64 %a, %b
+  %gep = getelementptr inbounds i32, ptr %ptr, i64 %add
+  ret ptr %gep
+}
+
+define ptr @gep_inbounds_sext_add_nonneg(ptr %ptr, i32 %a) {
+; CHECK-LABEL: define ptr @gep_inbounds_sext_add_nonneg(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[A_NNEG:%.*]] = icmp sgt i32 [[A]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT:    [[TMP1:%.*]] = zext nneg i32 [[A]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 40
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %a.nneg = icmp sgt i32 %a, -1
+  call void @llvm.assume(i1 %a.nneg)
+  %add = add nsw i32 %a, 10
+  %idx = sext i32 %add to i64
+  %gep = getelementptr inbounds i32, ptr %ptr, i64 %idx
+  ret ptr %gep
+}
+
+define ptr @gep_inbounds_sext_add_not_nonneg_1(ptr %ptr, i32 %a) {
+; CHECK-LABEL: define ptr @gep_inbounds_sext_add_not_nonneg_1(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[A_NNEG:%.*]] = icmp sgt i32 [[A]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT:    [[TMP1:%.*]] = zext nneg i32 [[A]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 -40
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %a.nneg = icmp sgt i32 %a, -1
+  call void @llvm.assume(i1 %a.nneg)
+  %add = add nsw i32 %a, -10
+  %idx = sext i32 %add to i64
+  %gep = getelementptr inbounds i32, ptr %ptr, i64 %idx
+  ret ptr %gep
+}
+
+define ptr @gep_inbounds_sext_add_not_nonneg_2(ptr %ptr, i32 %a) {
+; CHECK-LABEL: define ptr @gep_inbounds_sext_add_not_nonneg_2(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[A]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 40
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %add = add nsw i32 %a, 10
+  %idx = sext i32 %add to i64
+  %gep = getelementptr inbounds i32, ptr %ptr, i64 %idx
+  ret ptr %gep
+}
+
+define ptr @gep_not_inbounds_sext_add_nonneg(ptr %ptr, i32 %a) {
+; CHECK-LABEL: define ptr @gep_not_inbounds_sext_add_nonneg(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[A_NNEG:%.*]] = icmp sgt i32 [[A]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT:    [[TMP1:%.*]] = zext nneg i32 [[A]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP1]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 40
+; CHECK-NEXT:    ret ptr [[GEP]]
+;
+  %a.nneg = icmp sgt i32 %a, -1
+  call void @llvm.assume(i1 %a.nneg)
+  %add = add nsw i32 %a, 10
+  %idx = sext i32 %add to i64
+  %gep = getelementptr i32, ptr %ptr, i64 %idx
+  ret ptr %gep
+}
diff --git a/llvm/test/Transforms/InstCombine/fneg.ll b/llvm/test/Transforms/InstCombine/fneg.ll
index 7c9289c44711..3c4088832fea 100644
--- a/llvm/test/Transforms/InstCombine/fneg.ll
+++ b/llvm/test/Transforms/InstCombine/fneg.ll
@@ -980,7 +980,7 @@ define float @fneg_ldexp_contract(float %x, i32 %n) {
 define float @fneg_ldexp_metadata(float %x, i32 %n) {
 ; CHECK-LABEL: @fneg_ldexp_metadata(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fneg float [[X:%.*]]
-; CHECK-NEXT:    [[NEG:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP1]], i32 [[N:%.*]]), !arst !0
+; CHECK-NEXT:    [[NEG:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP1]], i32 [[N:%.*]]), !arst [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret float [[NEG]]
 ;
   %ldexp = call float @llvm.ldexp.f32.i32(float %x, i32 %n), !arst !0
@@ -988,4 +988,125 @@ define float @fneg_ldexp_metadata(float %x, i32 %n) {
   ret float %neg
 }
 
+define float @test_fneg_select_constants(i1 %cond) {
+; CHECK-LABEL: @test_fneg_select_constants(
+; CHECK-NEXT:    [[NEG:%.*]] = select i1 [[COND:%.*]], float -0.000000e+00, float 0.000000e+00
+; CHECK-NEXT:    ret float [[NEG]]
+;
+  %sel1 = select i1 %cond, float 0.0, float -0.0
+  %neg = fneg float %sel1
+  ret float %neg
+}
+
+define <2 x float> @test_fneg_vec(<2 x i1> %cond) {
+; CHECK-LABEL: @test_fneg_vec(
+; CHECK-NEXT:    [[NEG:%.*]] = select <2 x i1> [[COND:%.*]], <2 x float> <float -0.000000e+00, float 0.000000e+00>, <2 x float> <float 0.000000e+00, float -0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[NEG]]
+;
+  %sel1 = select <2 x i1> %cond, <2 x float> <float 0.0, float -0.0>, <2 x float> <float -0.0, float 0.0>
+  %neg = fneg <2 x float> %sel1
+  ret <2 x float> %neg
+}
+
+define float @test_fneg_select_var_constant(i1 %cond, float %x) {
+; CHECK-LABEL: @test_fneg_select_var_constant(
+; CHECK-NEXT:    [[X_NEG:%.*]] = fneg float [[X:%.*]]
+; CHECK-NEXT:    [[NEG:%.*]] = select i1 [[COND:%.*]], float [[X_NEG]], float 0.000000e+00
+; CHECK-NEXT:    ret float [[NEG]]
+;
+  %sel1 = select i1 %cond, float %x, float -0.0
+  %neg = fneg float %sel1
+  ret float %neg
+}
+
+; nsz can be preserved.
+
+define float @test_fneg_select_var_constant_fmf1(i1 %cond, float %x) {
+; CHECK-LABEL: @test_fneg_select_var_constant_fmf1(
+; CHECK-NEXT:    [[X_NEG:%.*]] = fneg float [[X:%.*]]
+; CHECK-NEXT:    [[NEG:%.*]] = select nnan ninf nsz i1 [[COND:%.*]], float [[X_NEG]], float -1.000000e+00
+; CHECK-NEXT:    ret float [[NEG]]
+;
+  %sel1 = select nnan ninf nsz i1 %cond, float %x, float 1.0
+  %neg = fneg float %sel1
+  ret float %neg
+}
+
+define float @test_fneg_select_var_constant_fmf2(i1 %cond, float %x) {
+; CHECK-LABEL: @test_fneg_select_var_constant_fmf2(
+; CHECK-NEXT:    [[X_NEG:%.*]] = fneg nnan ninf nsz float [[X:%.*]]
+; CHECK-NEXT:    [[NEG:%.*]] = select nnan ninf nsz i1 [[COND:%.*]], float [[X_NEG]], float -1.000000e+00
+; CHECK-NEXT:    ret float [[NEG]]
+;
+  %sel1 = select i1 %cond, float %x, float 1.0
+  %neg = fneg nnan ninf nsz float %sel1
+  ret float %neg
+}
+
+define float @test_fneg_select_constant_var(i1 %cond, float %x) {
+; CHECK-LABEL: @test_fneg_select_constant_var(
+; CHECK-NEXT:    [[X_NEG:%.*]] = fneg float [[X:%.*]]
+; CHECK-NEXT:    [[NEG:%.*]] = select i1 [[COND:%.*]], float -0.000000e+00, float [[X_NEG]]
+; CHECK-NEXT:    ret float [[NEG]]
+;
+  %sel1 = select i1 %cond, float 0.0, float %x
+  %neg = fneg float %sel1
+  ret float %neg
+}
+
+; Make sure nabs is generated.
+
+define float @test_fneg_select_abs(i1 %cond, float %x) {
+; CHECK-LABEL: @test_fneg_select_abs(
+; CHECK-NEXT:    [[ABSX:%.*]] = call float @llvm.fabs.f32(float [[X:%.*]])
+; CHECK-NEXT:    [[ABSX_NEG:%.*]] = fneg float [[ABSX]]
+; CHECK-NEXT:    [[NEG:%.*]] = select i1 [[COND:%.*]], float -0.000000e+00, float [[ABSX_NEG]]
+; CHECK-NEXT:    ret float [[NEG]]
+;
+  %absx = call float @llvm.fabs.f32(float %x)
+  %sel1 = select i1 %cond, float 0.0, float %absx
+  %neg = fneg float %sel1
+  ret float %neg
+}
+
+define float @test_fneg_fabs_select(i1 %cond, float %x) {
+; CHECK-LABEL: @test_fneg_fabs_select(
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X:%.*]])
+; CHECK-NEXT:    [[DOTNEG:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    [[NEG:%.*]] = select i1 [[COND:%.*]], float -0.000000e+00, float [[DOTNEG]]
+; CHECK-NEXT:    ret float [[NEG]]
+;
+  %sel1 = select i1 %cond, float 0.0, float %x
+  %abs = call float @llvm.fabs.f32(float %sel1)
+  %neg = fneg float %abs
+  ret float %neg
+}
+
+define float @test_fneg_select_constant_var_multiuse(i1 %cond, float %x) {
+; CHECK-LABEL: @test_fneg_select_constant_var_multiuse(
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[COND:%.*]], float 0.000000e+00, float [[X:%.*]]
+; CHECK-NEXT:    call void @use(float [[SEL1]])
+; CHECK-NEXT:    [[NEG:%.*]] = fneg float [[SEL1]]
+; CHECK-NEXT:    ret float [[NEG]]
+;
+  %sel1 = select i1 %cond, float 0.0, float %x
+  call void @use(float %sel1)
+  %neg = fneg float %sel1
+  ret float %neg
+}
+
+; Don't break fmax idioms.
+
+define float @test_fneg_select_maxnum(float %x) {
+; CHECK-LABEL: @test_fneg_select_maxnum(
+; CHECK-NEXT:    [[SEL1:%.*]] = call nnan nsz float @llvm.maxnum.f32(float [[X:%.*]], float 1.000000e+00)
+; CHECK-NEXT:    [[NEG:%.*]] = fneg float [[SEL1]]
+; CHECK-NEXT:    ret float [[NEG]]
+;
+  %cmp1 = fcmp ogt float %x, 1.0
+  %sel1 = select nnan nsz i1 %cmp1, float %x, float 1.0
+  %neg = fneg float %sel1
+  ret float %neg
+}
+
 !0 = !{}
diff --git a/llvm/test/Transforms/InstCombine/gep-vector.ll b/llvm/test/Transforms/InstCombine/gep-vector.ll
index 3465c620b70f..f058338fbf7c 100644
--- a/llvm/test/Transforms/InstCombine/gep-vector.ll
+++ b/llvm/test/Transforms/InstCombine/gep-vector.ll
@@ -127,7 +127,10 @@ define ptr addrspace(3) @inbounds_bitcast_vec_to_array_addrspace_matching_alloc_
 
 define ptr @test_accumulate_constant_offset_vscale_nonzero(<vscale x 16 x i1> %pg, ptr %base) {
 ; CHECK-LABEL: @test_accumulate_constant_offset_vscale_nonzero(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 1, i64 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[GEP_OFFS:%.*]] = or disjoint i64 [[TMP2]], 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[GEP_OFFS]]
 ; CHECK-NEXT:    ret ptr [[GEP]]
 ;
   %gep = getelementptr <vscale x 16 x i8>, ptr %base, i64 1, i64 4
diff --git a/llvm/test/Transforms/InstCombine/gepofconstgepi8.ll b/llvm/test/Transforms/InstCombine/gepofconstgepi8.ll
new file mode 100644
index 000000000000..4c8c56a9262e
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/gepofconstgepi8.ll
@@ -0,0 +1,295 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -S -passes=instcombine | FileCheck %s
+
+declare void @use64(i64)
+declare void @useptr(ptr)
+
+define ptr @test_zero(ptr %base, i64 %a) {
+; CHECK-LABEL: define ptr @test_zero(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[A]]
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  %index = add i64 %a, 1
+  %p2 = getelementptr i32, ptr %p1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_nonzero(ptr %base, i64 %a) {
+; CHECK-LABEL: define ptr @test_nonzero(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[BASE]], i64 4
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr [[TMP0]], i64 [[A]]
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  %index = add i64 %a, 2
+  %p2 = getelementptr i32, ptr %p1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_or_disjoint(ptr %base, i64 %a) {
+; CHECK-LABEL: define ptr @test_or_disjoint(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[A]]
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  %index = or disjoint i64 %a, 1
+  %p2 = getelementptr i32, ptr %p1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_zero_multiuse_index(ptr %base, i64 %a) {
+; CHECK-LABEL: define ptr @test_zero_multiuse_index(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INDEX:%.*]] = add i64 [[A]], 1
+; CHECK-NEXT:    call void @use64(i64 [[INDEX]])
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[A]]
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  %index = add i64 %a, 1
+  call void @use64(i64 %index)
+  %p2 = getelementptr i32, ptr %p1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_zero_multiuse_ptr(ptr %base, i64 %a) {
+; CHECK-LABEL: define ptr @test_zero_multiuse_ptr(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 -4
+; CHECK-NEXT:    call void @useptr(ptr [[P1]])
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[A]]
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  call void @useptr(ptr %p1)
+  %index = add i64 %a, 1
+  %p2 = getelementptr i32, ptr %p1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_zero_sext_add_nsw(ptr %base, i32 %a) {
+; CHECK-LABEL: define ptr @test_zero_sext_add_nsw(
+; CHECK-SAME: ptr [[BASE:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 -4
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[A]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[P1]], i64 [[TMP0]]
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 4
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  %index = add nsw i32 %a, 1
+  %p2 = getelementptr i32, ptr %p1, i32 %index
+  ret ptr %p2
+}
+
+define ptr @test_zero_trunc_add(ptr %base, i128 %a) {
+; CHECK-LABEL: define ptr @test_zero_trunc_add(
+; CHECK-SAME: ptr [[BASE:%.*]], i128 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i128 [[A]] to i64
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  %index = add i128 %a, 1
+  %p2 = getelementptr i32, ptr %p1, i128 %index
+  ret ptr %p2
+}
+
+define ptr @test_non_i8(ptr %base, i64 %a) {
+; CHECK-LABEL: define ptr @test_non_i8(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 -4
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[P1]], i64 [[A]]
+; CHECK-NEXT:    ret ptr [[TMP0]]
+;
+entry:
+  %p1 = getelementptr i16, ptr %base, i64 -4
+  %index = add i64 %a, 1
+  %p2 = getelementptr i32, ptr %p1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_non_const(ptr %base, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @test_non_const(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[B]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[P1]], i64 [[A]]
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 %b
+  %index = add i64 %a, 1
+  %p2 = getelementptr i32, ptr %p1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_too_many_indices(ptr %base, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @test_too_many_indices(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[B]]
+; CHECK-NEXT:    [[INDEX:%.*]] = add i64 [[A]], 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr [8 x i32], ptr [[P1]], i64 1, i64 [[INDEX]]
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 %b
+  %index = add i64 %a, 1
+  %p2 = getelementptr [8 x i32], ptr %p1, i64 1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_wrong_op(ptr %base, i64 %a) {
+; CHECK-LABEL: define ptr @test_wrong_op(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 -4
+; CHECK-NEXT:    [[INDEX:%.*]] = xor i64 [[A]], 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  %index = xor i64 %a, 1
+  %p2 = getelementptr i32, ptr %p1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_sext_add_without_nsw(ptr %base, i32 %a) {
+; CHECK-LABEL: define ptr @test_sext_add_without_nsw(
+; CHECK-SAME: ptr [[BASE:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 -4
+; CHECK-NEXT:    [[INDEX:%.*]] = add i32 [[A]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr [[P1]], i64 [[TMP0]]
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  %index = add i32 %a, 1
+  %p2 = getelementptr i32, ptr %p1, i32 %index
+  ret ptr %p2
+}
+
+define ptr @test_or_without_disjoint(ptr %base, i64 %a) {
+; CHECK-LABEL: define ptr @test_or_without_disjoint(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 -4
+; CHECK-NEXT:    [[INDEX:%.*]] = or i64 [[A]], 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  %index = or i64 %a, 1
+  %p2 = getelementptr i32, ptr %p1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_smul_overflow(ptr %base, i64 %a) {
+; CHECK-LABEL: define ptr @test_smul_overflow(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 -12
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[P1]], i64 [[A]]
+; CHECK-NEXT:    ret ptr [[TMP0]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  %index = add i64 %a, 9223372036854775806
+  %p2 = getelementptr i32, ptr %p1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_sadd_overflow(ptr %base, i64 %a) {
+; CHECK-LABEL: define ptr @test_sadd_overflow(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 -9223372036854775808
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[P1]], i64 [[A]]
+; CHECK-NEXT:    ret ptr [[TMP0]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 9223372036854775804
+  %index = add i64 %a, 1
+  %p2 = getelementptr i32, ptr %p1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_nonzero_multiuse_index(ptr %base, i64 %a) {
+; CHECK-LABEL: define ptr @test_nonzero_multiuse_index(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 -4
+; CHECK-NEXT:    [[INDEX:%.*]] = add i64 [[A]], 2
+; CHECK-NEXT:    call void @use64(i64 [[INDEX]])
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  %index = add i64 %a, 2
+  call void @use64(i64 %index)
+  %p2 = getelementptr i32, ptr %p1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_nonzero_multiuse_ptr(ptr %base, i64 %a) {
+; CHECK-LABEL: define ptr @test_nonzero_multiuse_ptr(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 -4
+; CHECK-NEXT:    call void @useptr(ptr [[P1]])
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[P1]], i64 [[A]]
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  call void @useptr(ptr %p1)
+  %index = add i64 %a, 2
+  %p2 = getelementptr i32, ptr %p1, i64 %index
+  ret ptr %p2
+}
+
+define ptr @test_scalable(ptr %base, i64 %a) {
+; CHECK-LABEL: define ptr @test_scalable(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[BASE]], i64 -4
+; CHECK-NEXT:    [[INDEX:%.*]] = add i64 [[A]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT:    [[P2_IDX:%.*]] = mul i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P1]], i64 [[P2_IDX]]
+; CHECK-NEXT:    ret ptr [[P2]]
+;
+entry:
+  %p1 = getelementptr i8, ptr %base, i64 -4
+  %index = add i64 %a, 1
+  %p2 = getelementptr <vscale x 4 x i32>, ptr %p1, i64 %index
+  ret ptr %p2
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll
index a0e03a5428e6..29d0c941ac5c 100644
--- a/llvm/test/Transforms/InstCombine/icmp-gep.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll
@@ -472,7 +472,14 @@ define void @test60_extra_use_fold(ptr %foo, i64 %start.idx, i64 %end.offset) {
 
 define i1 @test_scalable_same(ptr %x) {
 ; CHECK-LABEL: @test_scalable_same(
-; CHECK-NEXT:    ret i1 false
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[A_IDX:%.*]] = shl i64 [[TMP1]], 5
+; CHECK-NEXT:    [[A:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 [[A_IDX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[B_IDX:%.*]] = shl i64 [[TMP2]], 5
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[B_IDX]]
+; CHECK-NEXT:    [[C:%.*]] = icmp ugt ptr [[A]], [[B]]
+; CHECK-NEXT:    ret i1 [[C]]
 ;
   %a = getelementptr <vscale x 4 x i8>, ptr %x, i64 8
   %b = getelementptr inbounds <vscale x 4 x i8>, ptr %x, i64 8
@@ -507,11 +514,11 @@ define i1 @test_scalable_xc(ptr %x) {
 define i1 @test_scalable_xy(ptr %foo, i64 %i, i64 %j) {
 ; CHECK-LABEL: @test_scalable_xy(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 2
-; CHECK-NEXT:    [[GEP2_IDX:%.*]] = mul nsw i64 [[TMP2]], [[J:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[GEP1_IDX:%.*]] = mul nsw i64 [[TMP2]], [[I:%.*]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 4
-; CHECK-NEXT:    [[GEP1_IDX:%.*]] = mul nsw i64 [[TMP4]], [[I:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 2
+; CHECK-NEXT:    [[GEP2_IDX:%.*]] = mul nsw i64 [[TMP4]], [[J:%.*]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[GEP2_IDX]], [[GEP1_IDX]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
index 7f616bbb2a83..a61694919ab0 100644
--- a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
@@ -268,3 +268,328 @@ define i1 @icmp_trunc_x_zext_y_fail_multiuse(i32 %x, i8 %y) {
   %r = icmp ule i16 %x16, %y16
   ret i1 %r
 }
+
+define i1 @trunc_unsigned_nuw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_unsigned_nuw(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[YT:%.*]] = trunc nuw i16 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw i16 %x to i8
+  %yt = trunc nuw i16 %y to i8
+  %c = icmp ult i8 %xt, %yt
+  ret i1 %c
+}
+
+define i1 @trunc_unsigned_nsw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_unsigned_nsw(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nsw i16 %x to i8
+  %yt = trunc nsw i16 %y to i8
+  %c = icmp ult i8 %xt, %yt
+  ret i1 %c
+}
+
+define i1 @trunc_unsigned_both(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_unsigned_both(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[YT:%.*]] = trunc nuw nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw nsw i16 %x to i8
+  %yt = trunc nuw nsw i16 %y to i8
+  %c = icmp ult i8 %xt, %yt
+  ret i1 %c
+}
+
+define i1 @trunc_unsigned_either(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_unsigned_either(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw i16 %x to i8
+  %yt = trunc nsw i16 %y to i8
+  %c = icmp ult i8 %xt, %yt
+  ret i1 %c
+}
+
+define i1 @trunc_signed_nuw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_signed_nuw(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[YT:%.*]] = trunc nuw i16 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw i16 %x to i8
+  %yt = trunc nuw i16 %y to i8
+  %c = icmp slt i8 %xt, %yt
+  ret i1 %c
+}
+
+define i1 @trunc_signed_nsw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_signed_nsw(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nsw i16 %x to i8
+  %yt = trunc nsw i16 %y to i8
+  %c = icmp slt i8 %xt, %yt
+  ret i1 %c
+}
+
+define i1 @trunc_signed_both(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_signed_both(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[YT:%.*]] = trunc nuw nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw nsw i16 %x to i8
+  %yt = trunc nuw nsw i16 %y to i8
+  %c = icmp slt i8 %xt, %yt
+  ret i1 %c
+}
+
+define i1 @trunc_signed_either(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_signed_either(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw i16 %x to i8
+  %yt = trunc nsw i16 %y to i8
+  %c = icmp slt i8 %xt, %yt
+  ret i1 %c
+}
+
+define i1 @trunc_equality_nuw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_equality_nuw(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[YT:%.*]] = trunc nuw i16 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw i16 %x to i8
+  %yt = trunc nuw i16 %y to i8
+  %c = icmp eq i8 %xt, %yt
+  ret i1 %c
+}
+
+define i1 @trunc_equality_nsw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_equality_nsw(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nsw i16 %x to i8
+  %yt = trunc nsw i16 %y to i8
+  %c = icmp eq i8 %xt, %yt
+  ret i1 %c
+}
+
+define i1 @trunc_equality_both(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_equality_both(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[YT:%.*]] = trunc nuw nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw nsw i16 %x to i8
+  %yt = trunc nuw nsw i16 %y to i8
+  %c = icmp eq i8 %xt, %yt
+  ret i1 %c
+}
+
+define i1 @trunc_equality_either(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_equality_either(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT:    [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw i16 %x to i8
+  %yt = trunc nsw i16 %y to i8
+  %c = icmp eq i8 %xt, %yt
+  ret i1 %c
+}
+
+define i1 @trunc_unsigned_nuw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_unsigned_nuw_zext(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw i32 %x to i16
+  %ye = zext i8 %y to i16
+  %c = icmp ult i16 %xt, %ye
+  ret i1 %c
+}
+
+define i1 @trunc_unsigned_nuw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_unsigned_nuw_sext(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw i32 %x to i16
+  %ye = sext i8 %y to i16
+  %c = icmp ult i16 %xt, %ye
+  ret i1 %c
+}
+
+define i1 @trunc_unsigned_nsw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_unsigned_nsw_zext(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nsw i32 %x to i16
+  %ye = zext i8 %y to i16
+  %c = icmp ult i16 %xt, %ye
+  ret i1 %c
+}
+
+define i1 @trunc_unsigned_nsw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_unsigned_nsw_sext(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nsw i32 %x to i16
+  %ye = sext i8 %y to i16
+  %c = icmp ult i16 %xt, %ye
+  ret i1 %c
+}
+
+define i1 @trunc_signed_nsw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_signed_nsw_sext(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nsw i32 %x to i16
+  %ye = sext i8 %y to i16
+  %c = icmp slt i16 %xt, %ye
+  ret i1 %c
+}
+
+define i1 @trunc_signed_nsw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_signed_nsw_zext(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nsw i32 %x to i16
+  %ye = zext i8 %y to i16
+  %c = icmp slt i16 %xt, %ye
+  ret i1 %c
+}
+
+define i1 @trunc_signed_nuw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_signed_nuw_sext(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw i32 %x to i16
+  %ye = sext i8 %y to i16
+  %c = icmp slt i16 %xt, %ye
+  ret i1 %c
+}
+
+define i1 @trunc_signed_nuw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_signed_nuw_zext(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw i32 %x to i16
+  %ye = zext i8 %y to i16
+  %c = icmp slt i16 %xt, %ye
+  ret i1 %c
+}
+
+define i1 @trunc_equality_nuw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_nuw_zext(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw i32 %x to i16
+  %ye = zext i8 %y to i16
+  %c = icmp ne i16 %xt, %ye
+  ret i1 %c
+}
+
+define i1 @trunc_equality_nuw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_nuw_sext(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw i32 %x to i16
+  %ye = sext i8 %y to i16
+  %c = icmp ne i16 %xt, %ye
+  ret i1 %c
+}
+
+define i1 @trunc_equality_nsw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_nsw_zext(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nsw i32 %x to i16
+  %ye = zext i8 %y to i16
+  %c = icmp ne i16 %xt, %ye
+  ret i1 %c
+}
+
+define i1 @trunc_equality_nsw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_nsw_sext(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nsw i32 %x to i16
+  %ye = sext i8 %y to i16
+  %c = icmp ne i16 %xt, %ye
+  ret i1 %c
+}
+
+define i1 @trunc_equality_both_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_both_sext(
+; CHECK-NEXT:    [[XT:%.*]] = trunc nuw nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %xt = trunc nuw nsw i32 %x to i16
+  %ye = sext i8 %y to i16
+  %c = icmp ne i16 %xt, %ye
+  ret i1 %c
+}
diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll
index 227ca4a6d5cf..4fb3c0b1ad49 100644
--- a/llvm/test/Transforms/InstCombine/mul.ll
+++ b/llvm/test/Transforms/InstCombine/mul.ll
@@ -2061,8 +2061,8 @@ define i32 @mul_sext_icmp_with_zero(i32 %x) {
 
 define i32 @test_mul_sext_bool(i1 %x, i32 %y) {
 ; CHECK-LABEL: @test_mul_sext_bool(
-; CHECK-NEXT:    [[Y_NEG:%.*]] = sub i32 0, [[Y:%.*]]
-; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[TMP1]], i32 0
 ; CHECK-NEXT:    ret i32 [[MUL]]
 ;
   %sext = sext i1 %x to i32
@@ -2072,8 +2072,8 @@ define i32 @test_mul_sext_bool(i1 %x, i32 %y) {
 
 define i32 @test_mul_sext_bool_nuw(i1 %x, i32 %y) {
 ; CHECK-LABEL: @test_mul_sext_bool_nuw(
-; CHECK-NEXT:    [[Y_NEG:%.*]] = sub i32 0, [[Y:%.*]]
-; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[TMP1]], i32 0
 ; CHECK-NEXT:    ret i32 [[MUL]]
 ;
   %sext = sext i1 %x to i32
@@ -2083,8 +2083,8 @@ define i32 @test_mul_sext_bool_nuw(i1 %x, i32 %y) {
 
 define i32 @test_mul_sext_bool_nsw(i1 %x, i32 %y) {
 ; CHECK-LABEL: @test_mul_sext_bool_nsw(
-; CHECK-NEXT:    [[Y_NEG:%.*]] = sub nsw i32 0, [[Y:%.*]]
-; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[TMP1]], i32 0
 ; CHECK-NEXT:    ret i32 [[MUL]]
 ;
   %sext = sext i1 %x to i32
@@ -2094,8 +2094,8 @@ define i32 @test_mul_sext_bool_nsw(i1 %x, i32 %y) {
 
 define i32 @test_mul_sext_bool_nuw_nsw(i1 %x, i32 %y) {
 ; CHECK-LABEL: @test_mul_sext_bool_nuw_nsw(
-; CHECK-NEXT:    [[Y_NEG:%.*]] = sub nsw i32 0, [[Y:%.*]]
-; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[TMP1]], i32 0
 ; CHECK-NEXT:    ret i32 [[MUL]]
 ;
   %sext = sext i1 %x to i32
@@ -2106,8 +2106,8 @@ define i32 @test_mul_sext_bool_nuw_nsw(i1 %x, i32 %y) {
 define i32 @test_mul_sext_bool_commuted(i1 %x, i32 %y) {
 ; CHECK-LABEL: @test_mul_sext_bool_commuted(
 ; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[Y:%.*]], -2
-; CHECK-NEXT:    [[YY_NEG1:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[YY_NEG1]], i32 0
+; CHECK-NEXT:    [[YY_NEG:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[YY_NEG]], i32 0
 ; CHECK-NEXT:    ret i32 [[MUL]]
 ;
   %yy = xor i32 %y, 1
@@ -2139,3 +2139,63 @@ define i32 @test_mul_sext_multiuse(i1 %x, i32 %y) {
   %mul = mul i32 %sext, %y
   ret i32 %mul
 }
+
+define i8 @mul_nsw_nonneg(i8 %x, i8 %y) {
+; CHECK-LABEL: @mul_nsw_nonneg(
+; CHECK-NEXT:    [[X_NNEG:%.*]] = icmp sgt i8 [[X:%.*]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[X_NNEG]])
+; CHECK-NEXT:    [[Y_NNEG:%.*]] = icmp sgt i8 [[Y:%.*]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[Y_NNEG]])
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i8 [[X]], [[Y]]
+; CHECK-NEXT:    ret i8 [[MUL]]
+;
+  %x.nneg = icmp sge i8 %x, 0
+  call void @llvm.assume(i1 %x.nneg)
+  %y.nneg = icmp sge i8 %y, 0
+  call void @llvm.assume(i1 %y.nneg)
+  %mul = mul nsw i8 %x, %y
+  ret i8 %mul
+}
+
+define i8 @mul_nsw_not_nonneg1(i8 %x, i8 %y) {
+; CHECK-LABEL: @mul_nsw_not_nonneg1(
+; CHECK-NEXT:    [[Y_NNEG:%.*]] = icmp sgt i8 [[Y:%.*]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[Y_NNEG]])
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i8 [[X:%.*]], [[Y]]
+; CHECK-NEXT:    ret i8 [[MUL]]
+;
+  %y.nneg = icmp sge i8 %y, 0
+  call void @llvm.assume(i1 %y.nneg)
+  %mul = mul nsw i8 %x, %y
+  ret i8 %mul
+}
+
+define i8 @mul_nsw_not_nonneg2(i8 %x, i8 %y) {
+; CHECK-LABEL: @mul_nsw_not_nonneg2(
+; CHECK-NEXT:    [[X_NNEG:%.*]] = icmp sgt i8 [[X:%.*]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[X_NNEG]])
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i8 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[MUL]]
+;
+  %x.nneg = icmp sge i8 %x, 0
+  call void @llvm.assume(i1 %x.nneg)
+  %mul = mul nsw i8 %x, %y
+  ret i8 %mul
+}
+
+define i8 @mul_not_nsw_nonneg(i8 %x, i8 %y) {
+; CHECK-LABEL: @mul_not_nsw_nonneg(
+; CHECK-NEXT:    [[X_NNEG:%.*]] = icmp sgt i8 [[X:%.*]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[X_NNEG]])
+; CHECK-NEXT:    [[Y_NNEG:%.*]] = icmp sgt i8 [[Y:%.*]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[Y_NNEG]])
+; CHECK-NEXT:    [[MUL:%.*]] = mul i8 [[X]], [[Y]]
+; CHECK-NEXT:    ret i8 [[MUL]]
+;
+  %x.nneg = icmp sge i8 %x, 0
+  call void @llvm.assume(i1 %x.nneg)
+  %y.nneg = icmp sge i8 %y, 0
+  call void @llvm.assume(i1 %y.nneg)
+  %mul = mul i8 %x, %y
+  ret i8 %mul
+}
diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
index 4d38e2cd37c9..5cc444042e17 100644
--- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll
+++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
@@ -289,7 +289,9 @@ define ptr @geps_combinable_different_elem_type_extra_use2(ptr %a, i64 %idx) {
 
 define ptr @geps_combinable_scalable(ptr %a, i64 %idx) {
 ; CHECK-LABEL: @geps_combinable_scalable(
-; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds <vscale x 2 x i32>, ptr [[A:%.*]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 3
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[A3:%.*]] = getelementptr inbounds i8, ptr [[A2]], i64 4
 ; CHECK-NEXT:    ret ptr [[A3]]
 ;
@@ -300,7 +302,9 @@ define ptr @geps_combinable_scalable(ptr %a, i64 %idx) {
 
 define ptr @geps_combinable_scalable_vector_array(ptr %a, i64 %idx) {
 ; CHECK-LABEL: @geps_combinable_scalable_vector_array(
-; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds [4 x <vscale x 2 x i32>], ptr [[A:%.*]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 5
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[A3:%.*]] = getelementptr inbounds i8, ptr [[A2]], i64 4
 ; CHECK-NEXT:    ret ptr [[A3]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/scalable-vector-array.ll b/llvm/test/Transforms/InstCombine/scalable-vector-array.ll
index d03184766b4a..20e9f2d99dd9 100644
--- a/llvm/test/Transforms/InstCombine/scalable-vector-array.ll
+++ b/llvm/test/Transforms/InstCombine/scalable-vector-array.ll
@@ -4,7 +4,9 @@
 define <vscale x 4 x i32> @load(ptr %x) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @load
 ; CHECK-SAME: (ptr [[X:%.*]]) {
-; CHECK-NEXT:    [[A_ELT1:%.*]] = getelementptr inbounds [2 x <vscale x 4 x i32>], ptr [[X]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[A_ELT1:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[A_UNPACK2:%.*]] = load <vscale x 4 x i32>, ptr [[A_ELT1]], align 16
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[A_UNPACK2]]
 ;
@@ -17,7 +19,9 @@ define void @store(ptr %x, <vscale x 4 x i32> %y, <vscale x 4 x i32> %z) {
 ; CHECK-LABEL: define void @store
 ; CHECK-SAME: (ptr [[X:%.*]], <vscale x 4 x i32> [[Y:%.*]], <vscale x 4 x i32> [[Z:%.*]]) {
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[Y]], ptr [[X]], align 16
-; CHECK-NEXT:    [[X_REPACK1:%.*]] = getelementptr inbounds [2 x <vscale x 4 x i32>], ptr [[X]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[X_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]]
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[Z]], ptr [[X_REPACK1]], align 16
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll
index 5692202fcb4c..8da52e074637 100644
--- a/llvm/test/Transforms/InstCombine/shift.ll
+++ b/llvm/test/Transforms/InstCombine/shift.ll
@@ -366,7 +366,7 @@ define i32 @test26(i32 %A) {
   ret i32 %D
 }
 
-define i1 @test27(i32 %x) nounwind {
+define i1 @test27(i32 %x) {
 ; CHECK-LABEL: @test27(
 ; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[X:%.*]], 8
 ; CHECK-NEXT:    [[Z:%.*]] = icmp ne i32 [[TMP1]], 0
@@ -605,7 +605,7 @@ define <2 x i32> @shl_nuw_nsw_splat_vec(<2 x i8> %x) {
   ret <2 x i32> %t3
 }
 
-define i32 @test38(i32 %x) nounwind readnone {
+define i32 @test38(i32 %x) {
 ; CHECK-LABEL: @test38(
 ; CHECK-NEXT:    [[REM1:%.*]] = and i32 [[X:%.*]], 31
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i32 1, [[REM1]]
@@ -616,7 +616,7 @@ define i32 @test38(i32 %x) nounwind readnone {
   ret i32 %shl
 }
 
-define <2 x i32> @test38_uniform(<2 x i32> %x) nounwind readnone {
+define <2 x i32> @test38_uniform(<2 x i32> %x) {
 ; CHECK-LABEL: @test38_uniform(
 ; CHECK-NEXT:    [[REM1:%.*]] = and <2 x i32> [[X:%.*]], <i32 31, i32 31>
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw <2 x i32> <i32 1, i32 1>, [[REM1]]
@@ -627,7 +627,7 @@ define <2 x i32> @test38_uniform(<2 x i32> %x) nounwind readnone {
   ret <2 x i32> %shl
 }
 
-define <3 x i32> @test38_nonuniform(<3 x i32> %x) nounwind readnone {
+define <3 x i32> @test38_nonuniform(<3 x i32> %x) {
 ; CHECK-LABEL: @test38_nonuniform(
 ; CHECK-NEXT:    [[REM1:%.*]] = and <3 x i32> [[X:%.*]], <i32 31, i32 15, i32 0>
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw <3 x i32> <i32 1, i32 1, i32 1>, [[REM1]]
@@ -638,7 +638,7 @@ define <3 x i32> @test38_nonuniform(<3 x i32> %x) nounwind readnone {
   ret <3 x i32> %shl
 }
 
-define <2 x i32> @test38_poison(<2 x i32> %x) nounwind readnone {
+define <2 x i32> @test38_poison(<2 x i32> %x) {
 ; CHECK-LABEL: @test38_poison(
 ; CHECK-NEXT:    ret <2 x i32> poison
 ;
@@ -658,8 +658,8 @@ define i8 @test39(i32 %a0) {
 ; CHECK-NEXT:    [[I51:%.*]] = xor i8 [[I50]], [[I5]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = lshr exact i8 [[I5]], 3
 ; CHECK-NEXT:    [[I54:%.*]] = and i8 [[TMP0]], 16
-; CHECK-NEXT:    [[I551:%.*]] = or disjoint i8 [[I54]], [[I51]]
-; CHECK-NEXT:    ret i8 [[I551]]
+; CHECK-NEXT:    [[I55:%.*]] = or disjoint i8 [[I54]], [[I51]]
+; CHECK-NEXT:    ret i8 [[I55]]
 ;
 entry:
   %i4 = trunc i32 %a0 to i8
@@ -675,7 +675,7 @@ entry:
   ret i8 %i55
 }
 
-define i32 @test42(i32 %a, i32 %b) nounwind {
+define i32 @test42(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test42(
 ; CHECK-NEXT:    [[DIV:%.*]] = lshr exact i32 4096, [[B:%.*]]
 ; CHECK-NEXT:    [[DIV2:%.*]] = udiv i32 [[A:%.*]], [[DIV]]
@@ -697,7 +697,7 @@ define <2 x i32> @test42vec(<2 x i32> %a, <2 x i32> %b) {
   ret <2 x i32> %div2
 }
 
-define i32 @test43(i32 %a, i32 %b) nounwind {
+define i32 @test43(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test43(
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[B:%.*]], 12
 ; CHECK-NEXT:    [[DIV21:%.*]] = lshr i32 [[A:%.*]], [[TMP1]]
@@ -708,7 +708,7 @@ define i32 @test43(i32 %a, i32 %b) nounwind {
   ret i32 %div2
 }
 
-define i32 @test44(i32 %a) nounwind {
+define i32 @test44(i32 %a) {
 ; CHECK-LABEL: @test44(
 ; CHECK-NEXT:    [[Y:%.*]] = shl i32 [[A:%.*]], 5
 ; CHECK-NEXT:    ret i32 [[Y]]
@@ -718,7 +718,20 @@ define i32 @test44(i32 %a) nounwind {
   ret i32 %z
 }
 
-define i32 @test45(i32 %a) nounwind {
+define i32 @test44_multiuse(i32 %a) {
+; CHECK-LABEL: @test44_multiuse(
+; CHECK-NEXT:    [[Y:%.*]] = shl nuw i32 [[A:%.*]], 1
+; CHECK-NEXT:    call void @use_i32(i32 [[Y]])
+; CHECK-NEXT:    [[Z:%.*]] = shl i32 [[A]], 5
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %y = shl nuw i32 %a, 1
+  call void @use_i32(i32 %y)
+  %z = shl i32 %y, 4
+  ret i32 %z
+}
+
+define i32 @test45(i32 %a) {
 ; CHECK-LABEL: @test45(
 ; CHECK-NEXT:    [[Y:%.*]] = lshr i32 [[A:%.*]], 5
 ; CHECK-NEXT:    ret i32 [[Y]]
@@ -728,6 +741,19 @@ define i32 @test45(i32 %a) nounwind {
   ret i32 %z
 }
 
+define i32 @test45_multiuse(i32 %a) {
+; CHECK-LABEL: @test45_multiuse(
+; CHECK-NEXT:    [[Y:%.*]] = lshr exact i32 [[A:%.*]], 1
+; CHECK-NEXT:    call void @use_i32(i32 [[Y]])
+; CHECK-NEXT:    [[Z:%.*]] = lshr i32 [[A]], 5
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %y = lshr exact i32 %a, 1
+  call void @use_i32(i32 %y)
+  %z = lshr i32 %y, 4
+  ret i32 %z
+}
+
 ; (X >>?exact C1) << C2 --> X >>?exact (C1-C2)
 
 define i32 @test46(i32 %a) {
@@ -1751,14 +1777,14 @@ define void @ashr_out_of_range_1(ptr %A) {
 ; CHECK-NEXT:    [[L:%.*]] = load i177, ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    [[L_FROZEN:%.*]] = freeze i177 [[L]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i177 [[L_FROZEN]], -1
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc i177 [[L_FROZEN]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i177, ptr [[A]], i64 [[TMP2]]
-; CHECK-NEXT:    [[G11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 -24
-; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[TMP1]] to i64
-; CHECK-NEXT:    [[G62:%.*]] = getelementptr i177, ptr [[G11]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i177 [[L_FROZEN]], -1
-; CHECK-NEXT:    [[B28:%.*]] = select i1 [[TMP5]], i177 0, i177 [[L_FROZEN]]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i177 [[L_FROZEN]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i177, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[G11:%.*]] = getelementptr i8, ptr [[TMP4]], i64 -24
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i1 [[TMP1]] to i64
+; CHECK-NEXT:    [[G62:%.*]] = getelementptr i177, ptr [[G11]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i177 [[L_FROZEN]], -1
+; CHECK-NEXT:    [[B28:%.*]] = select i1 [[TMP6]], i177 0, i177 [[L_FROZEN]]
 ; CHECK-NEXT:    store i177 [[B28]], ptr [[G62]], align 4
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
index a7d01b4f824d..e4fb7764ba9e 100644
--- a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
+++ b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
@@ -364,8 +364,8 @@ define nofpclass(inf) float @ret_nofpclass_inf__select_chain_inf_nan_1(i1 %cond,
 define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_ninf_rhs(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_ninf_rhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    ret float [[FABS]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %select = select i1 %cond, float %x, float 0xFFF0000000000000
   %fabs = call float @llvm.fabs.f32(float %select)
@@ -376,8 +376,8 @@ define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_ninf_rhs(i1 %cond, f
 define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_pinf_rhs(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fabs_select_pinf_rhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    ret float [[FABS]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %select = select i1 %cond, float %x, float 0x7FF0000000000000
   %fabs = call float @llvm.fabs.f32(float %select)
@@ -400,8 +400,8 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives__fabs_
 define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_no_positives__fabs_select_pinf_rhs(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_no_positives__fabs_select_pinf_rhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    ret float [[FABS]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %select = select i1 %cond, float %x, float 0x7FF0000000000000
   %fabs = call float @llvm.fabs.f32(float %select)
@@ -435,8 +435,8 @@ define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_no_positives_na
 define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[X]]
-; CHECK-NEXT:    ret float [[FNEG]]
+; CHECK-NEXT:    [[X_NEG:%.*]] = fneg float [[X]]
+; CHECK-NEXT:    ret float [[X_NEG]]
 ;
   %select = select i1 %cond, float %x, float 0xFFF0000000000000
   %fneg = fneg float %select
@@ -447,8 +447,8 @@ define nofpclass(inf) float @ret_nofpclass_inf__fneg_select_ninf_rhs(i1 %cond, f
 define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___fneg_select_pinf_rhs(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_nofpclass_nonegatives_noinf___fneg_select_pinf_rhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[X]]
-; CHECK-NEXT:    ret float [[FNEG]]
+; CHECK-NEXT:    [[X_NEG:%.*]] = fneg float [[X]]
+; CHECK-NEXT:    ret float [[X_NEG]]
 ;
   %select = select i1 %cond, float %x, float 0x7FF0000000000000
   %fneg = fneg float %select
@@ -459,8 +459,8 @@ define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___
 define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___fneg_select_ninf_lhs(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_nofpclass_nonegatives_noinf___fneg_select_ninf_lhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[X]]
-; CHECK-NEXT:    ret float [[FNEG]]
+; CHECK-NEXT:    [[X_NEG:%.*]] = fneg float [[X]]
+; CHECK-NEXT:    ret float [[X_NEG]]
 ;
   %select = select i1 %cond, float 0xFFF0000000000000, float %x
   %fneg = fneg float %select
@@ -470,8 +470,8 @@ define nofpclass(inf nnorm nsub nzero) float @ret_nofpclass_nonegatives_noinf___
 define nofpclass(pzero psub pnorm pinf) float @ret_nofpclass_nopositives___fneg_select_pinf_rhs(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives___fneg_select_pinf_rhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[X]], float 0x7FF0000000000000
-; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[SELECT]]
+; CHECK-NEXT:    [[X_NEG:%.*]] = fneg float [[X]]
+; CHECK-NEXT:    [[FNEG:%.*]] = select i1 [[COND]], float [[X_NEG]], float 0xFFF0000000000000
 ; CHECK-NEXT:    ret float [[FNEG]]
 ;
   %select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -483,9 +483,9 @@ define nofpclass(pzero psub pnorm pinf) float @ret_nofpclass_nopositives___fneg_
 define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[FABS]]
-; CHECK-NEXT:    ret float [[FNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[DOTNEG:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    ret float [[DOTNEG]]
 ;
   %select = select i1 %cond, float %x, float 0x7FF0000000000000
   %fabs = call float @llvm.fabs.f32(float %select)
@@ -497,9 +497,9 @@ define nofpclass(inf) float @ret_nofpclass_inf__fneg_fabs_select_pinf_rhs(i1 %co
 define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives__fneg_fabs_select_pinf_rhs(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_nofpclass_nonegatives__fneg_fabs_select_pinf_rhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[FABS]]
-; CHECK-NEXT:    ret float [[FNEG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[DOTNEG:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    ret float [[DOTNEG]]
 ;
   %select = select i1 %cond, float %x, float 0x7FF0000000000000
   %fabs = call float @llvm.fabs.f32(float %select)
@@ -535,8 +535,8 @@ define nofpclass(inf) float @ret_nofpclass_inf__copysign_unknown_select_pinf_rhs
 define nofpclass(inf) float @ret_nofpclass_inf__copysign_positive_select_pinf_rhs(i1 %cond, float %x) {
 ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_positive_select_pinf_rhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
-; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    ret float [[COPYSIGN]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %select = select i1 %cond, float %x, float 0x7FF0000000000000
   %copysign = call float @llvm.copysign.f32(float %select, float 1.0)
@@ -547,8 +547,8 @@ define nofpclass(inf) float @ret_nofpclass_inf__copysign_negative_select_pinf_rh
 ; CHECK-LABEL: define nofpclass(inf) float @ret_nofpclass_inf__copysign_negative_select_pinf_rhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[COPYSIGN:%.*]] = fneg float [[TMP1]]
-; CHECK-NEXT:    ret float [[COPYSIGN]]
+; CHECK-NEXT:    [[DOTNEG:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    ret float [[DOTNEG]]
 ;
   %select = select i1 %cond, float %x, float 0x7FF0000000000000
   %copysign = call float @llvm.copysign.f32(float %select, float -1.0)
@@ -627,8 +627,8 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_nonegatives_non
 define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_nopositives__copysign_fabs_select_pinf_rhs(i1 %cond, float %x, float %sign) {
 ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_nopositives__copysign_fabs_select_pinf_rhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[SIGN:%.*]]) {
-; CHECK-NEXT:    [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    ret float [[COPYSIGN]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %select = select i1 %cond, float %x, float 0x7FF0000000000000
   %fabs.sign = call float @llvm.fabs.f32(float %sign)
@@ -678,9 +678,9 @@ define nofpclass(ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives__copys
 define nofpclass(pinf pnorm psub pzero) float @ret_nofpclass_no_positives__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) {
 ; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_nofpclass_no_positives__copysign_unknown_select_pinf_rhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
-; CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[COND]], float [[TMP2]], float 0x7FF0000000000000
-; CHECK-NEXT:    [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[DOTNEG:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = select i1 [[COND]], float [[DOTNEG]], float 0xFFF0000000000000
 ; CHECK-NEXT:    ret float [[COPYSIGN]]
 ;
   %select = select i1 %cond, float %x, float 0x7FF0000000000000
@@ -705,9 +705,9 @@ define nofpclass(nan ninf nnorm nsub nzero) float @ret_nofpclass_no_negatives_no
 define nofpclass(nan pinf pnorm psub pzero) float @ret_nofpclass_no_positives_nonan__copysign_unknown_select_pinf_rhs(i1 %cond, float %x, float %unknown.sign) {
 ; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_nofpclass_no_positives_nonan__copysign_unknown_select_pinf_rhs
 ; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], float [[UNKNOWN_SIGN:%.*]]) {
-; CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[COND]], float [[TMP2]], float 0x7FF0000000000000
-; CHECK-NEXT:    [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[DOTNEG:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT:    [[COPYSIGN:%.*]] = select i1 [[COND]], float [[DOTNEG]], float 0xFFF0000000000000
 ; CHECK-NEXT:    ret float [[COPYSIGN]]
 ;
   %select = select i1 %cond, float %x, float 0x7FF0000000000000
diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll
index a84e389f13c3..32ed4a787e92 100644
--- a/llvm/test/Transforms/InstCombine/sub.ll
+++ b/llvm/test/Transforms/InstCombine/sub.ll
@@ -1123,7 +1123,8 @@ define i64 @test58(ptr %foo, i64 %i, i64 %j) {
 
 define i64 @test59(ptr %foo, i64 %i) {
 ; CHECK-LABEL: @test59(
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], ptr [[FOO:%.*]], i64 0, i64 42, i64 [[I:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[FOO:%.*]], i64 [[I:%.*]]
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[TMP1]], i64 4200
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4200
 ; CHECK-NEXT:    store ptr [[GEP1]], ptr @dummy_global1, align 8
 ; CHECK-NEXT:    store ptr [[GEP2]], ptr @dummy_global2, align 8
@@ -1142,13 +1143,12 @@ define i64 @test59(ptr %foo, i64 %i) {
 
 define i64 @test60(ptr %foo, i64 %i, i64 %j) {
 ; CHECK-LABEL: @test60(
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], ptr [[FOO:%.*]], i64 0, i64 [[J:%.*]], i64 [[I:%.*]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4200
-; CHECK-NEXT:    [[CAST1:%.*]] = ptrtoint ptr [[GEP1]] to i64
-; CHECK-NEXT:    [[CAST2:%.*]] = ptrtoint ptr [[GEP2]] to i64
-; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[CAST1]], [[CAST2]]
+; CHECK-NEXT:    [[GEP1_IDX:%.*]] = mul nsw i64 [[J:%.*]], 100
+; CHECK-NEXT:    [[GEP1_OFFS:%.*]] = add nsw i64 [[GEP1_IDX]], [[I:%.*]]
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 [[GEP1_OFFS]]
+; CHECK-NEXT:    [[GEPDIFF:%.*]] = add nsw i64 [[GEP1_OFFS]], -4200
 ; CHECK-NEXT:    store ptr [[GEP1]], ptr @dummy_global1, align 8
-; CHECK-NEXT:    ret i64 [[SUB]]
+; CHECK-NEXT:    ret i64 [[GEPDIFF]]
 ;
 ; gep1 has a non-constant index and more than one uses. Shouldn't duplicate the arithmetic.
   %gep1 = getelementptr inbounds [100 x [100 x i8]], ptr %foo, i64 0, i64 %j, i64 %i
@@ -1162,13 +1162,12 @@ define i64 @test60(ptr %foo, i64 %i, i64 %j) {
 
 define i64 @test61(ptr %foo, i64 %i, i64 %j) {
 ; CHECK-LABEL: @test61(
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 4200
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], ptr [[FOO]], i64 0, i64 [[J:%.*]], i64 [[I:%.*]]
-; CHECK-NEXT:    [[CAST1:%.*]] = ptrtoint ptr [[GEP1]] to i64
-; CHECK-NEXT:    [[CAST2:%.*]] = ptrtoint ptr [[GEP2]] to i64
-; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[CAST1]], [[CAST2]]
+; CHECK-NEXT:    [[GEP2_IDX:%.*]] = mul nsw i64 [[J:%.*]], 100
+; CHECK-NEXT:    [[GEP2_OFFS:%.*]] = add nsw i64 [[GEP2_IDX]], [[I:%.*]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 [[GEP2_OFFS]]
+; CHECK-NEXT:    [[GEPDIFF:%.*]] = sub nsw i64 4200, [[GEP2_OFFS]]
 ; CHECK-NEXT:    store ptr [[GEP2]], ptr @dummy_global2, align 8
-; CHECK-NEXT:    ret i64 [[SUB]]
+; CHECK-NEXT:    ret i64 [[GEPDIFF]]
 ;
 ; gep2 has a non-constant index and more than one uses. Shouldn't duplicate the arithmetic.
   %gep1 = getelementptr inbounds [100 x [100 x i8]], ptr %foo, i64 0, i64 42, i64 0
@@ -1180,6 +1179,24 @@ define i64 @test61(ptr %foo, i64 %i, i64 %j) {
   ret i64 %sub
 }
 
+declare void @use.ptr(ptr)
+
+define i64 @test_sub_ptradd_multiuse(ptr %p, i64 %idx1, i64 %idx2) {
+; CHECK-LABEL: @test_sub_ptradd_multiuse(
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 [[IDX1:%.*]]
+; CHECK-NEXT:    call void @use.ptr(ptr [[P1]])
+; CHECK-NEXT:    [[GEPDIFF:%.*]] = sub nsw i64 [[IDX1]], [[IDX2:%.*]]
+; CHECK-NEXT:    ret i64 [[GEPDIFF]]
+;
+  %p1 = getelementptr inbounds i8, ptr %p, i64 %idx1
+  call void @use.ptr(ptr %p1)
+  %p2 = getelementptr inbounds i8, ptr %p, i64 %idx2
+  %p1.int = ptrtoint ptr %p1 to i64
+  %p2.int = ptrtoint ptr %p2 to i64
+  %sub = sub i64 %p1.int, %p2.int
+  ret i64 %sub
+}
+
 define i32 @test62(i32 %A) {
 ; CHECK-LABEL: @test62(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[A:%.*]], 1
diff --git a/llvm/test/Transforms/InstCombine/trunc.ll b/llvm/test/Transforms/InstCombine/trunc.ll
index e59b2bea6684..a180c3d52f6a 100644
--- a/llvm/test/Transforms/InstCombine/trunc.ll
+++ b/llvm/test/Transforms/InstCombine/trunc.ll
@@ -1054,3 +1054,44 @@ define i8 @drop_both_trunc(i16 %x, i16 %y) {
   %res = trunc nuw nsw i16 %and2 to i8
   ret i8 %res
 }
+
+define i1 @trunc_xor(i8 %x, i8 %y) {
+; CHECK-LABEL: @trunc_xor(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = trunc i8 [[XOR]] to i1
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %xor = xor i8 %x, %y
+  %r = trunc i8 %xor to i1
+  ret i1 %r
+}
+
+define i1 @trunc_nuw_xor(i8 %x, i8 %y) {
+; CHECK-LABEL: @trunc_nuw_xor(
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %xor = xor i8 %x, %y
+  %r = trunc nuw i8 %xor to i1
+  ret i1 %r
+}
+
+define i1 @trunc_nsw_xor(i8 %x, i8 %y) {
+; CHECK-LABEL: @trunc_nsw_xor(
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %xor = xor i8 %x, %y
+  %r = trunc nsw i8 %xor to i1
+  ret i1 %r
+}
+
+define <2 x i1> @trunc_nuw_xor_vector(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @trunc_nuw_xor_vector(
+; CHECK-NEXT:    [[R:%.*]] = icmp ne <2 x i8> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %xor = xor <2 x i8> %x, %y
+  %r = trunc nuw <2 x i8> %xor to <2 x i1>
+  ret <2 x i1> %r
+}
diff --git a/llvm/test/Transforms/InstCombine/vector-reverse.ll b/llvm/test/Transforms/InstCombine/vector-reverse.ll
index 5e6672658f9a..a1a6ee949a13 100644
--- a/llvm/test/Transforms/InstCombine/vector-reverse.ll
+++ b/llvm/test/Transforms/InstCombine/vector-reverse.ll
@@ -8,11 +8,11 @@
 define <vscale x 4 x i32> @binop_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: @binop_reverse(
 ; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
+; CHECK-NEXT:    [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   %add = add nsw <vscale x 4 x i32> %a.rev, %b.rev
   ret <vscale x 4 x i32> %add
 }
@@ -20,14 +20,14 @@ define <vscale x 4 x i32> @binop_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i3
 ; %a.rev has multiple uses
 define <vscale x 4 x i32> @binop_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: @binop_reverse_1(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
 ; CHECK-NEXT:    [[ADD1:%.*]] = add <vscale x 4 x i32> [[A]], [[B:%.*]]
-; CHECK-NEXT:    [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
+; CHECK-NEXT:    [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   call void @use_nxv4i32(<vscale x 4 x i32>  %a.rev)
   %add = add <vscale x 4 x i32> %a.rev, %b.rev
   ret <vscale x 4 x i32> %add
@@ -36,14 +36,14 @@ define <vscale x 4 x i32> @binop_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x
 ; %b.rev has multiple uses
 define <vscale x 4 x i32> @binop_reverse_2(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: @binop_reverse_2(
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
 ; CHECK-NEXT:    [[ADD1:%.*]] = add <vscale x 4 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT:    [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
+; CHECK-NEXT:    [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   call void @use_nxv4i32(<vscale x 4 x i32>  %b.rev)
   %add = add <vscale x 4 x i32> %a.rev, %b.rev
   ret <vscale x 4 x i32> %add
@@ -52,15 +52,15 @@ define <vscale x 4 x i32> @binop_reverse_2(<vscale x 4 x i32> %a, <vscale x 4 x
 ; %a.rev and %b.rev have multiple uses
 define <vscale x 4 x i32> @binop_reverse_3(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: @binop_reverse_3(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
 ; CHECK-NEXT:    [[ADD:%.*]] = add <vscale x 4 x i32> [[A_REV]], [[B_REV]]
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
   call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
   %add = add <vscale x 4 x i32> %a.rev, %b.rev
@@ -71,10 +71,10 @@ define <vscale x 4 x i32> @binop_reverse_3(<vscale x 4 x i32> %a, <vscale x 4 x
 define <vscale x 4 x i32> @binop_reverse_4(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: @binop_reverse_4(
 ; CHECK-NEXT:    [[MUL1:%.*]] = mul <vscale x 4 x i32> [[A:%.*]], [[A]]
-; CHECK-NEXT:    [[MUL:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[MUL1]])
+; CHECK-NEXT:    [[MUL:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[MUL1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[MUL]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
   %mul = mul <vscale x 4 x i32> %a.rev, %a.rev
   ret <vscale x 4 x i32> %mul
 }
@@ -82,12 +82,12 @@ define <vscale x 4 x i32> @binop_reverse_4(<vscale x 4 x i32> %a) {
 ; %a.rev used as both operands along with a third use
 define <vscale x 4 x i32> @binop_reverse_5(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: @binop_reverse_5(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
 ; CHECK-NEXT:    [[MUL:%.*]] = mul <vscale x 4 x i32> [[A_REV]], [[A_REV]]
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[MUL]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
   call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
   %mul = mul <vscale x 4 x i32> %a.rev, %a.rev
   ret <vscale x 4 x i32> %mul
@@ -98,10 +98,10 @@ define <vscale x 4 x i32> @binop_reverse_splat_RHS(<vscale x 4 x i32> %a, i32 %b
 ; CHECK-NEXT:    [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
 ; CHECK-NEXT:    [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DIV1:%.*]] = udiv <vscale x 4 x i32> [[A:%.*]], [[B_SPLAT]]
-; CHECK-NEXT:    [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[DIV1]])
+; CHECK-NEXT:    [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[DIV1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[DIV]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
   %b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %div = udiv <vscale x 4 x i32> %a.rev, %b.splat
@@ -111,14 +111,14 @@ define <vscale x 4 x i32> @binop_reverse_splat_RHS(<vscale x 4 x i32> %a, i32 %b
 ; %a.rev has multiple uses
 define <vscale x 4 x i32> @binop_reverse_splat_RHS_1(<vscale x 4 x i32> %a, i32 %b) {
 ; CHECK-LABEL: @binop_reverse_splat_RHS_1(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
 ; CHECK-NEXT:    [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
 ; CHECK-NEXT:    [[DIV:%.*]] = udiv <vscale x 4 x i32> [[A_REV]], [[B_SPLAT]]
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[DIV]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
   %b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
@@ -131,10 +131,10 @@ define <vscale x 4 x i32> @binop_reverse_splat_LHS(<vscale x 4 x i32> %a, i32 %b
 ; CHECK-NEXT:    [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
 ; CHECK-NEXT:    [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DIV1:%.*]] = udiv <vscale x 4 x i32> [[B_SPLAT]], [[A:%.*]]
-; CHECK-NEXT:    [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[DIV1]])
+; CHECK-NEXT:    [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[DIV1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[DIV]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
   %b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %div = udiv <vscale x 4 x i32> %b.splat, %a.rev
@@ -144,14 +144,14 @@ define <vscale x 4 x i32> @binop_reverse_splat_LHS(<vscale x 4 x i32> %a, i32 %b
 ; %a.rev has multiple uses
 define <vscale x 4 x i32> @binop_reverse_splat_LHS_1(<vscale x 4 x i32> %a, i32 %b) {
 ; CHECK-LABEL: @binop_reverse_splat_LHS_1(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
 ; CHECK-NEXT:    [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
 ; CHECK-NEXT:    [[DIV:%.*]] = udiv <vscale x 4 x i32> [[B_SPLAT]], [[A_REV]]
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[DIV]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
   %b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
@@ -161,11 +161,11 @@ define <vscale x 4 x i32> @binop_reverse_splat_LHS_1(<vscale x 4 x i32> %a, i32
 
 define <vscale x 4 x float> @unop_reverse(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: @unop_reverse(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
 ; CHECK-NEXT:    [[NEG:%.*]] = fneg fast <vscale x 4 x float> [[A_REV]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[NEG]]
 ;
-  %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+  %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
   %neg = fneg fast <vscale x 4 x float> %a.rev
   ret <vscale x 4 x float> %neg
 }
@@ -173,12 +173,12 @@ define <vscale x 4 x float> @unop_reverse(<vscale x 4 x float> %a) {
 ; %a.rev has multiple uses
 define <vscale x 4 x float> @unop_reverse_1(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: @unop_reverse_1(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4f32(<vscale x 4 x float> [[A_REV]])
 ; CHECK-NEXT:    [[NEG:%.*]] = fneg fast <vscale x 4 x float> [[A_REV]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[NEG]]
 ;
-  %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+  %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
   call void @use_nxv4f32(<vscale x 4 x float> %a.rev)
   %neg = fneg fast <vscale x 4 x float> %a.rev
   ret <vscale x 4 x float> %neg
@@ -187,11 +187,11 @@ define <vscale x 4 x float> @unop_reverse_1(<vscale x 4 x float> %a) {
 define <vscale x 4 x i1> @icmp_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: @icmp_reverse(
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq <vscale x 4 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT:    [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[CMP]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   %cmp = icmp eq <vscale x 4 x i32> %a.rev, %b.rev
   ret <vscale x 4 x i1> %cmp
 }
@@ -199,14 +199,14 @@ define <vscale x 4 x i1> @icmp_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i32>
 ; %a.rev has multiple uses
 define <vscale x 4 x i1> @icmp_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: @icmp_reverse_1(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq <vscale x 4 x i32> [[A]], [[B:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT:    [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[CMP]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
   %cmp = icmp eq <vscale x 4 x i32> %a.rev, %b.rev
   ret <vscale x 4 x i1> %cmp
@@ -215,14 +215,14 @@ define <vscale x 4 x i1> @icmp_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x i3
 ; %b.rev has multiple uses
 define <vscale x 4 x i1> @icmp_reverse_2(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: @icmp_reverse_2(
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq <vscale x 4 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT:    [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT:    [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[CMP]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
   %cmp = icmp eq <vscale x 4 x i32> %a.rev, %b.rev
   ret <vscale x 4 x i1> %cmp
@@ -231,15 +231,15 @@ define <vscale x 4 x i1> @icmp_reverse_2(<vscale x 4 x i32> %a, <vscale x 4 x i3
 ; %a.rev and %b.rev have multiple uses
 define <vscale x 4 x i1> @icmp_reverse_3(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: @icmp_reverse_3(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <vscale x 4 x i32> [[A_REV]], [[B_REV]]
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[CMP]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
   call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
   %cmp = icmp eq <vscale x 4 x i32> %a.rev, %b.rev
@@ -251,10 +251,10 @@ define <vscale x 4 x i1> @icmp_reverse_splat_RHS(<vscale x 4 x i32> %a, i32 %b)
 ; CHECK-NEXT:    [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
 ; CHECK-NEXT:    [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt <vscale x 4 x i32> [[B_SPLAT]], [[A:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT:    [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[CMP]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
   %b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %cmp = icmp sgt <vscale x 4 x i32> %a.rev, %b.splat
@@ -264,14 +264,14 @@ define <vscale x 4 x i1> @icmp_reverse_splat_RHS(<vscale x 4 x i32> %a, i32 %b)
 ; %a.rev has multiple uses
 define <vscale x 4 x i1> @icmp_reverse_splat_RHS_1(<vscale x 4 x i32> %a, i32 %b) {
 ; CHECK-LABEL: @icmp_reverse_splat_RHS_1(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
 ; CHECK-NEXT:    [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <vscale x 4 x i32> [[A_REV]], [[B_SPLAT]]
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[CMP]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
   %b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
@@ -284,10 +284,10 @@ define <vscale x 4 x i1> @icmp_reverse_splat_LHS(<vscale x 4 x i32> %a, i32 %b)
 ; CHECK-NEXT:    [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
 ; CHECK-NEXT:    [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult <vscale x 4 x i32> [[B_SPLAT]], [[A:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT:    [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[CMP]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
   %b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %cmp = icmp ult <vscale x 4 x i32> %b.splat, %a.rev
@@ -297,14 +297,14 @@ define <vscale x 4 x i1> @icmp_reverse_splat_LHS(<vscale x 4 x i32> %a, i32 %b)
 ; %a.rev has multiple uses
 define <vscale x 4 x i1> @icmp_reverse_splat_LHS_1(<vscale x 4 x i32> %a, i32 %b) {
 ; CHECK-LABEL: @icmp_reverse_splat_LHS_1(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
 ; CHECK-NEXT:    [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
 ; CHECK-NEXT:    [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult <vscale x 4 x i32> [[B_SPLAT]], [[A_REV]]
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[CMP]]
 ;
-  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
   %b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
   %b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
@@ -315,12 +315,12 @@ define <vscale x 4 x i1> @icmp_reverse_splat_LHS_1(<vscale x 4 x i32> %a, i32 %b
 define <vscale x 4 x i32> @select_reverse(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
 ; CHECK-LABEL: @select_reverse(
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]]
-; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
-  %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
   %select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
   ret <vscale x 4 x i32> %select
 }
@@ -328,15 +328,15 @@ define <vscale x 4 x i32> @select_reverse(<vscale x 4 x i1> %a, <vscale x 4 x i3
 ; %a.rev has multiple uses
 define <vscale x 4 x i32> @select_reverse_1(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
 ; CHECK-LABEL: @select_reverse_1(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]]
-; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
-  %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
   call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
   %select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
   ret <vscale x 4 x i32> %select
@@ -345,15 +345,15 @@ define <vscale x 4 x i32> @select_reverse_1(<vscale x 4 x i1> %a, <vscale x 4 x
 ; %b.rev has multiple uses
 define <vscale x 4 x i32> @select_reverse_2(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
 ; CHECK-LABEL: @select_reverse_2(
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C:%.*]]
-; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
-  %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
   call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
   %select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
   ret <vscale x 4 x i32> %select
@@ -362,15 +362,15 @@ define <vscale x 4 x i32> @select_reverse_2(<vscale x 4 x i1> %a, <vscale x 4 x
 ; %c.rev has multiple uses
 define <vscale x 4 x i32> @select_reverse_3(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
 ; CHECK-LABEL: @select_reverse_3(
-; CHECK-NEXT:    [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
+; CHECK-NEXT:    [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[C_REV]])
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C]]
-; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
-  %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
   call void @use_nxv4i32(<vscale x 4 x i32> %c.rev)
   %select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
   ret <vscale x 4 x i32> %select
@@ -379,17 +379,17 @@ define <vscale x 4 x i32> @select_reverse_3(<vscale x 4 x i1> %a, <vscale x 4 x
 ; %a.rev and %b.rev have multiple uses
 define <vscale x 4 x i32> @select_reverse_4(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
 ; CHECK-LABEL: @select_reverse_4(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C:%.*]]
-; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
-  %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
   call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
   call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
   %select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
@@ -399,17 +399,17 @@ define <vscale x 4 x i32> @select_reverse_4(<vscale x 4 x i1> %a, <vscale x 4 x
 ; %a.rev and %c.rev have multiple uses
 define <vscale x 4 x i32> @select_reverse_5(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
 ; CHECK-LABEL: @select_reverse_5(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT:    [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT:    [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[C_REV]])
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C]]
-; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
-  %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
   call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
   call void @use_nxv4i32(<vscale x 4 x i32> %c.rev)
   %select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
@@ -419,17 +419,17 @@ define <vscale x 4 x i32> @select_reverse_5(<vscale x 4 x i1> %a, <vscale x 4 x
 ; %b.rev and %c.rev have multiple uses
 define <vscale x 4 x i32> @select_reverse_6(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
 ; CHECK-LABEL: @select_reverse_6(
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
-; CHECK-NEXT:    [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT:    [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[C_REV]])
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]]
-; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
-  %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
   call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
   call void @use_nxv4i32(<vscale x 4 x i32> %c.rev)
   %select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
@@ -439,18 +439,18 @@ define <vscale x 4 x i32> @select_reverse_6(<vscale x 4 x i1> %a, <vscale x 4 x
 ; %a.rev, %b.rev and %c.rev have multiple uses
 define <vscale x 4 x i32> @select_reverse_7(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
 ; CHECK-LABEL: @select_reverse_7(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
-; CHECK-NEXT:    [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT:    [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
 ; CHECK-NEXT:    call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[C_REV]])
 ; CHECK-NEXT:    [[SELECT:%.*]] = select <vscale x 4 x i1> [[A_REV]], <vscale x 4 x i32> [[B_REV]], <vscale x 4 x i32> [[C_REV]]
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
-  %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
   call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
   call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
   call void @use_nxv4i32(<vscale x 4 x i32> %c.rev)
@@ -463,11 +463,11 @@ define <vscale x 4 x i32> @select_reverse_splat_false(<vscale x 4 x i1> %a, <vsc
 ; CHECK-NEXT:    [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C_SPLAT]]
-; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   %c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
   %c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.splat
@@ -477,16 +477,16 @@ define <vscale x 4 x i32> @select_reverse_splat_false(<vscale x 4 x i1> %a, <vsc
 ; %a.rev has multiple uses
 define <vscale x 4 x i32> @select_reverse_splat_false_1(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
 ; CHECK-LABEL: @select_reverse_splat_false_1(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
 ; CHECK-NEXT:    [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C_SPLAT]]
-; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   %c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
   %c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
@@ -497,16 +497,16 @@ define <vscale x 4 x i32> @select_reverse_splat_false_1(<vscale x 4 x i1> %a, <v
 ; %b.rev has multiple uses
 define <vscale x 4 x i32> @select_reverse_splat_false_2(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
 ; CHECK-LABEL: @select_reverse_splat_false_2(
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
 ; CHECK-NEXT:    [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C_SPLAT]]
-; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   %c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
   %c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
@@ -517,8 +517,8 @@ define <vscale x 4 x i32> @select_reverse_splat_false_2(<vscale x 4 x i1> %a, <v
 ; %a.rev and %b.rev have multiple uses
 define <vscale x 4 x i32> @select_reverse_splat_false_3(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
 ; CHECK-LABEL: @select_reverse_splat_false_3(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
 ; CHECK-NEXT:    [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
@@ -526,8 +526,8 @@ define <vscale x 4 x i32> @select_reverse_splat_false_3(<vscale x 4 x i1> %a, <v
 ; CHECK-NEXT:    [[SELECT:%.*]] = select <vscale x 4 x i1> [[A_REV]], <vscale x 4 x i32> [[B_REV]], <vscale x 4 x i32> [[C_SPLAT]]
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   %c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
   %c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
@@ -541,11 +541,11 @@ define <vscale x 4 x i32> @select_reverse_splat_true(<vscale x 4 x i1> %a, <vsca
 ; CHECK-NEXT:    [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[C_SPLAT]], <vscale x 4 x i32> [[B:%.*]]
-; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   %c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
   %c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   %select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %c.splat, <vscale x 4 x i32> %b.rev
@@ -555,16 +555,16 @@ define <vscale x 4 x i32> @select_reverse_splat_true(<vscale x 4 x i1> %a, <vsca
 ; %a.rev has multiple uses
 define <vscale x 4 x i32> @select_reverse_splat_true_1(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
 ; CHECK-LABEL: @select_reverse_splat_true_1(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
 ; CHECK-NEXT:    [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[C_SPLAT]], <vscale x 4 x i32> [[B:%.*]]
-; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   %c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
   %c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
@@ -575,16 +575,16 @@ define <vscale x 4 x i32> @select_reverse_splat_true_1(<vscale x 4 x i1> %a, <vs
 ; %b.rev has multiple uses
 define <vscale x 4 x i32> @select_reverse_splat_true_2(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
 ; CHECK-LABEL: @select_reverse_splat_true_2(
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
 ; CHECK-NEXT:    [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[C_SPLAT]], <vscale x 4 x i32> [[B]]
-; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT:    [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   %c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
   %c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
@@ -595,8 +595,8 @@ define <vscale x 4 x i32> @select_reverse_splat_true_2(<vscale x 4 x i1> %a, <vs
 ; %a.rev and %b.rev have multiple uses
 define <vscale x 4 x i32> @select_reverse_splat_true_3(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
 ; CHECK-LABEL: @select_reverse_splat_true_3(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
 ; CHECK-NEXT:    [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
@@ -604,8 +604,8 @@ define <vscale x 4 x i32> @select_reverse_splat_true_3(<vscale x 4 x i1> %a, <vs
 ; CHECK-NEXT:    [[SELECT:%.*]] = select <vscale x 4 x i1> [[A_REV]], <vscale x 4 x i32> [[C_SPLAT]], <vscale x 4 x i32> [[B_REV]]
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[SELECT]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
   %c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
   %c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
   call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
@@ -622,10 +622,10 @@ define <vscale x 4 x float> @reverse_binop_reverse(<vscale x 4 x float> %a, <vsc
 ; CHECK-NEXT:    [[ADD1:%.*]] = fadd <vscale x 4 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[ADD1]]
 ;
-  %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
-  %b.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
+  %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+  %b.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
   %add = fadd <vscale x 4 x float> %a.rev, %b.rev
-  %add.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %add)
+  %add.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %add)
   ret <vscale x 4 x float> %add.rev
 }
 
@@ -636,11 +636,11 @@ define <vscale x 4 x float> @reverse_binop_reverse_splat_RHS(<vscale x 4 x float
 ; CHECK-NEXT:    [[DIV1:%.*]] = fdiv <vscale x 4 x float> [[A:%.*]], [[B_SPLAT]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[DIV1]]
 ;
-  %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+  %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
   %b.insert = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %b.splat = shufflevector <vscale x 4 x float> %b.insert, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
   %div = fdiv <vscale x 4 x float> %a.rev, %b.splat
-  %div.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %div)
+  %div.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %div)
   ret <vscale x 4 x float> %div.rev
 }
 
@@ -651,11 +651,11 @@ define <vscale x 4 x float> @reverse_binop_reverse_splat_LHS(<vscale x 4 x float
 ; CHECK-NEXT:    [[DIV1:%.*]] = fdiv <vscale x 4 x float> [[B_SPLAT]], [[A:%.*]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[DIV1]]
 ;
-  %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+  %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
   %b.insert = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %b.splat = shufflevector <vscale x 4 x float> %b.insert, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
   %div = fdiv <vscale x 4 x float> %b.splat, %a.rev
-  %div.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %div)
+  %div.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %div)
   ret <vscale x 4 x float> %div.rev
 }
 
@@ -664,10 +664,10 @@ define <vscale x 4 x i1> @reverse_fcmp_reverse(<vscale x 4 x float> %a, <vscale
 ; CHECK-NEXT:    [[CMP1:%.*]] = fcmp fast olt <vscale x 4 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <vscale x 4 x i1> [[CMP1]]
 ;
-  %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
-  %b.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
+  %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+  %b.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
   %cmp = fcmp fast olt <vscale x 4 x float> %a.rev, %b.rev
-  %cmp.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %cmp)
+  %cmp.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %cmp)
   ret <vscale x 4 x i1> %cmp.rev
 }
 
@@ -676,11 +676,11 @@ define <vscale x 4 x float> @reverse_select_reverse(<vscale x 4 x i1> %a, <vscal
 ; CHECK-NEXT:    [[SELECT1:%.*]] = select fast <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> [[C:%.*]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[SELECT1]]
 ;
-  %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
-  %b.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
-  %c.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %c)
+  %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+  %b.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
+  %c.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %c)
   %select = select fast <vscale x 4 x i1> %a.rev, <vscale x 4 x float> %b.rev, <vscale x 4 x float> %c.rev
-  %select.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %select)
+  %select.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %select)
   ret <vscale x 4 x float> %select.rev
 }
 
@@ -689,9 +689,9 @@ define <vscale x 4 x float> @reverse_unop_reverse(<vscale x 4 x float> %a) {
 ; CHECK-NEXT:    [[NEG1:%.*]] = fneg <vscale x 4 x float> [[A:%.*]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[NEG1]]
 ;
-  %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+  %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
   %neg = fneg <vscale x 4 x float> %a.rev
-  %neg.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %neg)
+  %neg.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %neg)
   ret <vscale x 4 x float> %neg.rev
 }
 
@@ -700,6 +700,6 @@ declare void @use_nxv4i1(<vscale x 4 x i1>)
 declare void @use_nxv4i32(<vscale x 4 x i32>)
 declare void @use_nxv4f32(<vscale x 4 x float>)
 
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
diff --git a/llvm/test/Transforms/InstCombine/vscale_gep.ll b/llvm/test/Transforms/InstCombine/vscale_gep.ll
index 534888dc1a1c..f424650d21e8 100644
--- a/llvm/test/Transforms/InstCombine/vscale_gep.ll
+++ b/llvm/test/Transforms/InstCombine/vscale_gep.ll
@@ -14,7 +14,9 @@ define <vscale x 2 x ptr> @gep_index_type_is_scalable(ptr %p) {
 ; This test serves to verify code changes for "GEP.getNumIndices() == 1".
 define ptr @gep_num_of_indices_1(ptr %p) {
 ; CHECK-LABEL: @gep_num_of_indices_1(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[P:%.*]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    ret ptr [[GEP]]
 ;
   %gep = getelementptr <vscale x 4 x i32>, ptr %p, i64 1
@@ -25,7 +27,9 @@ define ptr @gep_num_of_indices_1(ptr %p) {
 define void @gep_bitcast(ptr %p) {
 ; CHECK-LABEL: @gep_bitcast(
 ; CHECK-NEXT:    store <vscale x 16 x i8> zeroinitializer, ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    store <vscale x 16 x i8> zeroinitializer, ptr [[GEP2]], align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -54,7 +58,10 @@ define i32 @gep_alloca_inbounds_vscale_zero() {
 define i32 @gep_alloca_inbounds_vscale_nonzero() {
 ; CHECK-LABEL: @gep_alloca_inbounds_vscale_nonzero(
 ; CHECK-NEXT:    [[A:%.*]] = alloca <vscale x 4 x i32>, align 16
-; CHECK-NEXT:    [[TMP:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[A]], i64 1, i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP_OFFS:%.*]] = or disjoint i64 [[TMP2]], 8
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP_OFFS]]
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[TMP]], align 4
 ; CHECK-NEXT:    ret i32 [[LOAD]]
 ;
diff --git a/llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll b/llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll
index a26f0a9d87f8..25e99ff0e715 100644
--- a/llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll
+++ b/llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll
@@ -6,8 +6,8 @@ define <vscale x 4 x i32> @shuffle_b2b_reverse(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: @shuffle_b2b_reverse(
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[A:%.*]]
 ;
-  %rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
-  %rev.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %rev)
+  %rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+  %rev.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %rev)
   ret <vscale x 4 x i32> %rev.rev
 }
 
@@ -20,8 +20,8 @@ define <vscale x 4 x i32> @splat_reverse(i32 %a) {
 ;
   %splat_insert = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
   %splat = shufflevector <vscale x 4 x i32> %splat_insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-  %rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %splat)
+  %rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %splat)
   ret <vscale x 4 x i32> %rev
 }
 
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll
index 40c1460e3ebc..4eb6491eec5a 100644
--- a/llvm/test/Transforms/InstSimplify/select.ll
+++ b/llvm/test/Transforms/InstSimplify/select.ll
@@ -1105,19 +1105,19 @@ define <2 x i32> @select_ctpop_zero_vec(<2 x i32> %x) {
 define <2 x i32> @select_vector_reverse(<2 x i32> %x) {
 ; CHECK-LABEL: @select_vector_reverse(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <2 x i32> [[X:%.*]], zeroinitializer
-; CHECK-NEXT:    [[REV:%.*]] = call <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32> [[X]])
+; CHECK-NEXT:    [[REV:%.*]] = call <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32> [[X]])
 ; CHECK-NEXT:    [[SEL:%.*]] = select <2 x i1> [[CMP]], <2 x i32> zeroinitializer, <2 x i32> [[REV]]
 ; CHECK-NEXT:    ret <2 x i32> [[SEL]]
 ;
   %cmp = icmp eq <2 x i32> %x, zeroinitializer
-  %rev = call <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32> %x)
+  %rev = call <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32> %x)
   %sel = select <2 x i1> %cmp, <2 x i32> zeroinitializer, <2 x i32> %rev
   ret <2 x i32> %sel
 }
 
 declare i32 @llvm.ctpop.i32(i32)
 declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
-declare <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32>)
+declare <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32>)
 
 define <2 x i32> @vec_select_no_equivalence(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @vec_select_no_equivalence(
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
index 224a0693bf21..54348d1e2a48 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -15,11 +15,11 @@ define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2(ptr %ptr) {
 ; SVE-FIXED-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <32 x i8>, ptr [[PTR]], align 1
-; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> [[LOAD]])
+; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[LOAD]])
 ; SVE-FIXED-NEXT:    ret { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]]
 ;
   %load = load <32 x i8>, ptr %ptr, align 1
-  %deinterleave = tail call { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %load)
+  %deinterleave = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %load)
   ret { <16 x i8>, <16 x i8> } %deinterleave
 }
 
@@ -32,11 +32,11 @@ define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2(ptr %ptr) {
 ; SVE-FIXED-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <16 x i16>, ptr [[PTR]], align 2
-; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
+; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
 ; SVE-FIXED-NEXT:    ret { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]]
 ;
   %load = load <16 x i16>, ptr %ptr, align 2
-  %deinterleave = tail call { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %load)
+  %deinterleave = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> %load)
   ret { <8 x i16>, <8 x i16> } %deinterleave
 }
 
@@ -49,11 +49,11 @@ define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2(ptr %ptr) {
 ; SVE-FIXED-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <8 x i32>, ptr [[PTR]], align 4
-; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]])
+; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]])
 ; SVE-FIXED-NEXT:    ret { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]]
 ;
   %load = load <8 x i32>, ptr %ptr, align 4
-  %deinterleave = tail call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %load)
+  %deinterleave = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %load)
   ret { <4 x i32>, <4 x i32> } %deinterleave
 }
 
@@ -66,11 +66,11 @@ define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2(ptr %ptr) {
 ; SVE-FIXED-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <4 x i64>, ptr [[PTR]], align 8
-; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]])
+; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]])
 ; SVE-FIXED-NEXT:    ret { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]]
 ;
   %load = load <4 x i64>, ptr %ptr, align 8
-  %deinterleave = tail call { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %load)
+  %deinterleave = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> %load)
   ret { <2 x i64>, <2 x i64> } %deinterleave
 }
 
@@ -83,11 +83,11 @@ define { <4 x float>, <4 x float> } @deinterleave_float_factor2(ptr %ptr) {
 ; SVE-FIXED-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <8 x float>, ptr [[PTR]], align 4
-; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> [[LOAD]])
+; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> [[LOAD]])
 ; SVE-FIXED-NEXT:    ret { <4 x float>, <4 x float> } [[DEINTERLEAVE]]
 ;
   %load = load <8 x float>, ptr %ptr, align 4
-  %deinterleave = tail call { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %load)
+  %deinterleave = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> %load)
   ret { <4 x float>, <4 x float> } %deinterleave
 }
 
@@ -100,11 +100,11 @@ define { <2 x double>, <2 x double> } @deinterleave_double_factor2(ptr %ptr) {
 ; SVE-FIXED-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <4 x double>, ptr [[PTR]], align 8
-; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> [[LOAD]])
+; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> [[LOAD]])
 ; SVE-FIXED-NEXT:    ret { <2 x double>, <2 x double> } [[DEINTERLEAVE]]
 ;
   %load = load <4 x double>, ptr %ptr, align 8
-  %deinterleave = tail call { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %load)
+  %deinterleave = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> %load)
   ret { <2 x double>, <2 x double> } %deinterleave
 }
 
@@ -117,11 +117,11 @@ define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2(ptr %ptr) {
 ; SVE-FIXED-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <4 x ptr>, ptr [[PTR]], align 8
-; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]])
+; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]])
 ; SVE-FIXED-NEXT:    ret { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]]
 ;
   %load = load <4 x ptr>, ptr %ptr, align 8
-  %deinterleave = tail call { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr> %load)
+  %deinterleave = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> %load)
   ret { <2 x ptr>, <2 x ptr> } %deinterleave
 }
 
@@ -133,11 +133,11 @@ define void @interleave_i8_factor2(ptr %ptr, <16 x i8> %l, <16 x i8> %r) {
 ;
 ; SVE-FIXED-LABEL: define void @interleave_i8_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <16 x i8> [[L:%.*]], <16 x i8> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> [[L]], <16 x i8> [[R]])
+; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8> [[L]], <16 x i8> [[R]])
 ; SVE-FIXED-NEXT:    store <32 x i8> [[INTERLEAVE]], ptr [[PTR]], align 1
 ; SVE-FIXED-NEXT:    ret void
 ;
-  %interleave = tail call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> %l, <16 x i8> %r)
+  %interleave = tail call <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8> %l, <16 x i8> %r)
   store <32 x i8> %interleave, ptr %ptr, align 1
   ret void
 }
@@ -150,11 +150,11 @@ define void @interleave_i16_factor2(ptr %ptr, <8 x i16> %l, <8 x i16> %r) {
 ;
 ; SVE-FIXED-LABEL: define void @interleave_i16_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <8 x i16> [[L:%.*]], <8 x i16> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> [[L]], <8 x i16> [[R]])
+; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> [[L]], <8 x i16> [[R]])
 ; SVE-FIXED-NEXT:    store <16 x i16> [[INTERLEAVE]], ptr [[PTR]], align 2
 ; SVE-FIXED-NEXT:    ret void
 ;
-  %interleave = tail call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %l, <8 x i16> %r)
+  %interleave = tail call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %l, <8 x i16> %r)
   store <16 x i16> %interleave, ptr %ptr, align 2
   ret void
 }
@@ -167,11 +167,11 @@ define void @interleave_i32_factor2(ptr %ptr, <4 x i32> %l, <4 x i32> %r) {
 ;
 ; SVE-FIXED-LABEL: define void @interleave_i32_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <4 x i32> [[L:%.*]], <4 x i32> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> [[L]], <4 x i32> [[R]])
+; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> [[L]], <4 x i32> [[R]])
 ; SVE-FIXED-NEXT:    store <8 x i32> [[INTERLEAVE]], ptr [[PTR]], align 4
 ; SVE-FIXED-NEXT:    ret void
 ;
-  %interleave = tail call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %l, <4 x i32> %r)
+  %interleave = tail call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %l, <4 x i32> %r)
   store <8 x i32> %interleave, ptr %ptr, align 4
   ret void
 }
@@ -184,11 +184,11 @@ define void @interleave_i64_factor2(ptr %ptr, <2 x i64> %l, <2 x i64> %r) {
 ;
 ; SVE-FIXED-LABEL: define void @interleave_i64_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x i64> [[L:%.*]], <2 x i64> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> [[L]], <2 x i64> [[R]])
+; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> [[L]], <2 x i64> [[R]])
 ; SVE-FIXED-NEXT:    store <4 x i64> [[INTERLEAVE]], ptr [[PTR]], align 8
 ; SVE-FIXED-NEXT:    ret void
 ;
-  %interleave = tail call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %l, <2 x i64> %r)
+  %interleave = tail call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> %l, <2 x i64> %r)
   store <4 x i64> %interleave, ptr %ptr, align 8
   ret void
 }
@@ -201,11 +201,11 @@ define void @interleave_float_factor2(ptr %ptr, <4 x float> %l, <4 x float> %r)
 ;
 ; SVE-FIXED-LABEL: define void @interleave_float_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <4 x float> [[L:%.*]], <4 x float> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> [[L]], <4 x float> [[R]])
+; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> [[L]], <4 x float> [[R]])
 ; SVE-FIXED-NEXT:    store <8 x float> [[INTERLEAVE]], ptr [[PTR]], align 4
 ; SVE-FIXED-NEXT:    ret void
 ;
-  %interleave = tail call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %l, <4 x float> %r)
+  %interleave = tail call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %l, <4 x float> %r)
   store <8 x float> %interleave, ptr %ptr, align 4
   ret void
 }
@@ -218,11 +218,11 @@ define void @interleave_double_factor2(ptr %ptr, <2 x double> %l, <2 x double> %
 ;
 ; SVE-FIXED-LABEL: define void @interleave_double_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x double> [[L:%.*]], <2 x double> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> [[L]], <2 x double> [[R]])
+; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> [[L]], <2 x double> [[R]])
 ; SVE-FIXED-NEXT:    store <4 x double> [[INTERLEAVE]], ptr [[PTR]], align 4
 ; SVE-FIXED-NEXT:    ret void
 ;
-  %interleave = tail call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %l, <2 x double> %r)
+  %interleave = tail call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %l, <2 x double> %r)
   store <4 x double> %interleave, ptr %ptr, align 4
   ret void
 }
@@ -235,11 +235,11 @@ define void @interleave_ptr_factor2(ptr %ptr, <2 x ptr> %l, <2 x ptr> %r) {
 ;
 ; SVE-FIXED-LABEL: define void @interleave_ptr_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x ptr> [[L:%.*]], <2 x ptr> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr> [[L]], <2 x ptr> [[R]])
+; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <4 x ptr> @llvm.vector.interleave2.v4p0(<2 x ptr> [[L]], <2 x ptr> [[R]])
 ; SVE-FIXED-NEXT:    store <4 x ptr> [[INTERLEAVE]], ptr [[PTR]], align 4
 ; SVE-FIXED-NEXT:    ret void
 ;
-  %interleave = tail call <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr> %l, <2 x ptr> %r)
+  %interleave = tail call <4 x ptr> @llvm.vector.interleave2.v4p0(<2 x ptr> %l, <2 x ptr> %r)
   store <4 x ptr> %interleave, ptr %ptr, align 4
   ret void
 }
@@ -266,11 +266,11 @@ define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
 ; SVE-FIXED-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <32 x i16>, ptr [[PTR]], align 2
-; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]])
+; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]])
 ; SVE-FIXED-NEXT:    ret { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]]
 ;
   %load = load <32 x i16>, ptr %ptr, align 2
-  %deinterleave = tail call { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16> %load)
+  %deinterleave = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> %load)
   ret { <16 x i16>, <16 x i16> } %deinterleave
 }
 
@@ -297,29 +297,29 @@ define void @interleave_wide_ptr_factor2(ptr %ptr, <8 x ptr> %l, <8 x ptr> %r) {
 ;
 ; SVE-FIXED-LABEL: define void @interleave_wide_ptr_factor2
 ; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <8 x ptr> [[L:%.*]], <8 x ptr> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr> [[L]], <8 x ptr> [[R]])
+; SVE-FIXED-NEXT:    [[INTERLEAVE:%.*]] = tail call <16 x ptr> @llvm.vector.interleave2.v16p0(<8 x ptr> [[L]], <8 x ptr> [[R]])
 ; SVE-FIXED-NEXT:    store <16 x ptr> [[INTERLEAVE]], ptr [[PTR]], align 4
 ; SVE-FIXED-NEXT:    ret void
 ;
-  %interleave = tail call <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr> %l, <8 x ptr> %r)
+  %interleave = tail call <16 x ptr> @llvm.vector.interleave2.v16p0(<8 x ptr> %l, <8 x ptr> %r)
   store <16 x ptr> %interleave, ptr %ptr, align 4
   ret void
 }
 
-declare { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
-declare { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
-declare { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
-declare { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
-declare { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
-declare { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
-declare { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr>)
-declare { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16>)
+declare { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8>)
+declare { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16>)
+declare { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+declare { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64>)
+declare { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float>)
+declare { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double>)
+declare { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr>)
+declare { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16>)
 
-declare <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
-declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
-declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
-declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
-declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
-declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
-declare <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr>, <2 x ptr>)
-declare <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr>, <8 x ptr>)
+declare <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
+declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+declare <4 x ptr> @llvm.vector.interleave2.v4p0(<2 x ptr>, <2 x ptr>)
+declare <16 x ptr> @llvm.vector.interleave2.v16p0(<8 x ptr>, <8 x ptr>)
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index 6353bf10d57c..2a05718cc416 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -11,7 +11,7 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8> } @deinterleave_nxi8_factor2(ptr
 ; CHECK-NEXT:    ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]]
 ;
   %load = load <vscale x 32 x i8>, ptr %ptr, align 1
-  %deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)
+  %deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)
   ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave
 }
 
@@ -22,7 +22,7 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2(pt
 ; CHECK-NEXT:    ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]]
 ;
   %load = load <vscale x 16 x i16>, ptr %ptr, align 2
-  %deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
+  %deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
   ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave
 }
 
@@ -33,7 +33,7 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2(
 ; CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]]
 ;
   %load = load <vscale x 8 x i32>, ptr %ptr, align 4
-  %deinterleave = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)
+  %deinterleave = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)
   ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave
 }
 
@@ -44,7 +44,7 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2(pt
 ; CHECK-NEXT:    ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]]
 ;
   %load = load <vscale x 4 x i64>, ptr %ptr, align 8
-  %deinterleave = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %load)
+  %deinterleave = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %load)
   ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave
 }
 
@@ -55,7 +55,7 @@ define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_fact
 ; CHECK-NEXT:    ret { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]]
 ;
   %load = load <vscale x 8 x float>, ptr %ptr, align 4
-  %deinterleave = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %load)
+  %deinterleave = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %load)
   ret { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave
 }
 
@@ -66,7 +66,7 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_f
 ; CHECK-NEXT:    ret { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]]
 ;
   %load = load <vscale x 4 x double>, ptr %ptr, align 8
-  %deinterleave = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %load)
+  %deinterleave = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %load)
   ret { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave
 }
 
@@ -77,7 +77,7 @@ define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2(pt
 ; CHECK-NEXT:    ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]]
 ;
   %load = load <vscale x 4 x ptr>, ptr %ptr, align 8
-  %deinterleave = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.experimental.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %load)
+  %deinterleave = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %load)
   ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleave
 }
 
@@ -87,7 +87,7 @@ define void @interleave_nxi8_factor2(ptr %ptr, <vscale x 16 x i8> %l, <vscale x
 ; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> [[L]], <vscale x 16 x i8> [[R]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), ptr [[PTR]])
 ; CHECK-NEXT:    ret void
 ;
-  %interleave = tail call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %l, <vscale x 16 x i8> %r)
+  %interleave = tail call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %l, <vscale x 16 x i8> %r)
   store <vscale x 32 x i8> %interleave, ptr %ptr, align 1
   ret void
 }
@@ -98,7 +98,7 @@ define void @interleave_nxi16_factor2(ptr %ptr, <vscale x 8 x i16> %l, <vscale x
 ; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> [[L]], <vscale x 8 x i16> [[R]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), ptr [[PTR]])
 ; CHECK-NEXT:    ret void
 ;
-  %interleave = tail call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %l, <vscale x 8 x i16> %r)
+  %interleave = tail call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %l, <vscale x 8 x i16> %r)
   store <vscale x 16 x i16> %interleave, ptr %ptr, align 2
   ret void
 }
@@ -109,7 +109,7 @@ define void @interleave_nxi32_factor2(ptr %ptr, <vscale x 4 x i32> %l, <vscale x
 ; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> [[L]], <vscale x 4 x i32> [[R]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
 ; CHECK-NEXT:    ret void
 ;
-  %interleave = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %l, <vscale x 4 x i32> %r)
+  %interleave = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %l, <vscale x 4 x i32> %r)
   store <vscale x 8 x i32> %interleave, ptr %ptr, align 4
   ret void
 }
@@ -120,7 +120,7 @@ define void @interleave_nxi64_factor2(ptr %ptr, <vscale x 2 x i64> %l, <vscale x
 ; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[L]], <vscale x 2 x i64> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
 ; CHECK-NEXT:    ret void
 ;
-  %interleave = tail call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %l, <vscale x 2 x i64> %r)
+  %interleave = tail call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %l, <vscale x 2 x i64> %r)
   store <vscale x 4 x i64> %interleave, ptr %ptr, align 8
   ret void
 }
@@ -131,7 +131,7 @@ define void @interleave_nxfloat_factor2(ptr %ptr, <vscale x 4 x float> %l, <vsca
 ; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float> [[L]], <vscale x 4 x float> [[R]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
 ; CHECK-NEXT:    ret void
 ;
-  %interleave = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %l, <vscale x 4 x float> %r)
+  %interleave = tail call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %l, <vscale x 4 x float> %r)
   store <vscale x 8 x float> %interleave, ptr %ptr, align 4
   ret void
 }
@@ -142,7 +142,7 @@ define void @interleave_nxdouble_factor2(ptr %ptr, <vscale x 2 x double> %l, <vs
 ; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> [[L]], <vscale x 2 x double> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
 ; CHECK-NEXT:    ret void
 ;
-  %interleave = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %l, <vscale x 2 x double> %r)
+  %interleave = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %l, <vscale x 2 x double> %r)
   store <vscale x 4 x double> %interleave, ptr %ptr, align 4
   ret void
 }
@@ -153,7 +153,7 @@ define void @interleave_nxptr_factor2(ptr %ptr, <vscale x 2 x ptr> %l, <vscale x
 ; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2p0(<vscale x 2 x ptr> [[L]], <vscale x 2 x ptr> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
 ; CHECK-NEXT:    ret void
 ;
-  %interleave = tail call <vscale x 4 x ptr> @llvm.experimental.vector.interleave2.nxv4p0(<vscale x 2 x ptr> %l, <vscale x 2 x ptr> %r)
+  %interleave = tail call <vscale x 4 x ptr> @llvm.vector.interleave2.nxv4p0(<vscale x 2 x ptr> %l, <vscale x 2 x ptr> %r)
   store <vscale x 4 x ptr> %interleave, ptr %ptr, align 4
   ret void
 }
@@ -192,7 +192,7 @@ define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_fac
 ; CHECK-NEXT:    ret { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP22]]
 ;
   %load = load <vscale x 32 x i32>, ptr %ptr, align 4
-  %deinterleave = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
+  %deinterleave = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
   ret { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave
 }
 
@@ -216,7 +216,7 @@ define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdou
 ; CHECK-NEXT:    ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
 ;
   %load = load <vscale x 8 x double>, ptr %ptr, align 8
-  %deinterleave = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %load)
+  %deinterleave = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %load)
   ret { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave
 }
 
@@ -233,32 +233,32 @@ define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l
 ; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP4]])
 ; CHECK-NEXT:    ret void
 ;
-  %interleave = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> %l, <vscale x 4 x double> %r)
+  %interleave = tail call <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double> %l, <vscale x 4 x double> %r)
   store <vscale x 8 x double> %interleave, ptr %ptr, align 4
   ret void
 }
 
-declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.experimental.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr>)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr>)
 
 ; Larger deinterleaves to test 'legalization'
-declare { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
-declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+declare { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
 
-declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare <vscale x 4 x ptr> @llvm.experimental.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, <vscale x 2 x ptr>)
+declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x ptr> @llvm.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, <vscale x 2 x ptr>)
 
 ; Larger interleaves to test 'legalization'
-declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
 
 attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
index 45e2c36836ff..73f26814f3a4 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
@@ -511,7 +511,7 @@ define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_fact
 ; CHECK-NEXT:    ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
 ;
   %wide.vec = load <vscale x 8 x double>, ptr %ptr, align 8
-  %ldN = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %wide.vec)
+  %ldN = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %wide.vec)
   ret { <vscale x 4 x double>, <vscale x 4 x double> } %ldN
 }
 
diff --git a/llvm/test/Transforms/LoopUnroll/unroll-remove-redundant-dbg.ll b/llvm/test/Transforms/LoopUnroll/unroll-remove-redundant-dbg.ll
new file mode 100644
index 000000000000..66cd4d454443
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/unroll-remove-redundant-dbg.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -S -passes=loop-unroll | FileCheck %s
+
+define i64 @d(i1 %tobool.not, i32 %add, i64 %conv23) !dbg !14{
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  ; There should be only one "llvm.dbg.vale" after loop unrolling
+  ; CHECK: call void @llvm.dbg.value
+  ; CHECK-NOT: call void @llvm.dbg.value
+
+  %k.045 = phi i64 [ 0, %entry ], [ %k.046, %for.body ]
+  tail call void @llvm.dbg.value(metadata i32 0, metadata !13, metadata !DIExpression()), !dbg !17
+  %k.046 = add nuw nsw i64 %k.045, 1
+  %exitcond = icmp ne i64 %k.046, 5
+  br i1 %exitcond, label %for.body, label %for.end22
+
+for.end22:                                        ; preds = %for.body
+  ret i64 %k.046
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, producer: "clang version 19.0.0git (https://github.com/llvm/llvm-project.git ec062f5b33ed22c61742e3c1486f6cba915801e0)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "unroll-remove-redundant-dbg.c", directory: "", checksumkind: CSK_MD5, checksum: "aa30a1d8c04deb9b0f3885c258d2b674")
+!2 = !{!3, !8, !10}
+!3 = !DIGlobalVariableExpression(var: !4, expr: !DIExpression())
+!4 = distinct !DIGlobalVariable(name: "a", scope: !0, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true)
+!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "uint32_t", file: !6, line: 198, baseType: !7)
+!6 = !DIFile(filename: "/usr/include/stdint.h", directory: "", checksumkind: CSK_MD5, checksum: "da031bcff2d0c1d65aa92e7e68a44ef3")
+!7 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!8 = !DIGlobalVariableExpression(var: !9, expr: !DIExpression())
+!9 = distinct !DIGlobalVariable(name: "c", scope: !0, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true)
+!10 = !DIGlobalVariableExpression(var: !11, expr: !DIExpression())
+!11 = distinct !DIGlobalVariable(name: "b", scope: !0, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true)
+!12 = !{i32 2, !"Debug Info Version", i32 3}
+!13 = !DILocalVariable(name: "f", scope: !14, file: !1, line: 4, type: !5)
+!14 = distinct !DISubprogram(name: "d", scope: !1, file: !1, line: 3, type: !15, scopeLine: 3, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !16)
+!15 = !DISubroutineType(types: !16)
+!16 = !{}
+!17 = !DILocation(line: 0, scope: !14)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
new file mode 100644
index 000000000000..14b5ee244080
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -0,0 +1,889 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p loop-vectorize -S %s | FileCheck --check-prefix=DEFAULT %s
+; RUN: opt -p loop-vectorize  -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s | FileCheck --check-prefix=PRED %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-macosx14.0.0"
+
+define void @invar_cond_gep_store(ptr %dst, i32 %0) {
+; DEFAULT-LABEL: define void @invar_cond_gep_store(
+; DEFAULT-SAME: ptr [[DST:%.*]], i32 [[TMP0:%.*]]) {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ]
+; DEFAULT-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; DEFAULT-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
+; DEFAULT-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
+; DEFAULT-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; DEFAULT-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; DEFAULT:       pred.store.if:
+; DEFAULT-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
+; DEFAULT-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], 1
+; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP5]]
+; DEFAULT-NEXT:    store i32 1, ptr [[TMP6]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; DEFAULT:       pred.store.continue:
+; DEFAULT-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; DEFAULT-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; DEFAULT:       pred.store.if1:
+; DEFAULT-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 1
+; DEFAULT-NEXT:    [[TMP9:%.*]] = add i64 [[TMP8]], 1
+; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP9]]
+; DEFAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; DEFAULT:       pred.store.continue2:
+; DEFAULT-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; DEFAULT-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; DEFAULT:       pred.store.if3:
+; DEFAULT-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 2
+; DEFAULT-NEXT:    [[TMP13:%.*]] = add i64 [[TMP12]], 1
+; DEFAULT-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP13]]
+; DEFAULT-NEXT:    store i32 1, ptr [[TMP14]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; DEFAULT:       pred.store.continue4:
+; DEFAULT-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; DEFAULT-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; DEFAULT:       pred.store.if5:
+; DEFAULT-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 3
+; DEFAULT-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 1
+; DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP17]]
+; DEFAULT-NEXT:    store i32 1, ptr [[TMP18]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; DEFAULT:       pred.store.continue6:
+; DEFAULT-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; DEFAULT-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; DEFAULT:       pred.store.if7:
+; DEFAULT-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 4
+; DEFAULT-NEXT:    [[TMP21:%.*]] = add i64 [[TMP20]], 1
+; DEFAULT-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP21]]
+; DEFAULT-NEXT:    store i32 1, ptr [[TMP22]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; DEFAULT:       pred.store.continue8:
+; DEFAULT-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; DEFAULT-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; DEFAULT:       pred.store.if9:
+; DEFAULT-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 5
+; DEFAULT-NEXT:    [[TMP25:%.*]] = add i64 [[TMP24]], 1
+; DEFAULT-NEXT:    [[TMP26:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP25]]
+; DEFAULT-NEXT:    store i32 1, ptr [[TMP26]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; DEFAULT:       pred.store.continue10:
+; DEFAULT-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; DEFAULT-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; DEFAULT:       pred.store.if11:
+; DEFAULT-NEXT:    [[TMP28:%.*]] = add i64 [[OFFSET_IDX]], 6
+; DEFAULT-NEXT:    [[TMP29:%.*]] = add i64 [[TMP28]], 1
+; DEFAULT-NEXT:    [[TMP30:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP29]]
+; DEFAULT-NEXT:    store i32 1, ptr [[TMP30]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; DEFAULT:       pred.store.continue12:
+; DEFAULT-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; DEFAULT-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]]
+; DEFAULT:       pred.store.if13:
+; DEFAULT-NEXT:    [[TMP32:%.*]] = add i64 [[OFFSET_IDX]], 7
+; DEFAULT-NEXT:    [[TMP33:%.*]] = add i64 [[TMP32]], 1
+; DEFAULT-NEXT:    [[TMP34:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP33]]
+; DEFAULT-NEXT:    store i32 1, ptr [[TMP34]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE14]]
+; DEFAULT:       pred.store.continue14:
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; DEFAULT-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 97, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT:    br label [[LOOP_HEADER:%.*]]
+; DEFAULT:       loop.header:
+; DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; DEFAULT-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT:    [[CMP9:%.*]] = icmp eq i32 [[TMP0]], 0
+; DEFAULT-NEXT:    br i1 [[CMP9]], label [[THEN:%.*]], label [[LOOP_LATCH]]
+; DEFAULT:       then:
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV_NEXT]]
+; DEFAULT-NEXT:    store i32 1, ptr [[GEP]], align 4
+; DEFAULT-NEXT:    br label [[LOOP_LATCH]]
+; DEFAULT:       loop.latch:
+; DEFAULT-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 100
+; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    ret void
+;
+; PRED-LABEL: define void @invar_cond_gep_store(
+; PRED-SAME: ptr [[DST:%.*]], i32 [[TMP0:%.*]]) {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED:       vector.ph:
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PRED:       vector.body:
+; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; PRED-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; PRED-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
+; PRED-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; PRED-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED:       pred.store.if:
+; PRED-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
+; PRED-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], 1
+; PRED-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP4]]
+; PRED-NEXT:    store i32 1, ptr [[TMP5]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; PRED:       pred.store.continue:
+; PRED-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; PRED-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; PRED:       pred.store.if1:
+; PRED-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 1
+; PRED-NEXT:    [[TMP8:%.*]] = add i64 [[TMP7]], 1
+; PRED-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP8]]
+; PRED-NEXT:    store i32 1, ptr [[TMP9]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; PRED:       pred.store.continue2:
+; PRED-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; PRED-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; PRED:       pred.store.if3:
+; PRED-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 2
+; PRED-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], 1
+; PRED-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP12]]
+; PRED-NEXT:    store i32 1, ptr [[TMP13]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; PRED:       pred.store.continue4:
+; PRED-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; PRED-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
+; PRED:       pred.store.if5:
+; PRED-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; PRED-NEXT:    [[TMP16:%.*]] = add i64 [[TMP15]], 1
+; PRED-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP16]]
+; PRED-NEXT:    store i32 1, ptr [[TMP17]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; PRED:       pred.store.continue6:
+; PRED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; PRED-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; PRED-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PRED:       middle.block:
+; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED:       scalar.ph:
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 101, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; PRED-NEXT:    br label [[LOOP_HEADER:%.*]]
+; PRED:       loop.header:
+; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; PRED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT:    [[CMP9:%.*]] = icmp eq i32 [[TMP0]], 0
+; PRED-NEXT:    br i1 [[CMP9]], label [[THEN:%.*]], label [[LOOP_LATCH]]
+; PRED:       then:
+; PRED-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV_NEXT]]
+; PRED-NEXT:    store i32 1, ptr [[GEP]], align 4
+; PRED-NEXT:    br label [[LOOP_LATCH]]
+; PRED:       loop.latch:
+; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 100
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; PRED:       exit:
+; PRED-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.latch ]
+  %iv.next = add i64 %iv, 1
+  %cmp9 = icmp eq i32 %0, 0
+  br i1 %cmp9, label %then, label %loop.latch
+
+then:
+  %gep = getelementptr i32, ptr %dst, i64 %iv.next
+  store i32 1, ptr %gep, align 4
+  br label %loop.latch
+
+loop.latch:
+  %ec = icmp eq i64 %iv, 100
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+declare double @llvm.fabs.f64(double) #0
+
+define void @loop_dependent_cond(ptr %src, ptr noalias %dst, i64 %N) {
+; DEFAULT-LABEL: define void @loop_dependent_cond(
+; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ]
+; DEFAULT-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP1]]
+; DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP2]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr double, ptr [[TMP3]], i32 0
+; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr double, ptr [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
+; DEFAULT-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP6]], align 8
+; DEFAULT-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD]])
+; DEFAULT-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD1]])
+; DEFAULT-NEXT:    [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP7]], <double 1.000000e+00, double 1.000000e+00>
+; DEFAULT-NEXT:    [[TMP10:%.*]] = fcmp ogt <2 x double> [[TMP8]], <double 1.000000e+00, double 1.000000e+00>
+; DEFAULT-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
+; DEFAULT-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; DEFAULT:       pred.store.if:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; DEFAULT:       pred.store.continue:
+; DEFAULT-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
+; DEFAULT-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
+; DEFAULT:       pred.store.if2:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE3]]
+; DEFAULT:       pred.store.continue3:
+; DEFAULT-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
+; DEFAULT-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
+; DEFAULT:       pred.store.if4:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE5]]
+; DEFAULT:       pred.store.continue5:
+; DEFAULT-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
+; DEFAULT-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]]
+; DEFAULT:       pred.store.if6:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE7]]
+; DEFAULT:       pred.store.continue7:
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; DEFAULT-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_END123:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT:    br label [[FOR_BODY112:%.*]]
+; DEFAULT:       loop.header:
+; DEFAULT-NEXT:    [[IV175:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT176:%.*]], [[FOR_INC121:%.*]] ]
+; DEFAULT-NEXT:    [[ARRAYIDX114:%.*]] = getelementptr double, ptr [[SRC]], i64 [[IV175]]
+; DEFAULT-NEXT:    [[TMP16:%.*]] = load double, ptr [[ARRAYIDX114]], align 8
+; DEFAULT-NEXT:    [[TMP17:%.*]] = tail call double @llvm.fabs.f64(double [[TMP16]])
+; DEFAULT-NEXT:    [[CMP115:%.*]] = fcmp ogt double [[TMP17]], 1.000000e+00
+; DEFAULT-NEXT:    br i1 [[CMP115]], label [[IF_THEN117:%.*]], label [[FOR_INC121]]
+; DEFAULT:       then:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[FOR_INC121]]
+; DEFAULT:       loop.latch:
+; DEFAULT-NEXT:    [[IV_NEXT176]] = add i64 [[IV175]], 1
+; DEFAULT-NEXT:    [[EXITCOND180_NOT:%.*]] = icmp eq i64 [[IV175]], [[N]]
+; DEFAULT-NEXT:    br i1 [[EXITCOND180_NOT]], label [[FOR_END123]], label [[FOR_BODY112]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    ret void
+;
+; PRED-LABEL: define void @loop_dependent_cond(
+; PRED-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    br label [[FOR_BODY112:%.*]]
+; PRED:       loop.header:
+; PRED-NEXT:    [[IV175:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT176:%.*]], [[FOR_INC121:%.*]] ]
+; PRED-NEXT:    [[ARRAYIDX114:%.*]] = getelementptr double, ptr [[SRC]], i64 [[IV175]]
+; PRED-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX114]], align 8
+; PRED-NEXT:    [[TMP1:%.*]] = tail call double @llvm.fabs.f64(double [[TMP0]])
+; PRED-NEXT:    [[CMP115:%.*]] = fcmp ogt double [[TMP1]], 1.000000e+00
+; PRED-NEXT:    br i1 [[CMP115]], label [[IF_THEN117:%.*]], label [[FOR_INC121]]
+; PRED:       then:
+; PRED-NEXT:    store i32 0, ptr [[DST]], align 4
+; PRED-NEXT:    br label [[FOR_INC121]]
+; PRED:       loop.latch:
+; PRED-NEXT:    [[IV_NEXT176]] = add i64 [[IV175]], 1
+; PRED-NEXT:    [[EXITCOND180_NOT:%.*]] = icmp eq i64 [[IV175]], [[N]]
+; PRED-NEXT:    br i1 [[EXITCOND180_NOT]], label [[FOR_END123:%.*]], label [[FOR_BODY112]]
+; PRED:       exit:
+; PRED-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep = getelementptr double, ptr %src, i64 %iv
+  %l = load double, ptr %gep, align 8
+  %abs = tail call double @llvm.fabs.f64(double %l)
+  %cmp = fcmp ogt double %abs, 1.000000e+00
+  br i1 %cmp, label %then, label %loop.latch
+
+then:
+  store i32 0, ptr %dst, align 4
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv, %N
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+define void @invar_cond_chain_1(ptr %I, ptr noalias %src, i1 %c) {
+; DEFAULT-LABEL: define void @invar_cond_chain_1(
+; DEFAULT-SAME: ptr [[I:%.*]], ptr noalias [[SRC:%.*]], i1 [[C:%.*]]) {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; DEFAULT-NEXT:    [[I1:%.*]] = ptrtoint ptr [[I]] to i64
+; DEFAULT-NEXT:    [[TMP29:%.*]] = sub i64 [[I1]], [[SRC2]]
+; DEFAULT-NEXT:    [[TMP0:%.*]] = lshr i64 [[TMP29]], 2
+; DEFAULT-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 8
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[TMP2:%.*]] = mul i64 [[N_VEC]], 4
+; DEFAULT-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]]
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE17:%.*]] ]
+; DEFAULT-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; DEFAULT-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
+; DEFAULT-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 16
+; DEFAULT-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; DEFAULT-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP4]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0
+; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4
+; DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; DEFAULT-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; DEFAULT-NEXT:    [[TMP7:%.*]] = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
+; DEFAULT-NEXT:    [[TMP8:%.*]] = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
+; DEFAULT-NEXT:    [[TMP9:%.*]] = or <4 x i1> [[TMP7]], zeroinitializer
+; DEFAULT-NEXT:    [[TMP10:%.*]] = or <4 x i1> [[TMP8]], zeroinitializer
+; DEFAULT-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0
+; DEFAULT-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; DEFAULT:       pred.store.if:
+; DEFAULT-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
+; DEFAULT-NEXT:    store i32 [[TMP12]], ptr [[I]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; DEFAULT:       pred.store.continue:
+; DEFAULT-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP9]], i32 1
+; DEFAULT-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
+; DEFAULT:       pred.store.if5:
+; DEFAULT-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
+; DEFAULT-NEXT:    store i32 [[TMP14]], ptr [[I]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE5]]
+; DEFAULT:       pred.store.continue6:
+; DEFAULT-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP9]], i32 2
+; DEFAULT-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; DEFAULT:       pred.store.if7:
+; DEFAULT-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
+; DEFAULT-NEXT:    store i32 [[TMP16]], ptr [[I]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE7]]
+; DEFAULT:       pred.store.continue8:
+; DEFAULT-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP9]], i32 3
+; DEFAULT-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; DEFAULT:       pred.store.if9:
+; DEFAULT-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; DEFAULT-NEXT:    store i32 [[TMP18]], ptr [[I]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; DEFAULT:       pred.store.continue10:
+; DEFAULT-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; DEFAULT-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; DEFAULT:       pred.store.if11:
+; DEFAULT-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i32 0
+; DEFAULT-NEXT:    store i32 [[TMP20]], ptr [[I]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; DEFAULT:       pred.store.continue12:
+; DEFAULT-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1
+; DEFAULT-NEXT:    br i1 [[TMP21]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
+; DEFAULT:       pred.store.if13:
+; DEFAULT-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i32 1
+; DEFAULT-NEXT:    store i32 [[TMP22]], ptr [[I]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE13]]
+; DEFAULT:       pred.store.continue14:
+; DEFAULT-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2
+; DEFAULT-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
+; DEFAULT:       pred.store.if15:
+; DEFAULT-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i32 2
+; DEFAULT-NEXT:    store i32 [[TMP24]], ptr [[I]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE15]]
+; DEFAULT:       pred.store.continue16:
+; DEFAULT-NEXT:    [[TMP25:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3
+; DEFAULT-NEXT:    br i1 [[TMP25]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17]]
+; DEFAULT:       pred.store.if17:
+; DEFAULT-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i32 3
+; DEFAULT-NEXT:    store i32 [[TMP26]], ptr [[I]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE17]]
+; DEFAULT:       pred.store.continue18:
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP312_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY:%.*]] ]
+; DEFAULT-NEXT:    br label [[FOR_BODY313:%.*]]
+; DEFAULT:       loop.header:
+; DEFAULT-NEXT:    [[__BEGIN3_011973:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR329:%.*]], [[IF_END327:%.*]] ]
+; DEFAULT-NEXT:    [[TMP28:%.*]] = load i32, ptr [[__BEGIN3_011973]], align 4
+; DEFAULT-NEXT:    br i1 true, label [[IF_ELSE321:%.*]], label [[IF_THEN316:%.*]]
+; DEFAULT:       if:
+; DEFAULT-NEXT:    br label [[IF_END327_SINK_SPLIT:%.*]]
+; DEFAULT:       else.1:
+; DEFAULT-NEXT:    br i1 [[C]], label [[IF_THEN323:%.*]], label [[IF_END327]]
+; DEFAULT:       else.2:
+; DEFAULT-NEXT:    br label [[IF_END327_SINK_SPLIT]]
+; DEFAULT:       split:
+; DEFAULT-NEXT:    store i32 [[TMP28]], ptr [[I]], align 4
+; DEFAULT-NEXT:    br label [[IF_END327]]
+; DEFAULT:       loop.latch:
+; DEFAULT-NEXT:    [[INCDEC_PTR329]] = getelementptr inbounds i8, ptr [[__BEGIN3_011973]], i64 4
+; DEFAULT-NEXT:    [[CMP311_NOT:%.*]] = icmp eq ptr [[__BEGIN3_011973]], [[I]]
+; DEFAULT-NEXT:    br i1 [[CMP311_NOT]], label [[FOR_COND_CLEANUP312_LOOPEXIT]], label [[FOR_BODY313]], !llvm.loop [[LOOP7:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    ret void
+;
+; PRED-LABEL: define void @invar_cond_chain_1(
+; PRED-SAME: ptr [[I:%.*]], ptr noalias [[SRC:%.*]], i1 [[C:%.*]]) {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    br label [[FOR_BODY313:%.*]]
+; PRED:       loop.header:
+; PRED-NEXT:    [[__BEGIN3_011973:%.*]] = phi ptr [ [[SRC]], [[ENTRY:%.*]] ], [ [[INCDEC_PTR329:%.*]], [[IF_END327:%.*]] ]
+; PRED-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__BEGIN3_011973]], align 4
+; PRED-NEXT:    br i1 true, label [[IF_ELSE321:%.*]], label [[IF_THEN316:%.*]]
+; PRED:       if:
+; PRED-NEXT:    br label [[IF_END327_SINK_SPLIT:%.*]]
+; PRED:       else.1:
+; PRED-NEXT:    br i1 [[C]], label [[IF_THEN323:%.*]], label [[IF_END327]]
+; PRED:       else.2:
+; PRED-NEXT:    br label [[IF_END327_SINK_SPLIT]]
+; PRED:       split:
+; PRED-NEXT:    store i32 [[TMP0]], ptr [[I]], align 4
+; PRED-NEXT:    br label [[IF_END327]]
+; PRED:       loop.latch:
+; PRED-NEXT:    [[INCDEC_PTR329]] = getelementptr inbounds i8, ptr [[__BEGIN3_011973]], i64 4
+; PRED-NEXT:    [[CMP311_NOT:%.*]] = icmp eq ptr [[__BEGIN3_011973]], [[I]]
+; PRED-NEXT:    br i1 [[CMP311_NOT]], label [[FOR_COND_CLEANUP312_LOOPEXIT:%.*]], label [[FOR_BODY313]]
+; PRED:       exit:
+; PRED-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %src, %entry ], [ %ptr.iv.next, %loop.latch ]
+  %l = load i32, ptr %ptr.iv, align 4
+  br i1 true, label %else.1, label %if
+
+if:
+  br label %split
+
+else.1:
+  br i1 %c, label %else.2, label %loop.latch
+
+else.2:
+  br label %split
+
+split:
+  store i32 %l, ptr %I, align 4
+  br label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 4
+  %ec = icmp eq ptr %ptr.iv, %I
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+define void @invar_cond_chain_2(ptr %I, ptr noalias %src, ptr noalias %dst, i32 %a) {
+; DEFAULT-LABEL: define void @invar_cond_chain_2(
+; DEFAULT-SAME: ptr [[I:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[A:%.*]]) {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; DEFAULT-NEXT:    [[I1:%.*]] = ptrtoint ptr [[I]] to i64
+; DEFAULT-NEXT:    [[TMP0:%.*]] = sub i64 [[I1]], [[SRC2]]
+; DEFAULT-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
+; DEFAULT-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
+; DEFAULT-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ]
+; DEFAULT-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
+; DEFAULT-NEXT:    [[TMP5:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
+; DEFAULT-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; DEFAULT-NEXT:    [[TMP7:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; DEFAULT-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
+; DEFAULT-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; DEFAULT:       pred.store.if:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; DEFAULT:       pred.store.continue:
+; DEFAULT-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1
+; DEFAULT-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
+; DEFAULT:       pred.store.if3:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE3]]
+; DEFAULT:       pred.store.continue4:
+; DEFAULT-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2
+; DEFAULT-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
+; DEFAULT:       pred.store.if5:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE5]]
+; DEFAULT:       pred.store.continue6:
+; DEFAULT-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3
+; DEFAULT-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; DEFAULT:       pred.store.if7:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE7]]
+; DEFAULT:       pred.store.continue8:
+; DEFAULT-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
+; DEFAULT-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; DEFAULT:       pred.store.if9:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; DEFAULT:       pred.store.continue10:
+; DEFAULT-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1
+; DEFAULT-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; DEFAULT:       pred.store.if11:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; DEFAULT:       pred.store.continue12:
+; DEFAULT-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2
+; DEFAULT-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
+; DEFAULT:       pred.store.if13:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE13]]
+; DEFAULT:       pred.store.continue14:
+; DEFAULT-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3
+; DEFAULT-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]]
+; DEFAULT:       pred.store.if15:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE15]]
+; DEFAULT:       pred.store.continue16:
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP312_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY:%.*]] ]
+; DEFAULT-NEXT:    br label [[FOR_BODY313:%.*]]
+; DEFAULT:       loop.header:
+; DEFAULT-NEXT:    [[__BEGIN3_01197:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR329:%.*]], [[IF_END327:%.*]] ]
+; DEFAULT-NEXT:    [[CMP315_NOT:%.*]] = icmp sgt i32 [[A]], 0
+; DEFAULT-NEXT:    br i1 [[CMP315_NOT]], label [[IF_END327]], label [[IF_THEN316:%.*]]
+; DEFAULT:       if:
+; DEFAULT-NEXT:    br label [[IF_END327_SINK_SPLIT:%.*]]
+; DEFAULT:       else:
+; DEFAULT-NEXT:    store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT:    br label [[IF_END327]]
+; DEFAULT:       loop.latch:
+; DEFAULT-NEXT:    [[INCDEC_PTR329]] = getelementptr inbounds i8, ptr [[__BEGIN3_01197]], i64 4
+; DEFAULT-NEXT:    [[CMP311_NOT:%.*]] = icmp eq ptr [[__BEGIN3_01197]], [[I]]
+; DEFAULT-NEXT:    br i1 [[CMP311_NOT]], label [[FOR_COND_CLEANUP312_LOOPEXIT]], label [[FOR_BODY313]], !llvm.loop [[LOOP9:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    ret void
+;
+; PRED-LABEL: define void @invar_cond_chain_2(
+; PRED-SAME: ptr [[I:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[A:%.*]]) {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; PRED-NEXT:    [[I1:%.*]] = ptrtoint ptr [[I]] to i64
+; PRED-NEXT:    [[TMP0:%.*]] = sub i64 [[I1]], [[SRC2]]
+; PRED-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
+; PRED-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED:       vector.ph:
+; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], 3
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
+; PRED-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; PRED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP2]], 1
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PRED:       vector.body:
+; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; PRED-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; PRED-NEXT:    [[TMP4:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT3]]
+; PRED-NEXT:    [[TMP5:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT5]], zeroinitializer
+; PRED-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; PRED-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP6]], <4 x i1> zeroinitializer
+; PRED-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
+; PRED-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED:       pred.store.if:
+; PRED-NEXT:    store i32 0, ptr [[DST]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; PRED:       pred.store.continue:
+; PRED-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1
+; PRED-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; PRED:       pred.store.if7:
+; PRED-NEXT:    store i32 0, ptr [[DST]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE7]]
+; PRED:       pred.store.continue8:
+; PRED-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2
+; PRED-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; PRED:       pred.store.if9:
+; PRED-NEXT:    store i32 0, ptr [[DST]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; PRED:       pred.store.continue10:
+; PRED-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3
+; PRED-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; PRED:       pred.store.if11:
+; PRED-NEXT:    store i32 0, ptr [[DST]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE11]]
+; PRED:       pred.store.continue12:
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; PRED-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; PRED-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PRED:       middle.block:
+; PRED-NEXT:    br i1 true, label [[FOR_COND_CLEANUP312_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; PRED:       scalar.ph:
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY:%.*]] ]
+; PRED-NEXT:    br label [[FOR_BODY313:%.*]]
+; PRED:       loop.header:
+; PRED-NEXT:    [[__BEGIN3_01197:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR329:%.*]], [[IF_END327:%.*]] ]
+; PRED-NEXT:    [[CMP315_NOT:%.*]] = icmp sgt i32 [[A]], 0
+; PRED-NEXT:    br i1 [[CMP315_NOT]], label [[IF_END327]], label [[IF_THEN316:%.*]]
+; PRED:       if:
+; PRED-NEXT:    br label [[IF_END327_SINK_SPLIT:%.*]]
+; PRED:       else:
+; PRED-NEXT:    store i32 0, ptr [[DST]], align 4
+; PRED-NEXT:    br label [[IF_END327]]
+; PRED:       loop.latch:
+; PRED-NEXT:    [[INCDEC_PTR329]] = getelementptr inbounds i8, ptr [[__BEGIN3_01197]], i64 4
+; PRED-NEXT:    [[CMP311_NOT:%.*]] = icmp eq ptr [[__BEGIN3_01197]], [[I]]
+; PRED-NEXT:    br i1 [[CMP311_NOT]], label [[FOR_COND_CLEANUP312_LOOPEXIT]], label [[FOR_BODY313]], !llvm.loop [[LOOP5:![0-9]+]]
+; PRED:       exit:
+; PRED-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %src, %entry ], [ %ptr.iv.next, %loop.latch ]
+  %cmp315.not = icmp sgt i32 %a, 0
+  br i1 %cmp315.not, label %loop.latch, label %if
+
+if:
+  br label %else
+
+else:
+  store i32 0, ptr %dst, align 4
+  br label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 4
+  %cmp311.not = icmp eq ptr %ptr.iv, %I
+  br i1 %cmp311.not, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+define void @latch_branch_cost(ptr %dst) {
+; DEFAULT-LABEL: define void @latch_branch_cost(
+; DEFAULT-SAME: ptr [[DST:%.*]]) {
+; DEFAULT-NEXT:  iter.check:
+; DEFAULT-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.main.loop.iter.check:
+; DEFAULT-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 16
+; DEFAULT-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP1]]
+; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0
+; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP2]], i32 16
+; DEFAULT-NEXT:    store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1
+; DEFAULT-NEXT:    store <16 x i8> zeroinitializer, ptr [[TMP5]], align 1
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; DEFAULT-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; DEFAULT-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; DEFAULT:       vec.epilog.iter.check:
+; DEFAULT-NEXT:    br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; DEFAULT:       vec.epilog.ph:
+; DEFAULT-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; DEFAULT-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; DEFAULT:       vec.epilog.vector.body:
+; DEFAULT-NEXT:    [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX1]], 0
+; DEFAULT-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; DEFAULT-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; DEFAULT-NEXT:    store <4 x i8> zeroinitializer, ptr [[TMP9]], align 1
+; DEFAULT-NEXT:    [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
+; DEFAULT-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 100
+; DEFAULT-NEXT:    br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; DEFAULT:       vec.epilog.middle.block:
+; DEFAULT-NEXT:    br i1 true, label [[FOR_END]], label [[SCALAR_PH]]
+; DEFAULT:       vec.epilog.scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
+; DEFAULT:       loop:
+; DEFAULT-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDVARS_IV]]
+; DEFAULT-NEXT:    store i8 0, ptr [[ARRAYIDX]], align 1
+; DEFAULT-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; DEFAULT-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
+; DEFAULT-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    ret void
+;
+; PRED-LABEL: define void @latch_branch_cost(
+; PRED-SAME: ptr [[DST:%.*]]) {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED:       vector.ph:
+; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PRED:       vector.body:
+; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; PRED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; PRED-NEXT:    [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], <i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99>
+; PRED-NEXT:    [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; PRED-NEXT:    br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED:       pred.store.if:
+; PRED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
+; PRED-NEXT:    store i8 0, ptr [[TMP3]], align 1
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; PRED:       pred.store.continue:
+; PRED-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; PRED-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; PRED:       pred.store.if1:
+; PRED-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 1
+; PRED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
+; PRED-NEXT:    store i8 0, ptr [[TMP6]], align 1
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; PRED:       pred.store.continue2:
+; PRED-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; PRED-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; PRED:       pred.store.if3:
+; PRED-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 2
+; PRED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
+; PRED-NEXT:    store i8 0, ptr [[TMP9]], align 1
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; PRED:       pred.store.continue4:
+; PRED-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; PRED-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; PRED:       pred.store.if5:
+; PRED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 3
+; PRED-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
+; PRED-NEXT:    store i8 0, ptr [[TMP12]], align 1
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE7]]
+; PRED:       pred.store.continue6:
+; PRED-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; PRED-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; PRED:       pred.store.if7:
+; PRED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 4
+; PRED-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
+; PRED-NEXT:    store i8 0, ptr [[TMP15]], align 1
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; PRED:       pred.store.continue8:
+; PRED-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; PRED-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; PRED:       pred.store.if9:
+; PRED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 5
+; PRED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
+; PRED-NEXT:    store i8 0, ptr [[TMP18]], align 1
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; PRED:       pred.store.continue10:
+; PRED-NEXT:    [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; PRED-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; PRED:       pred.store.if11:
+; PRED-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 6
+; PRED-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP20]]
+; PRED-NEXT:    store i8 0, ptr [[TMP21]], align 1
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; PRED:       pred.store.continue12:
+; PRED-NEXT:    [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; PRED-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE6]]
+; PRED:       pred.store.if13:
+; PRED-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 7
+; PRED-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]]
+; PRED-NEXT:    store i8 0, ptr [[TMP24]], align 1
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; PRED:       pred.store.continue14:
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; PRED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; PRED-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104
+; PRED-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; PRED:       middle.block:
+; PRED-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; PRED:       scalar.ph:
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 104, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; PRED-NEXT:    br label [[FOR_BODY:%.*]]
+; PRED:       loop:
+; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; PRED-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; PRED-NEXT:    store i8 0, ptr [[GEP]], align 1
+; PRED-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
+; PRED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; PRED:       exit:
+; PRED-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 0, ptr %gep, align 1
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+;.
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]}
+;.
+; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; PRED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; PRED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; PRED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; PRED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; PRED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
index 2be525a2abc0..2cc0aa2ffca5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
@@ -22,8 +22,8 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) #0 {
 ; CHECK-VF4UF2-LABEL: @PR33613
 ; CHECK-VF4UF2: vector.body
 ; CHECK-VF4UF2: %[[VEC_RECUR:.*]] = phi <vscale x 4 x double> [ {{.*}}, %vector.ph ], [ {{.*}}, %vector.body ]
-; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %[[VEC_RECUR]], <vscale x 4 x double> {{.*}}, i32 -1)
-; CHECK-VF4UF2-NEXT: %[[SPLICE2:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %{{.*}}, <vscale x 4 x double> %{{.*}}, i32 -1)
+; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %[[VEC_RECUR]], <vscale x 4 x double> {{.*}}, i32 -1)
+; CHECK-VF4UF2-NEXT: %[[SPLICE2:.*]] = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %{{.*}}, <vscale x 4 x double> %{{.*}}, i32 -1)
 ; CHECK-VF4UF2-NOT: insertelement <vscale x 4 x double>
 ; CHECK-VF4UF2: middle.block
 entry:
@@ -71,7 +71,7 @@ define void @PR34711(ptr %a, ptr %b, ptr %c, i64 %n) #0 {
 ; CHECK-VF4UF1: vector.body
 ; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[MGATHER:.*]], %vector.body ]
 ; CHECK-VF4UF1: %[[MGATHER]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> {{.*}}, i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
-; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[MGATHER]], i32 -1)
+; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[MGATHER]], i32 -1)
 ; CHECK-VF4UF1-NEXT: %[[SXT1:.*]] = sext <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x i32>
 ; CHECK-VF4UF1-NEXT: %[[SXT2:.*]] = sext <vscale x 4 x i16> %[[MGATHER]] to <vscale x 4 x i32>
 ; CHECK-VF4UF1-NEXT: mul nsw <vscale x 4 x i32> %[[SXT2]], %[[SXT1]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
new file mode 100644
index 000000000000..c85ae6dba73e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -0,0 +1,969 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p loop-vectorize -S %s | FileCheck --check-prefixes=DEFAULT %s
+; RUN: opt -p loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s | FileCheck --check-prefixes=PRED %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-macosx14.0.0"
+
+define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
+; DEFAULT-LABEL: define void @iv_casts(
+; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT:  iter.check:
+; DEFAULT-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; DEFAULT-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; DEFAULT:       vector.memcheck:
+; DEFAULT-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; DEFAULT-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; DEFAULT-NEXT:    [[TMP6:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; DEFAULT-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; DEFAULT-NEXT:    br i1 [[DIFF_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; DEFAULT:       vector.main.loop.iter.check:
+; DEFAULT-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP0]], [[TMP8]]
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 16
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP10]]
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 16
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[TMP13:%.*]] = trunc <vscale x 8 x i32> [[BROADCAST_SPLAT]] to <vscale x 8 x i16>
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 8
+; DEFAULT-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], 0
+; DEFAULT-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 1
+; DEFAULT-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
+; DEFAULT-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
+; DEFAULT-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP19]]
+; DEFAULT-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; DEFAULT-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP23]], 8
+; DEFAULT-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP24]]
+; DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP22]], align 1
+; DEFAULT-NEXT:    [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP25]], align 1
+; DEFAULT-NEXT:    [[TMP26:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
+; DEFAULT-NEXT:    [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
+; DEFAULT-NEXT:    [[TMP28:%.*]] = mul <vscale x 8 x i16> [[TMP26]], [[TMP13]]
+; DEFAULT-NEXT:    [[TMP29:%.*]] = mul <vscale x 8 x i16> [[TMP27]], [[TMP13]]
+; DEFAULT-NEXT:    [[TMP30:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
+; DEFAULT-NEXT:    [[TMP31:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
+; DEFAULT-NEXT:    [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP30]]
+; DEFAULT-NEXT:    [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP31]]
+; DEFAULT-NEXT:    [[TMP34:%.*]] = lshr <vscale x 8 x i16> [[TMP32]], trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+; DEFAULT-NEXT:    [[TMP35:%.*]] = lshr <vscale x 8 x i16> [[TMP33]], trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+; DEFAULT-NEXT:    [[TMP36:%.*]] = trunc <vscale x 8 x i16> [[TMP34]] to <vscale x 8 x i8>
+; DEFAULT-NEXT:    [[TMP37:%.*]] = trunc <vscale x 8 x i16> [[TMP35]] to <vscale x 8 x i8>
+; DEFAULT-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
+; DEFAULT-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP19]]
+; DEFAULT-NEXT:    [[TMP40:%.*]] = getelementptr i8, ptr [[TMP38]], i32 0
+; DEFAULT-NEXT:    [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP42:%.*]] = mul i64 [[TMP41]], 8
+; DEFAULT-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[TMP38]], i64 [[TMP42]]
+; DEFAULT-NEXT:    store <vscale x 8 x i8> [[TMP36]], ptr [[TMP40]], align 1
+; DEFAULT-NEXT:    store <vscale x 8 x i8> [[TMP37]], ptr [[TMP43]], align 1
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; DEFAULT-NEXT:    [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; DEFAULT:       vec.epilog.iter.check:
+; DEFAULT-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT:    [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP46:%.*]] = mul i64 [[TMP45]], 4
+; DEFAULT-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP46]]
+; DEFAULT-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; DEFAULT:       vec.epilog.ph:
+; DEFAULT-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; DEFAULT-NEXT:    [[TMP47:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP47]], 4
+; DEFAULT-NEXT:    [[N_MOD_VF5:%.*]] = urem i64 [[TMP0]], [[TMP48]]
+; DEFAULT-NEXT:    [[N_VEC6:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF5]]
+; DEFAULT-NEXT:    [[TMP49:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP49]], 4
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT8]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[TMP51:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT9]] to <vscale x 4 x i16>
+; DEFAULT-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; DEFAULT:       vec.epilog.vector.body:
+; DEFAULT-NEXT:    [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP52:%.*]] = add i64 [[INDEX10]], 0
+; DEFAULT-NEXT:    [[TMP53:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP52]]
+; DEFAULT-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[TMP53]], i32 0
+; DEFAULT-NEXT:    [[WIDE_LOAD11:%.*]] = load <vscale x 4 x i8>, ptr [[TMP54]], align 1
+; DEFAULT-NEXT:    [[TMP55:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD11]] to <vscale x 4 x i16>
+; DEFAULT-NEXT:    [[TMP56:%.*]] = mul <vscale x 4 x i16> [[TMP55]], [[TMP51]]
+; DEFAULT-NEXT:    [[TMP57:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD11]] to <vscale x 4 x i16>
+; DEFAULT-NEXT:    [[TMP58:%.*]] = or <vscale x 4 x i16> [[TMP56]], [[TMP57]]
+; DEFAULT-NEXT:    [[TMP59:%.*]] = lshr <vscale x 4 x i16> [[TMP58]], trunc (<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i16>)
+; DEFAULT-NEXT:    [[TMP60:%.*]] = trunc <vscale x 4 x i16> [[TMP59]] to <vscale x 4 x i8>
+; DEFAULT-NEXT:    [[TMP61:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP52]]
+; DEFAULT-NEXT:    [[TMP62:%.*]] = getelementptr i8, ptr [[TMP61]], i32 0
+; DEFAULT-NEXT:    store <vscale x 4 x i8> [[TMP60]], ptr [[TMP62]], align 1
+; DEFAULT-NEXT:    [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], [[TMP50]]
+; DEFAULT-NEXT:    [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC6]]
+; DEFAULT-NEXT:    br i1 [[TMP63]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT:       vec.epilog.middle.block:
+; DEFAULT-NEXT:    [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC6]]
+; DEFAULT-NEXT:    br i1 [[CMP_N7]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; DEFAULT:       vec.epilog.scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; DEFAULT-NEXT:    br label [[LOOP:%.*]]
+; DEFAULT:       loop:
+; DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; DEFAULT-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; DEFAULT-NEXT:    [[L_EXT:%.*]] = zext i8 [[L]] to i32
+; DEFAULT-NEXT:    [[MUL16_US:%.*]] = mul i32 [[L_EXT]], [[X]]
+; DEFAULT-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT:    [[CONV25_US:%.*]] = zext i8 [[L]] to i32
+; DEFAULT-NEXT:    [[ADD34_US:%.*]] = or i32 [[MUL16_US]], [[CONV25_US]]
+; DEFAULT-NEXT:    [[SHR35_US:%.*]] = lshr i32 [[ADD34_US]], 1
+; DEFAULT-NEXT:    [[CONV36_US:%.*]] = trunc i32 [[SHR35_US]] to i8
+; DEFAULT-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; DEFAULT-NEXT:    store i8 [[CONV36_US]], ptr [[GEP_DST]], align 1
+; DEFAULT-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    ret void
+;
+; PRED-LABEL: define void @iv_casts(
+; PRED-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; PRED-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; PRED:       vector.memcheck:
+; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; PRED-NEXT:    [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; PRED-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
+; PRED-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED:       vector.ph:
+; PRED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; PRED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; PRED-NEXT:    [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP8]]
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; PRED-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
+; PRED-NEXT:    [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], [[TMP12]]
+; PRED-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; PRED-NEXT:    [[TMP16:%.*]] = trunc <vscale x 8 x i32> [[BROADCAST_SPLAT]] to <vscale x 8 x i16>
+; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PRED:       vector.body:
+; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
+; PRED-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
+; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; PRED-NEXT:    [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
+; PRED-NEXT:    [[TMP21:%.*]] = mul <vscale x 8 x i16> [[TMP20]], [[TMP16]]
+; PRED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
+; PRED-NEXT:    [[TMP23:%.*]] = or <vscale x 8 x i16> [[TMP21]], [[TMP22]]
+; PRED-NEXT:    [[TMP24:%.*]] = lshr <vscale x 8 x i16> [[TMP23]], trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+; PRED-NEXT:    [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP24]] to <vscale x 8 x i8>
+; PRED-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
+; PRED-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
+; PRED-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
+; PRED-NEXT:    [[TMP28:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP29:%.*]] = extractelement <vscale x 8 x i1> [[TMP28]], i32 0
+; PRED-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PRED:       middle.block:
+; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED:       scalar.ph:
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; PRED-NEXT:    br label [[LOOP:%.*]]
+; PRED:       loop:
+; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; PRED-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; PRED-NEXT:    [[L_EXT:%.*]] = zext i8 [[L]] to i32
+; PRED-NEXT:    [[MUL16_US:%.*]] = mul i32 [[L_EXT]], [[X]]
+; PRED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT:    [[CONV25_US:%.*]] = zext i8 [[L]] to i32
+; PRED-NEXT:    [[ADD34_US:%.*]] = or i32 [[MUL16_US]], [[CONV25_US]]
+; PRED-NEXT:    [[SHR35_US:%.*]] = lshr i32 [[ADD34_US]], 1
+; PRED-NEXT:    [[CONV36_US:%.*]] = trunc i32 [[SHR35_US]] to i8
+; PRED-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; PRED-NEXT:    store i8 [[CONV36_US]], ptr [[GEP_DST]], align 1
+; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; PRED:       exit:
+; PRED-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.src  = getelementptr i8, ptr %src, i64 %iv
+  %l = load i8, ptr %gep.src, align 1
+  %l.ext = zext i8 %l to i32
+  %mul = mul i32 %l.ext, %x
+  %iv.next = add i64 %iv, 1
+  %l.ext.2 = zext i8 %l to i32
+  %or = or i32 %mul, %l.ext.2
+  %lshr  = lshr i32 %or, 1
+  %trunc = trunc i32 %lshr to i8
+  %gep.dst = getelementptr i8, ptr %dst, i64 %iv
+  store i8 %trunc, ptr %gep.dst, align 1
+  %ec = icmp eq i64 %iv, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
+; DEFAULT-LABEL: define void @iv_trunc(
+; DEFAULT-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[MUL_X:%.*]] = add i32 [[X]], 1
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; DEFAULT:       vector.scevcheck:
+; DEFAULT-NEXT:    [[TMP1:%.*]] = sub i32 -1, [[X]]
+; DEFAULT-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[MUL_X]], 0
+; DEFAULT-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[MUL_X]]
+; DEFAULT-NEXT:    [[TMP4:%.*]] = trunc i64 [[N]] to i32
+; DEFAULT-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]])
+; DEFAULT-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; DEFAULT-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; DEFAULT-NEXT:    [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]]
+; DEFAULT-NEXT:    [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0
+; DEFAULT-NEXT:    [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false
+; DEFAULT-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; DEFAULT-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295
+; DEFAULT-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[MUL_X]], 0
+; DEFAULT-NEXT:    [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]]
+; DEFAULT-NEXT:    [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
+; DEFAULT-NEXT:    br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP13:%.*]] = trunc i64 [[INDEX]] to i32
+; DEFAULT-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], 0
+; DEFAULT-NEXT:    [[TMP15:%.*]] = add i32 [[TMP13]], 1
+; DEFAULT-NEXT:    [[TMP16:%.*]] = mul i32 [[MUL_X]], [[TMP14]]
+; DEFAULT-NEXT:    [[TMP17:%.*]] = mul i32 [[MUL_X]], [[TMP15]]
+; DEFAULT-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
+; DEFAULT-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; DEFAULT-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP18]]
+; DEFAULT-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]]
+; DEFAULT-NEXT:    store i32 1, ptr [[TMP20]], align 4
+; DEFAULT-NEXT:    store i32 1, ptr [[TMP21]], align 4
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; DEFAULT-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
+; DEFAULT:       for.body:
+; DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DEFAULT-NEXT:    [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32
+; DEFAULT-NEXT:    [[ADD_I:%.*]] = mul i32 [[MUL_X]], [[TRUNC_IV]]
+; DEFAULT-NEXT:    [[IV_MUL:%.*]] = zext i32 [[ADD_I]] to i64
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV_MUL]]
+; DEFAULT-NEXT:    store i32 1, ptr [[GEP]], align 4
+; DEFAULT-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    ret void
+;
+; PRED-LABEL: define void @iv_trunc(
+; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    [[MUL_X:%.*]] = add i32 [[X]], 1
+; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; PRED:       vector.scevcheck:
+; PRED-NEXT:    [[TMP1:%.*]] = sub i32 -1, [[X]]
+; PRED-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[MUL_X]], 0
+; PRED-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[MUL_X]]
+; PRED-NEXT:    [[TMP4:%.*]] = trunc i64 [[N]] to i32
+; PRED-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]])
+; PRED-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; PRED-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; PRED-NEXT:    [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]]
+; PRED-NEXT:    [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0
+; PRED-NEXT:    [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false
+; PRED-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; PRED-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295
+; PRED-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[MUL_X]], 0
+; PRED-NEXT:    [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]]
+; PRED-NEXT:    [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
+; PRED-NEXT:    br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED:       vector.ph:
+; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], 2
+; PRED-NEXT:    [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2
+; PRED-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL_X]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PRED:       vector.body:
+; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; PRED-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; PRED-NEXT:    [[TMP16:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; PRED-NEXT:    [[TMP17:%.*]] = zext <2 x i32> [[TMP16]] to <2 x i64>
+; PRED-NEXT:    [[TMP18:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; PRED-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED:       pred.store.if:
+; PRED-NEXT:    [[TMP19:%.*]] = extractelement <2 x i64> [[TMP17]], i32 0
+; PRED-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]]
+; PRED-NEXT:    store i32 1, ptr [[TMP20]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; PRED:       pred.store.continue:
+; PRED-NEXT:    [[TMP21:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; PRED-NEXT:    br i1 [[TMP21]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; PRED:       pred.store.if1:
+; PRED-NEXT:    [[TMP22:%.*]] = extractelement <2 x i64> [[TMP17]], i32 1
+; PRED-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP22]]
+; PRED-NEXT:    store i32 1, ptr [[TMP23]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; PRED:       pred.store.continue2:
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]])
+; PRED-NEXT:    [[TMP24:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true>
+; PRED-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; PRED-NEXT:    [[TMP25:%.*]] = extractelement <2 x i1> [[TMP24]], i32 0
+; PRED-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PRED:       middle.block:
+; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED:       scalar.ph:
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT:    br label [[FOR_BODY:%.*]]
+; PRED:       for.body:
+; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; PRED-NEXT:    [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32
+; PRED-NEXT:    [[ADD_I:%.*]] = mul i32 [[MUL_X]], [[TRUNC_IV]]
+; PRED-NEXT:    [[IV_MUL:%.*]] = zext i32 [[ADD_I]] to i64
+; PRED-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV_MUL]]
+; PRED-NEXT:    store i32 1, ptr [[GEP]], align 4
+; PRED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PRED:       exit:
+; PRED-NEXT:    ret void
+;
+entry:
+  %mul.x = add i32 %x, 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %trunc.iv = trunc i64 %iv to i32
+  %add.i = mul i32 %mul.x, %trunc.iv
+  %iv.mul = zext i32 %add.i to i64
+  %gep = getelementptr i32, ptr %dst, i64 %iv.mul
+  store i32 1, ptr %gep, align 4
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv, %N
+  br i1 %ec, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
+; DEFAULT-LABEL: define void @trunc_ivs_and_store(
+; DEFAULT-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[MUL:%.*]] = mul i32 [[X]], [[X]]
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; DEFAULT:       vector.scevcheck:
+; DEFAULT-NEXT:    [[TMP1:%.*]] = mul i32 [[X]], [[X]]
+; DEFAULT-NEXT:    [[TMP2:%.*]] = sub i32 0, [[TMP1]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = icmp slt i32 [[MUL]], 0
+; DEFAULT-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 [[MUL]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = trunc i64 [[N]] to i32
+; DEFAULT-NEXT:    [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]])
+; DEFAULT-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
+; DEFAULT-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
+; DEFAULT-NEXT:    [[TMP6:%.*]] = sub i32 0, [[MUL_RESULT]]
+; DEFAULT-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], 0
+; DEFAULT-NEXT:    [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false
+; DEFAULT-NEXT:    [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
+; DEFAULT-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[N]], 4294967295
+; DEFAULT-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[MUL]], 0
+; DEFAULT-NEXT:    [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
+; DEFAULT-NEXT:    [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
+; DEFAULT-NEXT:    br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; DEFAULT-NEXT:    [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 0
+; DEFAULT-NEXT:    [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 1
+; DEFAULT-NEXT:    [[TMP16:%.*]] = trunc i64 [[INDEX]] to i32
+; DEFAULT-NEXT:    [[TMP17:%.*]] = add i32 [[TMP16]], 0
+; DEFAULT-NEXT:    [[TMP18:%.*]] = add i32 [[TMP16]], 1
+; DEFAULT-NEXT:    [[TMP19:%.*]] = mul i32 [[MUL]], [[TMP17]]
+; DEFAULT-NEXT:    [[TMP20:%.*]] = mul i32 [[MUL]], [[TMP18]]
+; DEFAULT-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
+; DEFAULT-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
+; DEFAULT-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP21]]
+; DEFAULT-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP22]]
+; DEFAULT-NEXT:    store i32 [[TMP14]], ptr [[TMP23]], align 4
+; DEFAULT-NEXT:    store i32 [[TMP15]], ptr [[TMP24]], align 4
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; DEFAULT-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT:    br label [[LOOP:%.*]]
+; DEFAULT:       loop:
+; DEFAULT-NEXT:    [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[IV_1_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32
+; DEFAULT-NEXT:    [[IV_1_MUL:%.*]] = mul i32 [[MUL]], [[IV_1_TRUNC]]
+; DEFAULT-NEXT:    [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; DEFAULT-NEXT:    [[MUL_EXT:%.*]] = zext i32 [[IV_1_MUL]] to i64
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[MUL_EXT]]
+; DEFAULT-NEXT:    store i32 [[IV_2]], ptr [[GEP]], align 4
+; DEFAULT-NEXT:    [[IV_1_NEXT]] = add i64 [[IV_1]], 1
+; DEFAULT-NEXT:    [[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]]
+; DEFAULT-NEXT:    br i1 [[EXITCOND_3_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    ret void
+;
+; PRED-LABEL: define void @trunc_ivs_and_store(
+; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    [[MUL:%.*]] = mul i32 [[X]], [[X]]
+; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; PRED:       vector.scevcheck:
+; PRED-NEXT:    [[TMP1:%.*]] = mul i32 [[X]], [[X]]
+; PRED-NEXT:    [[TMP2:%.*]] = sub i32 0, [[TMP1]]
+; PRED-NEXT:    [[TMP3:%.*]] = icmp slt i32 [[MUL]], 0
+; PRED-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 [[MUL]]
+; PRED-NEXT:    [[TMP5:%.*]] = trunc i64 [[N]] to i32
+; PRED-NEXT:    [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]])
+; PRED-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
+; PRED-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
+; PRED-NEXT:    [[TMP6:%.*]] = sub i32 0, [[MUL_RESULT]]
+; PRED-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], 0
+; PRED-NEXT:    [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false
+; PRED-NEXT:    [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
+; PRED-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[N]], 4294967295
+; PRED-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[MUL]], 0
+; PRED-NEXT:    [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
+; PRED-NEXT:    [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
+; PRED-NEXT:    br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED:       vector.ph:
+; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; PRED-NEXT:    [[TMP14:%.*]] = sub i64 [[TMP0]], 4
+; PRED-NEXT:    [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4
+; PRED-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PRED:       vector.body:
+; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
+; PRED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
+; PRED-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; PRED-NEXT:    [[TMP17:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; PRED-NEXT:    [[TMP18:%.*]] = zext <4 x i32> [[TMP17]] to <4 x i64>
+; PRED-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; PRED-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED:       pred.store.if:
+; PRED-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0
+; PRED-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]]
+; PRED-NEXT:    [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 0
+; PRED-NEXT:    store i32 [[TMP22]], ptr [[TMP21]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; PRED:       pred.store.continue:
+; PRED-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; PRED-NEXT:    br i1 [[TMP23]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; PRED:       pred.store.if3:
+; PRED-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1
+; PRED-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP24]]
+; PRED-NEXT:    [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 1
+; PRED-NEXT:    store i32 [[TMP26]], ptr [[TMP25]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; PRED:       pred.store.continue4:
+; PRED-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
+; PRED-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; PRED:       pred.store.if5:
+; PRED-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2
+; PRED-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP28]]
+; PRED-NEXT:    [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2
+; PRED-NEXT:    store i32 [[TMP30]], ptr [[TMP29]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; PRED:       pred.store.continue6:
+; PRED-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
+; PRED-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; PRED:       pred.store.if7:
+; PRED-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3
+; PRED-NEXT:    [[TMP33:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP32]]
+; PRED-NEXT:    [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], 3
+; PRED-NEXT:    store i32 [[TMP34]], ptr [[TMP33]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; PRED:       pred.store.continue8:
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]])
+; PRED-NEXT:    [[TMP35:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
+; PRED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; PRED-NEXT:    [[TMP36:%.*]] = extractelement <4 x i1> [[TMP35]], i32 0
+; PRED-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; PRED:       middle.block:
+; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED:       scalar.ph:
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT:    br label [[LOOP:%.*]]
+; PRED:       loop:
+; PRED-NEXT:    [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[IV_1_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32
+; PRED-NEXT:    [[IV_1_MUL:%.*]] = mul i32 [[MUL]], [[IV_1_TRUNC]]
+; PRED-NEXT:    [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; PRED-NEXT:    [[MUL_EXT:%.*]] = zext i32 [[IV_1_MUL]] to i64
+; PRED-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[MUL_EXT]]
+; PRED-NEXT:    store i32 [[IV_2]], ptr [[GEP]], align 4
+; PRED-NEXT:    [[IV_1_NEXT]] = add i64 [[IV_1]], 1
+; PRED-NEXT:    [[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]]
+; PRED-NEXT:    br i1 [[EXITCOND_3_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; PRED:       exit:
+; PRED-NEXT:    ret void
+;
+entry:
+  %mul = mul i32 %x, %x
+  br label %loop
+
+loop:
+  %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop ]
+  %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ]
+  %iv.1.trunc = trunc i64 %iv.1 to i32
+  %iv.1.mul = mul i32 %mul, %iv.1.trunc
+  %iv.2.next = add i32 %iv.2, 1
+  %mul.ext = zext i32 %iv.1.mul to i64
+  %gep = getelementptr i32, ptr %dst, i64 %mul.ext
+  store i32 %iv.2, ptr %gep, align 4
+  %iv.1.next = add i64 %iv.1, 1
+  %exitcond.3.not = icmp eq i64 %iv.1, %N
+  br i1 %exitcond.3.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
+; DEFAULT-LABEL: define void @ivs_trunc_and_ext(
+; DEFAULT-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[ADD:%.*]] = add i32 [[X]], 1
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; DEFAULT:       vector.scevcheck:
+; DEFAULT-NEXT:    [[TMP1:%.*]] = sub i32 -1, [[X]]
+; DEFAULT-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[ADD]], 0
+; DEFAULT-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[ADD]]
+; DEFAULT-NEXT:    [[TMP4:%.*]] = trunc i64 [[N]] to i32
+; DEFAULT-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]])
+; DEFAULT-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; DEFAULT-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; DEFAULT-NEXT:    [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]]
+; DEFAULT-NEXT:    [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0
+; DEFAULT-NEXT:    [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false
+; DEFAULT-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; DEFAULT-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295
+; DEFAULT-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[ADD]], 0
+; DEFAULT-NEXT:    [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]]
+; DEFAULT-NEXT:    [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
+; DEFAULT-NEXT:    br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; DEFAULT-NEXT:    [[TMP13:%.*]] = add i32 [[OFFSET_IDX]], 0
+; DEFAULT-NEXT:    [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 1
+; DEFAULT-NEXT:    [[TMP15:%.*]] = trunc i64 [[INDEX]] to i32
+; DEFAULT-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 0
+; DEFAULT-NEXT:    [[TMP17:%.*]] = add i32 [[TMP15]], 1
+; DEFAULT-NEXT:    [[TMP18:%.*]] = mul i32 [[ADD]], [[TMP16]]
+; DEFAULT-NEXT:    [[TMP19:%.*]] = mul i32 [[ADD]], [[TMP17]]
+; DEFAULT-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP18]] to i64
+; DEFAULT-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
+; DEFAULT-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]]
+; DEFAULT-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP21]]
+; DEFAULT-NEXT:    store i32 [[TMP13]], ptr [[TMP22]], align 4
+; DEFAULT-NEXT:    store i32 [[TMP14]], ptr [[TMP23]], align 4
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; DEFAULT-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT:    br label [[LOOP:%.*]]
+; DEFAULT:       loop:
+; DEFAULT-NEXT:    [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32
+; DEFAULT-NEXT:    [[IV_MUL:%.*]] = mul i32 [[ADD]], [[IV_TRUNC]]
+; DEFAULT-NEXT:    [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; DEFAULT-NEXT:    [[EXT:%.*]] = zext i32 [[IV_MUL]] to i64
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[EXT]]
+; DEFAULT-NEXT:    store i32 [[IV_2]], ptr [[GEP]], align 4
+; DEFAULT-NEXT:    [[IV_1_NEXT]] = add i64 [[IV_1]], 1
+; DEFAULT-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]]
+; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    ret void
+;
+; PRED-LABEL: define void @ivs_trunc_and_ext(
+; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    [[ADD:%.*]] = add i32 [[X]], 1
+; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; PRED:       vector.scevcheck:
+; PRED-NEXT:    [[TMP1:%.*]] = sub i32 -1, [[X]]
+; PRED-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[ADD]], 0
+; PRED-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[ADD]]
+; PRED-NEXT:    [[TMP4:%.*]] = trunc i64 [[N]] to i32
+; PRED-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]])
+; PRED-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; PRED-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; PRED-NEXT:    [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]]
+; PRED-NEXT:    [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0
+; PRED-NEXT:    [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false
+; PRED-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; PRED-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295
+; PRED-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[ADD]], 0
+; PRED-NEXT:    [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]]
+; PRED-NEXT:    [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
+; PRED-NEXT:    br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED:       vector.ph:
+; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], 4
+; PRED-NEXT:    [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4
+; PRED-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PRED:       vector.body:
+; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ]
+; PRED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ]
+; PRED-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; PRED-NEXT:    [[TMP16:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; PRED-NEXT:    [[TMP17:%.*]] = zext <4 x i32> [[TMP16]] to <4 x i64>
+; PRED-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; PRED-NEXT:    br i1 [[TMP18]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED:       pred.store.if:
+; PRED-NEXT:    [[TMP19:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
+; PRED-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]]
+; PRED-NEXT:    [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 0
+; PRED-NEXT:    store i32 [[TMP21]], ptr [[TMP20]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; PRED:       pred.store.continue:
+; PRED-NEXT:    [[TMP22:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; PRED-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
+; PRED:       pred.store.if2:
+; PRED-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
+; PRED-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP23]]
+; PRED-NEXT:    [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 1
+; PRED-NEXT:    store i32 [[TMP25]], ptr [[TMP24]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE3]]
+; PRED:       pred.store.continue3:
+; PRED-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
+; PRED-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
+; PRED:       pred.store.if4:
+; PRED-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
+; PRED-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]]
+; PRED-NEXT:    [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 2
+; PRED-NEXT:    store i32 [[TMP29]], ptr [[TMP28]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE5]]
+; PRED:       pred.store.continue5:
+; PRED-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
+; PRED-NEXT:    br i1 [[TMP30]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]]
+; PRED:       pred.store.if6:
+; PRED-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
+; PRED-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]]
+; PRED-NEXT:    [[TMP33:%.*]] = add i32 [[OFFSET_IDX]], 3
+; PRED-NEXT:    store i32 [[TMP33]], ptr [[TMP32]], align 4
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE7]]
+; PRED:       pred.store.continue7:
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]])
+; PRED-NEXT:    [[TMP34:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
+; PRED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; PRED-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[TMP34]], i32 0
+; PRED-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; PRED:       middle.block:
+; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED:       scalar.ph:
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT:    br label [[LOOP:%.*]]
+; PRED:       loop:
+; PRED-NEXT:    [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32
+; PRED-NEXT:    [[IV_MUL:%.*]] = mul i32 [[ADD]], [[IV_TRUNC]]
+; PRED-NEXT:    [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; PRED-NEXT:    [[EXT:%.*]] = zext i32 [[IV_MUL]] to i64
+; PRED-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[EXT]]
+; PRED-NEXT:    store i32 [[IV_2]], ptr [[GEP]], align 4
+; PRED-NEXT:    [[IV_1_NEXT]] = add i64 [[IV_1]], 1
+; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]]
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; PRED:       exit:
+; PRED-NEXT:    ret void
+;
+entry:
+  %add = add i32 %x, 1
+  br label %loop
+
+loop:
+  %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop ]
+  %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ]
+  %iv.trunc = trunc i64 %iv.1 to i32
+  %iv.mul = mul i32 %add, %iv.trunc
+  %iv.2.next = add i32 %iv.2, 1
+  %ext = zext i32 %iv.mul to i64
+  %gep = getelementptr i32, ptr %dst, i64 %ext
+  store i32 %iv.2, ptr %gep, align 4
+  %iv.1.next = add i64 %iv.1, 1
+  %ec = icmp eq i64 %iv.1, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
+; DEFAULT-LABEL: define void @exit_cond_zext_iv(
+; DEFAULT-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 2
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; DEFAULT:       vector.scevcheck:
+; DEFAULT-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[UMAX]], -1
+; DEFAULT-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; DEFAULT-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
+; DEFAULT-NEXT:    [[TMP3:%.*]] = add i32 1, [[TMP2]]
+; DEFAULT-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1
+; DEFAULT-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; DEFAULT-NEXT:    [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
+; DEFAULT-NEXT:    br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 2
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 1
+; DEFAULT-NEXT:    [[TMP9:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP7]], i32 2
+; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP8]], i32 2
+; DEFAULT-NEXT:    store i32 0, ptr [[TMP9]], align 8
+; DEFAULT-NEXT:    store i32 0, ptr [[TMP10]], align 8
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; DEFAULT-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT:    br label [[LOOP:%.*]]
+; DEFAULT:       loop:
+; DEFAULT-NEXT:    [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[IV_CONV:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_EXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV]], i32 2
+; DEFAULT-NEXT:    store i32 0, ptr [[GEP]], align 8
+; DEFAULT-NEXT:    [[IV_1_NEXT]] = add i32 [[IV_1]], 1
+; DEFAULT-NEXT:    [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64
+; DEFAULT-NEXT:    [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]]
+; DEFAULT-NEXT:    br i1 [[C]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP12:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    ret void
+;
+; PRED-LABEL: define void @exit_cond_zext_iv(
+; PRED-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; PRED:       vector.scevcheck:
+; PRED-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[UMAX]], -1
+; PRED-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; PRED-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
+; PRED-NEXT:    [[TMP3:%.*]] = add i32 1, [[TMP2]]
+; PRED-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1
+; PRED-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; PRED-NEXT:    [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
+; PRED-NEXT:    br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED:       vector.ph:
+; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; PRED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
+; PRED-NEXT:    br label [[LOOP:%.*]]
+; PRED:       vector.body:
+; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; PRED-NEXT:    [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
+; PRED-NEXT:    [[TMP7:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
+; PRED-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; PRED-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED:       pred.store.if:
+; PRED-NEXT:    [[IV_CONV:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT:    [[GEP:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV]], i32 2
+; PRED-NEXT:    store i32 0, ptr [[GEP]], align 8
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; PRED:       pred.store.continue:
+; PRED-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; PRED-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
+; PRED:       pred.store.if5:
+; PRED-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 1
+; PRED-NEXT:    [[TMP13:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP12]], i32 2
+; PRED-NEXT:    store i32 0, ptr [[TMP13]], align 8
+; PRED-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; PRED:       pred.store.continue6:
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; PRED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; PRED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; PRED:       middle.block:
+; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED:       scalar.ph:
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT:    br label [[LOOP1:%.*]]
+; PRED:       loop:
+; PRED-NEXT:    [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP1]] ]
+; PRED-NEXT:    [[IV_CONV1:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_EXT:%.*]], [[LOOP1]] ]
+; PRED-NEXT:    [[GEP1:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV1]], i32 2
+; PRED-NEXT:    store i32 0, ptr [[GEP1]], align 8
+; PRED-NEXT:    [[IV_1_NEXT]] = add i32 [[IV_1]], 1
+; PRED-NEXT:    [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64
+; PRED-NEXT:    [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]]
+; PRED-NEXT:    br i1 [[C]], label [[LOOP1]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; PRED:       exit:
+; PRED-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop ]
+  %iv.conv = phi i64 [ 0, %entry ], [ %iv.ext, %loop ]
+  %gep = getelementptr {[100 x i32], i32, i32}, ptr %dst, i64 %iv.conv, i32 2
+  store i32 0, ptr %gep, align 8
+  %iv.1.next = add i32 %iv.1, 1
+  %iv.ext = zext i32 %iv.1.next to i64
+  %c = icmp ult i64 %iv.ext, %N
+  br i1 %c, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
+
+;.
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
+; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]}
+; DEFAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]]}
+; DEFAULT: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]}
+; DEFAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]}
+;.
+; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; PRED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; PRED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; PRED: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; PRED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; PRED: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; PRED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; PRED: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]}
+; PRED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; PRED: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index 931ab4f77618..4d9c850abdf3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -9,37 +9,37 @@ define i32 @multi_exit_iv_uniform(i32 %a, i64 %N, ptr %dst) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 2147483648)
 ; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[UMIN]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 8, i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i32> [[BROADCAST_SPLAT]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i32> [[BROADCAST_SPLAT]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP5]], i32 2
-; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr [[TMP8]], align 8
-; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP10]] = add <2 x i32> [[VEC_PHI]], <i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP11]] = add <2 x i32> [[VEC_PHI1]], <i32 -1, i32 -1>
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP5]], i32 4
+; CHECK-NEXT:    store <4 x i64> [[TMP7]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    store <4 x i64> [[TMP7]], ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP10]] = add <4 x i32> [[VEC_PHI]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP11]] = add <4 x i32> [[VEC_PHI1]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <2 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
 ; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -285,6 +285,264 @@ loop:
 exit:
   ret void
 }
+
+define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
+; DEFAULT-LABEL: define i64 @test_ptr_ivs_and_widened_ivs(
+; DEFAULT-SAME: ptr [[SRC:%.*]], i32 [[N:%.*]]) {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; DEFAULT-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; DEFAULT-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
+; DEFAULT-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; DEFAULT-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT:    [[IND_END3:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; DEFAULT-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; DEFAULT-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
+; DEFAULT-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 16
+; DEFAULT-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP4]]
+; DEFAULT-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]]
+; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0
+; DEFAULT-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4
+; DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; DEFAULT-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; DEFAULT-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; DEFAULT-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[WIDE_LOAD7]], <i32 1, i32 1, i32 1, i32 1>
+; DEFAULT-NEXT:    [[TMP10:%.*]] = zext <4 x i32> [[TMP8]] to <4 x i64>
+; DEFAULT-NEXT:    [[TMP11:%.*]] = zext <4 x i32> [[TMP9]] to <4 x i64>
+; DEFAULT-NEXT:    [[TMP12:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64>
+; DEFAULT-NEXT:    [[TMP13:%.*]] = zext <4 x i32> [[STEP_ADD]] to <4 x i64>
+; DEFAULT-NEXT:    [[TMP14:%.*]] = shl <4 x i64> [[TMP10]], [[TMP12]]
+; DEFAULT-NEXT:    [[TMP15]] = shl <4 x i64> [[TMP11]], [[TMP13]]
+; DEFAULT-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP14]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; DEFAULT-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP14]], <4 x i64> [[TMP15]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
+; DEFAULT-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; DEFAULT-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP15]], i32 3
+; DEFAULT-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP15]], i32 2
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY]] ]
+; DEFAULT-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; DEFAULT-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; DEFAULT-NEXT:    br label [[LOOP:%.*]]
+; DEFAULT:       loop:
+; DEFAULT-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[SHL:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[L:%.*]] = load i32, ptr [[PTR_IV]], align 4
+; DEFAULT-NEXT:    [[NOT:%.*]] = xor i32 [[L]], 1
+; DEFAULT-NEXT:    [[NOT_EXT:%.*]] = zext i32 [[NOT]] to i64
+; DEFAULT-NEXT:    [[IV_EXT:%.*]] = zext i32 [[IV_1]] to i64
+; DEFAULT-NEXT:    [[SHL]] = shl i64 [[NOT_EXT]], [[IV_EXT]]
+; DEFAULT-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 4
+; DEFAULT-NEXT:    [[IV_1_NEXT]] = add i32 [[IV_1]], 1
+; DEFAULT-NEXT:    [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; DEFAULT-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_2_NEXT]], [[N]]
+; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    [[P_LCSSA:%.*]] = phi i64 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT:    ret i64 [[P_LCSSA]]
+;
+; PRED-LABEL: define i64 @test_ptr_ivs_and_widened_ivs(
+; PRED-SAME: ptr [[SRC:%.*]], i32 [[N:%.*]]) {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; PRED-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; PRED-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; PRED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; PRED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED:       vector.ph:
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; PRED-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
+; PRED-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; PRED-NEXT:    [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
+; PRED-NEXT:    [[IND_END3:%.*]] = trunc i64 [[N_VEC]] to i32
+; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PRED:       vector.body:
+; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; PRED-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; PRED-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
+; PRED-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 16
+; PRED-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP4]]
+; PRED-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]]
+; PRED-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0
+; PRED-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4
+; PRED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; PRED-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; PRED-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; PRED-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[WIDE_LOAD7]], <i32 1, i32 1, i32 1, i32 1>
+; PRED-NEXT:    [[TMP10:%.*]] = zext <4 x i32> [[TMP8]] to <4 x i64>
+; PRED-NEXT:    [[TMP11:%.*]] = zext <4 x i32> [[TMP9]] to <4 x i64>
+; PRED-NEXT:    [[TMP12:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64>
+; PRED-NEXT:    [[TMP13:%.*]] = zext <4 x i32> [[STEP_ADD]] to <4 x i64>
+; PRED-NEXT:    [[TMP14:%.*]] = shl <4 x i64> [[TMP10]], [[TMP12]]
+; PRED-NEXT:    [[TMP15]] = shl <4 x i64> [[TMP11]], [[TMP13]]
+; PRED-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP14]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; PRED-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP14]], <4 x i64> [[TMP15]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; PRED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; PRED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
+; PRED-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; PRED-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PRED:       middle.block:
+; PRED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; PRED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP15]], i32 3
+; PRED-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP15]], i32 2
+; PRED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED:       scalar.ph:
+; PRED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; PRED-NEXT:    br label [[LOOP:%.*]]
+; PRED:       loop:
+; PRED-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[SHL:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[L:%.*]] = load i32, ptr [[PTR_IV]], align 4
+; PRED-NEXT:    [[NOT:%.*]] = xor i32 [[L]], 1
+; PRED-NEXT:    [[NOT_EXT:%.*]] = zext i32 [[NOT]] to i64
+; PRED-NEXT:    [[IV_EXT:%.*]] = zext i32 [[IV_1]] to i64
+; PRED-NEXT:    [[SHL]] = shl i64 [[NOT_EXT]], [[IV_EXT]]
+; PRED-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 4
+; PRED-NEXT:    [[IV_1_NEXT]] = add i32 [[IV_1]], 1
+; PRED-NEXT:    [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; PRED-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_2_NEXT]], [[N]]
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; PRED:       exit:
+; PRED-NEXT:    [[P_LCSSA:%.*]] = phi i64 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT:    ret i64 [[P_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %p = phi i64 [ 0, %entry ], [ %shl, %loop ]
+  %ptr.iv = phi ptr [ %src, %entry ], [ %ptr.iv.next, %loop ]
+  %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop ]
+  %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ]
+  %l = load i32, ptr %ptr.iv, align 4
+  %not = xor i32 %l, 1
+  %not.ext = zext i32 %not to i64
+  %iv.ext = zext i32 %iv.1 to i64
+  %shl = shl i64 %not.ext , %iv.ext
+  %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 4
+  %iv.1.next = add i32 %iv.1, 1
+  %iv.2.next = add i32 %iv.2, 1
+  %ec = icmp eq i32 %iv.2.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i64 %p
+}
+
+define void @zext_iv_increment(ptr %dst, i64 %N) {
+; DEFAULT-LABEL: define void @zext_iv_increment(
+; DEFAULT-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 2
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; DEFAULT:       vector.scevcheck:
+; DEFAULT-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[UMAX]], -1
+; DEFAULT-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; DEFAULT-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
+; DEFAULT-NEXT:    [[TMP3:%.*]] = add i32 1, [[TMP2]]
+; DEFAULT-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1
+; DEFAULT-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; DEFAULT-NEXT:    [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
+; DEFAULT-NEXT:    br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 2
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 1
+; DEFAULT-NEXT:    [[TMP9:%.*]] = getelementptr { i32, i32, i32 }, ptr [[DST]], i64 [[TMP7]], i32 2
+; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr { i32, i32, i32 }, ptr [[DST]], i64 [[TMP8]], i32 2
+; DEFAULT-NEXT:    store i32 0, ptr [[TMP9]], align 8
+; DEFAULT-NEXT:    store i32 0, ptr [[TMP10]], align 8
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; DEFAULT-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP173_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT:    br label [[FOR_BODY174:%.*]]
+; DEFAULT:       loop:
+; DEFAULT-NEXT:    [[I167_0800:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC179:%.*]], [[FOR_BODY174]] ]
+; DEFAULT-NEXT:    [[CONV169801:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[CONV169:%.*]], [[FOR_BODY174]] ]
+; DEFAULT-NEXT:    [[PATCH_INDEX:%.*]] = getelementptr { i32, i32, i32 }, ptr [[DST]], i64 [[CONV169801]], i32 2
+; DEFAULT-NEXT:    store i32 0, ptr [[PATCH_INDEX]], align 8
+; DEFAULT-NEXT:    [[INC179]] = add i32 [[I167_0800]], 1
+; DEFAULT-NEXT:    [[CONV169]] = zext i32 [[INC179]] to i64
+; DEFAULT-NEXT:    [[CMP172:%.*]] = icmp ult i64 [[CONV169]], [[N]]
+; DEFAULT-NEXT:    br i1 [[CMP172]], label [[FOR_BODY174]], label [[FOR_COND_CLEANUP173_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    ret void
+;
+; PRED-LABEL: define void @zext_iv_increment(
+; PRED-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    br label [[FOR_BODY174:%.*]]
+; PRED:       loop:
+; PRED-NEXT:    [[I167_0800:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC179:%.*]], [[FOR_BODY174]] ]
+; PRED-NEXT:    [[CONV169801:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[CONV169:%.*]], [[FOR_BODY174]] ]
+; PRED-NEXT:    [[PATCH_INDEX:%.*]] = getelementptr { i32, i32, i32 }, ptr [[DST]], i64 [[CONV169801]], i32 2
+; PRED-NEXT:    store i32 0, ptr [[PATCH_INDEX]], align 8
+; PRED-NEXT:    [[INC179]] = add i32 [[I167_0800]], 1
+; PRED-NEXT:    [[CONV169]] = zext i32 [[INC179]] to i64
+; PRED-NEXT:    [[CMP172:%.*]] = icmp ult i64 [[CONV169]], [[N]]
+; PRED-NEXT:    br i1 [[CMP172]], label [[FOR_BODY174]], label [[FOR_COND_CLEANUP173_LOOPEXIT:%.*]]
+; PRED:       exit:
+; PRED-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.wide = phi i64 [ 0, %entry ], [ %iv.next.ext, %loop ]
+  %patch_index = getelementptr { i32, i32, i32 }, ptr %dst, i64 %iv.wide, i32 2
+  store i32 0, ptr %patch_index, align 8
+  %iv.next = add i32 %iv, 1
+  %iv.next.ext = zext i32 %iv.next to i64
+  %ec = icmp ult i64 %iv.next.ext, %N
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
new file mode 100644
index 000000000000..c24c1a38177d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -0,0 +1,482 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p loop-vectorize -S %s | FileCheck --check-prefixes=DEFAULT %s
+; RUN: opt -p loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s | FileCheck --check-prefixes=PRED %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-macosx14.0.0"
+
+define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2) #0 {
+; DEFAULT-LABEL: define i32 @chained_recurrences(
+; DEFAULT-SAME: i32 [[X:%.*]], i64 [[Y:%.*]], ptr [[SRC_1:%.*]], i32 [[Z:%.*]], ptr [[SRC_2:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[Y]], 1
+; DEFAULT-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; DEFAULT-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; DEFAULT-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; DEFAULT-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], 1
+; DEFAULT-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP9]]
+; DEFAULT-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vscale.i32()
+; DEFAULT-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP10]], 4
+; DEFAULT-NEXT:    [[TMP12:%.*]] = sub i32 [[TMP11]], 1
+; DEFAULT-NEXT:    [[VECTOR_RECUR_INIT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP12]]
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT6]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT8]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT5:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT1]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP13:%.*]] = add i64 [[Y]], 1
+; DEFAULT-NEXT:    [[TMP14:%.*]] = add i64 [[Y]], 1
+; DEFAULT-NEXT:    [[TMP15:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP13]]
+; DEFAULT-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP14]]
+; DEFAULT-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 4
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP17]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP18]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT5]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[TMP19:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[BROADCAST_SPLAT]], i32 -1)
+; DEFAULT-NEXT:    [[TMP20]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32> [[BROADCAST_SPLAT5]], i32 -1)
+; DEFAULT-NEXT:    [[TMP21:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP19]], i32 -1)
+; DEFAULT-NEXT:    [[TMP22:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x i32> [[TMP20]], i32 -1)
+; DEFAULT-NEXT:    [[TMP23:%.*]] = or <vscale x 4 x i32> [[TMP21]], [[BROADCAST_SPLAT7]]
+; DEFAULT-NEXT:    [[TMP24:%.*]] = or <vscale x 4 x i32> [[TMP22]], [[BROADCAST_SPLAT7]]
+; DEFAULT-NEXT:    [[TMP25:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP26:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP27:%.*]] = shl <vscale x 4 x i32> [[TMP23]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP28:%.*]] = shl <vscale x 4 x i32> [[TMP24]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP29:%.*]] = or <vscale x 4 x i32> [[TMP27]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP30:%.*]] = or <vscale x 4 x i32> [[TMP28]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP31:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP32:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP33:%.*]] = or <vscale x 4 x i32> [[TMP25]], [[TMP31]]
+; DEFAULT-NEXT:    [[TMP34:%.*]] = or <vscale x 4 x i32> [[TMP26]], [[TMP32]]
+; DEFAULT-NEXT:    [[TMP35:%.*]] = or <vscale x 4 x i32> [[TMP33]], [[TMP29]]
+; DEFAULT-NEXT:    [[TMP36:%.*]] = or <vscale x 4 x i32> [[TMP34]], [[TMP30]]
+; DEFAULT-NEXT:    [[TMP37:%.*]] = or <vscale x 4 x i32> [[TMP35]], [[BROADCAST_SPLAT7]]
+; DEFAULT-NEXT:    [[TMP38:%.*]] = or <vscale x 4 x i32> [[TMP36]], [[BROADCAST_SPLAT7]]
+; DEFAULT-NEXT:    [[TMP39:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT9]], [[BROADCAST_SPLAT7]]
+; DEFAULT-NEXT:    [[TMP40:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT9]], [[BROADCAST_SPLAT7]]
+; DEFAULT-NEXT:    [[TMP41:%.*]] = and <vscale x 4 x i32> [[TMP39]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP42:%.*]] = and <vscale x 4 x i32> [[TMP40]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP43:%.*]] = xor <vscale x 4 x i32> [[TMP41]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP44:%.*]] = xor <vscale x 4 x i32> [[TMP42]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP45:%.*]] = zext <vscale x 4 x i32> [[TMP43]] to <vscale x 4 x i64>
+; DEFAULT-NEXT:    [[TMP46:%.*]] = zext <vscale x 4 x i32> [[TMP44]] to <vscale x 4 x i64>
+; DEFAULT-NEXT:    [[TMP47:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP45]]
+; DEFAULT-NEXT:    [[TMP48:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP46]]
+; DEFAULT-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP47]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; DEFAULT-NEXT:    [[WIDE_MASKED_GATHER10:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP48]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; DEFAULT-NEXT:    [[TMP49:%.*]] = lshr <vscale x 4 x i32> [[TMP37]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP50:%.*]] = lshr <vscale x 4 x i32> [[TMP38]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT:    [[TMP51:%.*]] = zext <vscale x 4 x i32> [[TMP49]] to <vscale x 4 x i64>
+; DEFAULT-NEXT:    [[TMP52:%.*]] = zext <vscale x 4 x i32> [[TMP50]] to <vscale x 4 x i64>
+; DEFAULT-NEXT:    [[TMP53:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP51]]
+; DEFAULT-NEXT:    [[TMP54:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP52]]
+; DEFAULT-NEXT:    [[WIDE_MASKED_GATHER11:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP53]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; DEFAULT-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP54]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; DEFAULT-NEXT:    [[TMP55:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[VEC_PHI]]
+; DEFAULT-NEXT:    [[TMP56:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER10]], [[VEC_PHI3]]
+; DEFAULT-NEXT:    [[TMP57]] = or <vscale x 4 x i32> [[TMP55]], [[WIDE_MASKED_GATHER11]]
+; DEFAULT-NEXT:    [[TMP58]] = or <vscale x 4 x i32> [[TMP56]], [[WIDE_MASKED_GATHER12]]
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; DEFAULT-NEXT:    [[TMP59:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP59]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[BIN_RDX:%.*]] = or <vscale x 4 x i32> [[TMP58]], [[TMP57]]
+; DEFAULT-NEXT:    [[TMP60:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT:    [[TMP61:%.*]] = call i32 @llvm.vscale.i32()
+; DEFAULT-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], 4
+; DEFAULT-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP62]], 1
+; DEFAULT-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT5]], i32 [[TMP63]]
+; DEFAULT-NEXT:    [[TMP64:%.*]] = call i32 @llvm.vscale.i32()
+; DEFAULT-NEXT:    [[TMP65:%.*]] = mul i32 [[TMP64]], 4
+; DEFAULT-NEXT:    [[TMP66:%.*]] = sub i32 [[TMP65]], 1
+; DEFAULT-NEXT:    [[VECTOR_RECUR_EXTRACT13:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP66]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[SCALAR_RECUR_INIT14:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT13]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; DEFAULT-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP60]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT:    br label [[LOOP:%.*]]
+; DEFAULT:       loop:
+; DEFAULT-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP68:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[SCALAR_RECUR15:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT14]], [[SCALAR_PH]] ], [ [[SCALAR_RECUR]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[SUM_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_2:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[TMP67:%.*]] = add i64 [[Y]], 1
+; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP67]]
+; DEFAULT-NEXT:    [[TMP68]] = load i32, ptr [[GEP_1]], align 4
+; DEFAULT-NEXT:    [[OR3:%.*]] = or i32 [[SCALAR_RECUR15]], [[X]]
+; DEFAULT-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT:    [[SHR:%.*]] = lshr i32 [[X]], 1
+; DEFAULT-NEXT:    [[TMP69:%.*]] = shl i32 [[OR3]], 1
+; DEFAULT-NEXT:    [[TMP70:%.*]] = or i32 [[TMP69]], 2
+; DEFAULT-NEXT:    [[SHL19:%.*]] = shl i32 [[X]], 1
+; DEFAULT-NEXT:    [[TMP71:%.*]] = or i32 [[SHR]], [[SHL19]]
+; DEFAULT-NEXT:    [[TMP72:%.*]] = or i32 [[TMP71]], [[TMP70]]
+; DEFAULT-NEXT:    [[TMP73:%.*]] = or i32 [[TMP72]], [[X]]
+; DEFAULT-NEXT:    [[OR20:%.*]] = or i32 [[Z]], [[X]]
+; DEFAULT-NEXT:    [[NOT:%.*]] = and i32 [[OR20]], 1
+; DEFAULT-NEXT:    [[AND:%.*]] = xor i32 [[NOT]], 1
+; DEFAULT-NEXT:    [[IDX_EXT_1:%.*]] = zext i32 [[AND]] to i64
+; DEFAULT-NEXT:    [[GEP_2:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_1]]
+; DEFAULT-NEXT:    [[TMP74:%.*]] = load i32, ptr [[GEP_2]], align 4
+; DEFAULT-NEXT:    [[SHR24:%.*]] = lshr i32 [[TMP73]], 1
+; DEFAULT-NEXT:    [[IDX_EXT_2:%.*]] = zext i32 [[SHR24]] to i64
+; DEFAULT-NEXT:    [[GEP_3:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_2]]
+; DEFAULT-NEXT:    [[TMP75:%.*]] = load i32, ptr [[GEP_3]], align 4
+; DEFAULT-NEXT:    [[RED_1:%.*]] = or i32 [[TMP74]], [[SUM_RED]]
+; DEFAULT-NEXT:    [[RED_2]] = or i32 [[RED_1]], [[TMP75]]
+; DEFAULT-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[Y]]
+; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    [[RED_2_LCSSA:%.*]] = phi i32 [ [[RED_2]], [[LOOP]] ], [ [[TMP60]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT:    ret i32 [[RED_2_LCSSA]]
+;
+; PRED-LABEL: define i32 @chained_recurrences(
+; PRED-SAME: i32 [[X:%.*]], i64 [[Y:%.*]], ptr [[SRC_1:%.*]], i32 [[Z:%.*]], ptr [[SRC_2:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[Y]], 1
+; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED:       vector.ph:
+; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; PRED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; PRED-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP5]]
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; PRED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; PRED-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP0]], [[TMP9]]
+; PRED-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], [[TMP9]]
+; PRED-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; PRED-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], 4
+; PRED-NEXT:    [[TMP15:%.*]] = sub i32 [[TMP14]], 1
+; PRED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP15]]
+; PRED-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; PRED-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; PRED-NEXT:    [[TMP18:%.*]] = sub i32 [[TMP17]], 1
+; PRED-NEXT:    [[VECTOR_RECUR_INIT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP18]]
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PRED:       vector.body:
+; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT1]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[TMP19:%.*]] = add i64 [[Y]], 1
+; PRED-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP19]]
+; PRED-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP21]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT:    [[TMP22]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[BROADCAST_SPLAT]], i32 -1)
+; PRED-NEXT:    [[TMP23:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP22]], i32 -1)
+; PRED-NEXT:    [[TMP24:%.*]] = or <vscale x 4 x i32> [[TMP23]], [[BROADCAST_SPLAT4]]
+; PRED-NEXT:    [[TMP25:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT4]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP26:%.*]] = shl <vscale x 4 x i32> [[TMP24]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP27:%.*]] = or <vscale x 4 x i32> [[TMP26]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP28:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT4]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP29:%.*]] = or <vscale x 4 x i32> [[TMP25]], [[TMP28]]
+; PRED-NEXT:    [[TMP30:%.*]] = or <vscale x 4 x i32> [[TMP29]], [[TMP27]]
+; PRED-NEXT:    [[TMP31:%.*]] = or <vscale x 4 x i32> [[TMP30]], [[BROADCAST_SPLAT4]]
+; PRED-NEXT:    [[TMP32:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT6]], [[BROADCAST_SPLAT4]]
+; PRED-NEXT:    [[TMP33:%.*]] = and <vscale x 4 x i32> [[TMP32]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP34:%.*]] = xor <vscale x 4 x i32> [[TMP33]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP35:%.*]] = zext <vscale x 4 x i32> [[TMP34]] to <vscale x 4 x i64>
+; PRED-NEXT:    [[TMP36:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP35]]
+; PRED-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP36]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; PRED-NEXT:    [[TMP37:%.*]] = lshr <vscale x 4 x i32> [[TMP31]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP38:%.*]] = zext <vscale x 4 x i32> [[TMP37]] to <vscale x 4 x i64>
+; PRED-NEXT:    [[TMP39:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP38]]
+; PRED-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP39]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; PRED-NEXT:    [[TMP40:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[VEC_PHI]]
+; PRED-NEXT:    [[TMP41:%.*]] = or <vscale x 4 x i32> [[TMP40]], [[WIDE_MASKED_GATHER7]]
+; PRED-NEXT:    [[TMP42]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP41]], <vscale x 4 x i32> [[VEC_PHI]]
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
+; PRED-NEXT:    [[TMP43:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP44:%.*]] = extractelement <vscale x 4 x i1> [[TMP43]], i32 0
+; PRED-NEXT:    br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PRED:       middle.block:
+; PRED-NEXT:    [[TMP45:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[TMP42]])
+; PRED-NEXT:    [[TMP46:%.*]] = call i32 @llvm.vscale.i32()
+; PRED-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], 4
+; PRED-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP47]], 1
+; PRED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT]], i32 [[TMP48]]
+; PRED-NEXT:    [[TMP49:%.*]] = call i32 @llvm.vscale.i32()
+; PRED-NEXT:    [[TMP50:%.*]] = mul i32 [[TMP49]], 4
+; PRED-NEXT:    [[TMP51:%.*]] = sub i32 [[TMP50]], 1
+; PRED-NEXT:    [[VECTOR_RECUR_EXTRACT8:%.*]] = extractelement <vscale x 4 x i32> [[TMP22]], i32 [[TMP51]]
+; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED:       scalar.ph:
+; PRED-NEXT:    [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT8]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; PRED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT:    br label [[LOOP:%.*]]
+; PRED:       loop:
+; PRED-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP53:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[SCALAR_RECUR10:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], [[SCALAR_PH]] ], [ [[SCALAR_RECUR]], [[LOOP]] ]
+; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[SUM_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_2:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[TMP52:%.*]] = add i64 [[Y]], 1
+; PRED-NEXT:    [[GEP_1:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP52]]
+; PRED-NEXT:    [[TMP53]] = load i32, ptr [[GEP_1]], align 4
+; PRED-NEXT:    [[OR3:%.*]] = or i32 [[SCALAR_RECUR10]], [[X]]
+; PRED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT:    [[SHR:%.*]] = lshr i32 [[X]], 1
+; PRED-NEXT:    [[TMP54:%.*]] = shl i32 [[OR3]], 1
+; PRED-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 2
+; PRED-NEXT:    [[SHL19:%.*]] = shl i32 [[X]], 1
+; PRED-NEXT:    [[TMP56:%.*]] = or i32 [[SHR]], [[SHL19]]
+; PRED-NEXT:    [[TMP57:%.*]] = or i32 [[TMP56]], [[TMP55]]
+; PRED-NEXT:    [[TMP58:%.*]] = or i32 [[TMP57]], [[X]]
+; PRED-NEXT:    [[OR20:%.*]] = or i32 [[Z]], [[X]]
+; PRED-NEXT:    [[NOT:%.*]] = and i32 [[OR20]], 1
+; PRED-NEXT:    [[AND:%.*]] = xor i32 [[NOT]], 1
+; PRED-NEXT:    [[IDX_EXT_1:%.*]] = zext i32 [[AND]] to i64
+; PRED-NEXT:    [[GEP_2:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_1]]
+; PRED-NEXT:    [[TMP59:%.*]] = load i32, ptr [[GEP_2]], align 4
+; PRED-NEXT:    [[SHR24:%.*]] = lshr i32 [[TMP58]], 1
+; PRED-NEXT:    [[IDX_EXT_2:%.*]] = zext i32 [[SHR24]] to i64
+; PRED-NEXT:    [[GEP_3:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_2]]
+; PRED-NEXT:    [[TMP60:%.*]] = load i32, ptr [[GEP_3]], align 4
+; PRED-NEXT:    [[RED_1:%.*]] = or i32 [[TMP59]], [[SUM_RED]]
+; PRED-NEXT:    [[RED_2]] = or i32 [[RED_1]], [[TMP60]]
+; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[Y]]
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; PRED:       exit:
+; PRED-NEXT:    [[RED_2_LCSSA:%.*]] = phi i32 [ [[RED_2]], [[LOOP]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT:    ret i32 [[RED_2_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %2 = phi i32 [ 0, %entry ], [ %5, %loop ]
+  %3 = phi i32 [ 0, %entry ], [ %2, %loop ]
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %sum.red = phi i32 [ 0, %entry ], [ %red.2, %loop ]
+  %4 = add i64 %y, 1
+  %gep.1 = getelementptr i32, ptr %src.1, i64 %4
+  %5 = load i32, ptr %gep.1, align 4
+  %or3 = or i32 %3, %x
+  %iv.next = add i64 %iv, 1
+  %shr = lshr i32 %x, 1
+  %6 = shl i32 %or3, 1
+  %7 = or i32 %6, 2
+  %shl19 = shl i32 %x, 1
+  %8 = or i32 %shr, %shl19
+  %9 = or i32 %8, %7
+  %10 = or i32 %9, %x
+  %or20 = or i32 %z, %x
+  %not = and i32 %or20, 1
+  %and = xor i32 %not, 1
+  %idx.ext.1 = zext i32 %and to i64
+  %gep.2 = getelementptr i32, ptr %src.2, i64 %idx.ext.1
+  %11 = load i32, ptr %gep.2, align 4
+  %shr24 = lshr i32 %10, 1
+  %idx.ext.2 = zext i32 %shr24 to i64
+  %gep.3 = getelementptr i32, ptr %src.2, i64 %idx.ext.2
+  %12 = load i32, ptr %gep.3, align 4
+  %red.1 = or i32 %11, %sum.red
+  %red.2 = or i32 %red.1, %12
+  %ec = icmp eq i64 %iv, %y
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %red.2
+}
+
+define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
+; DEFAULT-LABEL: define i16 @reduce_udiv(
+; DEFAULT-SAME: ptr [[SRC:%.*]], i16 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[X]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; DEFAULT-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], 0
+; DEFAULT-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 1
+; DEFAULT-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]]
+; DEFAULT-NEXT:    [[TMP13:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP7]]
+; DEFAULT-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP12]]
+; DEFAULT-NEXT:    [[TMP15:%.*]] = getelementptr i16, ptr [[TMP13]], i32 0
+; DEFAULT-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr i16, ptr [[TMP13]], i64 [[TMP17]]
+; DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP15]], align 2
+; DEFAULT-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, ptr [[TMP18]], align 2
+; DEFAULT-NEXT:    [[TMP19:%.*]] = udiv <vscale x 4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; DEFAULT-NEXT:    [[TMP20:%.*]] = udiv <vscale x 4 x i16> [[WIDE_LOAD2]], [[BROADCAST_SPLAT]]
+; DEFAULT-NEXT:    [[TMP21]] = or <vscale x 4 x i16> [[TMP19]], [[VEC_PHI]]
+; DEFAULT-NEXT:    [[TMP22]] = or <vscale x 4 x i16> [[TMP20]], [[VEC_PHI1]]
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; DEFAULT-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[BIN_RDX:%.*]] = or <vscale x 4 x i16> [[TMP22]], [[TMP21]]
+; DEFAULT-NEXT:    [[TMP24:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> [[BIN_RDX]])
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT:    br label [[LOOP:%.*]]
+; DEFAULT:       loop:
+; DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[RED:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
+; DEFAULT-NEXT:    [[L:%.*]] = load i16, ptr [[GEP]], align 2
+; DEFAULT-NEXT:    [[DIV:%.*]] = udiv i16 [[L]], [[X]]
+; DEFAULT-NEXT:    [[RED_NEXT]] = or i16 [[DIV]], [[RED]]
+; DEFAULT-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT:    ret i16 [[RED_NEXT_LCSSA]]
+;
+; PRED-LABEL: define i16 @reduce_udiv(
+; PRED-SAME: ptr [[SRC:%.*]], i16 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
+; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED:       vector.ph:
+; PRED-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; PRED-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; PRED-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP5]]
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; PRED-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; PRED-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP0]], [[TMP9]]
+; PRED-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], [[TMP9]]
+; PRED-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[X]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PRED:       vector.body:
+; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT:    [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP13]]
+; PRED-NEXT:    [[TMP15:%.*]] = getelementptr i16, ptr [[TMP14]], i32 0
+; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP15]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> poison)
+; PRED-NEXT:    [[TMP16:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[BROADCAST_SPLAT]], <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP17:%.*]] = udiv <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[TMP16]]
+; PRED-NEXT:    [[TMP18:%.*]] = or <vscale x 8 x i16> [[TMP17]], [[VEC_PHI]]
+; PRED-NEXT:    [[TMP19]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[TMP18]], <vscale x 8 x i16> [[VEC_PHI]]
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP12]])
+; PRED-NEXT:    [[TMP20:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 8 x i1> [[TMP20]], i32 0
+; PRED-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PRED:       middle.block:
+; PRED-NEXT:    [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16(<vscale x 8 x i16> [[TMP19]])
+; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED:       scalar.ph:
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; PRED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT:    br label [[LOOP:%.*]]
+; PRED:       loop:
+; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[RED:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
+; PRED-NEXT:    [[L:%.*]] = load i16, ptr [[GEP]], align 2
+; PRED-NEXT:    [[DIV:%.*]] = udiv i16 [[L]], [[X]]
+; PRED-NEXT:    [[RED_NEXT]] = or i16 [[DIV]], [[RED]]
+; PRED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; PRED:       exit:
+; PRED-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT:    ret i16 [[RED_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi i16 [ 0, %entry ], [ %red.next, %loop ]
+  %gep = getelementptr i16, ptr %src, i64 %iv
+  %l = load i16, ptr %gep, align 2
+  %div = udiv i16 %l, %x
+  %red.next = or i16 %div, %red
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i16 %red.next
+}
+
+attributes #0 = { "target-features"="+sve" }
+;.
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
+; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; PRED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; PRED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; PRED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index ad6e8534f318..ddc004657ed5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -588,7 +588,7 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
 ; CHECK-UNORDERED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
 ; CHECK-UNORDERED-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x float>, ptr [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
+; CHECK-UNORDERED-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
 ; CHECK-UNORDERED-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
 ; CHECK-UNORDERED-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
 ; CHECK-UNORDERED-NEXT:    [[TMP14]] = fadd <vscale x 4 x float> [[TMP12]], [[VEC_PHI1]]
@@ -658,7 +658,7 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
 ; CHECK-ORDERED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
 ; CHECK-ORDERED-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x float>, ptr [[TMP9]], align 4
-; CHECK-ORDERED-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
+; CHECK-ORDERED-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
 ; CHECK-ORDERED-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
 ; CHECK-ORDERED-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
 ; CHECK-ORDERED-NEXT:    [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP11]])
@@ -733,9 +733,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-ORDERED-TF-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]]
 ; CHECK-ORDERED-TF-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; CHECK-ORDERED-TF-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-ORDERED-TF-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP15]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_MASKED_VEC]])
+; CHECK-ORDERED-TF-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_MASKED_VEC]])
 ; CHECK-ORDERED-TF-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
 ; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP17]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
new file mode 100644
index 000000000000..1e13d70083ff
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
@@ -0,0 +1,341 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p loop-vectorize -S %s | FileCheck --check-prefixes=DEFAULT %s
+; RUN: opt -p loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s | FileCheck --check-prefixes=PRED %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-macosx14.0.0"
+
+define void @cost_store_i8(ptr %dst) #0 {
+; DEFAULT-LABEL: define void @cost_store_i8(
+; DEFAULT-SAME: ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT:  iter.check:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 101, [[TMP1]]
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; DEFAULT:       vector.main.loop.iter.check:
+; DEFAULT-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 101, [[TMP3]]
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 101, [[TMP5]]
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 101, [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
+; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; DEFAULT-NEXT:    store <vscale x 16 x i8> zeroinitializer, ptr [[TMP10]], align 1
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; DEFAULT-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 101, [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; DEFAULT:       vec.epilog.iter.check:
+; DEFAULT-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 101, [[N_VEC]]
+; DEFAULT-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; DEFAULT-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP13]]
+; DEFAULT-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; DEFAULT:       vec.epilog.ph:
+; DEFAULT-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; DEFAULT-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 8
+; DEFAULT-NEXT:    [[N_MOD_VF2:%.*]] = urem i64 101, [[TMP15]]
+; DEFAULT-NEXT:    [[N_VEC3:%.*]] = sub i64 101, [[N_MOD_VF2]]
+; DEFAULT-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 8
+; DEFAULT-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; DEFAULT:       vec.epilog.vector.body:
+; DEFAULT-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX5]], 0
+; DEFAULT-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP18]]
+; DEFAULT-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i32 0
+; DEFAULT-NEXT:    store <vscale x 8 x i8> zeroinitializer, ptr [[TMP20]], align 1
+; DEFAULT-NEXT:    [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], [[TMP17]]
+; DEFAULT-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
+; DEFAULT-NEXT:    br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT:       vec.epilog.middle.block:
+; DEFAULT-NEXT:    [[CMP_N4:%.*]] = icmp eq i64 101, [[N_VEC3]]
+; DEFAULT-NEXT:    br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; DEFAULT:       vec.epilog.scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; DEFAULT-NEXT:    br label [[LOOP:%.*]]
+; DEFAULT:       loop:
+; DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; DEFAULT-NEXT:    store i8 0, ptr [[GEP]], align 1
+; DEFAULT-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 100
+; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    ret void
+;
+; PRED-LABEL: define void @cost_store_i8(
+; PRED-SAME: ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED:       vector.ph:
+; PRED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; PRED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; PRED-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; PRED-NEXT:    [[N_RND_UP:%.*]] = add i64 101, [[TMP4]]
+; PRED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; PRED-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; PRED-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; PRED-NEXT:    [[TMP9:%.*]] = sub i64 101, [[TMP8]]
+; PRED-NEXT:    [[TMP10:%.*]] = icmp ugt i64 101, [[TMP8]]
+; PRED-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 101)
+; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PRED:       vector.body:
+; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; PRED-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; PRED-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> zeroinitializer, ptr [[TMP14]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]])
+; PRED-NEXT:    [[TMP15:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; PRED-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[TMP15]], i32 0
+; PRED-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PRED:       middle.block:
+; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED:       scalar.ph:
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; PRED-NEXT:    br label [[LOOP:%.*]]
+; PRED:       loop:
+; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; PRED-NEXT:    store i8 0, ptr [[GEP]], align 1
+; PRED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 100
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; PRED:       exit:
+; PRED-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 0, ptr %gep, align 1
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
+; DEFAULT-LABEL: define void @trunc_store(
+; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] {
+; DEFAULT-NEXT:  iter.check:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; DEFAULT:       vector.memcheck:
+; DEFAULT-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 8
+; DEFAULT-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]]
+; DEFAULT-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[DST]]
+; DEFAULT-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; DEFAULT-NEXT:    br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; DEFAULT:       vector.main.loop.iter.check:
+; DEFAULT-NEXT:    br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT3]], <16 x i16> poison, <16 x i32> zeroinitializer
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 16
+; DEFAULT-NEXT:    [[TMP4:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META5:![0-9]+]]
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TMP4]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[TMP5:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT2]] to <16 x i8>
+; DEFAULT-NEXT:    [[TMP6:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT2]] to <16 x i8>
+; DEFAULT-NEXT:    [[TMP7:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT4]] to <16 x i8>
+; DEFAULT-NEXT:    [[TMP8:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]]
+; DEFAULT-NEXT:    [[TMP9:%.*]] = and <16 x i8> [[TMP6]], [[TMP7]]
+; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
+; DEFAULT-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
+; DEFAULT-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; DEFAULT-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16
+; DEFAULT-NEXT:    store <16 x i8> [[TMP8]], ptr [[TMP12]], align 1, !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
+; DEFAULT-NEXT:    store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1, !alias.scope [[META8]], !noalias [[META5]]
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; DEFAULT-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; DEFAULT-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; DEFAULT:       vec.epilog.iter.check:
+; DEFAULT-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 2
+; DEFAULT-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP16]]
+; DEFAULT-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; DEFAULT:       vec.epilog.ph:
+; DEFAULT-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; DEFAULT-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 2
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 0, [[TMP18]]
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 2
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[X]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT6]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+; DEFAULT-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; DEFAULT:       vec.epilog.vector.body:
+; DEFAULT-NEXT:    [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX5]], 0
+; DEFAULT-NEXT:    [[TMP22:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META11:![0-9]+]]
+; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP22]], i64 0
+; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[TMP23:%.*]] = trunc <vscale x 2 x i64> [[BROADCAST_SPLAT]] to <vscale x 2 x i8>
+; DEFAULT-NEXT:    [[TMP24:%.*]] = trunc <vscale x 2 x i16> [[BROADCAST_SPLAT7]] to <vscale x 2 x i8>
+; DEFAULT-NEXT:    [[TMP25:%.*]] = and <vscale x 2 x i8> [[TMP23]], [[TMP24]]
+; DEFAULT-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP21]]
+; DEFAULT-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
+; DEFAULT-NEXT:    store <vscale x 2 x i8> [[TMP25]], ptr [[TMP27]], align 1, !alias.scope [[META14:![0-9]+]], !noalias [[META11]]
+; DEFAULT-NEXT:    [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], [[TMP20]]
+; DEFAULT-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; DEFAULT:       vec.epilog.middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; DEFAULT:       vec.epilog.scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; DEFAULT-NEXT:    br label [[LOOP:%.*]]
+; DEFAULT:       loop:
+; DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT:    [[X_EXT:%.*]] = zext i16 [[X]] to i64
+; DEFAULT-NEXT:    [[L:%.*]] = load i64, ptr [[SRC]], align 8
+; DEFAULT-NEXT:    [[AND:%.*]] = and i64 [[L]], [[X_EXT]]
+; DEFAULT-NEXT:    [[TRUNC:%.*]] = trunc i64 [[AND]] to i8
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; DEFAULT-NEXT:    store i8 [[TRUNC]], ptr [[GEP]], align 1
+; DEFAULT-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; DEFAULT-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; DEFAULT:       exit:
+; DEFAULT-NEXT:    ret void
+;
+; PRED-LABEL: define void @trunc_store(
+; PRED-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] {
+; PRED-NEXT:  entry:
+; PRED-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; PRED:       vector.memcheck:
+; PRED-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 8
+; PRED-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]]
+; PRED-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[DST]]
+; PRED-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; PRED-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED:       vector.ph:
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT1]], <16 x i16> poison, <16 x i32> zeroinitializer
+; PRED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PRED:       vector.body:
+; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT:    [[TMP1:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META4:![0-9]+]]
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TMP1]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; PRED-NEXT:    [[TMP2:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT]] to <16 x i8>
+; PRED-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT2]] to <16 x i8>
+; PRED-NEXT:    [[TMP4:%.*]] = and <16 x i8> [[TMP2]], [[TMP3]]
+; PRED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; PRED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
+; PRED-NEXT:    store <16 x i8> [[TMP4]], ptr [[TMP6]], align 1, !alias.scope [[META7:![0-9]+]], !noalias [[META4]]
+; PRED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; PRED-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; PRED-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; PRED:       middle.block:
+; PRED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED:       scalar.ph:
+; PRED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; PRED-NEXT:    br label [[LOOP:%.*]]
+; PRED:       loop:
+; PRED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT:    [[X_EXT:%.*]] = zext i16 [[X]] to i64
+; PRED-NEXT:    [[L:%.*]] = load i64, ptr [[SRC]], align 8
+; PRED-NEXT:    [[AND:%.*]] = and i64 [[L]], [[X_EXT]]
+; PRED-NEXT:    [[TRUNC:%.*]] = trunc i64 [[AND]] to i8
+; PRED-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; PRED-NEXT:    store i8 [[TRUNC]], ptr [[GEP]], align 1
+; PRED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; PRED-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; PRED:       exit:
+; PRED-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %x.ext = zext i16 %x to i64
+  %l = load i64, ptr %src, align 8
+  %and = and i64 %l, %x.ext
+  %trunc = trunc i64 %and to i8
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 %trunc, ptr %gep, align 1
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 0
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
+attributes #1 = { vscale_range(1,16) "target-features"="+sve" }
+
+
+;.
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
+; DEFAULT: [[META5]] = !{[[META6:![0-9]+]]}
+; DEFAULT: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
+; DEFAULT: [[META7]] = distinct !{[[META7]], !"LVerDomain"}
+; DEFAULT: [[META8]] = !{[[META9:![0-9]+]]}
+; DEFAULT: [[META9]] = distinct !{[[META9]], [[META7]]}
+; DEFAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; DEFAULT: [[META11]] = !{[[META12:![0-9]+]]}
+; DEFAULT: [[META12]] = distinct !{[[META12]], [[META13:![0-9]+]]}
+; DEFAULT: [[META13]] = distinct !{[[META13]], !"LVerDomain"}
+; DEFAULT: [[META14]] = !{[[META15:![0-9]+]]}
+; DEFAULT: [[META15]] = distinct !{[[META15]], [[META13]]}
+; DEFAULT: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]}
+;.
+; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; PRED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; PRED: [[META4]] = !{[[META5:![0-9]+]]}
+; PRED: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]}
+; PRED: [[META6]] = distinct !{[[META6]], !"LVerDomain"}
+; PRED: [[META7]] = !{[[META8:![0-9]+]]}
+; PRED: [[META8]] = distinct !{[[META8]], [[META6]]}
+; PRED: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
+; PRED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index c07b3c8d4922..1853e551806b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -38,7 +38,7 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
@@ -46,7 +46,7 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 -4
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
 ; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
@@ -134,7 +134,7 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 4 x i64> [[TMP7]], i64 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
 ; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
@@ -216,7 +216,7 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 {
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
@@ -401,11 +401,11 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
+; CHECK-NEXT:    [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
@@ -414,9 +414,9 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 1, [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
-; CHECK-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
-; CHECK-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE2]], <vscale x 4 x i32> [[REVERSE3]])
+; CHECK-NEXT:    [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; CHECK-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE2]], <vscale x 4 x i32> [[REVERSE3]])
 ; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP19]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
@@ -483,7 +483,7 @@ define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP4]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = shl nsw <vscale x 4 x i32> [[TMP5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP7:%.*]] = and i64 [[INDEX]], 9223372036854775804
@@ -569,7 +569,7 @@ define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP12]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = shl nsw <vscale x 4 x i32> [[TMP13]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[TMP15:%.*]] = and i64 [[INDEX]], 9223372036854775804
@@ -717,18 +717,18 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[TMP3]]
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 -4
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP9]])
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP9]])
 ; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
@@ -811,7 +811,7 @@ define void @int_float_struct(ptr nocapture readonly %p) #0 {
 ; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 undef, i32 0), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <vscale x 4 x i32> [[TMP4]] to <vscale x 4 x float>
@@ -910,7 +910,7 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x ptr> [[TMP12]], i64 0
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -994,12 +994,12 @@ define i32 @PR27626_1(ptr %p, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[VEC_IND]], i32 1
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP12]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0
 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP15]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
 ; CHECK-NEXT:    [[TMP17]] = add <vscale x 4 x i32> [[TMP16]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -1092,7 +1092,7 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 {
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[VEC_IND]], i32 1
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x ptr> [[TMP14]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -1181,11 +1181,11 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) #1 {
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[TMP12]], i32 1
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> [[TMP15]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
 ; CHECK-NEXT:    [[TMP18]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -1291,7 +1291,7 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
 ; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
@@ -1497,7 +1497,7 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP21:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[A]], <vscale x 4 x i64> [[TMP19]]
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER4]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison), !alias.scope [[META34]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]], i32 -1)
+; CHECK-NEXT:    [[TMP23:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]], i32 -1)
 ; CHECK-NEXT:    [[TMP24:%.*]] = sext <vscale x 4 x i16> [[TMP23]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    [[TMP25:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]] to <vscale x 4 x i32>
 ; CHECK-NEXT:    [[TMP26:%.*]] = mul nsw <vscale x 4 x i32> [[TMP24]], [[TMP21]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 3ba91360850e..726d98f4d37d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -52,9 +52,9 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 1
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
-; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = or disjoint i32 [[TMP8]], 1
@@ -63,8 +63,8 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
 ; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP20]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
@@ -134,9 +134,9 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl i32 [[INDEX]], 1
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = or disjoint i32 [[TMP7]], 1
@@ -145,8 +145,8 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
index 1dfa7f8fe18b..cf4d65318b7e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -178,7 +178,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NOTF-NOT:     %{{.*}} = phi <vscale x 4 x i1>
 ; CHECK-NOTF:         %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
 ; CHECK-NOTF:         %[[LOAD]] = load <vscale x 4 x i32>
-; CHECK-NOTF:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NOTF:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
 ; CHECK-NOTF:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-NOTF:         store <vscale x 4 x i32> %[[ADD]]
 
@@ -191,7 +191,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-TF-NORED:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
 ; CHECK-TF-NORED:         %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
 ; CHECK-TF-NORED:         %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0({{.*}} %[[ACTIVE_LANE_MASK]]
-; CHECK-TF-NORED:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NORED:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
 ; CHECK-TF-NORED:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-TF-NORED:         call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
 
@@ -204,7 +204,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-TF-NOREC-NOT:     %{{.*}} = phi <vscale x 4 x i1>
 ; CHECK-TF-NOREC:         %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
 ; CHECK-TF-NOREC:         %[[LOAD]] = load <vscale x 4 x i32>
-; CHECK-TF-NOREC:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NOREC:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
 ; CHECK-TF-NOREC:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-TF-NOREC:         store <vscale x 4 x i32> %[[ADD]]
 
@@ -217,7 +217,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-TF-NOREV:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
 ; CHECK-TF-NOREV:         %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
 ; CHECK-TF-NOREV:         %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0({{.*}} %[[ACTIVE_LANE_MASK]]
-; CHECK-TF-NOREV:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NOREV:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
 ; CHECK-TF-NOREV:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-TF-NOREV:         call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
 
@@ -230,7 +230,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-TF:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
 ; CHECK-TF:         %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
 ; CHECK-TF:         %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0({{.*}} %[[ACTIVE_LANE_MASK]]
-; CHECK-TF:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
 ; CHECK-TF:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-TF:         call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
 
@@ -243,7 +243,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-TF-ONLYRED-NOT:     %{{.*}} = phi <vscale x 4 x i1>
 ; CHECK-TF-ONLYRED:         %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
 ; CHECK-TF-ONLYRED:         %[[LOAD]] = load <vscale x 4 x i32>
-; CHECK-TF-ONLYRED:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-ONLYRED:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
 ; CHECK-TF-ONLYRED:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-TF-ONLYRED:         store <vscale x 4 x i32> %[[ADD]]
 
@@ -256,7 +256,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEOVERSE-V1-NOT:     %{{.*}} = phi <vscale x 4 x i1>
 ; CHECK-NEOVERSE-V1:         %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
 ; CHECK-NEOVERSE-V1:         %[[LOAD]] = load <vscale x 4 x i32>
-; CHECK-NEOVERSE-V1:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NEOVERSE-V1:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
 ; CHECK-NEOVERSE-V1:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-NEOVERSE-V1:         store <vscale x 4 x i32> %[[ADD]]
 
@@ -350,30 +350,30 @@ define void @reverse(ptr noalias %dst, ptr noalias %src) #0 {
 ; CHECK-NOTF:       vector.body:
 ; CHECK-NOTF-NOT:     %{{.*}} = phi <vscale x 4 x i1>
 ; CHECK-NOTF:         %[[LOAD:.*]] = load <vscale x 2 x double>, ptr
-; CHECK-NOTF:         %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+; CHECK-NOTF:         %{{.*}} = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
 
 ; CHECK-TF-NOREV-LABEL: @reverse(
 ; CHECK-TF-NOREV:       vector.body:
 ; CHECK-TF-NOREV-NOT:     %{{.*}} = phi <vscale x 4 x i1>
 ; CHECK-TF-NOREV:         %[[LOAD:.*]] = load <vscale x 2 x double>, ptr
-; CHECK-TF-NOREV:         %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+; CHECK-TF-NOREV:         %{{.*}} = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
 
 ; CHECK-TF-LABEL: @reverse(
 ; CHECK-TF:       vector.body:
 ; CHECK-TF:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
-; CHECK-TF:         %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF:         %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
 ; CHECK-TF:         %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0({{.*}} <vscale x 2 x i1> %reverse
 
 ; CHECK-TF-NORED-LABEL: @reverse(
 ; CHECK-TF-NORED:       vector.body:
 ; CHECK-TF-NORED:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
-; CHECK-TF-NORED:         %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF-NORED:         %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
 ; CHECK-TF-NORED:         %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0({{.*}} <vscale x 2 x i1> %reverse
 
 ; CHECK-TF-NOREC-LABEL: @reverse(
 ; CHECK-TF-NOREC:       vector.body:
 ; CHECK-TF-NOREC:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
-; CHECK-TF-NOREC:         %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF-NOREC:         %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
 ; CHECK-TF-NOREC:         %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0({{.*}} <vscale x 2 x i1> %reverse
 
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
index 70833e44b075..9485d827ced4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
@@ -1,5 +1,5 @@
 ; This is the loop in c++ being vectorize in this file with
-; experimental.vector.reverse
+; vector.reverse
 
 ;#pragma clang loop vectorize_width(4, scalable)
 ;  for (long int i = N - 1; i >= 0; i--)
@@ -18,12 +18,12 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define void @vector_reverse_mask_nxv4i1(ptr %a, ptr %cond, i64 %N) #0 {
 ; CHECK-LABEL: vector.body:
-; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
 ; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
-; CHECK: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
+; CHECK: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
 ; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[REVERSE7]]
-; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
-; CHECK: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
+; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
 ; CHECK: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %[[REVERSE8]], ptr %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE9]]
 
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
index e35a4db36905..e3bba1338e1d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; This is the loop in c++ being vectorize in this file with
-;experimental.vector.reverse
+;vector.reverse
 ;  #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
 ;  for (int i = N-1; i >= 0; --i)
 ;    a[i] = b[i] + 1.0;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 126ceac7325a..61105e51cb94 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -40,10 +40,10 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <vscale x 8 x i32>, ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC2]])
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC2]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC3]], 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC3]], 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
index 5c1966fa7a2d..0f524561eadc 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -107,9 +107,10 @@ while.body:
   %1 = load i8, ptr %b.addr.07, align 1
   %add = add i8 %1, %0
   %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
-  store i8 %add, ptr %c.addr.08, align 1
   %cmp = icmp sgt i32 %N.addr.09, 1
   %select = select i1 %cmp, i8 %0, i8 %1
+  %add2 = add i8 %add, %select
+  store i8 %add2, ptr %c.addr.08, align 1
   br i1 %cmp, label %while.body, label %while.end.loopexit
 
 while.end.loopexit:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 72d9691b2bb8..c3374fceb1fb 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; This is the loop in c++ being vectorize in this file with
-;experimental.vector.reverse
+;vector.reverse
 ;  #pragma clang loop vectorize_width(4, scalable)
 ;  for (int i = N-1; i >= 0; --i)
 ;    a[i] = b[i] + 1.0;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index f2222e0a1f93..0dee4a9b8585 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -46,9 +46,9 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = sub i64 1, [[TMP14]]
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP15]]
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
-; IF-EVL-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
 ; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
@@ -56,8 +56,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP21]]
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP22]]
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; IF-EVL-NEXT:    [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
+; IF-EVL-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT:    [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
 ; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE5]], ptr [[TMP25]], i32 4, <vscale x 4 x i1> [[REVERSE4]])
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; IF-EVL-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
index 0b9b592627c6..c4f9c404a926 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
@@ -1328,13 +1328,11 @@ define void @unknown_inner_stride(ptr nocapture noundef %dst, ptr nocapture noun
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw i64 [[TMP14]], [[TMP0]]
-; CHECK-NEXT:    [[TMP16:%.*]] = add nsw i64 [[TMP15]], [[TMP11]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add nsw i64 [[TMP14]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP18]], align 4, !alias.scope [[META60:![0-9]+]]
-; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw i64 [[TMP14]], [[TMP1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = add nsw i64 [[TMP19]], [[TMP12]]
+; CHECK-NEXT:    [[TMP20:%.*]] = add nsw i64 [[TMP14]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP20]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4, !alias.scope [[META63:![0-9]+]], !noalias [[META60]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index 3be31c011eaa..d64755999635 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -21,7 +21,7 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
 ; CHECK-VF4UF1: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %[[NEXT_IDX:.*]], %vector.body ]
 ; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
 ; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i32>, ptr
-; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
 ; CHECK-VF4UF1: middle.block:
 ; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
@@ -70,7 +70,7 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; CHECK-VF4UF1: vector.body:
 ; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
 ; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i32>, ptr
-; CHECK-VF4UF1: %[[REVERSE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-VF4UF1: %[[REVERSE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
 ; CHECK-VF4UF1: middle.block:
 ; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
@@ -119,7 +119,7 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
 ; CHECK-VF4UF1: vector.body:
 ; CHECK-VF4UF1: %vector.recur = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[L1:.*]], %vector.body ]
 ; CHECK-VF4UF1: %[[L1]] = load <vscale x 4 x i16>, ptr
-; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %vector.recur, <vscale x 4 x i16> %[[L1]], i32 -1)
+; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %vector.recur, <vscale x 4 x i16> %[[L1]], i32 -1)
 ; Check also that the casts were not moved needlessly.
 ; CHECK-VF4UF1: sitofp <vscale x 4 x i16> %[[L1]] to <vscale x 4 x double>
 ; CHECK-VF4UF1: sitofp <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x double>
@@ -169,8 +169,8 @@ define i64 @constant_folded_previous_value() {
 ; CHECK-VF4UF2-LABEL: @constant_folded_previous_value
 ; CHECK-VF4UF2: vector.body
 ; CHECK-VF4UF2: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i64> [ %vector.recur.init, %vector.ph ], [ shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %vector.body ]
-; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %vector.recur, <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
-; CHECK-VF4UF2: %[[SPLICE2:.*]] = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
+; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %vector.recur, <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
+; CHECK-VF4UF2: %[[SPLICE2:.*]] = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
 ; CHECK-VF4UF2: br i1 {{.*}}, label %middle.block, label %vector.body
 entry:
   br label %scalar.body
@@ -242,7 +242,7 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
 ; CHECK-VF4UF1: vector.body
 ; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
 ; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i16>, ptr
-; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[LOAD]], i32 -1)
+; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[LOAD]], i32 -1)
 ; CHECK-VF4UF1-NEXT: sext <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x i32>
 ; CHECK-VF4UF1-NEXT: sext <vscale x 4 x i16> %[[LOAD]] to <vscale x 4 x i32>
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
new file mode 100644
index 000000000000..d09066fa2d70
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -0,0 +1,428 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+define void @test_versioned_with_sext_use(i32 %offset, ptr %dst) {
+; CHECK-LABEL: define void @test_versioned_with_sext_use(
+; CHECK-SAME: i32 [[OFFSET:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OFFSET_EXT:%.*]] = sext i32 [[OFFSET]] to i64
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header.loopexit:
+; CHECK-NEXT:    [[IV_2_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[INNER_LOOP:%.*]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT:    br label [[OUTER_HEADER]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    [[IV_1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_2_NEXT_LCSSA]], [[OUTER_HEADER_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    [[C:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[C]], label [[INNER_LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       inner.loop.preheader:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1
+; CHECK-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]]
+; CHECK-NEXT:    [[IND_END]] = add i64 [[IV_1]], [[TMP0]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[IV_1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 0, [[OFFSET_EXT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP3]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 false, label [[OUTER_HEADER_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP]]
+; CHECK:       inner.loop:
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i64 [ [[IV_2_NEXT]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV_3:%.*]] = phi i32 [ [[IV_3_NEXT:%.*]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV_2]]
+; CHECK-NEXT:    store i32 0, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[IV_2_NEXT]] = add i64 [[IV_2]], [[OFFSET_EXT]]
+; CHECK-NEXT:    [[IV_3_NEXT]] = add i32 [[IV_3]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_3]], 200
+; CHECK-NEXT:    br i1 [[EC]], label [[OUTER_HEADER_LOOPEXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %offset.ext = sext i32 %offset to i64
+  br label %outer.header
+
+outer.header:
+  %iv.1 = phi i64 [ 0, %entry ], [ %iv.2.next, %inner.loop ]
+  %c = call i1 @cond()
+  br i1 %c, label %inner.loop, label %exit
+
+inner.loop:
+  %iv.2 = phi i64 [ %iv.1, %outer.header ], [ %iv.2.next, %inner.loop ]
+  %iv.3 = phi i32 [ 0, %outer.header ], [ %iv.3.next, %inner.loop ]
+  %gep = getelementptr i32, ptr %dst, i64 %iv.2
+  store i32 0, ptr %gep, align 8
+  %iv.2.next = add i64 %iv.2, %offset.ext
+  %iv.3.next = add i32 %iv.3, 1
+  %ec = icmp eq i32 %iv.3, 200
+  br i1 %ec, label %outer.header, label %inner.loop
+
+exit:
+  ret void
+}
+
+define void @test_versioned_with_zext_use(i32 %offset, ptr %dst) {
+; CHECK-LABEL: define void @test_versioned_with_zext_use(
+; CHECK-SAME: i32 [[OFFSET:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OFFSET_EXT:%.*]] = zext i32 [[OFFSET]] to i64
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header.loopexit:
+; CHECK-NEXT:    [[IV_2_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[INNER_LOOP:%.*]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT:    br label [[OUTER_HEADER]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    [[IV_1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_2_NEXT_LCSSA]], [[OUTER_HEADER_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    [[C:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[C]], label [[INNER_LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       inner.loop.preheader:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1
+; CHECK-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]]
+; CHECK-NEXT:    [[IND_END]] = add i64 [[IV_1]], [[TMP0]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[IV_1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 0, [[OFFSET_EXT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP3]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 false, label [[OUTER_HEADER_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP]]
+; CHECK:       inner.loop:
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i64 [ [[IV_2_NEXT]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV_3:%.*]] = phi i32 [ [[IV_3_NEXT:%.*]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV_2]]
+; CHECK-NEXT:    store i32 0, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[IV_2_NEXT]] = add i64 [[IV_2]], [[OFFSET_EXT]]
+; CHECK-NEXT:    [[IV_3_NEXT]] = add i32 [[IV_3]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_3]], 200
+; CHECK-NEXT:    br i1 [[EC]], label [[OUTER_HEADER_LOOPEXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %offset.ext = zext i32 %offset to i64
+  br label %outer.header
+
+outer.header:
+  %iv.1 = phi i64 [ 0, %entry ], [ %iv.2.next, %inner.loop ]
+  %c = call i1 @cond()
+  br i1 %c, label %inner.loop, label %exit
+
+inner.loop:
+  %iv.2 = phi i64 [ %iv.1, %outer.header ], [ %iv.2.next, %inner.loop ]
+  %iv.3 = phi i32 [ 0, %outer.header ], [ %iv.3.next, %inner.loop ]
+  %gep = getelementptr i32, ptr %dst, i64 %iv.2
+  store i32 0, ptr %gep, align 8
+  %iv.2.next = add i64 %iv.2, %offset.ext
+  %iv.3.next = add i32 %iv.3, 1
+  %ec = icmp eq i32 %iv.3, 200
+  br i1 %ec, label %outer.header, label %inner.loop
+
+exit:
+  ret void
+}
+
+define void @versioned_sext_use_in_gep(i32 %scale, ptr %dst, i64 %scale.2) {
+; CHECK-LABEL: define void @versioned_sext_use_in_gep(
+; CHECK-SAME: i32 [[SCALE:%.*]], ptr [[DST:%.*]], i64 [[SCALE_2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SCALE_EXT:%.*]] = sext i32 [[SCALE]] to i64
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[SCALE]], 1
+; CHECK-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SCALE_2]]
+; CHECK-NEXT:    store ptr [[TMP8]], ptr [[TMP11]], align 8
+; CHECK-NEXT:    store ptr [[TMP8]], ptr [[TMP13]], align 8
+; CHECK-NEXT:    store ptr [[TMP8]], ptr [[TMP15]], align 8
+; CHECK-NEXT:    store ptr [[TMP8]], ptr [[TMP17]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_MUL:%.*]] = mul i64 [[IV]], [[SCALE_EXT]]
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_MUL]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[SCALE_MUL:%.*]] = mul i64 [[SCALE_EXT]], [[SCALE_2]]
+; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SCALE_MUL]]
+; CHECK-NEXT:    store ptr [[GEP_2]], ptr [[GEP_1]], align 8
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 256
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %scale.ext = sext i32 %scale to i64
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.mul  = mul i64 %iv, %scale.ext
+  %gep.1 = getelementptr i8, ptr %dst, i64 %iv.mul
+  %iv.next = add i64 %iv, 1
+  %scale.mul = mul i64 %scale.ext, %scale.2
+  %gep.2 = getelementptr i8, ptr %dst, i64 %scale.mul
+  store ptr %gep.2, ptr %gep.1, align 8
+  %ec = icmp eq i64 %iv.next, 256
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare i1 @cond()
+
+define void @test_versioned_with_different_uses(i32 %offset, ptr noalias %dst.1, ptr %dst.2) {
+; CHECK-LABEL: define void @test_versioned_with_different_uses(
+; CHECK-SAME: i32 [[OFFSET:%.*]], ptr noalias [[DST_1:%.*]], ptr [[DST_2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OFFSET_EXT:%.*]] = zext i32 [[OFFSET]] to i64
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header.loopexit:
+; CHECK-NEXT:    [[IV_2_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[INNER_LOOP:%.*]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT:    br label [[OUTER_HEADER]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    [[IV_1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_2_NEXT_LCSSA]], [[OUTER_HEADER_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    [[C:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[C]], label [[INNER_LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       inner.loop.preheader:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1
+; CHECK-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]]
+; CHECK-NEXT:    [[IND_END]] = add i64 [[IV_1]], [[TMP0]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[IV_1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 0, [[OFFSET_EXT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], [[TMP2]]
+; CHECK-NEXT:    [[OFFSET_IDX2:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[OFFSET_IDX2]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[OFFSET_IDX2]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[OFFSET_IDX2]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX2]], 3
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP7]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP8]], align 8
+; CHECK-NEXT:    store i32 0, ptr [[TMP9]], align 8
+; CHECK-NEXT:    store i32 0, ptr [[TMP10]], align 8
+; CHECK-NEXT:    store i32 0, ptr [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 false, label [[OUTER_HEADER_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP]]
+; CHECK:       inner.loop:
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i64 [ [[IV_2_NEXT]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV_3:%.*]] = phi i32 [ [[IV_3_NEXT:%.*]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV_MUL:%.*]] = mul i32 [[IV_3]], [[OFFSET]]
+; CHECK-NEXT:    [[GEP_MUL:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[IV_MUL]]
+; CHECK-NEXT:    store i32 0, ptr [[GEP_MUL]], align 8
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[IV_2]]
+; CHECK-NEXT:    store i32 0, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[IV_2_NEXT]] = add i64 [[IV_2]], [[OFFSET_EXT]]
+; CHECK-NEXT:    [[IV_3_NEXT]] = add i32 [[IV_3]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_3]], 200
+; CHECK-NEXT:    br i1 [[EC]], label [[OUTER_HEADER_LOOPEXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %offset.ext = zext i32 %offset to i64
+  br label %outer.header
+
+outer.header:
+  %iv.1 = phi i64 [ 0, %entry ], [ %iv.2.next, %inner.loop ]
+  %c = call i1 @cond()
+  br i1 %c, label %inner.loop, label %exit
+
+inner.loop:
+  %iv.2 = phi i64 [ %iv.1, %outer.header ], [ %iv.2.next, %inner.loop ]
+  %iv.3 = phi i32 [ 0, %outer.header ], [ %iv.3.next, %inner.loop ]
+  %iv.mul  = mul i32 %iv.3, %offset
+  %gep.mul = getelementptr i8, ptr %dst.1, i32 %iv.mul
+  store i32 0, ptr %gep.mul, align 8
+  %gep = getelementptr i32, ptr %dst.2, i64 %iv.2
+  store i32 0, ptr %gep, align 8
+  %iv.2.next = add i64 %iv.2, %offset.ext
+  %iv.3.next = add i32 %iv.3, 1
+  %ec = icmp eq i32 %iv.3, 200
+  br i1 %ec, label %outer.header, label %inner.loop
+
+exit:
+  ret void
+}
+
+define void @test_versioned_with_non_ex_use(i32 %offset, ptr noalias %dst.1, ptr %dst.2) {
+; CHECK-LABEL: define void @test_versioned_with_non_ex_use(
+; CHECK-SAME: i32 [[OFFSET:%.*]], ptr noalias [[DST_1:%.*]], ptr [[DST_2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OFFSET_EXT:%.*]] = zext i32 [[OFFSET]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[OFFSET]], 3
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i32 -3, [[OFFSET]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[ADD]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 [[TMP0]], i32 [[ADD]]
+; CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP2]], i32 200)
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i32 0, [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt i32 [[MUL_RESULT]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP1]], i1 [[TMP5]], i1 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[IDENT_CHECK]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP10]], i32 2
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP17]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP12]], align 8
+; CHECK-NEXT:    store i32 0, ptr [[TMP14]], align 8
+; CHECK-NEXT:    store i32 0, ptr [[TMP16]], align 8
+; CHECK-NEXT:    store i32 0, ptr [[TMP18]], align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[TMP20]], i32 0
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP21]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_3:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_3_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_MUL:%.*]] = mul i32 [[IV_3]], [[ADD]]
+; CHECK-NEXT:    [[GEP_MUL:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[IV_MUL]]
+; CHECK-NEXT:    store i32 0, ptr [[GEP_MUL]], align 8
+; CHECK-NEXT:    [[IV_2_MUL:%.*]] = mul i64 [[IV_2]], [[OFFSET_EXT]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[IV_2_MUL]]
+; CHECK-NEXT:    store i32 0, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[IV_2_NEXT]] = add i64 [[IV_2]], 1
+; CHECK-NEXT:    [[IV_3_NEXT]] = add i32 [[IV_3]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_3]], 200
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %offset.ext = zext i32 %offset to i64
+  %add = add i32 %offset, 3
+  br label %loop
+
+loop:
+  %iv.2 = phi i64 [ 0, %entry ], [ %iv.2.next, %loop ]
+  %iv.3 = phi i32 [ 0, %entry ], [ %iv.3.next, %loop ]
+  %iv.mul  = mul i32 %iv.3, %add
+  %gep.mul = getelementptr i8, ptr %dst.1, i32 %iv.mul
+  store i32 0, ptr %gep.mul, align 8
+  %iv.2.mul  = mul i64 %iv.2, %offset.ext
+  %gep = getelementptr i32, ptr %dst.2, i64 %iv.2.mul
+  store i32 0, ptr %gep, align 8
+  %iv.2.next = add i64 %iv.2, 1
+  %iv.3.next = add i32 %iv.3, 1
+  %ec = icmp eq i32 %iv.3, 200
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll b/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll
index 4c88f4acc12f..2795333effd7 100644
--- a/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll
+++ b/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll
@@ -109,8 +109,8 @@ attributes #6 = { noreturn nounwind }
 !11 = !{}
 !12 = !{!"branch_weights", i32 1048575, i32 1}
 ; CHECK: Function Attrs: minsize mustprogress nofree norecurse nosync nounwind optsize willreturn memory(none)
-; CHECK-LABEL: define dso_local noundef i32 @_Z9nothrow_ei
-; CHECK-SAME: (i32 noundef [[NUM:%.*]]) #[[ATTR0:[0-9]+]] !type !4 !type !5 !type !6 {
+; CHECK-LABEL: define dso_local noundef range(i32 0, 2) i32 @_Z9nothrow_ei
+; CHECK-SAME: (i32 noundef [[NUM:%.*]]) #[[ATTR0:[0-9]+]] !type [[META4:![0-9]+]] !type [[META5:![0-9]+]] !type [[META6:![0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i32 [[NUM]], 0
 ; CHECK-NEXT:    [[DOT:%.*]] = zext i1 [[TOBOOL_NOT]] to i32
@@ -118,8 +118,8 @@ attributes #6 = { noreturn nounwind }
 ;
 ;
 ; CHECK: Function Attrs: minsize mustprogress nofree norecurse nosync nounwind optsize willreturn memory(write, argmem: none, inaccessiblemem: none)
-; CHECK-LABEL: define dso_local noundef i32 @_Z10call_catchi
-; CHECK-SAME: (i32 noundef [[NUM:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !type !4 !type !5 !type !6 {
+; CHECK-LABEL: define dso_local noundef range(i32 0, 2) i32 @_Z10call_catchi
+; CHECK-SAME: (i32 noundef [[NUM:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !type [[META4]] !type [[META5]] !type [[META6]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    store ptr @_Z9nothrow_ei.cfi_jt, ptr @catch_ptr, align 8, !tbaa [[TBAA7:![0-9]+]]
 ; CHECK-NEXT:    [[TOBOOL_NOT_I:%.*]] = icmp ne i32 [[NUM]], 0
@@ -131,17 +131,17 @@ attributes #6 = { noreturn nounwind }
 ; CHECK-LABEL: define weak_odr hidden void @__cfi_check_fail
 ; CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq ptr [[TMP0]], null, !nosanitize !11
-; CHECK-NEXT:    br i1 [[DOTNOT]], label [[TRAP:%.*]], label [[CONT:%.*]], !nosanitize !11
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq ptr [[TMP0]], null, !nosanitize [[META11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[TRAP:%.*]], label [[CONT:%.*]], !nosanitize [[META11]]
 ; CHECK:       trap:
-; CHECK-NEXT:    tail call void @llvm.ubsantrap(i8 2) #[[ATTR5:[0-9]+]], !nosanitize !11
-; CHECK-NEXT:    unreachable, !nosanitize !11
+; CHECK-NEXT:    tail call void @llvm.ubsantrap(i8 2) #[[ATTR6:[0-9]+]], !nosanitize [[META11]]
+; CHECK-NEXT:    unreachable, !nosanitize [[META11]]
 ; CHECK:       cont:
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TMP0]], align 4, !nosanitize !11
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TMP0]], align 4, !nosanitize [[META11]]
 ; CHECK-NEXT:    [[SWITCH:%.*]] = icmp ult i8 [[TMP2]], 5
 ; CHECK-NEXT:    br i1 [[SWITCH]], label [[TRAP]], label [[CONT6:%.*]]
 ; CHECK:       cont6:
-; CHECK-NEXT:    ret void, !nosanitize !11
+; CHECK-NEXT:    ret void, !nosanitize [[META11]]
 ;
 ;
 ; CHECK-LABEL: define weak void @__cfi_check
@@ -153,8 +153,8 @@ attributes #6 = { noreturn nounwind }
 ;
 ; CHECK: Function Attrs: naked nocf_check noinline nounwind
 ; CHECK-LABEL: define internal void @_Z9nothrow_ei.cfi_jt
-; CHECK-SAME: () #[[ATTR4:[0-9]+]] align 8 {
+; CHECK-SAME: () #[[ATTR5:[0-9]+]] align 8 {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    tail call void asm sideeffect "jmp ${0:c}@plt\0Aint3\0Aint3\0Aint3\0A", "s"(ptr nonnull @_Z9nothrow_ei) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT:    tail call void asm sideeffect "jmp ${0:c}@plt\0Aint3\0Aint3\0Aint3\0A", "s"(ptr nonnull @_Z9nothrow_ei) #[[ATTR7:[0-9]+]]
 ; CHECK-NEXT:    unreachable
 ;
diff --git a/llvm/test/Transforms/OpenMP/add_attributes.ll b/llvm/test/Transforms/OpenMP/add_attributes.ll
index 47ff5cad4e7e..ebcca3067f04 100644
--- a/llvm/test/Transforms/OpenMP/add_attributes.ll
+++ b/llvm/test/Transforms/OpenMP/add_attributes.ll
@@ -641,8 +641,6 @@ declare i32 @__tgt_target_teams_mapper(ptr, i64, ptr, i32, ptr, ptr, ptr, ptr, p
 
 declare i32 @__tgt_target_teams_nowait_mapper(ptr, i64, ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, i32, i32, i32, ptr, i32, ptr)
 
-declare void @__tgt_register_requires(i64)
-
 declare void @__tgt_target_data_begin_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr)
 
 declare void @__tgt_target_data_begin_nowait_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr)
@@ -1249,9 +1247,6 @@ declare i32 @__tgt_target_kernel_nowait(ptr, i64, i32, i32, ptr, ptr, i32, ptr,
 ; CHECK-NEXT: declare i32 @__tgt_target_teams_nowait_mapper(ptr, i64, ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, i32, i32, i32, ptr, i32, ptr)
 
 ; CHECK: ; Function Attrs: nounwind
-; CHECK-NEXT: declare void @__tgt_register_requires(i64)
-
-; CHECK: ; Function Attrs: nounwind
 ; CHECK-NEXT: declare void @__tgt_target_data_begin_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr)
 
 ; CHECK: ; Function Attrs: nounwind
@@ -1894,9 +1889,6 @@ declare i32 @__tgt_target_kernel_nowait(ptr, i64, i32, i32, ptr, ptr, i32, ptr,
 ; OPTIMISTIC-NEXT: declare i32 @__tgt_target_teams_nowait_mapper(ptr, i64, ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, i32, i32, i32, ptr, i32, ptr)
 
 ; OPTIMISTIC: ; Function Attrs: nounwind
-; OPTIMISTIC-NEXT: declare void @__tgt_register_requires(i64)
-
-; OPTIMISTIC: ; Function Attrs: nounwind
 ; OPTIMISTIC-NEXT: declare void @__tgt_target_data_begin_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr)
 
 ; OPTIMISTIC: ; Function Attrs: nounwind
@@ -2552,9 +2544,6 @@ declare i32 @__tgt_target_kernel_nowait(ptr, i64, i32, i32, ptr, ptr, i32, ptr,
 ; EXT-NEXT: declare signext i32 @__tgt_target_teams_nowait_mapper(ptr, i64, ptr, i32 signext, ptr, ptr, ptr, ptr, ptr, ptr, i32 signext, i32 signext, i32 signext, ptr, i32 signext, ptr)
 
 ; EXT: ; Function Attrs: nounwind
-; EXT-NEXT: declare void @__tgt_register_requires(i64)
-
-; EXT: ; Function Attrs: nounwind
 ; EXT-NEXT: declare void @__tgt_target_data_begin_mapper(ptr, i64, i32 signext, ptr, ptr, ptr, ptr, ptr, ptr)
 
 ; EXT: ; Function Attrs: nounwind
diff --git a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
new file mode 100644
index 000000000000..2ad95ab51cc6
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt -passes=pgo-instr-gen -profile-context-root=an_entrypoint \
+; RUN:   -S < %s | FileCheck --check-prefix=INSTRUMENT %s
+
+declare void @bar()
+
+;.
+; INSTRUMENT: @__profn_foo = private constant [3 x i8] c"foo"
+;.
+define void @foo(i32 %a, ptr %fct) {
+; INSTRUMENT-LABEL: define void @foo(
+; INSTRUMENT-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) {
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0)
+; INSTRUMENT-NEXT:    [[T:%.*]] = icmp eq i32 [[A]], 0
+; INSTRUMENT-NEXT:    br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]]
+; INSTRUMENT:       yes:
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1)
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0, ptr [[FCT]])
+; INSTRUMENT-NEXT:    call void [[FCT]](i32 [[A]])
+; INSTRUMENT-NEXT:    br label [[EXIT:%.*]]
+; INSTRUMENT:       no:
+; INSTRUMENT-NEXT:    call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1, ptr @bar)
+; INSTRUMENT-NEXT:    call void @bar()
+; INSTRUMENT-NEXT:    br label [[EXIT]]
+; INSTRUMENT:       exit:
+; INSTRUMENT-NEXT:    ret void
+;
+  %t = icmp eq i32 %a, 0
+  br i1 %t, label %yes, label %no
+yes:
+  call void %fct(i32 %a)
+  br label %exit
+no:
+  call void @bar()
+  br label %exit
+exit:
+  ret void
+}
+;.
+; INSTRUMENT: attributes #[[ATTR0:[0-9]+]] = { nounwind }
+;.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll
index 9206893cb234..c133852f6693 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll
@@ -7,7 +7,7 @@ target triple = "aarch64"
 ; Check that the function gets vectorized.
 
 define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) {
-; CHECK-LABEL: define i32 @quant_4x4
+; CHECK-LABEL: define range(i32 0, 2) i32 @quant_4x4
 ; CHECK-SAME: (ptr nocapture noundef [[DCT:%.*]], ptr nocapture noundef readonly [[MF:%.*]], ptr nocapture noundef readonly [[BIAS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DCT]], i64 32
diff --git a/llvm/test/Transforms/PhaseOrdering/icmp-ashr-breaking-select-idiom.ll b/llvm/test/Transforms/PhaseOrdering/icmp-ashr-breaking-select-idiom.ll
index 67d721b23d6f..35d5ceeb9195 100644
--- a/llvm/test/Transforms/PhaseOrdering/icmp-ashr-breaking-select-idiom.ll
+++ b/llvm/test/Transforms/PhaseOrdering/icmp-ashr-breaking-select-idiom.ll
@@ -2,7 +2,7 @@
 ; RUN: opt -O1 -S < %s  | FileCheck %s
 
 define i32 @testa(i32 %mul) {
-; CHECK-LABEL: define i32 @testa(
+; CHECK-LABEL: define range(i32 -65536, 65536) i32 @testa(
 ; CHECK-SAME: i32 [[MUL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[MUL]], 15
 ; CHECK-NEXT:    [[SPEC_SELECT_I:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767)
@@ -16,7 +16,7 @@ define i32 @testa(i32 %mul) {
 }
 
 define i32 @testb(i32 %mul) {
-; CHECK-LABEL: define i32 @testb(
+; CHECK-LABEL: define range(i32 -16777216, 16777216) i32 @testb(
 ; CHECK-SAME: i32 [[MUL:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[SHR102:%.*]] = ashr i32 [[MUL]], 7
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[SHR102]], i32 -128)
diff --git a/llvm/test/Transforms/PhaseOrdering/min_max_loop.ll b/llvm/test/Transforms/PhaseOrdering/min_max_loop.ll
index fb338a6507eb..63cfef6f3d09 100644
--- a/llvm/test/Transforms/PhaseOrdering/min_max_loop.ll
+++ b/llvm/test/Transforms/PhaseOrdering/min_max_loop.ll
@@ -19,7 +19,7 @@
 ;; }
 
 define i16 @vecreduce_smin_v2i16(i32 %n, ptr %v) {
-; CHECK-LABEL: define i16 @vecreduce_smin_v2i16(
+; CHECK-LABEL: define range(i16 -32768, 1) i16 @vecreduce_smin_v2i16(
 ; CHECK:    @llvm.smin.v2i16
 
 entry:
@@ -65,7 +65,7 @@ for.end:                                          ; preds = %for.cond
 }
 
 define i16 @vecreduce_smax_v2i16(i32 %n, ptr %v) {
-; CHECK-LABEL: define i16 @vecreduce_smax_v2i16(
+; CHECK-LABEL: define range(i16 0, -32768) i16 @vecreduce_smax_v2i16(
 ; CHECK:  @llvm.smax.v2i16
 
 entry:
diff --git a/llvm/test/Transforms/SCCP/and-add-shl.ll b/llvm/test/Transforms/SCCP/and-add-shl.ll
index 7c037ffa6bf6..7af563f13a18 100644
--- a/llvm/test/Transforms/SCCP/and-add-shl.ll
+++ b/llvm/test/Transforms/SCCP/and-add-shl.ll
@@ -59,7 +59,7 @@ define i8 @and_not_shl_1(i8 %x) {
 
 ; Negative test: https://alive2.llvm.org/ce/z/Zv4Pyu
 define i8 @and_add_shl_overlap(i8 %x) {
-; CHECK-LABEL: define i8 @and_add_shl_overlap
+; CHECK-LABEL: define range(i8 0, 33) i8 @and_add_shl_overlap
 ; CHECK-SAME: (i8 [[X:%.*]]) {
 ; CHECK-NEXT:    [[OP1_P2:%.*]] = icmp ule i8 [[X]], 6
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OP1_P2]])
@@ -77,7 +77,7 @@ define i8 @and_add_shl_overlap(i8 %x) {
 }
 
 define i8 @and_not_shl_overlap(i8 %x) {
-; CHECK-LABEL: define i8 @and_not_shl_overlap
+; CHECK-LABEL: define range(i8 0, 5) i8 @and_not_shl_overlap
 ; CHECK-SAME: (i8 [[X:%.*]]) {
 ; CHECK-NEXT:    [[OP1_P2:%.*]] = icmp ule i8 [[X]], 3
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[OP1_P2]])
diff --git a/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll b/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll
index 64c1b9020a05..c24c554102dd 100644
--- a/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll
+++ b/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll
@@ -1,20 +1,21 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=ipsccp -S %s | FileCheck %s
 
 ; Test 1.
 ; Both arguments and return value of @callee can be tracked. The inferred range
 ; can be added to call sites.
 define internal i32 @callee(i32 %x) {
-; CHECK-LABEL: @callee(
-; CHECK-NEXT:    ret i32 [[X:%.*]]
+; CHECK-LABEL: define internal range(i32 0, 21) i32 @callee(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    ret i32 [[X]]
 ;
   ret i32 %x
 }
 
 define i32 @caller1() {
-; CHECK-LABEL: @caller1(
-; CHECK-NEXT:    [[C1:%.*]] = call i32 @callee(i32 10), !range [[RNG0:![0-9]+]]
-; CHECK-NEXT:    [[C2:%.*]] = call i32 @callee(i32 20), !range [[RNG0]]
+; CHECK-LABEL: define range(i32 0, 41) i32 @caller1() {
+; CHECK-NEXT:    [[C1:%.*]] = call i32 @callee(i32 10)
+; CHECK-NEXT:    [[C2:%.*]] = call i32 @callee(i32 20)
 ; CHECK-NEXT:    [[A:%.*]] = add nuw nsw i32 [[C1]], [[C2]]
 ; CHECK-NEXT:    ret i32 [[A]]
 ;
@@ -25,9 +26,10 @@ define i32 @caller1() {
 }
 
 define i32 @caller2(i32 %x) {
-; CHECK-LABEL: @caller2(
-; CHECK-NEXT:    [[X_15:%.*]] = and i32 [[X:%.*]], 15
-; CHECK-NEXT:    [[C:%.*]] = call i32 @callee(i32 [[X_15]]), !range [[RNG0]]
+; CHECK-LABEL: define range(i32 0, 21) i32 @caller2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[X_15:%.*]] = and i32 [[X]], 15
+; CHECK-NEXT:    [[C:%.*]] = call i32 @callee(i32 [[X_15]])
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
   %x.15 = and i32 %x, 15
@@ -43,14 +45,15 @@ define i32 @caller2(i32 %x) {
 declare void @use_cb1(ptr)
 
 define internal i32 @callee2(i32 %x) {
-; CHECK-LABEL: @callee2(
-; CHECK-NEXT:    ret i32 [[X:%.*]]
+; CHECK-LABEL: define internal i32 @callee2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    ret i32 [[X]]
 ;
   ret i32 %x
 }
 
 define void @caller_cb1() {
-; CHECK-LABEL: @caller_cb1(
+; CHECK-LABEL: define void @caller_cb1() {
 ; CHECK-NEXT:    [[C1:%.*]] = call i32 @callee2(i32 9)
 ; CHECK-NEXT:    [[C2:%.*]] = call i32 @callee2(i32 10)
 ; CHECK-NEXT:    call void @use_cb1(ptr @callee2)
@@ -70,8 +73,9 @@ define void @caller_cb1() {
 declare void @use_cb2(ptr)
 
 define internal i32 @callee3(i32 %x) {
-; CHECK-LABEL: @callee3(
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[X:%.*]], 10
+; CHECK-LABEL: define internal range(i32 500, 601) i32 @callee3(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[X]], 10
 ; CHECK-NEXT:    [[S:%.*]] = select i1 [[C]], i32 500, i32 600
 ; CHECK-NEXT:    ret i32 [[S]]
 ;
@@ -81,9 +85,9 @@ define internal i32 @callee3(i32 %x) {
 }
 
 define void @caller_cb2() {
-; CHECK-LABEL: @caller_cb2(
-; CHECK-NEXT:    [[C1:%.*]] = call i32 @callee3(i32 9), !range [[RNG1:![0-9]+]]
-; CHECK-NEXT:    [[C2:%.*]] = call i32 @callee3(i32 10), !range [[RNG1]]
+; CHECK-LABEL: define void @caller_cb2() {
+; CHECK-NEXT:    [[C1:%.*]] = call i32 @callee3(i32 9)
+; CHECK-NEXT:    [[C2:%.*]] = call i32 @callee3(i32 10)
 ; CHECK-NEXT:    call void @use_cb2(ptr @callee3)
 ; CHECK-NEXT:    ret void
 ;
@@ -100,9 +104,10 @@ define void @caller_cb2() {
 declare void @use_cb3(ptr)
 
 define internal i32 @callee4(i32 %x, i32 %y) {
-; CHECK-LABEL: @callee4(
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[X:%.*]], 10
-; CHECK-NEXT:    [[S:%.*]] = select i1 [[C]], i32 500, i32 [[Y:%.*]]
+; CHECK-LABEL: define internal i32 @callee4(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[X]], 10
+; CHECK-NEXT:    [[S:%.*]] = select i1 [[C]], i32 500, i32 [[Y]]
 ; CHECK-NEXT:    ret i32 [[S]]
 ;
   %c = icmp eq i32 %x, 10
@@ -111,11 +116,9 @@ define internal i32 @callee4(i32 %x, i32 %y) {
 }
 
 define void @caller_cb3() {
-; CHECK-LABEL: @caller_cb3(
+; CHECK-LABEL: define void @caller_cb3() {
 ; CHECK-NEXT:    [[C1:%.*]] = call i32 @callee4(i32 11, i32 30)
-; CHECK-NOT:   !range
 ; CHECK-NEXT:    [[C2:%.*]] = call i32 @callee4(i32 12, i32 40)
-; CHECK-NOT:   !range
 ; CHECK-NEXT:    call void @use_cb3(ptr @callee4)
 ; CHECK-NEXT:    ret void
 ;
@@ -129,15 +132,16 @@ define void @caller_cb3() {
 ; Range for the return value of callee5 includes undef. No range metadata
 ; should be added at call sites.
 define internal i32 @callee5(i32 %x, i32 %y) {
-; CHECK-LABEL: @callee5(
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[X:%.*]], 15
+; CHECK-LABEL: define internal i32 @callee5(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[X]], 15
 ; CHECK-NEXT:    br i1 [[C]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[Y:%.*]], [[BB1]] ], [ undef, [[BB2]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[Y]], [[BB1]] ], [ undef, [[BB2]] ]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %c = icmp slt i32 %x, 15
@@ -155,11 +159,9 @@ exit:
 }
 
 define i32 @caller5() {
-; CHECK-LABEL: @caller5(
+; CHECK-LABEL: define range(i32 200, 401) i32 @caller5() {
 ; CHECK-NEXT:    [[C1:%.*]] = call i32 @callee5(i32 10, i32 100)
-; CHECK-NOT:   !range
 ; CHECK-NEXT:    [[C2:%.*]] = call i32 @callee5(i32 20, i32 200)
-; CHECK-NOT:   !range
 ; CHECK-NEXT:    [[A:%.*]] = add i32 [[C1]], [[C2]]
 ; CHECK-NEXT:    ret i32 [[A]]
 ;
@@ -170,8 +172,9 @@ define i32 @caller5() {
 }
 
 define internal <2 x i64> @ctlz(<2 x i64> %arg) {
-; CHECK-LABEL: @ctlz(
-; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[ARG:%.*]], i1 false)
+; CHECK-LABEL: define internal range(i64 0, 65) <2 x i64> @ctlz(
+; CHECK-SAME: <2 x i64> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[ARG]], i1 false)
 ; CHECK-NEXT:    ret <2 x i64> [[RES]]
 ;
   %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %arg, i1 false)
@@ -179,8 +182,9 @@ define internal <2 x i64> @ctlz(<2 x i64> %arg) {
 }
 
 define <2 x i64> @ctlz_caller(<2 x i64> %arg) {
-; CHECK-LABEL: @ctlz_caller(
-; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @ctlz(<2 x i64> [[ARG:%.*]]), !range [[RNG2:![0-9]+]]
+; CHECK-LABEL: define range(i64 0, 65) <2 x i64> @ctlz_caller(
+; CHECK-SAME: <2 x i64> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @ctlz(<2 x i64> [[ARG]])
 ; CHECK-NEXT:    ret <2 x i64> [[RES]]
 ;
   %res = call <2 x i64> @ctlz(<2 x i64> %arg)
@@ -189,6 +193,3 @@ define <2 x i64> @ctlz_caller(<2 x i64> %arg) {
 
 declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)
 
-; CHECK: [[RNG0]] = !{i32 0, i32 21}
-; CHECK: [[RNG1]] = !{i32 500, i32 601}
-; CHECK: [[RNG2]] = !{i64 0, i64 65}
diff --git a/llvm/test/Transforms/SCCP/ip-ranges-casts.ll b/llvm/test/Transforms/SCCP/ip-ranges-casts.ll
index 80d90922c2fb..05fa04a9fbe0 100644
--- a/llvm/test/Transforms/SCCP/ip-ranges-casts.ll
+++ b/llvm/test/Transforms/SCCP/ip-ranges-casts.ll
@@ -1,10 +1,11 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=ipsccp -S | FileCheck %s
 
 ; x = [100, 301)
 define internal i1 @f.trunc(i32 %x) {
-; CHECK-LABEL: @f.trunc(
-; CHECK-NEXT:    [[T_1:%.*]] = trunc nuw nsw i32 [[X:%.*]] to i16
+; CHECK-LABEL: define internal i1 @f.trunc(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[T_1:%.*]] = trunc nuw nsw i32 [[X]] to i16
 ; CHECK-NEXT:    [[C_2:%.*]] = icmp sgt i16 [[T_1]], 299
 ; CHECK-NEXT:    [[C_4:%.*]] = icmp slt i16 [[T_1]], 101
 ; CHECK-NEXT:    [[RES_1:%.*]] = add nuw nsw i1 false, [[C_2]]
@@ -43,7 +44,7 @@ define internal i1 @f.trunc(i32 %x) {
 }
 
 define i1 @caller1() {
-; CHECK-LABEL: @caller1(
+; CHECK-LABEL: define i1 @caller1() {
 ; CHECK-NEXT:    [[CALL_1:%.*]] = tail call i1 @f.trunc(i32 100)
 ; CHECK-NEXT:    [[CALL_2:%.*]] = tail call i1 @f.trunc(i32 300)
 ; CHECK-NEXT:    [[RES:%.*]] = and i1 [[CALL_1]], [[CALL_2]]
@@ -58,14 +59,15 @@ define i1 @caller1() {
 
 ; x = [100, 301)
 define internal i1 @f.zext(i32 %x, i32 %y) {
-; CHECK-LABEL: @f.zext(
-; CHECK-NEXT:    [[T_1:%.*]] = zext nneg i32 [[X:%.*]] to i64
+; CHECK-LABEL: define internal i1 @f.zext(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[T_1:%.*]] = zext nneg i32 [[X]] to i64
 ; CHECK-NEXT:    [[C_2:%.*]] = icmp sgt i64 [[T_1]], 299
 ; CHECK-NEXT:    [[C_4:%.*]] = icmp slt i64 [[T_1]], 101
 ; CHECK-NEXT:    [[RES_1:%.*]] = add nuw nsw i1 false, [[C_2]]
 ; CHECK-NEXT:    [[RES_2:%.*]] = add nuw nsw i1 [[RES_1]], false
 ; CHECK-NEXT:    [[RES_3:%.*]] = add i1 [[RES_2]], [[C_4]]
-; CHECK-NEXT:    [[T_2:%.*]] = zext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    [[T_2:%.*]] = zext i32 [[Y]] to i64
 ; CHECK-NEXT:    [[C_5:%.*]] = icmp sgt i64 [[T_2]], 300
 ; CHECK-NEXT:    [[C_6:%.*]] = icmp sgt i64 [[T_2]], 299
 ; CHECK-NEXT:    [[C_8:%.*]] = icmp slt i64 [[T_2]], 1
@@ -97,7 +99,7 @@ define internal i1 @f.zext(i32 %x, i32 %y) {
 }
 
 define i1 @caller.zext() {
-; CHECK-LABEL: @caller.zext(
+; CHECK-LABEL: define i1 @caller.zext() {
 ; CHECK-NEXT:    [[CALL_1:%.*]] = tail call i1 @f.zext(i32 100, i32 -120)
 ; CHECK-NEXT:    [[CALL_2:%.*]] = tail call i1 @f.zext(i32 300, i32 900)
 ; CHECK-NEXT:    [[RES:%.*]] = and i1 [[CALL_1]], [[CALL_2]]
@@ -111,14 +113,15 @@ define i1 @caller.zext() {
 
 ; x = [100, 301)
 define internal i1 @f.sext(i32 %x, i32 %y) {
-; CHECK-LABEL: @f.sext(
-; CHECK-NEXT:    [[T_1:%.*]] = zext nneg i32 [[X:%.*]] to i64
+; CHECK-LABEL: define internal i1 @f.sext(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[T_1:%.*]] = zext nneg i32 [[X]] to i64
 ; CHECK-NEXT:    [[C_2:%.*]] = icmp sgt i64 [[T_1]], 299
 ; CHECK-NEXT:    [[C_4:%.*]] = icmp slt i64 [[T_1]], 101
 ; CHECK-NEXT:    [[RES_1:%.*]] = add nuw nsw i1 false, [[C_2]]
 ; CHECK-NEXT:    [[RES_2:%.*]] = add nuw nsw i1 [[RES_1]], false
 ; CHECK-NEXT:    [[RES_3:%.*]] = add i1 [[RES_2]], [[C_4]]
-; CHECK-NEXT:    [[T_2:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT:    [[T_2:%.*]] = sext i32 [[Y]] to i64
 ; CHECK-NEXT:    [[C_6:%.*]] = icmp sgt i64 [[T_2]], 899
 ; CHECK-NEXT:    [[C_8:%.*]] = icmp slt i64 [[T_2]], -119
 ; CHECK-NEXT:    [[RES_4:%.*]] = add nuw nsw i1 [[RES_3]], false
@@ -148,7 +151,7 @@ define internal i1 @f.sext(i32 %x, i32 %y) {
 }
 
 define i1 @caller.sext() {
-; CHECK-LABEL: @caller.sext(
+; CHECK-LABEL: define i1 @caller.sext() {
 ; CHECK-NEXT:    [[CALL_1:%.*]] = tail call i1 @f.sext(i32 100, i32 -120)
 ; CHECK-NEXT:    [[CALL_2:%.*]] = tail call i1 @f.sext(i32 300, i32 900)
 ; CHECK-NEXT:    [[RES:%.*]] = and i1 [[CALL_1]], [[CALL_2]]
@@ -162,8 +165,9 @@ define i1 @caller.sext() {
 
 ; There's nothing we can do besides going to the full range or overdefined.
 define internal i1 @f.fptosi(i32 %x) {
-; CHECK-LABEL: @f.fptosi(
-; CHECK-NEXT:    [[TO_DOUBLE:%.*]] = sitofp i32 [[X:%.*]] to double
+; CHECK-LABEL: define internal i1 @f.fptosi(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TO_DOUBLE:%.*]] = sitofp i32 [[X]] to double
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd double 0.000000e+00, [[TO_DOUBLE]]
 ; CHECK-NEXT:    [[TO_I32:%.*]] = fptosi double [[ADD]] to i32
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp sgt i32 [[TO_I32]], 300
@@ -189,7 +193,7 @@ define internal i1 @f.fptosi(i32 %x) {
 }
 
 define i1 @caller.fptosi() {
-; CHECK-LABEL: @caller.fptosi(
+; CHECK-LABEL: define i1 @caller.fptosi() {
 ; CHECK-NEXT:    [[CALL_1:%.*]] = tail call i1 @f.fptosi(i32 100)
 ; CHECK-NEXT:    [[CALL_2:%.*]] = tail call i1 @f.fptosi(i32 300)
 ; CHECK-NEXT:    [[RES:%.*]] = and i1 [[CALL_1]], [[CALL_2]]
@@ -203,8 +207,9 @@ define i1 @caller.fptosi() {
 
 ; There's nothing we can do besides going to the full range or overdefined.
 define internal i1 @f.fpext(i16 %x) {
-; CHECK-LABEL: @f.fpext(
-; CHECK-NEXT:    [[TO_FLOAT:%.*]] = sitofp i16 [[X:%.*]] to float
+; CHECK-LABEL: define internal i1 @f.fpext(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT:    [[TO_FLOAT:%.*]] = sitofp i16 [[X]] to float
 ; CHECK-NEXT:    [[TO_DOUBLE:%.*]] = fpext float [[TO_FLOAT]] to double
 ; CHECK-NEXT:    [[TO_I64:%.*]] = fptoui float [[TO_FLOAT]] to i64
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp sgt i64 [[TO_I64]], 300
@@ -231,7 +236,7 @@ define internal i1 @f.fpext(i16 %x) {
 
 ; There's nothing we can do besides going to the full range or overdefined.
 define i1 @caller.fpext() {
-; CHECK-LABEL: @caller.fpext(
+; CHECK-LABEL: define i1 @caller.fpext() {
 ; CHECK-NEXT:    [[CALL_1:%.*]] = tail call i1 @f.fpext(i16 100)
 ; CHECK-NEXT:    [[CALL_2:%.*]] = tail call i1 @f.fpext(i16 300)
 ; CHECK-NEXT:    [[RES:%.*]] = and i1 [[CALL_1]], [[CALL_2]]
@@ -245,8 +250,9 @@ define i1 @caller.fpext() {
 
 ; There's nothing we can do besides going to the full range or overdefined.
 define internal i1 @f.inttoptr.ptrtoint(i64 %x) {
-; CHECK-LABEL: @f.inttoptr.ptrtoint(
-; CHECK-NEXT:    [[TO_PTR:%.*]] = inttoptr i64 [[X:%.*]] to ptr
+; CHECK-LABEL: define internal i1 @f.inttoptr.ptrtoint(
+; CHECK-SAME: i64 [[X:%.*]]) {
+; CHECK-NEXT:    [[TO_PTR:%.*]] = inttoptr i64 [[X]] to ptr
 ; CHECK-NEXT:    [[TO_I64:%.*]] = ptrtoint ptr [[TO_PTR]] to i64
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp sgt i64 [[TO_I64]], 300
 ; CHECK-NEXT:    [[C_2:%.*]] = icmp sgt i64 [[TO_I64]], 299
@@ -270,7 +276,7 @@ define internal i1 @f.inttoptr.ptrtoint(i64 %x) {
 }
 
 define i1 @caller.inttoptr.ptrtoint() {
-; CHECK-LABEL: @caller.inttoptr.ptrtoint(
+; CHECK-LABEL: define i1 @caller.inttoptr.ptrtoint() {
 ; CHECK-NEXT:    [[CALL_1:%.*]] = tail call i1 @f.inttoptr.ptrtoint(i64 100)
 ; CHECK-NEXT:    [[CALL_2:%.*]] = tail call i1 @f.inttoptr.ptrtoint(i64 300)
 ; CHECK-NEXT:    [[RES:%.*]] = and i1 [[CALL_1]], [[CALL_2]]
@@ -284,8 +290,9 @@ define i1 @caller.inttoptr.ptrtoint() {
 
 ; Make sure we do not create constant ranges for int to fp casts.
 define i1 @int_range_to_double_cast(i32 %a) {
-; CHECK-LABEL: @int_range_to_double_cast(
-; CHECK-NEXT:    [[R:%.*]] = and i32 [[A:%.*]], 255
+; CHECK-LABEL: define i1 @int_range_to_double_cast(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT:    [[R:%.*]] = and i32 [[A]], 255
 ; CHECK-NEXT:    [[T4:%.*]] = sitofp i32 [[R]] to double
 ; CHECK-NEXT:    [[T10:%.*]] = fadd double 0.000000e+00, [[T4]]
 ; CHECK-NEXT:    [[T11:%.*]] = fcmp olt double [[T4]], [[T10]]
@@ -300,7 +307,7 @@ define i1 @int_range_to_double_cast(i32 %a) {
 
 ; Make sure we do not use ranges to propagate info from vectors.
 define i16 @vector_binop_and_cast() {
-; CHECK-LABEL: @vector_binop_and_cast(
+; CHECK-LABEL: define i16 @vector_binop_and_cast() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x i16> <i16 undef, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, i16 undef, i32 0
 ; CHECK-NEXT:    [[REM:%.*]] = srem <8 x i16> <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>, [[VECINIT7]]
@@ -317,8 +324,9 @@ entry:
 }
 
 define internal i64 @f.sext_to_zext(i32 %t) {
-; CHECK-LABEL: @f.sext_to_zext(
-; CHECK-NEXT:    [[A:%.*]] = zext nneg i32 [[T:%.*]] to i64
+; CHECK-LABEL: define internal range(i64 0, 2) i64 @f.sext_to_zext(
+; CHECK-SAME: i32 [[T:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = zext nneg i32 [[T]] to i64
 ; CHECK-NEXT:    ret i64 [[A]]
 ;
   %a = sext i32 %t to i64
@@ -326,10 +334,11 @@ define internal i64 @f.sext_to_zext(i32 %t) {
 }
 
 define i64 @caller.sext_to_zext(i32 %i) {
-; CHECK-LABEL: @caller.sext_to_zext(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[I:%.*]], 9
+; CHECK-LABEL: define range(i64 0, 2) i64 @caller.sext_to_zext(
+; CHECK-SAME: i32 [[I:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[I]], 9
 ; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    [[T:%.*]] = call i64 @f.sext_to_zext(i32 [[CONV]]), !range [[RNG0:![0-9]+]]
+; CHECK-NEXT:    [[T:%.*]] = call i64 @f.sext_to_zext(i32 [[CONV]])
 ; CHECK-NEXT:    ret i64 [[T]]
 ;
   %cmp = icmp sle i32 %i, 9
diff --git a/llvm/test/Transforms/SCCP/ipsccp-basic.ll b/llvm/test/Transforms/SCCP/ipsccp-basic.ll
index 71c042b9b294..6a7ab8ac2864 100644
--- a/llvm/test/Transforms/SCCP/ipsccp-basic.ll
+++ b/llvm/test/Transforms/SCCP/ipsccp-basic.ll
@@ -71,7 +71,7 @@ define void @test3a() {
 }
 
 define i32 @test3b() {
-; CHECK-LABEL: define i32 @test3b() {
+; CHECK-LABEL: define range(i32 0, 18) i32 @test3b() {
 ; CHECK-NEXT:    [[V:%.*]] = load i32, ptr @G, align 4
 ; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[V]], 17
 ; CHECK-NEXT:    br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
@@ -105,7 +105,7 @@ define internal {i64,i64} @test4a() {
 }
 
 define i64 @test4b() personality ptr @__gxx_personality_v0 {
-; CHECK-LABEL: define i64 @test4b() personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: define range(i64 0, 6) i64 @test4b() personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:    [[A:%.*]] = invoke { i64, i64 } @test4a()
 ; CHECK-NEXT:    to label [[A:%.*]] unwind label [[B:%.*]]
 ; CHECK:       A:
@@ -149,7 +149,7 @@ define internal {i64,i64} @test5a() {
 }
 
 define i64 @test5b() personality ptr @__gxx_personality_v0 {
-; CHECK-LABEL: define i64 @test5b() personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: define range(i64 0, 6) i64 @test5b() personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:    [[A:%.*]] = invoke { i64, i64 } @test5a()
 ; CHECK-NEXT:    to label [[A:%.*]] unwind label [[B:%.*]]
 ; CHECK:       A:
diff --git a/llvm/test/Transforms/SCCP/switch.ll b/llvm/test/Transforms/SCCP/switch.ll
index 306f0eebf2b4..5208213de210 100644
--- a/llvm/test/Transforms/SCCP/switch.ll
+++ b/llvm/test/Transforms/SCCP/switch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -passes=ipsccp < %s | FileCheck %s
 
 ; Make sure we always consider the default edge executable for a switch
@@ -7,7 +7,7 @@ declare void @foo()
 declare i32 @g(i32)
 
 define void @test1() {
-; CHECK-LABEL: @test1(
+; CHECK-LABEL: define void @test1() {
 ; CHECK-NEXT:    switch i32 undef, label [[D:%.*]] [
 ; CHECK-NEXT:    ]
 ; CHECK:       d:
@@ -21,15 +21,16 @@ d:
 }
 
 define i32 @test_duplicate_successors_phi(i1 %c, i32 %x) {
-; CHECK-LABEL: @test_duplicate_successors_phi(
+; CHECK-LABEL: define i32 @test_duplicate_successors_phi(
+; CHECK-SAME: i1 [[C:%.*]], i32 [[X:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[C:%.*]], label [[SWITCH:%.*]], label [[END:%.*]]
+; CHECK-NEXT:    br i1 [[C]], label [[SWITCH:%.*]], label [[END:%.*]]
 ; CHECK:       switch:
 ; CHECK-NEXT:    br label [[SWITCH_DEFAULT:%.*]]
 ; CHECK:       switch.default:
 ; CHECK-NEXT:    ret i32 -1
 ; CHECK:       end:
-; CHECK-NEXT:    ret i32 [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[X]]
 ;
 entry:
   br i1 %c, label %switch, label %end
@@ -49,13 +50,14 @@ end:
 }
 
 define i32 @test_duplicate_successors_phi_2(i1 %c, i32 %x) {
-; CHECK-LABEL: @test_duplicate_successors_phi_2(
+; CHECK-LABEL: define i32 @test_duplicate_successors_phi_2(
+; CHECK-SAME: i1 [[C:%.*]], i32 [[X:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[C:%.*]], label [[SWITCH:%.*]], label [[END:%.*]]
+; CHECK-NEXT:    br i1 [[C]], label [[SWITCH:%.*]], label [[END:%.*]]
 ; CHECK:       switch:
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[X:%.*]], [[ENTRY:%.*]] ], [ 1, [[SWITCH]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ 1, [[SWITCH]] ]
 ; CHECK-NEXT:    ret i32 [[PHI]]
 ;
 entry:
@@ -76,22 +78,23 @@ end:
 }
 
 define i32 @test_duplicate_successors_phi_3(i1 %c1, ptr %p, i32 %y) {
-; CHECK-LABEL: @test_duplicate_successors_phi_3(
+; CHECK-LABEL: define i32 @test_duplicate_successors_phi_3(
+; CHECK-SAME: i1 [[C1:%.*]], ptr [[P:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[C1:%.*]], label [[SWITCH:%.*]], label [[SWITCH_1:%.*]]
+; CHECK-NEXT:    br i1 [[C1]], label [[SWITCH:%.*]], label [[SWITCH_1:%.*]]
 ; CHECK:       switch:
-; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG0:![0-9]+]]
+; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG0:![0-9]+]]
 ; CHECK-NEXT:    switch i32 [[X]], label [[SWITCH_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 0, label [[SWITCH_DEFAULT]]
-; CHECK-NEXT:    i32 1, label [[SWITCH_0:%.*]]
-; CHECK-NEXT:    i32 2, label [[SWITCH_0]]
+; CHECK-NEXT:      i32 0, label [[SWITCH_DEFAULT]]
+; CHECK-NEXT:      i32 1, label [[SWITCH_0:%.*]]
+; CHECK-NEXT:      i32 2, label [[SWITCH_0]]
 ; CHECK-NEXT:    ]
 ; CHECK:       switch.default:
 ; CHECK-NEXT:    ret i32 -1
 ; CHECK:       switch.0:
 ; CHECK-NEXT:    ret i32 0
 ; CHECK:       switch.1:
-; CHECK-NEXT:    ret i32 [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[Y]]
 ;
 entry:
   br i1 %c1, label %switch, label %switch.1
@@ -118,12 +121,13 @@ switch.1:
 }
 
 define i32 @test_local_range(ptr %p) {
-; CHECK-LABEL: @test_local_range(
-; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG0]]
+; CHECK-LABEL: define range(i32 0, 3) i32 @test_local_range(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG0]]
 ; CHECK-NEXT:    switch i32 [[X]], label [[DEFAULT_UNREACHABLE:%.*]] [
-; CHECK-NEXT:    i32 0, label [[SWITCH_0:%.*]]
-; CHECK-NEXT:    i32 1, label [[SWITCH_1:%.*]]
-; CHECK-NEXT:    i32 2, label [[SWITCH_2:%.*]]
+; CHECK-NEXT:      i32 0, label [[SWITCH_0:%.*]]
+; CHECK-NEXT:      i32 1, label [[SWITCH_1:%.*]]
+; CHECK-NEXT:      i32 2, label [[SWITCH_2:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       default.unreachable:
 ; CHECK-NEXT:    unreachable
@@ -160,13 +164,14 @@ switch.3:
 
 ; TODO: Determine that case i3 is dead, even though the edge is shared?
 define i32 @test_duplicate_successors(ptr %p) {
-; CHECK-LABEL: @test_duplicate_successors(
-; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG0]]
+; CHECK-LABEL: define range(i32 0, 2) i32 @test_duplicate_successors(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG0]]
 ; CHECK-NEXT:    switch i32 [[X]], label [[DEFAULT_UNREACHABLE:%.*]] [
-; CHECK-NEXT:    i32 0, label [[SWITCH_0:%.*]]
-; CHECK-NEXT:    i32 1, label [[SWITCH_0]]
-; CHECK-NEXT:    i32 2, label [[SWITCH_1:%.*]]
-; CHECK-NEXT:    i32 3, label [[SWITCH_1]]
+; CHECK-NEXT:      i32 0, label [[SWITCH_0:%.*]]
+; CHECK-NEXT:      i32 1, label [[SWITCH_0]]
+; CHECK-NEXT:      i32 2, label [[SWITCH_1:%.*]]
+; CHECK-NEXT:      i32 3, label [[SWITCH_1]]
 ; CHECK-NEXT:    ]
 ; CHECK:       default.unreachable:
 ; CHECK-NEXT:    unreachable
@@ -201,11 +206,12 @@ switch.2:
 ; Case i32 2 is dead as well, but this cannot be determined based on
 ; range information.
 define internal i32 @test_ip_range(i32 %x) {
-; CHECK-LABEL: @test_ip_range(
-; CHECK-NEXT:    switch i32 [[X:%.*]], label [[DEFAULT_UNREACHABLE:%.*]] [
-; CHECK-NEXT:    i32 3, label [[SWITCH_3:%.*]]
-; CHECK-NEXT:    i32 1, label [[SWITCH_1:%.*]]
-; CHECK-NEXT:    i32 2, label [[SWITCH_2:%.*]]
+; CHECK-LABEL: define internal range(i32 1, 4) i32 @test_ip_range(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    switch i32 [[X]], label [[DEFAULT_UNREACHABLE:%.*]] [
+; CHECK-NEXT:      i32 3, label [[SWITCH_3:%.*]]
+; CHECK-NEXT:      i32 1, label [[SWITCH_1:%.*]]
+; CHECK-NEXT:      i32 2, label [[SWITCH_2:%.*]]
 ; CHECK-NEXT:    ], !prof [[PROF1:![0-9]+]]
 ; CHECK:       default.unreachable:
 ; CHECK-NEXT:    unreachable
@@ -240,9 +246,9 @@ switch.3:
 }
 
 define void @call_test_ip_range() {
-; CHECK-LABEL: @call_test_ip_range(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @test_ip_range(i32 1), !range [[RNG2:![0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @test_ip_range(i32 3), !range [[RNG2]]
+; CHECK-LABEL: define void @call_test_ip_range() {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @test_ip_range(i32 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @test_ip_range(i32 3)
 ; CHECK-NEXT:    ret void
 ;
   call i32 @test_ip_range(i32 1)
@@ -251,11 +257,12 @@ define void @call_test_ip_range() {
 }
 
 define i32 @test_switch_range_may_include_undef(i1 %c.1, i1 %c.2, i32 %x) {
-; CHECK-LABEL: @test_switch_range_may_include_undef(
+; CHECK-LABEL: define range(i32 -1, 21) i32 @test_switch_range_may_include_undef(
+; CHECK-SAME: i1 [[C_1:%.*]], i1 [[C_2:%.*]], i32 [[X:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[THEN_1:%.*]], label [[ELSE_1:%.*]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[THEN_1:%.*]], label [[ELSE_1:%.*]]
 ; CHECK:       then.1:
-; CHECK-NEXT:    br i1 [[C_2:%.*]], label [[SWITCH:%.*]], label [[ELSE_2:%.*]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[SWITCH:%.*]], label [[ELSE_2:%.*]]
 ; CHECK:       else.1:
 ; CHECK-NEXT:    br label [[SWITCH]]
 ; CHECK:       else.2:
@@ -263,8 +270,8 @@ define i32 @test_switch_range_may_include_undef(i1 %c.1, i1 %c.2, i32 %x) {
 ; CHECK:       switch:
 ; CHECK-NEXT:    [[P:%.*]] = phi i32 [ 0, [[THEN_1]] ], [ 2, [[ELSE_1]] ], [ undef, [[ELSE_2]] ]
 ; CHECK-NEXT:    switch i32 [[P]], label [[SWITCH_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 0, label [[END_1:%.*]]
-; CHECK-NEXT:    i32 3, label [[END_2:%.*]]
+; CHECK-NEXT:      i32 0, label [[END_1:%.*]]
+; CHECK-NEXT:      i32 3, label [[END_2:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       switch.default:
 ; CHECK-NEXT:    ret i32 -1
@@ -303,9 +310,10 @@ end.2:
 }
 
 define i32 @test_default_unreachable_by_dom_cond(i32 %x) {
-; CHECK-LABEL: @test_default_unreachable_by_dom_cond(
+; CHECK-LABEL: define i32 @test_default_unreachable_by_dom_cond(
+; CHECK-SAME: i32 [[X:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR_COND:%.*]] = icmp ult i32 [[X:%.*]], 4
+; CHECK-NEXT:    [[OR_COND:%.*]] = icmp ult i32 [[X]], 4
 ; CHECK-NEXT:    br i1 [[OR_COND]], label [[IF_THEN:%.*]], label [[RETURN:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    switch i32 [[X]], label [[DEFAULT_UNREACHABLE:%.*]] [
@@ -371,4 +379,7 @@ return:
 
 declare void @llvm.assume(i1)
 
-; CHECK: !1 = !{!"branch_weights", i32 1, i32 5, i32 3, i32 4}
+;.
+; CHECK: [[RNG0]] = !{i32 0, i32 3}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 5, i32 3, i32 4}
+;.
diff --git a/llvm/test/Transforms/SCCP/trunc-nuw-nsw-flags.ll b/llvm/test/Transforms/SCCP/trunc-nuw-nsw-flags.ll
index fc3e56011d46..d3bac0d68a97 100644
--- a/llvm/test/Transforms/SCCP/trunc-nuw-nsw-flags.ll
+++ b/llvm/test/Transforms/SCCP/trunc-nuw-nsw-flags.ll
@@ -16,7 +16,7 @@ entry:
 }
 
 define i8 @range_from_or_nsw(i16 %a) {
-; CHECK-LABEL: define i8 @range_from_or_nsw(
+; CHECK-LABEL: define range(i8 -128, 0) i8 @range_from_or_nsw(
 ; CHECK-SAME: i16 [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[AND1:%.*]] = or i16 [[A]], -128
@@ -30,7 +30,7 @@ entry:
 }
 
 define i16 @range_from_and_nuw_nsw(i32 %a) {
-; CHECK-LABEL: define i16 @range_from_and_nuw_nsw(
+; CHECK-LABEL: define range(i16 0, -32768) i16 @range_from_and_nuw_nsw(
 ; CHECK-SAME: i32 [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[AND1:%.*]] = and i32 [[A]], 32767
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
index 690772472975..3771ec4bda88 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
@@ -5,12 +5,7 @@ define void @h() {
 ; CHECK-LABEL: define void @h() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i32> [[TMP0]] to <8 x i1>
-; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i1> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i1> [[TMP4]] to <8 x i16>
-; CHECK-NEXT:    store <8 x i16> [[TMP3]], ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
index d51ef0bce3a4..76bb882171b1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
@@ -5,7 +5,8 @@ define void @h() {
 ; CHECK-LABEL: define void @h() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc <8 x i32> zeroinitializer to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i32 0 to i1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 poison, i1 false, i1 false, i1 false>, i1 [[TMP6]], i32 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub <8 x i1> [[TMP0]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i1> [[TMP0]], zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
index 6404cf4a2cd1..2ab6e919c23b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
@@ -5,7 +5,12 @@ define void @h() {
 ; CHECK-LABEL: define void @h() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 0 to i1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 poison, i1 false, i1 false, i1 false>, i1 [[TMP0]], i32 4
+; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i1> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i1> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[ARRAYIDX2]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 47d918eabdfe..9bbd314a27cb 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -537,24 +537,18 @@ entry:
 }
 
 define void @vec3_extract(<3 x i16> %pixel.sroa.0.4.vec.insert606, ptr %call3.i536) {
-; NON-POW2-LABEL: define void @vec3_extract(
-; NON-POW2-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    store <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], ptr [[CALL3_I536]], align 2
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: define void @vec3_extract(
-; POW2-ONLY-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2
-; POW2-ONLY-NEXT:    [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2
-; POW2-ONLY-NEXT:    store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2
-; POW2-ONLY-NEXT:    [[PIXEL_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 1
-; POW2-ONLY-NEXT:    [[GREEN670:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 1
-; POW2-ONLY-NEXT:    store i16 [[PIXEL_SROA_0_2_VEC_EXTRACT]], ptr [[GREEN670]], align 2
-; POW2-ONLY-NEXT:    [[PIXEL_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 0
-; POW2-ONLY-NEXT:    store i16 [[PIXEL_SROA_0_0_VEC_EXTRACT]], ptr [[CALL3_I536]], align 2
-; POW2-ONLY-NEXT:    ret void
+; CHECK-LABEL: define void @vec3_extract(
+; CHECK-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2
+; CHECK-NEXT:    [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2
+; CHECK-NEXT:    store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2
+; CHECK-NEXT:    [[PIXEL_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 1
+; CHECK-NEXT:    [[GREEN670:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 1
+; CHECK-NEXT:    store i16 [[PIXEL_SROA_0_2_VEC_EXTRACT]], ptr [[GREEN670]], align 2
+; CHECK-NEXT:    [[PIXEL_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 0
+; CHECK-NEXT:    store i16 [[PIXEL_SROA_0_0_VEC_EXTRACT]], ptr [[CALL3_I536]], align 2
+; CHECK-NEXT:    ret void
 ;
 entry:
   %pixel.sroa.0.4.vec.extract = extractelement <3 x i16> %pixel.sroa.0.4.vec.insert606, i64 2
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll
new file mode 100644
index 000000000000..94a55c435c8c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux -mattr=+v < %s | FileCheck %s
+
+define void @test(ptr noalias %p, ptr %p1) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr [[P1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr [[P]], align 2
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i16>, ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i16> [[TMP5]], ptr [[P1]], align 2
+; CHECK-NEXT:    ret void
+;
+  %l1 = load i16, ptr %p, align 2
+  %gep1 = getelementptr inbounds i8, ptr %p, i64 2
+  %l2 = load i16, ptr %gep1, align 2
+  %gep2 = getelementptr inbounds i8, ptr %p, i64 16
+  %l3 = load i16, ptr %gep2, align 2
+  %gep3 = getelementptr inbounds i8, ptr %p, i64 18
+  %l4 = load i16, ptr %gep3, align 2
+  store i16 %l1, ptr %p1, align 2
+  %geps1 = getelementptr inbounds i8, ptr %p1, i64 2
+  store i16 %l2, ptr %geps1, align 2
+  %geps2 = getelementptr inbounds i8, ptr %p1, i64 4
+  store i16 %l3, ptr %geps2, align 2
+  %geps3 = getelementptr inbounds i8, ptr %p1, i64 6
+  store i16 %l4, ptr %geps3, align 2
+  ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index d87bdfe26899..aa9a070a7945 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -37,10 +37,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP101:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = sub <2 x i32> [[TMP101]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = sub <2 x i32> [[TMP16]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP22:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
@@ -64,15 +64,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP33]], [[TMP35]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP38:%.*]] = add <2 x i32> [[TMP37]], [[TMP31]]
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1
-; CHECK-NEXT:    [[ADD44_2:%.*]] = add i32 [[TMP40]], [[TMP39]]
-; CHECK-NEXT:    [[SUB45_2:%.*]] = sub i32 [[TMP39]], [[TMP40]]
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1
-; CHECK-NEXT:    [[CONV:%.*]] = add i32 [[TMP42]], [[TMP41]]
-; CHECK-NEXT:    [[SUB47_2:%.*]] = sub i32 [[TMP41]], [[TMP42]]
-; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[CONV]], [[ADD44_2]]
+; CHECK-NEXT:    [[ADD44_2:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0
+; CHECK-NEXT:    [[CONV:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1
+; CHECK-NEXT:    [[ADD44_3:%.*]] = add i32 [[CONV]], [[ADD44_2]]
+; CHECK-NEXT:    [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[CONV]]
+; CHECK-NEXT:    [[SUB45_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0
+; CHECK-NEXT:    [[SUB47_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1
+; CHECK-NEXT:    [[ADD46_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
+; CHECK-NEXT:    [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]]
+; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_3]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
 ; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
@@ -104,10 +104,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
 ; CHECK-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP63]]
-; CHECK-NEXT:    [[TMP16:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
-; CHECK-NEXT:    [[TMP73:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]]
-; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
+; CHECK-NEXT:    [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
+; CHECK-NEXT:    [[TMP190:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]]
+; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
 ; CHECK-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP74]], [[TMP75]]
 ; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
 ; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
@@ -115,19 +115,19 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[TMP79]], 15
 ; CHECK-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
 ; CHECK-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15
+; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[ADD46_2]], 15
 ; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
 ; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; CHECK-NEXT:    [[TMP107:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
-; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP107]], 15
-; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
-; CHECK-NEXT:    [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; CHECK-NEXT:    [[TMP107:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
+; CHECK-NEXT:    [[SHR_I49_5:%.*]] = lshr i32 [[TMP107]], 15
+; CHECK-NEXT:    [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537
+; CHECK-NEXT:    [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535
 ; CHECK-NEXT:    [[SHR_I49_4:%.*]] = lshr i32 [[CONV_1]], 15
 ; CHECK-NEXT:    [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
 ; CHECK-NEXT:    [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
-; CHECK-NEXT:    [[SHR_I49_5:%.*]] = lshr i32 [[CONV1]], 15
-; CHECK-NEXT:    [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537
-; CHECK-NEXT:    [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535
+; CHECK-NEXT:    [[SHR_I49_6:%.*]] = lshr i32 [[CONV1]], 15
+; CHECK-NEXT:    [[AND_I50_6:%.*]] = and i32 [[SHR_I49_6]], 65537
+; CHECK-NEXT:    [[MUL_I51_6:%.*]] = mul i32 [[AND_I50_6]], 65535
 ; CHECK-NEXT:    [[TMP78:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
 ; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[ARRAYIDX22]], i32 1
@@ -151,21 +151,21 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP98:%.*]] = sub <2 x i32> [[TMP97]], [[TMP90]]
 ; CHECK-NEXT:    [[TMP104:%.*]] = add <2 x i32> [[TMP96]], [[TMP98]]
 ; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT:    [[TMP103:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
-; CHECK-NEXT:    [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP103]]
+; CHECK-NEXT:    [[TMP101:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
+; CHECK-NEXT:    [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP101]]
 ; CHECK-NEXT:    [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP165:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]]
+; CHECK-NEXT:    [[TMP106:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]]
 ; CHECK-NEXT:    [[TMP105:%.*]] = sub <2 x i32> [[TMP200]], [[TMP104]]
-; CHECK-NEXT:    [[TMP238:%.*]] = extractelement <2 x i32> [[TMP165]], i32 0
-; CHECK-NEXT:    [[TMP143:%.*]] = extractelement <2 x i32> [[TMP165]], i32 1
-; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP143]], [[TMP238]]
-; CHECK-NEXT:    [[TMP108:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1
-; CHECK-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP143]], 15
-; CHECK-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
-; CHECK-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
+; CHECK-NEXT:    [[TMP238:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
+; CHECK-NEXT:    [[TMP108:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP108]], [[TMP238]]
+; CHECK-NEXT:    [[TMP142:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1
 ; CHECK-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[TMP108]], 15
 ; CHECK-NEXT:    [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
 ; CHECK-NEXT:    [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
+; CHECK-NEXT:    [[SHR_I59_4:%.*]] = lshr i32 [[TMP142]], 15
+; CHECK-NEXT:    [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537
+; CHECK-NEXT:    [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535
 ; CHECK-NEXT:    [[TMP109:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
 ; CHECK-NEXT:    [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP111:%.*]] = insertelement <2 x i8> poison, i8 [[TMP12]], i32 0
@@ -185,7 +185,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP126:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> <i64 1, i64 3>
 ; CHECK-NEXT:    [[TMP127:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP126]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP153:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT:    [[TMP144:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP129:%.*]] = getelementptr i8, <2 x ptr> [[TMP115]], <2 x i64> <i64 5, i64 7>
 ; CHECK-NEXT:    [[TMP130:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP129]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32>
@@ -195,15 +195,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP131]], [[TMP134]]
 ; CHECK-NEXT:    [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT:    [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP153]]
+; CHECK-NEXT:    [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP144]]
 ; CHECK-NEXT:    [[TMP139:%.*]] = add <2 x i32> [[TMP136]], [[TMP138]]
 ; CHECK-NEXT:    [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0
 ; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP113]]
-; CHECK-NEXT:    [[TMP142:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
-; CHECK-NEXT:    [[TMP257:%.*]] = add <2 x i32> [[TMP139]], [[TMP142]]
-; CHECK-NEXT:    [[TMP144:%.*]] = sub <2 x i32> [[TMP142]], [[TMP139]]
-; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP257]], i32 0
-; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP257]], i32 1
+; CHECK-NEXT:    [[TMP155:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
+; CHECK-NEXT:    [[TMP143:%.*]] = add <2 x i32> [[TMP139]], [[TMP155]]
+; CHECK-NEXT:    [[TMP189:%.*]] = sub <2 x i32> [[TMP155]], [[TMP139]]
+; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
+; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
 ; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP145]]
 ; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP146]], 15
 ; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
@@ -220,37 +220,37 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I51_2]], [[ADD103]]
 ; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP79]]
 ; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51_3]], [[ADD105]]
-; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV]]
+; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]]
 ; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
 ; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP146]]
-; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
-; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP143]]
+; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]]
+; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP108]]
 ; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
 ; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
 ; CHECK-NEXT:    [[TMP150:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP151:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB47_2]], i32 1
-; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB45_2]], i32 1
-; CHECK-NEXT:    [[TMP163:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]]
-; CHECK-NEXT:    [[TMP154:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP155:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP154]], [[TMP155]]
-; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
+; CHECK-NEXT:    [[TMP151:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB59_2]], i32 1
+; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB51_2]], i32 1
+; CHECK-NEXT:    [[TMP153:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]]
+; CHECK-NEXT:    [[TMP154:%.*]] = shufflevector <2 x i32> [[TMP189]], <2 x i32> [[TMP190]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP189]], <2 x i32> [[TMP190]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP154]], [[TMP184]]
+; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP153]], i32 1
 ; CHECK-NEXT:    [[TMP158:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1
-; CHECK-NEXT:    [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP163]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP158]], [[TMP157]]
-; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
+; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x i32> [[TMP153]], i32 0
 ; CHECK-NEXT:    [[TMP161:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0
-; CHECK-NEXT:    [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP163]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[TMP161]], [[TMP160]]
-; CHECK-NEXT:    [[TMP164:%.*]] = sub <2 x i32> [[TMP163]], [[TMP156]]
-; CHECK-NEXT:    [[TMP173:%.*]] = extractelement <2 x i32> [[TMP164]], i32 0
-; CHECK-NEXT:    [[TMP174:%.*]] = extractelement <2 x i32> [[TMP164]], i32 1
-; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[TMP174]], [[TMP173]]
-; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[TMP173]], [[TMP174]]
-; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
+; CHECK-NEXT:    [[TMP163:%.*]] = sub <2 x i32> [[TMP153]], [[TMP156]]
+; CHECK-NEXT:    [[TMP164:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
+; CHECK-NEXT:    [[TMP165:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
+; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[TMP165]], [[TMP164]]
+; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[TMP164]], [[TMP165]]
+; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_5]], [[ADD105_1]]
 ; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP107]]
-; CHECK-NEXT:    [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP189]], <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP167:%.*]] = lshr <2 x i32> [[TMP166]], <i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP168:%.*]] = and <2 x i32> [[TMP167]], <i32 65537, i32 65537>
 ; CHECK-NEXT:    [[TMP169:%.*]] = mul <2 x i32> [[TMP168]], <i32 65535, i32 65535>
@@ -263,44 +263,44 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP283:%.*]] = shufflevector <2 x i32> [[TMP282]], <2 x i32> [[TMP211]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP177:%.*]] = add <2 x i32> [[TMP169]], [[TMP283]]
 ; CHECK-NEXT:    [[TMP178:%.*]] = xor <2 x i32> [[TMP177]], [[TMP166]]
-; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
-; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP108]]
+; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]]
+; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP142]]
 ; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
 ; CHECK-NEXT:    [[TMP179:%.*]] = extractelement <2 x i32> [[TMP178]], i32 0
 ; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP179]]
 ; CHECK-NEXT:    [[TMP180:%.*]] = extractelement <2 x i32> [[TMP178]], i32 1
 ; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP180]]
 ; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT:    [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP165]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP182:%.*]] = insertelement <2 x i32> [[TMP181]], i32 [[ADD44_2]], i32 0
-; CHECK-NEXT:    [[TMP183:%.*]] = insertelement <2 x i32> [[TMP165]], i32 [[CONV]], i32 0
-; CHECK-NEXT:    [[TMP184:%.*]] = sub <2 x i32> [[TMP182]], [[TMP183]]
-; CHECK-NEXT:    [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP257]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP257]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP182:%.*]] = insertelement <2 x i32> [[TMP181]], i32 [[ADD44_3]], i32 0
+; CHECK-NEXT:    [[TMP183:%.*]] = insertelement <2 x i32> [[TMP106]], i32 [[ADD46_2]], i32 0
+; CHECK-NEXT:    [[TMP195:%.*]] = sub <2 x i32> [[TMP182]], [[TMP183]]
+; CHECK-NEXT:    [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP187:%.*]] = sub <2 x i32> [[TMP185]], [[TMP186]]
-; CHECK-NEXT:    [[TMP188:%.*]] = extractelement <2 x i32> [[TMP184]], i32 0
-; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x i32> [[TMP187]], i32 0
-; CHECK-NEXT:    [[TMP190:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP189]], [[TMP188]]
-; CHECK-NEXT:    [[TMP191:%.*]] = extractelement <2 x i32> [[TMP184]], i32 1
+; CHECK-NEXT:    [[TMP188:%.*]] = extractelement <2 x i32> [[TMP195]], i32 0
+; CHECK-NEXT:    [[TMP196:%.*]] = extractelement <2 x i32> [[TMP187]], i32 0
+; CHECK-NEXT:    [[TMP199:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP195]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP196]], [[TMP188]]
+; CHECK-NEXT:    [[TMP191:%.*]] = extractelement <2 x i32> [[TMP195]], i32 1
 ; CHECK-NEXT:    [[TMP192:%.*]] = extractelement <2 x i32> [[TMP187]], i32 1
-; CHECK-NEXT:    [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP195]], <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[ADD94_2:%.*]] = add i32 [[TMP192]], [[TMP191]]
-; CHECK-NEXT:    [[TMP194:%.*]] = sub <2 x i32> [[TMP184]], [[TMP187]]
+; CHECK-NEXT:    [[TMP194:%.*]] = sub <2 x i32> [[TMP195]], [[TMP187]]
 ; CHECK-NEXT:    [[TMP244:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
 ; CHECK-NEXT:    [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP244]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
 ; CHECK-NEXT:    [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP246:%.*]] = add <2 x i32> [[TMP245]], [[TMP198]]
-; CHECK-NEXT:    [[TMP247:%.*]] = sub <2 x i32> [[TMP245]], [[TMP198]]
-; CHECK-NEXT:    [[TMP248:%.*]] = shufflevector <2 x i32> [[TMP246]], <2 x i32> [[TMP247]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP216:%.*]] = add <2 x i32> [[TMP245]], [[TMP198]]
+; CHECK-NEXT:    [[TMP210:%.*]] = sub <2 x i32> [[TMP245]], [[TMP198]]
+; CHECK-NEXT:    [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP216]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP215:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0
 ; CHECK-NEXT:    [[TMP203:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1
 ; CHECK-NEXT:    [[ADD105_2:%.*]] = add i32 [[TMP215]], [[TMP203]]
 ; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP203]], [[TMP215]]
 ; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_2]]
 ; CHECK-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
-; CHECK-NEXT:    [[TMP266:%.*]] = add <2 x i32> [[TMP149]], [[TMP248]]
+; CHECK-NEXT:    [[TMP266:%.*]] = add <2 x i32> [[TMP149]], [[TMP221]]
 ; CHECK-NEXT:    [[TMP267:%.*]] = xor <2 x i32> [[TMP266]], [[TMP110]]
 ; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP238]], 15
 ; CHECK-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
@@ -313,48 +313,48 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP207:%.*]] = extractelement <2 x i32> [[TMP267]], i32 1
 ; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP207]]
 ; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT:    [[TMP221:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB45_2]], i32 0
-; CHECK-NEXT:    [[TMP222:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB47_2]], i32 0
-; CHECK-NEXT:    [[TMP210:%.*]] = sub <2 x i32> [[TMP221]], [[TMP222]]
-; CHECK-NEXT:    [[TMP225:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP226:%.*]] = sub <2 x i32> [[TMP225]], [[TMP212]]
-; CHECK-NEXT:    [[TMP214:%.*]] = extractelement <2 x i32> [[TMP210]], i32 0
-; CHECK-NEXT:    [[TMP227:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
-; CHECK-NEXT:    [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP226]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP227]], [[TMP214]]
-; CHECK-NEXT:    [[TMP217:%.*]] = extractelement <2 x i32> [[TMP210]], i32 1
-; CHECK-NEXT:    [[TMP218:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
-; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP226]], <2 x i32> [[TMP210]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[SUB59:%.*]] = add i32 [[TMP218]], [[TMP217]]
-; CHECK-NEXT:    [[TMP220:%.*]] = sub <2 x i32> [[TMP210]], [[TMP226]]
-; CHECK-NEXT:    [[TMP274:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
-; CHECK-NEXT:    [[TMP275:%.*]] = shufflevector <2 x i32> [[TMP274]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP222:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB51_2]], i32 0
+; CHECK-NEXT:    [[TMP225:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB59_2]], i32 0
+; CHECK-NEXT:    [[TMP226:%.*]] = sub <2 x i32> [[TMP222]], [[TMP225]]
+; CHECK-NEXT:    [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP190]], <2 x i32> [[TMP189]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP190]], <2 x i32> [[TMP189]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP213:%.*]] = sub <2 x i32> [[TMP227]], [[TMP212]]
+; CHECK-NEXT:    [[TMP214:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
+; CHECK-NEXT:    [[TMP237:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
+; CHECK-NEXT:    [[TMP239:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP226]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[ADD94_5:%.*]] = add i32 [[TMP237]], [[TMP214]]
+; CHECK-NEXT:    [[TMP217:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
+; CHECK-NEXT:    [[TMP218:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
+; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP226]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP218]], [[TMP217]]
+; CHECK-NEXT:    [[TMP240:%.*]] = sub <2 x i32> [[TMP226]], [[TMP213]]
 ; CHECK-NEXT:    [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
 ; CHECK-NEXT:    [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP276:%.*]] = add <2 x i32> [[TMP275]], [[TMP224]]
-; CHECK-NEXT:    [[TMP277:%.*]] = sub <2 x i32> [[TMP275]], [[TMP224]]
-; CHECK-NEXT:    [[TMP278:%.*]] = shufflevector <2 x i32> [[TMP276]], <2 x i32> [[TMP277]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP228:%.*]] = extractelement <2 x i32> [[TMP220]], i32 0
-; CHECK-NEXT:    [[TMP229:%.*]] = extractelement <2 x i32> [[TMP220]], i32 1
+; CHECK-NEXT:    [[TMP241:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_5]], i32 0
+; CHECK-NEXT:    [[TMP242:%.*]] = shufflevector <2 x i32> [[TMP241]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP261:%.*]] = add <2 x i32> [[TMP224]], [[TMP242]]
+; CHECK-NEXT:    [[TMP262:%.*]] = sub <2 x i32> [[TMP224]], [[TMP242]]
+; CHECK-NEXT:    [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP261]], <2 x i32> [[TMP262]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP228:%.*]] = extractelement <2 x i32> [[TMP240]], i32 0
+; CHECK-NEXT:    [[TMP229:%.*]] = extractelement <2 x i32> [[TMP240]], i32 1
 ; CHECK-NEXT:    [[ADD105_3:%.*]] = add i32 [[TMP228]], [[TMP229]]
 ; CHECK-NEXT:    [[SUB106_3:%.*]] = sub i32 [[TMP229]], [[TMP228]]
-; CHECK-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_5]], [[ADD105_3]]
+; CHECK-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_6]], [[ADD105_3]]
 ; CHECK-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]]
 ; CHECK-NEXT:    [[TMP230:%.*]] = lshr <2 x i32> [[TMP102]], <i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP231:%.*]] = and <2 x i32> [[TMP230]], <i32 65537, i32 65537>
 ; CHECK-NEXT:    [[TMP232:%.*]] = mul <2 x i32> [[TMP231]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP286:%.*]] = add <2 x i32> [[TMP232]], [[TMP278]]
-; CHECK-NEXT:    [[TMP287:%.*]] = xor <2 x i32> [[TMP286]], [[TMP102]]
+; CHECK-NEXT:    [[TMP233:%.*]] = add <2 x i32> [[TMP232]], [[TMP220]]
+; CHECK-NEXT:    [[TMP234:%.*]] = xor <2 x i32> [[TMP233]], [[TMP102]]
 ; CHECK-NEXT:    [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
 ; CHECK-NEXT:    [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
 ; CHECK-NEXT:    [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
 ; CHECK-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
 ; CHECK-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
 ; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; CHECK-NEXT:    [[TMP235:%.*]] = extractelement <2 x i32> [[TMP287]], i32 0
+; CHECK-NEXT:    [[TMP235:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
 ; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP235]]
-; CHECK-NEXT:    [[TMP236:%.*]] = extractelement <2 x i32> [[TMP287]], i32 1
+; CHECK-NEXT:    [[TMP236:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
 ; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP236]]
 ; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
 ; CHECK-NEXT:    ret i32 [[ADD113_3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll
new file mode 100644
index 000000000000..d6dc3bcc3354
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s
+@c = global [12 x i64] zeroinitializer
+
+; FIXME: after minbitwidth analysis and i32 conv.., 65535 is transformed to
+; and <4 x i16> , -1, which must be dropped.
+; FIXME: need to adjust the cost of the final transformation, since the user is
+; just a trunc to i16 (it must be free).
+define i16 @test() {
+; CHECK-LABEL: define i16 @test(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 @c, i64 24, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
+; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[T:%.*]] = trunc i32 [[TMP5]] to i16
+; CHECK-NEXT:    ret i16 [[T]]
+;
+entry:
+  %0 = load i64, ptr @c, align 8
+  %conv = trunc i64 %0 to i32
+  %conv3 = and i32 %conv, 65535
+  %conv4 = xor i32 %conv3, 65535
+  %1 = load i64, ptr getelementptr inbounds ([12 x i64], ptr @c, i64 0, i64 3), align 8
+  %conv.1 = trunc i64 %1 to i32
+  %conv3.1 = and i32 %conv.1, 65535
+  %conv4.1 = xor i32 %conv3.1, 65535
+  %.conv4.1 = tail call i32 @llvm.umax.i32(i32 %conv4, i32 %conv4.1)
+  %2 = load i64, ptr getelementptr inbounds ([12 x i64], ptr @c, i64 0, i64 6), align 8
+  %conv.2 = trunc i64 %2 to i32
+  %conv3.2 = and i32 %conv.2, 65535
+  %conv4.2 = xor i32 %conv3.2, 65535
+  %.conv4.2 = tail call i32 @llvm.umax.i32(i32 %.conv4.1, i32 %conv4.2)
+  %3 = load i64, ptr getelementptr inbounds ([12 x i64], ptr @c, i64 0, i64 9), align 8
+  %conv.3 = trunc i64 %3 to i32
+  %conv3.3 = and i32 %conv.3, 65535
+  %conv4.3 = xor i32 %conv3.3, 65535
+  %.conv4.3 = tail call i32 @llvm.umax.i32(i32 %.conv4.2, i32 %conv4.3)
+  %t = trunc i32 %.conv4.3 to i16
+  ret i16 %t
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
new file mode 100644
index 000000000000..56e8829b0ec6
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=riscv64-unknown-linux -mattr=+v | FileCheck %s
+
+define void @store_reverse(ptr %p3) {
+; CHECK-LABEL: @store_reverse(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[P3:%.*]], i64 8
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P3]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i64> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> [[TMP2]], ptr align 8 [[ARRAYIDX14]], i64 -8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i64, ptr %p3, align 8
+  %arrayidx1 = getelementptr inbounds i64, ptr %p3, i64 8
+  %1 = load i64, ptr %arrayidx1, align 8
+  %shl = shl i64 %0, %1
+  %arrayidx2 = getelementptr inbounds i64, ptr %p3, i64 7
+  store i64 %shl, ptr %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds i64, ptr %p3, i64 1
+  %2 = load i64, ptr %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds i64, ptr %p3, i64 9
+  %3 = load i64, ptr %arrayidx4, align 8
+  %shl5 = shl i64 %2, %3
+  %arrayidx6 = getelementptr inbounds i64, ptr %p3, i64 6
+  store i64 %shl5, ptr %arrayidx6, align 8
+  %arrayidx7 = getelementptr inbounds i64, ptr %p3, i64 2
+  %4 = load i64, ptr %arrayidx7, align 8
+  %arrayidx8 = getelementptr inbounds i64, ptr %p3, i64 10
+  %5 = load i64, ptr %arrayidx8, align 8
+  %shl9 = shl i64 %4, %5
+  %arrayidx10 = getelementptr inbounds i64, ptr %p3, i64 5
+  store i64 %shl9, ptr %arrayidx10, align 8
+  %arrayidx11 = getelementptr inbounds i64, ptr %p3, i64 3
+  %6 = load i64, ptr %arrayidx11, align 8
+  %arrayidx12 = getelementptr inbounds i64, ptr %p3, i64 11
+  %7 = load i64, ptr %arrayidx12, align 8
+  %shl13 = shl i64 %6, %7
+  %arrayidx14 = getelementptr inbounds i64, ptr %p3, i64 4
+  store i64 %shl13, ptr %arrayidx14, align 8
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll
index 04d275742832..0c0c723e6699 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll
@@ -9,8 +9,7 @@ define i32 @test() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 @c, i64 24, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i16>
-; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i16> [[TMP2]], <i16 -1, i16 -1, i16 -1, i16 -1>
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[TMP3]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP5]], i32 1)
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/unsigned-icmp-signed-op.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/unsigned-icmp-signed-op.ll
new file mode 100644
index 000000000000..5ec6b4f1040d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/unsigned-icmp-signed-op.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s
+
+define i32 @test(ptr %f, i16 %0) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[F:%.*]], i16 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[F]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> <i16 0, i16 poison, i16 0, i16 0>, i16 [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> <i16 0, i16 poison, i16 0, i16 0>, i16 [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]])
+; CHECK-NEXT:    [[ZEXT_4:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    ret i32 [[ZEXT_4]]
+;
+entry:
+  %1 = load i16, ptr %f, align 2
+
+  %zext.0 = zext i16 %1 to i32
+  %sext.0 = sext i16 %0 to i32
+
+  %zext.1 = zext i16 0 to i32
+  %sext.1 = sext i16 0 to i32
+  %zext.2 = zext i16 0 to i32
+  %sext.2 = sext i16 0 to i32
+  %zext.3 = zext i16 0 to i32
+  %sext.3 = sext i16 0 to i32
+
+  %cmp.0 = icmp ule i32 %zext.0, %sext.0
+  %cmp.1 = icmp ule i32 %zext.1, %sext.1
+  %cmp.2 = icmp ule i32 %zext.2, %sext.2
+  %cmp.3 = icmp ule i32 %zext.3, %sext.3
+
+  %and.0 = and i1 %cmp.0, %cmp.1
+  %and.1 = and i1 %and.0, %cmp.2
+  %and.2 = and i1 %and.1, %cmp.3
+
+  %zext.4 = zext i1 %and.2 to i32
+
+  ret i32 %zext.4
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll
index 7b4e2b0ce911..1bb87bf6205f 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll
@@ -7,9 +7,9 @@ define void @test(ptr %a, i8 %0, i16 %b.promoted.i) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP0]] to i128
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[B_PROMOTED_I]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i128> poison, i128 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i128> [[TMP5]], <4 x i128> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = trunc <4 x i128> [[TMP6]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i128 [[TMP2]] to i16
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i16> poison, i16 [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i16> [[TMP4]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> [[TMP8]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = zext i16 [[TMP9]] to i64
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
index 94534274cab2..5f8941e9f889 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -332,18 +332,14 @@ define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture re
 
 define void @vec_shuff_reorder() #0 {
 ; CHECK-LABEL: @vec_shuff_reorder(
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr @fb, align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr @fa, align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 1), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr @fa, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr @fb, align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP15:%.*]] = fadd <4 x float> [[TMP10]], [[TMP14]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll
index 610cc5bdeb31..536526a5cfe0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll
@@ -6,11 +6,11 @@ define i1 @test() {
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 0, 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i64> [[TMP1]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i64> [[TMP2]], i64 0, i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> <i32 3, i32 poison, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i64> <i64 undef, i64 0, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <8 x i64> [[TMP3]], <8 x i32> <i32 11, i32 11, i32 11, i32 1, i32 9, i32 9, i32 1, i32 8>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 0, i64 0, i64 0, i64 0>, i64 0, i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> <i64 undef, i64 undef, i64 0, i64 0>, i64 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP11]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 1, i32 3, i32 0>
 ; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i64> [[TMP3]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub <8 x i64> [[TMP3]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 12, i32 5, i32 6, i32 7>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll
index cac0491d0b64..7ae6793fba4c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll
@@ -8,12 +8,10 @@ define i1 @test(float %0, double %1) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> <double poison, double 0.000000e+00, double poison, double 0.000000e+00>, <4 x i32> <i32 poison, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double poison, double 0.000000e+00, double poison, double poison>, <4 x i32> <i32 poison, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 0, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 1, i32 2, i32 0, i32 7>
 ; CHECK-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
index dd7ba71ed673..f1580599ba12 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
@@ -12,8 +12,8 @@ define i64 @foo(i32 %tmp7) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 2, i32 3, i32 5, i32 poison, i32 6>
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP24]], i32 6
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP77:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP77]], <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
index f665dac3282b..24b95c4e6ff2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
@@ -7,17 +7,14 @@ define void @foo(double %i) {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double 0.000000e+00>, double [[I]], i32 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> zeroinitializer, [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> <double 0.000000e+00, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x i32> <i32 8, i32 poison, i32 2, i32 poison, i32 poison, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x double> [[TMP7]], double [[TMP2]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 1>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 5, i32 poison, i32 7>
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP5]], i32 6
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul <8 x double> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 1, i32 poison, i32 0, i32 poison, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> <double poison, double 0.000000e+00, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 0, i32 9, i32 poison, i32 3, i32 12, i32 5, i32 poison, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[TMP5]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul <8 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, [[TMP8]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = fadd <8 x double> zeroinitializer, [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = fadd <8 x double> [[TMP13]], zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = fcmp ult <8 x double> [[TMP14]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
new file mode 100644
index 000000000000..d80d7b5ecd4e
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v4 < %s | FileCheck %s
+
+%struct.rect = type { float, float, float, float }
+
+define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr [[I7:%.*]], i32 [[TMP0:%.*]], i1 [[TOBOOL62_NOT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RC21:%.*]] = alloca [0 x [0 x %struct.rect]], i32 0, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[RC21]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[X1:%.*]] = getelementptr i8, ptr [[RC21]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, ptr [[X1]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[I7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]]
+; CHECK-NEXT:    store <4 x float> [[TMP16]], ptr [[RC21]], align 4
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       entry.if.end72_crit_edge:
+; CHECK-NEXT:    br label [[IF_END72:%.*]]
+; CHECK:       if.then63:
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x float> [ poison, [[IF_THEN63:%.*]] ], [ [[TMP16]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[TMP17]])
+; CHECK-NEXT:    [[TMP19:%.*]] = fptosi <4 x float> [[TMP18]] to <4 x i32>
+; CHECK-NEXT:    br label [[IF_END72]]
+; CHECK:       if.end72:
+; CHECK-NEXT:    [[TMP20:%.*]] = phi <4 x i32> [ poison, [[ENTRY_IF_END72_CRIT_EDGE:%.*]] ], [ [[TMP19]], [[IF_END]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    br i1 [[TOBOOL62_NOT]], label [[IF_END75:%.*]], label [[IF_THEN74:%.*]]
+; CHECK:       if.then74:
+; CHECK-NEXT:    br label [[IF_END75]]
+; CHECK:       if.end75:
+; CHECK-NEXT:    [[TMP22:%.*]] = phi <4 x i32> [ [[TMP20]], [[IF_THEN74]] ], [ [[TMP21]], [[IF_END72]] ]
+; CHECK-NEXT:    [[TMP23:%.*]] = or <4 x i32> [[TMP22]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP25:%.*]] = mul <4 x i32> [[TMP23]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = sitofp <4 x i32> [[TMP25]] to <4 x float>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    store <4 x float> [[TMP27]], ptr [[RC21]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %rc21 = alloca [0 x [0 x %struct.rect]], i32 0, align 4
+  %1 = load float, ptr %rc21, align 4
+  %cmp = fcmp olt float %1, 0.000000e+00
+  %conv = sitofp i32 %0 to float
+  %cmp2 = fcmp olt float %conv, 0.000000e+00
+  %cond = select i1 %cmp2, float %conv, float 0.000000e+00
+  %cond9 = select i1 %cmp, float 0.000000e+00, float %cond
+  store float %cond9, ptr %rc21, align 4
+  %x1 = getelementptr i8, ptr %rc21, i64 4
+  %2 = load float, ptr %x1, align 4
+  %cmp11 = fcmp olt float %2, 0.000000e+00
+  %conv16 = sitofp i32 %0 to float
+  %cmp17 = fcmp olt float %conv16, 0.000000e+00
+  %cond24 = select i1 %cmp17, float %conv16, float 0.000000e+00
+  %cond26 = select i1 %cmp11, float 0.000000e+00, float %cond24
+  store float %cond26, ptr %x1, align 4
+  %y0 = getelementptr i8, ptr %rc21, i64 8
+  %3 = load float, ptr %y0, align 4
+  %cmp28 = fcmp olt float %3, 0.000000e+00
+  %cmp34 = fcmp olt float %conv, 0.000000e+00
+  %cond41 = select i1 %cmp34, float %conv, float 0.000000e+00
+  %cond43 = select i1 %cmp28, float 0.000000e+00, float %cond41
+  store float %cond43, ptr %y0, align 4
+  %y11 = getelementptr i8, ptr %rc21, i64 12
+  %4 = load float, ptr %i7, align 4
+  %cmp45 = fcmp olt float %4, 0.000000e+00
+  %cmp51 = fcmp olt float %conv16, 0.000000e+00
+  %cond58 = select i1 %cmp51, float %conv16, float 0.000000e+00
+  %cond60 = select i1 %cmp45, float 0.000000e+00, float %cond58
+  store float %cond60, ptr %y11, align 4
+  br label %if.end
+
+entry.if.end72_crit_edge:
+  br label %if.end72
+
+if.then63:
+  br label %if.end
+
+if.end:
+  %5 = phi float [ 0.000000e+00, %if.then63 ], [ %cond60, %entry ]
+  %6 = phi float [ 0.000000e+00, %if.then63 ], [ %cond26, %entry ]
+  %7 = phi float [ 0.000000e+00, %if.then63 ], [ %cond43, %entry ]
+  %8 = phi float [ 0.000000e+00, %if.then63 ], [ %cond9, %entry ]
+  %9 = call float @llvm.round.f32(float %8)
+  %conv65 = fptosi float %9 to i32
+  %10 = call float @llvm.round.f32(float %7)
+  %conv67 = fptosi float %10 to i32
+  %11 = call float @llvm.round.f32(float %6)
+  %conv69 = fptosi float %11 to i32
+  %12 = call float @llvm.round.f32(float %5)
+  %conv71 = fptosi float %12 to i32
+  br label %if.end72
+
+if.end72:
+  %.pre100 = phi i32 [ 0, %entry.if.end72_crit_edge ], [ %conv71, %if.end ]
+  %.pre99 = phi i32 [ 0, %entry.if.end72_crit_edge ], [ %conv67, %if.end ]
+  %.pre98 = phi i32 [ 0, %entry.if.end72_crit_edge ], [ %conv69, %if.end ]
+  %.pre97 = phi i32 [ 0, %entry.if.end72_crit_edge ], [ %conv65, %if.end ]
+  br i1 %tobool62.not, label %if.end75, label %if.then74
+
+if.then74:
+  br label %if.end75
+
+if.end75:
+  %13 = phi i32 [ %.pre99, %if.then74 ], [ %.pre100, %if.end72 ]
+  %14 = phi i32 [ %.pre100, %if.then74 ], [ %.pre99, %if.end72 ]
+  %15 = phi i32 [ %.pre97, %if.then74 ], [ %.pre98, %if.end72 ]
+  %16 = phi i32 [ %.pre98, %if.then74 ], [ %.pre97, %if.end72 ]
+  %sub = or i32 %16, 1
+  %mul = mul i32 %sub, %0
+  %conv77 = sitofp i32 %mul to float
+  store float %conv77, ptr %rc21, align 4
+  %x178 = getelementptr i8, ptr %rc21, i64 4
+  %sub79 = or i32 %15, 1
+  %mul80 = mul i32 %sub79, %0
+  %conv81 = sitofp i32 %mul80 to float
+  store float %conv81, ptr %x178, align 4
+  %y082 = getelementptr i8, ptr %rc21, i64 8
+  %sub83 = or i32 %14, 1
+  %mul84 = mul i32 %sub83, %0
+  %conv85 = sitofp i32 %mul84 to float
+  store float %conv85, ptr %y082, align 4
+  %y186 = getelementptr i8, ptr %rc21, i64 12
+  %sub87 = or i32 %13, 1
+  %mul88 = mul i32 %sub87, %0
+  %conv89 = sitofp i32 %mul88 to float
+  store float %conv89, ptr %y186, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll
new file mode 100644
index 000000000000..37d166953c33
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-99999 < %s | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x ptr> zeroinitializer, i32 0
+; CHECK-NEXT:    [[GETELEMENTPTR6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 872
+; CHECK-NEXT:    store double 0.000000e+00, ptr [[GETELEMENTPTR6]], align 8
+; CHECK-NEXT:    br label [[BB9:%.*]]
+; CHECK:       bb9:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x ptr> [ getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 32, i64 872>), [[BB:%.*]] ]
+; CHECK-NEXT:    ret void
+;
+bb:
+  %getelementptr = getelementptr i8, ptr null, i64 32
+  %0 = extractelement <2 x ptr> zeroinitializer, i32 0
+  %getelementptr6 = getelementptr i8, ptr %0, i64 872
+  store double 0.000000e+00, ptr %getelementptr6, align 8
+  br label %bb9
+
+bb9:
+  %phi10 = phi ptr [ %getelementptr, %bb ]
+  %phi11 = phi ptr [ %getelementptr6, %bb ]
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index 4cc3c1241b56..de06daac7a75 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -1295,7 +1295,7 @@ define i8 @umin_intrinsic_rdx_v16i8(ptr %p0) {
 
 define void @PR49730() {
 ; CHECK-LABEL: @PR49730(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 2, i32 undef, i32 1>)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 2, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 1>)
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]]
 ; CHECK-NEXT:    [[T12:%.*]] = sub nsw i32 undef, undef
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
index 668d3c3c8c82..0ab56279fe47 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
@@ -16,8 +16,7 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
 ; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i1> [[TMP15]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
index 593aad82ad5d..8562e53b1538 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -185,9 +185,13 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
+; CHECK-NEXT:    [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
+; CHECK-NEXT:    [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -233,13 +237,9 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
-; CHECK-NEXT:    [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
-; CHECK-NEXT:    [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll b/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll
index 681d131c5072..488ca0b23cd9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll
@@ -10,7 +10,7 @@ define void @foo() {
 ; CHECK-NEXT:    br label [[BCI_252:%.*]]
 ; CHECK:       bci_252:
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[BCI_0:%.*]] ], [ [[TMP16:%.*]], [[BCI_252_1:%.*]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i32> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i32> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = or <2 x i32> [[TMP6]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
index 75505f632a43..29021150ccd2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2   | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx    | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2   | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2   -slp-threshold=-1 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx    | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2   | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s
 
 define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i32(
@@ -98,58 +98,19 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
 }
 
 define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
-; SSE-LABEL: @store_i64(
-; SSE-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; SSE-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; SSE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
-; SSE-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
-; SSE-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
-; SSE-NEXT:    [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
-; SSE-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
-; SSE-NEXT:    store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
-; SSE-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
-; SSE-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
-; SSE-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; SSE-NEXT:    [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
-; SSE-NEXT:    [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
-; SSE-NEXT:    [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
-; SSE-NEXT:    store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
-; SSE-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
-; SSE-NEXT:    [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
-; SSE-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
-; SSE-NEXT:    [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
-; SSE-NEXT:    [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
-; SSE-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
-; SSE-NEXT:    store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
-; SSE-NEXT:    [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
-; SSE-NEXT:    [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
-; SSE-NEXT:    [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
-; SSE-NEXT:    [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
-; SSE-NEXT:    [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
-; SSE-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
-; SSE-NEXT:    store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @store_i64(
-; AVX-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
-; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
-; AVX-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
-; AVX-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
-; AVX-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
-; AVX-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
-; AVX-NEXT:    [[TMP12:%.*]] = and <4 x i64> [[TMP9]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-; AVX-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> <i64 255, i64 255, i64 255, i64 255>
-; AVX-NEXT:    store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @store_i64(
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
+; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK-NEXT:    [[TMP12:%.*]] = and <4 x i64> [[TMP9]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> <i64 255, i64 255, i64 255, i64 255>
+; CHECK-NEXT:    store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
+; CHECK-NEXT:    ret void
 ;
   %4 = zext i32 %1 to i64
   %5 = load i64, ptr %0, align 8, !tbaa !7
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll
index 19a3a7d53df0..9df7aa1c727c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll
@@ -5,8 +5,8 @@ define void @test(i32 %0, ptr %p) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: i32 [[TMP0:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> <i32 0, i32 1, i32 0, i32 poison>, i32 [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[PH:%.*]]
 ; CHECK:       ph:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
index 69ecf1852aed..8f1d7a11e150 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
@@ -7,11 +7,9 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7
 ; CHECK-NEXT:    [[T5:%.*]] = load i32, ptr [[T4]], align 4
 ; CHECK-NEXT:    [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1
-; CHECK-NEXT:    [[T9:%.*]] = load i32, ptr [[T8]], align 4
 ; CHECK-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6
 ; CHECK-NEXT:    [[T11:%.*]] = load i32, ptr [[T10]], align 4
-; CHECK-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2
-; CHECK-NEXT:    [[T15:%.*]] = load i32, ptr [[T14]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
 ; CHECK-NEXT:    [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5
 ; CHECK-NEXT:    [[T17:%.*]] = load i32, ptr [[T16]], align 4
 ; CHECK-NEXT:    [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3
@@ -21,10 +19,11 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
 ; CHECK-NEXT:    [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
 ; CHECK-NEXT:    [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
+; CHECK-NEXT:    [[T9:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[T15:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
 ; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
 ; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
-; CHECK-NEXT:    [[T32:%.*]] = mul nsw i32 [[T27]], 6270
 ; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
 ; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
 ; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
@@ -34,20 +33,19 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T42:%.*]] = mul nsw i32 [[T17]], 16819
 ; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT:    [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2
-; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T27]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T47]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, <4 x i32> <i32 1, i32 0, i32 6, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T40]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 poison, i32 poison, i32 3>
+; CHECK-NEXT:    [[T701:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[T50]], i32 5
 ; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
-; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
-; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T71]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    store <8 x i32> [[T76]], ptr [[T2]], align 4
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
index 8b131ccd01c0..16a9bf53b54a 100644
--- a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
@@ -43,7 +43,7 @@ declare i32 @llvm.umin.i32(i32, i32)
 define void @test2() {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 3, i32 undef, i32 0>)
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 3, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 0>)
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <4 x i32> undef, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 77)
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
index 867a49dbaed2..7258ffca1278 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
@@ -1,5 +1,9 @@
-; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-profile.prof -pass-remarks=sample-profile -pass-remarks-output=%t.opt.yaml -sample-profile-use-profi=0 -S | FileCheck %s
-; RUN: FileCheck %s -check-prefix=YAML < %t.opt.yaml
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-profile.prof -pass-remarks=sample-profile -pass-remarks-output=%t.opt.yaml -sample-profile-use-profi=0 -S -o %t
+; RUN: FileCheck %s --input-file %t
+; RUN: FileCheck %s -check-prefix=YAML --input-file %t.opt.yaml
+; RUN: opt < %t -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-profile.prof -sample-profile-remove-probe -S | FileCheck %s -check-prefix=REMOVE-PROBE
+
+; REMOVE-PROBE-NOT: call void @llvm.pseudoprobe
 
 define dso_local i32 @foo(i32 %x, ptr %f) #0 !dbg !4 {
 entry:
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll
index 0d3aa8b24310..e70bea2d2f7a 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll
@@ -106,3 +106,96 @@ for.inc:                                          ; preds = %for.cond5
   store i8 0, ptr @b, align 1
   br label %for.cond5
 }
+
+define void @e(ptr %p) {
+; CHECK-LABEL: @e(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[FOR_COND]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i16 [[TMP0]] to i1
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FOR_END_SPLIT:%.*]], label [[FOR_END_SPLIT_US:%.*]]
+; CHECK:       for.end.split.us:
+; CHECK-NEXT:    br label [[G_US:%.*]]
+; CHECK:       g.us:
+; CHECK-NEXT:    br label [[G_SPLIT_US6:%.*]]
+; CHECK:       for.cond1.us1:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[P]], align 2
+; CHECK-NEXT:    [[TOBOOL4_NOT_US:%.*]] = trunc i16 [[TMP2]] to i1
+; CHECK-NEXT:    br i1 [[TOBOOL4_NOT_US]], label [[FOR_COND5_PREHEADER_US4:%.*]], label [[G_LOOPEXIT_US:%.*]]
+; CHECK:       for.cond5.us2:
+; CHECK-NEXT:    br i1 false, label [[FOR_COND1_LOOPEXIT_US5:%.*]], label [[FOR_INC_US3:%.*]]
+; CHECK:       for.inc.us3:
+; CHECK-NEXT:    store i8 0, ptr @b, align 1
+; CHECK-NEXT:    br label [[FOR_COND5_US2:%.*]]
+; CHECK:       for.cond5.preheader.us4:
+; CHECK-NEXT:    br label [[FOR_COND5_US2]]
+; CHECK:       for.cond1.loopexit.us5:
+; CHECK-NEXT:    br label [[FOR_COND1_US1:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       g.loopexit.us:
+; CHECK-NEXT:    br label [[G_US]]
+; CHECK:       g.split.us6:
+; CHECK-NEXT:    br label [[FOR_COND1_US1]]
+; CHECK:       for.end.split:
+; CHECK-NEXT:    br label [[G:%.*]]
+; CHECK:       g.loopexit:
+; CHECK-NEXT:    br label [[G]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       g:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[P]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i16 [[TMP3]] to i1
+; CHECK-NEXT:    br i1 [[TMP4]], label [[G_SPLIT_US:%.*]], label [[G_SPLIT:%.*]]
+; CHECK:       g.split.us:
+; CHECK-NEXT:    br label [[FOR_COND1_US:%.*]]
+; CHECK:       for.cond1.us:
+; CHECK-NEXT:    br label [[FOR_COND5_PREHEADER_US:%.*]]
+; CHECK:       for.cond5.us:
+; CHECK-NEXT:    br i1 false, label [[FOR_COND1_LOOPEXIT_US:%.*]], label [[FOR_INC_US:%.*]]
+; CHECK:       for.inc.us:
+; CHECK-NEXT:    store i8 0, ptr @b, align 1
+; CHECK-NEXT:    br label [[FOR_COND5_US:%.*]]
+; CHECK:       for.cond5.preheader.us:
+; CHECK-NEXT:    br label [[FOR_COND5_US]]
+; CHECK:       for.cond1.loopexit.us:
+; CHECK-NEXT:    br label [[FOR_COND1_US]]
+; CHECK:       g.split:
+; CHECK-NEXT:    br label [[FOR_COND1:%.*]]
+; CHECK:       for.cond1.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND1]], !llvm.loop [[LOOP3]]
+; CHECK:       for.cond1:
+; CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[P]], align 2
+; CHECK-NEXT:    [[TOBOOL4_NOT:%.*]] = trunc i16 [[TMP5]] to i1
+; CHECK-NEXT:    br i1 [[TOBOOL4_NOT]], label [[FOR_COND5_PREHEADER:%.*]], label [[G_LOOPEXIT:%.*]]
+; CHECK:       for.cond5.preheader:
+; CHECK-NEXT:    br label [[FOR_COND5:%.*]]
+; CHECK:       for.cond5:
+; CHECK-NEXT:    br i1 false, label [[FOR_COND1_LOOPEXIT:%.*]], label [[FOR_INC:%.*]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    store i8 0, ptr @b, align 1
+; CHECK-NEXT:    br label [[FOR_COND5]]
+;
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond, %entry
+  br i1 false, label %for.end, label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  br label %g
+
+g:                                                ; preds = %for.cond1, %for.end
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.cond5, %g
+  %0 = load i16, ptr %p, align 2
+  %tobool4.not = trunc i16 %0 to i1
+  br i1 %tobool4.not, label %for.cond5, label %g
+
+for.cond5:                                        ; preds = %for.inc, %for.cond1
+  br i1 false, label %for.cond1, label %for.inc
+
+for.inc:                                          ; preds = %for.cond5
+  store i8 0, ptr @b, align 1
+  br label %for.cond5
+}
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll
index f97e5c3eec9d..1d8942079ffd 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll
@@ -1326,6 +1326,136 @@ exit:
   ret i32 10
 }
 
+define i32 @partial_unswitch_true_successor_trunc(ptr %ptr, i32 %N) {
+; CHECK-LABEL: @partial_unswitch_true_successor_trunc(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[TMP0]] to i1
+; CHECK-NEXT:    br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[LOOP_HEADER_US:%.*]]
+; CHECK:       loop.header.us:
+; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[LOOP_LATCH_US:%.*]] ]
+; CHECK-NEXT:    br label [[NOCLOBBER_US:%.*]]
+; CHECK:       noclobber.us:
+; CHECK-NEXT:    br label [[LOOP_LATCH_US]]
+; CHECK:       loop.latch.us:
+; CHECK-NEXT:    [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
+; CHECK-NEXT:    [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:    br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK:       exit.split.us:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[SC:%.*]] = trunc i32 [[LV]] to i1
+; CHECK-NEXT:    br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]]
+; CHECK:       noclobber:
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       clobber:
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       exit.split:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 10
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %lv = load i32, ptr %ptr
+  %sc = trunc i32 %lv to i1
+  br i1 %sc, label %noclobber, label %clobber
+
+noclobber:
+  br label %loop.latch
+
+clobber:
+  call void @clobber()
+  br label %loop.latch
+
+loop.latch:
+  %c = icmp ult i32 %iv, %N
+  %iv.next = add i32 %iv, 1
+  br i1 %c, label %loop.header, label %exit
+
+exit:
+  ret i32 10
+}
+
+define i32 @partial_unswitch_false_successor_trunc(ptr %ptr, i32 %N) {
+; CHECK-LABEL: @partial_unswitch_false_successor_trunc(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[TMP0]] to i1
+; CHECK-NEXT:    br i1 [[TMP1]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_US:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[LOOP_HEADER_US:%.*]]
+; CHECK:       loop.header.us:
+; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[LOOP_LATCH_US:%.*]] ]
+; CHECK-NEXT:    br label [[NOCLOBBER_US:%.*]]
+; CHECK:       noclobber.us:
+; CHECK-NEXT:    br label [[LOOP_LATCH_US]]
+; CHECK:       loop.latch.us:
+; CHECK-NEXT:    [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
+; CHECK-NEXT:    [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:    br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK:       exit.split.us:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[SC:%.*]] = trunc i32 [[LV]] to i1
+; CHECK-NEXT:    br i1 [[SC]], label [[CLOBBER:%.*]], label [[NOCLOBBER:%.*]]
+; CHECK:       clobber:
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       noclobber:
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       exit.split:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 10
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %lv = load i32, ptr %ptr
+  %sc = trunc i32 %lv to i1
+  br i1 %sc, label %clobber, label %noclobber
+
+clobber:
+  call void @clobber()
+  br label %loop.latch
+
+noclobber:
+  br label %loop.latch
+
+loop.latch:
+  %c = icmp ult i32 %iv, %N
+  %iv.next = add i32 %iv, 1
+  br i1 %c, label %loop.header, label %exit
+
+exit:
+  ret i32 10
+}
+
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[UNSWITCH_PARTIAL_DISABLE:![0-9]+]]}
 ; CHECK: [[UNSWITCH_PARTIAL_DISABLE]] = !{!"llvm.loop.unswitch.partial.disable"}
 ; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[UNSWITCH_PARTIAL_DISABLE]]}
diff --git a/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll b/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll
index 757340527ec0..ef2d3219cca9 100644
--- a/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll
+++ b/llvm/test/Transforms/SimplifyCFG/UnreachableEliminate.ll
@@ -627,7 +627,233 @@ else:
   ret void
 }
 
+define i32 @test_assume_false(i32 %cond) {
+; CHECK-LABEL: @test_assume_false(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[COND:%.*]], label [[DEFAULT:%.*]] [
+; CHECK-NEXT:      i32 0, label [[EXIT:%.*]]
+; CHECK-NEXT:      i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:      i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       case1:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       case2:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       default:
+; CHECK-NEXT:    unreachable
+; CHECK:       exit:
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ 2, [[CASE1]] ], [ 3, [[CASE2]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    call void @llvm.assume(i1 true)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  switch i32 %cond, label %default [
+  i32 0, label %case0
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case0:
+  br label %exit
+
+case1:
+  br label %exit
+
+case2:
+  br label %exit
+
+default:
+  br label %exit
+
+exit:
+  %bool = phi i1 [ false, %default ], [ true, %case0 ], [ true, %case1 ], [ true, %case2 ]
+  %res = phi i32 [ 0, %default ], [ 1, %case0 ], [ 2, %case1 ], [ 3, %case2 ]
+  call void @llvm.assume(i1 %bool)
+  ret i32 %res
+}
+
+define i32 @test_assume_undef(i32 %cond) {
+; CHECK-LABEL: @test_assume_undef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[COND:%.*]], label [[DEFAULT:%.*]] [
+; CHECK-NEXT:      i32 0, label [[EXIT:%.*]]
+; CHECK-NEXT:      i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:      i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       case1:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       case2:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       default:
+; CHECK-NEXT:    unreachable
+; CHECK:       exit:
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ 2, [[CASE1]] ], [ 3, [[CASE2]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    call void @llvm.assume(i1 true)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  switch i32 %cond, label %default [
+  i32 0, label %case0
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case0:
+  br label %exit
+
+case1:
+  br label %exit
+
+case2:
+  br label %exit
+
+default:
+  br label %exit
+
+exit:
+  %bool = phi i1 [ undef, %default ], [ true, %case0 ], [ true, %case1 ], [ true, %case2 ]
+  %res = phi i32 [ 0, %default ], [ 1, %case0 ], [ 2, %case1 ], [ 3, %case2 ]
+  call void @llvm.assume(i1 %bool)
+  ret i32 %res
+}
+
+define i32 @test_assume_var(i32 %cond, i1 %var) {
+; CHECK-LABEL: @test_assume_var(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[COND:%.*]], label [[DEFAULT:%.*]] [
+; CHECK-NEXT:      i32 0, label [[EXIT:%.*]]
+; CHECK-NEXT:      i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:      i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       case1:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       case2:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       default:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[BOOL:%.*]] = phi i1 [ [[VAR:%.*]], [[DEFAULT]] ], [ true, [[CASE1]] ], [ true, [[CASE2]] ], [ true, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ 0, [[DEFAULT]] ], [ 2, [[CASE1]] ], [ 3, [[CASE2]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[BOOL]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  switch i32 %cond, label %default [
+  i32 0, label %case0
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case0:
+  br label %exit
 
+case1:
+  br label %exit
+
+case2:
+  br label %exit
+
+default:
+  br label %exit
+
+exit:
+  %bool = phi i1 [ %var, %default ], [ true, %case0 ], [ true, %case1 ], [ true, %case2 ]
+  %res = phi i32 [ 0, %default ], [ 1, %case0 ], [ 2, %case1 ], [ 3, %case2 ]
+  call void @llvm.assume(i1 %bool)
+  ret i32 %res
+}
+
+define i32 @test_assume_bundle_nonnull(i32 %cond, ptr nonnull %p) {
+; CHECK-LABEL: @test_assume_bundle_nonnull(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[COND:%.*]], label [[DEFAULT:%.*]] [
+; CHECK-NEXT:      i32 0, label [[EXIT:%.*]]
+; CHECK-NEXT:      i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:      i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       case1:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       case2:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       default:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[PTR:%.*]] = phi ptr [ null, [[DEFAULT]] ], [ [[P:%.*]], [[CASE1]] ], [ [[P]], [[CASE2]] ], [ [[P]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ 0, [[DEFAULT]] ], [ 2, [[CASE1]] ], [ 3, [[CASE2]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "nonnull"(ptr [[PTR]]) ]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  switch i32 %cond, label %default [
+  i32 0, label %case0
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case0:
+  br label %exit
+
+case1:
+  br label %exit
+
+case2:
+  br label %exit
+
+default:
+  br label %exit
+
+exit:
+  %ptr = phi ptr [ null, %default ], [ %p, %case0 ], [ %p, %case1 ], [ %p, %case2 ]
+  %res = phi i32 [ 0, %default ], [ 1, %case0 ], [ 2, %case1 ], [ 3, %case2 ]
+  call void @llvm.assume(i1 true) [ "nonnull"(ptr %ptr) ]
+  ret i32 %res
+}
+
+define i32 @test_assume_bundle_align(i32 %cond, ptr nonnull %p) {
+; CHECK-LABEL: @test_assume_bundle_align(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[COND:%.*]], label [[DEFAULT:%.*]] [
+; CHECK-NEXT:      i32 0, label [[EXIT:%.*]]
+; CHECK-NEXT:      i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:      i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       case1:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       case2:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       default:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[PTR:%.*]] = phi ptr [ null, [[DEFAULT]] ], [ [[P:%.*]], [[CASE1]] ], [ [[P]], [[CASE2]] ], [ [[P]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ 0, [[DEFAULT]] ], [ 2, [[CASE1]] ], [ 3, [[CASE2]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[PTR]], i32 8) ]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  switch i32 %cond, label %default [
+  i32 0, label %case0
+  i32 1, label %case1
+  i32 2, label %case2
+  ]
+
+case0:
+  br label %exit
+
+case1:
+  br label %exit
+
+case2:
+  br label %exit
+
+default:
+  br label %exit
+
+exit:
+  %ptr = phi ptr [ null, %default ], [ %p, %case0 ], [ %p, %case1 ], [ %p, %case2 ]
+  %res = phi i32 [ 0, %default ], [ 1, %case0 ], [ 2, %case1 ], [ 3, %case2 ]
+  call void @llvm.assume(i1 true) [ "align"(ptr %ptr, i32 8) ]
+  ret i32 %res
+}
 
 attributes #0 = { null_pointer_is_valid }
 ;.
diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue-inlined.ll b/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue-inlined.ll
index e00d1daf71de..5af73e789f11 100644
--- a/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue-inlined.ll
+++ b/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue-inlined.ll
@@ -9,7 +9,6 @@ init:
 
 ; CHECK:  %vala = load i64, ptr %ptr
 ; CHECK-NEXT:  call void @llvm.dbg.value(metadata i64 %vala, metadata [[MD:![0-9]*]]
-; CHECK-NEXT:  call void @llvm.dbg.value(metadata i64 %vala, metadata [[MD]]
 ; CHECK-NEXT:  %valbmasked = and i64 %vala, 1
 
 a:                                              ; preds = %init
diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll b/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
index af7da45ec089..c5d723c4e3dd 100644
--- a/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
+++ b/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
@@ -47,7 +47,6 @@ define i1 @hoist_with_debug2(i32 %x) !dbg !22 {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ugt i32 [[X:%.*]], 2
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[X]], metadata [[META21:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23:![0-9]+]]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[X]], metadata [[META21]], metadata !DIExpression()), !dbg [[DBG23]]
 ; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[TOBOOL_NOT]], i1 false, i1 true
 ; CHECK-NEXT:    ret i1 [[DOT]]
 ;
diff --git a/llvm/test/Verifier/invalid-splice.ll b/llvm/test/Verifier/invalid-splice.ll
index d5096bdf17ca..2239386df562 100644
--- a/llvm/test/Verifier/invalid-splice.ll
+++ b/llvm/test/Verifier/invalid-splice.ll
@@ -2,36 +2,36 @@
 
 ; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
 define <2 x double> @splice_v2f64_idx_neg3(<2 x double> %a, <2 x double> %b) #0 {
-  %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -3)
+  %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -3)
   ret <2 x double> %res
 }
 
 ; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
 define <vscale x 2 x double> @splice_nxv2f64_idx_neg3_vscale_min1(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
-  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
+  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
   ret <vscale x 2 x double> %res
 }
 
 ; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
 define <vscale x 2 x double> @splice_nxv2f64_idx_neg5_vscale_min2(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #1 {
-  %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -5)
+  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -5)
   ret <vscale x 2 x double> %res
 }
 
 ; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
 define <2 x double> @splice_v2f64_idx2(<2 x double> %a, <2 x double> %b) #0 {
-  %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 2)
+  %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 2)
   ret <2 x double> %res
 }
 
 ; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
 define <2 x double> @splice_v2f64_idx3(<2 x double> %a, <2 x double> %b) #1 {
-  %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 4)
+  %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 4)
   ret <2 x double> %res
 }
 
 attributes #0 = { vscale_range(1,16) }
 attributes #1 = { vscale_range(2,16) }
 
-declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <2 x double> @llvm.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in
index b6f255d472d1..60a68b0edaf9 100644
--- a/llvm/test/lit.site.cfg.py.in
+++ b/llvm/test/lit.site.cfg.py.in
@@ -61,6 +61,8 @@ config.reverse_iteration = @LLVM_ENABLE_REVERSE_ITERATION@
 config.dxil_tests = @LLVM_INCLUDE_DXIL_TESTS@
 config.have_llvm_driver = @LLVM_TOOL_LLVM_DRIVER_BUILD@
 config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@
+config.have_vc_rev = @LLVM_APPEND_VC_REV@
+config.force_vc_rev = "@LLVM_FORCE_VC_REVISION@"
 
 import lit.llvm
 lit.llvm.initialize(lit_config, config)
diff --git a/llvm/test/tools/gold/X86/thinlto.ll b/llvm/test/tools/gold/X86/thinlto.ll
index 9d56afe9d58d..fb2183450e4c 100644
--- a/llvm/test/tools/gold/X86/thinlto.ll
+++ b/llvm/test/tools/gold/X86/thinlto.ll
@@ -103,8 +103,8 @@
 ; BACKEND1-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND1-NEXT: <VERSION
 ; BACKEND1-NEXT: <FLAGS
-; BACKEND1-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
-; BACKEND1-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; BACKEND1-NEXT: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; BACKEND1-NEXT: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
 ; BACKEND1-NEXT: <COMBINED
 ; BACKEND1-NEXT: <COMBINED
 ; BACKEND1-NEXT: </GLOBALVAL_SUMMARY_BLOCK
@@ -117,7 +117,7 @@
 ; BACKEND2-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; BACKEND2-NEXT: <VERSION
 ; BACKEND2-NEXT: <FLAGS
-; BACKEND2-NEXT: <VALUE_GUID op0=1 op1=-5300342847281564238
+; BACKEND2-NEXT: <VALUE_GUID {{.*}} op0=1 op1=3060885059 op2=1207956914
 ; BACKEND2-NEXT: <COMBINED
 ; BACKEND2-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 
@@ -136,8 +136,8 @@
 ; COMBINED-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; COMBINED-NEXT: <VERSION
 ; COMBINED-NEXT: <FLAGS
-; COMBINED-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
-; COMBINED-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
 ; COMBINED-NEXT: <COMBINED
 ; COMBINED-NEXT: <COMBINED
 ; COMBINED-NEXT: </GLOBALVAL_SUMMARY_BLOCK
diff --git a/llvm/test/tools/llc/new-pm/pipeline.ll b/llvm/test/tools/llc/new-pm/pipeline.ll
index 1ace5963e4ef..d1a50642ea31 100644
--- a/llvm/test/tools/llc/new-pm/pipeline.ll
+++ b/llvm/test/tools/llc/new-pm/pipeline.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -filetype=null %s | FileCheck %s
 
 ; CHECK: require<profile-summary>,require<collector-metadata>
-; CHECK: MachineSanitizerBinaryMetadata,FreeMachineFunctionPass
+; CHECK: MachineSanitizerBinaryMetadata
 
diff --git a/llvm/test/tools/llc/new-pm/pipeline.mir b/llvm/test/tools/llc/new-pm/pipeline.mir
index fcc7d4f8f02e..6baa710060f0 100644
--- a/llvm/test/tools/llc/new-pm/pipeline.mir
+++ b/llvm/test/tools/llc/new-pm/pipeline.mir
@@ -1,6 +1,6 @@
 # RUN: llc -mtriple=x86_64-pc-linux-gnu -x mir -passes=no-op-machine-function --print-pipeline-passes -filetype=null < %s | FileCheck %s --match-full-lines
 
-# CHECK: machine-function(no-op-machine-function),PrintMIRPreparePass,machine-function(print,FreeMachineFunctionPass)
+# CHECK: function(machine-function(no-op-machine-function)),PrintMIRPreparePass,function(machine-function(print))
 
 ---
 name: f
diff --git a/llvm/test/tools/llc/new-pm/start-stop.ll b/llvm/test/tools/llc/new-pm/start-stop.ll
index 8c795a7a70f8..ba225d227d4c 100644
--- a/llvm/test/tools/llc/new-pm/start-stop.ll
+++ b/llvm/test/tools/llc/new-pm/start-stop.ll
@@ -2,4 +2,4 @@
 ; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -o /dev/null %s | FileCheck --match-full-lines %s --check-prefix=OBJ
 
 ; NULL: function(mergeicmps,expand-memcmp,gc-lowering)
-; OBJ: function(mergeicmps,expand-memcmp,gc-lowering),PrintMIRPreparePass,machine-function(print)
+; OBJ: function(mergeicmps,expand-memcmp,gc-lowering),PrintMIRPreparePass,function(machine-function(print),invalidate<machine-function-info>)
diff --git a/llvm/test/tools/llvm-driver/passthrough-lld.test b/llvm/test/tools/llvm-driver/passthrough-lld.test
index acd5f3878481..b31fa4e483b9 100644
--- a/llvm/test/tools/llvm-driver/passthrough-lld.test
+++ b/llvm/test/tools/llvm-driver/passthrough-lld.test
@@ -1,6 +1,8 @@
 # REQUIRES: llvm-driver, lld
 
 # RUN: %llvm ld.lld --help | FileCheck %s
+# RUN: %llvm ld --help | FileCheck %s
 # RUN: %llvm lld -flavor ld.lld --help | FileCheck %s
+# RUN: %llvm ld -flavor ld.lld --help | FileCheck %s
 
 # CHECK: supported targets: elf
diff --git a/llvm/test/tools/llvm-lto/thinlto.ll b/llvm/test/tools/llvm-lto/thinlto.ll
index 23843e07d6a5..8eb7e7286e6c 100644
--- a/llvm/test/tools/llvm-lto/thinlto.ll
+++ b/llvm/test/tools/llvm-lto/thinlto.ll
@@ -12,8 +12,8 @@
 ; COMBINED-NEXT: <GLOBALVAL_SUMMARY_BLOCK
 ; COMBINED-NEXT: <VERSION
 ; COMBINED-NEXT: <FLAGS
-; COMBINED-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
-; COMBINED-NEXT: <VALUE_GUID op0={{1|2}} op1={{-3706093650706652785|-5300342847281564238}}
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
+; COMBINED-NEXT: <VALUE_GUID {{.*}} op0={{1|2}} {{op1=3060885059 op2=1207956914|op1=3432075125 op2=3712786831}}
 ; COMBINED-NEXT: <COMBINED
 ; COMBINED-NEXT: <COMBINED
 ; COMBINED-NEXT: </GLOBALVAL_SUMMARY_BLOCK
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s b/llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s
new file mode 100644
index 000000000000..0d67f53e12f1
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s
@@ -0,0 +1,14 @@
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -skip-unsupported-instructions %s 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK-SKIP %s
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK-ERROR %s
+
+# Test defends that if all instructions are skipped leaving an empty input, an error is printed.
+
+bzhi %eax, %ebx, %ecx
+
+# CHECK-ALL-NOT: error
+
+# CHECK-ERROR: error: found an unsupported instruction in the input assembly sequence, use -skip-unsupported-instructions to ignore.
+
+# CHECK-SKIP: warning: found an unsupported instruction in the input assembly sequence, skipping with -skip-unsupported-instructions, note accuracy will be impacted:
+# CHECK-SKIP: note: instruction:      bzhil   %eax, %ebx, %ecx
+# CHECK-SKIP: error: no assembly instructions found.
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s b/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s
index bb88e951c129..3690a1101be9 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s
@@ -1,6 +1,55 @@
-# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s 2>&1 | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -skip-unsupported-instructions -timeline %s 2>&1 | FileCheck --check-prefix=CHECK-SKIP %s
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+
+# Test checks that unsupported instructions exit with an error, unless -skip-unsupported-instructions is passed, in which case the remaining instructions should be analysed.
+
+# CHECK-SKIP: warning: found an unsupported instruction in the input assembly sequence, skipping with -skip-unsupported-instructions, note accuracy will be impacted:
+# CHECK-ERROR: error: found an unsupported instruction in the input assembly sequence, use -skip-unsupported-instructions to ignore.
 
 bzhi %eax, %ebx, %ecx
 
-# CHECK: error: found an unsupported instruction in the input assembly sequence.
-# CHECK-NEXT: note: instruction: 	bzhil	%eax, %ebx, %ecx
+# Supported instruction that may be analysed.
+add %eax, %eax
+
+# CHECK-SKIP: Iterations:        100
+# CHECK-SKIP: Instructions:      100
+# CHECK-SKIP: Total Cycles:      103
+# CHECK-SKIP: Total uOps:        100
+
+# CHECK-SKIP: Dispatch Width:    2
+# CHECK-SKIP: uOps Per Cycle:    0.97
+# CHECK-SKIP: IPC:               0.97
+# CHECK-SKIP: Block RThroughput: 0.5
+
+# CHECK-SKIP: Instruction Info:
+# CHECK-SKIP: [1]: #uOps
+# CHECK-SKIP: [2]: Latency
+# CHECK-SKIP: [3]: RThroughput
+# CHECK-SKIP: [4]: MayLoad
+# CHECK-SKIP: [5]: MayStore
+# CHECK-SKIP: [6]: HasSideEffects (U)
+
+# CHECK-SKIP: [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-SKIP:  1      1     0.50                        addl  %eax, %eax
+
+# CHECK-SKIP: Timeline view:
+
+# CHECK-SKIP: [0,0]     DeER .    . .   addl  %eax, %eax
+# CHECK-SKIP: [1,0]     D=eER.    . .   addl  %eax, %eax
+# CHECK-SKIP: [2,0]     .D=eER    . .   addl  %eax, %eax
+# CHECK-SKIP: [3,0]     .D==eER   . .   addl  %eax, %eax
+# CHECK-SKIP: [4,0]     . D==eER  . .   addl  %eax, %eax
+# CHECK-SKIP: [5,0]     . D===eER . .   addl  %eax, %eax
+# CHECK-SKIP: [6,0]     .  D===eER. .   addl  %eax, %eax
+# CHECK-SKIP: [7,0]     .  D====eER .   addl  %eax, %eax
+# CHECK-SKIP: [8,0]     .   D====eER.   addl  %eax, %eax
+# CHECK-SKIP: [9,0]     .   D=====eER   addl  %eax, %eax
+
+# CHECK-SKIP: Average Wait times (based on the timeline view):
+# CHECK-SKIP: [0]: Executions
+# CHECK-SKIP: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-SKIP: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-SKIP: [3]: Average time elapsed from WB until retire stage
+
+# CHECK-SKIP:       [0]    [1]    [2]    [3]
+# CHECK-SKIP: 0.     10    3.5    0.1    0.0       addl       %eax, %eax
diff --git a/llvm/test/tools/llvm-objcopy/MachO/install-name-tool.test b/llvm/test/tools/llvm-objcopy/MachO/install-name-tool.test
new file mode 100644
index 000000000000..b56543bd8cfc
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/MachO/install-name-tool.test
@@ -0,0 +1,19 @@
+## This test checks general llvm-install-name-tool behavior.
+
+# RUN: yaml2obj %s -o %t
+
+## Passing something that doesn't exist
+# RUN: not llvm-install-name-tool -add_rpath foo non-existent-binary 2>&1 | FileCheck %s --check-prefix=DOES_NOT_EXIST
+
+# DOES_NOT_EXIST: {{.*}}non-existent-binary
+
+## Passing a non-Mach-O binary
+# RUN: not llvm-install-name-tool -add_rpath foo %t 2>&1 | FileCheck %s --check-prefix=NON_MACH_O -DFILE=%t
+
+# NON_MACH_O: error: input file: [[FILE]] is not a Mach-O file
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
diff --git a/llvm/test/tools/llvm-profdata/memprof-merge-v0.test b/llvm/test/tools/llvm-profdata/memprof-merge-v0.test
index 03ccbdd42efd..28f65e0781bc 100644
--- a/llvm/test/tools/llvm-profdata/memprof-merge-v0.test
+++ b/llvm/test/tools/llvm-profdata/memprof-merge-v0.test
@@ -16,6 +16,9 @@ RUN: llvm-profdata show %t.prof.v1 | FileCheck %s
 RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=2 --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v2
 RUN: llvm-profdata show %t.prof.v2 | FileCheck %s
 
+RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=2 --memprof-full-schema --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v2
+RUN: llvm-profdata show %t.prof.v2 | FileCheck %s
+
 For now we only check the validity of the instrumented profile since we don't
 have a way to display the contents of the memprof indexed format yet.
 
diff --git a/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc b/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc
new file mode 100644
index 000000000000..bb79dca399c2
--- /dev/null
+++ b/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc
@@ -0,0 +1,16 @@
+101 DIALOG 0, 0, 362, 246
+STYLE 0x40l | 0x0004l | 0x0008l | 0x0800l | 0x00020000l |
+    0x00010000l | 0x80000000l | 0x10000000l | 0x02000000l | 0x00C00000l |
+    0x00080000l | 0x00040000l
+CAPTION "MakeNSISW"
+MENU 104
+FONT 8, "MS Shell Dlg"
+BEGIN
+    CONTROL "",202,"RichEdit20A",0x0004l | 0x0040l |
+                    0x0100l | 0x0800l | 0x00008000 |
+                    0x00010000l | 0x00800000l | 0x00200000l,7,22,348,190
+    CONTROL "",-1,"Static",0x00000010l,7,220,346,1
+    LTEXT "",200,7,230,200,12,0x08000000l
+    DEFPUSHBUTTON "Test &Installer",203,230,226,60,15,0x08000000l | 0x00010000l
+    PUSHBUTTON "&Close",2,296,226,49,15,0x00010000l
+END
diff --git a/llvm/test/tools/llvm-rc/dialog-with-menu.test b/llvm/test/tools/llvm-rc/dialog-with-menu.test
new file mode 100644
index 000000000000..2529e9c1722b
--- /dev/null
+++ b/llvm/test/tools/llvm-rc/dialog-with-menu.test
@@ -0,0 +1,32 @@
+; RUN: llvm-rc -no-preprocess /FO %t -- %p/Inputs/dialog-with-menu.rc
+; RUN: llvm-readobj %t | FileCheck %s
+
+CHECK: Resource type (int): DIALOG (ID 5)
+CHECK-NEXT: Resource name (int): 101
+CHECK-NEXT: Data version: 0
+CHECK-NEXT: Memory flags: 0x1030
+CHECK-NEXT: Language ID: 1033
+CHECK-NEXT: Version (major): 0
+CHECK-NEXT: Version (minor): 0
+CHECK-NEXT: Characteristics: 0
+CHECK-NEXT: Data size: 278
+CHECK-NEXT: Data: (
+CHECK-NEXT:   0000: 4C08CF92 00000000 05000000 00006A01  |L.............j.|
+CHECK-NEXT:   0010: F600FFFF 68000000 4D006100 6B006500  |....h...M.a.k.e.|
+CHECK-NEXT:   0020: 4E005300 49005300 57000000 08004D00  |N.S.I.S.W.....M.|
+CHECK-NEXT:   0030: 53002000 53006800 65006C00 6C002000  |S. .S.h.e.l.l. .|
+CHECK-NEXT:   0040: 44006C00 67000000 4489A150 00000000  |D.l.g...D..P....|
+CHECK-NEXT:   0050: 07001600 5C01BE00 CA005200 69006300  |....\.....R.i.c.|
+CHECK-NEXT:   0060: 68004500 64006900 74003200 30004100  |h.E.d.i.t.2.0.A.|
+CHECK-NEXT:   0070: 00000000 00000000 10000050 00000000  |...........P....|
+CHECK-NEXT:   0080: 0700DC00 5A010100 FFFF5300 74006100  |....Z.....S.t.a.|
+CHECK-NEXT:   0090: 74006900 63000000 00000000 00000258  |t.i.c..........X|
+CHECK-NEXT:   00A0: 00000000 0700E600 C8000C00 C800FFFF  |................|
+CHECK-NEXT:   00B0: 82000000 00000000 01000158 00000000  |...........X....|
+CHECK-NEXT:   00C0: E600E200 3C000F00 CB00FFFF 80005400  |....<.........T.|
+CHECK-NEXT:   00D0: 65007300 74002000 26004900 6E007300  |e.s.t. .&.I.n.s.|
+CHECK-NEXT:   00E0: 74006100 6C006C00 65007200 00000000  |t.a.l.l.e.r.....|
+CHECK-NEXT:   00F0: 00000150 00000000 2801E200 31000F00  |...P....(...1...|
+CHECK-NEXT:   0100: 0200FFFF 80002600 43006C00 6F007300  |......&.C.l.o.s.|
+CHECK-NEXT:   0110: 65000000 0000                        |e.....|
+CHECK-NEXT: )
diff --git a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml
index 46f6a2d0a554..95c23007b1a0 100644
--- a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml
+++ b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml
@@ -34,21 +34,21 @@
 #
 # CHECK: << Total TLI yes SDK no:  8
 # CHECK: >> Total TLI no  SDK yes: 0
-# CHECK: == Total TLI yes SDK yes: 238
+# CHECK: == Total TLI yes SDK yes: 239
 #
 # WRONG_DETAIL: << TLI yes SDK no : '_ZdaPv' aka operator delete[](void*)
 # WRONG_DETAIL: >> TLI no  SDK yes: '_ZdaPvj' aka operator delete[](void*, unsigned int)
 # WRONG_DETAIL-COUNT-8: << TLI yes SDK no : {{.*}}__hot_cold_t
 # WRONG_SUMMARY: << Total TLI yes SDK no:  9{{$}}
 # WRONG_SUMMARY: >> Total TLI no  SDK yes: 1{{$}}
-# WRONG_SUMMARY: == Total TLI yes SDK yes: 237
+# WRONG_SUMMARY: == Total TLI yes SDK yes: 238
 #
 ## The -COUNT suffix doesn't care if there are too many matches, so check
 ## the exact count first; the two directives should add up to that.
 ## Yes, this means additions to TLI will fail this test, but the argument
 ## to -COUNT can't be an expression.
-# AVAIL: TLI knows 479 symbols, 246 available
-# AVAIL-COUNT-246: {{^}} available
+# AVAIL: TLI knows 480 symbols, 247 available
+# AVAIL-COUNT-247: {{^}} available
 # AVAIL-NOT:       {{^}} available
 # UNAVAIL-COUNT-233: not available
 # UNAVAIL-NOT:       not available
@@ -263,6 +263,10 @@ DynamicSymbols:
     Type:            STT_FUNC
     Section:         .text
     Binding:         STB_GLOBAL
+  - Name:            atexit
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
   - Name:            atof
     Type:            STT_FUNC
     Section:         .text
diff --git a/llvm/test/tools/lto/discard-value-names.ll b/llvm/test/tools/lto/discard-value-names.ll
index 723b0701ae22..04d25eaf6067 100644
--- a/llvm/test/tools/lto/discard-value-names.ll
+++ b/llvm/test/tools/lto/discard-value-names.ll
@@ -7,11 +7,10 @@
 
 ; The test requires asserts, as it depends on the default value for
 ; -lto-discard-value-names at the moment.
-; FIXME: -lto-discard-value-names is ignored at the moment.
 
 ; REQUIRES: asserts
 
-; DISCARD: %cmp.i = icmp
+; DISCARD: %{{[0-9]+}} = icmp
 ; DISCARD: %add = add i32
 
 ; KEEP: %cmp.i = icmp
diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp
index b8a33f74bd57..5503f7343cb6 100644
--- a/llvm/tools/gold/gold-plugin.cpp
+++ b/llvm/tools/gold/gold-plugin.cpp
@@ -434,8 +434,10 @@ ld_plugin_status onload(ld_plugin_tv *tv) {
       // FIXME: When binutils 2.31 (containing gold 1.16) is the minimum
       // required version, this should be changed to:
       // get_wrap_symbols = tv->tv_u.tv_get_wrap_symbols;
-      get_wrap_symbols =
-          (ld_plugin_get_wrap_symbols)tv->tv_u.tv_message;
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-function-type"
+      get_wrap_symbols = (ld_plugin_get_wrap_symbols)tv->tv_u.tv_message;
+#pragma GCC diagnostic pop
       break;
     default:
       break;
diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp
index 6ae1b8db5e11..6d9956ea07d3 100644
--- a/llvm/tools/llc/NewPMDriver.cpp
+++ b/llvm/tools/llc/NewPMDriver.cpp
@@ -16,7 +16,6 @@
 #include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/CodeGen/CommandFlags.h"
-#include "llvm/CodeGen/FreeMachineFunction.h"
 #include "llvm/CodeGen/MIRParser/MIRParser.h"
 #include "llvm/CodeGen/MIRPrinter.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
@@ -120,11 +119,11 @@ int llvm::compileModuleWithNewPM(
   SI.registerCallbacks(PIC);
   registerCodeGenCallback(PIC, LLVMTM);
 
+  MachineFunctionAnalysisManager MFAM;
   LoopAnalysisManager LAM;
   FunctionAnalysisManager FAM;
   CGSCCAnalysisManager CGAM;
   ModuleAnalysisManager MAM;
-  MachineFunctionAnalysisManager MFAM;
   PassBuilder PB(Target.get(), PipelineTuningOptions(), std::nullopt, &PIC);
   PB.registerModuleAnalyses(MAM);
   PB.registerCGSCCAnalyses(CGAM);
@@ -137,6 +136,7 @@ int llvm::compileModuleWithNewPM(
   MAM.registerPass([&] { return MachineModuleAnalysis(MMI); });
 
   ModulePassManager MPM;
+  FunctionPassManager FPM;
 
   if (!PassPipeline.empty()) {
     // Construct a custom pass pipeline that starts after instruction
@@ -152,10 +152,10 @@ int llvm::compileModuleWithNewPM(
     MPM.addPass(PrintMIRPreparePass(*OS));
     MachineFunctionPassManager MFPM;
     MFPM.addPass(PrintMIRPass(*OS));
-    MFPM.addPass(FreeMachineFunctionPass());
-    MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
+    FPM.addPass(createFunctionToMachineFunctionPassAdaptor(std::move(MFPM)));
+    MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
 
-    if (MIR->parseMachineFunctions(*M, MMI))
+    if (MIR->parseMachineFunctions(*M, MAM))
       return 1;
   } else {
     ExitOnErr(LLVMTM.buildCodeGenPipeline(
diff --git a/llvm/tools/llvm-cxxmap/llvm-cxxmap.cpp b/llvm/tools/llvm-cxxmap/llvm-cxxmap.cpp
index 6a5646965df2..c5ccd64f1165 100644
--- a/llvm/tools/llvm-cxxmap/llvm-cxxmap.cpp
+++ b/llvm/tools/llvm-cxxmap/llvm-cxxmap.cpp
@@ -144,15 +144,18 @@ int main(int argc, const char *argv[]) {
   cl::HideUnrelatedOptions({&CXXMapCategory, &getColorCategory()});
   cl::ParseCommandLineOptions(argc, argv, "LLVM C++ mangled name remapper\n");
 
-  auto OldSymbolBufOrError = MemoryBuffer::getFileOrSTDIN(OldSymbolFile);
+  auto OldSymbolBufOrError =
+      MemoryBuffer::getFileOrSTDIN(OldSymbolFile, /*IsText=*/true);
   if (!OldSymbolBufOrError)
     exitWithErrorCode(OldSymbolBufOrError.getError(), OldSymbolFile);
 
-  auto NewSymbolBufOrError = MemoryBuffer::getFileOrSTDIN(NewSymbolFile);
+  auto NewSymbolBufOrError =
+      MemoryBuffer::getFileOrSTDIN(NewSymbolFile, /*IsText=*/true);
   if (!NewSymbolBufOrError)
     exitWithErrorCode(NewSymbolBufOrError.getError(), NewSymbolFile);
 
-  auto RemappingBufOrError = MemoryBuffer::getFileOrSTDIN(RemappingFile);
+  auto RemappingBufOrError =
+      MemoryBuffer::getFileOrSTDIN(RemappingFile, /*IsText=*/true);
   if (!RemappingBufOrError)
     exitWithErrorCode(RemappingBufOrError.getError(), RemappingFile);
 
diff --git a/llvm/tools/llvm-driver/CMakeLists.txt b/llvm/tools/llvm-driver/CMakeLists.txt
index 83e084069b96..82d85c723010 100644
--- a/llvm/tools/llvm-driver/CMakeLists.txt
+++ b/llvm/tools/llvm-driver/CMakeLists.txt
@@ -13,6 +13,11 @@ foreach(tool ${LLVM_DRIVER_TOOLS})
     string(REPLACE "llvm-" "" alias ${alias})
     set(def_decl "${def_decl}LLVM_DRIVER_TOOL(\"${alias}\", ${tool_entry})\n")
   endforeach()
+  get_property(hidden_tool_aliases GLOBAL PROPERTY LLVM_DRIVER_HIDDEN_TOOL_ALIASES_${tool})
+  foreach(alias ${hidden_tool_aliases})
+    string(REPLACE "llvm-" "" alias ${alias})
+    set(def_decl "${def_decl}LLVM_DRIVER_TOOL(\"${alias}\", ${tool_entry})\n")
+  endforeach()
 endforeach()
 
 file(WRITE
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 498308e2edbe..ed53f8fabb17 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -18,6 +18,7 @@
 #include "PerfHelper.h"
 #include "SubprocessMemory.h"
 #include "Target.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
@@ -283,6 +284,7 @@ private:
                    SmallVectorImpl<int64_t> &CounterValues,
                    ArrayRef<const char *> ValidationCounters,
                    SmallVectorImpl<int64_t> &ValidationCounterValues) const {
+    auto WriteFDClose = make_scope_exit([WriteFD]() { close(WriteFD); });
     const ExegesisTarget &ET = State.getExegesisTarget();
     auto CounterOrError =
         ET.createCounter(CounterName, State, ValidationCounters, ChildPID);
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
index 1d44e09ad61e..89d7b197079e 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
@@ -8,6 +8,7 @@
 
 #include "SubprocessMemory.h"
 #include "Error.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <cerrno>
@@ -56,6 +57,8 @@ Error SubprocessMemory::initializeSubprocessMemory(pid_t ProcessID) {
     return make_error<Failure>(
         "Failed to create shared memory object for auxiliary memory: " +
         Twine(strerror(errno)));
+  auto AuxiliaryMemoryFDClose =
+      make_scope_exit([AuxiliaryMemoryFD]() { close(AuxiliaryMemoryFD); });
   if (ftruncate(AuxiliaryMemoryFD, AuxiliaryMemorySize) != 0) {
     return make_error<Failure>("Truncating the auxiliary memory failed: " +
                                Twine(strerror(errno)));
@@ -78,6 +81,8 @@ Error SubprocessMemory::addMemoryDefinition(
       return make_error<Failure>(
           "Failed to create shared memory object for memory definition: " +
           Twine(strerror(errno)));
+    auto SharedMemoryFDClose =
+        make_scope_exit([SharedMemoryFD]() { close(SharedMemoryFD); });
     if (ftruncate(SharedMemoryFD, MemVal.SizeBytes) != 0) {
       return make_error<Failure>("Truncating a memory definiton failed: " +
                                  Twine(strerror(errno)));
@@ -138,7 +143,7 @@ Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
 }
 
 SubprocessMemory::~SubprocessMemory() {
-  for (std::string SharedMemoryName : SharedMemoryNames) {
+  for (const std::string &SharedMemoryName : SharedMemoryNames) {
     if (shm_unlink(SharedMemoryName.c_str()) != 0) {
       errs() << "Failed to unlink shared memory section: " << strerror(errno)
              << "\n";
diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp
index faed9ff9939b..5dd961a603c9 100644
--- a/llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -251,10 +251,9 @@ static int run(int argc, char **argv) {
   // resolutions and apply them in the order observed.
   std::map<std::pair<std::string, std::string>, std::list<SymbolResolution>>
       CommandLineResolutions;
-  for (std::string R : SymbolResolutions) {
-    StringRef Rest = R;
-    StringRef FileName, SymbolName;
-    std::tie(FileName, Rest) = Rest.split(',');
+  for (StringRef R : SymbolResolutions) {
+    StringRef Rest, FileName, SymbolName;
+    std::tie(FileName, Rest) = R.split(',');
     if (Rest.empty()) {
       llvm::errs() << "invalid resolution: " << R << '\n';
       return 1;
diff --git a/llvm/tools/llvm-mca/CodeRegion.h b/llvm/tools/llvm-mca/CodeRegion.h
index ce107fd8f3b6..5a2e8baa1f3e 100644
--- a/llvm/tools/llvm-mca/CodeRegion.h
+++ b/llvm/tools/llvm-mca/CodeRegion.h
@@ -59,6 +59,7 @@
 #define LLVM_TOOLS_LLVM_MCA_CODEREGION_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
@@ -97,6 +98,20 @@ public:
     Instructions.emplace_back(Instruction);
   }
 
+  // Remove the given instructions from the set, for unsupported instructions
+  // being skipped. Returns an ArrayRef for the updated vector of Instructions.
+  [[nodiscard]] llvm::ArrayRef<llvm::MCInst>
+  dropInstructions(const llvm::SmallPtrSetImpl<const llvm::MCInst *> &Insts) {
+    if (Insts.empty())
+      return Instructions;
+    Instructions.erase(std::remove_if(Instructions.begin(), Instructions.end(),
+                                      [&Insts](const llvm::MCInst &Inst) {
+                                        return Insts.contains(&Inst);
+                                      }),
+                       Instructions.end());
+    return Instructions;
+  }
+
   llvm::SMLoc startLoc() const { return RangeStart; }
   llvm::SMLoc endLoc() const { return RangeEnd; }
 
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index eb71cffba6dd..e037c06b12a3 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -237,6 +237,11 @@ static cl::opt<bool> DisableInstrumentManager(
              "ignores instruments.)."),
     cl::cat(ViewOptions), cl::init(false));
 
+static cl::opt<bool> SkipUnsupportedInstructions(
+    "skip-unsupported-instructions",
+    cl::desc("Make unsupported instruction errors into warnings."),
+    cl::cat(ViewOptions), cl::init(false));
+
 namespace {
 
 const Target *getTarget(const char *ProgName) {
@@ -558,6 +563,7 @@ int main(int argc, char **argv) {
   assert(MAB && "Unable to create asm backend!");
 
   json::Object JSONOutput;
+  int NonEmptyRegions = 0;
   for (const std::unique_ptr<mca::AnalysisRegion> &Region : Regions) {
     // Skip empty code regions.
     if (Region->empty())
@@ -571,14 +577,13 @@ int main(int argc, char **argv) {
 
     IPP->resetState();
 
-    DenseMap<const MCInst *, SmallVector<mca::Instrument *>>
-        InstToInstruments;
+    DenseMap<const MCInst *, SmallVector<mca::Instrument *>> InstToInstruments;
     SmallVector<std::unique_ptr<mca::Instruction>> LoweredSequence;
+    SmallPtrSet<const MCInst *, 16> DroppedInsts;
     for (const MCInst &MCI : Insts) {
       SMLoc Loc = MCI.getLoc();
       const SmallVector<mca::Instrument *> Instruments =
           InstrumentRegions.getActiveInstruments(Loc);
-      InstToInstruments.insert({&MCI, Instruments});
 
       Expected<std::unique_ptr<mca::Instruction>> Inst =
           IB.createInstruction(MCI, Instruments);
@@ -588,7 +593,15 @@ int main(int argc, char **argv) {
                 [&IP, &STI](const mca::InstructionError<MCInst> &IE) {
                   std::string InstructionStr;
                   raw_string_ostream SS(InstructionStr);
-                  WithColor::error() << IE.Message << '\n';
+                  if (SkipUnsupportedInstructions)
+                    WithColor::warning()
+                        << IE.Message
+                        << ", skipping with -skip-unsupported-instructions, "
+                           "note accuracy will be impacted:\n";
+                  else
+                    WithColor::error()
+                        << IE.Message
+                        << ", use -skip-unsupported-instructions to ignore.\n";
                   IP->printInst(&IE.Inst, 0, "", *STI, SS);
                   SS.flush();
                   WithColor::note()
@@ -597,14 +610,25 @@ int main(int argc, char **argv) {
           // Default case.
           WithColor::error() << toString(std::move(NewE));
         }
+        if (SkipUnsupportedInstructions) {
+          DroppedInsts.insert(&MCI);
+          continue;
+        }
         return 1;
       }
 
       IPP->postProcessInstruction(Inst.get(), MCI);
-
+      InstToInstruments.insert({&MCI, Instruments});
       LoweredSequence.emplace_back(std::move(Inst.get()));
     }
 
+    Insts = Region->dropInstructions(DroppedInsts);
+
+    // Skip empty regions.
+    if (Insts.empty())
+      continue;
+    NonEmptyRegions++;
+
     mca::CircularSourceMgr S(LoweredSequence,
                              PrintInstructionTables ? 1 : Iterations);
 
@@ -759,6 +783,11 @@ int main(int argc, char **argv) {
     ++RegionIdx;
   }
 
+  if (NonEmptyRegions == 0) {
+    WithColor::error() << "no assembly instructions found.\n";
+    return 1;
+  }
+
   if (PrintJson)
     TOF->os() << formatv("{0:2}", json::Value(std::move(JSONOutput))) << "\n";
 
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index 70e85460d3df..a1897334cff2 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ObjCopy/CommonConfig.h"
 #include "llvm/ObjCopy/ConfigManager.h"
 #include "llvm/ObjCopy/MachO/MachOConfig.h"
+#include "llvm/Object/Binary.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/CRC.h"
@@ -26,6 +27,7 @@
 
 using namespace llvm;
 using namespace llvm::objcopy;
+using namespace llvm::object;
 using namespace llvm::opt;
 
 namespace {
@@ -1242,6 +1244,16 @@ objcopy::parseInstallNameToolOptions(ArrayRef<const char *> ArgsArr) {
   Config.InputFilename = Positional[0];
   Config.OutputFilename = Positional[0];
 
+  Expected<OwningBinary<Binary>> BinaryOrErr =
+      createBinary(Config.InputFilename);
+  if (!BinaryOrErr)
+    return createFileError(Config.InputFilename, BinaryOrErr.takeError());
+  auto *Binary = (*BinaryOrErr).getBinary();
+  if (!Binary->isMachO() && !Binary->isMachOUniversalBinary())
+    return createStringError(errc::invalid_argument,
+                             "input file: %s is not a Mach-O file",
+                             Config.InputFilename.str().c_str());
+
   DC.CopyConfigs.push_back(std::move(ConfigMgr));
   return std::move(DC);
 }
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index 339822e4adcd..675364a1c1bc 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -947,6 +947,55 @@ public:
 };
 AArch64PrettyPrinter AArch64PrettyPrinterInst;
 
+class RISCVPrettyPrinter : public PrettyPrinter {
+public:
+  void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
+                 object::SectionedAddress Address, formatted_raw_ostream &OS,
+                 StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP,
+                 StringRef ObjectFilename, std::vector<RelocationRef> *Rels,
+                 LiveVariablePrinter &LVP) override {
+    if (SP && (PrintSource || PrintLines))
+      SP->printSourceLine(OS, Address, ObjectFilename, LVP);
+    LVP.printBetweenInsts(OS, false);
+
+    size_t Start = OS.tell();
+    if (LeadingAddr)
+      OS << format("%8" PRIx64 ":", Address.Address);
+    if (ShowRawInsn) {
+      size_t Pos = 0, End = Bytes.size();
+      if (End % 4 == 0) {
+        // 32-bit and 64-bit instructions.
+        for (; Pos + 4 <= End; Pos += 4)
+          OS << ' '
+             << format_hex_no_prefix(
+                    llvm::support::endian::read<uint32_t>(
+                        Bytes.data() + Pos, llvm::endianness::little),
+                    8);
+      } else if (End % 2 == 0) {
+        // 16-bit and 48-bits instructions.
+        for (; Pos + 2 <= End; Pos += 2)
+          OS << ' '
+             << format_hex_no_prefix(
+                    llvm::support::endian::read<uint16_t>(
+                        Bytes.data() + Pos, llvm::endianness::little),
+                    4);
+      }
+      if (Pos < End) {
+        OS << ' ';
+        dumpBytes(Bytes.slice(Pos), OS);
+      }
+    }
+
+    AlignToInstStartColumn(Start, STI, OS);
+
+    if (MI) {
+      IP.printInst(MI, Address.Address, "", STI, OS);
+    } else
+      OS << "\t<unknown>";
+  }
+};
+RISCVPrettyPrinter RISCVPrettyPrinterInst;
+
 PrettyPrinter &selectPrettyPrinter(Triple const &Triple) {
   switch(Triple.getArch()) {
   default:
@@ -967,6 +1016,9 @@ PrettyPrinter &selectPrettyPrinter(Triple const &Triple) {
   case Triple::aarch64_be:
   case Triple::aarch64_32:
     return AArch64PrettyPrinterInst;
+  case Triple::riscv32:
+  case Triple::riscv64:
+    return RISCVPrettyPrinterInst;
   }
 }
 
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 78daf9f7dc10..ec046ebfab13 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -308,6 +308,10 @@ cl::opt<memprof::IndexedVersion> MemProfVersionRequested(
                clEnumValN(memprof::Version1, "1", "version 1"),
                clEnumValN(memprof::Version2, "2", "version 2")));
 
+cl::opt<bool> MemProfFullSchema(
+    "memprof-full-schema", cl::Hidden, cl::sub(MergeSubcommand),
+    cl::desc("Use the full schema for serialization"), cl::init(false));
+
 // Options specific to overlap subcommand.
 cl::opt<std::string> BaseFilename(cl::Positional, cl::Required,
                                   cl::desc("<base profile file>"),
@@ -600,7 +604,7 @@ struct WriterContext {
                 SmallSet<instrprof_error, 4> &WriterErrorCodes,
                 uint64_t ReservoirSize = 0, uint64_t MaxTraceLength = 0)
       : Writer(IsSparse, ReservoirSize, MaxTraceLength, DoWritePrevVersion,
-               MemProfVersionRequested),
+               MemProfVersionRequested, MemProfFullSchema),
         ErrLock(ErrLock), WriterErrorCodes(WriterErrorCodes) {}
 };
 
diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.cpp b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
index d507525970ec..85b59532bb83 100644
--- a/llvm/tools/llvm-rc/ResourceFileWriter.cpp
+++ b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
@@ -550,6 +550,11 @@ Error ResourceFileWriter::visitVersionStmt(const VersionStmt *Stmt) {
   return Error::success();
 }
 
+Error ResourceFileWriter::visitMenuStmt(const MenuStmt *Stmt) {
+  ObjectData.Menu = Stmt->Value;
+  return Error::success();
+}
+
 Error ResourceFileWriter::writeResource(
     const RCResource *Res,
     Error (ResourceFileWriter::*BodyWriter)(const RCResource *)) {
@@ -1132,9 +1137,8 @@ Error ResourceFileWriter::writeDialogBody(const RCResource *Base) {
            ulittle16_t(Res->Height)};
   writeObject(Middle);
 
-  // MENU field. As of now, we don't keep them in the state and can peacefully
-  // think there is no menu attached to the dialog.
-  writeInt<uint16_t>(0);
+  // MENU field.
+  RETURN_IF_ERROR(writeIntOrString(ObjectData.Menu));
 
   // Window CLASS field.
   RETURN_IF_ERROR(writeIntOrString(ObjectData.Class));
diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.h b/llvm/tools/llvm-rc/ResourceFileWriter.h
index 9413a0eecdac..82d3e3b9e9e8 100644
--- a/llvm/tools/llvm-rc/ResourceFileWriter.h
+++ b/llvm/tools/llvm-rc/ResourceFileWriter.h
@@ -16,6 +16,7 @@
 #include "ResourceScriptStmt.h"
 #include "ResourceVisitor.h"
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Endian.h"
 
 namespace llvm {
@@ -68,6 +69,7 @@ public:
   Error visitLanguageStmt(const LanguageResource *) override;
   Error visitStyleStmt(const StyleStmt *) override;
   Error visitVersionStmt(const VersionStmt *) override;
+  Error visitMenuStmt(const MenuStmt *) override;
 
   // Stringtables are output at the end of .res file. We need a separate
   // function to do it.
@@ -92,10 +94,11 @@ public:
     };
     std::optional<FontInfo> Font;
     IntOrString Class;
+    IntOrString Menu;
 
     ObjectInfo()
         : LanguageInfo(0), Characteristics(0), VersionInfo(0),
-          Class(StringRef()) {}
+          Class(StringRef()), Menu(StringRef()) {}
   } ObjectData;
 
   struct StringTableInfo {
diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.cpp b/llvm/tools/llvm-rc/ResourceScriptParser.cpp
index 4f02fa502d24..69798152c1f2 100644
--- a/llvm/tools/llvm-rc/ResourceScriptParser.cpp
+++ b/llvm/tools/llvm-rc/ResourceScriptParser.cpp
@@ -430,6 +430,8 @@ RCParser::parseSingleOptionalStatement(OptStmtType StmtsType) {
       return parseFontStmt(StmtsType);
     if (TypeToken->equals_insensitive("STYLE"))
       return parseStyleStmt();
+    if (TypeToken->equals_insensitive("MENU"))
+      return parseMenuStmt();
   }
 
   return getExpectedError("optional statement type, BEGIN or '{'",
@@ -965,6 +967,11 @@ RCParser::ParseOptionType RCParser::parseExStyleStmt() {
   return std::make_unique<ExStyleStmt>(*Arg);
 }
 
+RCParser::ParseOptionType RCParser::parseMenuStmt() {
+  ASSIGN_OR_RETURN(Arg, readIntOrString());
+  return std::make_unique<MenuStmt>(*Arg);
+}
+
 Error RCParser::getExpectedError(const Twine &Message, bool IsAlreadyRead) {
   return make_error<ParserError>(
       Message, IsAlreadyRead ? std::prev(CurLoc) : CurLoc, End);
diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.h b/llvm/tools/llvm-rc/ResourceScriptParser.h
index 603afd8d73fb..aa7f847187c4 100644
--- a/llvm/tools/llvm-rc/ResourceScriptParser.h
+++ b/llvm/tools/llvm-rc/ResourceScriptParser.h
@@ -176,6 +176,7 @@ private:
   ParseOptionType parseExStyleStmt();
   ParseOptionType parseFontStmt(OptStmtType DialogType);
   ParseOptionType parseStyleStmt();
+  ParseOptionType parseMenuStmt();
 
   // Raises an error. If IsAlreadyRead = false (default), this complains about
   // the token that couldn't be parsed. If the flag is on, this complains about
diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.cpp b/llvm/tools/llvm-rc/ResourceScriptStmt.cpp
index 62df7999252f..a7f3df0863e7 100644
--- a/llvm/tools/llvm-rc/ResourceScriptStmt.cpp
+++ b/llvm/tools/llvm-rc/ResourceScriptStmt.cpp
@@ -309,5 +309,9 @@ raw_ostream &ExStyleStmt::log(raw_ostream &OS) const {
   return OS << "ExStyle: " << Value << "\n";
 }
 
+raw_ostream &MenuStmt::log(raw_ostream &OS) const {
+  return OS << "Menu: " << Value << "\n";
+}
+
 } // namespace rc
 } // namespace llvm
diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.h b/llvm/tools/llvm-rc/ResourceScriptStmt.h
index 70e7cec9cb84..05865e582859 100644
--- a/llvm/tools/llvm-rc/ResourceScriptStmt.h
+++ b/llvm/tools/llvm-rc/ResourceScriptStmt.h
@@ -993,6 +993,19 @@ public:
   Error visit(Visitor *V) const override { return V->visitExStyleStmt(this); }
 };
 
+// MENU optional statement.
+//
+// Ref: https://learn.microsoft.com/en-us/windows/win32/menurc/menu-statement
+class MenuStmt : public OptionalStmt {
+public:
+  IntOrString Value;
+
+  MenuStmt(IntOrString NameOrId) : Value(NameOrId) {}
+  raw_ostream &log(raw_ostream &) const override;
+  Twine getResourceTypeName() const override { return "MENU"; }
+  Error visit(Visitor *V) const override { return V->visitMenuStmt(this); }
+};
+
 // CLASS optional statement.
 //
 // Ref: msdn.microsoft.com/en-us/library/windows/desktop/aa380883(v=vs.85).aspx
diff --git a/llvm/tools/llvm-rc/ResourceVisitor.h b/llvm/tools/llvm-rc/ResourceVisitor.h
index a950cd7555ec..a121a0a507c2 100644
--- a/llvm/tools/llvm-rc/ResourceVisitor.h
+++ b/llvm/tools/llvm-rc/ResourceVisitor.h
@@ -28,6 +28,7 @@ class FontStmt;
 class LanguageResource;
 class StyleStmt;
 class VersionStmt;
+class MenuStmt;
 
 class Visitor {
 public:
@@ -52,6 +53,7 @@ public:
   virtual Error visitLanguageStmt(const LanguageResource *) = 0;
   virtual Error visitStyleStmt(const StyleStmt *) = 0;
   virtual Error visitVersionStmt(const VersionStmt *) = 0;
+  virtual Error visitMenuStmt(const MenuStmt *) = 0;
 
   virtual ~Visitor() {}
 };
diff --git a/llvm/tools/llvm-readtapi/llvm-readtapi.cpp b/llvm/tools/llvm-readtapi/llvm-readtapi.cpp
index 80064ed98485..1f183975d948 100644
--- a/llvm/tools/llvm-readtapi/llvm-readtapi.cpp
+++ b/llvm/tools/llvm-readtapi/llvm-readtapi.cpp
@@ -133,9 +133,7 @@ getInterfaceFile(const StringRef Filename, bool ResetBanner = true) {
   std::unique_ptr<InterfaceFile> IF;
   switch (identify_magic(Buffer->getBuffer())) {
   case file_magic::macho_dynamically_linked_shared_lib:
-    LLVM_FALLTHROUGH;
   case file_magic::macho_dynamically_linked_shared_lib_stub:
-    LLVM_FALLTHROUGH;
   case file_magic::macho_universal_binary:
     IF = ExitOnErr(DylibReader::get(Buffer->getMemBufferRef()));
     break;
diff --git a/llvm/tools/yaml2obj/yaml2obj.cpp b/llvm/tools/yaml2obj/yaml2obj.cpp
index b7f5356e22a9..4a060e1aad42 100644
--- a/llvm/tools/yaml2obj/yaml2obj.cpp
+++ b/llvm/tools/yaml2obj/yaml2obj.cpp
@@ -130,7 +130,7 @@ int main(int argc, char **argv) {
   }
 
   ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
-      MemoryBuffer::getFileOrSTDIN(Input);
+      MemoryBuffer::getFileOrSTDIN(Input, /*IsText=*/true);
   if (!Buf)
     return 1;
 
diff --git a/llvm/unittests/ADT/StringRefTest.cpp b/llvm/unittests/ADT/StringRefTest.cpp
index 8df71e8ad033..fa537e816fc8 100644
--- a/llvm/unittests/ADT/StringRefTest.cpp
+++ b/llvm/unittests/ADT/StringRefTest.cpp
@@ -368,6 +368,8 @@ TEST(StringRefTest, StartsWith) {
   EXPECT_TRUE(Str.starts_with("he"));
   EXPECT_FALSE(Str.starts_with("helloworld"));
   EXPECT_FALSE(Str.starts_with("hi"));
+  EXPECT_TRUE(Str.starts_with('h'));
+  EXPECT_FALSE(Str.starts_with('i'));
 }
 
 TEST(StringRefTest, StartsWithInsensitive) {
@@ -421,6 +423,8 @@ TEST(StringRefTest, EndsWith) {
   EXPECT_FALSE(Str.ends_with("helloworld"));
   EXPECT_FALSE(Str.ends_with("worldhello"));
   EXPECT_FALSE(Str.ends_with("so"));
+  EXPECT_TRUE(Str.ends_with('o'));
+  EXPECT_FALSE(Str.ends_with('p'));
 }
 
 TEST(StringRefTest, EndsWithInsensitive) {
diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
index 8e3fe3b44a84..1fe94e2aae05 100644
--- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
+++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
@@ -495,6 +495,8 @@ TEST_F(TargetLibraryInfoTest, ValidProto) {
       "declare i32 @__cxa_guard_acquire(%struct*)\n"
       "declare void @__cxa_guard_release(%struct*)\n"
 
+      "declare i32 @atexit(void ()*)\n"
+
       "declare i32 @__nvvm_reflect(i8*)\n"
 
       "declare i8* @__memcpy_chk(i8*, i8*, i64, i64)\n"
diff --git a/llvm/unittests/BinaryFormat/CMakeLists.txt b/llvm/unittests/BinaryFormat/CMakeLists.txt
index f0c42a0dd02b..40d3bc4dca0b 100644
--- a/llvm/unittests/BinaryFormat/CMakeLists.txt
+++ b/llvm/unittests/BinaryFormat/CMakeLists.txt
@@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_unittest(BinaryFormatTests
   DwarfTest.cpp
+  ELFTest.cpp
   MachOTest.cpp
   MsgPackDocumentTest.cpp
   MsgPackReaderTest.cpp
diff --git a/llvm/unittests/BinaryFormat/DwarfTest.cpp b/llvm/unittests/BinaryFormat/DwarfTest.cpp
index 2fff8657939b..684e59fa2785 100644
--- a/llvm/unittests/BinaryFormat/DwarfTest.cpp
+++ b/llvm/unittests/BinaryFormat/DwarfTest.cpp
@@ -204,4 +204,19 @@ TEST(DwarfTest, format_provider) {
   EXPECT_EQ("DW_OP_lit0", formatv("{0}", DW_OP_lit0).str());
   EXPECT_EQ("DW_OP_unknown_ff", formatv("{0}", DW_OP_hi_user).str());
 }
+
+TEST(DwarfTest, lname) {
+  auto roundtrip = [](llvm::dwarf::SourceLanguage sl) {
+    auto name_version = toDW_LNAME(sl);
+    // Ignore ones without a defined mapping.
+    if (sl == DW_LANG_Mips_Assembler || sl == DW_LANG_GOOGLE_RenderScript ||
+        !name_version.has_value())
+      return sl;
+    return dwarf::toDW_LANG(name_version->first, name_version->second)
+        .value_or(sl);
+  };
+#define HANDLE_DW_LANG(ID, NAME, LOWER_BOUND, VERSION, VENDOR)                 \
+  EXPECT_EQ(roundtrip(DW_LANG_##NAME), DW_LANG_##NAME);
+#include "llvm/BinaryFormat/Dwarf.def"
+}
 } // end namespace
diff --git a/llvm/unittests/BinaryFormat/ELFTest.cpp b/llvm/unittests/BinaryFormat/ELFTest.cpp
new file mode 100644
index 000000000000..5dbf6ff8d5c9
--- /dev/null
+++ b/llvm/unittests/BinaryFormat/ELFTest.cpp
@@ -0,0 +1,32 @@
+//===- ELFTest.cpp --------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/ELF.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::ELF;
+
+namespace {
+TEST(ELFTest, OSABI) {
+  EXPECT_EQ(ELFOSABI_GNU, convertNameToOSABI("gnu"));
+  EXPECT_EQ(ELFOSABI_FREEBSD, convertNameToOSABI("freebsd"));
+  EXPECT_EQ(ELFOSABI_STANDALONE, convertNameToOSABI("standalone"));
+  EXPECT_EQ(ELFOSABI_NONE, convertNameToOSABI("none"));
+  // Test unrecognized strings.
+  EXPECT_EQ(ELFOSABI_NONE, convertNameToOSABI(""));
+  EXPECT_EQ(ELFOSABI_NONE, convertNameToOSABI("linux"));
+
+  EXPECT_EQ("gnu", convertOSABIToName(ELFOSABI_GNU));
+  EXPECT_EQ("freebsd", convertOSABIToName(ELFOSABI_FREEBSD));
+  EXPECT_EQ("standalone", convertOSABIToName(ELFOSABI_STANDALONE));
+  EXPECT_EQ("none", convertOSABIToName(ELFOSABI_NONE));
+  // Test unrecognized values.
+  EXPECT_EQ("none", convertOSABIToName(0xfe));
+}
+} // namespace
diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
index ef80eed8d180..34a36ba68d7c 100644
--- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
@@ -745,6 +745,120 @@ TEST_F(AArch64GISelMITest, TestNumSignBitsConstant) {
   EXPECT_EQ(3u, Info.computeNumSignBits(CopyRegNeg32));
 }
 
+TEST_F(AArch64GISelMITest, TestNumSignBitsXOR) {
+  StringRef MIRString = "  %c1:_(s8) = G_CONSTANT i8 1\n"
+                        "  %cn1:_(s8) = G_CONSTANT i8 -1\n"
+                        "  %c127:_(s8) = G_CONSTANT i8 127\n"
+                        "  %c32:_(s8) = G_CONSTANT i8 32\n"
+                        "  %cn32:_(s8) = G_CONSTANT i8 -32\n"
+
+                        "  %xor1:_(s8) = G_XOR %c1, %cn1\n"
+                        "  %Copy1:_(s8) = COPY %xor1\n"
+
+                        "  %xor2:_(s8) = G_XOR %c1, %c32\n"
+                        "  %Copy2:_(s8) = COPY %xor2\n"
+
+                        "  %xor3:_(s8) = G_XOR %c32, %c127\n"
+                        "  %Copy3:_(s8) = COPY %xor3\n"
+
+                        "  %xor4:_(s8) = G_XOR %cn32, %c127\n"
+                        "  %Copy4:_(s8) = COPY %xor4\n"
+
+                        "  %xor5:_(s8) = G_XOR %c127, %cn32\n"
+                        "  %Copy5:_(s8) = COPY %xor5\n";
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+  Register Copy1 = Copies[Copies.size() - 5];
+  Register Copy2 = Copies[Copies.size() - 4];
+  Register Copy3 = Copies[Copies.size() - 3];
+  Register Copy4 = Copies[Copies.size() - 2];
+  Register Copy5 = Copies[Copies.size() - 1];
+
+  GISelKnownBits Info(*MF);
+  EXPECT_EQ(7u, Info.computeNumSignBits(Copy1));
+  EXPECT_EQ(2u, Info.computeNumSignBits(Copy2));
+  EXPECT_EQ(1u, Info.computeNumSignBits(Copy3));
+  EXPECT_EQ(1u, Info.computeNumSignBits(Copy4));
+  EXPECT_EQ(1u, Info.computeNumSignBits(Copy5));
+}
+
+TEST_F(AArch64GISelMITest, TestNumSignBitsOR) {
+  StringRef MIRString = "  %c1:_(s8) = G_CONSTANT i8 1\n"
+                        "  %cn1:_(s8) = G_CONSTANT i8 -1\n"
+                        "  %c127:_(s8) = G_CONSTANT i8 127\n"
+                        "  %c32:_(s8) = G_CONSTANT i8 32\n"
+                        "  %cn32:_(s8) = G_CONSTANT i8 -32\n"
+
+                        "  %or1:_(s8) = G_OR %c1, %cn1\n"
+                        "  %Copy1:_(s8) = COPY %or1\n"
+
+                        "  %or2:_(s8) = G_OR %c1, %c32\n"
+                        "  %Copy2:_(s8) = COPY %or2\n"
+
+                        "  %or3:_(s8) = G_OR %c32, %c127\n"
+                        "  %Copy3:_(s8) = COPY %or3\n"
+
+                        "  %or4:_(s8) = G_OR %cn32, %c127\n"
+                        "  %Copy4:_(s8) = COPY %or4\n"
+
+                        "  %or5:_(s8) = G_OR %c127, %cn32\n"
+                        "  %Copy5:_(s8) = COPY %or5\n";
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+  Register Copy1 = Copies[Copies.size() - 5];
+  Register Copy2 = Copies[Copies.size() - 4];
+  Register Copy3 = Copies[Copies.size() - 3];
+  Register Copy4 = Copies[Copies.size() - 2];
+  Register Copy5 = Copies[Copies.size() - 1];
+
+  GISelKnownBits Info(*MF);
+  EXPECT_EQ(8u, Info.computeNumSignBits(Copy1));
+  EXPECT_EQ(2u, Info.computeNumSignBits(Copy2));
+  EXPECT_EQ(1u, Info.computeNumSignBits(Copy3));
+  EXPECT_EQ(8u, Info.computeNumSignBits(Copy4));
+  EXPECT_EQ(8u, Info.computeNumSignBits(Copy5));
+}
+
+TEST_F(AArch64GISelMITest, TestNumSignBitsAND) {
+  StringRef MIRString = "  %c1:_(s8) = G_CONSTANT i8 1\n"
+                        "  %cn1:_(s8) = G_CONSTANT i8 -1\n"
+                        "  %c127:_(s8) = G_CONSTANT i8 127\n"
+                        "  %c32:_(s8) = G_CONSTANT i8 32\n"
+                        "  %cn32:_(s8) = G_CONSTANT i8 -32\n"
+
+                        "  %and1:_(s8) = G_AND %c1, %cn1\n"
+                        "  %Copy1:_(s8) = COPY %and1\n"
+
+                        "  %and2:_(s8) = G_AND %c1, %c32\n"
+                        "  %Copy2:_(s8) = COPY %and2\n"
+
+                        "  %and3:_(s8) = G_AND %c32, %c127\n"
+                        "  %Copy3:_(s8) = COPY %and3\n"
+
+                        "  %and4:_(s8) = G_AND %cn32, %c127\n"
+                        "  %Copy4:_(s8) = COPY %and4\n"
+
+                        "  %and5:_(s8) = G_AND %c127, %cn32\n"
+                        "  %Copy5:_(s8) = COPY %and5\n";
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+  Register Copy1 = Copies[Copies.size() - 5];
+  Register Copy2 = Copies[Copies.size() - 4];
+  Register Copy3 = Copies[Copies.size() - 3];
+  Register Copy4 = Copies[Copies.size() - 2];
+  Register Copy5 = Copies[Copies.size() - 1];
+
+  GISelKnownBits Info(*MF);
+  EXPECT_EQ(7u, Info.computeNumSignBits(Copy1));
+  EXPECT_EQ(8u, Info.computeNumSignBits(Copy2));
+  EXPECT_EQ(2u, Info.computeNumSignBits(Copy3));
+  EXPECT_EQ(1u, Info.computeNumSignBits(Copy4));
+  EXPECT_EQ(1u, Info.computeNumSignBits(Copy5));
+}
+
 TEST_F(AArch64GISelMITest, TestNumSignBitsSext) {
   StringRef MIRString = "  %3:_(p0) = G_IMPLICIT_DEF\n"
                         "  %4:_(s8) = G_LOAD %3 :: (load (s8))\n"
diff --git a/llvm/unittests/CodeGen/PassManagerTest.cpp b/llvm/unittests/CodeGen/PassManagerTest.cpp
index 71f8832d4365..d3a410f5450c 100644
--- a/llvm/unittests/CodeGen/PassManagerTest.cpp
+++ b/llvm/unittests/CodeGen/PassManagerTest.cpp
@@ -184,8 +184,8 @@ TEST_F(PassManagerTest, Basic) {
 
   MachineModuleInfo MMI(LLVMTM);
 
-  LoopAnalysisManager LAM;
   MachineFunctionAnalysisManager MFAM;
+  LoopAnalysisManager LAM;
   FunctionAnalysisManager FAM;
   CGSCCAnalysisManager CGAM;
   ModuleAnalysisManager MAM;
@@ -205,13 +205,17 @@ TEST_F(PassManagerTest, Basic) {
   std::vector<int> Counts;
 
   ModulePassManager MPM;
+  FunctionPassManager FPM;
   MachineFunctionPassManager MFPM;
   MPM.addPass(TestMachineModulePass(Count, Counts));
-  MPM.addPass(createModuleToMachineFunctionPassAdaptor(
+  FPM.addPass(createFunctionToMachineFunctionPassAdaptor(
       TestMachineFunctionPass(Count, Counts)));
+  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
   MPM.addPass(TestMachineModulePass(Count, Counts));
   MFPM.addPass(TestMachineFunctionPass(Count, Counts));
-  MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
+  FPM = FunctionPassManager();
+  FPM.addPass(createFunctionToMachineFunctionPassAdaptor(std::move(MFPM)));
+  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
 
   testing::internal::CaptureStderr();
   MPM.run(*M, MAM);
@@ -248,8 +252,10 @@ TEST_F(PassManagerTest, DiagnosticHandler) {
   ModulePassManager MPM;
   FunctionPassManager FPM;
   MachineFunctionPassManager MFPM;
+  MPM.addPass(RequireAnalysisPass<MachineModuleAnalysis, Module>());
   MFPM.addPass(ReportWarningPass());
-  MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
+  FPM.addPass(createFunctionToMachineFunctionPassAdaptor(std::move(MFPM)));
+  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
   testing::internal::CaptureStderr();
   MPM.run(*M, MAM);
   std::string Output = testing::internal::GetCapturedStderr();
diff --git a/llvm/unittests/Frontend/OpenMPCompositionTest.cpp b/llvm/unittests/Frontend/OpenMPCompositionTest.cpp
index 920b445427e7..0aed247ff167 100644
--- a/llvm/unittests/Frontend/OpenMPCompositionTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPCompositionTest.cpp
@@ -23,6 +23,15 @@ TEST(Composition, GetLeafConstructs) {
   ASSERT_EQ(L3, (ArrayRef<Directive>{OMPD_parallel, OMPD_for, OMPD_simd}));
 }
 
+TEST(Composition, GetLeafConstructsOrSelf) {
+  ArrayRef<Directive> L1 = getLeafConstructsOrSelf(OMPD_loop);
+  ASSERT_EQ(L1, (ArrayRef<Directive>{OMPD_loop}));
+  ArrayRef<Directive> L2 = getLeafConstructsOrSelf(OMPD_parallel_for);
+  ASSERT_EQ(L2, (ArrayRef<Directive>{OMPD_parallel, OMPD_for}));
+  ArrayRef<Directive> L3 = getLeafConstructsOrSelf(OMPD_parallel_for_simd);
+  ASSERT_EQ(L3, (ArrayRef<Directive>{OMPD_parallel, OMPD_for, OMPD_simd}));
+}
+
 TEST(Composition, GetCompoundConstruct) {
   Directive C1 =
       getCompoundConstruct({OMPD_target, OMPD_teams, OMPD_distribute});
diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
index 905928819dda..f873bbd4293a 100644
--- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
+++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
@@ -109,6 +109,62 @@ TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
   UseNewDbgInfoFormat = false;
 }
 
+TEST(BasicBlockDbgInfoTest, SplitBasicBlockBefore) {
+  LLVMContext C;
+  UseNewDbgInfoFormat = true;
+
+  std::unique_ptr<Module> M = parseIR(C, R"---(
+    define dso_local void @func() #0 !dbg !10 {
+      %1 = alloca i32, align 4
+      tail call void @llvm.dbg.declare(metadata ptr %1, metadata !14, metadata !DIExpression()), !dbg !16
+      store i32 2, ptr %1, align 4, !dbg !16
+      ret void, !dbg !17
+    }
+
+    declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
+
+    attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+    !llvm.dbg.cu = !{!0}
+    !llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8}
+    !llvm.ident = !{!9}
+
+    !0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "dummy", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+    !1 = !DIFile(filename: "dummy", directory: "dummy")
+    !2 = !{i32 7, !"Dwarf Version", i32 5}
+    !3 = !{i32 2, !"Debug Info Version", i32 3}
+    !4 = !{i32 1, !"wchar_size", i32 4}
+    !5 = !{i32 8, !"PIC Level", i32 2}
+    !6 = !{i32 7, !"PIE Level", i32 2}
+    !7 = !{i32 7, !"uwtable", i32 2}
+    !8 = !{i32 7, !"frame-pointer", i32 2}
+    !9 = !{!"dummy"}
+    !10 = distinct !DISubprogram(name: "func", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !13)
+    !11 = !DISubroutineType(types: !12)
+    !12 = !{null}
+    !13 = !{}
+    !14 = !DILocalVariable(name: "a", scope: !10, file: !1, line: 2, type: !15)
+    !15 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+    !16 = !DILocation(line: 2, column: 6, scope: !10)
+    !17 = !DILocation(line: 3, column: 2, scope: !10)
+  )---");
+  ASSERT_TRUE(M);
+
+  M->convertToNewDbgValues();
+
+  Function *F = M->getFunction("func");
+
+  BasicBlock &BB = F->getEntryBlock();
+  auto I = std::prev(BB.end(), 2);
+  BB.splitBasicBlockBefore(I, "before");
+
+  BasicBlock &BBBefore = F->getEntryBlock();
+  auto I2 = std::prev(BBBefore.end(), 2);
+  ASSERT_TRUE(I2->hasDbgRecords());
+
+  UseNewDbgInfoFormat = false;
+}
+
 TEST(BasicBlockDbgInfoTest, MarkerOperations) {
   LLVMContext C;
   UseNewDbgInfoFormat = true;
diff --git a/llvm/unittests/IR/IntrinsicsTest.cpp b/llvm/unittests/IR/IntrinsicsTest.cpp
index 3fa4b2cf73b6..dddd2f73d444 100644
--- a/llvm/unittests/IR/IntrinsicsTest.cpp
+++ b/llvm/unittests/IR/IntrinsicsTest.cpp
@@ -81,6 +81,7 @@ TEST_F(IntrinsicsTest, InstrProfInheritance) {
   __ISA(InstrProfCoverInst, InstrProfCntrInstBase);
   __ISA(InstrProfIncrementInst, InstrProfCntrInstBase);
   __ISA(InstrProfIncrementInstStep, InstrProfIncrementInst);
+  __ISA(InstrProfCallsite, InstrProfCntrInstBase);
   __ISA(InstrProfTimestampInst, InstrProfCntrInstBase);
   __ISA(InstrProfValueProfileInst, InstrProfCntrInstBase);
   __ISA(InstrProfMCDCBitmapInstBase, InstrProfInstBase);
@@ -94,6 +95,7 @@ TEST_F(IntrinsicsTest, InstrProfInheritance) {
           {Intrinsic::instrprof_cover, isInstrProfCoverInst},
           {Intrinsic::instrprof_increment, isInstrProfIncrementInst},
           {Intrinsic::instrprof_increment_step, isInstrProfIncrementInstStep},
+          {Intrinsic::instrprof_callsite, isInstrProfCallsite},
           {Intrinsic::instrprof_mcdc_condbitmap_update,
            isInstrProfMCDCCondBitmapUpdate},
           {Intrinsic::instrprof_mcdc_parameters,
diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
index fd010ef2208c..626ab2e9a9c5 100644
--- a/llvm/unittests/IR/VPIntrinsicTest.cpp
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -179,6 +179,8 @@ protected:
         << "(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i1>, i32) ";
     Str << " declare <8 x i16> @llvm.vp.fshr.v8i16"
         << "(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i1>, i32) ";
+    Str << " declare i32 @llvm.vp.cttz.elts.i32.v8i16"
+        << "(<8 x i16>, i1 immarg, <8 x i1>, i32) ";
 
     return parseAssemblyString(Str.str(), Err, C);
   }
diff --git a/llvm/unittests/MIR/CMakeLists.txt b/llvm/unittests/MIR/CMakeLists.txt
index 0ad52134a34d..d6aff46eff07 100644
--- a/llvm/unittests/MIR/CMakeLists.txt
+++ b/llvm/unittests/MIR/CMakeLists.txt
@@ -15,7 +15,6 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_unittest(MIRTests
   MachineMetadata.cpp
-  PassBuilderCallbacksTest.cpp
   )
 
 target_link_libraries(MIRTests PRIVATE LLVMTestingSupport)
diff --git a/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp b/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp
deleted file mode 100644
index 6fd4e54a929f..000000000000
--- a/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp
+++ /dev/null
@@ -1,575 +0,0 @@
-//===- unittests/MIR/PassBuilderCallbacksTest.cpp - PB Callback Tests -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/CGSCCPassManager.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/CodeGen/FreeMachineFunction.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Testing/Support/Error.h"
-#include <functional>
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include <llvm/ADT/Any.h>
-#include <llvm/AsmParser/Parser.h>
-#include <llvm/CodeGen/MIRParser/MIRParser.h>
-#include <llvm/CodeGen/MachineFunction.h>
-#include <llvm/CodeGen/MachineModuleInfo.h>
-#include <llvm/CodeGen/MachinePassManager.h>
-#include <llvm/IR/LLVMContext.h>
-#include <llvm/IR/PassInstrumentation.h>
-#include <llvm/IR/PassManager.h>
-#include <llvm/Passes/PassBuilder.h>
-#include <llvm/Support/Regex.h>
-#include <llvm/Support/SourceMgr.h>
-#include <llvm/Support/TargetSelect.h>
-
-using namespace llvm;
-
-namespace {
-using testing::_;
-using testing::AnyNumber;
-using testing::DoAll;
-using testing::Not;
-using testing::Return;
-using testing::WithArgs;
-
-StringRef MIRString = R"MIR(
---- |
-  define void @test() {
-    ret void
-  }
-...
----
-name:            test
-body:             |
-  bb.0 (%ir-block.0):
-    RET64
-...
-)MIR";
-
-/// Helper for HasName matcher that returns getName both for IRUnit and
-/// for IRUnit pointer wrapper into llvm::Any (wrapped by PassInstrumentation).
-template <typename IRUnitT> std::string getName(const IRUnitT &IR) {
-  return std::string(IR.getName());
-}
-
-template <> std::string getName(const StringRef &name) {
-  return std::string(name);
-}
-
-template <> std::string getName(const Any &WrappedIR) {
-  if (const auto *const *M = llvm::any_cast<const Module *>(&WrappedIR))
-    return (*M)->getName().str();
-  if (const auto *const *F = llvm::any_cast<const Function *>(&WrappedIR))
-    return (*F)->getName().str();
-  if (const auto *const *MF =
-          llvm::any_cast<const MachineFunction *>(&WrappedIR))
-    return (*MF)->getName().str();
-  return "<UNKNOWN>";
-}
-/// Define a custom matcher for objects which support a 'getName' method.
-///
-/// LLVM often has IR objects or analysis objects which expose a name
-/// and in tests it is convenient to match these by name for readability.
-/// Usually, this name is either a StringRef or a plain std::string. This
-/// matcher supports any type exposing a getName() method of this form whose
-/// return value is compatible with an std::ostream. For StringRef, this uses
-/// the shift operator defined above.
-///
-/// It should be used as:
-///
-///   HasName("my_function")
-///
-/// No namespace or other qualification is required.
-MATCHER_P(HasName, Name, "") {
-  *result_listener << "has name '" << getName(arg) << "'";
-  return Name == getName(arg);
-}
-
-MATCHER_P(HasNameRegex, Name, "") {
-  *result_listener << "has name '" << getName(arg) << "'";
-  llvm::Regex r(Name);
-  return r.match(getName(arg));
-}
-
-struct MockPassInstrumentationCallbacks {
-  MockPassInstrumentationCallbacks() {
-    ON_CALL(*this, runBeforePass(_, _)).WillByDefault(Return(true));
-  }
-  MOCK_METHOD2(runBeforePass, bool(StringRef PassID, llvm::Any));
-  MOCK_METHOD2(runBeforeSkippedPass, void(StringRef PassID, llvm::Any));
-  MOCK_METHOD2(runBeforeNonSkippedPass, void(StringRef PassID, llvm::Any));
-  MOCK_METHOD3(runAfterPass,
-               void(StringRef PassID, llvm::Any, const PreservedAnalyses &PA));
-  MOCK_METHOD2(runAfterPassInvalidated,
-               void(StringRef PassID, const PreservedAnalyses &PA));
-  MOCK_METHOD2(runBeforeAnalysis, void(StringRef PassID, llvm::Any));
-  MOCK_METHOD2(runAfterAnalysis, void(StringRef PassID, llvm::Any));
-
-  void registerPassInstrumentation(PassInstrumentationCallbacks &Callbacks) {
-    Callbacks.registerShouldRunOptionalPassCallback(
-        [this](StringRef P, llvm::Any IR) {
-          return this->runBeforePass(P, IR);
-        });
-    Callbacks.registerBeforeSkippedPassCallback(
-        [this](StringRef P, llvm::Any IR) {
-          this->runBeforeSkippedPass(P, IR);
-        });
-    Callbacks.registerBeforeNonSkippedPassCallback(
-        [this](StringRef P, llvm::Any IR) {
-          this->runBeforeNonSkippedPass(P, IR);
-        });
-    Callbacks.registerAfterPassCallback(
-        [this](StringRef P, llvm::Any IR, const PreservedAnalyses &PA) {
-          this->runAfterPass(P, IR, PA);
-        });
-    Callbacks.registerAfterPassInvalidatedCallback(
-        [this](StringRef P, const PreservedAnalyses &PA) {
-          this->runAfterPassInvalidated(P, PA);
-        });
-    Callbacks.registerBeforeAnalysisCallback([this](StringRef P, llvm::Any IR) {
-      return this->runBeforeAnalysis(P, IR);
-    });
-    Callbacks.registerAfterAnalysisCallback(
-        [this](StringRef P, llvm::Any IR) { this->runAfterAnalysis(P, IR); });
-  }
-
-  void ignoreNonMockPassInstrumentation(StringRef IRName) {
-    // Generic EXPECT_CALLs are needed to match instrumentation on unimportant
-    // parts of a pipeline that we do not care about (e.g. various passes added
-    // by default by PassBuilder - Verifier pass etc).
-    // Make sure to avoid ignoring Mock passes/analysis, we definitely want
-    // to check these explicitly.
-    EXPECT_CALL(*this,
-                runBeforePass(Not(HasNameRegex("Mock")), HasName(IRName)))
-        .Times(AnyNumber())
-        .WillRepeatedly(Return(false));
-    EXPECT_CALL(
-        *this, runBeforeSkippedPass(Not(HasNameRegex("Mock")), HasName(IRName)))
-        .Times(AnyNumber());
-    EXPECT_CALL(*this, runBeforeNonSkippedPass(Not(HasNameRegex("Mock")),
-                                               HasName(IRName)))
-        .Times(AnyNumber());
-    EXPECT_CALL(*this,
-                runAfterPass(Not(HasNameRegex("Mock")), HasName(IRName), _))
-        .Times(AnyNumber());
-    EXPECT_CALL(*this,
-                runBeforeAnalysis(Not(HasNameRegex("Mock")), HasName(IRName)))
-        .Times(AnyNumber());
-    EXPECT_CALL(*this,
-                runAfterAnalysis(Not(HasNameRegex("Mock")), HasName(IRName)))
-        .Times(AnyNumber());
-  }
-};
-
-template <typename DerivedT> class MockAnalysisHandleBase {
-public:
-  class Analysis : public AnalysisInfoMixin<Analysis> {
-    friend AnalysisInfoMixin<Analysis>;
-    friend MockAnalysisHandleBase;
-    static AnalysisKey Key;
-
-    DerivedT *Handle;
-
-    Analysis(DerivedT &Handle) : Handle(&Handle) {
-      static_assert(std::is_base_of<MockAnalysisHandleBase, DerivedT>::value,
-                    "Must pass the derived type to this template!");
-    }
-
-  public:
-    class Result {
-      friend MockAnalysisHandleBase;
-
-      DerivedT *Handle;
-
-      Result(DerivedT &Handle) : Handle(&Handle) {}
-
-    public:
-      // Forward invalidation events to the mock handle.
-      bool invalidate(MachineFunction &IR, const PreservedAnalyses &PA,
-                      MachineFunctionAnalysisManager::Invalidator &Inv) {
-        return Handle->invalidate(IR, PA, Inv);
-      }
-    };
-
-    Result run(MachineFunction &IR, MachineFunctionAnalysisManager &AM) {
-      return Handle->run(IR, AM);
-    }
-  };
-
-  Analysis getAnalysis() { return Analysis(static_cast<DerivedT &>(*this)); }
-  typename Analysis::Result getResult() {
-    return typename Analysis::Result(static_cast<DerivedT &>(*this));
-  }
-  static StringRef getName() { return llvm::getTypeName<DerivedT>(); }
-
-protected:
-  // FIXME: MSVC seems unable to handle a lambda argument to Invoke from within
-  // the template, so we use a boring static function.
-  static bool
-  invalidateCallback(MachineFunction &IR, const PreservedAnalyses &PA,
-                     MachineFunctionAnalysisManager::Invalidator &Inv) {
-    auto PAC = PA.template getChecker<Analysis>();
-    return !PAC.preserved() &&
-           !PAC.template preservedSet<AllAnalysesOn<MachineFunction>>();
-  }
-
-  /// Derived classes should call this in their constructor to set up default
-  /// mock actions. (We can't do this in our constructor because this has to
-  /// run after the DerivedT is constructed.)
-  void setDefaults() {
-    ON_CALL(static_cast<DerivedT &>(*this), run(_, _))
-        .WillByDefault(Return(this->getResult()));
-    ON_CALL(static_cast<DerivedT &>(*this), invalidate(_, _, _))
-        .WillByDefault(&invalidateCallback);
-  }
-};
-
-template <typename DerivedT> class MockPassHandleBase {
-public:
-  class Pass : public PassInfoMixin<Pass> {
-    friend MockPassHandleBase;
-
-    DerivedT *Handle;
-
-    Pass(DerivedT &Handle) : Handle(&Handle) {
-      static_assert(std::is_base_of<MockPassHandleBase, DerivedT>::value,
-                    "Must pass the derived type to this template!");
-    }
-
-  public:
-    PreservedAnalyses run(MachineFunction &IR,
-                          MachineFunctionAnalysisManager &AM) {
-      return Handle->run(IR, AM);
-    }
-  };
-
-  static StringRef getName() { return llvm::getTypeName<DerivedT>(); }
-
-  Pass getPass() { return Pass(static_cast<DerivedT &>(*this)); }
-
-protected:
-  /// Derived classes should call this in their constructor to set up default
-  /// mock actions. (We can't do this in our constructor because this has to
-  /// run after the DerivedT is constructed.)
-  void setDefaults() {
-    ON_CALL(static_cast<DerivedT &>(*this), run(_, _))
-        .WillByDefault(Return(PreservedAnalyses::all()));
-  }
-};
-
-struct MockAnalysisHandle : public MockAnalysisHandleBase<MockAnalysisHandle> {
-  MOCK_METHOD2(run, Analysis::Result(MachineFunction &,
-                                     MachineFunctionAnalysisManager &));
-
-  MOCK_METHOD3(invalidate, bool(MachineFunction &, const PreservedAnalyses &,
-                                MachineFunctionAnalysisManager::Invalidator &));
-
-  MockAnalysisHandle() { setDefaults(); }
-};
-
-template <typename DerivedT>
-AnalysisKey MockAnalysisHandleBase<DerivedT>::Analysis::Key;
-
-class MockPassHandle : public MockPassHandleBase<MockPassHandle> {
-public:
-  MOCK_METHOD2(run, PreservedAnalyses(MachineFunction &,
-                                      MachineFunctionAnalysisManager &));
-
-  MockPassHandle() { setDefaults(); }
-};
-
-class MachineFunctionCallbacksTest : public testing::Test {
-protected:
-  static void SetUpTestCase() {
-    InitializeAllTargetInfos();
-    InitializeAllTargets();
-    InitializeAllTargetMCs();
-  }
-
-  LLVMContext Context;
-
-  std::unique_ptr<LLVMTargetMachine> TM;
-  std::unique_ptr<MachineModuleInfo> MMI;
-
-  std::unique_ptr<Module> M;
-
-  PassInstrumentationCallbacks PIC;
-  std::unique_ptr<PassBuilder> PB;
-  ModulePassManager MPM;
-  MachineFunctionAnalysisManager MFAM;
-  LoopAnalysisManager LAM;
-  FunctionAnalysisManager FAM;
-  CGSCCAnalysisManager CGAM;
-  ModuleAnalysisManager MAM;
-
-  MockPassInstrumentationCallbacks CallbacksHandle;
-  MockPassHandle PassHandle;
-  MockAnalysisHandle AnalysisHandle;
-
-  static std::unique_ptr<Module> parseMIR(StringRef MIRCode,
-                                          LLVMContext &Context,
-                                          TargetMachine &TM,
-                                          MachineModuleInfo &MMI) {
-    SMDiagnostic Diagnostic;
-    std::unique_ptr<MemoryBuffer> MBuffer = MemoryBuffer::getMemBuffer(MIRCode);
-    std::unique_ptr<MIRParser> MIR =
-        createMIRParser(std::move(MBuffer), Context);
-    assert(MIR);
-
-    std::unique_ptr<Module> Mod = MIR->parseIRModule();
-    assert(Mod);
-
-    // Module identifier is used in tests below.
-    Mod->setModuleIdentifier("module");
-    Mod->setDataLayout(TM.createDataLayout());
-
-    [[maybe_unused]] bool Ret = MIR->parseMachineFunctions(*Mod, MMI);
-    assert(!Ret);
-
-    return Mod;
-  }
-
-  static PreservedAnalyses
-  getAnalysisResult(MachineFunction &U, MachineFunctionAnalysisManager &MFAM) {
-    MFAM.getResult<MockAnalysisHandle::Analysis>(U);
-    return PreservedAnalyses::all();
-  }
-
-  void SetUp() override {
-    std::string Error;
-    auto TripleName = "x86_64-pc-linux-gnu";
-    auto *T = TargetRegistry::lookupTarget(TripleName, Error);
-    if (!T)
-      GTEST_SKIP();
-    TM = std::unique_ptr<LLVMTargetMachine>(
-        static_cast<LLVMTargetMachine *>(T->createTargetMachine(
-            TripleName, "", "", TargetOptions(), std::nullopt)));
-    if (!TM)
-      GTEST_SKIP();
-
-    MMI = std::make_unique<MachineModuleInfo>(TM.get());
-    M = parseMIR(MIRString, Context, *TM, *MMI);
-    PB = std::make_unique<PassBuilder>(TM.get(), PipelineTuningOptions(),
-                                       std::nullopt, &PIC);
-
-    /// Register a callback for analysis registration.
-    ///
-    /// The callback is a function taking a reference to an AnalyisManager
-    /// object. When called, the callee gets to register its own analyses with
-    /// this PassBuilder instance.
-    PB->registerAnalysisRegistrationCallback(
-        [this](MachineFunctionAnalysisManager &AM) {
-          // Register our mock analysis
-          AM.registerPass([this] { return AnalysisHandle.getAnalysis(); });
-        });
-
-    /// Register a callback for pipeline parsing.
-    ///
-    /// During parsing of a textual pipeline, the PassBuilder will call these
-    /// callbacks for each encountered pass name that it does not know. This
-    /// includes both simple pass names as well as names of sub-pipelines. In
-    /// the latter case, the InnerPipeline is not empty.
-    PB->registerPipelineParsingCallback(
-        [this](StringRef Name, MachineFunctionPassManager &PM,
-               ArrayRef<PassBuilder::PipelineElement> InnerPipeline) {
-          if (parseAnalysisUtilityPasses<MockAnalysisHandle::Analysis>(
-                  "test-analysis", Name, PM))
-            return true;
-
-          /// Parse the name of our pass mock handle
-          if (Name == "test-transform") {
-            PM.addPass(PassHandle.getPass());
-            return true;
-          }
-          return false;
-        });
-
-    /// Register builtin analyses and cross-register the analysis proxies
-    PB->registerModuleAnalyses(MAM);
-    PB->registerCGSCCAnalyses(CGAM);
-    PB->registerFunctionAnalyses(FAM);
-    PB->registerLoopAnalyses(LAM);
-    PB->registerMachineFunctionAnalyses(MFAM);
-    PB->crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM);
-    MAM.registerPass([&] { return MachineModuleAnalysis(*MMI); });
-  }
-};
-
-TEST_F(MachineFunctionCallbacksTest, Passes) {
-  EXPECT_CALL(AnalysisHandle, run(HasName("test"), _));
-  EXPECT_CALL(PassHandle, run(HasName("test"), _)).WillOnce(&getAnalysisResult);
-
-  StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB->parsePassPipeline(MPM, PipelineText), Succeeded())
-      << "Pipeline was: " << PipelineText;
-  MPM.run(*M, MAM);
-}
-
-TEST_F(MachineFunctionCallbacksTest, InstrumentedPasses) {
-  CallbacksHandle.registerPassInstrumentation(PIC);
-  // Non-mock instrumentation not specifically mentioned below can be ignored.
-  CallbacksHandle.ignoreNonMockPassInstrumentation("test");
-  CallbacksHandle.ignoreNonMockPassInstrumentation("module");
-
-  // PassInstrumentation calls should happen in-sequence, in the same order
-  // as passes/analyses are scheduled.
-  ::testing::Sequence PISequence;
-  EXPECT_CALL(CallbacksHandle,
-              runBeforePass(HasNameRegex("MockPassHandle"), HasName("test")))
-      .InSequence(PISequence)
-      .WillOnce(Return(true));
-  EXPECT_CALL(
-      CallbacksHandle,
-      runBeforeNonSkippedPass(HasNameRegex("MockPassHandle"), HasName("test")))
-      .InSequence(PISequence);
-  EXPECT_CALL(
-      CallbacksHandle,
-      runBeforeAnalysis(HasNameRegex("MockAnalysisHandle"), HasName("test")))
-      .InSequence(PISequence);
-  EXPECT_CALL(
-      CallbacksHandle,
-      runAfterAnalysis(HasNameRegex("MockAnalysisHandle"), HasName("test")))
-      .InSequence(PISequence);
-  EXPECT_CALL(CallbacksHandle,
-              runAfterPass(HasNameRegex("MockPassHandle"), HasName("test"), _))
-      .InSequence(PISequence);
-  EXPECT_CALL(
-      CallbacksHandle,
-      runBeforeSkippedPass(HasNameRegex("MockPassHandle"), HasName("test")))
-      .Times(0);
-
-  EXPECT_CALL(AnalysisHandle, run(HasName("test"), _));
-  EXPECT_CALL(PassHandle, run(HasName("test"), _)).WillOnce(&getAnalysisResult);
-
-  StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB->parsePassPipeline(MPM, PipelineText), Succeeded())
-      << "Pipeline was: " << PipelineText;
-  MPM.run(*M, MAM);
-}
-
-TEST_F(MachineFunctionCallbacksTest, InstrumentedSkippedPasses) {
-  CallbacksHandle.registerPassInstrumentation(PIC);
-  // Non-mock instrumentation run here can safely be ignored.
-  CallbacksHandle.ignoreNonMockPassInstrumentation("test");
-  CallbacksHandle.ignoreNonMockPassInstrumentation("module");
-
-  // Skip the pass by returning false.
-  EXPECT_CALL(CallbacksHandle,
-              runBeforePass(HasNameRegex("MockPassHandle"), HasName("test")))
-      .WillOnce(Return(false));
-
-  EXPECT_CALL(
-      CallbacksHandle,
-      runBeforeSkippedPass(HasNameRegex("MockPassHandle"), HasName("test")))
-      .Times(1);
-
-  EXPECT_CALL(AnalysisHandle, run(HasName("test"), _)).Times(0);
-  EXPECT_CALL(PassHandle, run(HasName("test"), _)).Times(0);
-
-  // As the pass is skipped there is no afterPass, beforeAnalysis/afterAnalysis
-  // as well.
-  EXPECT_CALL(CallbacksHandle,
-              runBeforeNonSkippedPass(HasNameRegex("MockPassHandle"), _))
-      .Times(0);
-  EXPECT_CALL(CallbacksHandle,
-              runAfterPass(HasNameRegex("MockPassHandle"), _, _))
-      .Times(0);
-  EXPECT_CALL(CallbacksHandle,
-              runAfterPassInvalidated(HasNameRegex("MockPassHandle"), _))
-      .Times(0);
-  EXPECT_CALL(CallbacksHandle,
-              runAfterPass(HasNameRegex("MockPassHandle"), _, _))
-      .Times(0);
-  EXPECT_CALL(CallbacksHandle,
-              runBeforeAnalysis(HasNameRegex("MockAnalysisHandle"), _))
-      .Times(0);
-  EXPECT_CALL(CallbacksHandle,
-              runAfterAnalysis(HasNameRegex("MockAnalysisHandle"), _))
-      .Times(0);
-
-  StringRef PipelineText = "test-transform";
-  ASSERT_THAT_ERROR(PB->parsePassPipeline(MPM, PipelineText), Succeeded())
-      << "Pipeline was: " << PipelineText;
-  MPM.run(*M, MAM);
-}
-
-// Check that the Module -> MachineFunction adaptor properly calls
-// runAfterPassInvalidated.
-TEST_F(MachineFunctionCallbacksTest, InstrumentedFreeMFPass) {
-  CallbacksHandle.registerPassInstrumentation(PIC);
-  // Non-mock instrumentation run here can safely be ignored.
-  CallbacksHandle.ignoreNonMockPassInstrumentation("test");
-  CallbacksHandle.ignoreNonMockPassInstrumentation("module");
-
-  ::testing::Sequence PISequence;
-  EXPECT_CALL(
-      CallbacksHandle,
-      runBeforePass(HasNameRegex("FreeMachineFunctionPass"), HasName("test")))
-      .InSequence(PISequence)
-      .WillOnce(Return(true));
-  EXPECT_CALL(CallbacksHandle,
-              runBeforeNonSkippedPass(HasNameRegex("FreeMachineFunctionPass"),
-                                      HasName("test")))
-      .InSequence(PISequence);
-  EXPECT_CALL(CallbacksHandle, runAfterPassInvalidated(
-                                   HasNameRegex("FreeMachineFunctionPass"), _))
-      .InSequence(PISequence);
-
-  // runAfterPass should not be called since the MachineFunction is no longer
-  // valid after FreeMachineFunctionPass.
-  EXPECT_CALL(CallbacksHandle,
-              runAfterPass(HasNameRegex("FreeMachineFunctionPass"), _, _))
-      .Times(0);
-
-  MPM.addPass(
-      createModuleToMachineFunctionPassAdaptor(FreeMachineFunctionPass()));
-  MPM.run(*M, MAM);
-}
-
-// Check that the Module -> MachineFunction adaptor and MachineFunction pass
-// manager properly call runAfterPassInvalidated.
-TEST_F(MachineFunctionCallbacksTest, InstrumentedFreeMFPass2) {
-  CallbacksHandle.registerPassInstrumentation(PIC);
-  // Non-mock instrumentation run here can safely be ignored.
-  CallbacksHandle.ignoreNonMockPassInstrumentation("test");
-  CallbacksHandle.ignoreNonMockPassInstrumentation("module");
-
-  ::testing::Sequence PISequence;
-  EXPECT_CALL(
-      CallbacksHandle,
-      runBeforePass(HasNameRegex("FreeMachineFunctionPass"), HasName("test")))
-      .InSequence(PISequence)
-      .WillOnce(Return(true));
-  EXPECT_CALL(CallbacksHandle,
-              runBeforeNonSkippedPass(HasNameRegex("FreeMachineFunctionPass"),
-                                      HasName("test")))
-      .InSequence(PISequence);
-  EXPECT_CALL(CallbacksHandle, runAfterPassInvalidated(
-                                   HasNameRegex("FreeMachineFunctionPass"), _))
-      .InSequence(PISequence);
-  EXPECT_CALL(CallbacksHandle,
-              runAfterPassInvalidated(HasNameRegex("PassManager"), _))
-      .InSequence(PISequence);
-
-  // runAfterPass should not be called since the MachineFunction is no longer
-  // valid after FreeMachineFunctionPass.
-  EXPECT_CALL(CallbacksHandle,
-              runAfterPass(HasNameRegex("FreeMachineFunctionPass"), _, _))
-      .Times(0);
-  EXPECT_CALL(CallbacksHandle, runAfterPass(HasNameRegex("PassManager"), _, _))
-      .Times(0);
-
-  MachineFunctionPassManager MFPM;
-  MFPM.addPass(FreeMachineFunctionPass());
-  MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM)));
-  MPM.run(*M, MAM);
-}
-
-} // end anonymous namespace
diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp
index c4d2b4ae8b9a..c13dc0e3fab8 100644
--- a/llvm/unittests/Object/ELFObjectFileTest.cpp
+++ b/llvm/unittests/Object/ELFObjectFileTest.cpp
@@ -1504,3 +1504,46 @@ Sections:
                "SHT_RELA section with index 1: failed to get a "
                "relocated section: invalid section index: 255");
 }
+
+TEST(ELFObjectFileTest, ELFSymbolRefLess) {
+  SmallString<0> Storage;
+  Expected<ELFObjectFile<ELF64LE>> ElfOrErr = toBinary<ELF64LE>(Storage, R"(
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_DYN
+  Machine: EM_X86_64
+)");
+
+  ASSERT_THAT_EXPECTED(ElfOrErr, Succeeded());
+  const ELFObjectFile<ELF64LE> &Obj = *ElfOrErr;
+
+  const uint32_t ValLow = 0x00000001;
+  const uint32_t ValHigh = 0x00000100;
+
+  auto MakeSymbol = [&Obj](size_t SymtabIndex, size_t SymbolIndex) {
+    DataRefImpl Data;
+    Data.d.a = SymtabIndex;
+    Data.d.b = SymbolIndex;
+    SymbolRef Sym(Data, &Obj);
+    return ELFSymbolRef(Sym);
+  };
+
+  ELFSymbolRef ELFSymLowLow = MakeSymbol(ValLow, ValLow);
+  ELFSymbolRef ELFSymLowHigh = MakeSymbol(ValLow, ValHigh);
+  ELFSymbolRef ELFSymHighLow = MakeSymbol(ValHigh, ValLow);
+  ELFSymbolRef ELFSymHighHigh = MakeSymbol(ValHigh, ValHigh);
+
+  EXPECT_TRUE(ELFSymLowLow < ELFSymLowHigh);
+  EXPECT_FALSE(ELFSymLowHigh < ELFSymLowLow);
+  EXPECT_FALSE(ELFSymLowLow < ELFSymLowLow);
+
+  EXPECT_TRUE(ELFSymLowLow < ELFSymHighHigh);
+  EXPECT_TRUE(ELFSymLowHigh < ELFSymHighLow);
+  EXPECT_TRUE(ELFSymLowLow < ELFSymHighLow);
+
+  EXPECT_FALSE(ELFSymHighLow < ELFSymLowHigh);
+  EXPECT_FALSE(ELFSymHighHigh < ELFSymLowLow);
+  EXPECT_FALSE(ELFSymHighLow < ELFSymLowLow);
+}
diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp
index 73ba0a23ea5f..402de64fe99b 100644
--- a/llvm/unittests/ProfileData/InstrProfTest.cpp
+++ b/llvm/unittests/ProfileData/InstrProfTest.cpp
@@ -370,12 +370,31 @@ static CallStackIdMapTy getCallStackMapping() {
   return Mapping;
 }
 
+// Populate all of the fields of MIB.
+MemInfoBlock makeFullMIB() {
+  MemInfoBlock MIB;
+#define MIBEntryDef(NameTag, Name, Type) MIB.NameTag;
+#include "llvm/ProfileData/MIBEntryDef.inc"
+#undef MIBEntryDef
+  return MIB;
+}
+
+// Populate those fields returned by getHotColdSchema.
+MemInfoBlock makePartialMIB() {
+  MemInfoBlock MIB;
+  MIB.AllocCount = 1;
+  MIB.TotalSize = 5;
+  MIB.TotalLifetime = 10;
+  MIB.TotalLifetimeAccessDensity = 23;
+  return MIB;
+}
+
 IndexedMemProfRecord makeRecord(
     std::initializer_list<std::initializer_list<::llvm::memprof::FrameId>>
         AllocFrames,
     std::initializer_list<std::initializer_list<::llvm::memprof::FrameId>>
         CallSiteFrames,
-    const MemInfoBlock &Block = MemInfoBlock()) {
+    const MemInfoBlock &Block = makeFullMIB()) {
   llvm::memprof::IndexedMemProfRecord MR;
   for (const auto &Frames : AllocFrames)
     MR.AllocSites.emplace_back(Frames, llvm::memprof::hashCallStack(Frames),
@@ -388,13 +407,13 @@ IndexedMemProfRecord makeRecord(
 IndexedMemProfRecord
 makeRecordV2(std::initializer_list<::llvm::memprof::CallStackId> AllocFrames,
              std::initializer_list<::llvm::memprof::CallStackId> CallSiteFrames,
-             const MemInfoBlock &Block = MemInfoBlock()) {
+             const MemInfoBlock &Block, const memprof::MemProfSchema &Schema) {
   llvm::memprof::IndexedMemProfRecord MR;
   for (const auto &CSId : AllocFrames)
     // We don't populate IndexedAllocationInfo::CallStack because we use it only
     // in Version0 and Version1.
     MR.AllocSites.emplace_back(::llvm::SmallVector<memprof::FrameId>(), CSId,
-                               Block);
+                               Block, Schema);
   for (const auto &CSId : CallSiteFrames)
     MR.CallSiteIds.push_back(CSId);
   return MR;
@@ -476,15 +495,18 @@ TEST_F(InstrProfTest, test_memprof_v0) {
   EXPECT_THAT(WantRecord, EqualsRecord(Record));
 }
 
-TEST_F(InstrProfTest, test_memprof_v2) {
+TEST_F(InstrProfTest, test_memprof_v2_full_schema) {
+  const MemInfoBlock MIB = makeFullMIB();
+
   Writer.setMemProfVersionRequested(memprof::Version2);
+  Writer.setMemProfFullSchema(true);
 
   ASSERT_THAT_ERROR(Writer.mergeProfileKind(InstrProfKind::MemProf),
                     Succeeded());
 
   const IndexedMemProfRecord IndexedMR = makeRecordV2(
       /*AllocFrames=*/{0x111, 0x222},
-      /*CallSiteFrames=*/{0x333});
+      /*CallSiteFrames=*/{0x333}, MIB, memprof::getFullSchema());
   const FrameIdMapTy IdToFrameMap = getFrameMapping();
   const auto CSIdToCallStackMap = getCallStackMapping();
   for (const auto &I : IdToFrameMap) {
@@ -502,38 +524,58 @@ TEST_F(InstrProfTest, test_memprof_v2) {
   ASSERT_THAT_ERROR(RecordOr.takeError(), Succeeded());
   const memprof::MemProfRecord &Record = RecordOr.get();
 
-  std::optional<memprof::FrameId> LastUnmappedFrameId;
-  auto IdToFrameCallback = [&](const memprof::FrameId Id) {
-    auto Iter = IdToFrameMap.find(Id);
-    if (Iter == IdToFrameMap.end()) {
-      LastUnmappedFrameId = Id;
-      return memprof::Frame(0, 0, 0, false);
-    }
-    return Iter->second;
-  };
+  memprof::FrameIdConverter<decltype(IdToFrameMap)> FrameIdConv(IdToFrameMap);
+  memprof::CallStackIdConverter<decltype(CSIdToCallStackMap)> CSIdConv(
+      CSIdToCallStackMap, FrameIdConv);
 
-  std::optional<::llvm::memprof::CallStackId> LastUnmappedCSId;
-  auto CSIdToCallStackCallback = [&](::llvm::memprof::CallStackId CSId) {
-    llvm::SmallVector<memprof::Frame> Frames;
-    auto CSIter = CSIdToCallStackMap.find(CSId);
-    if (CSIter == CSIdToCallStackMap.end()) {
-      LastUnmappedCSId = CSId;
-    } else {
-      const ::llvm::SmallVector<::llvm::memprof::FrameId> &CS =
-          CSIter->getSecond();
-      Frames.reserve(CS.size());
-      for (::llvm::memprof::FrameId Id : CS)
-        Frames.push_back(IdToFrameCallback(Id));
-    }
-    return Frames;
-  };
+  const ::llvm::memprof::MemProfRecord WantRecord =
+      IndexedMR.toMemProfRecord(CSIdConv);
+  ASSERT_EQ(FrameIdConv.LastUnmappedId, std::nullopt)
+      << "could not map frame id: " << *FrameIdConv.LastUnmappedId;
+  ASSERT_EQ(CSIdConv.LastUnmappedId, std::nullopt)
+      << "could not map call stack id: " << *CSIdConv.LastUnmappedId;
+  EXPECT_THAT(WantRecord, EqualsRecord(Record));
+}
+
+TEST_F(InstrProfTest, test_memprof_v2_partial_schema) {
+  const MemInfoBlock MIB = makePartialMIB();
+
+  Writer.setMemProfVersionRequested(memprof::Version2);
+  Writer.setMemProfFullSchema(false);
+
+  ASSERT_THAT_ERROR(Writer.mergeProfileKind(InstrProfKind::MemProf),
+                    Succeeded());
+
+  const IndexedMemProfRecord IndexedMR = makeRecordV2(
+      /*AllocFrames=*/{0x111, 0x222},
+      /*CallSiteFrames=*/{0x333}, MIB, memprof::getHotColdSchema());
+  const FrameIdMapTy IdToFrameMap = getFrameMapping();
+  const auto CSIdToCallStackMap = getCallStackMapping();
+  for (const auto &I : IdToFrameMap) {
+    Writer.addMemProfFrame(I.first, I.getSecond(), Err);
+  }
+  for (const auto &I : CSIdToCallStackMap) {
+    Writer.addMemProfCallStack(I.first, I.getSecond(), Err);
+  }
+  Writer.addMemProfRecord(/*Id=*/0x9999, IndexedMR);
+
+  auto Profile = Writer.writeBuffer();
+  readProfile(std::move(Profile));
+
+  auto RecordOr = Reader->getMemProfRecord(0x9999);
+  ASSERT_THAT_ERROR(RecordOr.takeError(), Succeeded());
+  const memprof::MemProfRecord &Record = RecordOr.get();
+
+  memprof::FrameIdConverter<decltype(IdToFrameMap)> FrameIdConv(IdToFrameMap);
+  memprof::CallStackIdConverter<decltype(CSIdToCallStackMap)> CSIdConv(
+      CSIdToCallStackMap, FrameIdConv);
 
   const ::llvm::memprof::MemProfRecord WantRecord =
-      IndexedMR.toMemProfRecord(CSIdToCallStackCallback);
-  ASSERT_EQ(LastUnmappedFrameId, std::nullopt)
-      << "could not map frame id: " << *LastUnmappedFrameId;
-  ASSERT_EQ(LastUnmappedCSId, std::nullopt)
-      << "could not map call stack id: " << *LastUnmappedCSId;
+      IndexedMR.toMemProfRecord(CSIdConv);
+  ASSERT_EQ(FrameIdConv.LastUnmappedId, std::nullopt)
+      << "could not map frame id: " << *FrameIdConv.LastUnmappedId;
+  ASSERT_EQ(CSIdConv.LastUnmappedId, std::nullopt)
+      << "could not map call stack id: " << *CSIdConv.LastUnmappedId;
   EXPECT_THAT(WantRecord, EqualsRecord(Record));
 }
 
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index 7e00a80cacf9..40335d191ba7 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -1,6 +1,7 @@
 #include "llvm/ProfileData/MemProf.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLForwardCompat.h"
 #include "llvm/DebugInfo/DIContext.h"
 #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
 #include "llvm/IR/Value.h"
@@ -122,14 +123,6 @@ MATCHER_P4(FrameContains, FunctionName, LineOffset, Column, Inline, "") {
   return false;
 }
 
-MemProfSchema getFullSchema() {
-  MemProfSchema Schema;
-#define MIBEntryDef(NameTag, Name, Type) Schema.push_back(Meta::Name);
-#include "llvm/ProfileData/MIBEntryDef.inc"
-#undef MIBEntryDef
-  return Schema;
-}
-
 TEST(MemProf, FillsValue) {
   std::unique_ptr<MockSymbolizer> Symbolizer(new MockSymbolizer());
 
@@ -187,7 +180,7 @@ TEST(MemProf, FillsValue) {
 
   // Check the memprof record for foo.
   const llvm::GlobalValue::GUID FooId = IndexedMemProfRecord::getGUID("foo");
-  ASSERT_EQ(Records.count(FooId), 1U);
+  ASSERT_TRUE(Records.contains(FooId));
   const MemProfRecord &Foo = Records[FooId];
   ASSERT_THAT(Foo.AllocSites, SizeIs(1));
   EXPECT_EQ(Foo.AllocSites[0].Info.getAllocCount(), 1U);
@@ -203,7 +196,7 @@ TEST(MemProf, FillsValue) {
 
   // Check the memprof record for bar.
   const llvm::GlobalValue::GUID BarId = IndexedMemProfRecord::getGUID("bar");
-  ASSERT_EQ(Records.count(BarId), 1U);
+  ASSERT_TRUE(Records.contains(BarId));
   const MemProfRecord &Bar = Records[BarId];
   ASSERT_THAT(Bar.AllocSites, SizeIs(1));
   EXPECT_EQ(Bar.AllocSites[0].Info.getAllocCount(), 1U);
@@ -223,7 +216,7 @@ TEST(MemProf, FillsValue) {
 
   // Check the memprof record for xyz.
   const llvm::GlobalValue::GUID XyzId = IndexedMemProfRecord::getGUID("xyz");
-  ASSERT_EQ(Records.count(XyzId), 1U);
+  ASSERT_TRUE(Records.contains(XyzId));
   const MemProfRecord &Xyz = Records[XyzId];
   ASSERT_THAT(Xyz.CallSites, SizeIs(1));
   ASSERT_THAT(Xyz.CallSites[0], SizeIs(2));
@@ -234,7 +227,7 @@ TEST(MemProf, FillsValue) {
 
   // Check the memprof record for abc.
   const llvm::GlobalValue::GUID AbcId = IndexedMemProfRecord::getGUID("abc");
-  ASSERT_EQ(Records.count(AbcId), 1U);
+  ASSERT_TRUE(Records.contains(AbcId));
   const MemProfRecord &Abc = Records[AbcId];
   EXPECT_TRUE(Abc.AllocSites.empty());
   ASSERT_THAT(Abc.CallSites, SizeIs(1));
@@ -248,8 +241,8 @@ TEST(MemProf, PortableWrapper) {
                     /*dealloc_timestamp=*/2000, /*alloc_cpu=*/3,
                     /*dealloc_cpu=*/4);
 
-  const auto Schema = getFullSchema();
-  PortableMemInfoBlock WriteBlock(Info);
+  const auto Schema = llvm::memprof::getFullSchema();
+  PortableMemInfoBlock WriteBlock(Info, Schema);
 
   std::string Buffer;
   llvm::raw_string_ostream OS(Buffer);
@@ -271,7 +264,7 @@ TEST(MemProf, PortableWrapper) {
 // Version0 and Version1 serialize IndexedMemProfRecord in the same format, so
 // we share one test.
 TEST(MemProf, RecordSerializationRoundTripVersion0And1) {
-  const MemProfSchema Schema = getFullSchema();
+  const auto Schema = llvm::memprof::getFullSchema();
 
   MemInfoBlock Info(/*size=*/16, /*access_count=*/7, /*alloc_timestamp=*/1000,
                     /*dealloc_timestamp=*/2000, /*alloc_cpu=*/3,
@@ -305,7 +298,7 @@ TEST(MemProf, RecordSerializationRoundTripVersion0And1) {
 }
 
 TEST(MemProf, RecordSerializationRoundTripVerion2) {
-  const MemProfSchema Schema = getFullSchema();
+  const auto Schema = llvm::memprof::getFullSchema();
 
   MemInfoBlock Info(/*size=*/16, /*access_count=*/7, /*alloc_timestamp=*/1000,
                     /*dealloc_timestamp=*/2000, /*alloc_cpu=*/3,
@@ -334,6 +327,65 @@ TEST(MemProf, RecordSerializationRoundTripVerion2) {
   EXPECT_EQ(Record, GotRecord);
 }
 
+TEST(MemProf, RecordSerializationRoundTripVersion2HotColdSchema) {
+  const auto Schema = llvm::memprof::getHotColdSchema();
+
+  MemInfoBlock Info;
+  Info.AllocCount = 11;
+  Info.TotalSize = 22;
+  Info.TotalLifetime = 33;
+  Info.TotalLifetimeAccessDensity = 44;
+
+  llvm::SmallVector<llvm::memprof::CallStackId> CallStackIds = {0x123, 0x456};
+
+  llvm::SmallVector<llvm::memprof::CallStackId> CallSiteIds = {0x333, 0x444};
+
+  IndexedMemProfRecord Record;
+  for (const auto &CSId : CallStackIds) {
+    // Use the same info block for both allocation sites.
+    Record.AllocSites.emplace_back(llvm::SmallVector<FrameId>(), CSId, Info,
+                                   Schema);
+  }
+  Record.CallSiteIds.assign(CallSiteIds);
+
+  std::bitset<llvm::to_underlying(Meta::Size)> SchemaBitSet;
+  for (auto Id : Schema)
+    SchemaBitSet.set(llvm::to_underlying(Id));
+
+  // Verify that SchemaBitSet has the fields we expect and nothing else, which
+  // we check with count().
+  EXPECT_EQ(SchemaBitSet.count(), 4U);
+  EXPECT_TRUE(SchemaBitSet[llvm::to_underlying(Meta::AllocCount)]);
+  EXPECT_TRUE(SchemaBitSet[llvm::to_underlying(Meta::TotalSize)]);
+  EXPECT_TRUE(SchemaBitSet[llvm::to_underlying(Meta::TotalLifetime)]);
+  EXPECT_TRUE(
+      SchemaBitSet[llvm::to_underlying(Meta::TotalLifetimeAccessDensity)]);
+
+  // Verify that Schema has propagated all the way to the Info field in each
+  // IndexedAllocationInfo.
+  ASSERT_THAT(Record.AllocSites, ::SizeIs(2));
+  EXPECT_EQ(Record.AllocSites[0].Info.getSchema(), SchemaBitSet);
+  EXPECT_EQ(Record.AllocSites[1].Info.getSchema(), SchemaBitSet);
+
+  std::string Buffer;
+  llvm::raw_string_ostream OS(Buffer);
+  Record.serialize(Schema, OS, llvm::memprof::Version2);
+  OS.flush();
+
+  const IndexedMemProfRecord GotRecord = IndexedMemProfRecord::deserialize(
+      Schema, reinterpret_cast<const unsigned char *>(Buffer.data()),
+      llvm::memprof::Version2);
+
+  // Verify that Schema comes back correctly after deserialization. Technically,
+  // the comparison between Record and GotRecord below includes the comparison
+  // of their Schemas, but we'll verify the Schemas on our own.
+  ASSERT_THAT(GotRecord.AllocSites, ::SizeIs(2));
+  EXPECT_EQ(GotRecord.AllocSites[0].Info.getSchema(), SchemaBitSet);
+  EXPECT_EQ(GotRecord.AllocSites[1].Info.getSchema(), SchemaBitSet);
+
+  EXPECT_EQ(Record, GotRecord);
+}
+
 TEST(MemProf, SymbolizationFilter) {
   std::unique_ptr<MockSymbolizer> Symbolizer(new MockSymbolizer());
 
@@ -510,37 +562,15 @@ TEST(MemProf, IndexedMemProfRecordToMemProfRecord) {
   IndexedRecord.CallSiteIds.push_back(llvm::memprof::hashCallStack(CS3));
   IndexedRecord.CallSiteIds.push_back(llvm::memprof::hashCallStack(CS4));
 
-  bool CSIdMissing = false;
-  bool FrameIdMissing = false;
-
-  auto Callback = [&](CallStackId CSId) -> llvm::SmallVector<Frame> {
-    llvm::SmallVector<Frame> CallStack;
-    llvm::SmallVector<FrameId> FrameIds;
-
-    auto Iter = CallStackIdMap.find(CSId);
-    if (Iter == CallStackIdMap.end())
-      CSIdMissing = true;
-    else
-      FrameIds = Iter->second;
-
-    for (FrameId Id : FrameIds) {
-      Frame F(0, 0, 0, false);
-      auto Iter = FrameIdMap.find(Id);
-      if (Iter == FrameIdMap.end())
-        FrameIdMissing = true;
-      else
-        F = Iter->second;
-      CallStack.push_back(F);
-    }
-
-    return CallStack;
-  };
+  llvm::memprof::FrameIdConverter<decltype(FrameIdMap)> FrameIdConv(FrameIdMap);
+  llvm::memprof::CallStackIdConverter<decltype(CallStackIdMap)> CSIdConv(
+      CallStackIdMap, FrameIdConv);
 
-  MemProfRecord Record = IndexedRecord.toMemProfRecord(Callback);
+  MemProfRecord Record = IndexedRecord.toMemProfRecord(CSIdConv);
 
   // Make sure that all lookups are successful.
-  ASSERT_FALSE(CSIdMissing);
-  ASSERT_FALSE(FrameIdMissing);
+  ASSERT_EQ(FrameIdConv.LastUnmappedId, std::nullopt);
+  ASSERT_EQ(CSIdConv.LastUnmappedId, std::nullopt);
 
   // Verify the contents of Record.
   ASSERT_THAT(Record.AllocSites, SizeIs(2));
diff --git a/llvm/unittests/Support/YAMLIOTest.cpp b/llvm/unittests/Support/YAMLIOTest.cpp
index 401981f3841e..6ac0d1b412f0 100644
--- a/llvm/unittests/Support/YAMLIOTest.cpp
+++ b/llvm/unittests/Support/YAMLIOTest.cpp
@@ -2906,6 +2906,87 @@ TEST(YAMLIO, Numeric) {
 }
 
 //===----------------------------------------------------------------------===//
+//  Test writing and reading escaped keys
+//===----------------------------------------------------------------------===//
+
+// Struct with dynamic string key
+struct QuotedKeyStruct {
+  int unquoted_bool;
+  int unquoted_null;
+  int unquoted_numeric;
+  int unquoted_str;
+  int colon;
+  int just_space;
+  int unprintable;
+};
+
+namespace llvm {
+namespace yaml {
+template <> struct MappingTraits<QuotedKeyStruct> {
+  static void mapping(IO &io, QuotedKeyStruct &map) {
+    io.mapRequired("true", map.unquoted_bool);
+    io.mapRequired("null", map.unquoted_null);
+    io.mapRequired("42", map.unquoted_numeric);
+    io.mapRequired("unquoted", map.unquoted_str);
+    io.mapRequired(":", map.colon);
+    io.mapRequired(" ", map.just_space);
+    char unprintableKey[] = {/* \f, form-feed */ 0xC, 0};
+    io.mapRequired(unprintableKey, map.unprintable);
+  }
+};
+} // namespace yaml
+} // namespace llvm
+
+TEST(YAMLIO, TestQuotedKeyRead) {
+  QuotedKeyStruct map = {};
+  Input yin("---\ntrue:  1\nnull:  2\n42:  3\nunquoted:  4\n':':  5\n' ':  "
+            "6\n\"\\f\":  7\n...\n");
+  yin >> map;
+
+  EXPECT_FALSE(yin.error());
+  EXPECT_EQ(map.unquoted_bool, 1);
+  EXPECT_EQ(map.unquoted_null, 2);
+  EXPECT_EQ(map.unquoted_numeric, 3);
+  EXPECT_EQ(map.unquoted_str, 4);
+  EXPECT_EQ(map.colon, 5);
+  EXPECT_EQ(map.just_space, 6);
+  EXPECT_EQ(map.unprintable, 7);
+}
+
+TEST(YAMLIO, TestQuotedKeyWriteRead) {
+  std::string intermediate;
+  {
+    QuotedKeyStruct map = {1, 2, 3, 4, 5, 6, 7};
+    llvm::raw_string_ostream ostr(intermediate);
+    Output yout(ostr);
+    yout << map;
+  }
+
+  EXPECT_NE(std::string::npos, intermediate.find("true:"));
+  EXPECT_NE(std::string::npos, intermediate.find("null:"));
+  EXPECT_NE(std::string::npos, intermediate.find("42:"));
+  EXPECT_NE(std::string::npos, intermediate.find("unquoted:"));
+  EXPECT_NE(std::string::npos, intermediate.find("':':"));
+  EXPECT_NE(std::string::npos, intermediate.find("' '"));
+  EXPECT_NE(std::string::npos, intermediate.find("\"\\f\":"));
+
+  {
+    Input yin(intermediate);
+    QuotedKeyStruct map;
+    yin >> map;
+
+    EXPECT_FALSE(yin.error());
+    EXPECT_EQ(map.unquoted_bool, 1);
+    EXPECT_EQ(map.unquoted_null, 2);
+    EXPECT_EQ(map.unquoted_numeric, 3);
+    EXPECT_EQ(map.unquoted_str, 4);
+    EXPECT_EQ(map.colon, 5);
+    EXPECT_EQ(map.just_space, 6);
+    EXPECT_EQ(map.unprintable, 7);
+  }
+}
+
+//===----------------------------------------------------------------------===//
 //  Test PolymorphicTraits and TaggedScalarTraits
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index 81b7e2e527d9..3aa0178100ab 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -118,18 +118,24 @@ TEST(ParseArchString, RejectsUpperCase) {
 TEST(ParseArchString, RejectsInvalidBaseISA) {
   for (StringRef Input : {"rv32", "rv64", "rv65i"}) {
     EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
-              "string must begin with rv32{i,e,g} or rv64{i,e,g}");
+              "string must begin with rv32{i,e,g}, rv64{i,e,g}, or a supported "
+              "profile name");
   }
-  for (StringRef Input : {"rv32j", "rv64k", "rv32_i"}) {
+
+  for (StringRef Input : {"rv32j", "rv32_i"}) {
     EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
-              "first letter should be 'e', 'i' or 'g'");
+              "first letter after 'rv32' should be 'e', 'i' or 'g'");
   }
+
+  EXPECT_EQ(toString(RISCVISAInfo::parseArchString("rv64k", true).takeError()),
+            "first letter after 'rv64' should be 'e', 'i' or 'g'");
 }
 
 TEST(ParseArchString, RejectsUnsupportedBaseISA) {
   for (StringRef Input : {"rv128i", "rv128g"}) {
     EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
-              "string must begin with rv32{i,e,g} or rv64{i,e,g}");
+              "string must begin with rv32{i,e,g}, rv64{i,e,g}, or a supported "
+              "profile name");
   }
 }
 
@@ -137,7 +143,7 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
   auto MaybeRV32I = RISCVISAInfo::parseArchString("rv32i", true);
   ASSERT_THAT_EXPECTED(MaybeRV32I, Succeeded());
   RISCVISAInfo &InfoRV32I = **MaybeRV32I;
-  RISCVISAInfo::OrderedExtensionMap ExtsRV32I = InfoRV32I.getExtensions();
+  const auto &ExtsRV32I = InfoRV32I.getExtensions();
   EXPECT_EQ(ExtsRV32I.size(), 1UL);
   EXPECT_TRUE(ExtsRV32I.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
   EXPECT_EQ(InfoRV32I.getXLen(), 32U);
@@ -146,7 +152,7 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
   auto MaybeRV32E = RISCVISAInfo::parseArchString("rv32e", true);
   ASSERT_THAT_EXPECTED(MaybeRV32E, Succeeded());
   RISCVISAInfo &InfoRV32E = **MaybeRV32E;
-  RISCVISAInfo::OrderedExtensionMap ExtsRV32E = InfoRV32E.getExtensions();
+  const auto &ExtsRV32E = InfoRV32E.getExtensions();
   EXPECT_EQ(ExtsRV32E.size(), 1UL);
   EXPECT_TRUE(ExtsRV32E.at("e") == (RISCVISAUtils::ExtensionVersion{2, 0}));
   EXPECT_EQ(InfoRV32E.getXLen(), 32U);
@@ -155,7 +161,7 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
   auto MaybeRV32G = RISCVISAInfo::parseArchString("rv32g", true);
   ASSERT_THAT_EXPECTED(MaybeRV32G, Succeeded());
   RISCVISAInfo &InfoRV32G = **MaybeRV32G;
-  RISCVISAInfo::OrderedExtensionMap ExtsRV32G = InfoRV32G.getExtensions();
+  const auto &ExtsRV32G = InfoRV32G.getExtensions();
   EXPECT_EQ(ExtsRV32G.size(), 7UL);
   EXPECT_TRUE(ExtsRV32G.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
   EXPECT_TRUE(ExtsRV32G.at("m") == (RISCVISAUtils::ExtensionVersion{2, 0}));
@@ -171,7 +177,7 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
   auto MaybeRV64I = RISCVISAInfo::parseArchString("rv64i", true);
   ASSERT_THAT_EXPECTED(MaybeRV64I, Succeeded());
   RISCVISAInfo &InfoRV64I = **MaybeRV64I;
-  RISCVISAInfo::OrderedExtensionMap ExtsRV64I = InfoRV64I.getExtensions();
+  const auto &ExtsRV64I = InfoRV64I.getExtensions();
   EXPECT_EQ(ExtsRV64I.size(), 1UL);
   EXPECT_TRUE(ExtsRV64I.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
   EXPECT_EQ(InfoRV64I.getXLen(), 64U);
@@ -180,7 +186,7 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
   auto MaybeRV64E = RISCVISAInfo::parseArchString("rv64e", true);
   ASSERT_THAT_EXPECTED(MaybeRV64E, Succeeded());
   RISCVISAInfo &InfoRV64E = **MaybeRV64E;
-  RISCVISAInfo::OrderedExtensionMap ExtsRV64E = InfoRV64E.getExtensions();
+  const auto &ExtsRV64E = InfoRV64E.getExtensions();
   EXPECT_EQ(ExtsRV64E.size(), 1UL);
   EXPECT_TRUE(ExtsRV64E.at("e") == (RISCVISAUtils::ExtensionVersion{2, 0}));
   EXPECT_EQ(InfoRV64E.getXLen(), 64U);
@@ -189,7 +195,7 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
   auto MaybeRV64G = RISCVISAInfo::parseArchString("rv64g", true);
   ASSERT_THAT_EXPECTED(MaybeRV64G, Succeeded());
   RISCVISAInfo &InfoRV64G = **MaybeRV64G;
-  RISCVISAInfo::OrderedExtensionMap ExtsRV64G = InfoRV64G.getExtensions();
+  const auto &ExtsRV64G = InfoRV64G.getExtensions();
   EXPECT_EQ(ExtsRV64G.size(), 7UL);
   EXPECT_TRUE(ExtsRV64G.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
   EXPECT_TRUE(ExtsRV64G.at("m") == (RISCVISAUtils::ExtensionVersion{2, 0}));
@@ -241,7 +247,7 @@ TEST(ParseArchString, IgnoresUnrecognizedExtensionNamesWithIgnoreUnknown) {
     auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true, false, true);
     ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
     RISCVISAInfo &Info = **MaybeISAInfo;
-    RISCVISAInfo::OrderedExtensionMap Exts = Info.getExtensions();
+    const auto &Exts = Info.getExtensions();
     EXPECT_EQ(Exts.size(), 1UL);
     EXPECT_TRUE(Exts.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
   }
@@ -251,7 +257,7 @@ TEST(ParseArchString, IgnoresUnrecognizedExtensionNamesWithIgnoreUnknown) {
   auto MaybeISAInfo =
       RISCVISAInfo::parseArchString("rv32i_zbc1p0_xmadeup", true, false, true);
   ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
-  RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+  const auto &Exts = (*MaybeISAInfo)->getExtensions();
   EXPECT_TRUE(Exts.at("zbc") == (RISCVISAUtils::ExtensionVersion{1, 0}));
 }
 
@@ -259,13 +265,13 @@ TEST(ParseArchString, AcceptsVersionInLongOrShortForm) {
   for (StringRef Input : {"rv64i2p1"}) {
     auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
     ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
-    RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+    const auto &Exts = (*MaybeISAInfo)->getExtensions();
     EXPECT_TRUE(Exts.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
   }
   for (StringRef Input : {"rv32i_zfinx1", "rv32i_zfinx1p0"}) {
     auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
     ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
-    RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+    const auto &Exts = (*MaybeISAInfo)->getExtensions();
     EXPECT_TRUE(Exts.at("zfinx") == (RISCVISAUtils::ExtensionVersion{1, 0}));
   }
 }
@@ -293,14 +299,14 @@ TEST(ParseArchString,
   for (StringRef Input : {"rv32i0p1", "rv32i99p99", "rv64i0p1", "rv64i99p99"}) {
     auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true, false, true);
     ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
-    RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+    const auto &Exts = (*MaybeISAInfo)->getExtensions();
     EXPECT_EQ(Exts.size(), 1UL);
     EXPECT_TRUE(Exts.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
   }
   for (StringRef Input : {"rv32e0p1", "rv32e99p99", "rv64e0p1", "rv64e99p99"}) {
     auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true, false, true);
     ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
-    RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+    const auto &Exts = (*MaybeISAInfo)->getExtensions();
     EXPECT_EQ(Exts.size(), 1UL);
     EXPECT_TRUE(Exts.at("e") == (RISCVISAUtils::ExtensionVersion{2, 0}));
   }
@@ -311,7 +317,7 @@ TEST(ParseArchString,
   for (StringRef Input : {"rv32im1p1", "rv64i_svnapot10p9", "rv32i_zicsr0p5"}) {
     auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true, false, true);
     ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
-    RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+    const auto &Exts = (*MaybeISAInfo)->getExtensions();
     EXPECT_EQ(Exts.size(), 1UL);
     EXPECT_TRUE(Exts.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
   }
@@ -321,7 +327,7 @@ TEST(ParseArchString, AcceptsUnderscoreSplittingExtensions) {
   for (StringRef Input : {"rv32imafdczifencei", "rv32i_m_a_f_d_c_zifencei"}) {
     auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
     ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
-    RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+    const auto &Exts = (*MaybeISAInfo)->getExtensions();
     EXPECT_EQ(Exts.size(), 8UL);
     EXPECT_EQ(Exts.count("i"), 1U);
     EXPECT_EQ(Exts.count("m"), 1U);
@@ -339,7 +345,7 @@ TEST(ParseArchString, AcceptsRelaxSingleLetterExtensions) {
        {"rv32imfad", "rv32im_fa_d", "rv32im2p0fad", "rv32i2p1m2p0fad"}) {
     auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
     ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
-    RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+    const auto &Exts = (*MaybeISAInfo)->getExtensions();
     EXPECT_EQ(Exts.size(), 6UL);
     EXPECT_EQ(Exts.count("i"), 1U);
     EXPECT_EQ(Exts.count("m"), 1U);
@@ -356,7 +362,7 @@ TEST(ParseArchString, AcceptsRelaxMixedLetterExtensions) {
         "rv32i_zihintntl_mafd_svinval"}) {
     auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
     ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
-    RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+    const auto &Exts = (*MaybeISAInfo)->getExtensions();
     EXPECT_EQ(Exts.size(), 8UL);
     EXPECT_EQ(Exts.count("i"), 1U);
     EXPECT_EQ(Exts.count("m"), 1U);
@@ -373,7 +379,7 @@ TEST(ParseArchString, AcceptsAmbiguousFromRelaxExtensions) {
   for (StringRef Input : {"rv32i_zba_m", "rv32izba_m", "rv32izba1p0_m2p0"}) {
     auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
     ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
-    RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+    const auto &Exts = (*MaybeISAInfo)->getExtensions();
     EXPECT_EQ(Exts.size(), 3UL);
     EXPECT_EQ(Exts.count("i"), 1U);
     EXPECT_EQ(Exts.count("zba"), 1U);
@@ -383,7 +389,7 @@ TEST(ParseArchString, AcceptsAmbiguousFromRelaxExtensions) {
        {"rv32ia_zba_m", "rv32iazba_m", "rv32ia2p1zba1p0_m2p0"}) {
     auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
     ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
-    RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+    const auto &Exts = (*MaybeISAInfo)->getExtensions();
     EXPECT_EQ(Exts.size(), 4UL);
     EXPECT_EQ(Exts.count("i"), 1U);
     EXPECT_EQ(Exts.count("zba"), 1U);
@@ -395,7 +401,7 @@ TEST(ParseArchString, AcceptsAmbiguousFromRelaxExtensions) {
 TEST(ParseArchString, RejectsRelaxExtensionsNotStartWithEorIorG) {
   EXPECT_EQ(
       toString(RISCVISAInfo::parseArchString("rv32zba_im", true).takeError()),
-      "first letter should be 'e', 'i' or 'g'");
+      "first letter after 'rv32' should be 'e', 'i' or 'g'");
 }
 
 TEST(ParseArchString,
@@ -457,12 +463,12 @@ TEST(ParseArchString,
   // hopefully serve as a reminder to update.
   auto MaybeISAInfo = RISCVISAInfo::parseArchString("rv64iztso", true, false);
   ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
-  RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+  const auto &Exts = (*MaybeISAInfo)->getExtensions();
   EXPECT_EQ(Exts.size(), 2UL);
   EXPECT_EQ(Exts.count("ztso"), 1U);
   auto MaybeISAInfo2 = RISCVISAInfo::parseArchString("rv64iztso0p1", true);
   ASSERT_THAT_EXPECTED(MaybeISAInfo2, Succeeded());
-  RISCVISAInfo::OrderedExtensionMap Exts2 = (*MaybeISAInfo2)->getExtensions();
+  const auto &Exts2 = (*MaybeISAInfo2)->getExtensions();
   EXPECT_EQ(Exts2.size(), 2UL);
   EXPECT_EQ(Exts2.count("ztso"), 1U);
 }
@@ -479,7 +485,7 @@ TEST(ParseArchString,
   auto MaybeISAInfo =
       RISCVISAInfo::parseArchString("rv64iztso9p9", true, false);
   ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
-  RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+  const auto &Exts = (*MaybeISAInfo)->getExtensions();
   EXPECT_EQ(Exts.size(), 2UL);
   EXPECT_TRUE(Exts.at("ztso") == (RISCVISAUtils::ExtensionVersion{9, 9}));
 }
@@ -502,8 +508,7 @@ TEST(ParseArchString, AddsImpliedExtensions) {
   // Does not attempt to exhaustively test all implications.
   auto MaybeRV64ID = RISCVISAInfo::parseArchString("rv64id", true);
   ASSERT_THAT_EXPECTED(MaybeRV64ID, Succeeded());
-  RISCVISAInfo::OrderedExtensionMap ExtsRV64ID =
-      (*MaybeRV64ID)->getExtensions();
+  const auto &ExtsRV64ID = (*MaybeRV64ID)->getExtensions();
   EXPECT_EQ(ExtsRV64ID.size(), 4UL);
   EXPECT_EQ(ExtsRV64ID.count("i"), 1U);
   EXPECT_EQ(ExtsRV64ID.count("f"), 1U);
@@ -512,8 +517,7 @@ TEST(ParseArchString, AddsImpliedExtensions) {
 
   auto MaybeRV32IZKN = RISCVISAInfo::parseArchString("rv64izkn", true);
   ASSERT_THAT_EXPECTED(MaybeRV32IZKN, Succeeded());
-  RISCVISAInfo::OrderedExtensionMap ExtsRV32IZKN =
-      (*MaybeRV32IZKN)->getExtensions();
+  const auto &ExtsRV32IZKN = (*MaybeRV32IZKN)->getExtensions();
   EXPECT_EQ(ExtsRV32IZKN.size(), 8UL);
   EXPECT_EQ(ExtsRV32IZKN.count("i"), 1U);
   EXPECT_EQ(ExtsRV32IZKN.count("zbkb"), 1U);
@@ -603,7 +607,7 @@ TEST(ToFeatures, AddAllExtensionsAddsNegativeExtensions) {
 }
 
 TEST(OrderedExtensionMap, ExtensionsAreCorrectlyOrdered) {
-  RISCVISAInfo::OrderedExtensionMap Exts;
+  RISCVISAUtils::OrderedExtensionMap Exts;
   for (auto ExtName : {"y", "l", "m", "c", "i", "xfoo", "xbar", "sfoo", "sbar",
                        "zmfoo", "zzfoo", "zfinx", "zicsr"})
     Exts[ExtName] = {1, 0};
@@ -621,8 +625,7 @@ TEST(OrderedExtensionMap, ExtensionsAreCorrectlyOrdered) {
 TEST(ParseArchString, ZceImplication) {
   auto MaybeRV32IZce = RISCVISAInfo::parseArchString("rv32izce", true);
   ASSERT_THAT_EXPECTED(MaybeRV32IZce, Succeeded());
-  RISCVISAInfo::OrderedExtensionMap ExtsRV32IZce =
-      (*MaybeRV32IZce)->getExtensions();
+  const auto &ExtsRV32IZce = (*MaybeRV32IZce)->getExtensions();
   EXPECT_EQ(ExtsRV32IZce.size(), 7UL);
   EXPECT_EQ(ExtsRV32IZce.count("i"), 1U);
   EXPECT_EQ(ExtsRV32IZce.count("zicsr"), 1U);
@@ -634,8 +637,7 @@ TEST(ParseArchString, ZceImplication) {
 
   auto MaybeRV32IFZce = RISCVISAInfo::parseArchString("rv32ifzce", true);
   ASSERT_THAT_EXPECTED(MaybeRV32IFZce, Succeeded());
-  RISCVISAInfo::OrderedExtensionMap ExtsRV32IFZce =
-      (*MaybeRV32IFZce)->getExtensions();
+  const auto &ExtsRV32IFZce = (*MaybeRV32IFZce)->getExtensions();
   EXPECT_EQ(ExtsRV32IFZce.size(), 9UL);
   EXPECT_EQ(ExtsRV32IFZce.count("i"), 1U);
   EXPECT_EQ(ExtsRV32IFZce.count("zicsr"), 1U);
@@ -649,8 +651,7 @@ TEST(ParseArchString, ZceImplication) {
 
   auto MaybeRV32IDZce = RISCVISAInfo::parseArchString("rv32idzce", true);
   ASSERT_THAT_EXPECTED(MaybeRV32IDZce, Succeeded());
-  RISCVISAInfo::OrderedExtensionMap ExtsRV32IDZce =
-      (*MaybeRV32IDZce)->getExtensions();
+  const auto &ExtsRV32IDZce = (*MaybeRV32IDZce)->getExtensions();
   EXPECT_EQ(ExtsRV32IDZce.size(), 10UL);
   EXPECT_EQ(ExtsRV32IDZce.count("i"), 1U);
   EXPECT_EQ(ExtsRV32IDZce.count("zicsr"), 1U);
@@ -665,8 +666,7 @@ TEST(ParseArchString, ZceImplication) {
 
   auto MaybeRV64IZce = RISCVISAInfo::parseArchString("rv64izce", true);
   ASSERT_THAT_EXPECTED(MaybeRV64IZce, Succeeded());
-  RISCVISAInfo::OrderedExtensionMap ExtsRV64IZce =
-      (*MaybeRV64IZce)->getExtensions();
+  const auto &ExtsRV64IZce = (*MaybeRV64IZce)->getExtensions();
   EXPECT_EQ(ExtsRV64IZce.size(), 7UL);
   EXPECT_EQ(ExtsRV64IZce.count("i"), 1U);
   EXPECT_EQ(ExtsRV64IZce.count("zicsr"), 1U);
@@ -678,8 +678,7 @@ TEST(ParseArchString, ZceImplication) {
 
   auto MaybeRV64IFZce = RISCVISAInfo::parseArchString("rv64ifzce", true);
   ASSERT_THAT_EXPECTED(MaybeRV64IFZce, Succeeded());
-  RISCVISAInfo::OrderedExtensionMap ExtsRV64IFZce =
-      (*MaybeRV64IFZce)->getExtensions();
+  const auto &ExtsRV64IFZce = (*MaybeRV64IFZce)->getExtensions();
   EXPECT_EQ(ExtsRV64IFZce.size(), 8UL);
   EXPECT_EQ(ExtsRV64IFZce.count("i"), 1U);
   EXPECT_EQ(ExtsRV64IFZce.count("zicsr"), 1U);
@@ -698,8 +697,7 @@ TEST(ParseArchString, ZceImplication) {
 
   auto MaybeRV64IDZce = RISCVISAInfo::parseArchString("rv64idzce", true);
   ASSERT_THAT_EXPECTED(MaybeRV64IDZce, Succeeded());
-  RISCVISAInfo::OrderedExtensionMap ExtsRV64IDZce =
-      (*MaybeRV64IDZce)->getExtensions();
+  const auto &ExtsRV64IDZce = (*MaybeRV64IDZce)->getExtensions();
   EXPECT_EQ(ExtsRV64IDZce.size(), 9UL);
   EXPECT_EQ(ExtsRV64IDZce.count("i"), 1U);
   EXPECT_EQ(ExtsRV64IDZce.count("zicsr"), 1U);
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index 2c72a7229b52..816aea44a9bc 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1346,13 +1346,64 @@ INSTANTIATE_TEST_SUITE_P(
                  AArch64::AEK_PAUTH}),
             "9-A"),
         ARMCPUTestParams<AArch64::ExtensionBitset>(
+            "neoverse-v3", "armv9.2-a", "neon-fp-armv8",
+            AArch64::ExtensionBitset(
+                {AArch64::AEK_BF16,        AArch64::AEK_I8MM,
+                 AArch64::AEK_SVE,         AArch64::AEK_SVE2,
+                 AArch64::AEK_FP16,        AArch64::AEK_DOTPROD,
+                 AArch64::AEK_LSE,         AArch64::AEK_RDM,
+                 AArch64::AEK_SIMD,        AArch64::AEK_RCPC,
+                 AArch64::AEK_RAS,         AArch64::AEK_CRC,
+                 AArch64::AEK_FP,          AArch64::AEK_PROFILE,
+                 AArch64::AEK_MTE,         AArch64::AEK_SSBS,
+                 AArch64::AEK_SB,          AArch64::AEK_PREDRES,
+                 AArch64::AEK_LS64,        AArch64::AEK_BRBE,
+                 AArch64::AEK_PAUTH,       AArch64::AEK_FLAGM,
+                 AArch64::AEK_PERFMON,     AArch64::AEK_RAND,
+                 AArch64::AEK_SVE2BITPERM, AArch64::AEK_FP16FML,
+                 AArch64::AEK_PROFILE,     AArch64::AEK_JSCVT,
+                 AArch64::AEK_FCMA}),
+            "9.2-A"),
+        ARMCPUTestParams<AArch64::ExtensionBitset>(
+            "neoverse-v3ae", "armv9.2-a", "neon-fp-armv8",
+            AArch64::ExtensionBitset(
+                {AArch64::AEK_BF16,        AArch64::AEK_I8MM,
+                 AArch64::AEK_SVE,         AArch64::AEK_SVE2,
+                 AArch64::AEK_FP16,        AArch64::AEK_DOTPROD,
+                 AArch64::AEK_LSE,         AArch64::AEK_RDM,
+                 AArch64::AEK_SIMD,        AArch64::AEK_RCPC,
+                 AArch64::AEK_RAS,         AArch64::AEK_CRC,
+                 AArch64::AEK_FP,          AArch64::AEK_PROFILE,
+                 AArch64::AEK_MTE,         AArch64::AEK_SSBS,
+                 AArch64::AEK_SB,          AArch64::AEK_PREDRES,
+                 AArch64::AEK_LS64,        AArch64::AEK_BRBE,
+                 AArch64::AEK_PAUTH,       AArch64::AEK_FLAGM,
+                 AArch64::AEK_PERFMON,     AArch64::AEK_RAND,
+                 AArch64::AEK_SVE2BITPERM, AArch64::AEK_FP16FML,
+                 AArch64::AEK_PROFILE,     AArch64::AEK_JSCVT,
+                 AArch64::AEK_FCMA}),
+            "9.2-A"),
+        ARMCPUTestParams<AArch64::ExtensionBitset>(
             "cortex-r82", "armv8-r", "crypto-neon-fp-armv8",
             AArch64::ExtensionBitset(
                 {AArch64::AEK_CRC, AArch64::AEK_RDM, AArch64::AEK_SSBS,
                  AArch64::AEK_DOTPROD, AArch64::AEK_FP, AArch64::AEK_SIMD,
                  AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_RAS,
                  AArch64::AEK_RCPC, AArch64::AEK_LSE, AArch64::AEK_SB,
-                 AArch64::AEK_JSCVT, AArch64::AEK_FCMA, AArch64::AEK_PAUTH}),
+                 AArch64::AEK_JSCVT, AArch64::AEK_FCMA, AArch64::AEK_PAUTH,
+                 AArch64::AEK_FLAGM, AArch64::AEK_PERFMON,
+                 AArch64::AEK_PREDRES}),
+            "8-R"),
+        ARMCPUTestParams<AArch64::ExtensionBitset>(
+            "cortex-r82ae", "armv8-r", "crypto-neon-fp-armv8",
+            AArch64::ExtensionBitset(
+                {AArch64::AEK_CRC, AArch64::AEK_RDM, AArch64::AEK_SSBS,
+                 AArch64::AEK_DOTPROD, AArch64::AEK_FP, AArch64::AEK_SIMD,
+                 AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_RAS,
+                 AArch64::AEK_RCPC, AArch64::AEK_LSE, AArch64::AEK_SB,
+                 AArch64::AEK_JSCVT, AArch64::AEK_FCMA, AArch64::AEK_PAUTH,
+                 AArch64::AEK_FLAGM, AArch64::AEK_PERFMON,
+                 AArch64::AEK_PREDRES}),
             "8-R"),
         ARMCPUTestParams<AArch64::ExtensionBitset>(
             "cortex-x1", "armv8.2-a", "crypto-neon-fp-armv8",
@@ -1637,6 +1688,24 @@ INSTANTIATE_TEST_SUITE_P(
                  AArch64::AEK_FP16FML}),
             "9-A"),
         ARMCPUTestParams<AArch64::ExtensionBitset>(
+            "neoverse-n3", "armv9.2-a", "neon-fp-armv8",
+            AArch64::ExtensionBitset(
+                {AArch64::AEK_BF16,    AArch64::AEK_I8MM,
+                 AArch64::AEK_SVE,     AArch64::AEK_SVE2,
+                 AArch64::AEK_FP16,    AArch64::AEK_DOTPROD,
+                 AArch64::AEK_LSE,     AArch64::AEK_RDM,
+                 AArch64::AEK_SIMD,    AArch64::AEK_RCPC,
+                 AArch64::AEK_RAS,     AArch64::AEK_CRC,
+                 AArch64::AEK_FP,      AArch64::AEK_PROFILE,
+                 AArch64::AEK_MTE,     AArch64::AEK_SSBS,
+                 AArch64::AEK_SB,      AArch64::AEK_PREDRES,
+                 AArch64::AEK_FCMA,    AArch64::AEK_PAUTH,
+                 AArch64::AEK_FLAGM,   AArch64::AEK_PERFMON,
+                 AArch64::AEK_RAND,    AArch64::AEK_SVE2BITPERM,
+                 AArch64::AEK_FP16FML, AArch64::AEK_PROFILE,
+                 AArch64::AEK_JSCVT}),
+            "9.2-A"),
+        ARMCPUTestParams<AArch64::ExtensionBitset>(
             "ampere1", "armv8.6-a", "crypto-neon-fp-armv8",
             AArch64::ExtensionBitset(
                 {AArch64::AEK_CRC,  AArch64::AEK_FP,   AArch64::AEK_FP16,
@@ -1750,7 +1819,7 @@ INSTANTIATE_TEST_SUITE_P(
     ARMCPUTestParams<AArch64::ExtensionBitset>::PrintToStringParamName);
 
 // Note: number of CPUs includes aliases.
-static constexpr unsigned NumAArch64CPUArchs = 72;
+static constexpr unsigned NumAArch64CPUArchs = 76;
 
 TEST(TargetParserTest, testAArch64CPUArchList) {
   SmallVector<StringRef, NumAArch64CPUArchs> List;
diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp
index d7d0ea2c6a6e..a0119ed5159d 100644
--- a/llvm/unittests/Transforms/Utils/LocalTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp
@@ -732,6 +732,67 @@ TEST(Local, FindDbgUsers) {
   EXPECT_EQ(Vals.size(), 1u);
 }
 
+TEST(Local, FindDbgRecords) {
+  // DbgRecord copy of the FindDbgUsers test above.
+  LLVMContext Ctx;
+  std::unique_ptr<Module> M = parseIR(Ctx,
+                                      R"(
+  define dso_local void @fun(ptr %a) #0 !dbg !11 {
+  entry:
+    call void @llvm.dbg.assign(metadata ptr %a, metadata !16, metadata !DIExpression(), metadata !15, metadata ptr %a, metadata !DIExpression()), !dbg !19
+    ret void
+  }
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!2, !3, !9}
+  !llvm.ident = !{!10}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 17.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "test.cpp", directory: "/")
+  !2 = !{i32 7, !"Dwarf Version", i32 5}
+  !3 = !{i32 2, !"Debug Info Version", i32 3}
+  !4 = !{i32 1, !"wchar_size", i32 4}
+  !9 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+  !10 = !{!"clang version 17.0.0"}
+  !11 = distinct !DISubprogram(name: "fun", linkageName: "fun", scope: !1, file: !1, line: 1, type: !12, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !14)
+  !12 = !DISubroutineType(types: !13)
+  !13 = !{null}
+  !14 = !{}
+  !15 = distinct !DIAssignID()
+  !16 = !DILocalVariable(name: "x", scope: !11, file: !1, line: 2, type: !17)
+  !17 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !18, size: 64)
+  !18 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !19 = !DILocation(line: 0, scope: !11)
+  )");
+
+  bool BrokenDebugInfo = true;
+  verifyModule(*M, &errs(), &BrokenDebugInfo);
+  ASSERT_FALSE(BrokenDebugInfo);
+  bool NewDbgInfoFormat = UseNewDbgInfoFormat;
+  UseNewDbgInfoFormat = true;
+  M->convertToNewDbgValues();
+
+  Function &Fun = *cast<Function>(M->getNamedValue("fun"));
+  Value *Arg = Fun.getArg(0);
+
+  SmallVector<DbgVariableIntrinsic *> Users;
+  SmallVector<DbgVariableRecord *> Records;
+  // Arg (%a) is used twice by a single dbg_assign. Check findDbgUsers returns
+  // only 1 pointer to it rather than 2.
+  findDbgUsers(Users, Arg, &Records);
+  EXPECT_EQ(Users.size(), 0u);
+  EXPECT_EQ(Records.size(), 1u);
+
+  SmallVector<DbgValueInst *> Vals;
+  Records.clear();
+  // Arg (%a) is used twice by a single dbg_assign. Check findDbgValues returns
+  // only 1 pointer to it rather than 2.
+  findDbgValues(Vals, Arg, &Records);
+  EXPECT_EQ(Vals.size(), 0u);
+  EXPECT_EQ(Records.size(), 1u);
+  UseNewDbgInfoFormat = NewDbgInfoFormat;
+}
+
 TEST(Local, ReplaceAllDbgUsesWith) {
   using namespace llvm::dwarf;
 
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 64e9c06db3fe..eda4723f67b2 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -895,13 +895,16 @@ TEST(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) {
 
   IntegerType *Int32 = IntegerType::get(C, 32);
   FunctionType *FTy = FunctionType::get(Int32, false);
-  auto *Call = CallInst::Create(FTy, UndefValue::get(FTy));
+  Function *Fn = Function::Create(FTy, GlobalValue::ExternalLinkage, 0);
+  auto *Call = CallInst::Create(FTy, Fn);
   VPValue Op1;
   VPValue Op2;
+  VPValue CalledFn(Call->getCalledFunction());
   SmallVector<VPValue *, 2> Args;
   Args.push_back(&Op1);
   Args.push_back(&Op2);
-  VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end()), false);
+  Args.push_back(&CalledFn);
+  VPWidenCallRecipe Recipe(Call, make_range(Args.begin(), Args.end()), false);
   EXPECT_TRUE(isa<VPUser>(&Recipe));
   VPRecipeBase *BaseR = &Recipe;
   EXPECT_TRUE(isa<VPUser>(BaseR));
@@ -912,6 +915,7 @@ TEST(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) {
   EXPECT_EQ(&Recipe, VPV->getDefiningRecipe());
 
   delete Call;
+  delete Fn;
 }
 
 TEST(VPRecipeTest, CastVPWidenSelectRecipeToVPUserAndVPDef) {
@@ -1157,19 +1161,22 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
 
   {
     FunctionType *FTy = FunctionType::get(Int32, false);
-    auto *Call = CallInst::Create(FTy, UndefValue::get(FTy));
+    Function *Fn = Function::Create(FTy, GlobalValue::ExternalLinkage, 0);
+    auto *Call = CallInst::Create(FTy, Fn);
     VPValue Op1;
     VPValue Op2;
-    SmallVector<VPValue *, 2> Args;
+    VPValue CalledFn(Call->getCalledFunction());
+    SmallVector<VPValue *, 3> Args;
     Args.push_back(&Op1);
     Args.push_back(&Op2);
-    VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end()),
-                             false);
+    Args.push_back(&CalledFn);
+    VPWidenCallRecipe Recipe(Call, make_range(Args.begin(), Args.end()), false);
     EXPECT_TRUE(Recipe.mayHaveSideEffects());
     EXPECT_TRUE(Recipe.mayReadFromMemory());
     EXPECT_TRUE(Recipe.mayWriteToMemory());
     EXPECT_TRUE(Recipe.mayReadOrWriteMemory());
     delete Call;
+    delete Fn;
   }
 
   {
@@ -1181,11 +1188,12 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
     auto *Call = CallInst::Create(TheFn->getFunctionType(), TheFn);
     VPValue Op1;
     VPValue Op2;
-    SmallVector<VPValue *, 2> Args;
+    VPValue CalledFn(TheFn);
+    SmallVector<VPValue *, 3> Args;
     Args.push_back(&Op1);
     Args.push_back(&Op2);
-    VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end()),
-                             false);
+    Args.push_back(&CalledFn);
+    VPWidenCallRecipe Recipe(Call, make_range(Args.begin(), Args.end()), false);
     EXPECT_FALSE(Recipe.mayHaveSideEffects());
     EXPECT_FALSE(Recipe.mayReadFromMemory());
     EXPECT_FALSE(Recipe.mayWriteToMemory());
diff --git a/llvm/utils/LLVMVisualizers/llvm.natvis b/llvm/utils/LLVMVisualizers/llvm.natvis
index 0fc50f79466a..d83ae8013c51 100644
--- a/llvm/utils/LLVMVisualizers/llvm.natvis
+++ b/llvm/utils/LLVMVisualizers/llvm.natvis
@@ -92,11 +92,11 @@ For later versions of Visual Studio, no setup is required.
 
   <Type Name="llvm::PointerIntPair&lt;*&gt;">
     <DisplayString IncludeView="ptr">{($T1)(*(intptr_t *)Value.Data &amp; $T5::PointerBitMask)}</DisplayString>
-    <DisplayString IncludeView="int">{($T3)((*(intptr_t *)Value.Data &gt;&gt; $T5::IntShift) &amp; $T5::IntMask)}</DisplayString>
-    <DisplayString>{$T5::IntMask}: {($T1)(*(intptr_t *)Value.Data &amp; $T5::PointerBitMask)} [{($T3)((*(intptr_t *)Value.Data &gt;&gt; $T5::IntShift) &amp; $T5::IntMask)}]</DisplayString>
+    <DisplayString IncludeView="int">{((*(intptr_t *)Value.Data &gt;&gt; $T5::IntShift) &amp; $T5::IntMask)}</DisplayString>
+    <DisplayString>{$T5::IntMask}: {($T1)(*(intptr_t *)Value.Data &amp; $T5::PointerBitMask)} [{((*(intptr_t *)Value.Data &gt;&gt; $T5::IntShift) &amp; $T5::IntMask)}]</DisplayString>
     <Expand>
       <Item Name="[ptr]">($T1)(*(intptr_t *)Value.Data &amp; $T5::PointerBitMask)</Item>
-      <Item Name="[int]">($T3)((*(intptr_t *)Value.Data &gt;&gt; $T5::IntShift) &amp; $T5::IntMask)</Item>
+      <Item Name="[int]">((*(intptr_t *)Value.Data &gt;&gt; $T5::IntShift) &amp; $T5::IntMask)</Item>
     </Expand>
   </Type>
   <!-- PointerUnion types -->
diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
index db87ac3336c1..05aa146b5715 100644
--- a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
@@ -56,6 +56,17 @@ static void EmitARMTargetDef(RecordKeeper &RK, raw_ostream &OS) {
   for (const StringRef &Arch : ARMArchVals.keys())
     OS << "ARM_ARCHITECTURE(" << Arch << ")\n";
   OS << "\n#undef ARM_ARCHITECTURE\n\n";
+
+  // Emit information for each defined Extension; used to build ArmExtKind.
+  OS << "#ifndef ARM_EXTENSION\n"
+     << "#define ARM_EXTENSION(NAME, ENUM)\n"
+     << "#endif\n\n";
+  for (const Record *Rec : RK.getAllDerivedDefinitions("Extension")) {
+    StringRef Name = Rec->getValueAsString("Name");
+    std::string Enum = Rec->getValueAsString("ArchExtKindSpelling").upper();
+    OS << "ARM_EXTENSION(" << Name << ", " << Enum << ")\n";
+  }
+  OS << "\n#undef ARM_EXTENSION\n\n";
 }
 
 static TableGen::Emitter::Opt
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
index 7a5d2be3ae95..88d353e89a46 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
@@ -4246,7 +4246,7 @@ static TreePatternNodePtr PromoteXForms(TreePatternNodePtr N) {
 
 void CodeGenDAGPatterns::ParseOnePattern(
     Record *TheDef, TreePattern &Pattern, TreePattern &Result,
-    const std::vector<Record *> &InstImpResults) {
+    const std::vector<Record *> &InstImpResults, bool ShouldIgnore) {
 
   // Inline pattern fragments and expand multiple alternatives.
   Pattern.InlinePatternFragments();
@@ -4332,7 +4332,7 @@ void CodeGenDAGPatterns::ParseOnePattern(
         AddPatternToMatch(&Pattern,
                           PatternToMatch(TheDef, Preds, T, Temp.getOnlyTree(),
                                          InstImpResults, Complexity,
-                                         TheDef->getID()));
+                                         TheDef->getID(), ShouldIgnore));
     }
   } else {
     // Show a message about a dropped pattern with some info to make it
@@ -4378,7 +4378,8 @@ void CodeGenDAGPatterns::ParsePatterns() {
       FindPatternInputsAndOutputs(Pattern, Pattern.getTree(j), InstInputs,
                                   InstResults, InstImpResults);
 
-    ParseOnePattern(CurPattern, Pattern, Result, InstImpResults);
+    ParseOnePattern(CurPattern, Pattern, Result, InstImpResults,
+                    CurPattern->getValueAsBit("GISelShouldIgnore"));
   }
 }
 
@@ -4407,10 +4408,10 @@ void CodeGenDAGPatterns::ExpandHwModeBasedTypes() {
       return;
     }
 
-    PatternsToMatch.emplace_back(P.getSrcRecord(), P.getPredicates(),
-                                 std::move(NewSrc), std::move(NewDst),
-                                 P.getDstRegs(), P.getAddedComplexity(),
-                                 Record::getNewUID(Records), Check);
+    PatternsToMatch.emplace_back(
+        P.getSrcRecord(), P.getPredicates(), std::move(NewSrc),
+        std::move(NewDst), P.getDstRegs(), P.getAddedComplexity(),
+        Record::getNewUID(Records), P.getGISelShouldIgnore(), Check);
   };
 
   for (PatternToMatch &P : Copy) {
@@ -4781,6 +4782,7 @@ void CodeGenDAGPatterns::GenerateVariants() {
           Variant, PatternsToMatch[i].getDstPatternShared(),
           PatternsToMatch[i].getDstRegs(),
           PatternsToMatch[i].getAddedComplexity(), Record::getNewUID(Records),
+          PatternsToMatch[i].getGISelShouldIgnore(),
           PatternsToMatch[i].getHwModeFeatures());
     }
 
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
index 7fcd39a9e940..7f94db0b7d5d 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
@@ -1057,17 +1057,19 @@ class PatternToMatch {
   TreePatternNodePtr DstPattern; // Resulting pattern.
   std::vector<Record *> Dstregs; // Physical register defs being matched.
   std::string HwModeFeatures;
-  int AddedComplexity; // Add to matching pattern complexity.
-  unsigned ID;         // Unique ID for the record.
+  int AddedComplexity;    // Add to matching pattern complexity.
+  bool GISelShouldIgnore; // Should GlobalISel ignore importing this pattern.
+  unsigned ID;            // Unique ID for the record.
 
 public:
   PatternToMatch(Record *srcrecord, ListInit *preds, TreePatternNodePtr src,
                  TreePatternNodePtr dst, std::vector<Record *> dstregs,
-                 int complexity, unsigned uid, const Twine &hwmodefeatures = "")
+                 int complexity, unsigned uid, bool ignore,
+                 const Twine &hwmodefeatures = "")
       : SrcRecord(srcrecord), Predicates(preds), SrcPattern(src),
         DstPattern(dst), Dstregs(std::move(dstregs)),
         HwModeFeatures(hwmodefeatures.str()), AddedComplexity(complexity),
-        ID(uid) {}
+        GISelShouldIgnore(ignore), ID(uid) {}
 
   Record *getSrcRecord() const { return SrcRecord; }
   ListInit *getPredicates() const { return Predicates; }
@@ -1078,6 +1080,7 @@ public:
   const std::vector<Record *> &getDstRegs() const { return Dstregs; }
   StringRef getHwModeFeatures() const { return HwModeFeatures; }
   int getAddedComplexity() const { return AddedComplexity; }
+  bool getGISelShouldIgnore() const { return GISelShouldIgnore; }
   unsigned getID() const { return ID; }
 
   std::string getPredicateCheck() const;
@@ -1240,7 +1243,8 @@ private:
 
   void ParseOnePattern(Record *TheDef, TreePattern &Pattern,
                        TreePattern &Result,
-                       const std::vector<Record *> &InstImpResults);
+                       const std::vector<Record *> &InstImpResults,
+                       bool ShouldIgnore = false);
   void AddPatternToMatch(TreePattern *Pattern, PatternToMatch &&PTM);
   void FindPatternInputsAndOutputs(
       TreePattern &I, TreePatternNodePtr Pat,
diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
index dcecac4380ce..ff508d648733 100644
--- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -1352,7 +1352,7 @@ void llvm::EmitMatcherTable(Matcher *TheMatcher, const CodeGenDAGPatterns &CGP,
   MatcherEmitter.EmitHistogram(TheMatcher, OS);
 
   OS << "  #undef TARGET_VAL\n";
-  OS << "  SelectCodeCommon(N, MatcherTable,sizeof(MatcherTable));\n";
+  OS << "  SelectCodeCommon(N, MatcherTable, sizeof(MatcherTable));\n";
   OS << "}\n";
   EndEmitFunction(OS);
 
diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp
index f2504775d557..0439df8067ed 100644
--- a/llvm/utils/TableGen/DXILEmitter.cpp
+++ b/llvm/utils/TableGen/DXILEmitter.cpp
@@ -97,7 +97,7 @@ static ParameterKind getParameterKind(const Record *R) {
     if (R->getValueAsInt("isHalfOrFloat") || R->getValueAsInt("isI16OrI32")) {
       return ParameterKind::Overload;
     }
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   default:
     llvm_unreachable("Support for specified DXIL Type not yet implemented");
   }
@@ -272,7 +272,7 @@ static std::string getOverloadKindStr(const Record *R) {
         return "OverloadKind::I16 | OverloadKind::I32";
       }
     }
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   default:
     llvm_unreachable(
         "Support for specified parameter OverloadKind not yet implemented");
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index 25e302ce1ca4..cf7e4398741c 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -45,12 +45,10 @@
 #include "llvm/Support/CodeGenCoverage.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/SaveAndRestore.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
-#include <numeric>
 #include <string>
 
 using namespace llvm;
@@ -792,8 +790,8 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
                "nested predicate that uses operands");
         TreePattern *TP = Predicate.getOrigPatFragRecord();
         WaitingForNamedOperands = TP->getNumArgs();
-        for (unsigned i = 0; i < WaitingForNamedOperands; ++i)
-          StoreIdxForName[getScopedName(Call.Scope, TP->getArgName(i))] = i;
+        for (unsigned I = 0; I < WaitingForNamedOperands; ++I)
+          StoreIdxForName[getScopedName(Call.Scope, TP->getArgName(I))] = I;
       }
       InsnMatcher.addPredicate<GenericInstructionPredicateMatcher>(Predicate);
       continue;
@@ -834,6 +832,11 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
       return InsnMatcher;
     }
 
+    if (SrcGIOrNull->TheDef->getName() == "G_FRAME_INDEX") {
+      InsnMatcher.addOperand(OpIdx++, Src.getName(), TempOpIdx);
+      return InsnMatcher;
+    }
+
     // Special case because the operand order is changed from setcc. The
     // predicate operand needs to be swapped from the last operand to the first
     // source.
@@ -873,8 +876,8 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
     if (IsIntrinsic && !II)
       return failedImport("Expected IntInit containing intrinsic ID)");
 
-    for (unsigned i = 0; i != NumChildren; ++i) {
-      const TreePatternNode &SrcChild = Src.getChild(i);
+    for (unsigned I = 0; I != NumChildren; ++I) {
+      const TreePatternNode &SrcChild = Src.getChild(I);
 
       // We need to determine the meaning of a literal integer based on the
       // context. If this is a field required to be an immediate (such as an
@@ -883,19 +886,19 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
       // argument that is required to be an immediate, we should not emit an LLT
       // type check, and should not be looking for a G_CONSTANT defined
       // register.
-      bool OperandIsImmArg = SrcGIOrNull->isInOperandImmArg(i);
+      bool OperandIsImmArg = SrcGIOrNull->isInOperandImmArg(I);
 
       // SelectionDAG allows pointers to be represented with iN since it doesn't
       // distinguish between pointers and integers but they are different types
       // in GlobalISel. Coerce integers to pointers to address space 0 if the
       // context indicates a pointer.
       //
-      bool OperandIsAPointer = SrcGIOrNull->isInOperandAPointer(i);
+      bool OperandIsAPointer = SrcGIOrNull->isInOperandAPointer(I);
 
       if (IsIntrinsic) {
         // For G_INTRINSIC/G_INTRINSIC_W_SIDE_EFFECTS, the operand immediately
         // following the defs is an intrinsic ID.
-        if (i == 0) {
+        if (I == 0) {
           OperandMatcher &OM =
               InsnMatcher.addOperand(OpIdx++, SrcChild.getName(), TempOpIdx);
           OM.addPredicate<IntrinsicIDOperandMatcher>(II);
@@ -906,8 +909,8 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
         //
         // Note that we have to look at the i-1th parameter, because we don't
         // have the intrinsic ID in the intrinsic's parameter list.
-        OperandIsAPointer |= II->isParamAPointer(i - 1);
-        OperandIsImmArg |= II->isParamImmArg(i - 1);
+        OperandIsAPointer |= II->isParamAPointer(I - 1);
+        OperandIsImmArg |= II->isParamImmArg(I - 1);
       }
 
       if (auto Error =
@@ -962,9 +965,9 @@ Error GlobalISelEmitter::importChildMatcher(
     // The "name" of a non-leaf complex pattern (MY_PAT $op1, $op2) is
     // "MY_PAT:op1:op2" and the ones with same "name" represent same operand.
     std::string PatternName = std::string(SrcChild.getOperator()->getName());
-    for (unsigned i = 0; i < SrcChild.getNumChildren(); ++i) {
+    for (unsigned I = 0; I < SrcChild.getNumChildren(); ++I) {
       PatternName += ":";
-      PatternName += SrcChild.getChild(i).getName();
+      PatternName += SrcChild.getChild(I).getName();
     }
     SrcChildName = PatternName;
   }
@@ -1037,11 +1040,11 @@ Error GlobalISelEmitter::importChildMatcher(
               OM, SrcChild.getOperator(), TempOpIdx))
         return Error;
 
-      for (unsigned i = 0, e = SrcChild.getNumChildren(); i != e; ++i) {
-        auto &SubOperand = SrcChild.getChild(i);
+      for (unsigned I = 0, E = SrcChild.getNumChildren(); I != E; ++I) {
+        auto &SubOperand = SrcChild.getChild(I);
         if (!SubOperand.getName().empty()) {
           if (auto Error = Rule.defineComplexSubOperand(
-                  SubOperand.getName(), SrcChild.getOperator(), RendererID, i,
+                  SubOperand.getName(), SrcChild.getOperator(), RendererID, I,
                   SrcChildName))
             return Error;
         }
@@ -1223,10 +1226,16 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
     if (DstChild.getOperator()->getName() == "timm") {
       DstMIBuilder.addRenderer<CopyRenderer>(DstChild.getName());
       return InsertPt;
-    } else if (DstChild.getOperator()->getName() == "imm") {
+    }
+    if (DstChild.getOperator()->getName() == "tframeindex") {
+      DstMIBuilder.addRenderer<CopyRenderer>(DstChild.getName());
+      return InsertPt;
+    }
+    if (DstChild.getOperator()->getName() == "imm") {
       DstMIBuilder.addRenderer<CopyConstantAsImmRenderer>(DstChild.getName());
       return InsertPt;
-    } else if (DstChild.getOperator()->getName() == "fpimm") {
+    }
+    if (DstChild.getOperator()->getName() == "fpimm") {
       DstMIBuilder.addRenderer<CopyFConstantAsFPImmRenderer>(
           DstChild.getName());
       return InsertPt;
@@ -1739,7 +1748,7 @@ Error GlobalISelEmitter::importDefaultOperandRenderers(
 
     if (const DefInit *DefaultDefOp = dyn_cast<DefInit>(DefaultOp)) {
       std::optional<LLTCodeGen> OpTyOrNone = MVTToLLT(N.getSimpleType(0));
-      auto Def = DefaultDefOp->getDef();
+      auto *Def = DefaultDefOp->getDef();
       if (Def->getName() == "undef_tied_input") {
         unsigned TempRegID = M.allocateTempRegID();
         M.insertAction<MakeTempRegisterAction>(InsertPt, *OpTyOrNone,
@@ -2411,6 +2420,8 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
   for (const PatternToMatch &Pat : CGP.ptms()) {
     ++NumPatternTotal;
 
+    if (Pat.getGISelShouldIgnore())
+      continue; // skip without warning
     auto MatcherOrErr = runOnPattern(Pat);
 
     // The pattern analysis can fail, indicating an unsupported pattern.
@@ -2438,13 +2449,13 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
   }
 
   // Comparison function to order records by name.
-  auto orderByName = [](const Record *A, const Record *B) {
+  auto OrderByName = [](const Record *A, const Record *B) {
     return A->getName() < B->getName();
   };
 
   std::vector<Record *> ComplexPredicates =
       RK.getAllDerivedDefinitions("GIComplexOperandMatcher");
-  llvm::sort(ComplexPredicates, orderByName);
+  llvm::sort(ComplexPredicates, OrderByName);
 
   std::vector<StringRef> CustomRendererFns;
   transform(RK.getAllDerivedDefinitions("GICustomOperandRenderer"),
diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
index 26034e31ad8d..6784514032eb 100644
--- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This tablegen backend emits the include file needed by the target
-// parser to parse the RISC-V CPUs.
+// This tablegen backend emits the include file needed by RISCVTargetParser.cpp
+// and RISCVISAInfo.cpp to parse the RISC-V CPUs and extensions.
 //
 //===----------------------------------------------------------------------===//
 
@@ -17,21 +17,85 @@
 
 using namespace llvm;
 
+static StringRef getExtensionName(const Record *R) {
+  StringRef Name = R->getValueAsString("Name");
+  Name.consume_front("experimental-");
+  return Name;
+}
+
+static void printExtensionTable(raw_ostream &OS,
+                                const std::vector<Record *> &Extensions,
+                                bool Experimental) {
+  OS << "static const RISCVSupportedExtension Supported";
+  if (Experimental)
+    OS << "Experimental";
+  OS << "Extensions[] = {\n";
+
+  for (Record *R : Extensions) {
+    if (R->getValueAsBit("Experimental") != Experimental)
+      continue;
+
+    OS << "    {\"" << getExtensionName(R) << "\", {"
+       << R->getValueAsInt("MajorVersion") << ", "
+       << R->getValueAsInt("MinorVersion") << "}},\n";
+  }
+
+  OS << "};\n\n";
+}
+
+static void emitRISCVExtensions(RecordKeeper &Records, raw_ostream &OS) {
+  OS << "#ifdef GET_SUPPORTED_EXTENSIONS\n";
+  OS << "#undef GET_SUPPORTED_EXTENSIONS\n\n";
+
+  std::vector<Record *> Extensions =
+      Records.getAllDerivedDefinitions("RISCVExtension");
+  llvm::sort(Extensions, [](const Record *Rec1, const Record *Rec2) {
+    return getExtensionName(Rec1) < getExtensionName(Rec2);
+  });
+
+  printExtensionTable(OS, Extensions, /*Experimental=*/false);
+  printExtensionTable(OS, Extensions, /*Experimental=*/true);
+
+  OS << "#endif // GET_SUPPORTED_EXTENSIONS\n\n";
+
+  OS << "#ifdef GET_IMPLIED_EXTENSIONS\n";
+  OS << "#undef GET_IMPLIED_EXTENSIONS\n\n";
+
+  OS << "\nstatic constexpr ImpliedExtsEntry ImpliedExts[] = {\n";
+  for (Record *Ext : Extensions) {
+    auto ImpliesList = Ext->getValueAsListOfDefs("Implies");
+    if (ImpliesList.empty())
+      continue;
+
+    StringRef Name = getExtensionName(Ext);
+
+    for (auto *ImpliedExt : ImpliesList) {
+      if (!ImpliedExt->isSubClassOf("RISCVExtension"))
+        continue;
+
+      OS << "    { {\"" << Name << "\"}, \"" << getExtensionName(ImpliedExt)
+         << "\"},\n";
+    }
+  }
+
+  OS << "};\n\n";
+
+  OS << "#endif // GET_IMPLIED_EXTENSIONS\n\n";
+}
+
 // We can generate march string from target features as what has been described
 // in RISC-V ISA specification (version 20191213) 'Chapter 27. ISA Extension
 // Naming Conventions'.
 //
 // This is almost the same as RISCVFeatures::parseFeatureBits, except that we
 // get feature name from feature records instead of feature bits.
-static void printMArch(raw_ostream &OS, const Record &Rec) {
-  std::map<std::string, RISCVISAUtils::ExtensionVersion,
-           RISCVISAUtils::ExtensionComparator>
-      Extensions;
+static void printMArch(raw_ostream &OS, const std::vector<Record *> &Features) {
+  RISCVISAUtils::OrderedExtensionMap Extensions;
   unsigned XLen = 0;
 
   // Convert features to FeatureVector.
-  for (auto *Feature : Rec.getValueAsListOfDefs("Features")) {
-    StringRef FeatureName = Feature->getValueAsString("Name");
+  for (auto *Feature : Features) {
+    StringRef FeatureName = getExtensionName(Feature);
     if (Feature->isSubClassOf("RISCVExtension")) {
       unsigned Major = Feature->getValueAsInt("MajorVersion");
       unsigned Minor = Feature->getValueAsInt("MinorVersion");
@@ -54,22 +118,42 @@ static void printMArch(raw_ostream &OS, const Record &Rec) {
     OS << LS << Ext.first << Ext.second.Major << 'p' << Ext.second.Minor;
 }
 
-static void EmitRISCVTargetDef(RecordKeeper &RK, raw_ostream &OS) {
+static void emitRISCVProfiles(RecordKeeper &Records, raw_ostream &OS) {
+  OS << "#ifdef GET_SUPPORTED_PROFILES\n";
+  OS << "#undef GET_SUPPORTED_PROFILES\n\n";
+
+  OS << "static constexpr RISCVProfile SupportedProfiles[] = {\n";
+
+  auto Profiles = Records.getAllDerivedDefinitions("RISCVProfile");
+  llvm::sort(Profiles, LessRecordFieldName());
+
+  for (const Record *Rec : Profiles) {
+    OS.indent(4) << "{\"" << Rec->getValueAsString("Name") << "\",\"";
+    printMArch(OS, Rec->getValueAsListOfDefs("Implies"));
+    OS << "\"},\n";
+  }
+
+  OS << "};\n\n";
+
+  OS << "#endif // GET_SUPPORTED_PROFILES\n\n";
+}
+
+static void emitRISCVProcs(RecordKeeper &RK, raw_ostream &OS) {
   OS << "#ifndef PROC\n"
      << "#define PROC(ENUM, NAME, DEFAULT_MARCH, FAST_UNALIGNED_ACCESS)\n"
      << "#endif\n\n";
 
   // Iterate on all definition records.
   for (const Record *Rec : RK.getAllDerivedDefinitions("RISCVProcessorModel")) {
-    bool FastScalarUnalignedAccess =
-        any_of(Rec->getValueAsListOfDefs("Features"), [&](auto &Feature) {
-          return Feature->getValueAsString("Name") == "unaligned-scalar-mem";
-        });
+    const std::vector<Record *> &Features =
+        Rec->getValueAsListOfDefs("Features");
+    bool FastScalarUnalignedAccess = any_of(Features, [&](auto &Feature) {
+      return Feature->getValueAsString("Name") == "unaligned-scalar-mem";
+    });
 
-    bool FastVectorUnalignedAccess =
-        any_of(Rec->getValueAsListOfDefs("Features"), [&](auto &Feature) {
-          return Feature->getValueAsString("Name") == "unaligned-vector-mem";
-        });
+    bool FastVectorUnalignedAccess = any_of(Features, [&](auto &Feature) {
+      return Feature->getValueAsString("Name") == "unaligned-vector-mem";
+    });
 
     bool FastUnalignedAccess =
         FastScalarUnalignedAccess && FastVectorUnalignedAccess;
@@ -81,7 +165,7 @@ static void EmitRISCVTargetDef(RecordKeeper &RK, raw_ostream &OS) {
 
     // Compute MArch from features if we don't specify it.
     if (MArch.empty())
-      printMArch(OS, *Rec);
+      printMArch(OS, Features);
     else
       OS << MArch;
     OS << "\"}, " << FastUnalignedAccess << ")\n";
@@ -101,5 +185,12 @@ static void EmitRISCVTargetDef(RecordKeeper &RK, raw_ostream &OS) {
   OS << "\n#undef TUNE_PROC\n";
 }
 
+static void EmitRISCVTargetDef(RecordKeeper &RK, raw_ostream &OS) {
+  emitRISCVExtensions(RK, OS);
+  emitRISCVProfiles(RK, OS);
+  emitRISCVProcs(RK, OS);
+}
+
 static TableGen::Emitter::Opt X("gen-riscv-target-def", EmitRISCVTargetDef,
-                                "Generate the list of CPU for RISCV");
+                                "Generate the list of CPUs and extensions for "
+                                "RISC-V");
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/modernize/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/modernize/BUILD.gn
index c9e081383fa0..0d27b786da1f 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/modernize/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/modernize/BUILD.gn
@@ -25,6 +25,7 @@ static_library("modernize") {
     "MakeSharedCheck.cpp",
     "MakeSmartPtrCheck.cpp",
     "MakeUniqueCheck.cpp",
+    "MinMaxUseInitializerListCheck.cpp",
     "ModernizeTidyModule.cpp",
     "PassByValueCheck.cpp",
     "RawStringLiteralCheck.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn
index 59dc38c8c4d8..815c5a93c72f 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn
@@ -35,6 +35,7 @@ static_library("readability") {
     "IsolateDeclarationCheck.cpp",
     "MagicNumbersCheck.cpp",
     "MakeMemberFunctionConstCheck.cpp",
+    "MathMissingParenthesesCheck.cpp",
     "MisleadingIndentationCheck.cpp",
     "MisplacedArrayIndexCheck.cpp",
     "NamedParameterCheck.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index 25fcdc4d0015..971ceb3185ff 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -312,6 +312,7 @@ copy("Headers") {
     "xsavesintrin.h",
     "xtestintrin.h",
     "yvals_core.h",
+    "zos_wrappers/builtins.h",
   ]
   outputs = [ "$clang_resource_dir/include/{{source_target_relative}}" ]
 }
diff --git a/llvm/utils/gn/secondary/lldb/include/lldb/API/BUILD.gn b/llvm/utils/gn/secondary/lldb/include/lldb/API/BUILD.gn
new file mode 100644
index 000000000000..18726255e6a4
--- /dev/null
+++ b/llvm/utils/gn/secondary/lldb/include/lldb/API/BUILD.gn
@@ -0,0 +1,11 @@
+import("//lldb/utils/TableGen/lldb_tablegen.gni")
+
+lldb_tablegen("SBLanguages") {
+  args = [ "-gen-lldb-sbapi-dwarf-enum" ]
+
+  # See discussion on https://github.com/llvm/llvm-project/pull/89981 for
+  # why this runs tblgen on a .def file.
+  td_file = "//llvm/include/llvm/BinaryFormat/Dwarf.def"
+  output_name = "SBLanguages.h"
+}
+
diff --git a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn
index 3415900ac291..c99c1b548335 100644
--- a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn
@@ -17,6 +17,7 @@ target(liblldb_type, "liblldb") {
 
   output_name = "liblldb"  # XXX lib prefix?
   configs += [ "//llvm/utils/gn/build:lldb_code" ]
+  public_deps = [ "//lldb/include/lldb/API:SBLanguages" ]
   deps = [
     "//lldb/include/lldb/Host:Config",
     "//lldb/source/Breakpoint",
diff --git a/llvm/utils/gn/secondary/lldb/utils/TableGen/BUILD.gn b/llvm/utils/gn/secondary/lldb/utils/TableGen/BUILD.gn
index 37305d1de66c..bc9f6903a288 100644
--- a/llvm/utils/gn/secondary/lldb/utils/TableGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/utils/TableGen/BUILD.gn
@@ -6,6 +6,7 @@ executable("lldb-tblgen") {
   sources = [
     "LLDBOptionDefEmitter.cpp",
     "LLDBPropertyDefEmitter.cpp",
+    "LLDBSBAPIDWARFEnum.cpp",
     "LLDBTableGen.cpp",
     "LLDBTableGenUtils.cpp",
   ]
diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
index b252b9fc41dd..43ebd471cbc3 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
@@ -63,7 +63,6 @@ static_library("CodeGen") {
     "FaultMaps.cpp",
     "FinalizeISel.cpp",
     "FixupStatepointCallerSaved.cpp",
-    "FreeMachineFunction.cpp",
     "FuncletLayout.cpp",
     "GCEmptyBasicBlocks.cpp",
     "GCMetadata.cpp",
@@ -133,6 +132,7 @@ static_library("CodeGen") {
     "MachineDominators.cpp",
     "MachineFrameInfo.cpp",
     "MachineFunction.cpp",
+    "MachineFunctionAnalysis.cpp",
     "MachineFunctionPass.cpp",
     "MachineFunctionPrinterPass.cpp",
     "MachineFunctionSplitter.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
index cae491a34331..7e873532b9ab 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
@@ -50,6 +50,7 @@ static_library("LLVMHexagonCodeGen") {
     "HexagonCommonGEP.cpp",
     "HexagonConstExtenders.cpp",
     "HexagonConstPropagation.cpp",
+    "HexagonCopyHoisting.cpp",
     "HexagonCopyToCombine.cpp",
     "HexagonEarlyIfConv.cpp",
     "HexagonExpandCondsets.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn
index a8d6290f1b99..2ece91331c5d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn
@@ -37,6 +37,7 @@ static_library("LLVMWebAssemblyCodeGen") {
     "WebAssemblyAsmPrinter.cpp",
     "WebAssemblyCFGSort.cpp",
     "WebAssemblyCFGStackify.cpp",
+    "WebAssemblyCleanCodeAfterTrap.cpp",
     "WebAssemblyDebugFixup.cpp",
     "WebAssemblyDebugValueManager.cpp",
     "WebAssemblyExceptionInfo.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
index 3257f4b5ff23..826dcf4e6ee9 100644
--- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
@@ -59,9 +59,11 @@ write_lit_config("lit_site_cfg") {
     # LLVM_HOST_TRIPLE.)
     "HOST_LDFLAGS=",
 
+    "LLVM_APPEND_VC_REV=0",
     "LLVM_ENABLE_FFI=0",
     "LLVM_ENABLE_HTTPLIB=0",
     "LLVM_ENABLE_ZSTD=0",
+    "LLVM_FORCE_VC_REVISION=",
     "LLVM_HAVE_OPT_VIEWER_MODULES=0",
     "LLVM_HOST_TRIPLE=$llvm_current_triple",
     "LLVM_INCLUDE_DXIL_TESTS=0",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/BinaryFormat/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/BinaryFormat/BUILD.gn
index 099b4cf80add..715baee1e3a0 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/BinaryFormat/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/BinaryFormat/BUILD.gn
@@ -7,6 +7,7 @@ unittest("BinaryFormatTests") {
   ]
   sources = [
     "DwarfTest.cpp",
+    "ELFTest.cpp",
     "MachOTest.cpp",
     "MsgPackDocumentTest.cpp",
     "MsgPackReaderTest.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/MIR/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/MIR/BUILD.gn
index 664d02a30a73..7b5e65c2334d 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/MIR/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/MIR/BUILD.gn
@@ -14,8 +14,5 @@ unittest("MIRTests") {
     "//llvm/lib/TargetParser",
     "//llvm/lib/Testing/Support",
   ]
-  sources = [
-    "MachineMetadata.cpp",
-    "PassBuilderCallbacksTest.cpp",
-  ]
+  sources = [ "MachineMetadata.cpp" ]
 }
diff --git a/llvm/utils/lit/lit/builtin_commands/cat.py b/llvm/utils/lit/lit/builtin_commands/cat.py
index 37f55c0aef21..6fb2152ef933 100644
--- a/llvm/utils/lit/lit/builtin_commands/cat.py
+++ b/llvm/utils/lit/lit/builtin_commands/cat.py
@@ -55,10 +55,24 @@ def main(argv):
             msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
     for filename in filenames:
         try:
-            fileToCat = open(filename, "rb")
-            contents = fileToCat.read()
+            contents = None
+            is_text = False
+            try:
+                if sys.platform != "win32":
+                    fileToCat = open(filename, "r")
+                    contents = fileToCat.read()
+                    is_text = True
+            except:
+                pass
+
+            if contents is None:
+                fileToCat = open(filename, "rb")
+                contents = fileToCat.read()
+
             if show_nonprinting:
                 contents = convertToCaretAndMNotation(contents)
+            elif is_text:
+                contents = contents.encode()
             writer.write(contents)
             sys.stdout.flush()
             fileToCat.close()
diff --git a/llvm/utils/lit/lit/llvm/config.py b/llvm/utils/lit/lit/llvm/config.py
index 96b4f7bc8677..1d4babc99984 100644
--- a/llvm/utils/lit/lit/llvm/config.py
+++ b/llvm/utils/lit/lit/llvm/config.py
@@ -57,6 +57,13 @@ class LLVMConfig(object):
                 self.lit_config.note("using lit tools: {}".format(path))
                 lit_path_displayed = True
 
+        if platform.system() == "OS/390":
+            self.with_environment("_BPXK_AUTOCVT", "ON")
+            self.with_environment("_TAG_REDIR_IN", "TXT")
+            self.with_environment("_TAG_REDIR_OUT", "TXT")
+            self.with_environment("_TAG_REDIR_ERR", "TXT")
+            self.with_environment("_CEE_RUNOPTS", "FILETAG(AUTOCVT,AUTOTAG) POSIX(ON)")
+
         # Choose between lit's internal shell pipeline runner and a real shell.
         # If LIT_USE_INTERNAL_SHELL is in the environment, we use that as an
         # override.
diff --git a/llvm/utils/release/test-release.sh b/llvm/utils/release/test-release.sh
index 4314b565e11b..050004aa08c4 100755
--- a/llvm/utils/release/test-release.sh
+++ b/llvm/utils/release/test-release.sh
@@ -353,8 +353,7 @@ function build_with_cmake_cache() {
   env CC="$c_compiler" CXX="$cxx_compiler" \
   cmake -G "$generator" -B $CMakeBuildDir -S $SrcDir/llvm \
         -C $SrcDir/clang/cmake/caches/Release.cmake \
-	-DCLANG_BOOTSTRAP_PASSTHROUGH="CMAKE_POSITION_INDEPENDENT_CODE;LLVM_LIT_ARGS" \
-        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+	-DCLANG_BOOTSTRAP_PASSTHROUGH="LLVM_LIT_ARGS" \
         -DLLVM_LIT_ARGS="-j $NumJobs $LitVerbose" \
         $ExtraConfigureFlags
         2>&1 | tee $LogDir/llvm.configure-$Flavor.log
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index 5c4301af040b..4c0ef8387b8d 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -185,10 +185,13 @@ include_directories( ${MLIR_INCLUDE_DIR})
 add_subdirectory(tools/mlir-linalg-ods-gen)
 add_subdirectory(tools/mlir-pdll)
 add_subdirectory(tools/mlir-tblgen)
+add_subdirectory(tools/mlir-src-sharder)
 set(MLIR_TABLEGEN_EXE "${MLIR_TABLEGEN_EXE}" CACHE INTERNAL "")
 set(MLIR_TABLEGEN_TARGET "${MLIR_TABLEGEN_TARGET}" CACHE INTERNAL "")
 set(MLIR_PDLL_TABLEGEN_EXE "${MLIR_PDLL_TABLEGEN_EXE}" CACHE INTERNAL "")
 set(MLIR_PDLL_TABLEGEN_TARGET "${MLIR_PDLL_TABLEGEN_TARGET}" CACHE INTERNAL "")
+set(MLIR_SRC_SHARDER_TABLEGEN_EXE "${MLIR_SRC_SHARDER_TABLEGEN_EXE}" CACHE INTERNAL "")
+set(MLIR_SRC_SHARDER_TABLEGEN_TARGET "${MLIR_SRC_SHARDER_TABLEGEN_TARGET}" CACHE INTERNAL "")
 
 add_subdirectory(include/mlir)
 add_subdirectory(lib)
diff --git a/mlir/cmake/modules/AddMLIR.cmake b/mlir/cmake/modules/AddMLIR.cmake
index 1d2ed748bc2f..afb74fb2d000 100644
--- a/mlir/cmake/modules/AddMLIR.cmake
+++ b/mlir/cmake/modules/AddMLIR.cmake
@@ -5,6 +5,28 @@ function(mlir_tablegen ofn)
   tablegen(MLIR ${ARGV})
   set(TABLEGEN_OUTPUT ${TABLEGEN_OUTPUT} ${CMAKE_CURRENT_BINARY_DIR}/${ofn}
       PARENT_SCOPE)
+
+  # Get the current set of include paths for this td file.
+  cmake_parse_arguments(ARG "" "" "DEPENDS;EXTRA_INCLUDES" ${ARGN})
+  get_directory_property(tblgen_includes INCLUDE_DIRECTORIES)
+  list(APPEND tblgen_includes ${ARG_EXTRA_INCLUDES})
+  # Filter out any empty include items.
+  list(REMOVE_ITEM tblgen_includes "")
+
+  # Build the absolute path for the current input file.
+  if (IS_ABSOLUTE ${LLVM_TARGET_DEFINITIONS})
+    set(LLVM_TARGET_DEFINITIONS_ABSOLUTE ${LLVM_TARGET_DEFINITIONS})
+  else()
+    set(LLVM_TARGET_DEFINITIONS_ABSOLUTE ${CMAKE_CURRENT_SOURCE_DIR}/${LLVM_TARGET_DEFINITIONS})
+  endif()
+
+  # Append the includes used for this file to the tablegen_compile_commands
+  # file.
+  file(APPEND ${CMAKE_BINARY_DIR}/tablegen_compile_commands.yml
+      "--- !FileInfo:\n"
+      "  filepath: \"${LLVM_TARGET_DEFINITIONS_ABSOLUTE}\"\n"
+      "  includes: \"${CMAKE_CURRENT_SOURCE_DIR};${tblgen_includes}\"\n"
+  )
 endfunction()
 
 # Clear out any pre-existing compile_commands file before processing. This
@@ -149,6 +171,22 @@ function(add_mlir_dialect dialect dialect_namespace)
   add_dependencies(mlir-headers MLIR${dialect}IncGen)
 endfunction()
 
+# Declare sharded dialect operation declarations and definitions
+function(add_sharded_ops ops_target shard_count)
+  set(LLVM_TARGET_DEFINITIONS ${ops_target}.td)
+  mlir_tablegen(${ops_target}.h.inc -gen-op-decls -op-shard-count=${shard_count})
+  mlir_tablegen(${ops_target}.cpp.inc -gen-op-defs -op-shard-count=${shard_count})
+  set(LLVM_TARGET_DEFINITIONS ${ops_target}.cpp)
+  foreach(index RANGE ${shard_count})
+    set(SHARDED_SRC ${ops_target}.${index}.cpp)
+    list(APPEND SHARDED_SRCS ${SHARDED_SRC})
+    tablegen(MLIR_SRC_SHARDER ${SHARDED_SRC} -op-shard-index=${index})
+    set(TABLEGEN_OUTPUT ${TABLEGEN_OUTPUT} ${CMAKE_CURRENT_BINARY_DIR}/${SHARDED_SRC})
+  endforeach()
+  add_public_tablegen_target(MLIR${ops_target}ShardGen)
+  set(SHARDED_SRCS ${SHARDED_SRCS} PARENT_SCOPE)
+endfunction()
+
 # Declare a dialect in the include directory
 function(add_mlir_interface interface)
   set(LLVM_TARGET_DEFINITIONS ${interface}.td)
diff --git a/mlir/cmake/modules/CMakeLists.txt b/mlir/cmake/modules/CMakeLists.txt
index 8d2904ef46df..3ac1c79b090e 100644
--- a/mlir/cmake/modules/CMakeLists.txt
+++ b/mlir/cmake/modules/CMakeLists.txt
@@ -39,6 +39,7 @@ set(MLIR_CONFIG_INCLUDE_DIRS
 # Refer to the best host mlir-tbgen, which might be a host-optimized version
 set(MLIR_CONFIG_TABLEGEN_EXE "${MLIR_TABLEGEN_EXE}")
 set(MLIR_CONFIG_PDLL_TABLEGEN_EXE "${MLIR_PDLL_TABLEGEN_EXE}")
+set(MLIR_CONFIG_SRC_SHARDER_TABLEGEN_EXE "${MLIR_SRC_SHARDER_TABLEGEN_EXE}")
 
 configure_file(
   ${CMAKE_CURRENT_SOURCE_DIR}/MLIRConfig.cmake.in
@@ -77,6 +78,7 @@ set(MLIR_CONFIG_INCLUDE_DIRS
 # if we're building with a host-optimized mlir-tblgen (with LLVM_OPTIMIZED_TABLEGEN).
 set(MLIR_CONFIG_TABLEGEN_EXE mlir-tblgen)
 set(MLIR_CONFIG_PDLL_TABLEGEN_EXE mlir-pdll)
+set(MLIR_CONFIG_SRC_SHARDER_TABLEGEN_EXE mlir-src-sharder)
 
 configure_file(
   ${CMAKE_CURRENT_SOURCE_DIR}/MLIRConfig.cmake.in
diff --git a/mlir/cmake/modules/MLIRConfig.cmake.in b/mlir/cmake/modules/MLIRConfig.cmake.in
index d4da3cd98cce..7076d94a32f2 100644
--- a/mlir/cmake/modules/MLIRConfig.cmake.in
+++ b/mlir/cmake/modules/MLIRConfig.cmake.in
@@ -11,6 +11,7 @@ set(MLIR_CMAKE_DIR "@MLIR_CONFIG_CMAKE_DIR@")
 set(MLIR_INCLUDE_DIRS "@MLIR_CONFIG_INCLUDE_DIRS@")
 set(MLIR_TABLEGEN_EXE "@MLIR_CONFIG_TABLEGEN_EXE@")
 set(MLIR_PDLL_TABLEGEN_EXE "@MLIR_CONFIG_PDLL_TABLEGEN_EXE@")
+set(MLIR_SRC_SHARDER_TABLEGEN_EXE "@MLIR_CONFIG_SRC_SHARDER_TABLEGEN_EXE@")
 set(MLIR_INSTALL_AGGREGATE_OBJECTS "@MLIR_INSTALL_AGGREGATE_OBJECTS@")
 set(MLIR_ENABLE_BINDINGS_PYTHON "@MLIR_ENABLE_BINDINGS_PYTHON@")
 set(MLIR_ENABLE_EXECUTION_ENGINE "@MLIR_ENABLE_EXECUTION_ENGINE@")
diff --git a/mlir/docs/DefiningDialects/Operations.md b/mlir/docs/DefiningDialects/Operations.md
index 729393d53626..79a0cc55f138 100644
--- a/mlir/docs/DefiningDialects/Operations.md
+++ b/mlir/docs/DefiningDialects/Operations.md
@@ -1114,6 +1114,100 @@ void process(AddOp op, ArrayRef<Value> newOperands) {
 }
 ```
 
+#### Sharded Operation Definitions
+
+Large dialects with many operations may struggle with C++ compile time of
+generated op definitions, due to large compilation units. `mlir-tblgen`
+provides the ability to shard op definitions by splitting them up evenly
+by passing `-op-shard-count` to `-gen-op-defs` and `-gen-op-decls`. The tool
+will generate a single include file for the definitions broken up by
+`GET_OP_DEFS_${N}` where `${N}` is the shard number. A shard can be compiled in
+a single compilation unit by adding a file like this to your dialect library:
+
+```c++
+#include "mlir/IR/Operation.h"
+// Add any other required includes.
+
+// Utilities shared by generated op definitions: custom directive parsers,
+// printers, etc.
+#include "OpUtils.h"
+
+#define GET_OP_DEFS_0
+#include "MyDialectOps.cpp.inc"
+```
+
+Note: this requires restructing shared utility functions within the dialect
+library so they can be shared by multiple compilation units. I.e. instead of
+defining `static` methods in the same source file, you should declare them in a
+shared header and define them in their own source file.
+
+The op registration hooks are also sharded, because the template instantiation
+can take a very long time to compile. Operations should be registered in your
+dialect like:
+
+```c++
+void MyDialect::initialize() {
+  registerMyDialectOperations(this);
+}
+```
+
+CMake and Bazel functions are included to make sharding dialects easier.
+Assuming you have organized your operation utility functions into their own
+header, define a file that looks like the one above, but without the `#define`:
+
+```c++
+// MyDialectOps.cpp
+#include "mlir/IR/Operation.h"
+
+#include "OpUtils.h"
+
+#include "MyDialectOps.cpp.inc"
+```
+
+In CMake, remove the manual `mlir_tablegen` invocations and replace them with:
+
+```cmake
+set(LLVM_TARGET_DEFINITIONS MyDialectOps.td)
+add_sharded_ops(MyDialectOps 8) # shard the op definitions by 8
+
+add_mlir_library(MyDialect
+  MyDialect.cpp
+  MyDialectOpDefs.cpp
+  ${SHARDED_SRCS}
+
+  DEPENDS
+  MLIRTestOpsShardGen
+)
+```
+
+This will automatically duplicate the `MyDialectOps.cpp` source file and add the
+`#define` up the number of shards indicated.
+
+It is recommended that any out-of-line op member functions (like verifiers) be
+defined in a separate source file. In this example, it is called
+`MyDialectOpDefs.cpp`.
+
+In Bazel, remove the `-gen-op-defs` and `-gen-op-decls` invocations, and add
+
+```bazel
+gentbl_sharded_ops(
+    name = "MyDialectOpSrcs",
+    hdr_out = "MyDialectOps.h.inc",
+    shard_count = 8,
+    sharder = "//mlir:mlir-src-sharder",
+    src_file = "MyDialectOps.cpp",
+    src_out = "MyDialectOps.cpp.inc",
+    tblgen = "//mlir:mlir-tblgen",
+    td_file = "MyDialectOps.td",
+    deps = [":MyDialectOpsTdFiles"],
+)
+
+cc_library(
+    name = "MyDialect",
+    srcs = glob(["MyDialect/*.cpp"]) + [":MyDialectOpSrcs"]
+)
+```
+
 ## Constraints
 
 Constraint is a core concept in table-driven operation definition: operation
diff --git a/mlir/include/mlir-c/Dialect/SparseTensor.h b/mlir/include/mlir-c/Dialect/SparseTensor.h
index 52ca7ba8a161..125469f57c5f 100644
--- a/mlir/include/mlir-c/Dialect/SparseTensor.h
+++ b/mlir/include/mlir-c/Dialect/SparseTensor.h
@@ -53,7 +53,8 @@ mlirAttributeIsASparseTensorEncodingAttr(MlirAttribute attr);
 MLIR_CAPI_EXPORTED MlirAttribute mlirSparseTensorEncodingAttrGet(
     MlirContext ctx, intptr_t lvlRank,
     MlirSparseTensorLevelType const *lvlTypes, MlirAffineMap dimToLvl,
-    MlirAffineMap lvlTodim, int posWidth, int crdWidth);
+    MlirAffineMap lvlTodim, int posWidth, int crdWidth,
+    MlirAttribute explicitVal, MlirAttribute implicitVal);
 
 /// Returns the level-rank of the `sparse_tensor.encoding` attribute.
 MLIR_CAPI_EXPORTED intptr_t
@@ -85,6 +86,14 @@ mlirSparseTensorEncodingAttrGetPosWidth(MlirAttribute attr);
 MLIR_CAPI_EXPORTED int
 mlirSparseTensorEncodingAttrGetCrdWidth(MlirAttribute attr);
 
+/// Returns the explicit value of the `sparse_tensor.encoding` attribute.
+MLIR_CAPI_EXPORTED MlirAttribute
+mlirSparseTensorEncodingAttrGetExplicitVal(MlirAttribute attr);
+
+/// Returns the implicit value of the `sparse_tensor.encoding` attribute.
+MLIR_CAPI_EXPORTED MlirAttribute
+mlirSparseTensorEncodingAttrGetImplicitVal(MlirAttribute attr);
+
 MLIR_CAPI_EXPORTED unsigned
 mlirSparseTensorEncodingAttrGetStructuredN(MlirSparseTensorLevelType lvlType);
 
diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
index edcfcfd830c4..3640055ea8da 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -742,7 +742,8 @@ def AffineParallelOp : Affine_Op<"parallel",
 }
 
 def AffinePrefetchOp : Affine_Op<"prefetch",
-  [DeclareOpInterfaceMethods<AffineMapAccessInterface>]> {
+  [DeclareOpInterfaceMethods<AffineMapAccessInterface>,
+   MemRefsNormalizable]> {
   let summary = "affine prefetch operation";
   let description = [{
     The `affine.prefetch` op prefetches data from a memref location described
diff --git a/mlir/include/mlir/Dialect/Arith/Utils/Utils.h b/mlir/include/mlir/Dialect/Arith/Utils/Utils.h
index 2111a7c58102..5e7945d9b049 100644
--- a/mlir/include/mlir/Dialect/Arith/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Arith/Utils/Utils.h
@@ -24,6 +24,29 @@
 
 namespace mlir {
 
+using ReassociationIndices = SmallVector<int64_t, 2>;
+
+/// Infer the output shape for a {memref|tensor}.expand_shape when it is
+/// possible to do so.
+///
+/// Note: This should *only* be used to implement
+/// `ExpandShapeOp::inferOutputShape` in both the memref and tensor namespaces.
+/// If you need to infer the output shape you should use the static method of
+/// `ExpandShapeOp` instead of calling this.
+///
+/// `inputShape` is the shape of the tensor or memref being expanded as a
+/// sequence of SSA values or constants. `expandedType` is the output shape of
+/// the expand_shape operation. `reassociation` is the reassociation denoting
+/// the output dims each input dim is mapped to.
+///
+/// Returns the output shape in `outputShape` and `staticOutputShape`, following
+/// the conventions for the output_shape and static_output_shape inputs to the
+/// expand_shape ops.
+std::optional<SmallVector<OpFoldResult>>
+inferExpandShapeOutputShape(OpBuilder &b, Location loc, ShapedType expandedType,
+                            ArrayRef<ReassociationIndices> reassociation,
+                            ArrayRef<OpFoldResult> inputShape);
+
 /// Matches a ConstantIndexOp.
 detail::op_matcher<arith::ConstantIndexOp> matchConstantIndex();
 
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index a8235bed6f27..4a9ddafdd177 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -88,8 +88,6 @@ def GpuModuleToBinaryPass
     4. `fatbinary`, `fatbin`: produces fatbinaries.
   }];
   let options = [
-    Option<"offloadingHandler", "handler", "Attribute", "nullptr",
-           "Offloading handler to be attached to the resulting binary op.">,
     Option<"toolkitPath", "toolkit", "std::string", [{""}],
            "Toolkit path.">,
     ListOption<"linkFiles", "l", "std::string",
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
index a52cca3c95de..759cbe6c1564 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
@@ -1060,8 +1060,8 @@ def LLVM_vector_extract
   }];
 }
 
-def LLVM_experimental_vector_interleave2
-    : LLVM_OneResultIntrOp<"experimental.vector.interleave2",
+def LLVM_vector_interleave2
+    : LLVM_OneResultIntrOp<"vector.interleave2",
         /*overloadedResults=*/[0], /*overloadedOperands=*/[],
         /*traits=*/[
           Pure, AllTypesMatch<["vec1", "vec2"]>,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index eedae4b9bb7c..6655ce6f123e 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -63,12 +63,12 @@ class LLVM_IntArithmeticOpWithOverflowFlag<string mnemonic, string instName,
   let arguments = !con(commonArgs, iofArg);
 
   let builders = [
-    OpBuilder<(ins "Type":$type, "Value":$lhs, "Value":$rhs, 
+    OpBuilder<(ins "Type":$type, "Value":$lhs, "Value":$rhs,
                    "IntegerOverflowFlags":$overflowFlags), [{
       build($_builder, $_state, type, lhs, rhs);
       $_state.getOrAddProperties<Properties>().overflowFlags = overflowFlags;
     }]>,
-    OpBuilder<(ins "Value":$lhs, "Value":$rhs, 
+    OpBuilder<(ins "Value":$lhs, "Value":$rhs,
                    "IntegerOverflowFlags":$overflowFlags), [{
       build($_builder, $_state, lhs, rhs);
       $_state.getOrAddProperties<Properties>().overflowFlags = overflowFlags;
@@ -1052,7 +1052,7 @@ def LLVM_SwitchOp : LLVM_TerminatorOp<"switch",
 ////////////////////////////////////////////////////////////////////////////////
 
 def LLVM_AddressOfOp : LLVM_Op<"mlir.addressof",
-    [Pure, DeclareOpInterfaceMethods<SymbolUserOpInterface>]> {
+    [Pure, ConstantLike, DeclareOpInterfaceMethods<SymbolUserOpInterface>]> {
   let arguments = (ins FlatSymbolRefAttr:$global_name);
   let results = (outs LLVM_AnyPointer:$res);
 
@@ -1114,6 +1114,8 @@ def LLVM_AddressOfOp : LLVM_Op<"mlir.addressof",
   }];
 
   let assemblyFormat = "$global_name attr-dict `:` qualified(type($res))";
+
+  let hasFolder = 1;
 }
 
 def LLVM_GlobalOp : LLVM_Op<"mlir.global",
diff --git a/mlir/include/mlir/Dialect/LLVMIR/VCIXOps.td b/mlir/include/mlir/Dialect/LLVMIR/VCIXOps.td
index 25c1d027768a..27d9a32dd8e0 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/VCIXOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/VCIXOps.td
@@ -78,7 +78,7 @@ def VCIX_BinaryImmOp : VCIX_Op<"v.iv">,
           xlen, $opcode, $_location, moduleTranslation);
       llvm::Value *immConst = mlir::LLVM::detail::getLLVMConstant(
           xlen, $imm, $_location, moduleTranslation);
-      VectorType vt = op.getResult().getType().cast<VectorType>();
+      VectorType vt = mlir::cast<VectorType>(op.getResult().getType());
       llvm::Value *vl =
           createVL(builder, $vl, vt, xlen, $_location, moduleTranslation);
       $res = createIntrinsicCall(
@@ -120,7 +120,7 @@ def VCIX_BinaryOp : VCIX_Op<"v.sv">,
       } else {
         id = llvm::Intrinsic::riscv_sf_vc_v_fv_se;
       }
-      VectorType vt = op.getResult().getType().cast<VectorType>();
+      VectorType vt = mlir::cast<VectorType>(op.getResult().getType());
       llvm::Value *vl =
           createVL(builder, $vl, vt, xlen, $_location, moduleTranslation);
       $res = createIntrinsicCall(
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td
index 59f909aed8f6..6b4b073fc672 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td
@@ -22,7 +22,14 @@ def UnaryFn : I32EnumAttr<"UnaryFn", "", [
   I32EnumAttrCase<"abs", 2>,
   I32EnumAttrCase<"ceil", 3>,
   I32EnumAttrCase<"floor", 4>,
-  I32EnumAttrCase<"negf", 5>
+  I32EnumAttrCase<"negf", 5>,
+  I32EnumAttrCase<"reciprocal", 6>,
+  I32EnumAttrCase<"round", 7>,
+  I32EnumAttrCase<"sqrt", 8>,
+  I32EnumAttrCase<"rsqrt", 9>,
+  I32EnumAttrCase<"square", 10>,
+  I32EnumAttrCase<"tanh", 11>,
+  I32EnumAttrCase<"erf", 12>
 ]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::linalg";
@@ -36,7 +43,8 @@ def BinaryFn : I32EnumAttr<"BinaryFn", "", [
   I32EnumAttrCase<"max_signed", 5>,
   I32EnumAttrCase<"min_signed", 6>,
   I32EnumAttrCase<"max_unsigned", 7>,
-  I32EnumAttrCase<"min_unsigned", 8>
+  I32EnumAttrCase<"min_unsigned", 8>,
+  I32EnumAttrCase<"powf", 9>
 ]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::linalg";
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
index 1ff6c4086cf3..584bfcd8b59d 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -305,6 +305,251 @@ structured_op: !LinalgStructuredOpConfig
           scalar_arg: I
 --- !LinalgOpConfig
 metadata: !LinalgOpMetadata
+  name: reciprocal
+  cpp_class_name: ReciprocalOp
+  doc: |-
+    Applies reciprocal(x) elementwise.
+
+    No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<() -> ()>
+    - affine_map<() -> ()>
+  iterator_types: []
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: unary
+        fn_name: reciprocal
+        operands:
+        - !ScalarExpression
+          scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: round
+  cpp_class_name: RoundOp
+  doc: |-
+    Applies round(x) elementwise.
+
+    No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<() -> ()>
+    - affine_map<() -> ()>
+  iterator_types: []
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: unary
+        fn_name: round
+        operands:
+        - !ScalarExpression
+          scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: sqrt
+  cpp_class_name: SqrtOp
+  doc: |-
+    Applies sqrt(x) elementwise.
+
+    No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<() -> ()>
+    - affine_map<() -> ()>
+  iterator_types: []
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: unary
+        fn_name: sqrt
+        operands:
+        - !ScalarExpression
+          scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: rsqrt
+  cpp_class_name: RsqrtOp
+  doc: |-
+    Applies rsqrt(x) elementwise.
+
+    No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<() -> ()>
+    - affine_map<() -> ()>
+  iterator_types: []
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: unary
+        fn_name: rsqrt
+        operands:
+        - !ScalarExpression
+          scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: square
+  cpp_class_name: SquareOp
+  doc: |-
+    Applies square(x) elementwise.
+
+    No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<() -> ()>
+    - affine_map<() -> ()>
+  iterator_types: []
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: unary
+        fn_name: square
+        operands:
+        - !ScalarExpression
+          scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: tanh
+  cpp_class_name: TanhOp
+  doc: |-
+    Applies tanh(x) elementwise.
+
+    No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<() -> ()>
+    - affine_map<() -> ()>
+  iterator_types: []
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: unary
+        fn_name: tanh
+        operands:
+        - !ScalarExpression
+          scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: erf
+  cpp_class_name: erfOp
+  doc: |-
+    Applies erf(x) elementwise.
+
+    No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<() -> ()>
+    - affine_map<() -> ()>
+  iterator_types: []
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: unary
+        fn_name: erf
+        operands:
+        - !ScalarExpression
+          scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
   name: elemwise_binary
   cpp_class_name: ElemwiseBinaryOp
   doc: |-
@@ -625,7 +870,7 @@ metadata: !LinalgOpMetadata
 
     This means reduction/broadcast/element cast semantics is explicit. Further
     passes can take that into account when lowering this code. For example,
-    a `linalg.broadcast` + `linalg.div` sequence can be lowered to a
+    a `linalg.broadcast` + `linalg.max` sequence can be lowered to a
     `linalg.generic` with different affine maps for the two operands.
 structured_op: !LinalgStructuredOpConfig
   args:
@@ -664,6 +909,106 @@ structured_op: !LinalgStructuredOpConfig
           scalar_arg: rhs
 --- !LinalgOpConfig
 metadata: !LinalgOpMetadata
+  name: min
+  cpp_class_name: MinOp
+  doc: |-
+    Takes the min (signed) between two inputs, elementwise.
+
+    The shapes and element types must be identical. The appropriate casts,
+    broadcasts and reductions should be done previously to calling this op.
+
+    This means reduction/broadcast/element cast semantics is explicit. Further
+    passes can take that into account when lowering this code. For example,
+    a `linalg.broadcast` + `linalg.min` sequence can be lowered to a
+    `linalg.generic` with different affine maps for the two operands.
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: lhs
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  - !LinalgOperandDefConfig
+    name: rhs
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<() -> ()>
+    - affine_map<() -> ()>
+    - affine_map<() -> ()>
+  iterator_types: []
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: min_signed
+        operands:
+        - !ScalarExpression
+          scalar_arg: lhs
+        - !ScalarExpression
+          scalar_arg: rhs
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: powf
+  cpp_class_name: PowFOp
+  doc: |-
+    Takes the powf(lhs, rhs) between two inputs, elementwise. For powf(arg, 2) use `linalg.square`.
+
+    Only applies to floating point values.
+
+    The shapes and element types must be identical. The appropriate casts,
+    broadcasts and reductions should be done previously to calling this op.
+
+    This means reduction/broadcast/element cast semantics is explicit. Further
+    passes can take that into account when lowering this code. For example,
+    a `linalg.broadcast` + `linalg.powf` sequence can be lowered to a
+    `linalg.generic` with different affine maps for the two operands.
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: lhs
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  - !LinalgOperandDefConfig
+    name: rhs
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: T1
+    shape_map: affine_map<() -> ()>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<() -> ()>
+    - affine_map<() -> ()>
+    - affine_map<() -> ()>
+  iterator_types: []
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: powf
+        operands:
+        - !ScalarExpression
+          scalar_arg: lhs
+        - !ScalarExpression
+          scalar_arg: rhs
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
   name: matmul
   cpp_class_name: MatmulOp
   doc: |-
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
index da12e7c83b22..64c538367267 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
@@ -138,10 +138,10 @@ def Linalg_SoftmaxOp : Linalg_Op<"softmax",
 
   let extraClassDeclaration = [{
     ShapedType getInputOperandType() {
-      return getInput().getType().cast<ShapedType>();
+      return cast<ShapedType>(getInput().getType());
     }
     ShapedType getOutputOperandType() {
-      return getOutput().getType().cast<ShapedType>();
+      return cast<ShapedType>(getOutput().getType());
     }
     int64_t getInputOperandRank() {
       return getInputOperandType().getRank();
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/RuntimeOpVerification.h b/mlir/include/mlir/Dialect/Linalg/Transforms/RuntimeOpVerification.h
new file mode 100644
index 000000000000..6c3643f7835c
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/RuntimeOpVerification.h
@@ -0,0 +1,21 @@
+//===- RuntimeOpVerification.h - Op Verification ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_LINALG_RUNTIMEOPVERIFICATION_H
+#define MLIR_DIALECT_LINALG_RUNTIMEOPVERIFICATION_H
+
+namespace mlir {
+class DialectRegistry;
+
+namespace linalg {
+void registerRuntimeVerifiableOpInterfaceExternalModels(
+    DialectRegistry &registry);
+} // namespace linalg
+} // namespace mlir
+
+#endif // MLIR_DIALECT_LINALG_RUNTIMEOPVERIFICATION_H
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index 39e66cd9e6e5..14b8d95ea15b 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -1548,7 +1548,6 @@ def MemRef_ReshapeOp: MemRef_Op<"reshape", [
 class MemRef_ReassociativeReshapeOp<string mnemonic, list<Trait> traits = []> :
     MemRef_Op<mnemonic, !listconcat(traits,
       [Pure, ViewLikeOpInterface])>,
-    Arguments<(ins AnyStridedMemRef:$src, IndexListArrayAttr:$reassociation)>,
     Results<(outs AnyStridedMemRef:$result)>{
 
   code commonExtraClassDeclaration = [{
@@ -1573,10 +1572,6 @@ class MemRef_ReassociativeReshapeOp<string mnemonic, list<Trait> traits = []> :
     Value getViewSource() { return getSrc(); }
   }];
 
-  let assemblyFormat = [{
-    $src $reassociation attr-dict `:` type($src) `into` type($result)
-  }];
-
   let hasFolder = 1;
   let hasCanonicalizer = 1;
   let hasVerifier = 1;
@@ -1598,14 +1593,10 @@ def MemRef_ExpandShapeOp : MemRef_ReassociativeReshapeOp<"expand_shape", [
     Example:
 
     ```mlir
-    %r = memref.expand_shape %0 [[0, 1], [2]]
-        : memref<?x?xf32> into memref<?x5x?xf32>
+    %r = memref.expand_shape %0 [[0, 1], [2]] output_shape [%sz0, %sz1, 32]
+        : memref<?x32xf32> into memref<?x?x32xf32>
     ```
 
-    At most one dimension of a reassociation group (e.g., [0, 1] above) may be
-    dynamic in the result type. Otherwise, the op would be ambiguous, as it
-    would not be clear how the source dimension is extended.
-
     If an op can be statically proven to be invalid (e.g, an expansion from
     `memref<10xf32>` to `memref<2x6xf32>`), it is rejected by the verifier. If
     it cannot statically be proven invalid (e.g., the full example above; it is
@@ -1622,41 +1613,80 @@ def MemRef_ExpandShapeOp : MemRef_ReassociativeReshapeOp<"expand_shape", [
     there must be a dynamic result dimension in the corresponding reassociation
     group. Same for strides.
 
+    The representation for the output shape supports a partially-static
+    specification via attributes specified through the `static_output_shape`
+    argument.  A special sentinel value `ShapedType::kDynamic` encodes that the
+    corresponding entry has a dynamic value.  There must be exactly as many SSA
+    inputs in `output_shape` as there are `ShapedType::kDynamic` entries in
+    `static_output_shape`.
+
     Note: This op currently assumes that the inner strides are of the
     source/result layout map are the faster-varying ones.
   }];
 
+  let arguments = (ins AnyStridedMemRef:$src, IndexListArrayAttr:$reassociation,
+                       Variadic<Index>:$output_shape,
+                       DenseI64ArrayAttr:$static_output_shape);
+
+  let assemblyFormat = [{
+    $src $reassociation `output_shape`
+    custom<DynamicIndexList>($output_shape, $static_output_shape) attr-dict `:`
+    type($src) `into` type($result)
+  }];
+
   let builders = [
     // Builders using ReassociationIndices.
     OpBuilder<(ins "Type":$resultType, "Value":$src,
       "ArrayRef<ReassociationIndices>":$reassociation,
-      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs),
+      "ArrayRef<OpFoldResult>":$outputShape)>,
+
+    // It will infer output shape using inferOutputShape() method.
+    OpBuilder<(ins "Type":$resultType, "Value":$src,
+      "ArrayRef<ReassociationIndices>":$reassociation)>,
+
+    // Builder using ReassociationExprs.
+    OpBuilder<(ins "Type":$resultType, "Value":$src,
+      "ArrayRef<ReassociationExprs>":$reassociation),
     [{
-      build($_builder, $_state, resultType, src, attrs);
-      $_state.addAttribute("reassociation",
-                          getReassociationIndicesAttribute($_builder, reassociation));
+      auto reassociationIndices =
+          convertReassociationMapsToIndices(reassociation);
+      build($_builder, $_state, resultType, src, reassociationIndices);
     }]>,
 
-    // Builder using ReassociationExprs.
     OpBuilder<(ins "Type":$resultType, "Value":$src,
       "ArrayRef<ReassociationExprs>":$reassociation,
-      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs),
+      "ArrayRef<OpFoldResult>":$outputShape),
     [{
       auto reassociationMaps =
-          convertReassociationMapsToIndices($_builder, reassociation);
-      build($_builder, $_state, resultType, src, reassociationMaps, attrs);
+          convertReassociationMapsToIndices(reassociation);
+      build($_builder, $_state, resultType, src, reassociationMaps,
+            outputShape);
     }]>,
 
     // Builder that infers the result layout map. The result shape must be
+    // specified. Otherwise, the op may be ambiguous. The output shape for 
+    // the op will be inferred using the inferOutputShape() method.
+    OpBuilder<(ins "ArrayRef<int64_t>":$resultShape, "Value":$src,
+               "ArrayRef<ReassociationIndices>":$reassociation)>,
+
+    // Builder that infers the result layout map. The result shape must be
     // specified. Otherwise, the op may be ambiguous.
     OpBuilder<(ins "ArrayRef<int64_t>":$resultShape, "Value":$src,
-               "ArrayRef<ReassociationIndices>":$reassociation)>
+               "ArrayRef<ReassociationIndices>":$reassociation,
+               "ArrayRef<OpFoldResult>":$outputShape)>
   ];
 
   let extraClassDeclaration = commonExtraClassDeclaration # [{
     static FailureOr<MemRefType> computeExpandedType(
         MemRefType srcType, ArrayRef<int64_t> resultShape,
         ArrayRef<ReassociationIndices> reassociation);
+
+    // Infer the output shape for a memref.expand_shape when it is possible
+    // to do so.
+    static FailureOr<SmallVector<OpFoldResult>> inferOutputShape(
+        OpBuilder &b, Location loc, MemRefType expandedType,
+        ArrayRef<ReassociationIndices> reassociation,
+        ArrayRef<OpFoldResult> inputShape);
   }];
 
   let hasVerifier = 1;
@@ -1707,6 +1737,12 @@ def MemRef_CollapseShapeOp : MemRef_ReassociativeReshapeOp<"collapse_shape", [
     source/result layout map are the faster-varying ones.
   }];
 
+  let arguments = (ins AnyStridedMemRef:$src, IndexListArrayAttr:$reassociation);
+
+  let assemblyFormat = [{
+    $src $reassociation attr-dict `:` type($src) `into` type($result)
+  }];
+
   let builders = [
     // Builders for a contracting reshape whose result type is computed from
     // `src` and `reassociation`.
@@ -1718,7 +1754,7 @@ def MemRef_CollapseShapeOp : MemRef_ReassociativeReshapeOp<"collapse_shape", [
       CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs),
     [{
       auto reassociationMaps =
-          convertReassociationMapsToIndices($_builder, reassociation);
+          convertReassociationMapsToIndices(reassociation);
       build($_builder, $_state, src, reassociationMaps, attrs);
     }]>,
 
@@ -1736,7 +1772,7 @@ def MemRef_CollapseShapeOp : MemRef_ReassociativeReshapeOp<"collapse_shape", [
       CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs),
     [{
       auto reassociationMaps =
-          convertReassociationMapsToIndices($_builder, reassociation);
+          convertReassociationMapsToIndices(reassociation);
       build($_builder, $_state, resultType, src, reassociationMaps, attrs);
     }]>
   ];
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 8ab116ce391e..a40676d071e6 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -158,8 +158,9 @@ def PrivateClauseOp : OpenMP_Op<"private", [IsolatedFromAbove]> {
   let description = [{
     This operation provides a declaration of how to implement the
     [first]privatization of a variable. The dialect users should provide
-    information about how to create an instance of the type in the alloc region
-    and how to initialize the copy from the original item in the copy region.
+    information about how to create an instance of the type in the alloc region,
+    how to initialize the copy from the original item in the copy region, and if
+    needed, how to deallocate allocated memory in the dealloc region.
 
     Examples:
 
@@ -187,13 +188,26 @@ def PrivateClauseOp : OpenMP_Op<"private", [IsolatedFromAbove]> {
     }
     ```
 
+    * `private(x)` for "allocatables" would be emitted as:
+    ```mlir
+    omp.private {type = private} @x.privatizer : !some.type alloc {
+    ^bb0(%arg0: !some.type):
+    %0 = ... allocate proper memory for the private clone ...
+    omp.yield(%0 : !fir.ref<i32>)
+    } dealloc {
+    ^bb0(%arg0: !some.type):
+    ... deallocate allocated memory ...
+    omp.yield
+    }
+    ```
+
     There are no restrictions on the body except for:
-    - The `alloc` region has a single argument.
+    - The `alloc` & `dealloc` regions have a single argument.
     - The `copy` region has 2 arguments.
-    - Both regions are terminated by `omp.yield` ops.
+    - All three regions are terminated by `omp.yield` ops.
     The above restrictions and other obvious restrictions (e.g. verifying the
     type of yielded values) are verified by the custom op verifier. The actual
-    contents of the blocks inside both regions are not verified.
+    contents of the blocks inside all regions are not verified.
 
     Instances of this op would then be used by ops that model directives that
     accept data-sharing attribute clauses.
@@ -212,12 +226,14 @@ def PrivateClauseOp : OpenMP_Op<"private", [IsolatedFromAbove]> {
                        DataSharingClauseTypeAttr:$data_sharing_type);
 
   let regions = (region MinSizedRegion<1>:$alloc_region,
-                        AnyRegion:$copy_region);
+                        AnyRegion:$copy_region,
+                        AnyRegion:$dealloc_region);
 
   let assemblyFormat = [{
     $data_sharing_type $sym_name `:` $type
       `alloc` $alloc_region
       (`copy` $copy_region^)?
+      (`dealloc` $dealloc_region^)?
       attr-dict
   }];
 
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
index ab9b78e755d9..d9569d9d294d 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
@@ -234,8 +234,8 @@ def OffloadModuleInterface : OpInterface<"OffloadModuleInterface"> {
       /*methodName=*/"getIsTargetDevice",
       (ins), [{}], [{
         if (Attribute isTargetDevice = $_op->getAttr("omp.is_target_device"))
-          if (isTargetDevice.isa<mlir::BoolAttr>())
-           return isTargetDevice.dyn_cast<BoolAttr>().getValue();
+          if (::llvm::isa<mlir::BoolAttr>(isTargetDevice))
+           return ::llvm::dyn_cast<BoolAttr>(isTargetDevice).getValue();
         return false;
       }]>,
     InterfaceMethod<
@@ -259,7 +259,7 @@ def OffloadModuleInterface : OpInterface<"OffloadModuleInterface"> {
       /*methodName=*/"getIsGPU",
       (ins), [{}], [{
         if (Attribute isTargetCGAttr = $_op->getAttr("omp.is_gpu"))
-          if (auto isTargetCGVal = isTargetCGAttr.dyn_cast<BoolAttr>())
+          if (auto isTargetCGVal = ::llvm::dyn_cast<BoolAttr>(isTargetCGAttr))
            return isTargetCGVal.getValue();
         return false;
       }]>,
@@ -332,7 +332,7 @@ def OffloadModuleInterface : OpInterface<"OffloadModuleInterface"> {
       /*methodName=*/"getRequires",
       (ins), [{}], [{
         if (Attribute requiresAttr = $_op->getAttr("omp.requires"))
-          if (auto requiresVal = requiresAttr.dyn_cast<mlir::omp::ClauseRequiresAttr>())
+          if (auto requiresVal = ::llvm::dyn_cast<mlir::omp::ClauseRequiresAttr>(requiresAttr))
             return requiresVal.getValue();
         return mlir::omp::ClauseRequires::none;
       }]>,
diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.h b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.h
index 39b05b9d3ad1..3325a6fa3f9f 100644
--- a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.h
+++ b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.h
@@ -51,9 +51,6 @@ public:
     return (exponent.ult(other.exponent));
   }
 
-  // Prints polynomial to 'os'.
-  void print(raw_ostream &os) const;
-
   friend ::llvm::hash_code hash_value(const Monomial &arg);
 
 public:
@@ -102,6 +99,8 @@ public:
 
   unsigned getDegree() const;
 
+  ArrayRef<Monomial> getTerms() const { return terms; }
+
   friend ::llvm::hash_code hash_value(const Polynomial &arg);
 
 private:
diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
index 5d8da8399b01..d3e3ac55677f 100644
--- a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
+++ b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
@@ -35,18 +35,18 @@ def Polynomial_Dialect : Dialect {
 
     ```mlir
     // A constant polynomial in a ring with i32 coefficients and no polynomial modulus
-    #ring = #polynomial.ring<ctype=i32>
+    #ring = #polynomial.ring<coefficientType=i32>
     %a = polynomial.constant <1 + x**2 - 3x**3> : polynomial.polynomial<#ring>
 
     // A constant polynomial in a ring with i32 coefficients, modulo (x^1024 + 1)
     #modulus = #polynomial.polynomial<1 + x**1024>
-    #ring = #polynomial.ring<ctype=i32, ideal=#modulus>
+    #ring = #polynomial.ring<coefficientType=i32, polynomialModulus=#modulus>
     %a = polynomial.constant <1 + x**2 - 3x**3> : polynomial.polynomial<#ring>
 
     // A constant polynomial in a ring with i32 coefficients, with a polynomial
     // modulus of (x^1024 + 1) and a coefficient modulus of 17.
     #modulus = #polynomial.polynomial<1 + x**1024>
-    #ring = #polynomial.ring<ctype=i32, cmod=17, ideal=#modulus>
+    #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=17, polynomialModulus=#modulus>
     %a = polynomial.constant <1 + x**2 - 3x**3> : polynomial.polynomial<#ring>
     ```
   }];
@@ -63,7 +63,21 @@ class Polynomial_Attr<string name, string attrMnemonic, list<Trait> traits = []>
 def Polynomial_PolynomialAttr : Polynomial_Attr<"Polynomial", "polynomial"> {
   let summary = "An attribute containing a single-variable polynomial.";
   let description = [{
-     #poly = #polynomial.poly<x**1024 + 1>
+    A polynomial attribute represents a single-variable polynomial, which
+    is used to define the modulus of a `RingAttr`, as well as to define constants
+    and perform constant folding for `polynomial` ops.
+
+    The polynomial must be expressed as a list of monomial terms, with addition
+    or subtraction between them. The choice of variable name is arbitrary, but
+    must be consistent across all the monomials used to define a single
+    attribute. The order of monomial terms is arbitrary, each monomial degree
+    must occur at most once.
+
+    Example:
+
+    ```mlir
+    #poly = #polynomial.polynomial<x**1024 + 1>
+    ```
   }];
   let parameters = (ins "Polynomial":$polynomial);
   let hasCustomAssemblyFormat = 1;
@@ -79,10 +93,10 @@ def Polynomial_RingAttr : Polynomial_Attr<"Ring", "ring"> {
     integral, whose coefficients are taken modulo some statically known modulus
     (`coefficientModulus`).
 
-    Additionally, a polynomial ring can specify an _ideal_, which converts
+    Additionally, a polynomial ring can specify a _polynomialModulus_, which converts
     polynomial arithmetic to the analogue of modular integer arithmetic, where
     each polynomial is represented as its remainder when dividing by the
-    modulus. For single-variable polynomials, an "ideal" is always specificed
+    modulus. For single-variable polynomials, an "polynomialModulus" is always specificed
     via a single polynomial, which we call `polynomialModulus`.
 
     An expressive example is polynomials with i32 coefficients, whose
@@ -122,32 +136,284 @@ class Polynomial_Type<string name, string typeMnemonic>
 
 def Polynomial_PolynomialType : Polynomial_Type<"Polynomial", "polynomial"> {
   let summary = "An element of a polynomial ring.";
-
   let description = [{
     A type for polynomials in a polynomial quotient ring.
   }];
-
   let parameters = (ins Polynomial_RingAttr:$ring);
   let assemblyFormat = "`<` $ring `>`";
 }
 
+def PolynomialLike: TypeOrContainer<Polynomial_PolynomialType, "polynomial-like">;
+
 class Polynomial_Op<string mnemonic, list<Trait> traits = []> :
-    Op<Polynomial_Dialect, mnemonic, traits # [Pure]>;
+    Op<Polynomial_Dialect, mnemonic, traits # [Pure]> {
+  let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)";
+}
 
 class Polynomial_UnaryOp<string mnemonic, list<Trait> traits = []> :
     Polynomial_Op<mnemonic, traits # [SameOperandsAndResultType]> {
   let arguments = (ins Polynomial_PolynomialType:$operand);
   let results = (outs Polynomial_PolynomialType:$result);
-
-  let assemblyFormat = "$operand attr-dict `:` qualified(type($result))";
 }
 
 class Polynomial_BinaryOp<string mnemonic, list<Trait> traits = []> :
-    Polynomial_Op<mnemonic, traits # [SameOperandsAndResultType]> {
-  let arguments = (ins Polynomial_PolynomialType:$lhs, Polynomial_PolynomialType:$rhs);
-  let results = (outs Polynomial_PolynomialType:$result);
+    Polynomial_Op<mnemonic, !listconcat(traits, [Pure, SameOperandsAndResultType, ElementwiseMappable])> {
+  let arguments = (ins PolynomialLike:$lhs, PolynomialLike:$rhs);
+  let results = (outs PolynomialLike:$result);
+  let assemblyFormat = "operands attr-dict `:` type($result)";
+}
+
+def Polynomial_AddOp : Polynomial_BinaryOp<"add", [Commutative]> {
+  let summary = "Addition operation between polynomials.";
+  let description = [{
+    Performs polynomial addition on the operands. The operands may be single
+    polynomials or containers of identically-typed polynomials, i.e., polynomials
+    from the same underlying ring with the same coefficient types.
+
+    Addition is defined to occur in the ring defined by the ring attribute of
+    the two operands, meaning the addition is taken modulo the coefficientModulus
+    and the polynomialModulus of the ring.
+
+    Example:
+
+    ```mlir
+    // add two polynomials modulo x^1024 - 1
+    #poly = #polynomial.polynomial<x**1024 - 1>
+    #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+    %0 = polynomial.constant #polynomial.polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+    %1 = polynomial.constant #polynomial.polynomial<x**5 - x + 1> : !polynomial.polynomial<#ring>
+    %2 = polynomial.add %0, %1 : !polynomial.polynomial<#ring>
+    ```
+  }];
+}
+
+def Polynomial_SubOp : Polynomial_BinaryOp<"sub"> {
+  let summary = "Subtraction operation between polynomials.";
+  let description = [{
+    Performs polynomial subtraction on the operands. The operands may be single
+    polynomials or containers of identically-typed polynomials, i.e., polynomials
+    from the same underlying ring with the same coefficient types.
+
+    Subtraction is defined to occur in the ring defined by the ring attribute of
+    the two operands, meaning the subtraction is taken modulo the coefficientModulus
+    and the polynomialModulus of the ring.
+
+    Example:
 
-  let assemblyFormat = "$lhs `,` $rhs attr-dict `:` qualified(type($result))";
+    ```mlir
+    // subtract two polynomials modulo x^1024 - 1
+    #poly = #polynomial.polynomial<x**1024 - 1>
+    #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+    %0 = polynomial.constant #polynomial.polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+    %1 = polynomial.constant #polynomial.polynomial<x**5 - x + 1> : !polynomial.polynomial<#ring>
+    %2 = polynomial.sub %0, %1 : !polynomial.polynomial<#ring>
+    ```
+  }];
+}
+
+def Polynomial_MulOp : Polynomial_BinaryOp<"mul", [Commutative]> {
+  let summary = "Multiplication operation between polynomials.";
+  let description = [{
+    Performs polynomial multiplication on the operands. The operands may be single
+    polynomials or containers of identically-typed polynomials, i.e., polynomials
+    from the same underlying ring with the same coefficient types.
+
+    Multiplication is defined to occur in the ring defined by the ring attribute of
+    the two operands, meaning the multiplication is taken modulo the coefficientModulus
+    and the polynomialModulus of the ring.
+
+    Example:
+
+    ```mlir
+    // multiply two polynomials modulo x^1024 - 1
+    #poly = #polynomial.polynomial<x**1024 - 1>
+    #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+    %0 = polynomial.constant #polynomial.polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+    %1 = polynomial.constant #polynomial.polynomial<x**5 - x + 1> : !polynomial.polynomial<#ring>
+    %2 = polynomial.mul %0, %1 : !polynomial.polynomial<#ring>
+    ```
+  }];
+}
+
+def Polynomial_MulScalarOp : Polynomial_Op<"mul_scalar", [
+      ElementwiseMappable, AllTypesMatch<["polynomial", "output"]>]> {
+  let summary = "Multiplication by a scalar of the field.";
+  let description = [{
+    Multiplies the polynomial operand's coefficients by a given scalar value.
+    The operation is defined to occur in the ring defined by the ring attribute
+    of the two operands, meaning the multiplication is taken modulo the
+    coefficientModulus of the ring.
+
+    The `scalar` input must have the same type as the polynomial ring's
+    coefficientType.
+
+    Example:
+
+    ```mlir
+    // multiply two polynomials modulo x^1024 - 1
+    #poly = #polynomial.polynomial<x**1024 - 1>
+    #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+    %0 = polynomial.constant #polynomial.polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+    %1 = arith.constant 3 : i32
+    %2 = polynomial.mul_scalar %0, %1 : !polynomial.polynomial<#ring>, i32
+    ```
+  }];
+
+  let arguments = (ins
+    PolynomialLike:$polynomial,
+    AnyInteger:$scalar
+  );
+  let results = (outs
+    PolynomialLike:$output
+  );
+  let assemblyFormat = "operands attr-dict `:` type($polynomial) `,` type($scalar)";
+  let hasVerifier = 1;
+}
+
+def Polynomial_LeadingTermOp: Polynomial_Op<"leading_term"> {
+  let summary = "Compute the leading term of the polynomial.";
+  let description = [{
+    The degree of a polynomial is the largest $k$ for which the coefficient
+    `a_k` of `x^k` is nonzero. The leading term is the term `a_k * x^k`, which
+    this op represents as a pair of results. The first is the degree `k` as an
+    index, and the second is the coefficient, whose type matches the
+    coefficient type of the polynomial's ring attribute.
+
+    Example:
+
+    ```mlir
+    #poly = #polynomial.polynomial<x**1024 - 1>
+    #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+    %0 = polynomial.constant #polynomial.polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+    %1, %2 = polynomial.leading_term %0 : !polynomial.polynomial<#ring> -> (index, i32)
+    ```
+  }];
+  let arguments = (ins Polynomial_PolynomialType:$input);
+  let results = (outs Index:$degree, AnyInteger:$coefficient);
+  let assemblyFormat = "operands attr-dict `:` type($input) `->` `(` type($degree) `,` type($coefficient) `)`";
+}
+
+def Polynomial_MonomialOp: Polynomial_Op<"monomial"> {
+  let summary = "Create a polynomial that consists of a single monomial.";
+  let description = [{
+    Construct a polynomial that consists of a single monomial term, from its
+    degree and coefficient as dynamic inputs.
+
+    The coefficient type of the output polynomial's ring attribute must match
+    the `coefficient` input type.
+
+    Example:
+
+    ```mlir
+    #poly = #polynomial.polynomial<x**1024 - 1>
+    #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+    %deg = arith.constant 1023 : index
+    %five = arith.constant 5 : i32
+    %0 = polynomial.monomial %five, %deg : (i32, index) -> !polynomial.polynomial<#ring>
+    ```
+  }];
+  let arguments = (ins AnyInteger:$coefficient, Index:$degree);
+  let results = (outs Polynomial_PolynomialType:$output);
+}
+
+def Polynomial_MonicMonomialMulOp: Polynomial_Op<"monic_monomial_mul", [AllTypesMatch<["input", "output"]>]> {
+  let summary = "Multiply a polynomial by a monic monomial.";
+  let description = [{
+    Multiply a polynomial by a monic monomial, meaning a polynomial of the form
+    `1 * x^k` for an index operand `k`.
+
+    In some special rings of polynomials, such as a ring of polynomials
+    modulo `x^n - 1`, `monomial_mul` can be interpreted as a cyclic shift of
+    the coefficients of the polynomial. For some rings, this results in
+    optimized lowerings that involve rotations and rescaling of the
+    coefficients of the input.
+  }];
+  let arguments = (ins PolynomialLike:$input, Index:$monomialDegree);
+  let results = (outs PolynomialLike:$output);
+}
+
+def Polynomial_FromTensorOp : Polynomial_Op<"from_tensor", [Pure]> {
+  let summary = "Creates a polynomial from integer coefficients stored in a tensor.";
+  let description = [{
+    `polynomial.from_tensor` creates a polynomial value from a tensor of coefficients.
+    The input tensor must list the coefficients in degree-increasing order.
+
+    The input one-dimensional tensor may have size at most the degree of the
+    ring's polynomialModulus generator polynomial, with smaller dimension implying that
+    all higher-degree terms have coefficient zero.
+
+    Example:
+
+    ```mlir
+    #poly = #polynomial.polynomial<x**1024 - 1>
+    #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+    %two = arith.constant 2 : i32
+    %five = arith.constant 5 : i32
+    %coeffs = tensor.from_elements %two, %two, %five : tensor<3xi32>
+    %poly = polynomial.from_tensor %coeffs : tensor<3xi32> -> !polynomial.polynomial<#ring>
+    ```
+  }];
+  let arguments = (ins RankedTensorOf<[AnyInteger]>:$input);
+  let results = (outs Polynomial_PolynomialType:$output);
+
+  let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)";
+
+  let builders = [
+    // Builder that infers coefficient modulus from tensor bit width,
+    // and uses whatever input ring is provided by the caller.
+    OpBuilder<(ins "::mlir::Value":$input, "::mlir::polynomial::RingAttr":$ring)>
+  ];
+  let hasVerifier = 1;
+}
+
+def Polynomial_ToTensorOp : Polynomial_Op<"to_tensor", [Pure]> {
+  let summary = "Creates a tensor containing the coefficients of a polynomial.";
+  let description = [{
+    `polynomial.to_tensor` creates a dense tensor value containing the
+    coefficients of the input polynomial. The output tensor contains the
+    coefficients in degree-increasing order.
+
+    Operations that act on the coefficients of a polynomial, such as extracting
+    a specific coefficient or extracting a range of coefficients, should be
+    implemented by composing `to_tensor` with the relevant `tensor` dialect
+    ops.
+
+    The output tensor has shape equal to the degree of the polynomial ring
+    attribute's polynomialModulus, including zeroes.
+
+    Example:
+
+    ```mlir
+    #poly = #polynomial.polynomial<x**1024 - 1>
+    #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+    %two = arith.constant 2 : i32
+    %five = arith.constant 5 : i32
+    %coeffs = tensor.from_elements %two, %two, %five : tensor<3xi32>
+    %poly = polynomial.from_tensor %coeffs : tensor<3xi32> -> !polynomial.polynomial<#ring>
+    %tensor = polynomial.to_tensor %poly : !polynomial.polynomial<#ring> -> tensor<1024xi32>
+    ```
+  }];
+  let arguments = (ins Polynomial_PolynomialType:$input);
+  let results = (outs RankedTensorOf<[AnyInteger]>:$output);
+  let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)";
+
+  let hasVerifier = 1;
+}
+
+def Polynomial_ConstantOp : Polynomial_Op<"constant", [Pure]> {
+  let summary = "Define a constant polynomial via an attribute.";
+  let description = [{
+    Example:
+
+    ```mlir
+    #poly = #polynomial.polynomial<x**1024 - 1>
+    #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+    %0 = polynomial.constant #polynomial.polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+    ```
+  }];
+  let arguments = (ins Polynomial_PolynomialAttr:$input);
+  let results = (outs Polynomial_PolynomialType:$output);
+  let assemblyFormat = "$input attr-dict `:` type($output)";
 }
 
 #endif // POLYNOMIAL_OPS
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
index 5e523ec428ae..b182b4c72b95 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
@@ -89,18 +89,21 @@ inline MemRefType getMemRefType(T &&t) {
 /// Returns null-attribute for any type without an encoding.
 SparseTensorEncodingAttr getSparseTensorEncoding(Type type);
 
+/// Returns true iff the type range has any sparse tensor type.
+inline bool hasAnySparseType(TypeRange types) {
+  return llvm::any_of(types, [](Type type) {
+    return getSparseTensorEncoding(type) != nullptr;
+  });
+}
+
 /// Returns true iff MLIR operand has any sparse operand.
 inline bool hasAnySparseOperand(Operation *op) {
-  return llvm::any_of(op->getOperands().getTypes(), [](Type t) {
-    return getSparseTensorEncoding(t) != nullptr;
-  });
+  return hasAnySparseType(op->getOperands().getTypes());
 }
 
 /// Returns true iff MLIR operand has any sparse result.
 inline bool hasAnySparseResult(Operation *op) {
-  return llvm::any_of(op->getResults().getTypes(), [](Type t) {
-    return getSparseTensorEncoding(t) != nullptr;
-  });
+  return hasAnySparseType(op->getResults().getTypes());
 }
 
 /// Returns true iff MLIR operand has any sparse operand or result.
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
index 4a9b9169ae4b..eefa4c71bbd2 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
@@ -167,7 +167,7 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
     - **soa** : only applicable to singleton levels, fuses the singleton
       level in SoA (structure of arrays) scheme.
 
-    In addition to the map, the following two fields are optional:
+    In addition to the map, the following fields are optional:
 
     - The required bitwidth for position storage (integral offsets
       into the sparse storage scheme).  A narrow width reduces the memory
@@ -183,6 +183,23 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
       coordinate over all levels).  The choices are `8`, `16`, `32`,
       `64`, or, the default, `0` to indicate a native bitwidth.
 
+    - The explicit value for the sparse tensor. If explicitVal is set,
+      then all the non-zero values in the tensor have the same explicit value.
+      The default value Attribute() indicates that it is not set. This
+      is useful for binary-valued sparse tensors whose values can either
+      be an implicit value (0 by default) or an explicit value (such as 1).
+      In this approach, we don't store explicit/implicit values, and metadata
+      (such as position and coordinate arrays) alone fully defines the original tensor.
+      This yields additional savings for the storage requirements,
+      as well as for the computational time, since we skip operating on
+      implicit values and can constant fold the explicit values where they are used.
+
+    - The implicit value for the sparse tensor. If implicitVal is set,
+      then the "zero" value in the tensor is equal to the implicit value.
+      For now, we only support `0` as the implicit value but it could be
+      extended in the future. The default value Attribute() indicates that
+      the implicit value is `0` (same type as the tensor element type).
+
     Examples:
 
     ```mlir
@@ -226,6 +243,15 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
     }>
     ... tensor<8x8xf64, #DCSC> ...
 
+    // Doubly compressed sparse column storage with specific
+    // explicit and implicit values.
+    #DCSC = #sparse_tensor.encoding<{
+      map = (i, j) -> (j : compressed, i : compressed),
+      explicitVal = 1 : i64,
+      implicitVal = 0 : i64
+    }>
+    ... tensor<8x8xi64, #DCSC> ...
+
     // Block sparse row storage (2x3 blocks).
     #BSR = #sparse_tensor.encoding<{
       map = ( i, j ) ->
@@ -307,6 +333,12 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
     // The required bitwidth for coordinate storage.
     "unsigned":$crdWidth,
 
+    // The required explicit value.
+    "::mlir::Attribute":$explicitVal,
+
+    // The required implicit value.
+    "::mlir::Attribute":$implicitVal,
+
     // A slice attribute for each dimension of the tensor type.
     ArrayRefParameter<
       "::mlir::sparse_tensor::SparseTensorDimSliceAttr",
@@ -319,7 +351,9 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
                      CArg<"AffineMap", "{}">:$dimToLvl,
                      CArg<"AffineMap", "{}">:$lvlToDim,
                      CArg<"unsigned", "0">:$posWidth,
-                     CArg<"unsigned", "0">:$crdWidth), [{
+                     CArg<"unsigned", "0">:$crdWidth,
+                     CArg<"::mlir::Attribute", "{}">:$explicitVal,
+                     CArg<"::mlir::Attribute", "{}">:$implicitVal), [{
       if (!dimToLvl) {
         dimToLvl = ::mlir::AffineMap::getMultiDimIdentityMap(lvlTypes.size(), $_ctxt);
       }
@@ -327,6 +361,7 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
         lvlToDim = ::mlir::sparse_tensor::inferLvlToDim(dimToLvl, $_ctxt);
       }
       return $_get($_ctxt, lvlTypes, dimToLvl, lvlToDim, posWidth, crdWidth,
+        explicitVal, implicitVal,
         ArrayRef<::mlir::sparse_tensor::SparseTensorDimSliceAttr>{});
     }]>
   ];
@@ -353,6 +388,22 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
     /// reset to the default, and all other fields inherited from `this`.
     SparseTensorEncodingAttr withoutBitWidths() const;
 
+    /// Constructs a new encoding with the given explicit value
+    /// and all other fields inherited from `this`.
+    SparseTensorEncodingAttr withExplicitVal(Attribute explicitVal) const;
+
+    /// Constructs a new encoding with the explicit value
+    /// reset to the default, and all other fields inherited from `this`.
+    SparseTensorEncodingAttr withoutExplicitVal() const;
+
+    /// Constructs a new encoding with the given implicit value
+    /// and all other fields inherited from `this`.
+    SparseTensorEncodingAttr withImplicitVal(Attribute implicitVal) const;
+
+    /// Constructs a new encoding with the implicit value
+    /// reset to the default, and all other fields inherited from `this`.
+    SparseTensorEncodingAttr withoutImplicitVal() const;
+
     /// Constructs a new encoding with the given dimSlices, and all
     /// other fields inherited from `this`.
     SparseTensorEncodingAttr withDimSlices(ArrayRef<::mlir::sparse_tensor::SparseTensorDimSliceAttr> dimSlices) const;
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
index 825d89a408fe..ea3d8013b456 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
@@ -115,6 +115,22 @@ public:
     return withEncoding(enc.withoutBitWidths());
   }
 
+  SparseTensorType withExplicitVal(Attribute explicitVal) const {
+    return withEncoding(enc.withExplicitVal(explicitVal));
+  }
+
+  SparseTensorType withoutExplicitVal() const {
+    return withEncoding(enc.withoutExplicitVal());
+  }
+
+  SparseTensorType withImplicitVal(Attribute implicitVal) const {
+    return withEncoding(enc.withImplicitVal(implicitVal));
+  }
+
+  SparseTensorType withoutImplicitVal() const {
+    return withEncoding(enc.withoutImplicitVal());
+  }
+
   SparseTensorType
   withDimSlices(ArrayRef<SparseTensorDimSliceAttr> dimSlices) const {
     return withEncoding(enc.withDimSlices(dimSlices));
@@ -327,6 +343,16 @@ public:
   /// Returns the position-overhead bitwidth, defaulting to zero.
   unsigned getPosWidth() const { return enc ? enc.getPosWidth() : 0; }
 
+  /// Returns the explicit value, defaulting to null Attribute for unset.
+  Attribute getExplicitVal() const {
+    return enc ? enc.getExplicitVal() : nullptr;
+  }
+
+  /// Returns the implicit value, defaulting to null Attribute for 0.
+  Attribute getImplicitVal() const {
+    return enc ? enc.getImplicitVal() : nullptr;
+  }
+
   /// Returns the coordinate-overhead MLIR type, defaulting to `IndexType`.
   Type getCrdType() const { return enc.getCrdElemType(); }
 
diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
index cf7f3e89079c..a403e89a39f9 100644
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -1062,8 +1062,7 @@ class Tensor_ReassociativeReshapeOp<string mnemonic, list<Trait> traits = []> :
     Tensor_Op<mnemonic, !listconcat(traits, [
       DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>,
       Pure])>,
-    Arguments<(ins AnyRankedTensor:$src, IndexListArrayAttr:$reassociation)>,
-    Results<(outs AnyRankedTensor:$result)> {
+    Results<(outs AnyTensor:$result)> {
 
   code commonExtraClassDeclaration = [{
     static StringRef getReassociationAttrStrName() { return "reassociation"; }
@@ -1086,10 +1085,6 @@ class Tensor_ReassociativeReshapeOp<string mnemonic, list<Trait> traits = []> :
     }
   }];
 
-  let assemblyFormat = [{
-    $src $reassociation attr-dict `:` type($src) `into` type($result)
-  }];
-
   let hasFolder = 1;
   let hasCanonicalizer = 1;
   let hasVerifier = 1;
@@ -1102,43 +1097,75 @@ def Tensor_ExpandShapeOp : Tensor_ReassociativeReshapeOp<"expand_shape"> {
     rank than the operand `src` whose dimension sizes are a reassociation of
     `src`.
 
-    A reassociation is defined as a continuous grouping of dimensions. It is
-    represented with an array of DenseI64ArrayAttr attribute. Entries in the
-    array are referred to as reassociation maps.
+    A reassociation is defined as a continuous grouping of dimensions and is
+    represented with an array of DenseI64ArrayAttr attribute.  The reassociation
+    maps applied to the result tensor with the higher rank must result in the
+    operand tensor with the smaller rank.
 
-    The reassociation maps are applied to the result shape to obtain the operand
-    shape.
+    The representation for the output shape supports a partially-static
+    specification via attributes specified through the `static_output_shape`
+    argument.  A special sentinel value `ShapedType::kDynamic` encodes that the
+    corresponding entry has a dynamic value.  There must be exactly as many SSA
+    inputs in `output_shape` as there are `ShapedType::kDynamic` entries in
+    `static_output_shape`.
 
     Example:
 
     ```mlir
     // Dimension expansion i -> (i', j') and (k) -> (k')
-    %b = tensor.expand_shape %a [[0, 1], [2]]
-        : tensor<?x?xf32> into tensor<?x?x?xf32>
+    %b = tensor.expand_shape %a [[0, 1], [2]] output_shape [%sz0, %sz1, 32]
+        : tensor<?x32xf32> into tensor<?x?x32xf32>
     ```
   }];
+
+  let arguments = (ins AnyTensor:$src, IndexListArrayAttr:$reassociation,
+                       Variadic<Index>:$output_shape,
+                       DenseI64ArrayAttr:$static_output_shape);
+
+  let assemblyFormat = [{
+    $src $reassociation `output_shape`
+    custom<DynamicIndexList>($output_shape, $static_output_shape) attr-dict `:`
+    type($src) `into` type($result)
+  }];
+
   let builders = [
     // Builders using ReassociationIndices.
     OpBuilder<(ins "Type":$resultType, "Value":$src,
       "ArrayRef<ReassociationIndices>":$reassociation,
-      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs),
+      "ArrayRef<OpFoldResult>":$outputShape)>,
+
+    // It will infer output shape using inferOutputShape() method.
+    OpBuilder<(ins "Type":$resultType, "Value":$src,
+      "ArrayRef<ReassociationIndices>":$reassociation)>,
+
+    // Builder using ReassociationExprs.
+    OpBuilder<(ins "Type":$resultType, "Value":$src,
+      "ArrayRef<ReassociationExprs>":$reassociation),
     [{
-      build($_builder, $_state, resultType, src, attrs);
-      $_state.addAttribute("reassociation",
-          getReassociationIndicesAttribute($_builder, reassociation));
+      auto reassociationIndices =
+          convertReassociationMapsToIndices(reassociation);
+      build($_builder, $_state, resultType, src, reassociationIndices);
     }]>,
     OpBuilder<(ins "Type":$resultType, "Value":$src,
       "ArrayRef<ReassociationExprs>":$reassociation,
-      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs),
+      "ArrayRef<OpFoldResult>":$outputShape),
     [{
-      auto reassociationMaps =
-          convertReassociationMapsToIndices($_builder, reassociation);
-      build($_builder, $_state, resultType, src, reassociationMaps, attrs);
+      auto reassociationIndices =
+          convertReassociationMapsToIndices(reassociation);
+      build($_builder, $_state, resultType, src, reassociationIndices,
+            outputShape);
     }]>
   ];
 
   let extraClassDeclaration = commonExtraClassDeclaration # [{
     int64_t getCorrespondingSourceDim(int64_t resultDim);
+
+    // Infer the output shape for a tensor.expand_shape when it is possible
+    // to do so.
+    static FailureOr<SmallVector<OpFoldResult>> inferOutputShape(
+        OpBuilder &b, Location loc, RankedTensorType expandedType,
+        ArrayRef<ReassociationIndices> reassociation,
+        ArrayRef<OpFoldResult> inputShape);
   }];
 
   let hasVerifier = 1;
@@ -1146,6 +1173,7 @@ def Tensor_ExpandShapeOp : Tensor_ReassociativeReshapeOp<"expand_shape"> {
 
 def Tensor_CollapseShapeOp : Tensor_ReassociativeReshapeOp<"collapse_shape"> {
   let summary = "operation to produce a tensor with a smaller rank";
+  let arguments = (ins AnyTensor:$src, IndexListArrayAttr:$reassociation);
   let description = [{
     The `tensor.collapse_shape` op produces a new tensor of lower (or equal)
     rank whose dimension sizes are a reassociation of the original `src` dimensions.
@@ -1163,6 +1191,11 @@ def Tensor_CollapseShapeOp : Tensor_ReassociativeReshapeOp<"collapse_shape"> {
         : tensor<?x?x?xf32> into tensor<?x?xf32>
     ```
   }];
+
+  let assemblyFormat = [{
+    $src $reassociation attr-dict `:` type($src) `into` type($result)
+  }];
+
   let builders = [
     // Builders for a contracting reshape whose result type is computed from
     // `src` and `reassociation`.
@@ -1174,7 +1207,7 @@ def Tensor_CollapseShapeOp : Tensor_ReassociativeReshapeOp<"collapse_shape"> {
       CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs),
     [{
       auto reassociationMaps =
-          convertReassociationMapsToIndices($_builder, reassociation);
+          convertReassociationMapsToIndices(reassociation);
       build($_builder, $_state, src, reassociationMaps, attrs);
     }]>,
 
@@ -1192,7 +1225,7 @@ def Tensor_CollapseShapeOp : Tensor_ReassociativeReshapeOp<"collapse_shape"> {
       CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs),
     [{
       auto reassociationMaps =
-          convertReassociationMapsToIndices($_builder, reassociation);
+          convertReassociationMapsToIndices(reassociation);
       build($_builder, $_state, resultType, src, reassociationMaps, attrs);
     }]>
   ];
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index dde17e2dc892..97a36c49d01b 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -605,9 +605,9 @@ def Tosa_BitwiseXorOp : Tosa_ElementwiseOp<"bitwise_xor", [
 }
 
 //===----------------------------------------------------------------------===//
-// Operator: div
+// Operator: int_div
 //===----------------------------------------------------------------------===//
-def Tosa_DivOp : Tosa_ElementwiseOp<"div", [SameOperandsAndResultElementType]> {
+def Tosa_IntDivOp : Tosa_ElementwiseOp<"int_div", [SameOperandsAndResultElementType]> {
   let summary = "Integer divide operator";
 
   let description = [{
@@ -1502,7 +1502,7 @@ def Tosa_ReduceSumOp : Tosa_InferTensorTypeOp<"reduce_sum"> {
 
   let hasFolder = 1;
   let hasVerifier = 1;
-  
+
   let extraClassDeclaration = [{
     /// Returns true when two result types are compatible for this op;
     /// Method used by InferTypeOpInterface.
@@ -1652,7 +1652,7 @@ def Tosa_ReverseOp: Tosa_Op<"reverse", [
 
   let hasFolder = 1;
   let hasVerifier = 1;
-  
+
   let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)";
 }
 
@@ -1708,7 +1708,8 @@ def Tosa_TileOp : Tosa_InferShapedTypeOp<"tile"> {
 //===----------------------------------------------------------------------===//
 // Operator: transpose
 //===----------------------------------------------------------------------===//
-def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose"> {
+def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose",
+                [DeclareOpInterfaceMethods<ReifyRankedShapedTypeOpInterface>]> {
   let summary = "Transpose operator";
 
   let description = [{
@@ -1835,9 +1836,9 @@ def Tosa_CastOp: Tosa_Op<"cast", [Pure,
 
     | Mode                     | Input   | Output  |
     |--------------------------|---------|---------|
-    | signed 8 to bool         | int8    | Boolean | 
-    | signed 16 to bool        | int16   | Boolean | 
-    | signed 32 to bool        | int32   | Boolean | 
+    | signed 8 to bool         | int8    | Boolean |
+    | signed 16 to bool        | int16   | Boolean |
+    | signed 32 to bool        | int32   | Boolean |
     | bool to 8                | Boolean | int8    |
     | bool to 16               | Boolean | int16   |
     | bool to 32               | Boolean | int32   |
@@ -1851,8 +1852,8 @@ def Tosa_CastOp: Tosa_Op<"cast", [Pure,
     | float to signed 16       | float   | int16   |
     | signed 8 to float        | int8    | float   |
     | signed 16 to float       | int16   | float   |
-    | float 32 to float 64     | float32 | float64 | 
-    | float 64 to float 32     | float64 | float32 | 
+    | float 32 to float 64     | float32 | float64 |
+    | float 64 to float 32     | float64 | float32 |
   }];
 
   let arguments = (ins
diff --git a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
index ae9824f728da..e8f6edc3f133 100644
--- a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
@@ -62,7 +62,7 @@ getReassociationIndicesAttribute(OpBuilder &b,
 
 /// Convert Array<Array<AffineExpr>> to Array<Array<int64_t>>.
 SmallVector<ReassociationIndices, 2> convertReassociationMapsToIndices(
-    OpBuilder &b, ArrayRef<ReassociationExprs> reassociationExprs);
+    ArrayRef<ReassociationExprs> reassociationExprs);
 
 /// Return the reassociations maps to use to reshape given the source type and
 /// the target type when possible. Return std::nullopt when this computation
@@ -140,14 +140,11 @@ static LogicalResult verifyReshapeLikeTypes(Op op, T expandedType,
       op.getReassociationIndices(), isExpansion);
 }
 
-/// Verify that shapes of the reshaped types using following rules
-/// 1) if a dimension in the collapsed type is static, then the corresponding
-///    dimensions in the expanded shape should be
+/// Verify that shapes of the reshaped types using following rule:
+/// if a dimension in the collapsed type is static, then the corresponding
+/// dimensions in the expanded shape should be
 ///    a) static
 ///    b) the product should be same as the collaped shape.
-/// 2) if a dimension in the collaped type is dynamic, one and only one of the
-///    corresponding dimensions in the expanded type should be dynamic. This
-///    rule is only needed with reshape operations that are expanding.
 LogicalResult reshapeLikeShapesAreCompatible(
     function_ref<LogicalResult(const Twine &)> emitError,
     ArrayRef<int64_t> collapsedShape, ArrayRef<int64_t> expandedShape,
@@ -156,9 +153,11 @@ LogicalResult reshapeLikeShapesAreCompatible(
 /// Returns true iff the type is a MemRefType and has a non-identity layout.
 bool hasNonIdentityLayout(Type type);
 
+enum class ReshapeOpKind { kExpand, kCollapse };
+
 /// Pattern to collapse producer/consumer reshape ops that are both collapsing
 /// dimensions or are both expanding dimensions.
-template <typename ReshapeOpTy>
+template <typename ReshapeOpTy, ReshapeOpKind opKind>
 struct ComposeReassociativeReshapeOps : public OpRewritePattern<ReshapeOpTy> {
   using OpRewritePattern<ReshapeOpTy>::OpRewritePattern;
   LogicalResult matchAndRewrite(ReshapeOpTy reshapeOp,
@@ -181,8 +180,18 @@ struct ComposeReassociativeReshapeOps : public OpRewritePattern<ReshapeOpTy> {
                                     rewriter.getContext());
     if (!reassociationIndices)
       return failure();
-    rewriter.replaceOpWithNewOp<ReshapeOpTy>(
-        reshapeOp, resultType, srcReshapeOp.getSrc(), *reassociationIndices);
+
+    if constexpr (opKind == ReshapeOpKind::kExpand) {
+      SmallVector<OpFoldResult> outputShape(
+          getMixedValues(reshapeOp.getStaticOutputShape(),
+                         reshapeOp.getOutputShape(), rewriter));
+      rewriter.replaceOpWithNewOp<ReshapeOpTy>(
+          reshapeOp, resultType, srcReshapeOp.getSrc(), *reassociationIndices,
+          outputShape);
+    } else {
+      rewriter.replaceOpWithNewOp<ReshapeOpTy>(
+          reshapeOp, resultType, srcReshapeOp.getSrc(), *reassociationIndices);
+    }
     return success();
   }
 };
@@ -215,7 +224,8 @@ struct ComposeReassociativeReshapeOps : public OpRewritePattern<ReshapeOpTy> {
 //
 /// When `rank(srcType) < rank(resultType)`, then we just swap `reassociation_1`
 /// `reassociation_2` and produce `expand_shape`.
-template <typename CollapseOpTy, typename ExpandOpTy, typename CastOpTy>
+template <typename CollapseOpTy, typename ExpandOpTy, typename CastOpTy,
+          typename DimOpTy, typename TensorTy>
 struct ComposeCollapseOfExpandOp : public OpRewritePattern<CollapseOpTy> {
   using OpRewritePattern<CollapseOpTy>::OpRewritePattern;
   LogicalResult matchAndRewrite(CollapseOpTy collapseOp,
@@ -322,8 +332,11 @@ struct ComposeExpandOfCollapseOp : public OpRewritePattern<ExpandOpTy> {
     if (!composedReassociation)
       return failure();
 
+    SmallVector<OpFoldResult> outputShape(getMixedValues(
+        expandOp.getStaticOutputShape(), expandOp.getOutputShape(), rewriter));
     rewriter.replaceOpWithNewOp<ExpandOpTy>(
-        expandOp, resultType, collapseOp.getSrc(), *composedReassociation);
+        expandOp, resultType, collapseOp.getSrc(), *composedReassociation,
+        outputShape);
     return success();
   }
 
diff --git a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
index 20f019666a2e..594bcf5dbb39 100644
--- a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
@@ -125,9 +125,8 @@ SmallVector<OpFoldResult> getMixedValues(ArrayRef<int64_t> staticValues,
 /// Decompose a vector of mixed static or dynamic values into the
 /// corresponding pair of arrays. This is the inverse function of
 /// `getMixedValues`.
-std::pair<ArrayAttr, SmallVector<Value>>
-decomposeMixedValues(Builder &b,
-                     const SmallVectorImpl<OpFoldResult> &mixedValues);
+std::pair<SmallVector<int64_t>, SmallVector<Value>>
+decomposeMixedValues(const SmallVectorImpl<OpFoldResult> &mixedValues);
 
 /// Helper to sort `values` according to matching `keys`.
 SmallVector<Value>
diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
index 8a57c6094c41..030be328e97f 100644
--- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
+++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
@@ -194,7 +194,7 @@ bool isLinearizableVector(VectorType type);
 /// for each dimension of the passed in tensor.
 Value createReadOrMaskedRead(OpBuilder &builder, Location loc, Value source,
                              ArrayRef<int64_t> readShape, Value padValue,
-                             bool useInBoundsInsteadOfMasking = true);
+                             bool useInBoundsInsteadOfMasking);
 
 /// Returns success if `inputVectorSizes` is a valid masking configuraion for
 /// given `shape`, i.e., it meets:
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
index f1740e9ed929..3f8cac4dc07c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
@@ -2,12 +2,12 @@ add_mlir_dialect(XeGPU xegpu)
 add_mlir_doc(XeGPU XeGPU Dialects/ -gen-dialect-doc -dialect=xegpu)
 
 set(LLVM_TARGET_DEFINITIONS XeGPU.td)
-mlir_tablegen(XeGPUAttrs.h.inc -gen-attrdef-decls)
-mlir_tablegen(XeGPUAttrs.cpp.inc -gen-attrdef-defs)
+mlir_tablegen(XeGPUAttrs.h.inc -gen-attrdef-decls -attrdefs-dialect=xegpu)
+mlir_tablegen(XeGPUAttrs.cpp.inc -gen-attrdef-defs -attrdefs-dialect=xegpu)
 add_public_tablegen_target(MLIRXeGPUAttrsIncGen)
 add_dependencies(mlir-headers MLIRXeGPUAttrsIncGen)
 
-set(LLVM_TARGET_DEFINITIONS XeGPU.td)
+set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td)
 mlir_tablegen(XeGPUEnums.h.inc -gen-enum-decls)
 mlir_tablegen(XeGPUEnums.cpp.inc -gen-enum-defs)
 add_public_tablegen_target(MLIRXeGPUEnumsIncGen)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index eca9255ff397..7ac0cf77fe59 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -10,6 +10,7 @@
 #define MLIR_DIALECT_XEGPU_IR_XEGPU_H
 
 #include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/TypeUtilities.h"
@@ -19,7 +20,7 @@
 
 namespace mlir {
 namespace xegpu {
-// placeholder
+class TensorDescType;
 } // namespace xegpu
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 6579d07ec262..f3ca09a6a68e 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -10,6 +10,7 @@
 #define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
 
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/AttrTypeBase.td"
 include "mlir/IR/EnumAttr.td"
 
 class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
@@ -98,4 +99,21 @@ def XeGPU_CacheHintAttr
     let assemblyFormat = "`<` $value `>`";
 }
 
+def XeGPU_FenceScopeWorkgroup: I32EnumAttrCase<"Workgroup", 0, "workgroup">;
+def XeGPU_FenceScopeGPU: I32EnumAttrCase<"GPU", 1, "gpu">;
+def XeGPU_FenceScope: I32EnumAttr<"FenceScope",
+      "The enumeration for the scope of fence operation.",
+      [XeGPU_FenceScopeWorkgroup, XeGPU_FenceScopeGPU]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_FenceScopeAttr:
+  EnumAttr<XeGPU_Dialect, XeGPU_FenceScope, "fence_scope"> {
+    let summary = [{Describes the scope of fence.
+                    "workgroup" means that the scope is within each work group.
+                    "gpu" means the scope is across work groups within the gpu.}];
+    let assemblyFormat = "$value";
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
 \ No newline at end of file
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index c2f09319c790..765f218f95d2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -17,12 +17,14 @@ def XeGPU_Dialect : Dialect {
     let summary = "The XeGPU dialect that models Intel GPU's ISA";
     let description = [{
       The XeGPU dialect models Intel Xe ISA semantics but works at vector and
-      TensorDesc data type. It provides 1:1 mappings to match Xe instructions 
+      TensorDesc data type. It provides 1:1 mappings to match Xe instructions
       like DPAS and 2D block load. The matrix size being processed at this level
       exactly matches the hardware instructions or the intrinsic supported by
       the lower-level GPU compiler.
     }];
 
+    let dependentDialects = ["arith::ArithDialect"];
+
     let useDefaultTypePrinterParser = true;
     let useDefaultAttributePrinterParser = true;
 }
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index c6f7f83441b9..e477d9a0ca3f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -9,7 +9,7 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
 #define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
 
-include "mlir/IR/AttrTypeBase.td"
+include "mlir/Dialect/Arith/IR/ArithBase.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
@@ -36,7 +36,7 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
 
     static ::mlir::ParseResult parseProperties(::mlir::OpAsmParser &parser,
                                      ::mlir::OperationState &result) {
-      if (mlir::succeeded(parser.parseLess())) {
+      if (mlir::succeeded(parser.parseOptionalLess())) {
         if (parser.parseAttribute(result.propertiesAttr) || parser.parseGreater())
           return failure();
       }
@@ -164,10 +164,10 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
     /// source operand. They overide static shape from source memref type.
     ArrayRef<int64_t> getStaticSizes() {
       auto attr = getConstShapeAttr();
-      if (getSourceType().isa<IntegerType>() || attr)
+      if (llvm::isa<IntegerType>(getSourceType()) || attr)
         return attr;
 
-      auto memrefType = getSourceType().dyn_cast<MemRefType>();
+      auto memrefType = llvm::dyn_cast<MemRefType>(getSourceType());
       assert(memrefType && "Incorrect use of getStaticSizes");
       return memrefType.getShape();
     }
@@ -179,10 +179,10 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
     /// source operand. They overide static strides from source memref type.
     ArrayRef<int64_t> getStaticStrides() {
       auto attr = getConstStridesAttr();
-      if (getSourceType().isa<IntegerType>() || attr)
+      if (llvm::isa<IntegerType>(getSourceType()) || attr)
         return attr;
 
-      auto memrefType = getSourceType().dyn_cast<MemRefType>();
+      auto memrefType = llvm::dyn_cast<MemRefType>(getSourceType());
       assert(memrefType && "Incorrect use of getStaticStrides");
       auto [strides, offset] = getStridesAndOffset(memrefType);
       // reuse the storage of ConstStridesAttr since strides from
@@ -196,7 +196,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
     /// `static_shape` and `static_strides` attributes.
     std::array<unsigned, 3> getArrayAttrMaxRanks() {
       unsigned rank;
-      if (auto ty = getSourceType().dyn_cast<MemRefType>()) {
+      if (auto ty = llvm::dyn_cast<MemRefType>(getSourceType())) {
         rank = ty.getRank();
       } else {
         rank = (unsigned)getMixedOffsets().size();
@@ -254,7 +254,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "Tensor
     a block of data from memory to register. It takes a set of optional cache
     hints for each level of cache, L1, L2 and L3. If hardware does not have a
     correspoding cache, Corresponding cache hint attribute will be masked.
-    vnni transform is an hardware feature for Intel GPU, which is used to
+    VNNI transformation is an hardware feature for Intel GPU, which is used to
     do data packing during the load for B operand of matrix operation, if
     the bit width of the data type is less then 32 bits, e.g., fp16. And
     transpose is another Intel hardware feature, which will do transpose
@@ -425,10 +425,6 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
     %0 = memref.alloc() : memref<1024xf32>
     %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
     ```
-
-
-
-
   }];
 
   let arguments = (ins XeGPU_BaseAddrType: $source,
@@ -663,4 +659,153 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
   }];
 }
 
+def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]> {
+  let summary = "It performs mma computation";
+
+  let description = [{DPAS performs matrix multiplication on matrix A of `mxk`
+    size, B of `kxn` size, and accumulate on matrix C of `mxn` to the same size
+    matrix , `m=8`, `n=16` and `k=8 * 32/bit_width_of_elem_type`. So for fp16
+    data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
+    and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
+    also requires A and B to be loaded with the required data layout. Specially,
+    VNNI layout is required for B operand. It is achieved via setting `vnni_axis = 0`
+    of the corresponding `load_nd` operator. To keep both operands as 3D vector,
+    operand A is loaded via setting `vnni_axis = 1` without impacting the
+    physical layouts change in register. Due to the VNNI transformation, A and B operands
+    are represented as 3D vector, with the last dimension representing the VNNI factor,
+    which is computed as `32/bit_width_of_elem_type`. Therefore, `A: vector<8x16xf16>`
+    is represented as `A: vector<8x8x2xf16>`, and `B: vector<16x16xf16>` is
+    represented as `B: vector<8x16x2xf16>`.
+
+    Note: on PVC, the hardware can perform load with VNNI transformation when data
+          element type is 16-bit or lower precision, taking 2 or 4 elements from
+          the first dimension and inserted into the newly added innermost dimension.
+  }];
+
+  let arguments = (ins
+    XeGPU_DpasOpType : $lhs,
+    XeGPU_DpasOpType : $rhs,
+    Optional<XeGPU_Vector2DType>: $acc);
+  let results = (outs XeGPU_Vector2DType: $result);
+
+  let extraClassDeclaration = [{
+    VectorType getLhsType() {
+      return getLhs().getType();
+    }
+
+    VectorType getRhsType() {
+      return getRhs().getType();
+    }
+
+    VectorType getAccType() {
+      if (getAcc())
+        return getAcc().getType();
+      return {};
+    }
+
+    VectorType getResultType() {
+      return getResult().getType();
+    }
+  }];
+
+  let assemblyFormat = [{
+    $lhs `,` $rhs (`,` $acc^)? attr-dict `:` type($lhs)`,` type($rhs) (`,` type($acc)^)?  `->` type($result)
+  }];
+
+  let hasVerifier = 1;
+}
+
+def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
+      AllElementTypesMatch<["tensorDesc", "value", "result"]>,
+      AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> {
+  let summary = "Atomic ready-modify-write operation on the TensorDesc. ";
+
+  let description = [{
+    The `xegpu.atomic_rmw` operation provides a way to perform a read-modify-write
+    operation on the region described by the `TensorDesc` free from data races. The
+    `kind` enumeration specifies the modification to be performed, The `mask` operand
+    has the same shape with `TensorDesc`, and is used to enable or disable specific
+    data points of the `TensorDesc`. The `value` operand represents the new value to
+    be applied during the modification.
+  }];
+
+  let arguments = (ins
+    AtomicRMWKindAttr:$kind,
+    XeGPU_TensorDesc:$tensorDesc,
+    XeGPU_MaskType:$mask,
+    XeGPU_ValueType:$value);
+
+  let results = (outs XeGPU_ValueType:$result);
+
+  let assemblyFormat = [{
+    $kind $tensorDesc `,` $mask `,` $value attr-dict `:`
+    type($tensorDesc) `,` type($mask) `,` type($value) `->` type($result)
+  }];
+}
+
+def XeGPU_AllocNbarrierOp: XeGPU_Op<"alloc_nbarrier", []> {
+  let summary = "It allocates a set of named barriers.";
+  let description = [{AllocNbarrier is to create a set of named barriers as
+  specified by `nbarrier_num`. Named barriers are workgroup level resources,
+    and are shared by all threads in the workgroup. For example, there are
+    up to 32 barriers (range 0-31) for each XeCore on PVC. A typical use case
+    is that a workgroup is partitioned into N subgroups of threads (N <= 32),
+    and each subgroup coordinating their work with a separate barrier with id
+    range from 0 to N respectively.}];
+  let arguments = (ins I64Attr: $nbarrier_num);
+  let assemblyFormat = "$nbarrier_num attr-dict";
+}
+
+def XeGPU_InitNbarrierOp: XeGPU_Op<"init_nbarrier", []> {
+  let summary = "It assigns a named barrier to the current thread.";
+  let description = [{InitNbarrierOp assigns the named barrier with the specified
+      barrier ID (0~31) to the current thread. Multiple threads may bind to the
+      same named barrier, and the `participant_thread_num` specifies the total
+      number of threads associated with the nbarrier. It returns an object of
+      NbarrierType representing the barrier}];
+
+  let arguments = (ins I8: $nbarrier_id,
+                       I8: $participant_thread_num);
+  let results = (outs XeGPU_Nbarrier: $result);
+  let assemblyFormat = [{
+    $nbarrier_id `,` $participant_thread_num attr-dict `:`
+    type($nbarrier_id) `,` type($participant_thread_num) `->` qualified(type($result))
+  }];
+}
+
+def XeGPU_NbarrierArriveOp: XeGPU_Op<"nbarrier_arrive", []> {
+  let summary = "It signals the arrival at the named barrier.";
+  let description = [{NbarrierArriveOp signals the hardware (or other threads)
+    that the current thread has produced its data for the consumer threads. When
+    the hardware signalled by `participant_thread_num` threads for the named barrier,
+    it will notify the threads waiting for the named barrier to continue their work.}];
+
+  let arguments = (ins XeGPU_Nbarrier: $nbarrier);
+  let assemblyFormat = [{ $nbarrier attr-dict `:` qualified(type($nbarrier))}];
+}
+
+def XeGPU_NbarrierWaitOp: XeGPU_Op<"nbarrier_wait", []> {
+  let summary = "It waits for a named barrier.";
+  let description = [{NbarrierWaitOp signals the hardware which named barrier
+    the current thread is waiting for, such that it can get notified when the
+    named barrier is completed.}];
+  let arguments = (ins XeGPU_Nbarrier: $nbarrier);
+  let assemblyFormat = [{ $nbarrier attr-dict `:` qualified(type($nbarrier)) }];
+}
+
+def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
+  let summary = "It synchronizes memory accesses.";
+  let description = [{It synchronizes the memory access between
+    write and following read or write.
+    1. `Memory_kind` describes the memory kind. "global" means the global memory,
+        "slm" means the share local memory.
+    2. `Fence_scope` describes the scope of fence. "Workgroup" means that the scope would be
+        within each workgroup. "GPU" means the scope would be across workgroups within the GPU.
+  }];
+  let arguments = (ins XeGPU_MemoryScopeAttr: $memory_kind,
+                       XeGPU_FenceScopeAttr: $fence_scope);
+  let assemblyFormat = [{`memory_kind` `=` `` $memory_kind `,` `fence_scope` `=` `` $fence_scope attr-dict}];
+  let extraClassDeclaration = extraBaseClassDeclaration;
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 4cd4e5411653..bab0e4afb1e5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -151,4 +151,15 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
 
 }
 
+
+def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> {
+  let summary = "!xegpu.nbarrier a custom XeGPU type representing a barrier.";
+
+  let extraClassDeclaration = [{
+    static NbarrierType get(mlir::MLIRContext *context) {
+      return Base::get(context);
+    };
+  }];
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/include/mlir/IR/Attributes.h b/mlir/include/mlir/IR/Attributes.h
index cc0cee6a3118..8a077865b51b 100644
--- a/mlir/include/mlir/IR/Attributes.h
+++ b/mlir/include/mlir/IR/Attributes.h
@@ -50,14 +50,19 @@ public:
   /// Casting utility functions. These are deprecated and will be removed,
   /// please prefer using the `llvm` namespace variants instead.
   template <typename... Tys>
+  [[deprecated("Use mlir::isa<U>() instead")]]
   bool isa() const;
   template <typename... Tys>
+  [[deprecated("Use mlir::isa_and_nonnull<U>() instead")]]
   bool isa_and_nonnull() const;
   template <typename U>
+  [[deprecated("Use mlir::dyn_cast<U>() instead")]]
   U dyn_cast() const;
   template <typename U>
+  [[deprecated("Use mlir::dyn_cast_or_null<U>() instead")]]
   U dyn_cast_or_null() const;
   template <typename U>
+  [[deprecated("Use mlir::cast<U>() instead")]]
   U cast() const;
 
   /// Return a unique identifier for the concrete attribute type. This is used
diff --git a/mlir/include/mlir/IR/BuiltinLocationAttributes.td b/mlir/include/mlir/IR/BuiltinLocationAttributes.td
index dfcc180071f7..5a72404dea15 100644
--- a/mlir/include/mlir/IR/BuiltinLocationAttributes.td
+++ b/mlir/include/mlir/IR/BuiltinLocationAttributes.td
@@ -228,7 +228,8 @@ def OpaqueLoc : Builtin_LocationAttr<"OpaqueLoc"> {
     template <typename T> static T getUnderlyingLocation(Location location) {
       assert(isa<T>(location));
       return reinterpret_cast<T>(
-          location.cast<mlir::OpaqueLoc>().getUnderlyingLocation());
+          mlir::cast<mlir::OpaqueLoc>(static_cast<LocationAttr>(location))
+              .getUnderlyingLocation());
     }
 
     /// Returns a pointer to some data structure that opaque location stores.
@@ -237,15 +238,17 @@ def OpaqueLoc : Builtin_LocationAttr<"OpaqueLoc"> {
     template <typename T>
     static T getUnderlyingLocationOrNull(Location location) {
       return isa<T>(location)
-                 ? reinterpret_cast<T>(
-                       location.cast<mlir::OpaqueLoc>().getUnderlyingLocation())
-                 : T(nullptr);
+                ? reinterpret_cast<T>(mlir::cast<mlir::OpaqueLoc>(
+                                          static_cast<LocationAttr>(location))
+                                          .getUnderlyingLocation())
+                : T(nullptr);
     }
 
     /// Checks whether provided location is opaque location and contains a
     /// pointer to an object of particular type.
     template <typename T> static bool isa(Location location) {
-      auto opaque_loc = location.dyn_cast<OpaqueLoc>();
+      auto opaque_loc =
+          mlir::dyn_cast<OpaqueLoc>(static_cast<LocationAttr>(location));
       return opaque_loc && opaque_loc.getUnderlyingTypeID() == TypeID::get<T>();
     }
   }];
diff --git a/mlir/include/mlir/IR/Location.h b/mlir/include/mlir/IR/Location.h
index aa8314f38cdf..423b4d19b5b9 100644
--- a/mlir/include/mlir/IR/Location.h
+++ b/mlir/include/mlir/IR/Location.h
@@ -78,14 +78,17 @@ public:
 
   /// Type casting utilities on the underlying location.
   template <typename U>
+  [[deprecated("Use mlir::isa<U>() instead")]]
   bool isa() const {
     return llvm::isa<U>(*this);
   }
   template <typename U>
+  [[deprecated("Use mlir::dyn_cast<U>() instead")]]
   U dyn_cast() const {
     return llvm::dyn_cast<U>(*this);
   }
   template <typename U>
+  [[deprecated("Use mlir::cast<U>() instead")]]
   U cast() const {
     return llvm::cast<U>(*this);
   }
diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h
index cdb75a3777ad..e661bb87a27e 100644
--- a/mlir/include/mlir/IR/OperationSupport.h
+++ b/mlir/include/mlir/IR/OperationSupport.h
@@ -1037,8 +1037,11 @@ public:
     addAttribute(StringAttr::get(getContext(), name), attr);
   }
 
-  /// Add an attribute with the specified name.
+  /// Add an attribute with the specified name. `name` and `attr` must not be
+  /// null.
   void addAttribute(StringAttr name, Attribute attr) {
+    assert(name && "attribute name cannot be null");
+    assert(attr && "attribute cannot be null");
     attributes.append(name, attr);
   }
 
@@ -1047,7 +1050,11 @@ public:
     attributes.append(newAttributes);
   }
 
-  void addSuccessors(Block *successor) { successors.push_back(successor); }
+  /// Adds a successor to the operation sate. `successor` must not be null.
+  void addSuccessors(Block *successor) {
+    assert(successor && "successor cannot be null");
+    successors.push_back(successor);
+  }
   void addSuccessors(BlockRange newSuccessors);
 
   /// Create a region that should be attached to the operation.  These regions
diff --git a/mlir/include/mlir/IR/Types.h b/mlir/include/mlir/IR/Types.h
index a89e13b625bf..65824531fdc9 100644
--- a/mlir/include/mlir/IR/Types.h
+++ b/mlir/include/mlir/IR/Types.h
@@ -97,14 +97,19 @@ public:
   bool operator!() const { return impl == nullptr; }
 
   template <typename... Tys>
+  [[deprecated("Use mlir::isa<U>() instead")]]
   bool isa() const;
   template <typename... Tys>
+  [[deprecated("Use mlir::isa_and_nonnull<U>() instead")]]
   bool isa_and_nonnull() const;
   template <typename U>
+  [[deprecated("Use mlir::dyn_cast<U>() instead")]]
   U dyn_cast() const;
   template <typename U>
+  [[deprecated("Use mlir::dyn_cast_or_null<U>() instead")]]
   U dyn_cast_or_null() const;
   template <typename U>
+  [[deprecated("Use mlir::cast<U>() instead")]]
   U cast() const;
 
   /// Return a unique identifier for the concrete type. This is used to support
diff --git a/mlir/include/mlir/IR/Value.h b/mlir/include/mlir/IR/Value.h
index cdbc6cc37436..a7344c64e673 100644
--- a/mlir/include/mlir/IR/Value.h
+++ b/mlir/include/mlir/IR/Value.h
@@ -98,25 +98,25 @@ public:
   constexpr Value(detail::ValueImpl *impl = nullptr) : impl(impl) {}
 
   template <typename U>
-  [[deprecated("Use isa<U>() instead")]]
+  [[deprecated("Use mlir::isa<U>() instead")]]
   bool isa() const {
     return llvm::isa<U>(*this);
   }
 
   template <typename U>
-  [[deprecated("Use dyn_cast<U>() instead")]]
+  [[deprecated("Use mlir::dyn_cast<U>() instead")]]
   U dyn_cast() const {
     return llvm::dyn_cast<U>(*this);
   }
 
   template <typename U>
-  [[deprecated("Use dyn_cast_or_null<U>() instead")]]
+  [[deprecated("Use mlir::dyn_cast_or_null<U>() instead")]]
   U dyn_cast_or_null() const {
     return llvm::dyn_cast_or_null<U>(*this);
   }
 
   template <typename U>
-  [[deprecated("Use cast<U>() instead")]]
+  [[deprecated("Use mlir::cast<U>() instead")]]
   U cast() const {
     return llvm::cast<U>(*this);
   }
diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h
index c4d788cf8ed3..d9db21073e15 100644
--- a/mlir/include/mlir/InitAllDialects.h
+++ b/mlir/include/mlir/InitAllDialects.h
@@ -45,6 +45,7 @@
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/AllInterfaces.h"
+#include "mlir/Dialect/Linalg/Transforms/RuntimeOpVerification.h"
 #include "mlir/Dialect/MLProgram/IR/MLProgram.h"
 #include "mlir/Dialect/MLProgram/Transforms/BufferizableOpInterfaceImpl.h"
 #include "mlir/Dialect/MPI/IR/MPI.h"
@@ -161,6 +162,7 @@ inline void registerAllDialects(DialectRegistry &registry) {
   cf::registerBufferDeallocationOpInterfaceExternalModels(registry);
   gpu::registerBufferDeallocationOpInterfaceExternalModels(registry);
   linalg::registerAllDialectInterfaceImplementations(registry);
+  linalg::registerRuntimeVerifiableOpInterfaceExternalModels(registry);
   memref::registerAllocationOpInterfaceExternalModels(registry);
   memref::registerBufferViewFlowOpInterfaceExternalModels(registry);
   memref::registerRuntimeVerifiableOpInterfaceExternalModels(registry);
diff --git a/mlir/include/mlir/Interfaces/RuntimeVerifiableOpInterface.td b/mlir/include/mlir/Interfaces/RuntimeVerifiableOpInterface.td
index d5f11d00cc3d..6fd0df59d9d2 100644
--- a/mlir/include/mlir/Interfaces/RuntimeVerifiableOpInterface.td
+++ b/mlir/include/mlir/Interfaces/RuntimeVerifiableOpInterface.td
@@ -35,6 +35,12 @@ def RuntimeVerifiableOpInterface : OpInterface<"RuntimeVerifiableOpInterface"> {
                     "::mlir::Location":$loc)
     >,
   ];
+
+  let extraClassDeclaration = [{
+    /// Generate the error message that will be printed to the user when 
+    /// verification fails.
+    static std::string generateErrorMessage(Operation *op, const std::string &msg);
+  }];
 }
 
 #endif // MLIR_INTERFACES_RUNTIMEVERIFIABLEOPINTERFACE
diff --git a/mlir/include/mlir/TableGen/CodeGenHelpers.h b/mlir/include/mlir/TableGen/CodeGenHelpers.h
index dd17a44c889b..c263c69c53d1 100644
--- a/mlir/include/mlir/TableGen/CodeGenHelpers.h
+++ b/mlir/include/mlir/TableGen/CodeGenHelpers.h
@@ -99,8 +99,14 @@ private:
 ///
 class StaticVerifierFunctionEmitter {
 public:
+  /// Create a constraint uniquer with a unique prefix derived from the record
+  /// keeper with an optional tag.
   StaticVerifierFunctionEmitter(raw_ostream &os,
-                                const llvm::RecordKeeper &records);
+                                const llvm::RecordKeeper &records,
+                                StringRef tag = "");
+
+  /// Collect and unique all the constraints used by operations.
+  void collectOpConstraints(ArrayRef<llvm::Record *> opDefs);
 
   /// Collect and unique all compatible type, attribute, successor, and region
   /// constraints from the operations in the file and emit them at the top of
@@ -108,7 +114,7 @@ public:
   ///
   /// Constraints that do not meet the restriction that they can only reference
   /// `$_self` and `$_op` are not uniqued.
-  void emitOpConstraints(ArrayRef<llvm::Record *> opDefs, bool emitDecl);
+  void emitOpConstraints(ArrayRef<llvm::Record *> opDefs);
 
   /// Unique all compatible type and attribute constraints from a pattern file
   /// and emit them at the top of the generated file.
@@ -177,8 +183,6 @@ private:
   /// Emit pattern constraints.
   void emitPatternConstraints();
 
-  /// Collect and unique all the constraints used by operations.
-  void collectOpConstraints(ArrayRef<llvm::Record *> opDefs);
   /// Collect and unique all pattern constraints.
   void collectPatternConstraints(ArrayRef<DagLeaf> constraints);
 
diff --git a/mlir/include/mlir/Tools/PDLL/AST/Nodes.h b/mlir/include/mlir/Tools/PDLL/AST/Nodes.h
index 5515ee7548b5..aed2562e4d30 100644
--- a/mlir/include/mlir/Tools/PDLL/AST/Nodes.h
+++ b/mlir/include/mlir/Tools/PDLL/AST/Nodes.h
@@ -597,7 +597,7 @@ public:
   }
 
   /// Return the range result type of this expression.
-  RangeType getType() const { return Base::getType().cast<RangeType>(); }
+  RangeType getType() const { return mlir::cast<RangeType>(Base::getType()); }
 
 private:
   RangeExpr(SMRange loc, RangeType type, unsigned numElements)
@@ -630,7 +630,7 @@ public:
   }
 
   /// Return the tuple result type of this expression.
-  TupleType getType() const { return Base::getType().cast<TupleType>(); }
+  TupleType getType() const { return mlir::cast<TupleType>(Base::getType()); }
 
 private:
   TupleExpr(SMRange loc, TupleType type) : Base(loc, type) {}
diff --git a/mlir/include/mlir/Tools/PDLL/AST/Types.h b/mlir/include/mlir/Tools/PDLL/AST/Types.h
index 03252e9f6620..89c8e193ddc3 100644
--- a/mlir/include/mlir/Tools/PDLL/AST/Types.h
+++ b/mlir/include/mlir/Tools/PDLL/AST/Types.h
@@ -64,23 +64,28 @@ public:
 
   /// Provide type casting support.
   template <typename U>
+  [[deprecated("Use mlir::isa<U>() instead")]]
   bool isa() const {
     assert(impl && "isa<> used on a null type.");
     return U::classof(*this);
   }
   template <typename U, typename V, typename... Others>
+  [[deprecated("Use mlir::isa<U>() instead")]]
   bool isa() const {
     return isa<U>() || isa<V, Others...>();
   }
   template <typename U>
+  [[deprecated("Use mlir::dyn_cast<U>() instead")]]
   U dyn_cast() const {
     return isa<U>() ? U(impl) : U(nullptr);
   }
   template <typename U>
+  [[deprecated("Use mlir::dyn_cast_or_null<U>() instead")]]
   U dyn_cast_or_null() const {
     return (impl && isa<U>()) ? U(impl) : U(nullptr);
   }
   template <typename U>
+  [[deprecated("Use mlir::cast<U>() instead")]]
   U cast() const {
     assert(isa<U>());
     return U(impl);
@@ -323,6 +328,29 @@ struct DenseMapInfo<mlir::pdll::ast::Type> {
     return lhs == rhs;
   }
 };
+
+/// Add support for llvm style casts.
+/// We provide a cast between To and From if From is mlir::pdll::ast::Type or
+/// derives from it
+template <typename To, typename From>
+struct CastInfo<
+    To, From,
+    std::enable_if_t<
+        std::is_same_v<mlir::pdll::ast::Type, std::remove_const_t<From>> ||
+        std::is_base_of_v<mlir::pdll::ast::Type, From>>>
+    : NullableValueCastFailed<To>,
+      DefaultDoCastIfPossible<To, From, CastInfo<To, From>> {
+  static inline bool isPossible(mlir::pdll::ast::Type ty) {
+    /// Return a constant true instead of a dynamic true when casting to self or
+    /// up the hierarchy.
+    if constexpr (std::is_base_of_v<To, From>) {
+      return true;
+    } else {
+      return To::classof(ty);
+    };
+  }
+  static inline To doCast(mlir::pdll::ast::Type ty) { return To(ty.getImpl()); }
+};
 } // namespace llvm
 
 #endif // MLIR_TOOLS_PDLL_AST_TYPES_H_
diff --git a/mlir/include/mlir/Tools/lsp-server-support/Transport.h b/mlir/include/mlir/Tools/lsp-server-support/Transport.h
index ce742be7a941..047d174234df 100644
--- a/mlir/include/mlir/Tools/lsp-server-support/Transport.h
+++ b/mlir/include/mlir/Tools/lsp-server-support/Transport.h
@@ -15,6 +15,7 @@
 #ifndef MLIR_TOOLS_LSPSERVERSUPPORT_TRANSPORT_H
 #define MLIR_TOOLS_LSPSERVERSUPPORT_TRANSPORT_H
 
+#include "mlir/Support/DebugStringHelper.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Tools/lsp-server-support/Logging.h"
@@ -100,6 +101,18 @@ using Callback = llvm::unique_function<void(llvm::Expected<T>)>;
 template <typename T>
 using OutgoingNotification = llvm::unique_function<void(const T &)>;
 
+/// An OutgoingRequest<T> is a function used for outgoing requests to send to
+/// the client.
+template <typename T>
+using OutgoingRequest =
+    llvm::unique_function<void(const T &, llvm::json::Value id)>;
+
+/// An `OutgoingRequestCallback` is invoked when an outgoing request to the
+/// client receives a response in turn. It is passed the original request's ID,
+/// as well as the result JSON.
+using OutgoingRequestCallback =
+    std::function<void(llvm::json::Value, llvm::Expected<llvm::json::Value>)>;
+
 /// A handler used to process the incoming transport messages.
 class MessageHandler {
 public:
@@ -147,9 +160,15 @@ public:
                     void (ThisT::*handler)(const Param &)) {
     notificationHandlers[method] = [method, handler,
                                     thisPtr](llvm::json::Value rawParams) {
-      llvm::Expected<Param> param = parse<Param>(rawParams, method, "request");
-      if (!param)
-        return llvm::consumeError(param.takeError());
+      llvm::Expected<Param> param =
+          parse<Param>(rawParams, method, "notification");
+      if (!param) {
+        return llvm::consumeError(
+            llvm::handleErrors(param.takeError(), [](const LSPError &lspError) {
+              Logger::error("JSON parsing error: {0}",
+                            lspError.message.c_str());
+            }));
+      }
       (thisPtr->*handler)(*param);
     };
   }
@@ -164,6 +183,26 @@ public:
     };
   }
 
+  /// Create an OutgoingRequest function that, when called, sends a request with
+  /// the given method via the transport. Should the outgoing request be
+  /// met with a response, the response callback is invoked to handle that
+  /// response.
+  template <typename T>
+  OutgoingRequest<T> outgoingRequest(llvm::StringLiteral method,
+                                     OutgoingRequestCallback callback) {
+    return [&, method, callback](const T &params, llvm::json::Value id) {
+      {
+        std::lock_guard<std::mutex> lock(responseHandlersMutex);
+        responseHandlers.insert(
+            {debugString(id), std::make_pair(method.str(), callback)});
+      }
+
+      std::lock_guard<std::mutex> transportLock(transportOutputMutex);
+      Logger::info("--> {0}({1})", method, id);
+      transport.call(method, llvm::json::Value(params), id);
+    };
+  }
+
 private:
   template <typename HandlerT>
   using HandlerMap = llvm::StringMap<llvm::unique_function<HandlerT>>;
@@ -172,6 +211,14 @@ private:
   HandlerMap<void(llvm::json::Value, Callback<llvm::json::Value>)>
       methodHandlers;
 
+  /// A pair of (1) the original request's method name, and (2) the callback
+  /// function to be invoked for responses.
+  using ResponseHandlerTy = std::pair<std::string, OutgoingRequestCallback>;
+  /// A mapping from request/response ID to response handler.
+  llvm::StringMap<ResponseHandlerTy> responseHandlers;
+  /// Mutex to guard insertion into the response handler map.
+  std::mutex responseHandlersMutex;
+
   JSONTransport &transport;
 
   /// Mutex to guard sending output messages to the transport.
diff --git a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp
index 171faf9e0087..584981cfe99b 100644
--- a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp
+++ b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp
@@ -42,16 +42,19 @@ static void populateDialectSparseTensorSubmodule(const py::module &m) {
           [](py::object cls, std::vector<MlirSparseTensorLevelType> lvlTypes,
              std::optional<MlirAffineMap> dimToLvl,
              std::optional<MlirAffineMap> lvlToDim, int posWidth, int crdWidth,
-             MlirContext context) {
+             std::optional<MlirAttribute> explicitVal,
+             std::optional<MlirAttribute> implicitVal, MlirContext context) {
             return cls(mlirSparseTensorEncodingAttrGet(
                 context, lvlTypes.size(), lvlTypes.data(),
                 dimToLvl ? *dimToLvl : MlirAffineMap{nullptr},
                 lvlToDim ? *lvlToDim : MlirAffineMap{nullptr}, posWidth,
-                crdWidth));
+                crdWidth, explicitVal ? *explicitVal : MlirAttribute{nullptr},
+                implicitVal ? *implicitVal : MlirAttribute{nullptr}));
           },
           py::arg("cls"), py::arg("lvl_types"), py::arg("dim_to_lvl"),
           py::arg("lvl_to_dim"), py::arg("pos_width"), py::arg("crd_width"),
-          py::arg("context") = py::none(),
+          py::arg("explicit_val") = py::none(),
+          py::arg("implicit_val") = py::none(), py::arg("context") = py::none(),
           "Gets a sparse_tensor.encoding from parameters.")
       .def_classmethod(
           "build_level_type",
@@ -98,6 +101,24 @@ static void populateDialectSparseTensorSubmodule(const py::module &m) {
       .def_property_readonly("crd_width",
                              mlirSparseTensorEncodingAttrGetCrdWidth)
       .def_property_readonly(
+          "explicit_val",
+          [](MlirAttribute self) -> std::optional<MlirAttribute> {
+            MlirAttribute ret =
+                mlirSparseTensorEncodingAttrGetExplicitVal(self);
+            if (mlirAttributeIsNull(ret))
+              return {};
+            return ret;
+          })
+      .def_property_readonly(
+          "implicit_val",
+          [](MlirAttribute self) -> std::optional<MlirAttribute> {
+            MlirAttribute ret =
+                mlirSparseTensorEncodingAttrGetImplicitVal(self);
+            if (mlirAttributeIsNull(ret))
+              return {};
+            return ret;
+          })
+      .def_property_readonly(
           "structured_n",
           [](MlirAttribute self) -> unsigned {
             const int lvlRank = mlirSparseTensorEncodingGetLvlRank(self);
diff --git a/mlir/lib/CAPI/Dialect/SparseTensor.cpp b/mlir/lib/CAPI/Dialect/SparseTensor.cpp
index 3ae06f220c52..19171d64d409 100644
--- a/mlir/lib/CAPI/Dialect/SparseTensor.cpp
+++ b/mlir/lib/CAPI/Dialect/SparseTensor.cpp
@@ -44,18 +44,20 @@ bool mlirAttributeIsASparseTensorEncodingAttr(MlirAttribute attr) {
   return isa<SparseTensorEncodingAttr>(unwrap(attr));
 }
 
-MlirAttribute
-mlirSparseTensorEncodingAttrGet(MlirContext ctx, intptr_t lvlRank,
-                                MlirSparseTensorLevelType const *lvlTypes,
-                                MlirAffineMap dimToLvl, MlirAffineMap lvlToDim,
-                                int posWidth, int crdWidth) {
+MlirAttribute mlirSparseTensorEncodingAttrGet(
+    MlirContext ctx, intptr_t lvlRank,
+    MlirSparseTensorLevelType const *lvlTypes, MlirAffineMap dimToLvl,
+    MlirAffineMap lvlToDim, int posWidth, int crdWidth,
+    MlirAttribute explicitVal, MlirAttribute implicitVal) {
   SmallVector<LevelType> cppLvlTypes;
+
   cppLvlTypes.reserve(lvlRank);
   for (intptr_t l = 0; l < lvlRank; ++l)
     cppLvlTypes.push_back(static_cast<LevelType>(lvlTypes[l]));
-  return wrap(SparseTensorEncodingAttr::get(unwrap(ctx), cppLvlTypes,
-                                            unwrap(dimToLvl), unwrap(lvlToDim),
-                                            posWidth, crdWidth));
+
+  return wrap(SparseTensorEncodingAttr::get(
+      unwrap(ctx), cppLvlTypes, unwrap(dimToLvl), unwrap(lvlToDim), posWidth,
+      crdWidth, unwrap(explicitVal), unwrap(implicitVal)));
 }
 
 MlirAffineMap mlirSparseTensorEncodingAttrGetDimToLvl(MlirAttribute attr) {
@@ -91,6 +93,14 @@ int mlirSparseTensorEncodingAttrGetCrdWidth(MlirAttribute attr) {
   return cast<SparseTensorEncodingAttr>(unwrap(attr)).getCrdWidth();
 }
 
+MlirAttribute mlirSparseTensorEncodingAttrGetExplicitVal(MlirAttribute attr) {
+  return wrap(cast<SparseTensorEncodingAttr>(unwrap(attr)).getExplicitVal());
+}
+
+MlirAttribute mlirSparseTensorEncodingAttrGetImplicitVal(MlirAttribute attr) {
+  return wrap(cast<SparseTensorEncodingAttr>(unwrap(attr)).getImplicitVal());
+}
+
 MlirSparseTensorLevelType mlirSparseTensorEncodingAttrBuildLvlType(
     enum MlirSparseTensorLevelFormat lvlFmt,
     const enum MlirSparseTensorLevelPropertyNondefault *properties,
diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
index 4a15976d40c7..c2a83f90bcbe 100644
--- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
+++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
@@ -857,7 +857,7 @@ struct SqrtOpConversion : public OpConversionPattern<complex::SqrtOp> {
     ImplicitLocOpBuilder b(op.getLoc(), rewriter);
 
     auto type = cast<ComplexType>(op.getType());
-    auto elementType = type.getElementType().cast<FloatType>();
+    auto elementType = cast<FloatType>(type.getElementType());
     arith::FastMathFlags fmf = op.getFastMathFlagsAttr().getValue();
 
     auto cst = [&](APFloat v) {
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index b6b85cab5a38..e6ba6e6bc602 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -18,7 +18,6 @@
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Tensor/Utils/Utils.h"
 #include "mlir/Dialect/Tosa/IR/TosaOps.h"
 #include "mlir/Dialect/Tosa/Utils/ConversionUtils.h"
 #include "mlir/Dialect/Utils/ReshapeOpsUtils.h"
@@ -90,8 +89,8 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args,
     return rewriter.create<arith::MulFOp>(loc, resultTypes, args);
   }
 
-  // tosa::DivOp
-  if (isa<tosa::DivOp>(op) && isa<IntegerType>(elementTy))
+  // tosa::IntDivOp
+  if (isa<tosa::IntDivOp>(op) && isa<IntegerType>(elementTy))
     return rewriter.create<arith::DivSIOp>(loc, resultTypes, args);
 
   // tosa::ReciprocalOp
@@ -2568,7 +2567,7 @@ void mlir::tosa::populateTosaToLinalgConversionPatterns(
       PointwiseConverter<tosa::AddOp>,
       PointwiseConverter<tosa::SubOp>,
       PointwiseConverter<tosa::MulOp>,
-      PointwiseConverter<tosa::DivOp>,
+      PointwiseConverter<tosa::IntDivOp>,
       PointwiseConverter<tosa::NegateOp>,
       PointwiseConverter<tosa::PowOp>,
       PointwiseConverter<tosa::ReciprocalOp>,
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 1b9975237c69..fe6bcc1c8b66 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1738,7 +1738,7 @@ struct VectorInterleaveOpLowering
                                          "InterleaveOp not rank 1");
     // If the result is rank 1, then this directly maps to LLVM.
     if (resultType.isScalable()) {
-      rewriter.replaceOpWithNewOp<LLVM::experimental_vector_interleave2>(
+      rewriter.replaceOpWithNewOp<LLVM::vector_interleave2>(
           interleaveOp, typeConverter->convertType(resultType),
           adaptor.getLhs(), adaptor.getRhs());
       return success();
diff --git a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
index caca2ff81964..02d05780a7ac 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
+++ b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
@@ -175,6 +175,7 @@ def MulSIExtendedToMulI :
 def IsScalarOrSplatOne :
     Constraint<And<[
       CPred<"succeeded(getIntOrSplatIntValue($0))">,
+      CPred<"getIntOrSplatIntValue($0)->isStrictlyPositive()">,
       CPred<"*getIntOrSplatIntValue($0) == 1">]>>;
 
 // mulsi_extended(x, 1) -> [x, extsi(cmpi slt, x, 0)]
diff --git a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
index dd04a5996558..54be644a7101 100644
--- a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
@@ -152,6 +152,22 @@ struct FloorDivSIOpConverter : public OpRewritePattern<arith::FloorDivSIOp> {
   }
 };
 
+template <typename OpTy, arith::CmpIPredicate pred>
+struct MaxMinIOpConverter : public OpRewritePattern<OpTy> {
+public:
+  using OpRewritePattern<OpTy>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(OpTy op,
+                                PatternRewriter &rewriter) const final {
+    Value lhs = op.getLhs();
+    Value rhs = op.getRhs();
+
+    Value cmp = rewriter.create<arith::CmpIOp>(op.getLoc(), pred, lhs, rhs);
+    rewriter.replaceOpWithNewOp<arith::SelectOp>(op, cmp, lhs, rhs);
+    return success();
+  }
+};
+
 template <typename OpTy, arith::CmpFPredicate pred>
 struct MaximumMinimumFOpConverter : public OpRewritePattern<OpTy> {
 public:
@@ -335,6 +351,10 @@ struct ArithExpandOpsPass
       arith::CeilDivSIOp,
       arith::CeilDivUIOp,
       arith::FloorDivSIOp,
+      arith::MaxSIOp,
+      arith::MaxUIOp,
+      arith::MinSIOp,
+      arith::MinUIOp,
       arith::MaximumFOp,
       arith::MinimumFOp,
       arith::MaxNumFOp,
@@ -383,6 +403,10 @@ void mlir::arith::populateArithExpandOpsPatterns(RewritePatternSet &patterns) {
   populateCeilFloorDivExpandOpsPatterns(patterns);
   // clang-format off
   patterns.add<
+    MaxMinIOpConverter<MaxSIOp, arith::CmpIPredicate::sgt>,
+    MaxMinIOpConverter<MaxUIOp, arith::CmpIPredicate::ugt>,
+    MaxMinIOpConverter<MinSIOp, arith::CmpIPredicate::slt>,
+    MaxMinIOpConverter<MinUIOp, arith::CmpIPredicate::ult>,
     MaximumMinimumFOpConverter<MaximumFOp, arith::CmpFPredicate::UGT>,
     MaximumMinimumFOpConverter<MinimumFOp, arith::CmpFPredicate::ULT>,
     MaxNumMinNumFOpConverter<MaxNumFOp, arith::CmpFPredicate::UGT>,
diff --git a/mlir/lib/Dialect/Arith/Utils/CMakeLists.txt b/mlir/lib/Dialect/Arith/Utils/CMakeLists.txt
index 2be2724d4a91..07fa58b209b5 100644
--- a/mlir/lib/Dialect/Arith/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/Arith/Utils/CMakeLists.txt
@@ -8,5 +8,6 @@ add_mlir_dialect_library(MLIRArithUtils
   MLIRArithDialect
   MLIRComplexDialect
   MLIRDialect
+  MLIRDialectUtils
   MLIRIR
   )
diff --git a/mlir/lib/Dialect/Arith/Utils/Utils.cpp b/mlir/lib/Dialect/Arith/Utils/Utils.cpp
index aa239f5e0539..4ce55a23820c 100644
--- a/mlir/lib/Dialect/Arith/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Arith/Utils/Utils.cpp
@@ -13,12 +13,74 @@
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include <numeric>
 
 using namespace mlir;
 
+std::optional<SmallVector<OpFoldResult>>
+mlir::inferExpandShapeOutputShape(OpBuilder &b, Location loc,
+                                  ShapedType expandedType,
+                                  ArrayRef<ReassociationIndices> reassociation,
+                                  ArrayRef<OpFoldResult> inputShape) {
+
+  SmallVector<Value> outputShapeValues;
+  SmallVector<int64_t> outputShapeInts;
+  // For zero-rank inputs, all dims in result shape are unit extent.
+  if (inputShape.empty()) {
+    outputShapeInts.resize(expandedType.getRank(), 1);
+    return getMixedValues(outputShapeInts, outputShapeValues, b);
+  }
+
+  // Check for all static shapes.
+  if (expandedType.hasStaticShape()) {
+    ArrayRef<int64_t> staticShape = expandedType.getShape();
+    outputShapeInts.assign(staticShape.begin(), staticShape.end());
+    return getMixedValues(outputShapeInts, outputShapeValues, b);
+  }
+
+  outputShapeInts.resize(expandedType.getRank(), ShapedType::kDynamic);
+  for (const auto &it : llvm::enumerate(reassociation)) {
+    ReassociationIndices indexGroup = it.value();
+
+    int64_t indexGroupStaticSizesProductInt = 1;
+    bool foundDynamicShape = false;
+    for (int64_t index : indexGroup) {
+      int64_t outputDimSize = expandedType.getDimSize(index);
+      // Cannot infer expanded shape with multiple dynamic dims in the
+      // same reassociation group!
+      if (ShapedType::isDynamic(outputDimSize)) {
+        if (foundDynamicShape)
+          return std::nullopt;
+        foundDynamicShape = true;
+      } else {
+        outputShapeInts[index] = outputDimSize;
+        indexGroupStaticSizesProductInt *= outputDimSize;
+      }
+    }
+    if (!foundDynamicShape)
+      continue;
+
+    int64_t inputIndex = it.index();
+    // Call get<Value>() under the assumption that we're not casting
+    // dynamism.
+    Value indexGroupSize = inputShape[inputIndex].get<Value>();
+    Value indexGroupStaticSizesProduct =
+        b.create<arith::ConstantIndexOp>(loc, indexGroupStaticSizesProductInt);
+    Value dynamicDimSize = b.createOrFold<arith::DivUIOp>(
+        loc, indexGroupSize, indexGroupStaticSizesProduct);
+    outputShapeValues.push_back(dynamicDimSize);
+  }
+
+  if ((int64_t)outputShapeValues.size() !=
+      llvm::count(outputShapeInts, ShapedType::kDynamic))
+    return std::nullopt;
+
+  return getMixedValues(outputShapeInts, outputShapeValues, b);
+}
+
 /// Matches a ConstantIndexOp.
 /// TODO: This should probably just be a general matcher that uses matchConstant
 /// and checks the operation for an index type.
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp b/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
index d3751d4ba7e7..39292c4533d6 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
@@ -86,8 +86,7 @@ static Value createInterleave2Intrinsic(RewriterBase &rewriter, Location loc,
   auto inputType = cast<VectorType>(lhs.getType());
   VectorType inputTypeX2 =
       VectorType::Builder(inputType).setDim(0, inputType.getShape()[0] * 2);
-  return rewriter.create<LLVM::experimental_vector_interleave2>(
-      loc, inputTypeX2, lhs, rhs);
+  return rewriter.create<LLVM::vector_interleave2>(loc, inputTypeX2, lhs, rhs);
 }
 
 // Fuse two 'arm_sme.outerproduct' operations that are chained via the
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
index 531016130d1d..2d329a1f3d88 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
@@ -1382,14 +1382,27 @@ LogicalResult
 bufferization::runOneShotBufferize(Operation *op,
                                    const OneShotBufferizationOptions &options,
                                    BufferizationStatistics *statistics) {
+  // copy-before-write deactivates the analysis. It cannot be used together with
+  // test-analysis-only.
   assert(!(options.copyBeforeWrite && options.testAnalysisOnly) &&
          "invalid combination of bufferization flags");
-  if (!options.copyBeforeWrite) {
-    // If a buffer is copied before every write, no analysis is needed.
+
+  if (options.copyBeforeWrite) {
+    // Copy buffer before each write. No analysis is needed.
+  } else {
+    // Run One-Shot Analysis and insert buffer copies (on the tensor level)
+    // only where needed. This is the default and much more efficient than
+    // copy-before-write.
     if (failed(insertTensorCopies(op, options, statistics)))
       return failure();
+
+    // If test-analysis-only is set, the IR was annotated with RaW conflict
+    // markers (attributes) during One-Shot Analysis.
+    if (options.testAnalysisOnly)
+      return success();
   }
-  if (options.testAnalysisOnly)
-    return success();
+
+  // Bufferize the op and its nested ops. If options.copyBeforeWrite is set,
+  // a new buffer copy is allocated every time a buffer is written to.
   return bufferizeOp(op, options, statistics);
 }
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 2436113dc423..f5e80553ae72 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -241,24 +241,26 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
     map.map(operand.value(), entryBlock.getArgument(operand.index()));
 
   // Clone the region of the gpu.launch operation into the gpu.func operation.
-  // TODO: If cloneInto can be modified such that if a mapping for
-  // a block exists, that block will be used to clone operations into (at the
-  // end of the block), instead of creating a new block, this would be much
-  // cleaner.
   launchOpBody.cloneInto(&outlinedFuncBody, map);
 
-  // Branch from entry of the gpu.func operation to the block that is cloned
-  // from the entry block of the gpu.launch operation.
-  Block &launchOpEntry = launchOpBody.front();
-  Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
-  builder.setInsertionPointToEnd(&entryBlock);
-  builder.create<cf::BranchOp>(loc, clonedLaunchOpEntry);
-
-  outlinedFunc.walk([](gpu::TerminatorOp op) {
-    OpBuilder replacer(op);
-    replacer.create<gpu::ReturnOp>(op.getLoc());
-    op.erase();
-  });
+  // Replace the terminator op with returns.
+  for (Block &block : launchOpBody) {
+    Block *clonedBlock = map.lookup(&block);
+    auto terminator = dyn_cast<gpu::TerminatorOp>(clonedBlock->getTerminator());
+    if (!terminator)
+      continue;
+    OpBuilder replacer(terminator);
+    replacer.create<gpu::ReturnOp>(terminator->getLoc());
+    terminator->erase();
+  }
+
+  // Splice now the entry block of the gpu.launch operation at the end of the
+  // gpu.func entry block and erase the redundant block.
+  Block *clonedLaunchOpEntry = map.lookup(&launchOpBody.front());
+  entryBlock.getOperations().splice(entryBlock.getOperations().end(),
+                                    clonedLaunchOpEntry->getOperations());
+  clonedLaunchOpEntry->erase();
+
   return outlinedFunc;
 }
 
diff --git a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
index 01613ab5268b..836e939a8295 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
@@ -88,10 +88,7 @@ void GpuModuleToBinaryPass::runOnOperation() {
   TargetOptions targetOptions(toolkitPath, linkFiles, cmdOptions, *targetFormat,
                               lazyTableBuilder);
   if (failed(transformGpuModulesToBinaries(
-          getOperation(),
-          offloadingHandler ? dyn_cast<OffloadingLLVMTranslationAttrInterface>(
-                                  offloadingHandler.getValue())
-                            : OffloadingLLVMTranslationAttrInterface(nullptr),
+          getOperation(), OffloadingLLVMTranslationAttrInterface(nullptr),
           targetOptions)))
     return signalPassFailure();
 }
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 4e06b9c127e7..7be493d5992c 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -1785,7 +1785,7 @@ LogicalResult ReturnOp::verify() {
 }
 
 //===----------------------------------------------------------------------===//
-// Verifier for LLVM::AddressOfOp.
+// LLVM::AddressOfOp.
 //===----------------------------------------------------------------------===//
 
 static Operation *parentLLVMModule(Operation *op) {
@@ -1826,6 +1826,11 @@ AddressOfOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
   return success();
 }
 
+// AddressOfOp constant-folds to the global symbol name.
+OpFoldResult LLVM::AddressOfOp::fold(FoldAdaptor) {
+  return getGlobalNameAttr();
+}
+
 //===----------------------------------------------------------------------===//
 // Verifier for LLVM::ComdatOp.
 //===----------------------------------------------------------------------===//
@@ -3258,6 +3263,12 @@ LogicalResult LLVMDialect::verifyRegionResultAttribute(Operation *op,
 
 Operation *LLVMDialect::materializeConstant(OpBuilder &builder, Attribute value,
                                             Type type, Location loc) {
+  // If this was folded from an llvm.mlir.addressof operation, it should be
+  // materialized as such.
+  if (auto symbol = dyn_cast<FlatSymbolRefAttr>(value))
+    if (isa<LLVM::LLVMPointerType>(type))
+      return builder.create<LLVM::AddressOfOp>(loc, type, symbol);
+  // Otherwise try materializing it as a regular llvm.mlir.constant op.
   return LLVM::ConstantOp::materialize(builder, value, type, loc);
 }
 
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 9c5c58fa1fab..e5f83331baf8 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -395,6 +395,24 @@ public:
       return builder.create<math::FloorOp>(arg.getLoc(), arg);
     case UnaryFn::negf:
       return builder.create<arith::NegFOp>(arg.getLoc(), arg);
+    case UnaryFn::reciprocal: {
+      Attribute oneAttr = builder.getOneAttr(arg.getType());
+      auto one = builder.create<arith::ConstantOp>(arg.getLoc(),
+                                                   ::cast<TypedAttr>(oneAttr));
+      return builder.create<arith::DivFOp>(arg.getLoc(), one, arg);
+    }
+    case UnaryFn::round:
+      return builder.create<math::RoundOp>(arg.getLoc(), arg);
+    case UnaryFn::sqrt:
+      return builder.create<math::SqrtOp>(arg.getLoc(), arg);
+    case UnaryFn::rsqrt:
+      return builder.create<math::RsqrtOp>(arg.getLoc(), arg);
+    case UnaryFn::square:
+      return builder.create<arith::MulFOp>(arg.getLoc(), arg, arg);
+    case UnaryFn::tanh:
+      return builder.create<math::TanhOp>(arg.getLoc(), arg);
+    case UnaryFn::erf:
+      return builder.create<math::ErfOp>(arg.getLoc(), arg);
     }
     llvm_unreachable("unsupported unary function");
   }
@@ -467,6 +485,9 @@ public:
       if (allFloatingPoint)
         return builder.create<arith::MinimumFOp>(arg0.getLoc(), arg0, arg1);
       return builder.create<arith::MinUIOp>(arg0.getLoc(), arg0, arg1);
+    case BinaryFn::powf:
+      assert(allFloatingPoint);
+      return builder.create<math::PowFOp>(arg0.getLoc(), arg0, arg1);
     }
     llvm_unreachable("unsupported binary function");
   }
@@ -586,12 +607,20 @@ struct FoldFillWithTensorReshape : OpRewritePattern<TensorReshapeOp> {
       return failure();
 
     Location loc = oldFill.getLoc();
-    auto newInit = rewriter.create<TensorReshapeOp>(
-        loc, reshapeOp.getResultType(), oldFill.output(),
-        reshapeOp.getReassociation());
+    TensorReshapeOp newInit;
+    if constexpr (std::is_same<TensorReshapeOp, tensor::ExpandShapeOp>::value) {
+
+      newInit = rewriter.create<TensorReshapeOp>(
+          loc, reshapeOp.getResultType(), oldFill.output(),
+          reshapeOp.getReassociation(), reshapeOp.getOutputShape(),
+          reshapeOp.getStaticOutputShape());
+    } else {
+      newInit = rewriter.create<TensorReshapeOp>(loc, reshapeOp.getResultType(),
+                                                 oldFill.output(),
+                                                 reshapeOp.getReassociation());
+    }
     rewriter.replaceOpWithNewOp<FillOp>(reshapeOp, ValueRange{oldFill.value()},
                                         ValueRange{newInit});
-
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
index ee6e391d0cc6..3b5282a09569 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
@@ -27,6 +27,7 @@ add_mlir_dialect_library(MLIRLinalgTransforms
   NamedOpConversions.cpp
   Padding.cpp
   Promotion.cpp
+  RuntimeOpVerification.cpp
   Specialize.cpp
   Split.cpp
   SplitReduction.cpp
@@ -60,6 +61,7 @@ add_mlir_dialect_library(MLIRLinalgTransforms
   MLIRFuncDialect
   MLIRFuncToLLVM
   MLIRFuncTransforms
+  MLIRIndexDialect
   MLIRInferTypeOpInterface
   MLIRIR
   MLIRMemRefDialect
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp
index 420b04b3ee28..81d44ba04fa1 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp
@@ -349,7 +349,7 @@ rewriteInIm2Col(RewriterBase &rewriter,
   SmallVector<ReassociationIndices> batchMatVecReassociationIndice = {{0, 1},
                                                                       {2, 3}};
 
-  Value batchMatVecResultReshaped = rewriter.create<tensor::ExpandShapeOp>(
+  auto batchMatVecResultReshaped = rewriter.create<tensor::ExpandShapeOp>(
       loc, transposedOutputTensor.getType(), batchMatVecResult.getResult(0),
       batchMatVecReassociationIndice);
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
index 7fd88dec71d4..2bea083ac2d7 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
@@ -757,7 +757,10 @@ pushDownUnPackOpThroughExpandShape(tensor::UnPackOp unPackOp,
   ArrayRef<int64_t> innerDimsPos = unPackOp.getInnerDimsPos();
   ArrayRef<int64_t> outerDimsPerm = unPackOp.getOuterDimsPerm();
 
-  ArrayRef<int64_t> dstShape = expandOp.getType().getShape();
+  auto expandTy = dyn_cast<RankedTensorType>(expandOp.getType());
+  if (!expandTy)
+    return failure();
+  ArrayRef<int64_t> dstShape = expandTy.getShape();
   SmallVector<ReassociationIndices> reassocIndices =
       expandOp.getReassociationIndices();
   // Project inner tile pos to the dim pos after expanding. For example, if dims
@@ -796,9 +799,8 @@ pushDownUnPackOpThroughExpandShape(tensor::UnPackOp unPackOp,
     nextPos += 1;
   }
 
-  RankedTensorType newExpandType =
-      tensor::PackOp::inferPackedType(expandOp.getType(), innerTileSizes,
-                                      projectedInnerDimsPos, newOuterDimsPerm);
+  RankedTensorType newExpandType = tensor::PackOp::inferPackedType(
+      expandTy, innerTileSizes, projectedInnerDimsPos, newOuterDimsPerm);
   auto newExpandOp = rewriter.create<tensor::ExpandShapeOp>(
       expandOp.getLoc(), newExpandType, unPackOp.getSource(),
       newReassocIndices);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
index 023ea277bcf4..65efa18af18f 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
@@ -23,6 +23,7 @@
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Tensor/Transforms/Transforms.h"
 #include "mlir/Dialect/Tensor/Utils/Utils.h"
+#include "mlir/Dialect/Utils/ReshapeOpsUtils.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -272,8 +273,9 @@ expandValue(RewriterBase &rewriter, Location loc, Value result, Value origDest,
   assert(rankReductionStrategy ==
              ControlDropUnitDims::RankReductionStrategy::ReassociativeReshape &&
          "unknown rank reduction strategy");
-  return rewriter.create<tensor::ExpandShapeOp>(loc, origResultType, result,
-                                                reassociation);
+  return rewriter
+      .create<tensor::ExpandShapeOp>(loc, origResultType, result, reassociation)
+      .getResult();
 }
 
 /// Collapse the given `value` so that the type matches the type of
@@ -536,9 +538,10 @@ LogicalResult linalg::dropUnitDims(RewriterBase &rewriter, GenericOp genericOp,
       resultReplacements.push_back(result);
       continue;
     }
-    resultReplacements.push_back(expandValue(rewriter, loc, result, origDest,
-                                             reassociations[opOperandIndex],
-                                             options.rankReductionStrategy));
+    Value expandedValue = expandValue(rewriter, loc, result, origDest,
+                                      reassociations[opOperandIndex],
+                                      options.rankReductionStrategy);
+    resultReplacements.push_back(expandedValue);
   }
 
   rewriter.replaceOp(genericOp, resultReplacements);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
index 373e9cfc3ce7..89fb4944c0ca 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
@@ -625,14 +625,14 @@ LogicalResult ExpansionInfo::compute(LinalgOp linalgOp,
   return success();
 }
 
-/// Epanding the body of a linalg operation requires adaptations of the accessed
-/// loop indices. Specifically, access of indices in the original operation need
-/// to be replaced with linearizations of indices in the expanded op. That
-/// requires the shape of the expanded dimensions to be static (at least all but
-/// the most significant). For now check that these are all statically sized.
-/// Note that this could be extended to handle dynamic case, but the
-/// implementation below uses `affine.apply` which seems to have issues when the
-/// shapes are not static.
+/// Expanding the body of a linalg operation requires adaptations of the
+/// accessed loop indices. Specifically, access of indices in the original
+/// operation need to be replaced with linearizations of indices in the expanded
+/// op. That requires the shape of the expanded dimensions to be static (at
+/// least all but the most significant). For now check that these are all
+/// statically sized. Note that this could be extended to handle dynamic case,
+/// but the implementation below uses `affine.apply` which seems to have issues
+/// when the shapes are not static.
 static LogicalResult isLinalgOpExpandable(LinalgOp linalgOp,
                                           const ExpansionInfo &expansionInfo,
                                           PatternRewriter &rewriter) {
@@ -750,6 +750,31 @@ static void updateExpandedGenericOpRegion(PatternRewriter &rewriter,
   }
 }
 
+/// Checks if a single dynamic dimension expanded into multiple dynamic
+/// dimensions.
+static LogicalResult
+validateDynamicDimExpansion(LinalgOp linalgOp,
+                            const ExpansionInfo &expansionInfo,
+                            PatternRewriter &rewriter) {
+  for (unsigned i : llvm::seq<unsigned>(0, expansionInfo.getOrigOpNumDims())) {
+    ArrayRef<int64_t> expandedShape = expansionInfo.getExpandedShapeOfDim(i);
+    if (expandedShape.size() == 1)
+      continue;
+    bool foundDynamic = false;
+    for (int64_t shape : expandedShape) {
+      if (!ShapedType::isDynamic(shape))
+        continue;
+      if (foundDynamic) {
+        return rewriter.notifyMatchFailure(
+            linalgOp, "cannot infer expanded shape with multiple dynamic "
+                      "dims in the same reassociation group");
+      }
+      foundDynamic = true;
+    }
+  }
+  return success();
+}
+
 /// Implements the fusion of a tensor.collapse_shape or a tensor.expand_shape op
 /// and a generic op as explained in `isFusableWithReshapeByExpansion`. Assumes
 /// that those conditions have been satisfied.
@@ -759,6 +784,8 @@ fuseWithReshapeByExpansion(LinalgOp linalgOp, Operation *reshapeOp,
                            PatternRewriter &rewriter) {
   assert(isFusableWithReshapeByDimExpansion(linalgOp, fusableOpOperand) &&
          "preconditions for fuse operation failed");
+
+  Location loc = linalgOp.getLoc();
   // Check if reshape is expanding or collapsing.
   auto expandingReshapeOp = dyn_cast<tensor::ExpandShapeOp>(*reshapeOp);
   auto collapsingReshapeOp = dyn_cast<tensor::CollapseShapeOp>(*reshapeOp);
@@ -778,6 +805,11 @@ fuseWithReshapeByExpansion(LinalgOp linalgOp, Operation *reshapeOp,
           expandedType.getShape(), collapsedType.getShape(), rewriter)))
     return std::nullopt;
 
+  // TODO: With the support of multiple dynamic dims expansion in
+  // tensor.expand_shape op, this case can be handled.
+  if (failed(validateDynamicDimExpansion(linalgOp, expansionInfo, rewriter)))
+    return std::nullopt;
+
   if (failed(isLinalgOpExpandable(linalgOp, expansionInfo, rewriter)))
     return std::nullopt;
 
@@ -816,15 +848,13 @@ fuseWithReshapeByExpansion(LinalgOp linalgOp, Operation *reshapeOp,
                 /*isExpandingReshape=*/true)))
           return std::nullopt;
         expandedOpOperands.push_back(rewriter.create<tensor::ExpandShapeOp>(
-            linalgOp.getLoc(), expandedOperandType, opOperand->get(),
-            reassociation));
+            loc, expandedOperandType, opOperand->get(), reassociation));
         continue;
       }
     }
     expandedOpOperands.push_back(opOperand->get());
   }
 
-  Location loc = linalgOp.getLoc();
   SmallVector<Value> outputs;
   for (OpOperand &opOperand : linalgOp.getDpsInitsMutable()) {
     AffineMap indexingMap = linalgOp.getMatchingIndexingMap(&opOperand);
@@ -843,8 +873,7 @@ fuseWithReshapeByExpansion(LinalgOp linalgOp, Operation *reshapeOp,
               /*isExpandingReshape=*/true)))
         return std::nullopt;
       outputs.push_back(rewriter.create<tensor::ExpandShapeOp>(
-          linalgOp.getLoc(), expandedOutputType, opOperand.get(),
-          reassociation));
+          loc, expandedOutputType, opOperand.get(), reassociation));
     } else {
       outputs.push_back(opOperand.get());
     }
@@ -1615,15 +1644,17 @@ FailureOr<CollapseResult> mlir::linalg::collapseOpIterationDims(
           op.getIndexingMapMatchingResult(originalResult.value());
       SmallVector<ReassociationIndices> reassociation =
           getOperandReassociation(indexingMap, collapsingInfo);
+      Value result;
       if (isa<MemRefType>(collapsedOpResult.getType())) {
-        Value result = rewriter.create<memref::ExpandShapeOp>(
-            loc, originalResultType, collapsedOpResult, reassociation);
-        results.push_back(result);
+        MemRefType expandShapeResultType = MemRefType::get(
+            originalResultType.getShape(), originalResultType.getElementType());
+        result = rewriter.create<memref::ExpandShapeOp>(
+            loc, expandShapeResultType, collapsedOpResult, reassociation);
       } else {
-        Value result = rewriter.create<tensor::ExpandShapeOp>(
+        result = rewriter.create<tensor::ExpandShapeOp>(
             loc, originalResultType, collapsedOpResult, reassociation);
-        results.push_back(result);
       }
+      results.push_back(result);
     } else {
       results.push_back(collapsedOpResult);
     }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp b/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp
new file mode 100644
index 000000000000..b30182dc8407
--- /dev/null
+++ b/mlir/lib/Dialect/Linalg/Transforms/RuntimeOpVerification.cpp
@@ -0,0 +1,135 @@
+//===- RuntimeOpVerification.cpp - Op Verification ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Linalg/Transforms/RuntimeOpVerification.h"
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/Dialect/Index/IR/IndexAttrs.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Interfaces/RuntimeVerifiableOpInterface.h"
+
+namespace mlir {
+namespace linalg {
+namespace {
+/// Verify that the runtime sizes of the operands to linalg structured ops are
+/// compatible with the runtime sizes inferred by composing the loop ranges with
+/// the linalg op's indexing maps. This is similar to the verifier except that
+/// here we insert IR to perform the verification at runtime.
+template <typename T>
+struct StructuredOpInterface
+    : public RuntimeVerifiableOpInterface::ExternalModel<
+          StructuredOpInterface<T>, T> {
+  void generateRuntimeVerification(Operation *op, OpBuilder &builder,
+                                   Location loc) const {
+    auto linalgOp = llvm::cast<LinalgOp>(op);
+
+    SmallVector<Range> loopRanges = linalgOp.createLoopRanges(builder, loc);
+    auto [starts, ends, _] = getOffsetsSizesAndStrides(loopRanges);
+
+    auto zero = builder.create<arith::ConstantIndexOp>(loc, 0);
+    auto one = builder.create<arith::ConstantIndexOp>(loc, 1);
+
+    // Subtract one from the loop ends before composing with the indexing map
+    transform(ends, ends.begin(), [&](OpFoldResult end) {
+      auto endValue = getValueOrCreateConstantIndexOp(builder, loc, end);
+      return builder.createOrFold<index::SubOp>(loc, endValue, one);
+    });
+
+    for (OpOperand &opOperand : linalgOp->getOpOperands()) {
+      AffineMap indexingMap = linalgOp.getMatchingIndexingMap(&opOperand);
+      auto startIndices = affine::makeComposedFoldedMultiResultAffineApply(
+          builder, loc, indexingMap, starts);
+      auto endIndices = affine::makeComposedFoldedMultiResultAffineApply(
+          builder, loc, indexingMap, ends);
+
+      for (auto dim : llvm::seq(linalgOp.getRank(&opOperand))) {
+        auto startIndex =
+            getValueOrCreateConstantIndexOp(builder, loc, startIndices[dim]);
+        auto endIndex =
+            getValueOrCreateConstantIndexOp(builder, loc, endIndices[dim]);
+
+        // Generate:
+        //   minIndex = min(startIndex, endIndex)
+        //   assert(minIndex >= 0)
+        // To ensure we do not generate a negative index. We take the minimum of
+        // the start and end indices in order to handle reverse loops such as
+        // `affine_map<(i) -> (3 - i)>`
+        auto min =
+            builder.createOrFold<index::MinSOp>(loc, startIndex, endIndex);
+        auto cmpOp = builder.createOrFold<index::CmpOp>(
+            loc, index::IndexCmpPredicate::SGE, min, zero);
+        auto msg = RuntimeVerifiableOpInterface::generateErrorMessage(
+            linalgOp, "unexpected negative result on dimension #" +
+                          std::to_string(dim) + " of input/output operand #" +
+                          std::to_string(opOperand.getOperandNumber()));
+        builder.createOrFold<cf::AssertOp>(loc, cmpOp, msg);
+
+        // Generate:
+        //   inferredDimSize = max(startIndex, endIndex) + 1
+        //   actualDimSize = dim(operand)
+        //   assert(inferredDimSize <= actualDimSize)
+        // To ensure that we do not index past the bounds of the operands.
+        auto max =
+            builder.createOrFold<index::MaxSOp>(loc, startIndex, endIndex);
+
+        auto inferredDimSize =
+            builder.createOrFold<index::AddOp>(loc, max, one);
+
+        auto actualDimSize =
+            createOrFoldDimOp(builder, loc, opOperand.get(), dim);
+
+        // Similar to the verifier, when the affine expression in the indexing
+        // map is complicated, we just check that the inferred dimension sizes
+        // are in the boundary of the operands' size. Being more precise than
+        // that is difficult.
+        auto predicate = isa<AffineDimExpr>(indexingMap.getResult(dim))
+                             ? index::IndexCmpPredicate::EQ
+                             : index::IndexCmpPredicate::SLE;
+
+        cmpOp = builder.createOrFold<index::CmpOp>(
+            loc, predicate, inferredDimSize, actualDimSize);
+        msg = RuntimeVerifiableOpInterface::generateErrorMessage(
+            linalgOp, "dimension #" + std::to_string(dim) +
+                          " of input/output operand #" +
+                          std::to_string(opOperand.getOperandNumber()) +
+                          " is incompatible with inferred dimension size");
+        builder.createOrFold<cf::AssertOp>(loc, cmpOp, msg);
+      }
+    }
+  }
+};
+
+template <typename... OpTs>
+void attachInterface(MLIRContext *ctx) {
+  (OpTs::template attachInterface<StructuredOpInterface<OpTs>>(*ctx), ...);
+}
+} // namespace
+} // namespace linalg
+} // namespace mlir
+
+void mlir::linalg::registerRuntimeVerifiableOpInterfaceExternalModels(
+    DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, LinalgDialect *) {
+    attachInterface<
+#define GET_OP_LIST
+#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
+        >(ctx);
+
+    // Load additional dialects of which ops may get created.
+    ctx->loadDialect<affine::AffineDialect, arith::ArithDialect,
+                     cf::ControlFlowDialect, index::IndexDialect,
+                     tensor::TensorDialect>();
+  });
+}
diff --git a/mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp b/mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp
index 6559c86c9e0f..5bfdbc6d0bb5 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp
@@ -114,6 +114,7 @@ FailureOr<SplitReductionResult> mlir::linalg::splitReduction(
     Type newType = RankedTensorType::get(
         newShape,
         cast<RankedTensorType>(operand->get().getType()).getElementType());
+
     Value newInput = b.create<tensor::ExpandShapeOp>(
         loc, newType, operand->get(), reassociation);
     newInputs.push_back(newInput);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index 2297bf5e3551..91dfac802ad6 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -329,11 +329,13 @@ FailureOr<LowerPackResult> linalg::lowerPack(RewriterBase &rewriter,
                              /*transposeOp=*/nullptr};
     }
   }
+
   // 5. Expand from the padded result to the stripMinedShape.
+  auto expandShapeResultType =
+      RankedTensorType::Builder(packedTensorType).setShape(stripMinedShape);
   auto reshapeOp = rewriter.create<tensor::ExpandShapeOp>(
-      loc,
-      RankedTensorType::Builder(packedTensorType).setShape(stripMinedShape),
-      padOp.getResult(), packingMetadata.reassociations);
+      loc, expandShapeResultType, padOp.getResult(),
+      packingMetadata.reassociations);
 
   // 6. Transpose stripMinedShape to packedShape.
   SmallVector<int64_t> transpPerm =
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index e836f0dc63b4..ef9a30be9a01 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1499,11 +1499,11 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp,
   // If the input vector sizes are not provided, then the vector sizes are
   // determined by the result tensor shape. In case the vector sizes aren't
   // provided, we update the inBounds attribute instead of masking.
-  bool useInBoundsInsteadOfMasking = true;
+  bool useInBoundsInsteadOfMasking = false;
   if (inputVectorSizes.empty()) {
     ArrayRef<int64_t> resultTensorShape = packOp.getDestType().getShape();
     inputVectorSizes = resultTensorShape.take_front(packOp.getSourceRank());
-    useInBoundsInsteadOfMasking = false;
+    useInBoundsInsteadOfMasking = true;
   }
 
   // Create masked TransferReadOp.
@@ -1612,7 +1612,8 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp,
   // to shape of source, then a mask is necessary.
   Value readResult = vector::createReadOrMaskedRead(
       rewriter, loc, unpackOp.getSource(),
-      ArrayRef<int64_t>(readMaskShape.begin(), readMaskShape.end()), padValue);
+      ArrayRef<int64_t>(readMaskShape.begin(), readMaskShape.end()), padValue,
+      /*useInBoundsInsteadOfMasking=*/false);
 
   PackingMetadata packMetadata;
   SmallVector<int64_t> lastDimToInsertPosPerm =
@@ -1669,7 +1670,8 @@ vectorizeAsTensorPadOp(RewriterBase &rewriter, tensor::PadOp padOp,
   (void)status; // prevent unused variable warning on non-assert builds
   assert(succeeded(status) && "failed to reify result shapes");
   auto maskedRead = vector::createReadOrMaskedRead(
-      rewriter, loc, padOp.getSource(), inputVectorSizes, padValue);
+      rewriter, loc, padOp.getSource(), inputVectorSizes, padValue,
+      /*useInBoundsInsteadOfMasking=*/false);
   Operation *write = createWriteOrMaskedWrite(
       rewriter, loc, maskedRead, reifiedReturnShapes[0], inputVectorSizes);
   newResults.push_back(write->getResult(0));
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index 836dcb8f329e..b969d41d934d 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -2237,6 +2237,44 @@ FailureOr<MemRefType> ExpandShapeOp::computeExpandedType(
                          srcType.getMemorySpace());
 }
 
+FailureOr<SmallVector<OpFoldResult>>
+ExpandShapeOp::inferOutputShape(OpBuilder &b, Location loc,
+                                MemRefType expandedType,
+                                ArrayRef<ReassociationIndices> reassociation,
+                                ArrayRef<OpFoldResult> inputShape) {
+  std::optional<SmallVector<OpFoldResult>> outputShape =
+      inferExpandShapeOutputShape(b, loc, expandedType, reassociation,
+                                  inputShape);
+  if (!outputShape)
+    return failure();
+  return *outputShape;
+}
+
+void ExpandShapeOp::build(OpBuilder &builder, OperationState &result,
+                          Type resultType, Value src,
+                          ArrayRef<ReassociationIndices> reassociation,
+                          ArrayRef<OpFoldResult> outputShape) {
+  auto [staticOutputShape, dynamicOutputShape] =
+      decomposeMixedValues(SmallVector<OpFoldResult>(outputShape));
+  build(builder, result, llvm::cast<MemRefType>(resultType), src,
+        getReassociationIndicesAttribute(builder, reassociation),
+        dynamicOutputShape, staticOutputShape);
+}
+
+void ExpandShapeOp::build(OpBuilder &builder, OperationState &result,
+                          Type resultType, Value src,
+                          ArrayRef<ReassociationIndices> reassociation) {
+  SmallVector<OpFoldResult> inputShape =
+      getMixedSizes(builder, result.location, src);
+  MemRefType memrefResultTy = llvm::cast<MemRefType>(resultType);
+  FailureOr<SmallVector<OpFoldResult>> outputShape = inferOutputShape(
+      builder, result.location, memrefResultTy, reassociation, inputShape);
+  // Failure of this assertion usually indicates presence of multiple
+  // dynamic dimensions in the same reassociation group.
+  assert(succeeded(outputShape) && "unable to infer output shape");
+  build(builder, result, memrefResultTy, src, reassociation, *outputShape);
+}
+
 void ExpandShapeOp::build(OpBuilder &builder, OperationState &result,
                           ArrayRef<int64_t> resultShape, Value src,
                           ArrayRef<ReassociationIndices> reassociation) {
@@ -2250,6 +2288,20 @@ void ExpandShapeOp::build(OpBuilder &builder, OperationState &result,
   build(builder, result, *resultType, src, reassociation);
 }
 
+void ExpandShapeOp::build(OpBuilder &builder, OperationState &result,
+                          ArrayRef<int64_t> resultShape, Value src,
+                          ArrayRef<ReassociationIndices> reassociation,
+                          ArrayRef<OpFoldResult> outputShape) {
+  // Only ranked memref source values are supported.
+  auto srcType = llvm::cast<MemRefType>(src.getType());
+  FailureOr<MemRefType> resultType =
+      ExpandShapeOp::computeExpandedType(srcType, resultShape, reassociation);
+  // Failure of this assertion usually indicates a problem with the source
+  // type, e.g., could not get strides/offset.
+  assert(succeeded(resultType) && "could not compute layout");
+  build(builder, result, *resultType, src, reassociation, outputShape);
+}
+
 LogicalResult ExpandShapeOp::verify() {
   MemRefType srcType = getSrcType();
   MemRefType resultType = getResultType();
@@ -2266,7 +2318,7 @@ LogicalResult ExpandShapeOp::verify() {
   if (failed(verifyCollapsedShape(getOperation(), srcType.getShape(),
                                   resultType.getShape(),
                                   getReassociationIndices(),
-                                  /*allowMultipleDynamicDimsPerGroup=*/false)))
+                                  /*allowMultipleDynamicDimsPerGroup=*/true)))
     return failure();
 
   // Compute expected result type (including layout map).
@@ -2280,14 +2332,28 @@ LogicalResult ExpandShapeOp::verify() {
     return emitOpError("expected expanded type to be ")
            << *expectedResultType << " but found " << resultType;
 
+  if ((int64_t)getStaticOutputShape().size() != resultType.getRank())
+    return emitOpError("expected number of static shape bounds to be equal to "
+                       "the output rank (")
+           << resultType.getRank() << ") but found "
+           << getStaticOutputShape().size() << " inputs instead";
+
+  if ((int64_t)getOutputShape().size() !=
+      llvm::count(getStaticOutputShape(), ShapedType::kDynamic))
+    return emitOpError("mismatch in dynamic dims in output_shape and "
+                       "static_output_shape: static_output_shape has ")
+           << llvm::count(getStaticOutputShape(), ShapedType::kDynamic)
+           << " dynamic dims while output_shape has " << getOutputShape().size()
+           << " values";
+
   return success();
 }
 
 void ExpandShapeOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                 MLIRContext *context) {
-  results.add<ComposeReassociativeReshapeOps<ExpandShapeOp>,
-              ComposeExpandOfCollapseOp<ExpandShapeOp, CollapseShapeOp>>(
-      context);
+  results.add<
+      ComposeReassociativeReshapeOps<ExpandShapeOp, ReshapeOpKind::kExpand>,
+      ComposeExpandOfCollapseOp<ExpandShapeOp, CollapseShapeOp>>(context);
 }
 
 /// Compute the layout map after collapsing a given source MemRef type with the
@@ -2488,9 +2554,11 @@ public:
 
 void CollapseShapeOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                   MLIRContext *context) {
-  results.add<ComposeReassociativeReshapeOps<CollapseShapeOp>,
-              ComposeCollapseOfExpandOp<CollapseShapeOp, ExpandShapeOp, CastOp>,
-              CollapseShapeOpMemRefCastFolder>(context);
+  results.add<
+      ComposeReassociativeReshapeOps<CollapseShapeOp, ReshapeOpKind::kCollapse>,
+      ComposeCollapseOfExpandOp<CollapseShapeOp, ExpandShapeOp, CastOp,
+                                memref::DimOp, MemRefType>,
+      CollapseShapeOpMemRefCastFolder>(context);
 }
 
 OpFoldResult ExpandShapeOp::fold(FoldAdaptor adaptor) {
@@ -2799,7 +2867,8 @@ static bool haveCompatibleOffsets(MemRefType t1, MemRefType t2) {
 /// marked as dropped in `droppedDims`.
 static bool haveCompatibleStrides(MemRefType t1, MemRefType t2,
                                   const llvm::SmallBitVector &droppedDims) {
-  assert(size_t(t1.getRank()) == droppedDims.size() && "incorrect number of bits");
+  assert(size_t(t1.getRank()) == droppedDims.size() &&
+         "incorrect number of bits");
   assert(size_t(t1.getRank() - t2.getRank()) == droppedDims.count() &&
          "incorrect number of dropped dims");
   int64_t t1Offset, t2Offset;
diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
index 4449733f0daf..77c108aab480 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
@@ -13,7 +13,6 @@
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -24,7 +23,6 @@
 #include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/MathExtras.h"
 #include <cassert>
 #include <type_traits>
 
@@ -430,6 +428,33 @@ struct ConvertMemRefSubview final : OpConversionPattern<memref::SubViewOp> {
   }
 };
 
+//===----------------------------------------------------------------------===//
+// ConvertMemRefCollapseShape
+//===----------------------------------------------------------------------===//
+
+/// Emulating a `memref.collapse_shape` becomes a no-op after emulation given
+/// that we flatten memrefs to a single dimension as part of the emulation and
+/// there is no dimension to collapse any further.
+struct ConvertMemRefCollapseShape final
+    : OpConversionPattern<memref::CollapseShapeOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(memref::CollapseShapeOp collapseShapeOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Value srcVal = adaptor.getSrc();
+    auto newTy = dyn_cast<MemRefType>(srcVal.getType());
+    if (!newTy)
+      return failure();
+
+    if (newTy.getRank() != 1)
+      return failure();
+
+    rewriter.replaceOp(collapseShapeOp, srcVal);
+    return success();
+  }
+};
+
 } // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
@@ -442,7 +467,8 @@ void memref::populateMemRefNarrowTypeEmulationPatterns(
 
   // Populate `memref.*` conversion patterns.
   patterns.add<ConvertMemRefAllocation<memref::AllocOp>,
-               ConvertMemRefAllocation<memref::AllocaOp>, ConvertMemRefLoad,
+               ConvertMemRefAllocation<memref::AllocaOp>,
+               ConvertMemRefCollapseShape, ConvertMemRefLoad,
                ConvertMemrefStore, ConvertMemRefAssumeAlignment,
                ConvertMemRefSubview, ConvertMemRefReinterpretCast>(
       typeConverter, patterns.getContext());
diff --git a/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp b/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp
index 96eb7cfd2db6..585c5b738142 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp
@@ -550,6 +550,89 @@ getCollapsedStride(memref::CollapseShapeOp collapseShape, OpBuilder &builder,
   return {makeComposedFoldedAffineMin(builder, collapseShape.getLoc(), minMap,
                                       groupStrides)};
 }
+
+/// From `reshape_like(memref, subSizes, subStrides))` compute
+///
+/// \verbatim
+/// baseBuffer, baseOffset, baseSizes, baseStrides =
+///     extract_strided_metadata(memref)
+/// strides#i = baseStrides#i * subStrides#i
+/// sizes = subSizes
+/// \endverbatim
+///
+/// and return {baseBuffer, baseOffset, sizes, strides}
+template <typename ReassociativeReshapeLikeOp>
+static FailureOr<StridedMetadata> resolveReshapeStridedMetadata(
+    RewriterBase &rewriter, ReassociativeReshapeLikeOp reshape,
+    function_ref<SmallVector<OpFoldResult>(
+        ReassociativeReshapeLikeOp, OpBuilder &,
+        ArrayRef<OpFoldResult> /*origSizes*/, unsigned /*groupId*/)>
+        getReshapedSizes,
+    function_ref<SmallVector<OpFoldResult>(
+        ReassociativeReshapeLikeOp, OpBuilder &,
+        ArrayRef<OpFoldResult> /*origSizes*/,
+        ArrayRef<OpFoldResult> /*origStrides*/, unsigned /*groupId*/)>
+        getReshapedStrides) {
+  // Build a plain extract_strided_metadata(memref) from
+  // extract_strided_metadata(reassociative_reshape_like(memref)).
+  Location origLoc = reshape.getLoc();
+  Value source = reshape.getSrc();
+  auto sourceType = cast<MemRefType>(source.getType());
+  unsigned sourceRank = sourceType.getRank();
+
+  auto newExtractStridedMetadata =
+      rewriter.create<memref::ExtractStridedMetadataOp>(origLoc, source);
+
+  // Collect statically known information.
+  auto [strides, offset] = getStridesAndOffset(sourceType);
+  MemRefType reshapeType = reshape.getResultType();
+  unsigned reshapeRank = reshapeType.getRank();
+
+  OpFoldResult offsetOfr =
+      ShapedType::isDynamic(offset)
+          ? getAsOpFoldResult(newExtractStridedMetadata.getOffset())
+          : rewriter.getIndexAttr(offset);
+
+  // Get the special case of 0-D out of the way.
+  if (sourceRank == 0) {
+    SmallVector<OpFoldResult> ones(reshapeRank, rewriter.getIndexAttr(1));
+    return StridedMetadata{newExtractStridedMetadata.getBaseBuffer(), offsetOfr,
+                           /*sizes=*/ones, /*strides=*/ones};
+  }
+
+  SmallVector<OpFoldResult> finalSizes;
+  finalSizes.reserve(reshapeRank);
+  SmallVector<OpFoldResult> finalStrides;
+  finalStrides.reserve(reshapeRank);
+
+  // Compute the reshaped strides and sizes from the base strides and sizes.
+  SmallVector<OpFoldResult> origSizes =
+      getAsOpFoldResult(newExtractStridedMetadata.getSizes());
+  SmallVector<OpFoldResult> origStrides =
+      getAsOpFoldResult(newExtractStridedMetadata.getStrides());
+  unsigned idx = 0, endIdx = reshape.getReassociationIndices().size();
+  for (; idx != endIdx; ++idx) {
+    SmallVector<OpFoldResult> reshapedSizes =
+        getReshapedSizes(reshape, rewriter, origSizes, /*groupId=*/idx);
+    SmallVector<OpFoldResult> reshapedStrides = getReshapedStrides(
+        reshape, rewriter, origSizes, origStrides, /*groupId=*/idx);
+
+    unsigned groupSize = reshapedSizes.size();
+    for (unsigned i = 0; i < groupSize; ++i) {
+      finalSizes.push_back(reshapedSizes[i]);
+      finalStrides.push_back(reshapedStrides[i]);
+    }
+  }
+  assert(((isa<memref::ExpandShapeOp>(reshape) && idx == sourceRank) ||
+          (isa<memref::CollapseShapeOp>(reshape) && idx == reshapeRank)) &&
+         "We should have visited all the input dimensions");
+  assert(finalSizes.size() == reshapeRank &&
+         "We should have populated all the values");
+
+  return StridedMetadata{newExtractStridedMetadata.getBaseBuffer(), offsetOfr,
+                         finalSizes, finalStrides};
+}
+
 /// Replace `baseBuffer, offset, sizes, strides =
 ///              extract_strided_metadata(reshapeLike(memref))`
 /// With
@@ -580,68 +663,65 @@ public:
 
   LogicalResult matchAndRewrite(ReassociativeReshapeLikeOp reshape,
                                 PatternRewriter &rewriter) const override {
-    // Build a plain extract_strided_metadata(memref) from
-    // extract_strided_metadata(reassociative_reshape_like(memref)).
-    Location origLoc = reshape.getLoc();
-    Value source = reshape.getSrc();
-    auto sourceType = cast<MemRefType>(source.getType());
-    unsigned sourceRank = sourceType.getRank();
-
-    auto newExtractStridedMetadata =
-        rewriter.create<memref::ExtractStridedMetadataOp>(origLoc, source);
-
-    // Collect statically known information.
-    auto [strides, offset] = getStridesAndOffset(sourceType);
-    MemRefType reshapeType = reshape.getResultType();
-    unsigned reshapeRank = reshapeType.getRank();
-
-    OpFoldResult offsetOfr =
-        ShapedType::isDynamic(offset)
-            ? getAsOpFoldResult(newExtractStridedMetadata.getOffset())
-            : rewriter.getIndexAttr(offset);
-
-    // Get the special case of 0-D out of the way.
-    if (sourceRank == 0) {
-      SmallVector<OpFoldResult> ones(reshapeRank, rewriter.getIndexAttr(1));
-      auto memrefDesc = rewriter.create<memref::ReinterpretCastOp>(
-          origLoc, reshapeType, newExtractStridedMetadata.getBaseBuffer(),
-          offsetOfr, /*sizes=*/ones, /*strides=*/ones);
-      rewriter.replaceOp(reshape, memrefDesc.getResult());
-      return success();
+    FailureOr<StridedMetadata> stridedMetadata =
+        resolveReshapeStridedMetadata<ReassociativeReshapeLikeOp>(
+            rewriter, reshape, getReshapedSizes, getReshapedStrides);
+    if (failed(stridedMetadata)) {
+      return rewriter.notifyMatchFailure(reshape,
+                                         "failed to resolve reshape metadata");
     }
 
-    SmallVector<OpFoldResult> finalSizes;
-    finalSizes.reserve(reshapeRank);
-    SmallVector<OpFoldResult> finalStrides;
-    finalStrides.reserve(reshapeRank);
-
-    // Compute the reshaped strides and sizes from the base strides and sizes.
-    SmallVector<OpFoldResult> origSizes =
-        getAsOpFoldResult(newExtractStridedMetadata.getSizes());
-    SmallVector<OpFoldResult> origStrides =
-        getAsOpFoldResult(newExtractStridedMetadata.getStrides());
-    unsigned idx = 0, endIdx = reshape.getReassociationIndices().size();
-    for (; idx != endIdx; ++idx) {
-      SmallVector<OpFoldResult> reshapedSizes =
-          getReshapedSizes(reshape, rewriter, origSizes, /*groupId=*/idx);
-      SmallVector<OpFoldResult> reshapedStrides = getReshapedStrides(
-          reshape, rewriter, origSizes, origStrides, /*groupId=*/idx);
-
-      unsigned groupSize = reshapedSizes.size();
-      for (unsigned i = 0; i < groupSize; ++i) {
-        finalSizes.push_back(reshapedSizes[i]);
-        finalStrides.push_back(reshapedStrides[i]);
-      }
+    rewriter.replaceOpWithNewOp<memref::ReinterpretCastOp>(
+        reshape, reshape.getType(), stridedMetadata->basePtr,
+        stridedMetadata->offset, stridedMetadata->sizes,
+        stridedMetadata->strides);
+    return success();
+  }
+};
+
+/// Pattern to replace `extract_strided_metadata(collapse_shape)`
+/// With
+///
+/// \verbatim
+/// baseBuffer, baseOffset, baseSizes, baseStrides =
+///     extract_strided_metadata(memref)
+/// strides#i = baseStrides#i * subSizes#i
+/// offset = baseOffset + sum(subOffset#i * baseStrides#i)
+/// sizes = subSizes
+/// \verbatim
+///
+/// with `baseBuffer`, `offset`, `sizes` and `strides` being
+/// the replacements for the original `extract_strided_metadata`.
+struct ExtractStridedMetadataOpCollapseShapeFolder
+    : OpRewritePattern<memref::ExtractStridedMetadataOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(memref::ExtractStridedMetadataOp op,
+                                PatternRewriter &rewriter) const override {
+    auto collapseShapeOp =
+        op.getSource().getDefiningOp<memref::CollapseShapeOp>();
+    if (!collapseShapeOp)
+      return failure();
+
+    FailureOr<StridedMetadata> stridedMetadata =
+        resolveReshapeStridedMetadata<memref::CollapseShapeOp>(
+            rewriter, collapseShapeOp, getCollapsedSize, getCollapsedStride);
+    if (failed(stridedMetadata)) {
+      return rewriter.notifyMatchFailure(
+          op,
+          "failed to resolve metadata in terms of source collapse_shape op");
     }
-    assert(((isa<memref::ExpandShapeOp>(reshape) && idx == sourceRank) ||
-            (isa<memref::CollapseShapeOp>(reshape) && idx == reshapeRank)) &&
-           "We should have visited all the input dimensions");
-    assert(finalSizes.size() == reshapeRank &&
-           "We should have populated all the values");
-    auto memrefDesc = rewriter.create<memref::ReinterpretCastOp>(
-        origLoc, reshapeType, newExtractStridedMetadata.getBaseBuffer(),
-        offsetOfr, finalSizes, finalStrides);
-    rewriter.replaceOp(reshape, memrefDesc.getResult());
+
+    Location loc = collapseShapeOp.getLoc();
+    SmallVector<Value> results;
+    results.push_back(stridedMetadata->basePtr);
+    results.push_back(getValueOrCreateConstantIndexOp(rewriter, loc,
+                                                      stridedMetadata->offset));
+    results.append(
+        getValueOrCreateConstantIndexOp(rewriter, loc, stridedMetadata->sizes));
+    results.append(getValueOrCreateConstantIndexOp(rewriter, loc,
+                                                   stridedMetadata->strides));
+    rewriter.replaceOp(op, results);
     return success();
   }
 };
@@ -1018,9 +1098,11 @@ void memref::populateExpandStridedMetadataPatterns(
                              getCollapsedStride>,
                ExtractStridedMetadataOpAllocFolder<memref::AllocOp>,
                ExtractStridedMetadataOpAllocFolder<memref::AllocaOp>,
+               ExtractStridedMetadataOpCollapseShapeFolder,
                ExtractStridedMetadataOpGetGlobalFolder,
                RewriteExtractAlignedPointerAsIndexOfViewLikeOp,
                ExtractStridedMetadataOpReinterpretCastFolder,
+               ExtractStridedMetadataOpSubviewFolder,
                ExtractStridedMetadataOpCastFolder,
                ExtractStridedMetadataOpExtractStridedMetadataFolder>(
       patterns.getContext());
@@ -1030,6 +1112,7 @@ void memref::populateResolveExtractStridedMetadataPatterns(
     RewritePatternSet &patterns) {
   patterns.add<ExtractStridedMetadataOpAllocFolder<memref::AllocOp>,
                ExtractStridedMetadataOpAllocFolder<memref::AllocaOp>,
+               ExtractStridedMetadataOpCollapseShapeFolder,
                ExtractStridedMetadataOpGetGlobalFolder,
                ExtractStridedMetadataOpSubviewFolder,
                RewriteExtractAlignedPointerAsIndexOfViewLikeOp,
diff --git a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp
index 05b813a3b1e9..450bfa0cec0c 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp
@@ -20,25 +20,6 @@
 
 using namespace mlir;
 
-/// Generate an error message string for the given op and the specified error.
-static std::string generateErrorMessage(Operation *op, const std::string &msg) {
-  std::string buffer;
-  llvm::raw_string_ostream stream(buffer);
-  OpPrintingFlags flags;
-  // We may generate a lot of error messages and so we need to ensure the
-  // printing is fast.
-  flags.elideLargeElementsAttrs();
-  flags.printGenericOpForm();
-  flags.skipRegions();
-  flags.useLocalScope();
-  stream << "ERROR: Runtime op verification failed\n";
-  op->print(stream, flags);
-  stream << "\n^ " << msg;
-  stream << "\nLocation: ";
-  op->getLoc().print(stream);
-  return stream.str();
-}
-
 namespace mlir {
 namespace memref {
 namespace {
@@ -62,8 +43,10 @@ struct CastOpInterface
           builder.create<arith::ConstantIndexOp>(loc, resultType.getRank());
       Value isSameRank = builder.create<arith::CmpIOp>(
           loc, arith::CmpIPredicate::eq, srcRank, resultRank);
-      builder.create<cf::AssertOp>(loc, isSameRank,
-                                   generateErrorMessage(op, "rank mismatch"));
+      builder.create<cf::AssertOp>(
+          loc, isSameRank,
+          RuntimeVerifiableOpInterface::generateErrorMessage(op,
+                                                             "rank mismatch"));
     }
 
     // Get source offset and strides. We do not have an op to get offsets and
@@ -101,8 +84,8 @@ struct CastOpInterface
           loc, arith::CmpIPredicate::eq, srcDimSz, resultDimSz);
       builder.create<cf::AssertOp>(
           loc, isSameSz,
-          generateErrorMessage(op, "size mismatch of dim " +
-                                       std::to_string(it.index())));
+          RuntimeVerifiableOpInterface::generateErrorMessage(
+              op, "size mismatch of dim " + std::to_string(it.index())));
     }
 
     // Get result offset and strides.
@@ -119,8 +102,10 @@ struct CastOpInterface
           builder.create<arith::ConstantIndexOp>(loc, resultOffset);
       Value isSameOffset = builder.create<arith::CmpIOp>(
           loc, arith::CmpIPredicate::eq, srcOffset, resultOffsetVal);
-      builder.create<cf::AssertOp>(loc, isSameOffset,
-                                   generateErrorMessage(op, "offset mismatch"));
+      builder.create<cf::AssertOp>(
+          loc, isSameOffset,
+          RuntimeVerifiableOpInterface::generateErrorMessage(
+              op, "offset mismatch"));
     }
 
     // Check strides.
@@ -137,8 +122,8 @@ struct CastOpInterface
           loc, arith::CmpIPredicate::eq, srcStride, resultStrideVal);
       builder.create<cf::AssertOp>(
           loc, isSameStride,
-          generateErrorMessage(op, "stride mismatch of dim " +
-                                       std::to_string(it.index())));
+          RuntimeVerifiableOpInterface::generateErrorMessage(
+              op, "stride mismatch of dim " + std::to_string(it.index())));
     }
   }
 };
@@ -178,7 +163,9 @@ struct LoadStoreOpInterface
                 : andOp;
     }
     builder.create<cf::AssertOp>(
-        loc, assertCond, generateErrorMessage(op, "out-of-bounds access"));
+        loc, assertCond,
+        RuntimeVerifiableOpInterface::generateErrorMessage(
+            op, "out-of-bounds access"));
   }
 };
 
@@ -248,7 +235,7 @@ struct ReinterpretCastOpInterface
 
     builder.create<cf::AssertOp>(
         loc, assertCond,
-        generateErrorMessage(
+        RuntimeVerifiableOpInterface::generateErrorMessage(
             op,
             "result of reinterpret_cast is out-of-bounds of the base memref"));
   }
@@ -293,8 +280,8 @@ struct SubViewOpInterface
 
     builder.create<cf::AssertOp>(
         loc, assertCond,
-        generateErrorMessage(op,
-                             "subview is out-of-bounds of the base memref"));
+        RuntimeVerifiableOpInterface::generateErrorMessage(
+            op, "subview is out-of-bounds of the base memref"));
   }
 };
 
@@ -334,8 +321,9 @@ struct ExpandShapeOpInterface
           builder.create<arith::ConstantIndexOp>(loc, 0));
       builder.create<cf::AssertOp>(
           loc, isModZero,
-          generateErrorMessage(op, "static result dims in reassoc group do not "
-                                   "divide src dim evenly"));
+          RuntimeVerifiableOpInterface::generateErrorMessage(
+              op, "static result dims in reassoc group do not "
+                  "divide src dim evenly"));
     }
   }
 };
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index f60668dd0cf9..0799090cdea9 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -2258,7 +2258,8 @@ void PrivateClauseOp::build(OpBuilder &odsBuilder, OperationState &odsState,
 LogicalResult PrivateClauseOp::verify() {
   Type symType = getType();
 
-  auto verifyTerminator = [&](Operation *terminator) -> LogicalResult {
+  auto verifyTerminator = [&](Operation *terminator,
+                              bool yieldsValue) -> LogicalResult {
     if (!terminator->getBlock()->getSuccessors().empty())
       return success();
 
@@ -2269,6 +2270,14 @@ LogicalResult PrivateClauseOp::verify() {
     YieldOp yieldOp = llvm::cast<YieldOp>(terminator);
     TypeRange yieldedTypes = yieldOp.getResults().getTypes();
 
+    if (!yieldsValue) {
+      if (yieldedTypes.empty())
+        return success();
+
+      return mlir::emitError(terminator->getLoc())
+             << "Did not expect any values to be yielded.";
+    }
+
     if (yieldedTypes.size() == 1 && yieldedTypes.front() == symType)
       return success();
 
@@ -2285,7 +2294,8 @@ LogicalResult PrivateClauseOp::verify() {
   };
 
   auto verifyRegion = [&](Region &region, unsigned expectedNumArgs,
-                          StringRef regionName) -> LogicalResult {
+                          StringRef regionName,
+                          bool yieldsValue) -> LogicalResult {
     assert(!region.empty());
 
     if (region.getNumArguments() != expectedNumArgs)
@@ -2299,14 +2309,15 @@ LogicalResult PrivateClauseOp::verify() {
       if (!block.mightHaveTerminator())
         continue;
 
-      if (failed(verifyTerminator(block.getTerminator())))
+      if (failed(verifyTerminator(block.getTerminator(), yieldsValue)))
         return failure();
     }
 
     return success();
   };
 
-  if (failed(verifyRegion(getAllocRegion(), /*expectedNumArgs=*/1, "alloc")))
+  if (failed(verifyRegion(getAllocRegion(), /*expectedNumArgs=*/1, "alloc",
+                          /*yieldsValue=*/true)))
     return failure();
 
   DataSharingClauseType dsType = getDataSharingType();
@@ -2319,7 +2330,13 @@ LogicalResult PrivateClauseOp::verify() {
         "`firstprivate` clauses require both `alloc` and `copy` regions.");
 
   if (dsType == DataSharingClauseType::FirstPrivate &&
-      failed(verifyRegion(getCopyRegion(), /*expectedNumArgs=*/2, "copy")))
+      failed(verifyRegion(getCopyRegion(), /*expectedNumArgs=*/2, "copy",
+                          /*yieldsValue=*/true)))
+    return failure();
+
+  if (!getDeallocRegion().empty() &&
+      failed(verifyRegion(getDeallocRegion(), /*expectedNumArgs=*/1, "dealloc",
+                          /*yieldsValue=*/false)))
     return failure();
 
   return success();
diff --git a/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt b/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt
index 7f5b3255d5d9..d6e703b8b359 100644
--- a/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt
@@ -16,4 +16,5 @@ add_mlir_dialect_library(MLIRPolynomialDialect
   MLIRSupport
   MLIRDialect
   MLIRIR
+  MLIRInferTypeOpInterface
   )
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
index ee09c73bb3c4..f1ec2be72a33 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
@@ -172,7 +172,7 @@ Attribute RingAttr::parse(AsmParser &parser, Type type) {
     if (failed(parser.parseEqual()))
       return {};
 
-    IntegerType iType = ty.dyn_cast<IntegerType>();
+    IntegerType iType = mlir::dyn_cast<IntegerType>(ty);
     if (!iType) {
       parser.emitError(parser.getCurrentLocation(),
                        "coefficientType must specify an integer type");
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialDialect.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialDialect.cpp
index a672a59b8a46..825b80d70f80 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialDialect.cpp
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialDialect.cpp
@@ -8,9 +8,18 @@
 
 #include "mlir/Dialect/Polynomial/IR/Polynomial.h"
 
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.h"
 #include "mlir/Dialect/Polynomial/IR/PolynomialOps.h"
 #include "mlir/Dialect/Polynomial/IR/PolynomialTypes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/InferTypeOpInterface.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/TypeSwitch.h"
 
 using namespace mlir;
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
index 96c59a28b8fd..8e2bb5f27dc6 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
@@ -6,10 +6,101 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Dialect/Polynomial/IR/PolynomialOps.h"
 #include "mlir/Dialect/Polynomial/IR/Polynomial.h"
+#include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.h"
+#include "mlir/Dialect/Polynomial/IR/PolynomialTypes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/APInt.h"
 
 using namespace mlir;
 using namespace mlir::polynomial;
 
-#define GET_OP_CLASSES
-#include "mlir/Dialect/Polynomial/IR/Polynomial.cpp.inc"
+void FromTensorOp::build(OpBuilder &builder, OperationState &result,
+                         Value input, RingAttr ring) {
+  TensorType tensorType = dyn_cast<TensorType>(input.getType());
+  auto bitWidth = tensorType.getElementTypeBitWidth();
+  APInt cmod(1 + bitWidth, 1);
+  cmod = cmod << bitWidth;
+  Type resultType = PolynomialType::get(builder.getContext(), ring);
+  build(builder, result, resultType, input);
+}
+
+LogicalResult FromTensorOp::verify() {
+  ArrayRef<int64_t> tensorShape = getInput().getType().getShape();
+  RingAttr ring = getOutput().getType().getRing();
+  unsigned polyDegree = ring.getPolynomialModulus().getPolynomial().getDegree();
+  bool compatible = tensorShape.size() == 1 && tensorShape[0] <= polyDegree;
+  if (!compatible) {
+    InFlightDiagnostic diag = emitOpError()
+                              << "input type " << getInput().getType()
+                              << " does not match output type "
+                              << getOutput().getType();
+    diag.attachNote() << "the input type must be a tensor of shape [d] where d "
+                         "is at most the degree of the polynomialModulus of "
+                         "the output type's ring attribute";
+    return diag;
+  }
+
+  APInt coefficientModulus = ring.getCoefficientModulus().getValue();
+  unsigned cmodBitWidth = coefficientModulus.ceilLogBase2();
+  unsigned inputBitWidth = getInput().getType().getElementTypeBitWidth();
+
+  if (inputBitWidth > cmodBitWidth) {
+    InFlightDiagnostic diag = emitOpError()
+                              << "input tensor element type "
+                              << getInput().getType().getElementType()
+                              << " is too large to fit in the coefficients of "
+                              << getOutput().getType();
+    diag.attachNote() << "the input tensor's elements must be rescaled"
+                         " to fit before using from_tensor";
+    return diag;
+  }
+
+  return success();
+}
+
+LogicalResult ToTensorOp::verify() {
+  ArrayRef<int64_t> tensorShape = getOutput().getType().getShape();
+  unsigned polyDegree = getInput()
+                            .getType()
+                            .getRing()
+                            .getPolynomialModulus()
+                            .getPolynomial()
+                            .getDegree();
+  bool compatible = tensorShape.size() == 1 && tensorShape[0] == polyDegree;
+
+  if (compatible)
+    return success();
+
+  InFlightDiagnostic diag =
+      emitOpError() << "input type " << getInput().getType()
+                    << " does not match output type " << getOutput().getType();
+  diag.attachNote() << "the output type must be a tensor of shape [d] where d "
+                       "is at most the degree of the polynomialModulus of "
+                       "the input type's ring attribute";
+  return diag;
+}
+
+LogicalResult MulScalarOp::verify() {
+  Type argType = getPolynomial().getType();
+  PolynomialType polyType;
+
+  if (auto shapedPolyType = dyn_cast<ShapedType>(argType)) {
+    polyType = cast<PolynomialType>(shapedPolyType.getElementType());
+  } else {
+    polyType = cast<PolynomialType>(argType);
+  }
+
+  Type coefficientType = polyType.getRing().getCoefficientType();
+
+  if (coefficientType != getScalar().getType())
+    return emitOpError() << "polynomial coefficient type " << coefficientType
+                         << " does not match scalar type "
+                         << getScalar().getType();
+
+  return success();
+}
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index b1d44559fa5a..028a69da10c1 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -326,9 +326,9 @@ SparseTensorDimSliceAttr::verify(function_ref<InFlightDiagnostic()> emitError,
 SparseTensorEncodingAttr
 SparseTensorEncodingAttr::withDimToLvl(AffineMap dimToLvl) const {
   assert(getImpl() && "Uninitialized SparseTensorEncodingAttr");
-  return SparseTensorEncodingAttr::get(getContext(), getLvlTypes(), dimToLvl,
-                                       AffineMap(), getPosWidth(),
-                                       getCrdWidth());
+  return SparseTensorEncodingAttr::get(
+      getContext(), getLvlTypes(), dimToLvl, AffineMap(), getPosWidth(),
+      getCrdWidth(), getExplicitVal(), getImplicitVal());
 }
 
 SparseTensorEncodingAttr
@@ -344,20 +344,44 @@ SparseTensorEncodingAttr
 SparseTensorEncodingAttr::withBitWidths(unsigned posWidth,
                                         unsigned crdWidth) const {
   assert(getImpl() && "Uninitialized SparseTensorEncodingAttr");
-  return SparseTensorEncodingAttr::get(getContext(), getLvlTypes(),
-                                       getDimToLvl(), getLvlToDim(), posWidth,
-                                       crdWidth);
+  return SparseTensorEncodingAttr::get(
+      getContext(), getLvlTypes(), getDimToLvl(), getLvlToDim(), posWidth,
+      crdWidth, getExplicitVal(), getImplicitVal());
 }
 
 SparseTensorEncodingAttr SparseTensorEncodingAttr::withoutBitWidths() const {
   return withBitWidths(0, 0);
 }
 
+SparseTensorEncodingAttr
+SparseTensorEncodingAttr::withExplicitVal(Attribute explicitVal) const {
+  assert(getImpl() && "Uninitialized SparseTensorEncodingAttr");
+  return SparseTensorEncodingAttr::get(
+      getContext(), getLvlTypes(), getDimToLvl(), getLvlToDim(), getPosWidth(),
+      getCrdWidth(), explicitVal, getImplicitVal());
+}
+
+SparseTensorEncodingAttr SparseTensorEncodingAttr::withoutExplicitVal() const {
+  return withExplicitVal(Attribute());
+}
+
+SparseTensorEncodingAttr
+SparseTensorEncodingAttr::withImplicitVal(Attribute implicitVal) const {
+  assert(getImpl() && "Uninitialized SparseTensorEncodingAttr");
+  return SparseTensorEncodingAttr::get(
+      getContext(), getLvlTypes(), getDimToLvl(), getLvlToDim(), getPosWidth(),
+      getCrdWidth(), getExplicitVal(), implicitVal);
+}
+
+SparseTensorEncodingAttr SparseTensorEncodingAttr::withoutImplicitVal() const {
+  return withImplicitVal(Attribute());
+}
+
 SparseTensorEncodingAttr SparseTensorEncodingAttr::withDimSlices(
     ArrayRef<SparseTensorDimSliceAttr> dimSlices) const {
-  return SparseTensorEncodingAttr::get(getContext(), getLvlTypes(),
-                                       getDimToLvl(), getLvlToDim(),
-                                       getPosWidth(), getCrdWidth(), dimSlices);
+  return SparseTensorEncodingAttr::get(
+      getContext(), getLvlTypes(), getDimToLvl(), getLvlToDim(), getPosWidth(),
+      getCrdWidth(), getExplicitVal(), getImplicitVal(), dimSlices);
 }
 
 SparseTensorEncodingAttr SparseTensorEncodingAttr::withoutDimSlices() const {
@@ -553,8 +577,11 @@ Attribute SparseTensorEncodingAttr::parse(AsmParser &parser, Type type) {
   AffineMap lvlToDim = {};
   unsigned posWidth = 0;
   unsigned crdWidth = 0;
+  Attribute explicitVal;
+  Attribute implicitVal;
   StringRef attrName;
-  SmallVector<StringRef, 3> keys = {"map", "posWidth", "crdWidth"};
+  SmallVector<StringRef, 5> keys = {"map", "posWidth", "crdWidth",
+                                    "explicitVal", "implicitVal"};
   while (succeeded(parser.parseOptionalKeyword(&attrName))) {
     // Detect admissible keyword.
     auto *it = find(keys, attrName);
@@ -628,6 +655,36 @@ Attribute SparseTensorEncodingAttr::parse(AsmParser &parser, Type type) {
       crdWidth = intAttr.getInt();
       break;
     }
+    case 3: { // explicitVal
+      Attribute attr;
+      if (failed(parser.parseAttribute(attr)))
+        return {};
+      if (auto result = llvm::dyn_cast<FloatAttr>(attr)) {
+        explicitVal = result;
+      } else if (auto result = llvm::dyn_cast<IntegerAttr>(attr)) {
+        explicitVal = result;
+      } else {
+        parser.emitError(parser.getNameLoc(),
+                         "expected a numeric value for explicitVal");
+        return {};
+      }
+      break;
+    }
+    case 4: { // implicitVal
+      Attribute attr;
+      if (failed(parser.parseAttribute(attr)))
+        return {};
+      if (auto result = llvm::dyn_cast<FloatAttr>(attr)) {
+        implicitVal = result;
+      } else if (auto result = llvm::dyn_cast<IntegerAttr>(attr)) {
+        implicitVal = result;
+      } else {
+        parser.emitError(parser.getNameLoc(),
+                         "expected a numeric value for implicitVal");
+        return {};
+      }
+      break;
+    }
     } // switch
     // Only last item can omit the comma.
     if (parser.parseOptionalComma().failed())
@@ -646,7 +703,7 @@ Attribute SparseTensorEncodingAttr::parse(AsmParser &parser, Type type) {
   }
   return parser.getChecked<SparseTensorEncodingAttr>(
       parser.getContext(), lvlTypes, dimToLvl, lvlToDim, posWidth, crdWidth,
-      dimSlices);
+      explicitVal, implicitVal, dimSlices);
 }
 
 void SparseTensorEncodingAttr::print(AsmPrinter &printer) const {
@@ -666,6 +723,11 @@ void SparseTensorEncodingAttr::print(AsmPrinter &printer) const {
     printer << ", posWidth = " << getPosWidth();
   if (getCrdWidth())
     printer << ", crdWidth = " << getCrdWidth();
+  if (getExplicitVal()) {
+    printer << ", explicitVal = " << getExplicitVal();
+  }
+  if (getImplicitVal())
+    printer << ", implicitVal = " << getImplicitVal();
   printer << " }>";
 }
 
@@ -715,7 +777,8 @@ void SparseTensorEncodingAttr::printLevels(AffineMap &map, AsmPrinter &printer,
 LogicalResult SparseTensorEncodingAttr::verify(
     function_ref<InFlightDiagnostic()> emitError, ArrayRef<LevelType> lvlTypes,
     AffineMap dimToLvl, AffineMap lvlToDim, unsigned posWidth,
-    unsigned crdWidth, ArrayRef<SparseTensorDimSliceAttr> dimSlices) {
+    unsigned crdWidth, Attribute explicitVal, Attribute implicitVal,
+    ArrayRef<SparseTensorDimSliceAttr> dimSlices) {
   if (!acceptBitWidth(posWidth))
     return emitError() << "unexpected position bitwidth: " << posWidth;
   if (!acceptBitWidth(crdWidth))
@@ -831,7 +894,8 @@ LogicalResult SparseTensorEncodingAttr::verifyEncoding(
   // Check structural integrity.  In particular, this ensures that the
   // level-rank is coherent across all the fields.
   if (failed(verify(emitError, getLvlTypes(), getDimToLvl(), getLvlToDim(),
-                    getPosWidth(), getCrdWidth(), getDimSlices())))
+                    getPosWidth(), getCrdWidth(), getExplicitVal(),
+                    getImplicitVal(), getDimSlices())))
     return failure();
   // Check integrity with tensor type specifics.  In particular, we
   // need only check that the dimension-rank of the tensor agrees with
@@ -921,9 +985,9 @@ mlir::sparse_tensor::SparseTensorType::getCOOType(bool ordered) const {
     // Ends by a unique singleton level.
     lvlTypes.push_back(*buildLevelType(LevelFormat::Singleton, ordered, true));
   }
-  auto enc = SparseTensorEncodingAttr::get(getContext(), lvlTypes,
-                                           getDimToLvl(), getLvlToDim(),
-                                           getPosWidth(), getCrdWidth());
+  auto enc = SparseTensorEncodingAttr::get(
+      getContext(), lvlTypes, getDimToLvl(), getLvlToDim(), getPosWidth(),
+      getCrdWidth(), getExplicitVal(), getImplicitVal());
   return RankedTensorType::get(getDimShape(), getElementType(), enc);
 }
 
@@ -1115,7 +1179,10 @@ getNormalizedEncodingForSpecifier(SparseTensorEncodingAttr enc) {
       // `getPosWidth` and `getCrdWidth`. It allows us to reuse the same SSA
       // value for different bitwidth, it also avoids casting between index and
       // integer (returned by DimOp)
-      0, 0, enc.getDimSlices());
+      0, 0,
+      Attribute(), // explicitVal (irrelevant to storage specifier)
+      Attribute(), // implicitVal (irrelevant to storage specifier)
+      enc.getDimSlices());
 }
 
 StorageSpecifierType
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
index 02375f54d715..7d469198a653 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
@@ -209,6 +209,117 @@ static void concatSizesFromInputs(OpBuilder &builder,
 
 namespace {
 
+/// TODO: move it to tensor dialect instead.
+///
+/// Fold `tensor.concat` and `tensor.extract_slice`
+///
+/// %concat = tensor.concat dim(2) %t0, %t1
+///   : (tensor<1x64x1xf32>, tensor<1x64x1xf32>) -> tensor<1x64x2xf32>
+/// %extracted0 = tensor.extract_slice %concat[0, 0, 0][1, 64, 1][1, 1, 1]
+///   : tensor<1x64x2xf32> to tensor<1x64x1xf32>
+/// %extracted1 = tensor.extract_slice %concat[0, 0, 1][1, 64, 1][1, 1, 1]
+///   : tensor<1x64x2xf32> to tensor<1x64x1xf32>
+///
+/// Becomes
+///
+/// %extract0, %extract1 = %t0, %t1
+struct FuseExtractSliceWithConcat
+    : public OpRewritePattern<tensor::ExtractSliceOp> {
+  using OpRewritePattern<tensor::ExtractSliceOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(tensor::ExtractSliceOp extractOp,
+                                PatternRewriter &rewriter) const override {
+    auto concatOp = extractOp.getSource().getDefiningOp<tensor::ConcatOp>();
+    if (!concatOp)
+      return failure();
+
+    Location loc = extractOp.getLoc();
+    int64_t dim = concatOp.getDim();
+    int64_t rank = extractOp.getResultType().getRank();
+
+    SmallVector<OpFoldResult> srcStrides(rank, rewriter.getIndexAttr(1));
+    SmallVector<OpFoldResult> srcOffsets(rank, rewriter.getIndexAttr(0));
+
+    // Compute the partial sums for the slice offsets.
+    AffineExpr sum = rewriter.getAffineDimExpr(0);
+    SmallVector<AffineExpr> partialSums = {sum};
+    SmallVector<OpFoldResult> offsetStrides = {rewriter.getIndexAttr(0)};
+    for (auto [idx, input] :
+         llvm::enumerate(concatOp.getInputs().drop_back())) {
+      sum = sum + rewriter.getAffineDimExpr(idx + 1);
+      partialSums.push_back(sum);
+      offsetStrides.push_back(
+          rewriter.createOrFold<tensor::DimOp>(loc, input, dim));
+    }
+    auto partialSumMap = AffineMap::get(concatOp.getInputs().size(), 0,
+                                        partialSums, rewriter.getContext());
+    SmallVector<OpFoldResult> dimOffsets =
+        affine::makeComposedFoldedMultiResultAffineApply(
+            rewriter, loc, partialSumMap, offsetStrides);
+
+    auto allEqual = [](ArrayRef<OpFoldResult> lhs, ArrayRef<OpFoldResult> rhs) {
+      for (auto [l, r] : llvm::zip(lhs, rhs)) {
+        std::optional<int64_t> staticVal = getConstantIntValue(l);
+        if (!staticVal.has_value() || staticVal != getConstantIntValue(r))
+          return false;
+      }
+      return lhs.size() == rhs.size();
+    };
+
+    for (auto [i, input, offset] :
+         llvm::enumerate(concatOp.getInputs(), dimOffsets)) {
+      SmallVector<OpFoldResult> srcSizes =
+          tensor::getMixedSizes(rewriter, loc, input);
+      srcOffsets[dim] = offset;
+
+      SmallVector<OpFoldResult> dstSizes = extractOp.getMixedSizes();
+      SmallVector<OpFoldResult> dstOffsets = extractOp.getMixedOffsets();
+      SmallVector<OpFoldResult> dstStrides = extractOp.getMixedStrides();
+
+      if (allEqual(srcSizes, dstSizes) && allEqual(srcOffsets, dstOffsets) &&
+          allEqual(srcStrides, dstStrides)) {
+        Value operand = concatOp.getOperand(i);
+        if (operand.getType() == extractOp.getResultType())
+          rewriter.replaceOp(extractOp, operand);
+        break;
+      }
+    }
+
+    return success();
+  }
+};
+
+/// Rewriting rule that fuses sparse_tensor.convert into producer.
+struct FoldConvertIntoProducer : public OpRewritePattern<ConvertOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(ConvertOp op,
+                                PatternRewriter &rewriter) const override {
+    auto producer = op.getSource().getDefiningOp<GenericOp>();
+    if (!producer || producer.getDpsInits().size() != 1 ||
+        !isMaterializing(producer.getDpsInitOperand(0), false) ||
+        !producer.getResult(0).hasOneUse()) {
+      return failure();
+    }
+    rewriter.modifyOpInPlace(producer, [&]() {
+      producer.getResult(0).setType(op.getResult().getType());
+    });
+
+    Operation *materializeOp =
+        producer.getDpsInitOperand(0)->get().getDefiningOp();
+
+    rewriter.modifyOpInPlace(materializeOp, [&]() {
+      materializeOp->getResult(0).setType(op.getResult().getType());
+    });
+
+    rewriter.replaceAllOpUsesWith(op, producer);
+    op->erase();
+
+    return success();
+  }
+};
+
 /// Rewriting rule that converts direct yield of zero with initial allocation.
 struct FoldInvariantYield : public OpRewritePattern<GenericOp> {
 public:
@@ -952,8 +1063,15 @@ public:
       auto rtp = getRankedTensorType(op.getResult());
       auto denseTp =
           RankedTensorType::get(rtp.getShape(), rtp.getElementType());
-      auto reshape = rewriter.create<ReshapeOp>(loc, denseTp, op.getSrc(),
-                                                op.getReassociation());
+      ReshapeOp reshape;
+      if constexpr (std::is_same<ReshapeOp, tensor::ExpandShapeOp>::value) {
+        reshape = rewriter.create<ReshapeOp>(
+            loc, denseTp, op.getSrc(), op.getReassociation(),
+            op.getOutputShape(), op.getStaticOutputShape());
+      } else {
+        reshape = rewriter.create<ReshapeOp>(loc, denseTp, op.getSrc(),
+                                             op.getReassociation());
+      }
       Value convert = rewriter.create<ConvertOp>(loc, rtp, reshape);
       rewriter.replaceOp(op, convert);
       return success();
@@ -1426,7 +1544,8 @@ struct OutRewriter : public OpRewritePattern<OutOp> {
 //===---------------------------------------------------------------------===//
 
 void mlir::populatePreSparsificationRewriting(RewritePatternSet &patterns) {
-  patterns.add<FoldInvariantYield, FuseSparseMultiplyOverAdd, FuseTensorCast,
+  patterns.add<FuseExtractSliceWithConcat, FoldConvertIntoProducer,
+               FoldInvariantYield, FuseSparseMultiplyOverAdd, FuseTensorCast,
                GenSemiRingReduction, GenSemiRingSelect, PrintRewriter>(
       patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index cd046b670d9a..0c8e431d8c99 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -403,6 +403,22 @@ static Value genInsertionLoadReduce(CodegenEnv &env, OpBuilder &builder,
   return builder.create<arith::SelectOp>(loc, isFilled, valAtIndex, identity);
 }
 
+static Value genConditionalInsert(Location loc, OpBuilder &builder, Value cond,
+                                  Value sparseOut, ValueRange ivs, Value v) {
+  scf::IfOp condInsert =
+      builder.create<scf::IfOp>(loc, sparseOut.getType(), cond, true);
+  // True branch.
+  builder.setInsertionPointToStart(condInsert.thenBlock());
+  Value res = builder.create<tensor::InsertOp>(loc, v, sparseOut, ivs);
+  builder.create<scf::YieldOp>(loc, res);
+  // False branch.
+  builder.setInsertionPointToStart(condInsert.elseBlock());
+  builder.create<scf::YieldOp>(loc, sparseOut);
+  // Value assignment.
+  builder.setInsertionPointAfter(condInsert);
+  return condInsert.getResult(0);
+}
+
 /// Generates insertion code to implement dynamic tensor store.
 static void genInsertionStore(CodegenEnv &env, OpBuilder &builder, OpOperand *t,
                               Value rhs) {
@@ -423,23 +439,21 @@ static void genInsertionStore(CodegenEnv &env, OpBuilder &builder, OpOperand *t,
       //     return updated chain
       //   else
       //     return unmodified chain
-      scf::IfOp ifValidLexInsert = builder.create<scf::IfOp>(
-          loc, chain.getType(), env.getValidLexInsert(),
-          /*else=*/true);
-      // True branch.
-      builder.setInsertionPointToStart(ifValidLexInsert.thenBlock());
-      Value res = builder.create<tensor::InsertOp>(loc, rhs, chain, ivs);
-      builder.create<scf::YieldOp>(loc, res);
-      // False branch.
-      builder.setInsertionPointToStart(ifValidLexInsert.elseBlock());
-      builder.create<scf::YieldOp>(loc, chain);
-      // Value assignment.
-      builder.setInsertionPointAfter(ifValidLexInsert);
-      env.updateInsertionChain(ifValidLexInsert.getResult(0));
+      Value out = genConditionalInsert(loc, builder, env.getValidLexInsert(),
+                                       chain, ivs, rhs);
+      env.updateInsertionChain(out);
     } else {
+      Value sparseOut;
+      if (!hasAnySparseType(env.op().getInputs().getTypes())) {
+        // This is an all-dense -> sparse kernel, test rhs != 0 before
+        // insertion.
+        Value nz = genIsNonzero(builder, loc, rhs);
+        sparseOut = genConditionalInsert(loc, builder, nz, chain, ivs, rhs);
+      } else {
+        sparseOut = builder.create<tensor::InsertOp>(loc, rhs, chain, ivs);
+      }
       // Generates regular insertion chain.
-      env.updateInsertionChain(
-          builder.create<tensor::InsertOp>(loc, rhs, chain, ivs));
+      env.updateInsertionChain(sparseOut);
     }
     return;
   }
@@ -484,9 +498,15 @@ static Value genTensorLoad(CodegenEnv &env, OpBuilder &builder, ExprId exp) {
   Value val = env.exp(exp).val;
   if (val)
     return val;
-  // Load during insertion.
+  // Get tensor operand.
   linalg::GenericOp op = env.op();
+  Location loc = op.getLoc();
   OpOperand *t = &op->getOpOperand(env.exp(exp).tensor);
+  // Fold binary-valued tensor into explicit value.
+  const auto stt = getSparseTensorType(t->get());
+  if (auto explVal = stt.getExplicitVal())
+    return genValFromAttr(builder, loc, explVal);
+  // Load during insertion.
   if (env.isSparseOutput(t)) {
     if (env.isCustomReduc())
       return genInsertionLoadReduce(env, builder, t);
@@ -495,7 +515,7 @@ static Value genTensorLoad(CodegenEnv &env, OpBuilder &builder, ExprId exp) {
   // Actual load.
   SmallVector<Value> args;
   Value ptr = genSubscript(env, builder, t, args);
-  return builder.create<memref::LoadOp>(op.getLoc(), ptr, args);
+  return builder.create<memref::LoadOp>(loc, ptr, args);
 }
 
 /// Generates a store on a dense or sparse tensor.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.h
index ce5831d999e9..cf3c35f5fa4c 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.h
@@ -399,6 +399,16 @@ inline Value constantLevelTypeEncoding(OpBuilder &builder, Location loc,
   return constantI64(builder, loc, static_cast<uint64_t>(lt));
 }
 
+// Generates a constant from a validated value carrying attribute.
+inline Value genValFromAttr(OpBuilder &builder, Location loc, Attribute attr) {
+  if (auto arrayAttr = dyn_cast<ArrayAttr>(attr)) {
+    Type tp = cast<TypedAttr>(arrayAttr[0]).getType();
+    return builder.create<complex::ConstantOp>(loc, tp, arrayAttr);
+  }
+  return builder.create<arith::ConstantOp>(loc, cast<TypedAttr>(attr));
+}
+
+// TODO: is this at the right place?
 inline bool isZeroRankedTensorOrScalar(Type type) {
   auto rtp = dyn_cast<RankedTensorType>(type);
   return !rtp || rtp.getRank() == 0;
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
index 812c288a20c2..98e315865ba5 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
@@ -75,6 +75,40 @@ static Value genSliceStride(OpBuilder &builder, Location loc, Value tensor,
   return createOrFoldSliceStrideOp(builder, loc, tensor, toDim(enc, lvl));
 }
 
+static bool isIntOrFPZero(Attribute attr) {
+  if (auto f = llvm::dyn_cast<FloatAttr>(attr); f && f.getValue().isZero())
+    return true;
+  if (auto i = llvm::dyn_cast<IntegerAttr>(attr); i && i.getValue().isZero())
+    return true;
+  return false;
+}
+
+static Value unFoldOpIntResult(OpBuilder &builder, Location loc,
+                               OpFoldResult ofr) {
+  if (std::optional<int64_t> i = getConstantIntValue(ofr); i.has_value())
+    return constantIndex(builder, loc, *i);
+  return ofr.get<Value>();
+}
+
+static Value tryFoldTensors(Value t) {
+  // TODO: this should be done through a folding pass after switching to
+  // `sparse_tensor.iterate`-based sparsification.
+  auto stt = tryGetSparseTensorType(t);
+  auto padOp = t.getDefiningOp<tensor::PadOp>();
+  if (padOp && stt.has_value() && stt->hasEncoding() &&
+      padOp.getSourceType().getEncoding() == stt->getEncoding() &&
+      stt->getEncoding().isIdentity()) {
+    // Try fusing padOp with zeros.
+    Attribute padCst;
+    if (matchPattern(padOp.getBody()->getTerminator(),
+                     m_Op<tensor::YieldOp>(m_Constant(&padCst))) &&
+        isIntOrFPZero(padCst)) {
+      return padOp.getSource();
+    }
+  }
+  return t;
+}
+
 //===----------------------------------------------------------------------===//
 // Sparse tensor loop emitter class implementations
 //===----------------------------------------------------------------------===//
@@ -166,15 +200,30 @@ void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
 std::unique_ptr<SparseIterator>
 LoopEmitter::makeLevelIterator(OpBuilder &builder, Location loc, TensorId t,
                                Level l) {
+  Value tensor = tensors[t];
+  auto stt = getSparseTensorType(tensor);
   auto it = makeSimpleIterator(*lvls[t][l], emitStrategy);
-  auto stt = getSparseTensorType(tensors[t]);
+
+  Value folded = tryFoldTensors(tensor);
+  if (folded != tensor) {
+    auto padOp = tensor.getDefiningOp<tensor::PadOp>();
+    assert(padOp);
+    if (padOp.getPaddedDims().test(l)) {
+      Value low = unFoldOpIntResult(builder, loc, padOp.getMixedLowPad()[l]);
+      Value high = unFoldOpIntResult(builder, loc, padOp.getMixedHighPad()[l]);
+      auto padIt = makePaddedIterator(std::move(it), low, high, emitStrategy);
+      return padIt;
+    }
+  }
+
   if (stt.hasEncoding() && stt.getEncoding().isSlice()) {
-    Value offset = genSliceOffset(builder, loc, tensors[t], l);
-    Value stride = genSliceStride(builder, loc, tensors[t], l);
+    Value offset = genSliceOffset(builder, loc, tensor, l);
+    Value stride = genSliceStride(builder, loc, tensor, l);
     auto slicedIt = makeSlicedLevelIterator(
         std::move(it), offset, stride, lvls[t][l]->getSize(), emitStrategy);
     return slicedIt;
   }
+
   return it;
 }
 
@@ -200,7 +249,9 @@ void LoopEmitter::initializeLoopEmit(
   //     on positions.
   for (TensorId t = 0, numTensors = getNumManifestTensors(); t < numTensors;
        t++) {
-    const Value tensor = tensors[t];
+    // TODO: this should be done through a folding pass after switching to
+    // `sparse_tensor.iterate`-based sparsification.
+    const Value tensor = tryFoldTensors(tensors[t]);
     const auto rtp = dyn_cast<RankedTensorType>(tensor.getType());
     if (!rtp)
       // Skips only scalar, zero ranked tensor still need to be bufferized and
@@ -213,14 +264,6 @@ void LoopEmitter::initializeLoopEmit(
     const Level lvlRank = stt.getLvlRank();
     const auto shape = rtp.getShape();
 
-    SmallVector<Value> lvlSzs;
-    for (Level l = 0; l < stt.getLvlRank(); l++) {
-      if (stt.hasEncoding())
-        lvlSzs.push_back(builder.create<LvlOp>(loc, tensor, l));
-      else
-        lvlSzs.push_back(builder.create<tensor::DimOp>(loc, tensor, l));
-    }
-
     // Scan all levels of current tensor.
     for (Level l = 0; l < lvlRank; l++) {
       // Find upper bound in current dimension.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
index 59c3e49264db..34312df91299 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
@@ -222,7 +222,7 @@ public:
   ///
   SmallVector<Value> getValPosits(TensorId tid) const {
     SmallVector<Value> batchCrds = iters[tid].back().back()->getBatchCrds();
-    Value lastLvlPos = iters[tid].back().back()->getCurPosition().first;
+    Value lastLvlPos = iters[tid].back().back()->getCurPosition().front();
     batchCrds.push_back(lastLvlPos);
     return batchCrds;
   };
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
index 60dca3c55dec..dbec46d2616d 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
@@ -94,8 +94,11 @@ public:
 
   ValueRange getLvlBuffers() const override { return {}; }
 
-  ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange, Value p,
-                        Value max) const override {
+  ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
+                        ValueRange parentPos, Value inPadZone) const override {
+    assert(parentPos.size() == 1 && "Dense level can not be non-unique.");
+    assert(!inPadZone && "Not implemented");
+    Value p = parentPos.front();
     Value posLo = MULI(p, lvlSize);
     return {posLo, lvlSize};
   }
@@ -112,9 +115,10 @@ public:
 
   ValueRange getLvlBuffers() const override { return {}; }
 
-  ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange, Value p,
-                        Value max) const override {
-    assert(max == nullptr && "Dense level can not be non-unique.");
+  ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange,
+                        ValueRange parentPos, Value inPadZone) const override {
+    assert(!inPadZone && "Not implemented");
+    assert(parentPos.size() == 1 && "Dense level can not be non-unique.");
     // No need to linearize the position for non-annotated tensors.
     return {C_IDX(0), lvlSize};
   }
@@ -127,16 +131,42 @@ public:
       : SparseLevel(tid, lvl, lt, lvlSize, {posBuffer, crdBuffer}) {}
 
   ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
-                        Value p, Value max) const override {
-    assert(max == nullptr &&
+                        ValueRange parentPos, Value inPadZone) const override {
+
+    assert(parentPos.size() == 1 &&
            "compressed level must be the first non-unique level.");
 
-    SmallVector<Value> memCrd(batchPrefix);
-    memCrd.push_back(p);
-    Value pLo = genIndexLoad(b, l, getPosBuf(), memCrd);
-    memCrd.back() = ADDI(p, C_IDX(1));
-    Value pHi = genIndexLoad(b, l, getPosBuf(), memCrd);
-    return {pLo, pHi};
+    auto loadRange = [&b, l, parentPos, batchPrefix, this]() -> ValuePair {
+      Value p = parentPos.front();
+      SmallVector<Value> memCrd(batchPrefix);
+      memCrd.push_back(p);
+      Value pLo = genIndexLoad(b, l, getPosBuf(), memCrd);
+      memCrd.back() = ADDI(p, C_IDX(1));
+      Value pHi = genIndexLoad(b, l, getPosBuf(), memCrd);
+      return {pLo, pHi};
+    };
+
+    if (inPadZone == nullptr)
+      return loadRange();
+
+    SmallVector<Type, 2> types{b.getIndexType(), b.getIndexType()};
+    scf::IfOp posRangeIf = b.create<scf::IfOp>(l, types, inPadZone, true);
+    // True branch, returns a "fake" empty range [0, 0) if parent
+    // iterator is in pad zone.
+    b.setInsertionPointToStart(posRangeIf.thenBlock());
+
+    SmallVector<Value, 2> emptyRange{C_IDX(0), C_IDX(0)};
+    b.create<scf::YieldOp>(l, emptyRange);
+
+    // False branch, returns the actual range.
+    b.setInsertionPointToStart(posRangeIf.elseBlock());
+    auto [pLo, pHi] = loadRange();
+    SmallVector<Value, 2> loadedRange{pLo, pHi};
+    b.create<scf::YieldOp>(l, loadedRange);
+
+    b.setInsertionPointAfter(posRangeIf);
+    ValueRange posRange = posRangeIf.getResults();
+    return {posRange.front(), posRange.back()};
   }
 };
 
@@ -147,11 +177,12 @@ public:
       : SparseLevel(tid, lvl, lt, lvlSize, {posBuffer, crdBuffer}) {}
 
   ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
-                        Value p, Value max) const override {
-    assert(max == nullptr &&
+                        ValueRange parentPos, Value inPadZone) const override {
+    assert(parentPos.size() == 1 &&
            "loose-compressed level must be the first non-unique level.");
+    assert(!inPadZone && "Not implemented");
     SmallVector<Value> memCrd(batchPrefix);
-
+    Value p = parentPos.front();
     p = MULI(p, C_IDX(2));
     memCrd.push_back(p);
     Value pLo = genIndexLoad(b, l, getPosBuf(), memCrd);
@@ -168,10 +199,14 @@ public:
       : SparseLevel(tid, lvl, lt, lvlSize, {crdBuffer}) {}
 
   ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
-                        Value p, Value segHi) const override {
+                        ValueRange parentPos, Value inPadZone) const override {
+    assert(parentPos.size() == 1 || parentPos.size() == 2);
+    assert(!inPadZone && "Not implemented");
+    Value p = parentPos.front();
+    Value segHi = parentPos.size() == 2 ? parentPos.back() : nullptr;
+
     if (segHi == nullptr)
       return {p, ADDI(p, C_IDX(1))};
-
     // Use the segHi as the loop upper bound.
     return {p, segHi};
   }
@@ -184,11 +219,13 @@ public:
       : SparseLevel(tid, lvl, lt, lvlSize, {crdBuffer}) {}
 
   ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
-                        Value p, Value max) const override {
-    assert(max == nullptr && isUnique() && "n:m level can not be non-unique.");
+                        ValueRange parentPos, Value inPadZone) const override {
+    assert(parentPos.size() == 1 && isUnique() &&
+           "n:m level can not be non-unique.");
+    assert(!inPadZone && "Not implemented");
     // Each n:m blk has exactly n specified elements.
     auto n = getN(lt);
-    Value posLo = MULI(p, C_IDX(n));
+    Value posLo = MULI(parentPos.front(), C_IDX(n));
     return {posLo, ADDI(posLo, C_IDX(n))};
   }
 };
@@ -316,26 +353,8 @@ public:
       posHi = vs.back();
   };
 
-  ValuePair getCurPosition() const override { return {getItPos(), nullptr}; }
-
   void genInitImpl(OpBuilder &b, Location l,
-                   const SparseIterator *parent) override {
-
-    if (isBatchIterator() && batchCrds.size() <= stl.lvl)
-      batchCrds.resize(stl.lvl + 1, nullptr);
-
-    Value pos = C_IDX(0);
-    Value hi = nullptr;
-    // If the parent iterator is a batch iterator, we also start from 0 (but
-    // on a different batch).
-    if (parent && !parent->isBatchIterator())
-      std::tie(pos, hi) = parent->getCurPosition();
-
-    ValueRange batchPrefix = parent ? parent->getBatchCrds() : ValueRange{};
-    std::tie(posLo, posHi) = stl.peekRangeAt(b, l, batchPrefix, pos, hi);
-    // Seek to the lowest position.
-    seek(posLo);
-  }
+                   const SparseIterator *parent) override;
 
   ValuePair genForCond(OpBuilder &b, Location l) override {
     if (randomAccessible())
@@ -406,21 +425,19 @@ public:
     return {b.getIndexType(), b.getIndexType()};
   }
 
-  ValuePair getCurPosition() const override { return {getPos(), getSegHi()}; }
-
   void genInitImpl(OpBuilder &b, Location l,
                    const SparseIterator *parent) override {
+    Value c0 = C_IDX(0);
+    ValueRange pPos = c0;
 
-    Value pos = C_IDX(0);
-    Value hi = nullptr;
     // If the parent iterator is a batch iterator, we also start from 0 (but
     // on a different batch).
     if (parent && !parent->isBatchIterator())
-      std::tie(pos, hi) = parent->getCurPosition();
+      pPos = parent->getCurPosition();
 
     Value posLo;
     ValueRange batchPrefix = parent ? parent->getBatchCrds() : ValueRange{};
-    std::tie(posLo, posHi) = stl.peekRangeAt(b, l, batchPrefix, pos, hi);
+    std::tie(posLo, posHi) = stl.peekRangeAt(b, l, batchPrefix, pPos);
 
     seek({posLo, genSegmentHigh(b, l, posLo)});
   }
@@ -458,11 +475,56 @@ public:
   Value posHi;
 };
 
+// A util base-iterator that delegates all methods to the wrapped iterator.
+class SimpleWrapIterator : public SparseIterator {
+public:
+  SimpleWrapIterator(std::unique_ptr<SparseIterator> &&wrap, IterKind kind,
+                     unsigned extraCursorVal = 0)
+      : SparseIterator(kind, *wrap, extraCursorVal), wrap(std::move(wrap)) {}
+
+  SmallVector<Type> getCursorValTypes(OpBuilder &b) const override {
+    return wrap->getCursorValTypes(b);
+  }
+  bool isBatchIterator() const override { return wrap->isBatchIterator(); }
+  bool randomAccessible() const override { return wrap->randomAccessible(); };
+  bool iteratableByFor() const override { return wrap->iteratableByFor(); };
+
+  SmallVector<Value> serialize() const override { return wrap->serialize(); };
+  void deserialize(ValueRange vs) override { wrap->deserialize(vs); };
+  ValueRange getCurPosition() const override { return wrap->getCurPosition(); }
+  void genInitImpl(OpBuilder &b, Location l,
+                   const SparseIterator *parent) override {
+    wrap->genInit(b, l, parent);
+  }
+  Value genNotEndImpl(OpBuilder &b, Location l) override {
+    return wrap->genNotEndImpl(b, l);
+  }
+  ValueRange forwardImpl(OpBuilder &b, Location l) override {
+    return wrap->forward(b, l);
+  };
+  Value upperBound(OpBuilder &b, Location l) const override {
+    return wrap->upperBound(b, l);
+  };
+
+  Value derefImpl(OpBuilder &b, Location l) override {
+    return wrap->derefImpl(b, l);
+  }
+
+  void locateImpl(OpBuilder &b, Location l, Value crd) override {
+    return wrap->locate(b, l, crd);
+  }
+
+  SparseIterator &getWrappedIterator() const { return *wrap; }
+
+protected:
+  std::unique_ptr<SparseIterator> wrap;
+};
+
 //
 // A filter iterator wrapped from another iterator. The filter iterator update
 // the wrapped iterator *in-place*.
 //
-class FilterIterator : public SparseIterator {
+class FilterIterator : public SimpleWrapIterator {
   // Coorindate translation between crd loaded from the wrap iterator and the
   // filter iterator.
   Value fromWrapCrd(OpBuilder &b, Location l, Value wrapCrd) const {
@@ -483,8 +545,8 @@ public:
   // when crd always < size.
   FilterIterator(std::unique_ptr<SparseIterator> &&wrap, Value offset,
                  Value stride, Value size)
-      : SparseIterator(IterKind::kFilter, *wrap), offset(offset),
-        stride(stride), size(size), wrap(std::move(wrap)) {}
+      : SimpleWrapIterator(std::move(wrap), IterKind::kFilter), offset(offset),
+        stride(stride), size(size) {}
 
   // For LLVM-style RTTI.
   static bool classof(const SparseIterator *from) {
@@ -494,19 +556,10 @@ public:
   std::string getDebugInterfacePrefix() const override {
     return std::string("filter<") + wrap->getDebugInterfacePrefix() + ">";
   }
-  SmallVector<Type> getCursorValTypes(OpBuilder &b) const override {
-    return wrap->getCursorValTypes(b);
-  }
 
-  bool isBatchIterator() const override { return wrap->isBatchIterator(); }
-  bool randomAccessible() const override { return wrap->randomAccessible(); };
   bool iteratableByFor() const override { return randomAccessible(); };
   Value upperBound(OpBuilder &b, Location l) const override { return size; };
 
-  SmallVector<Value> serialize() const override { return wrap->serialize(); };
-  void deserialize(ValueRange vs) override { wrap->deserialize(vs); };
-  ValuePair getCurPosition() const override { return wrap->getCurPosition(); }
-
   void genInitImpl(OpBuilder &b, Location l,
                    const SparseIterator *parent) override {
     wrap->genInit(b, l, parent);
@@ -537,7 +590,74 @@ public:
   ValueRange forwardImpl(OpBuilder &b, Location l) override;
 
   Value offset, stride, size;
-  std::unique_ptr<SparseIterator> wrap;
+};
+
+//
+// A pad iterator wrapped from another iterator. The pad iterator updates
+// the wrapped iterator *in-place*.
+//
+class PadIterator : public SimpleWrapIterator {
+
+public:
+  PadIterator(std::unique_ptr<SparseIterator> &&wrap, Value padLow,
+              Value padHigh)
+      : SimpleWrapIterator(std::move(wrap), IterKind::kPad,
+                           wrap->randomAccessible() ? 1 : 0),
+        padLow(padLow), padHigh(padHigh) {}
+
+  // For LLVM-style RTTI.
+  static bool classof(const SparseIterator *from) {
+    return from->kind == IterKind::kPad;
+  }
+
+  std::string getDebugInterfacePrefix() const override {
+    return std::string("pad<") + wrap->getDebugInterfacePrefix() + ">";
+  }
+
+  // Returns a pair of values for *upper*, *lower* bound respectively.
+  ValuePair genForCond(OpBuilder &b, Location l) override {
+    if (randomAccessible())
+      return {getCrd(), upperBound(b, l)};
+    return wrap->genForCond(b, l);
+  }
+
+  // For padded dense iterator, we append a `inPadZone: bool` in addition to
+  // values used by the wrapped iterator.
+  ValueRange getCurPosition() const override { return getCursor(); }
+
+  SmallVector<Type> getCursorValTypes(OpBuilder &b) const override {
+    SmallVector<Type> ret = wrap->getCursorValTypes(b);
+    // Need an extra boolean value `inPadZone` for padded dense iterator.
+    if (randomAccessible())
+      ret.push_back(b.getI1Type());
+
+    return ret;
+  }
+
+  // The upper bound after padding becomes `size + padLow + padHigh`.
+  Value upperBound(OpBuilder &b, Location l) const override {
+    return ADDI(ADDI(wrap->upperBound(b, l), padLow), padHigh);
+  };
+
+  // The pad_coord = coord + pad_lo
+  Value derefImpl(OpBuilder &b, Location l) override {
+    updateCrd(ADDI(wrap->deref(b, l), padLow));
+    return getCrd();
+  }
+
+  void locateImpl(OpBuilder &b, Location l, Value crd) override {
+    assert(randomAccessible());
+    wrap->locate(b, l, SUBI(crd, padLow));
+
+    // inPadZone = crd < padLow || crd >= size + padLow.
+    Value inPadLow = CMPI(ult, crd, padLow);
+    Value inPadHigh = CMPI(uge, crd, ADDI(wrap->upperBound(b, l), padLow));
+    getMutCursorVals().back() = ORI(inPadLow, inPadHigh);
+
+    updateCrd(crd);
+  }
+
+  Value padLow, padHigh;
 };
 
 class NonEmptySubSectIterator : public SparseIterator {
@@ -756,9 +876,8 @@ public:
   Value upperBound(OpBuilder &b, Location l) const override {
     return subSect.subSectSz;
   }
-  std::pair<Value, Value> getCurPosition() const override {
-    return wrap->getCurPosition();
-  };
+
+  ValueRange getCurPosition() const override { return wrap->getCurPosition(); };
 
   Value getNxLvlTupleId(OpBuilder &b, Location l) const {
     if (randomAccessible()) {
@@ -1150,6 +1269,33 @@ ValueRange NonEmptySubSectIterator::inflateSubSectTree(
   return p->inflateSubSectTree(b, l, reduc, visitDenseSubSect);
 }
 
+void TrivialIterator::genInitImpl(OpBuilder &b, Location l,
+                                  const SparseIterator *parent) {
+
+  if (isBatchIterator() && batchCrds.size() <= stl.lvl)
+    batchCrds.resize(stl.lvl + 1, nullptr);
+
+  Value c0 = C_IDX(0);
+  ValueRange pPos = c0;
+  Value inPadZone = nullptr;
+  // If the parent iterator is a batch iterator, we also start from 0 (but
+  // on a different batch).
+  if (parent && !parent->isBatchIterator()) {
+    pPos = parent->getCurPosition();
+    if (llvm::isa<PadIterator>(parent) && parent->randomAccessible()) {
+      // A padded dense iterator create "sparse" padded zone, which need to be
+      // handled specially.
+      inPadZone = pPos.back();
+      pPos = pPos.drop_back();
+    }
+  }
+
+  ValueRange batchPrefix = parent ? parent->getBatchCrds() : ValueRange{};
+  std::tie(posLo, posHi) = stl.peekRangeAt(b, l, batchPrefix, pPos, inPadZone);
+  // Seek to the lowest position.
+  seek(posLo);
+}
+
 void NonEmptySubSectIterator::genInitImpl(OpBuilder &b, Location l,
                                           const SparseIterator *) {
   Value c0 = C_IDX(0);
@@ -1405,10 +1551,19 @@ sparse_tensor::makeSlicedLevelIterator(std::unique_ptr<SparseIterator> &&sit,
   return ret;
 }
 
+std::unique_ptr<SparseIterator>
+sparse_tensor::makePaddedIterator(std::unique_ptr<SparseIterator> &&sit,
+                                  Value padLow, Value padHigh,
+                                  SparseEmitStrategy strategy) {
+  auto ret = std::make_unique<PadIterator>(std::move(sit), padLow, padHigh);
+  ret->setSparseEmitStrategy(strategy);
+  return ret;
+}
+
 static const SparseIterator *tryUnwrapFilter(const SparseIterator *it) {
   auto *filter = llvm::dyn_cast_or_null<FilterIterator>(it);
   if (filter)
-    return filter->wrap.get();
+    return &filter->getWrappedIterator();
   return it;
 }
 
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
index 9d69a2335559..120a806536f1 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
@@ -36,8 +36,9 @@ public:
                           Value iv) const = 0;
 
   /// Peeks the lower and upper bound to *fully* traverse the level with
-  /// the given position `p` that the immediate parent level is current at.
-  /// Returns a pair of values for *posLo* and *loopHi* respectively.
+  /// the given position `parentPos`, see SparseTensorIterator::getCurPostion(),
+  /// that the immediate parent level is current at. Returns a pair of values
+  /// for *posLo* and *loopHi* respectively.
   ///
   /// For a dense level, the *posLo* is the linearized position at beginning,
   /// while *loopHi* is the largest *coordinate*, it also implies that the
@@ -45,12 +46,9 @@ public:
   ///
   /// For a sparse level, [posLo, loopHi) specifies the range of index pointer
   /// to load coordinate from the coordinate buffer.
-  ///
-  /// `bound` is only used when the level is `non-unique` and deduplication is
-  /// required. It specifies the max upper bound of the non-unique segment.
-  virtual std::pair<Value, Value> peekRangeAt(OpBuilder &b, Location l,
-                                              ValueRange batchPrefix, Value p,
-                                              Value segHi = Value()) const = 0;
+  virtual std::pair<Value, Value>
+  peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
+              ValueRange parentPos, Value inPadZone = nullptr) const = 0;
 
   Level getLevel() const { return lvl; }
   LevelType getLT() const { return lt; }
@@ -78,6 +76,7 @@ enum class IterKind : uint8_t {
   kSubSect,
   kNonEmptySubSect,
   kFilter,
+  kPad,
 };
 
 /// Helper class that generates loop conditions, etc, to traverse a
@@ -199,18 +198,17 @@ public:
   }
   virtual Value genNotEndImpl(OpBuilder &b, Location l) = 0;
   virtual Value derefImpl(OpBuilder &b, Location l) = 0;
-  // Gets the current position and the optional *position high* (for
-  // non-unique iterators), the value is essentially the number of sparse
-  // coordinate that the iterator is current visiting. It should be able to
-  // uniquely identify the sparse range for the next level. See
-  // SparseTensorLevel::peekRangeAt();
+  // Gets the ValueRange that together specifies the current position of the
+  // iterator. For a unique level, the position can be a single index points to
+  // the current coordinate being visited. For a non-unique level, an extra
+  // index for the `segment high` is needed to to specifies the range of
+  // duplicated coordinates. The ValueRange should be able to uniquely identify
+  // the sparse range for the next level. See SparseTensorLevel::peekRangeAt();
   //
   // Not every type of iterator supports the operation, e.g., non-empty
   // subsection iterator does not because it represent a range of coordinates
   // instead of just one.
-  virtual std::pair<Value, Value> getCurPosition() const {
-    llvm_unreachable("unsupported");
-  };
+  virtual ValueRange getCurPosition() const { return getCursor(); };
 
   // Returns a pair of values for *upper*, *lower* bound respectively.
   virtual std::pair<Value, Value> genForCond(OpBuilder &b, Location l) {
@@ -284,27 +282,34 @@ private:
 };
 
 /// Helper function to create a TensorLevel object from given `tensor`.
-std::unique_ptr<SparseTensorLevel> makeSparseTensorLevel(OpBuilder &builder,
-                                                         Location loc, Value t,
-                                                         unsigned tid, Level l);
+std::unique_ptr<SparseTensorLevel> makeSparseTensorLevel(OpBuilder &b,
+                                                         Location l, Value t,
+                                                         unsigned tid,
+                                                         Level lvl);
 
 /// Helper function to create a simple SparseIterator object that iterate over
 /// the SparseTensorLevel.
 std::unique_ptr<SparseIterator> makeSimpleIterator(const SparseTensorLevel &stl,
                                                    SparseEmitStrategy strategy);
 
-/// Helper function to create a synthetic SparseIterator object that iterate
+/// Helper function to create a synthetic SparseIterator object that iterates
 /// over a dense space specified by [0,`sz`).
 std::pair<std::unique_ptr<SparseTensorLevel>, std::unique_ptr<SparseIterator>>
 makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl,
                         SparseEmitStrategy strategy);
 
-/// Helper function to create a SparseIterator object that iterate over a
+/// Helper function to create a SparseIterator object that iterates over a
 /// sliced space, the orignal space (before slicing) is traversed by `sit`.
 std::unique_ptr<SparseIterator>
 makeSlicedLevelIterator(std::unique_ptr<SparseIterator> &&sit, Value offset,
                         Value stride, Value size, SparseEmitStrategy strategy);
 
+/// Helper function to create a SparseIterator object that iterates over a
+/// padded sparse level (the padded value must be zero).
+std::unique_ptr<SparseIterator>
+makePaddedIterator(std::unique_ptr<SparseIterator> &&sit, Value padLow,
+                   Value padHigh, SparseEmitStrategy strategy);
+
 /// Helper function to create a SparseIterator object that iterate over the
 /// non-empty subsections set.
 std::unique_ptr<SparseIterator> makeNonEmptySubSectIterator(
@@ -312,7 +317,7 @@ std::unique_ptr<SparseIterator> makeNonEmptySubSectIterator(
     std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride,
     SparseEmitStrategy strategy);
 
-/// Helper function to create a SparseIterator object that iterate over a
+/// Helper function to create a SparseIterator object that iterates over a
 /// non-empty subsection created by NonEmptySubSectIterator.
 std::unique_ptr<SparseIterator> makeTraverseSubSectIterator(
     OpBuilder &b, Location l, const SparseIterator &subsectIter,
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index 3ff41ab22fbc..4c65045084dc 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -1609,6 +1609,9 @@ OpFoldResult ReshapeOp::fold(FoldAdaptor adaptor) {
             cst.has_value() && cst.value() == static_cast<int64_t>(id);
         continue;
       }
+
+      dynamicNoop = false;
+      break;
     }
 
     if (dynamicNoop)
@@ -1641,6 +1644,44 @@ int64_t ExpandShapeOp::getCorrespondingSourceDim(int64_t resultDim) {
   llvm_unreachable("could not find reassociation group");
 }
 
+FailureOr<SmallVector<OpFoldResult>>
+ExpandShapeOp::inferOutputShape(OpBuilder &b, Location loc,
+                                RankedTensorType expandedType,
+                                ArrayRef<ReassociationIndices> reassociation,
+                                ArrayRef<OpFoldResult> inputShape) {
+  std::optional<SmallVector<OpFoldResult>> outputShape =
+      inferExpandShapeOutputShape(b, loc, expandedType, reassociation,
+                                  inputShape);
+  if (!outputShape)
+    return failure();
+  return *outputShape;
+}
+
+void ExpandShapeOp::build(OpBuilder &builder, OperationState &result,
+                          Type resultType, Value src,
+                          ArrayRef<ReassociationIndices> reassociation,
+                          ArrayRef<OpFoldResult> outputShape) {
+  auto [staticOutputShape, dynamicOutputShape] =
+      decomposeMixedValues(SmallVector<OpFoldResult>(outputShape));
+  build(builder, result, cast<RankedTensorType>(resultType), src,
+        getReassociationIndicesAttribute(builder, reassociation),
+        dynamicOutputShape, staticOutputShape);
+}
+
+void ExpandShapeOp::build(OpBuilder &builder, OperationState &result,
+                          Type resultType, Value src,
+                          ArrayRef<ReassociationIndices> reassociation) {
+  SmallVector<OpFoldResult> inputShape =
+      getMixedSizes(builder, result.location, src);
+  auto tensorResultTy = cast<RankedTensorType>(resultType);
+  FailureOr<SmallVector<OpFoldResult>> outputShape = inferOutputShape(
+      builder, result.location, tensorResultTy, reassociation, inputShape);
+  // Failure of this assertion usually indicates presence of multiple
+  // dynamic dimensions in the same reassociation group.
+  assert(succeeded(outputShape) && "unable to infer output shape");
+  build(builder, result, tensorResultTy, src, reassociation, *outputShape);
+}
+
 SmallVector<AffineMap, 4> CollapseShapeOp::getReassociationMaps() {
   return getSymbolLessAffineMaps(getReassociationExprs());
 }
@@ -1724,7 +1765,24 @@ static LogicalResult verifyTensorReshapeOp(TensorReshapeOp op,
 }
 
 LogicalResult ExpandShapeOp::verify() {
-  return verifyTensorReshapeOp(*this, getResultType(), getSrcType());
+  auto srcType = getSrcType();
+  auto resultType = getResultType();
+
+  if ((int64_t)getStaticOutputShape().size() != resultType.getRank())
+    return emitOpError("expected number of static shape dims to be equal to "
+                       "the output rank (")
+           << resultType.getRank() << ") but found "
+           << getStaticOutputShape().size() << " inputs instead";
+
+  if ((int64_t)getOutputShape().size() !=
+      llvm::count(getStaticOutputShape(), ShapedType::kDynamic))
+    return emitOpError("mismatch in dynamic dims in output_shape and "
+                       "static_output_shape: static_output_shape has ")
+           << llvm::count(getStaticOutputShape(), ShapedType::kDynamic)
+           << " dynamic dims while output_shape has " << getOutputShape().size()
+           << " values";
+
+  return verifyTensorReshapeOp(*this, resultType, srcType);
 }
 
 LogicalResult CollapseShapeOp::verify() {
@@ -1908,23 +1966,25 @@ struct FoldDimOfCollapseShape : public OpRewritePattern<DimOp> {
 
 void ExpandShapeOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                 MLIRContext *context) {
-  results.add<ComposeReassociativeReshapeOps<ExpandShapeOp>,
-              ComposeExpandOfCollapseOp<ExpandShapeOp, CollapseShapeOp>,
-              FoldReshapeWithConstant<ExpandShapeOp>,
-              FoldReshapeWithSplat<ExpandShapeOp>,
-              FoldReshapeWithFromElements<ExpandShapeOp>, FoldDimOfExpandShape,
-              FoldDimOfCollapseShape>(context);
+  results.add<
+      ComposeReassociativeReshapeOps<ExpandShapeOp, ReshapeOpKind::kExpand>,
+      ComposeExpandOfCollapseOp<ExpandShapeOp, CollapseShapeOp>,
+      FoldReshapeWithConstant<ExpandShapeOp>,
+      FoldReshapeWithSplat<ExpandShapeOp>,
+      FoldReshapeWithFromElements<ExpandShapeOp>, FoldDimOfExpandShape,
+      FoldDimOfCollapseShape>(context);
 }
 
 void CollapseShapeOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                   MLIRContext *context) {
-  results
-      .add<ComposeReassociativeReshapeOps<CollapseShapeOp>,
-           ComposeCollapseOfExpandOp<CollapseShapeOp, ExpandShapeOp, CastOp>,
-           FoldReshapeWithConstant<CollapseShapeOp>,
-           FoldReshapeWithSplat<CollapseShapeOp>,
-           FoldReshapeWithFromElements<CollapseShapeOp>, FoldCollapseOfCastOp>(
-          context);
+  results.add<
+      ComposeReassociativeReshapeOps<CollapseShapeOp, ReshapeOpKind::kCollapse>,
+      ComposeCollapseOfExpandOp<CollapseShapeOp, ExpandShapeOp, CastOp,
+                                tensor::DimOp, RankedTensorType>,
+      FoldReshapeWithConstant<CollapseShapeOp>,
+      FoldReshapeWithSplat<CollapseShapeOp>,
+      FoldReshapeWithFromElements<CollapseShapeOp>, FoldCollapseOfCastOp>(
+      context);
 }
 
 OpFoldResult ExpandShapeOp::fold(FoldAdaptor adaptor) {
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
index 58ea4cc4da3c..d078a575f40d 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -338,6 +338,9 @@ struct ExpandShapeOpInterface
 
     // Memref result type is inferred by the builder based on reassociation
     // indices and result shape.
+    // TODO: Instead of inferring the output shape argument of
+    // memref.expand_shape op, use output_shape argument of tensor.expand_shape
+    // op.
     replaceOpWithNewBufferizedOp<memref::ExpandShapeOp>(
         rewriter, op, tensorResultType.getShape(), *buffer,
         expandShapeOp.getReassociationIndices());
diff --git a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
index 666ac56c6cd5..ebcb34e9ef02 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
@@ -52,12 +52,16 @@ static LogicalResult isPackOn1D(RewriterBase &rewriter, Operation *op,
 struct SimplifyPackToExpandShape : public OpRewritePattern<PackOp> {
   using OpRewritePattern<PackOp>::OpRewritePattern;
 
-  Value insertExpand(RewriterBase &rewriter, Location loc, Value operand,
-                     Type newOperandType, ArrayAttr reassociation) const {
+  FailureOr<Value>
+  insertExpand(RewriterBase &rewriter, Location loc, Value operand,
+               Type newOperandType,
+               ArrayRef<ReassociationIndices> reassociation) const {
     if (operand.getType() == newOperandType)
       return operand;
-    return rewriter.create<tensor::ExpandShapeOp>(loc, newOperandType, operand,
-                                                  reassociation);
+    return rewriter
+        .create<tensor::ExpandShapeOp>(loc, newOperandType, operand,
+                                       reassociation)
+        .getResult();
   }
 
   /// Returns success() if it is only packing on the innermost dimension.
@@ -96,10 +100,14 @@ struct SimplifyPackToExpandShape : public OpRewritePattern<PackOp> {
         getReassociationIndicesForReshape(sourceType, destType);
     if (!reassociation)
       return failure();
-    Value expanded = insertExpand(
-        rewriter, packOp.getLoc(), packOp.getSource(), destType,
-        getReassociationIndicesAttribute(rewriter, *reassociation));
-    rewriter.replaceOp(packOp, expanded);
+    FailureOr<Value> expanded =
+        insertExpand(rewriter, packOp.getLoc(), packOp.getSource(), destType,
+                     *reassociation);
+    if (failed(expanded)) {
+      return rewriter.notifyMatchFailure(
+          packOp, "unable to expand source of tensor.pack");
+    }
+    rewriter.replaceOp(packOp, *expanded);
     return success();
   }
 };
@@ -366,13 +374,10 @@ struct FoldProducerUnPackWithConsumerLinalgTransposeOp
     for (auto dim : innerDimsPos)
       newInnerDimsPosVec.push_back(transposePermutation[dim]);
 
-    Value output = unPackOp.createDestinationTensor(
-        rewriter, transposeOp.getLoc(), unPackOp.getSource(),
-        unPackOp.getMixedTiles(), newInnerDimsPosVec, newOuterDimsPermVec);
-
+    // Reuse the destination of the transpose op.
     rewriter.replaceOpWithNewOp<UnPackOp>(
-        transposeOp, unPackOp.getSource(), output, newInnerDimsPosVec,
-        unPackOp.getMixedTiles(), newOuterDimsPermVec);
+        transposeOp, unPackOp.getSource(), transposeOp.getDpsInits()[0],
+        newInnerDimsPosVec, unPackOp.getMixedTiles(), newOuterDimsPermVec);
 
     return success();
   }
diff --git a/mlir/lib/Dialect/Tosa/IR/ShardingInterfaceImpl.cpp b/mlir/lib/Dialect/Tosa/IR/ShardingInterfaceImpl.cpp
index 137156fe1a73..ffbb707344b8 100644
--- a/mlir/lib/Dialect/Tosa/IR/ShardingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/ShardingInterfaceImpl.cpp
@@ -79,7 +79,7 @@ void mlir::tosa::registerShardingInterfaceExternalModels(
   registry.addExtension(+[](MLIRContext *ctx, TosaDialect *dialect) {
     registerElemwiseAll<
         ClampOp, SigmoidOp, TanhOp, AddOp, ArithmeticRightShiftOp, BitwiseAndOp,
-        BitwiseOrOp, BitwiseXorOp, DivOp, LogicalAndOp, LogicalLeftShiftOp,
+        BitwiseOrOp, BitwiseXorOp, IntDivOp, LogicalAndOp, LogicalLeftShiftOp,
         LogicalRightShiftOp, LogicalOrOp, LogicalXorOp, MaximumOp, MinimumOp,
         MulOp, PowOp, SubOp, AbsOp, BitwiseNotOp, CeilOp, ClzOp, ExpOp, FloorOp,
         LogOp, LogicalNotOp, NegateOp, ReciprocalOp, RsqrtOp, SelectOp, EqualOp,
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index c139d5f60024..e4e5fe3d1db3 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -520,7 +520,7 @@ OpFoldResult ArgMaxOp::fold(FoldAdaptor adaptor) {
   return {};
 }
 
-OpFoldResult DivOp::fold(FoldAdaptor adaptor) {
+OpFoldResult IntDivOp::fold(FoldAdaptor adaptor) {
   auto lhsTy = llvm::dyn_cast<RankedTensorType>(getInput1().getType());
   auto rhsTy = llvm::dyn_cast<RankedTensorType>(getInput2().getType());
   auto resultTy = llvm::dyn_cast<RankedTensorType>(getType());
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index 10e6016a1ed4..288cbac7b7b8 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -139,7 +139,7 @@ void TosaDialect::initialize() {
   addInterfaces<TosaDialectBytecodeInterface, TosaInlinerInterface>();
   declarePromisedInterfaces<
       mesh::ShardingInterface, ClampOp, SigmoidOp, TanhOp, AddOp,
-      ArithmeticRightShiftOp, BitwiseAndOp, BitwiseOrOp, BitwiseXorOp, DivOp,
+      ArithmeticRightShiftOp, BitwiseAndOp, BitwiseOrOp, BitwiseXorOp, IntDivOp,
       LogicalAndOp, LogicalLeftShiftOp, LogicalRightShiftOp, LogicalOrOp,
       LogicalXorOp, MaximumOp, MinimumOp, MulOp, PowOp, SubOp, AbsOp,
       BitwiseNotOp, CeilOp, ClzOp, ExpOp, FloorOp, LogOp, LogicalNotOp,
@@ -220,7 +220,8 @@ static bool hasZeroDimension(ShapedType shapedType) {
   return false;
 }
 
-template <typename T> static LogicalResult verifyConvOp(T op) {
+template <typename T>
+static LogicalResult verifyConvOp(T op) {
   // All TOSA conv ops have an input() and weight().
   auto inputType = llvm::dyn_cast<RankedTensorType>(op.getInput().getType());
   auto weightType = llvm::dyn_cast<RankedTensorType>(op.getWeight().getType());
@@ -962,7 +963,7 @@ mlir::LogicalResult tosa::ReshapeOp::verify() {
     return emitOpError() << "tensor has a dimension with size zero. Each "
                             "dimension of a tensor must have size >= 1";
 
-  if ((int64_t) getNewShape().size() != outputType.getRank())
+  if ((int64_t)getNewShape().size() != outputType.getRank())
     return emitOpError() << "new shape does not match result rank";
 
   for (auto [newShapeDim, outputShapeDim] :
@@ -1119,6 +1120,32 @@ LogicalResult tosa::TransposeOp::verify() {
   return success();
 }
 
+LogicalResult TransposeOp::reifyResultShapes(
+    OpBuilder &builder, ReifiedRankedShapedTypeDims &reifiedReturnShapes) {
+
+  SmallVector<int64_t> transposePerms;
+  if (getConstantPerms(transposePerms).failed())
+    return failure();
+
+  Value input = getInput1();
+  auto inputType = cast<TensorType>(input.getType());
+
+  SmallVector<OpFoldResult> returnedDims(inputType.getRank());
+  for (auto dim : transposePerms) {
+    int64_t dimInInput = transposePerms[dim];
+    if (inputType.isDynamicDim(dimInInput))
+      returnedDims[dim] =
+          builder.create<tensor::DimOp>(getLoc(), input, dimInInput)
+              .getResult();
+    else
+      returnedDims[dim] =
+          builder.getIndexAttr(inputType.getDimSize(dimInInput));
+  }
+
+  reifiedReturnShapes.emplace_back(std::move(returnedDims));
+  return success();
+}
+
 LogicalResult tosa::GatherOp::inferReturnTypeComponents(
     MLIRContext *context, ::std::optional<Location> location,
     GatherOp::Adaptor adaptor,
@@ -1354,12 +1381,12 @@ NARY_SHAPE_INFER(tosa::CeilOp)
 NARY_SHAPE_INFER(tosa::ClampOp)
 NARY_SHAPE_INFER(tosa::ClzOp)
 NARY_SHAPE_INFER(tosa::CosOp)
-NARY_SHAPE_INFER(tosa::DivOp)
 NARY_SHAPE_INFER(tosa::ExpOp)
 NARY_SHAPE_INFER(tosa::FloorOp)
 NARY_SHAPE_INFER(tosa::GreaterEqualOp)
 NARY_SHAPE_INFER(tosa::GreaterOp)
 NARY_SHAPE_INFER(tosa::IdentityOp)
+NARY_SHAPE_INFER(tosa::IntDivOp)
 NARY_SHAPE_INFER(tosa::LogOp)
 NARY_SHAPE_INFER(tosa::LogicalAndOp)
 NARY_SHAPE_INFER(tosa::LogicalLeftShiftOp)
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp
index 18bc7d6aa9ee..9c6ee4c62eee 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp
@@ -232,7 +232,7 @@ public:
     patterns.add<ConvertTosaOp<tosa::AddOp>>(ctx);
     patterns.add<ConvertTosaOp<tosa::SubOp>>(ctx);
     patterns.add<ConvertTosaOp<tosa::MulOp>>(ctx);
-    patterns.add<ConvertTosaOp<tosa::DivOp>>(ctx);
+    patterns.add<ConvertTosaOp<tosa::IntDivOp>>(ctx);
     patterns.add<ConvertTosaOp<tosa::MaximumOp>>(ctx);
     patterns.add<ConvertTosaOp<tosa::MinimumOp>>(ctx);
     patterns.add<ConvertTosaOp<tosa::EqualOp>>(ctx);
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
index c99f62d5ae11..539501082fd3 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
@@ -200,7 +200,7 @@ private:
     CHECK_RANKS_FOR(BitwiseAnd);
     CHECK_RANKS_FOR(BitwiseOr);
     CHECK_RANKS_FOR(BitwiseXor);
-    CHECK_RANKS_FOR(Div);
+    CHECK_RANKS_FOR(IntDiv);
     CHECK_RANKS_FOR(LogicalAnd);
     CHECK_RANKS_FOR(LogicalLeftShift);
     CHECK_RANKS_FOR(LogicalRightShift);
diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
index 41c7af4593c7..e4f387d40ced 100644
--- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
+++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
@@ -168,7 +168,7 @@ ArrayAttr mlir::getReassociationIndicesAttribute(
 }
 
 SmallVector<ReassociationIndices, 2> mlir::convertReassociationMapsToIndices(
-    OpBuilder &b, ArrayRef<ReassociationExprs> reassociationExprs) {
+    ArrayRef<ReassociationExprs> reassociationExprs) {
   SmallVector<ReassociationIndices, 2> reassociationIndices;
   for (const auto &exprs : reassociationExprs) {
     ReassociationIndices indices;
@@ -230,24 +230,17 @@ LogicalResult mlir::reshapeLikeShapesAreCompatible(
     ArrayRef<ReassociationIndices> reassociationMaps, bool isExpandingReshape) {
   unsigned expandedDimStart = 0;
   for (const auto &map : llvm::enumerate(reassociationMaps)) {
-    std::optional<int64_t> dynamicShape;
+    bool foundDynamicShape = false;
     int64_t linearizedStaticShape = 1;
+
     for (const auto &dim : llvm::enumerate(
              expandedShape.slice(expandedDimStart, map.value().size()))) {
-      if (ShapedType::isDynamic(dim.value())) {
-        if (isExpandingReshape && dynamicShape) {
-          return emitError("invalid to have a single dimension (" +
-                           Twine(map.index()) +
-                           ") expanded into multiple dynamic dims (" +
-                           Twine(expandedDimStart + dynamicShape.value()) +
-                           "," + Twine(expandedDimStart + dim.index()) + ")");
-        }
-        dynamicShape = dim.index();
-      } else {
+      if (ShapedType::isDynamic(dim.value()))
+        foundDynamicShape = true;
+      else
         linearizedStaticShape *= dim.value();
-      }
     }
-    if (dynamicShape) {
+    if (foundDynamicShape) {
       if (!ShapedType::isDynamic(collapsedShape[map.index()])) {
         return emitError(
             "expected dimension " + Twine(map.index()) +
diff --git a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
index 1e8197e10944..74a53709592d 100644
--- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
+++ b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
@@ -180,9 +180,8 @@ SmallVector<OpFoldResult> getMixedValues(ArrayRef<int64_t> staticValues,
 
 /// Decompose a vector of mixed static or dynamic values into the corresponding
 /// pair of arrays. This is the inverse function of `getMixedValues`.
-std::pair<ArrayAttr, SmallVector<Value>>
-decomposeMixedValues(Builder &b,
-                     const SmallVectorImpl<OpFoldResult> &mixedValues) {
+std::pair<SmallVector<int64_t>, SmallVector<Value>>
+decomposeMixedValues(const SmallVectorImpl<OpFoldResult> &mixedValues) {
   SmallVector<int64_t> staticValues;
   SmallVector<Value> dynamicValues;
   for (const auto &it : mixedValues) {
@@ -193,7 +192,7 @@ decomposeMixedValues(Builder &b,
       dynamicValues.push_back(it.get<Value>());
     }
   }
-  return {b.getI64ArrayAttr(staticValues), dynamicValues};
+  return {staticValues, dynamicValues};
 }
 
 /// Helper to sort `values` according to matching `keys`.
diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorMultiReduction.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMultiReduction.cpp
index 2f21c50c6347..ac576ed0b4f0 100644
--- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorMultiReduction.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMultiReduction.cpp
@@ -437,8 +437,10 @@ struct OneDimMultiReductionToTwoDim
     auto loc = multiReductionOp.getLoc();
     auto srcVectorType = multiReductionOp.getSourceVectorType();
     auto srcShape = srcVectorType.getShape();
-    auto castedType = VectorType::get(ArrayRef<int64_t>{1, srcShape.back()},
-                                      srcVectorType.getElementType());
+    auto castedType = VectorType::get(
+        ArrayRef<int64_t>{1, srcShape.back()}, srcVectorType.getElementType(),
+        ArrayRef<bool>{false, srcVectorType.getScalableDims().back()});
+
     auto accType =
         VectorType::get(ArrayRef<int64_t>{1}, srcVectorType.getElementType());
     assert(!llvm::isa<VectorType>(multiReductionOp.getDestType()) &&
@@ -455,10 +457,11 @@ struct OneDimMultiReductionToTwoDim
         loc, accType, multiReductionOp.getAcc());
     Value castMask;
     if (maskableOp.isMasked()) {
-      auto maskType = llvm::cast<ShapedType>(mask.getType());
-      auto castMaskType =
-          VectorType::get(ArrayRef<int64_t>{1, maskType.getShape().back()},
-                          maskType.getElementType());
+      auto maskType = llvm::cast<VectorType>(mask.getType());
+      auto castMaskType = VectorType::get(
+          ArrayRef<int64_t>{1, maskType.getShape().back()},
+          maskType.getElementType(),
+          ArrayRef<bool>{false, maskType.getScalableDims().back()});
       castMask = rewriter.create<vector::BroadcastOp>(loc, castMaskType, mask);
     }
 
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index d24721f3defa..a301b919dc52 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -880,6 +880,38 @@ static Value rewriteI4ToI8SignedExt(PatternRewriter &rewriter, Location loc,
   return rewriter.create<vector::InterleaveOp>(loc, low, high);
 }
 
+/// Rewrite the i4 -> i8 unsigned extension into a sequence of shuffles and
+/// bitwise ops that take advantage of high-level information to avoid leaving
+/// LLVM to scramble with peephole optimizations.
+static Value rewriteI4ToI8UnsignedExt(PatternRewriter &rewriter, Location loc,
+                                      Value srcValue) {
+  VectorType srcVecType = cast<VectorType>(srcValue.getType());
+  assert(srcVecType.getElementType().isSignlessInteger(4) &&
+         "Expected i4 type");
+
+  // 1. Generate a bitcast vector<Xxi4> -> vector<X/2xi8>.
+  SmallVector<int64_t> i8VecShape = llvm::to_vector(srcVecType.getShape());
+  constexpr int64_t i4Toi8BitwidthFactor = 2;
+  i8VecShape.back() = i8VecShape.back() / i4Toi8BitwidthFactor;
+  auto i8VecType = VectorType::get(i8VecShape, rewriter.getI8Type());
+  Value i8Vector = rewriter.create<vector::BitCastOp>(loc, i8VecType, srcValue);
+
+  // 2 Extend the i4 elements using shifts & masking. Low i4 elements of each
+  //  byte are placed in one vector and the high i4 elements in another vector.
+  constexpr uint8_t lowBitsMask = 15; // Equivalent to [00001111] bit mask
+  auto lowBitsMaskValues = rewriter.create<arith::ConstantOp>(
+      loc, DenseElementsAttr::get(i8VecType, lowBitsMask));
+  Value low = rewriter.create<arith::AndIOp>(loc, i8VecType, i8Vector,
+                                             lowBitsMaskValues);
+  constexpr int8_t highBitsToShift = 4;
+  auto highShiftValues = rewriter.create<arith::ConstantOp>(
+      loc, DenseElementsAttr::get(i8VecType, highBitsToShift));
+  Value high = rewriter.create<arith::ShRUIOp>(loc, i8Vector, highShiftValues);
+
+  // 3. Interleave low and high i8 elements.
+  return rewriter.create<vector::InterleaveOp>(loc, low, high);
+}
+
 /// Rewrite the i8 -> i4 truncation into a sequence of shuffles and bitwise ops
 /// that take advantage of high-level information to avoid leaving LLVM to
 /// scramble with peephole optimizations.
@@ -1048,9 +1080,10 @@ struct RewriteExtOfBitCast : OpRewritePattern<ExtOpType> {
 
 /// Rewrite the i4 -> i8 part of any conversion into a sequence of shuffles and
 /// bitwise ops that take advantage of high-level information to avoid leaving
-/// LLVM to scramble with peephole optimizations.
+/// LLVM to scramble with peephole optimizations. Templated to choose between
+/// signed and unsigned conversions.
 ///
-/// For example:
+/// For example (signed):
 ///    arith.extsi %in : vector<8xi4> to vector<8xi32>
 ///      is rewriten as
 ///        %0 = vector.bitcast %in : vector<8xi4> to vector<4xi8>
@@ -1069,16 +1102,25 @@ struct RewriteExtOfBitCast : OpRewritePattern<ExtOpType> {
 ///        %4 = vector.interleave %2, %3 : vector<4xi8>
 ///        %5 = arith.sitofp %4 : vector<8xi8> to vector<8xf32>
 ///
-template <typename ConversionOpType>
-struct RewriteAlignedSubByteIntSignedExt : OpRewritePattern<ConversionOpType> {
+/// Example (unsigned):
+///    arith.extui %in : vector<8xi4> to vector<8xi32>
+///      is rewritten as
+///        %0 = vector.bitcast %in : vector<8xi4> to vector<4xi8>
+///        %1 = arith.andi %0, 15 : vector<4xi8>
+///        %2 = arith.shrui %0, 4 : vector<4xi8>
+///        %3 = vector.interleave %1, %2 : vector<4xi8>
+///        %4 = arith.extui %3 : vector<8xi8> to vector<8xi32>
+///
+template <typename ConversionOpType, bool isSigned>
+struct RewriteAlignedSubByteIntExt : OpRewritePattern<ConversionOpType> {
   using OpRewritePattern<ConversionOpType>::OpRewritePattern;
 
   LogicalResult matchAndRewrite(ConversionOpType conversionOp,
                                 PatternRewriter &rewriter) const override {
     // Verify the preconditions.
     Value srcValue = conversionOp.getIn();
-    auto srcVecType = dyn_cast<VectorType>(srcValue.getType());
-    auto dstVecType = dyn_cast<VectorType>(conversionOp.getType());
+    auto srcVecType = cast<VectorType>(srcValue.getType());
+    auto dstVecType = cast<VectorType>(conversionOp.getType());
     if (failed(
             commonConversionPrecondition(rewriter, dstVecType, conversionOp)))
       return failure();
@@ -1089,8 +1131,14 @@ struct RewriteAlignedSubByteIntSignedExt : OpRewritePattern<ConversionOpType> {
       return failure();
 
     // Perform the rewrite.
-    Value subByteExt =
-        rewriteI4ToI8SignedExt(rewriter, conversionOp.getLoc(), srcValue);
+    Value subByteExt;
+    if (isSigned) {
+      subByteExt =
+          rewriteI4ToI8SignedExt(rewriter, conversionOp.getLoc(), srcValue);
+    } else {
+      subByteExt =
+          rewriteI4ToI8UnsignedExt(rewriter, conversionOp.getLoc(), srcValue);
+    }
 
     // Finalize the rewrite.
     rewriter.replaceOpWithNewOp<ConversionOpType>(
@@ -1229,10 +1277,12 @@ void vector::populateVectorNarrowTypeRewritePatterns(
 
   // Patterns for aligned cases. We set higher priority as they are expected to
   // generate better performance for aligned cases.
-  patterns.add<RewriteAlignedSubByteIntSignedExt<arith::ExtSIOp>,
-               RewriteAlignedSubByteIntSignedExt<arith::SIToFPOp>,
+  patterns.add<RewriteAlignedSubByteIntExt<arith::ExtSIOp, /*isSigned=*/true>,
+               RewriteAlignedSubByteIntExt<arith::SIToFPOp, /*isSigned=*/true>,
                RewriteAlignedSubByteIntTrunc>(patterns.getContext(),
                                               benefit.getBenefit() + 1);
+  patterns.add<RewriteAlignedSubByteIntExt<arith::ExtUIOp, /*isSigned=*/false>>(
+      patterns.getContext(), benefit.getBenefit() + 1);
 }
 
 void vector::populateVectorTransposeNarrowTypeRewritePatterns(
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
index 69999f0918c1..802a64b0805e 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
@@ -140,7 +140,7 @@ struct LinearizeVectorExtractStridedSlice final
                   ConversionPatternRewriter &rewriter) const override {
     Type dstType = getTypeConverter()->convertType(extractOp.getType());
     assert(!(extractOp.getVector().getType().isScalable() ||
-             dstType.cast<VectorType>().isScalable()) &&
+             cast<VectorType>(dstType).isScalable()) &&
            "scalable vectors are not supported.");
     if (!isLessThanTargetBitWidth(extractOp, targetVectorBitWidth))
       return rewriter.notifyMatchFailure(
@@ -172,7 +172,7 @@ struct LinearizeVectorExtractStridedSlice final
     // Get total number of extracted slices.
     int64_t nExtractedSlices = 1;
     for (Attribute size : sizes) {
-      nExtractedSlices *= size.cast<IntegerAttr>().getInt();
+      nExtractedSlices *= cast<IntegerAttr>(size).getInt();
     }
     // Compute the strides of the source vector considering first k dimensions.
     llvm::SmallVector<int64_t, 4> sourceStrides(kD, extractGranularitySize);
@@ -189,7 +189,7 @@ struct LinearizeVectorExtractStridedSlice final
     // Compute extractedStrides.
     for (int i = kD - 2; i >= 0; --i) {
       extractedStrides[i] =
-          extractedStrides[i + 1] * sizes[i + 1].cast<IntegerAttr>().getInt();
+          extractedStrides[i + 1] * cast<IntegerAttr>(sizes[i + 1]).getInt();
     }
     // Iterate over all extracted slices from 0 to nExtractedSlices - 1
     // and compute the multi-dimensional index and the corresponding linearized
@@ -207,7 +207,7 @@ struct LinearizeVectorExtractStridedSlice final
       int64_t linearizedIndex = 0;
       for (int64_t j = 0; j < kD; ++j) {
         linearizedIndex +=
-            (offsets[j].cast<IntegerAttr>().getInt() + multiDimIndex[j]) *
+            (cast<IntegerAttr>(offsets[j]).getInt() + multiDimIndex[j]) *
             sourceStrides[j];
       }
       // Fill the indices array form linearizedIndex to linearizedIndex +
@@ -254,7 +254,7 @@ struct LinearizeVectorShuffle final
     Type dstType = getTypeConverter()->convertType(shuffleOp.getType());
     assert(!(shuffleOp.getV1VectorType().isScalable() ||
              shuffleOp.getV2VectorType().isScalable() ||
-             dstType.cast<VectorType>().isScalable()) &&
+             cast<VectorType>(dstType).isScalable()) &&
            "scalable vectors are not supported.");
     if (!isLessThanTargetBitWidth(shuffleOp, targetVectorBitWidth))
       return rewriter.notifyMatchFailure(
@@ -324,7 +324,7 @@ struct LinearizeVectorExtract final
                   ConversionPatternRewriter &rewriter) const override {
     Type dstTy = getTypeConverter()->convertType(extractOp.getType());
     assert(!(extractOp.getVector().getType().isScalable() ||
-             dstTy.cast<VectorType>().isScalable()) &&
+             cast<VectorType>(dstTy).isScalable()) &&
            "scalable vectors are not supported.");
     if (!isLessThanTargetBitWidth(extractOp, targetVectorBitWidth))
       return rewriter.notifyMatchFailure(
@@ -405,9 +405,7 @@ void mlir::vector::populateVectorLinearizeShuffleLikeOpsPatterns(
       [=](vector::ShuffleOp shuffleOp) -> bool {
         return isLessThanTargetBitWidth(shuffleOp, targetBitWidth)
                    ? (typeConverter.isLegal(shuffleOp) &&
-                      shuffleOp.getResult()
-                              .getType()
-                              .cast<mlir::VectorType>()
+                      cast<mlir::VectorType>(shuffleOp.getResult().getType())
                               .getRank() == 1)
                    : true;
       });
diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
index fcaf1ec944b4..6727f3f46172 100644
--- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
@@ -345,7 +345,7 @@ Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
   int64_t readRank = readShape.size();
   auto zero = builder.create<arith::ConstantIndexOp>(loc, 0);
   SmallVector<bool> inBoundsVal(readRank, true);
-  if (!useInBoundsInsteadOfMasking) {
+  if (useInBoundsInsteadOfMasking) {
     // Update the inBounds attribute.
     for (unsigned i = 0; i < readRank; i++)
       inBoundsVal[i] = (sourceShape[i] == readShape[i]) &&
@@ -359,7 +359,7 @@ Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
       /*padding=*/padValue,
       /*inBounds=*/inBoundsVal);
 
-  if (llvm::equal(readShape, sourceShape) || !useInBoundsInsteadOfMasking)
+  if (llvm::equal(readShape, sourceShape) || useInBoundsInsteadOfMasking)
     return transferReadOp;
   SmallVector<OpFoldResult> mixedSourceDims =
       tensor::getMixedSizes(builder, loc, source);
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
index 617c89a84ee0..a0ce7f9706ce 100644
--- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -11,6 +11,7 @@ add_mlir_dialect_library(MLIRXeGPUDialect
   MLIRXeGPUEnumsIncGen
 
   LINK_LIBS PUBLIC
+  MLIRArithDialect
   MLIRDialectUtils
   MLIRIR
   MLIRViewLikeInterface
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 23c5749c2309..22959224d56c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -406,6 +406,28 @@ LogicalResult StoreScatterOp::verify() {
 
   return success();
 }
+//===----------------------------------------------------------------------===//
+// XeGPU_DpasOp
+//===----------------------------------------------------------------------===//
+LogicalResult DpasOp::verify() {
+  int64_t lhsRank = getLhsType().getRank();
+  int64_t rhsRank = getRhsType().getRank();
+
+  if (lhsRank != rhsRank || lhsRank != 3)
+    return emitOpError(
+        "lhs and rhs rank does not match for dpas op, or their rank is not 3.");
+
+  if (getAcc() && getAccType() != getResultType())
+    return emitOpError("Accumulator and Result for dpas op should have the "
+                       "same type (both shape and element type).");
+
+  auto lhsShape = getLhsType().getShape();
+  auto rhsShape = getRhsType().getShape();
+  if (lhsShape[1] != rhsShape[0] || lhsShape[2] != rhsShape[2])
+    return emitOpError("K-dimension or vnni-factor mismatch.");
+
+  return success();
+}
 
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/lib/Interfaces/RuntimeVerifiableOpInterface.cpp b/mlir/lib/Interfaces/RuntimeVerifiableOpInterface.cpp
index 9205d8d8c34a..561e8d338687 100644
--- a/mlir/lib/Interfaces/RuntimeVerifiableOpInterface.cpp
+++ b/mlir/lib/Interfaces/RuntimeVerifiableOpInterface.cpp
@@ -11,6 +11,27 @@
 namespace mlir {
 class Location;
 class OpBuilder;
+
+/// Generate an error message string for the given op and the specified error.
+std::string
+RuntimeVerifiableOpInterface::generateErrorMessage(Operation *op,
+                                                   const std::string &msg) {
+  std::string buffer;
+  llvm::raw_string_ostream stream(buffer);
+  OpPrintingFlags flags;
+  // We may generate a lot of error messages and so we need to ensure the
+  // printing is fast.
+  flags.elideLargeElementsAttrs();
+  flags.printGenericOpForm();
+  flags.skipRegions();
+  flags.useLocalScope();
+  stream << "ERROR: Runtime op verification failed\n";
+  op->print(stream, flags);
+  stream << "\n^ " << msg;
+  stream << "\nLocation: ";
+  op->getLoc().print(stream);
+  return stream.str();
+}
 } // namespace mlir
 
 /// Include the definitions of the interface.
diff --git a/mlir/lib/TableGen/CodeGenHelpers.cpp b/mlir/lib/TableGen/CodeGenHelpers.cpp
index d906de6b56af..59865146e20b 100644
--- a/mlir/lib/TableGen/CodeGenHelpers.cpp
+++ b/mlir/lib/TableGen/CodeGenHelpers.cpp
@@ -24,7 +24,8 @@ using namespace mlir::tblgen;
 
 /// Generate a unique label based on the current file name to prevent name
 /// collisions if multiple generated files are included at once.
-static std::string getUniqueOutputLabel(const llvm::RecordKeeper &records) {
+static std::string getUniqueOutputLabel(const llvm::RecordKeeper &records,
+                                        StringRef tag) {
   // Use the input file name when generating a unique name.
   std::string inputFilename = records.getInputFilename();
 
@@ -33,7 +34,7 @@ static std::string getUniqueOutputLabel(const llvm::RecordKeeper &records) {
   nameRef.consume_back(".td");
 
   // Sanitize any invalid characters.
-  std::string uniqueName;
+  std::string uniqueName(tag);
   for (char c : nameRef) {
     if (llvm::isAlnum(c) || c == '_')
       uniqueName.push_back(c);
@@ -44,15 +45,11 @@ static std::string getUniqueOutputLabel(const llvm::RecordKeeper &records) {
 }
 
 StaticVerifierFunctionEmitter::StaticVerifierFunctionEmitter(
-    raw_ostream &os, const llvm::RecordKeeper &records)
-    : os(os), uniqueOutputLabel(getUniqueOutputLabel(records)) {}
+    raw_ostream &os, const llvm::RecordKeeper &records, StringRef tag)
+    : os(os), uniqueOutputLabel(getUniqueOutputLabel(records, tag)) {}
 
 void StaticVerifierFunctionEmitter::emitOpConstraints(
-    ArrayRef<llvm::Record *> opDefs, bool emitDecl) {
-  collectOpConstraints(opDefs);
-  if (emitDecl)
-    return;
-
+    ArrayRef<llvm::Record *> opDefs) {
   NamespaceEmitter namespaceEmitter(os, Operator(*opDefs[0]).getCppNamespace());
   emitTypeConstraints();
   emitAttrConstraints();
diff --git a/mlir/lib/Tools/PDLL/AST/Types.cpp b/mlir/lib/Tools/PDLL/AST/Types.cpp
index fc4cb613dd22..081f85d69a2f 100644
--- a/mlir/lib/Tools/PDLL/AST/Types.cpp
+++ b/mlir/lib/Tools/PDLL/AST/Types.cpp
@@ -35,8 +35,8 @@ Type Type::refineWith(Type other) const {
     return *this;
 
   // Operation types are compatible if the operation names don't conflict.
-  if (auto opTy = dyn_cast<OperationType>()) {
-    auto otherOpTy = other.dyn_cast<ast::OperationType>();
+  if (auto opTy = mlir::dyn_cast<OperationType>(*this)) {
+    auto otherOpTy = mlir::dyn_cast<ast::OperationType>(other);
     if (!otherOpTy)
       return nullptr;
     if (!otherOpTy.getName())
@@ -105,25 +105,26 @@ Type RangeType::getElementType() const {
 // TypeRangeType
 
 bool TypeRangeType::classof(Type type) {
-  RangeType range = type.dyn_cast<RangeType>();
-  return range && range.getElementType().isa<TypeType>();
+  RangeType range = mlir::dyn_cast<RangeType>(type);
+  return range && mlir::isa<TypeType>(range.getElementType());
 }
 
 TypeRangeType TypeRangeType::get(Context &context) {
-  return RangeType::get(context, TypeType::get(context)).cast<TypeRangeType>();
+  return mlir::cast<TypeRangeType>(
+      RangeType::get(context, TypeType::get(context)));
 }
 
 //===----------------------------------------------------------------------===//
 // ValueRangeType
 
 bool ValueRangeType::classof(Type type) {
-  RangeType range = type.dyn_cast<RangeType>();
-  return range && range.getElementType().isa<ValueType>();
+  RangeType range = mlir::dyn_cast<RangeType>(type);
+  return range && mlir::isa<ValueType>(range.getElementType());
 }
 
 ValueRangeType ValueRangeType::get(Context &context) {
-  return RangeType::get(context, ValueType::get(context))
-      .cast<ValueRangeType>();
+  return mlir::cast<ValueRangeType>(
+      RangeType::get(context, ValueType::get(context)));
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Tools/PDLL/CodeGen/MLIRGen.cpp b/mlir/lib/Tools/PDLL/CodeGen/MLIRGen.cpp
index 16c3ccf0de26..964d94c9c0a4 100644
--- a/mlir/lib/Tools/PDLL/CodeGen/MLIRGen.cpp
+++ b/mlir/lib/Tools/PDLL/CodeGen/MLIRGen.cpp
@@ -337,13 +337,13 @@ Value CodeGen::genNonInitializerVar(const ast::VariableDecl *varDecl,
   // Generate a value based on the type of the variable.
   ast::Type type = varDecl->getType();
   Type mlirType = genType(type);
-  if (type.isa<ast::ValueType>())
+  if (isa<ast::ValueType>(type))
     return builder.create<pdl::OperandOp>(loc, mlirType, getTypeConstraint());
-  if (type.isa<ast::TypeType>())
+  if (isa<ast::TypeType>(type))
     return builder.create<pdl::TypeOp>(loc, mlirType, /*type=*/TypeAttr());
-  if (type.isa<ast::AttributeType>())
+  if (isa<ast::AttributeType>(type))
     return builder.create<pdl::AttributeOp>(loc, getTypeConstraint());
-  if (ast::OperationType opType = type.dyn_cast<ast::OperationType>()) {
+  if (ast::OperationType opType = dyn_cast<ast::OperationType>(type)) {
     Value operands = builder.create<pdl::OperandsOp>(
         loc, pdl::RangeType::get(builder.getType<pdl::ValueType>()),
         /*type=*/Value());
@@ -354,12 +354,12 @@ Value CodeGen::genNonInitializerVar(const ast::VariableDecl *varDecl,
         loc, opType.getName(), operands, std::nullopt, ValueRange(), results);
   }
 
-  if (ast::RangeType rangeTy = type.dyn_cast<ast::RangeType>()) {
+  if (ast::RangeType rangeTy = dyn_cast<ast::RangeType>(type)) {
     ast::Type eleTy = rangeTy.getElementType();
-    if (eleTy.isa<ast::ValueType>())
+    if (isa<ast::ValueType>(eleTy))
       return builder.create<pdl::OperandsOp>(loc, mlirType,
                                              getTypeConstraint());
-    if (eleTy.isa<ast::TypeType>())
+    if (isa<ast::TypeType>(eleTy))
       return builder.create<pdl::TypesOp>(loc, mlirType, /*types=*/ArrayAttr());
   }
 
@@ -440,7 +440,7 @@ Value CodeGen::genExprImpl(const ast::MemberAccessExpr *expr) {
   ast::Type parentType = expr->getParentExpr()->getType();
 
   // Handle operation based member access.
-  if (ast::OperationType opType = parentType.dyn_cast<ast::OperationType>()) {
+  if (ast::OperationType opType = dyn_cast<ast::OperationType>(parentType)) {
     if (isa<ast::AllResultsMemberAccessExpr>(expr)) {
       Type mlirType = genType(expr->getType());
       if (isa<pdl::ValueType>(mlirType))
@@ -480,7 +480,7 @@ Value CodeGen::genExprImpl(const ast::MemberAccessExpr *expr) {
   }
 
   // Handle tuple based member access.
-  if (auto tupleType = parentType.dyn_cast<ast::TupleType>()) {
+  if (auto tupleType = dyn_cast<ast::TupleType>(parentType)) {
     auto elementNames = tupleType.getElementNames();
 
     // The index is either a numeric index, or a name.
@@ -581,14 +581,14 @@ CodeGen::genConstraintOrRewriteCall(const T *decl, Location loc,
   if (!cstBody) {
     ast::Type declResultType = decl->getResultType();
     SmallVector<Type> resultTypes;
-    if (ast::TupleType tupleType = declResultType.dyn_cast<ast::TupleType>()) {
+    if (ast::TupleType tupleType = dyn_cast<ast::TupleType>(declResultType)) {
       for (ast::Type type : tupleType.getElementTypes())
         resultTypes.push_back(genType(type));
     } else {
       resultTypes.push_back(genType(declResultType));
     }
-    PDLOpT pdlOp = builder.create<PDLOpT>(
-        loc, resultTypes, decl->getName().getName(), inputs);
+    PDLOpT pdlOp = builder.create<PDLOpT>(loc, resultTypes,
+                                          decl->getName().getName(), inputs);
     if (isNegated && std::is_same_v<PDLOpT, pdl::ApplyNativeConstraintOp>)
       cast<pdl::ApplyNativeConstraintOp>(pdlOp).setIsNegated(true);
     return pdlOp->getResults();
diff --git a/mlir/lib/Tools/PDLL/Parser/Parser.cpp b/mlir/lib/Tools/PDLL/Parser/Parser.cpp
index 9f931f4fce00..45f9e2f899a7 100644
--- a/mlir/lib/Tools/PDLL/Parser/Parser.cpp
+++ b/mlir/lib/Tools/PDLL/Parser/Parser.cpp
@@ -623,7 +623,7 @@ LogicalResult Parser::convertExpressionTo(
     return diag;
   };
 
-  if (auto exprOpType = exprType.dyn_cast<ast::OperationType>())
+  if (auto exprOpType = dyn_cast<ast::OperationType>(exprType))
     return convertOpExpressionTo(expr, exprOpType, type, emitConvertError);
 
   // FIXME: Decide how to allow/support converting a single result to multiple,
@@ -638,7 +638,7 @@ LogicalResult Parser::convertExpressionTo(
     return success();
 
   // Handle tuple types.
-  if (auto exprTupleType = exprType.dyn_cast<ast::TupleType>())
+  if (auto exprTupleType = dyn_cast<ast::TupleType>(exprType))
     return convertTupleExpressionTo(expr, exprTupleType, type, emitConvertError,
                                     noteAttachFn);
 
@@ -650,7 +650,7 @@ LogicalResult Parser::convertOpExpressionTo(
     function_ref<ast::InFlightDiagnostic()> emitErrorFn) {
   // Two operation types are compatible if they have the same name, or if the
   // expected type is more general.
-  if (auto opType = type.dyn_cast<ast::OperationType>()) {
+  if (auto opType = dyn_cast<ast::OperationType>(type)) {
     if (opType.getName())
       return emitErrorFn();
     return success();
@@ -702,7 +702,7 @@ LogicalResult Parser::convertTupleExpressionTo(
     function_ref<ast::InFlightDiagnostic()> emitErrorFn,
     function_ref<void(ast::Diagnostic &diag)> noteAttachFn) {
   // Handle conversions between tuples.
-  if (auto tupleType = type.dyn_cast<ast::TupleType>()) {
+  if (auto tupleType = dyn_cast<ast::TupleType>(type)) {
     if (tupleType.size() != exprType.size())
       return emitErrorFn();
 
@@ -2568,7 +2568,7 @@ Parser::createVariableDecl(StringRef name, SMRange loc, ast::Expr *initializer,
   }
 
   // Constraint types cannot be used when defining variables.
-  if (type.isa<ast::ConstraintType, ast::RewriteType>()) {
+  if (isa<ast::ConstraintType, ast::RewriteType>(type)) {
     return emitError(
         loc, llvm::formatv("unable to define variable of `{0}` type", type));
   }
@@ -2782,7 +2782,7 @@ Parser::createMemberAccessExpr(ast::Expr *parentExpr, StringRef name,
 FailureOr<ast::Type> Parser::validateMemberAccess(ast::Expr *parentExpr,
                                                   StringRef name, SMRange loc) {
   ast::Type parentType = parentExpr->getType();
-  if (ast::OperationType opType = parentType.dyn_cast<ast::OperationType>()) {
+  if (ast::OperationType opType = dyn_cast<ast::OperationType>(parentType)) {
     if (name == ast::AllResultsMemberAccessExpr::getMemberName())
       return valueRangeTy;
 
@@ -2808,7 +2808,7 @@ FailureOr<ast::Type> Parser::validateMemberAccess(ast::Expr *parentExpr,
       // operations. It returns a single value.
       return valueTy;
     }
-  } else if (auto tupleType = parentType.dyn_cast<ast::TupleType>()) {
+  } else if (auto tupleType = dyn_cast<ast::TupleType>(parentType)) {
     // Handle indexed results.
     unsigned index = 0;
     if (llvm::isDigit(name[0]) && !name.getAsInteger(/*Radix=*/10, index) &&
@@ -2845,7 +2845,7 @@ FailureOr<ast::OperationExpr *> Parser::createOperationExpr(
   for (ast::NamedAttributeDecl *attr : attributes) {
     // Check for an attribute type, or a type awaiting resolution.
     ast::Type attrType = attr->getValue()->getType();
-    if (!attrType.isa<ast::AttributeType>()) {
+    if (!isa<ast::AttributeType>(attrType)) {
       return emitError(
           attr->getValue()->getLoc(),
           llvm::formatv("expected `Attr` expression, but got `{0}`", attrType));
@@ -3024,7 +3024,7 @@ LogicalResult Parser::validateOperationOperandsOrResults(
     // ValueRange. This situations arises quite often with nested operation
     // expressions: `op<my_dialect.foo>(op<my_dialect.bar>)`
     if (singleTy == valueTy) {
-      if (valueExprType.isa<ast::OperationType>()) {
+      if (isa<ast::OperationType>(valueExprType)) {
         valueExpr = convertOpToValue(valueExpr);
         continue;
       }
@@ -3048,7 +3048,7 @@ Parser::createTupleExpr(SMRange loc, ArrayRef<ast::Expr *> elements,
                         ArrayRef<StringRef> elementNames) {
   for (const ast::Expr *element : elements) {
     ast::Type eleTy = element->getType();
-    if (eleTy.isa<ast::ConstraintType, ast::RewriteType, ast::TupleType>()) {
+    if (isa<ast::ConstraintType, ast::RewriteType, ast::TupleType>(eleTy)) {
       return emitError(
           element->getLoc(),
           llvm::formatv("unable to build a tuple with `{0}` element", eleTy));
@@ -3064,7 +3064,7 @@ FailureOr<ast::EraseStmt *> Parser::createEraseStmt(SMRange loc,
                                                     ast::Expr *rootOp) {
   // Check that root is an Operation.
   ast::Type rootType = rootOp->getType();
-  if (!rootType.isa<ast::OperationType>())
+  if (!isa<ast::OperationType>(rootType))
     return emitError(rootOp->getLoc(), "expected `Op` expression");
 
   return ast::EraseStmt::create(ctx, loc, rootOp);
@@ -3075,7 +3075,7 @@ Parser::createReplaceStmt(SMRange loc, ast::Expr *rootOp,
                           MutableArrayRef<ast::Expr *> replValues) {
   // Check that root is an Operation.
   ast::Type rootType = rootOp->getType();
-  if (!rootType.isa<ast::OperationType>()) {
+  if (!isa<ast::OperationType>(rootType)) {
     return emitError(
         rootOp->getLoc(),
         llvm::formatv("expected `Op` expression, but got `{0}`", rootType));
@@ -3088,7 +3088,7 @@ Parser::createReplaceStmt(SMRange loc, ast::Expr *rootOp,
     ast::Type replType = replExpr->getType();
 
     // Check that replExpr is an Operation, Value, or ValueRange.
-    if (replType.isa<ast::OperationType>()) {
+    if (isa<ast::OperationType>(replType)) {
       if (shouldConvertOpToValues)
         replExpr = convertOpToValue(replExpr);
       continue;
@@ -3110,7 +3110,7 @@ Parser::createRewriteStmt(SMRange loc, ast::Expr *rootOp,
                           ast::CompoundStmt *rewriteBody) {
   // Check that root is an Operation.
   ast::Type rootType = rootOp->getType();
-  if (!rootType.isa<ast::OperationType>()) {
+  if (!isa<ast::OperationType>(rootType)) {
     return emitError(
         rootOp->getLoc(),
         llvm::formatv("expected `Op` expression, but got `{0}`", rootType));
@@ -3125,9 +3125,9 @@ Parser::createRewriteStmt(SMRange loc, ast::Expr *rootOp,
 
 LogicalResult Parser::codeCompleteMemberAccess(ast::Expr *parentExpr) {
   ast::Type parentType = parentExpr->getType();
-  if (ast::OperationType opType = parentType.dyn_cast<ast::OperationType>())
+  if (ast::OperationType opType = dyn_cast<ast::OperationType>(parentType))
     codeCompleteContext->codeCompleteOperationMemberAccess(opType);
-  else if (ast::TupleType tupleType = parentType.dyn_cast<ast::TupleType>())
+  else if (ast::TupleType tupleType = dyn_cast<ast::TupleType>(parentType))
     codeCompleteContext->codeCompleteTupleMemberAccess(tupleType);
   return failure();
 }
diff --git a/mlir/lib/Tools/lsp-server-support/Transport.cpp b/mlir/lib/Tools/lsp-server-support/Transport.cpp
index 64dea35614c0..1e90ab32281f 100644
--- a/mlir/lib/Tools/lsp-server-support/Transport.cpp
+++ b/mlir/lib/Tools/lsp-server-support/Transport.cpp
@@ -51,12 +51,12 @@ private:
 
 Reply::Reply(const llvm::json::Value &id, llvm::StringRef method,
              JSONTransport &transport, std::mutex &transportOutputMutex)
-    : id(id), transport(&transport),
+    : method(method), id(id), transport(&transport),
       transportOutputMutex(transportOutputMutex) {}
 
 Reply::Reply(Reply &&other)
-    : replied(other.replied.load()), id(std::move(other.id)),
-      transport(other.transport),
+    : method(other.method), replied(other.replied.load()),
+      id(std::move(other.id)), transport(other.transport),
       transportOutputMutex(other.transportOutputMutex) {
   other.transport = nullptr;
 }
@@ -117,21 +117,29 @@ bool MessageHandler::onCall(llvm::StringRef method, llvm::json::Value params,
 
 bool MessageHandler::onReply(llvm::json::Value id,
                              llvm::Expected<llvm::json::Value> result) {
-  // TODO: Add support for reply callbacks when support for outgoing messages is
-  // added. For now, we just log an error on any replies received.
-  Callback<llvm::json::Value> replyHandler =
-      [&id](llvm::Expected<llvm::json::Value> result) {
-        Logger::error(
-            "received a reply with ID {0}, but there was no such call", id);
-        if (!result)
-          llvm::consumeError(result.takeError());
-      };
-
-  // Log and run the reply handler.
-  if (result)
-    replyHandler(std::move(result));
-  else
-    replyHandler(result.takeError());
+  // Find the response handler in the mapping. If it exists, move it out of the
+  // mapping and erase it.
+  ResponseHandlerTy responseHandler;
+  {
+    std::lock_guard<std::mutex> responseHandlersLock(responseHandlersMutex);
+    auto it = responseHandlers.find(debugString(id));
+    if (it != responseHandlers.end()) {
+      responseHandler = std::move(it->second);
+      responseHandlers.erase(it);
+    }
+  }
+
+  // If we found a response handler, invoke it. Otherwise, log an error.
+  if (responseHandler.second) {
+    Logger::info("--> reply:{0}({1})", responseHandler.first, id);
+    responseHandler.second(std::move(id), std::move(result));
+  } else {
+    Logger::error(
+        "received a reply with ID {0}, but there was no such outgoing request",
+        id);
+    if (!result)
+      llvm::consumeError(result.takeError());
+  }
   return true;
 }
 
diff --git a/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp b/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp
index d282ee8f61d8..ae0961c62aba 100644
--- a/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp
+++ b/mlir/lib/Tools/mlir-pdll-lsp-server/PDLLServer.cpp
@@ -137,7 +137,8 @@ struct PDLIndexSymbol {
 
   /// Return the location of the definition of this symbol.
   SMRange getDefLoc() const {
-    if (const ast::Decl *decl = llvm::dyn_cast_if_present<const ast::Decl *>(definition)) {
+    if (const ast::Decl *decl =
+            llvm::dyn_cast_if_present<const ast::Decl *>(definition)) {
       const ast::Name *declName = decl->getName();
       return declName ? declName->getLoc() : decl->getLoc();
     }
@@ -466,7 +467,8 @@ PDLDocument::findHover(const lsp::URIForFile &uri,
     return std::nullopt;
 
   // Add hover for operation names.
-  if (const auto *op = llvm::dyn_cast_if_present<const ods::Operation *>(symbol->definition))
+  if (const auto *op =
+          llvm::dyn_cast_if_present<const ods::Operation *>(symbol->definition))
     return buildHoverForOpName(op, hoverRange);
   const auto *decl = symbol->definition.get<const ast::Decl *>();
   return findHover(decl, hoverRange);
@@ -587,7 +589,7 @@ lsp::Hover PDLDocument::buildHoverForUserConstraintOrRewrite(
       hoverOS << "***\n";
     }
     ast::Type resultType = decl->getResultType();
-    if (auto resultTupleTy = resultType.dyn_cast<ast::TupleType>()) {
+    if (auto resultTupleTy = dyn_cast<ast::TupleType>(resultType)) {
       if (!resultTupleTy.empty()) {
         hoverOS << "Results:\n";
         for (auto it : llvm::zip(resultTupleTy.getElementNames(),
@@ -795,13 +797,13 @@ public:
     }
     if (allowInlineTypeConstraints) {
       /// Attr<Type>.
-      if (!currentType || currentType.isa<ast::AttributeType>())
+      if (!currentType || isa<ast::AttributeType>(currentType))
         addCoreConstraint("Attr<type>", "mlir::Attribute", "Attr<$1>");
       /// Value<Type>.
-      if (!currentType || currentType.isa<ast::ValueType>())
+      if (!currentType || isa<ast::ValueType>(currentType))
         addCoreConstraint("Value<type>", "mlir::Value", "Value<$1>");
       /// ValueRange<TypeRange>.
-      if (!currentType || currentType.isa<ast::ValueRangeType>())
+      if (!currentType || isa<ast::ValueRangeType>(currentType))
         addCoreConstraint("ValueRange<type>", "mlir::ValueRange",
                           "ValueRange<$1>");
     }
@@ -1242,7 +1244,7 @@ void PDLDocument::getInlayHintsFor(const ast::OperationExpr *expr,
                                    const lsp::URIForFile &uri,
                                    std::vector<lsp::InlayHint> &inlayHints) {
   // Check for ODS information.
-  ast::OperationType opType = expr->getType().dyn_cast<ast::OperationType>();
+  ast::OperationType opType = dyn_cast<ast::OperationType>(expr->getType());
   const auto *odsOp = opType ? opType.getODSOperation() : nullptr;
 
   auto addOpHint = [&](const ast::Expr *valueExpr, StringRef label) {
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py b/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py
index 23d6d26b7e29..bb43ebf2b692 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py
@@ -291,6 +291,12 @@ class UnaryFn:
     ceil = UnaryFnType("ceil")
     floor = UnaryFnType("floor")
     negf = UnaryFnType("negf")
+    round = UnaryFnType("round")
+    sqrt = UnaryFnType("sqrt")
+    rsqrt = UnaryFnType("rsqrt")
+    square = UnaryFnType("square")
+    tanh = UnaryFnType("tanh")
+    erf = UnaryFnType("erf")
 
 
 class BinaryFnType:
@@ -330,6 +336,7 @@ class BinaryFn:
     min_signed = BinaryFnType("min_signed")
     max_unsigned = BinaryFnType("max_unsigned")
     min_unsigned = BinaryFnType("min_unsigned")
+    powf = BinaryFnType("powf")
 
 
 class TypeFnType:
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
index 5b05364f6d35..ca2bb0c5f7f8 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -109,6 +109,78 @@ def negf(
 
 
 @linalg_structured_op
+def round(
+    I=TensorDef(T1),
+    O=TensorDef(T1, output=True),
+):
+    """Applies round(x) elementwise.
+
+    No numeric casting is performed on the input operand.
+    """
+    O[None] = UnaryFn.round(I[None])
+
+
+@linalg_structured_op
+def sqrt(
+    I=TensorDef(T1),
+    O=TensorDef(T1, output=True),
+):
+    """Applies sqrt(x) elementwise.
+
+    No numeric casting is performed on the input operand.
+    """
+    O[None] = UnaryFn.sqrt(I[None])
+
+
+@linalg_structured_op
+def rsqrt(
+    I=TensorDef(T1),
+    O=TensorDef(T1, output=True),
+):
+    """Applies rsqrt(x) elementwise.
+
+    No numeric casting is performed on the input operand.
+    """
+    O[None] = UnaryFn.rsqrt(I[None])
+
+
+@linalg_structured_op
+def square(
+    I=TensorDef(T1),
+    O=TensorDef(T1, output=True),
+):
+    """Applies square(x) elementwise.
+
+    No numeric casting is performed on the input operand.
+    """
+    O[None] = UnaryFn.square(I[None])
+
+
+@linalg_structured_op
+def tanh(
+    I=TensorDef(T1),
+    O=TensorDef(T1, output=True),
+):
+    """Applies tanh(x) elementwise.
+
+    No numeric casting is performed on the input operand.
+    """
+    O[None] = UnaryFn.tanh(I[None])
+
+
+@linalg_structured_op
+def erf(
+    I=TensorDef(T1),
+    O=TensorDef(T1, output=True),
+):
+    """Applies erf(x) elementwise.
+
+    No numeric casting is performed on the input operand.
+    """
+    O[None] = UnaryFn.erf(I[None])
+
+
+@linalg_structured_op
 def elemwise_binary(
     lhs=TensorDef(T1),
     rhs=TensorDef(T2),
@@ -233,13 +305,53 @@ def max(
 
     This means reduction/broadcast/element cast semantics is explicit. Further
     passes can take that into account when lowering this code. For example,
-    a `linalg.broadcast` + `linalg.div` sequence can be lowered to a
+    a `linalg.broadcast` + `linalg.max` sequence can be lowered to a
     `linalg.generic` with different affine maps for the two operands.
     """
     O[None] = BinaryFn.max_signed(lhs[None], rhs[None])
 
 
 @linalg_structured_op
+def min(
+    lhs=TensorDef(T1),
+    rhs=TensorDef(T1),
+    O=TensorDef(T1, output=True),
+):
+    """Takes the min (signed) between two inputs, elementwise.
+
+    The shapes and element types must be identical. The appropriate casts,
+    broadcasts and reductions should be done previously to calling this op.
+
+    This means reduction/broadcast/element cast semantics is explicit. Further
+    passes can take that into account when lowering this code. For example,
+    a `linalg.broadcast` + `linalg.min` sequence can be lowered to a
+    `linalg.generic` with different affine maps for the two operands.
+    """
+    O[None] = BinaryFn.min_signed(lhs[None], rhs[None])
+
+
+@linalg_structured_op
+def powf(
+    lhs=TensorDef(T1),
+    rhs=TensorDef(T1),
+    O=TensorDef(T1, output=True),
+):
+    """Takes the powf(lhs, rhs) between two inputs, elementwise. For powf(arg, 2) use `linalg.square`.
+
+    Only applies to floating point values.
+
+    The shapes and element types must be identical. The appropriate casts,
+    broadcasts and reductions should be done previously to calling this op.
+
+    This means reduction/broadcast/element cast semantics is explicit. Further
+    passes can take that into account when lowering this code. For example,
+    a `linalg.broadcast` + `linalg.powf` sequence can be lowered to a
+    `linalg.generic` with different affine maps for the two operands.
+    """
+    O[None] = BinaryFn.powf(lhs[None], rhs[None])
+
+
+@linalg_structured_op
 def matmul(
     A=TensorDef(T1, S.M, S.K),
     B=TensorDef(T2, S.K, S.N),
diff --git a/mlir/test/CAPI/sparse_tensor.c b/mlir/test/CAPI/sparse_tensor.c
index f241e0e5c2fb..22b7052b732a 100644
--- a/mlir/test/CAPI/sparse_tensor.c
+++ b/mlir/test/CAPI/sparse_tensor.c
@@ -27,7 +27,7 @@ static int testRoundtripEncoding(MlirContext ctx) {
   const char *originalAsm =
     "#sparse_tensor.encoding<{ "
     "map = [s0](d0, d1) -> (s0 : dense, d0 : compressed, d1 : compressed), "
-    "posWidth = 32, crdWidth = 64 }>";
+    "posWidth = 32, crdWidth = 64, explicitVal = 1 : i64}>";
   // clang-format on
   MlirAttribute originalAttr =
       mlirAttributeParseGet(ctx, mlirStringRefCreateFromCString(originalAsm));
@@ -56,8 +56,21 @@ static int testRoundtripEncoding(MlirContext ctx) {
   // CHECK: crdWidth: 64
   int crdWidth = mlirSparseTensorEncodingAttrGetCrdWidth(originalAttr);
   fprintf(stderr, "crdWidth: %d\n", crdWidth);
+
+  // CHECK: explicitVal: 1 : i64
+  MlirAttribute explicitVal =
+      mlirSparseTensorEncodingAttrGetExplicitVal(originalAttr);
+  fprintf(stderr, "explicitVal: ");
+  mlirAttributeDump(explicitVal);
+  // CHECK: implicitVal: <<NULL ATTRIBUTE>>
+  MlirAttribute implicitVal =
+      mlirSparseTensorEncodingAttrGetImplicitVal(originalAttr);
+  fprintf(stderr, "implicitVal: ");
+  mlirAttributeDump(implicitVal);
+
   MlirAttribute newAttr = mlirSparseTensorEncodingAttrGet(
-      ctx, lvlRank, lvlTypes, dimToLvl, lvlToDim, posWidth, crdWidth);
+      ctx, lvlRank, lvlTypes, dimToLvl, lvlToDim, posWidth, crdWidth,
+      explicitVal, implicitVal);
   mlirAttributeDump(newAttr); // For debugging filecheck output.
   // CHECK: equal: 1
   fprintf(stderr, "equal: %d\n", mlirAttributeEqual(originalAttr, newAttr));
diff --git a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
index 87d613986c7c..b86103422b07 100644
--- a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir
@@ -453,7 +453,7 @@ func.func @collapse_shape_dynamic_with_non_identity_layout(
 
 func.func @expand_shape_static(%arg0: memref<3x4x5xf32>) -> memref<1x3x4x1x5xf32> {
   // Reshapes that expand a contiguous tensor with some 1's.
-  %0 = memref.expand_shape %arg0 [[0, 1], [2], [3, 4]]
+  %0 = memref.expand_shape %arg0 [[0, 1], [2], [3, 4]] output_shape [1, 3, 4, 1, 5]
       : memref<3x4x5xf32> into memref<1x3x4x1x5xf32>
   return %0 : memref<1x3x4x1x5xf32>
 }
@@ -510,7 +510,7 @@ func.func @collapse_shape_fold_zero_dim(%arg0 : memref<1x1xf32>) -> memref<f32>
 // -----
 
 func.func @expand_shape_zero_dim(%arg0 : memref<f32>) -> memref<1x1xf32> {
-  %0 = memref.expand_shape %arg0 [] : memref<f32> into memref<1x1xf32>
+  %0 = memref.expand_shape %arg0 [] output_shape [1, 1] : memref<f32> into memref<1x1xf32>
   return %0 : memref<1x1xf32>
 }
 
@@ -571,13 +571,13 @@ func.func @collapse_shape_dynamic(%arg0 : memref<1x2x?xf32>) -> memref<1x?xf32>
 
 // -----
 
-func.func @expand_shape_dynamic(%arg0 : memref<1x?xf32>) -> memref<1x2x?xf32> {
-  %0 = memref.expand_shape %arg0 [[0], [1, 2]]: memref<1x?xf32> into memref<1x2x?xf32>
+func.func @expand_shape_dynamic(%arg0 : memref<1x?xf32>, %sz0: index) -> memref<1x2x?xf32> {
+  %0 = memref.expand_shape %arg0 [[0], [1, 2]] output_shape [1, 2, %sz0]: memref<1x?xf32> into memref<1x2x?xf32>
   return %0 : memref<1x2x?xf32>
 }
 
 // CHECK-LABEL:   func.func @expand_shape_dynamic(
-// CHECK-SAME:                                    %[[ARG:.*]]: memref<1x?xf32>) -> memref<1x2x?xf32> {
+// CHECK-SAME:              %[[ARG:.*]]: memref<1x?xf32>, %[[SZ0:.*]]: index) -> memref<1x2x?xf32> {
 // CHECK:           %[[MEM:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : memref<1x?xf32> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
 // CHECK:           %[[BASE_BUFFER:.*]] = llvm.extractvalue %[[MEM]][0] : !llvm.struct<(ptr, ptr, i64,
 // CHECK:           %[[ALIGNED_BUFFER:.*]] = llvm.extractvalue %[[MEM]][1] : !llvm.struct<(ptr, ptr, i64,
@@ -614,15 +614,15 @@ func.func @expand_shape_dynamic(%arg0 : memref<1x?xf32>) -> memref<1x2x?xf32> {
 // -----
 
 func.func @expand_shape_dynamic_with_non_identity_layout(
-            %arg0 : memref<1x?xf32, strided<[?, ?], offset: ?>>) ->
+            %arg0 : memref<1x?xf32, strided<[?, ?], offset: ?>>, %sz0: index) ->
             memref<1x2x?xf32, strided<[?, ?, ?], offset: ?>> {
-  %0 = memref.expand_shape %arg0 [[0], [1, 2]]:
+  %0 = memref.expand_shape %arg0 [[0], [1, 2]] output_shape [1, 2, %sz0] :
     memref<1x?xf32, strided<[?, ?], offset: ?>> into
     memref<1x2x?xf32, strided<[?, ?, ?], offset: ?>>
   return %0 : memref<1x2x?xf32, strided<[?, ?, ?], offset: ?>>
 }
 // CHECK-LABEL:   func.func @expand_shape_dynamic_with_non_identity_layout(
-// CHECK-SAME:                                                             %[[ARG:.*]]: memref<1x?xf32, strided<[?, ?], offset: ?>>) -> memref<1x2x?xf32, strided<[?, ?, ?], offset: ?>> {
+// CHECK-SAME:        %[[ARG:.*]]: memref<1x?xf32, strided<[?, ?], offset: ?>>, %[[SZ0:.*]]: index) -> memref<1x2x?xf32, strided<[?, ?, ?], offset: ?>> {
 // CHECK:           %[[MEM:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : memref<1x?xf32, strided<[?, ?], offset: ?>> to !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
 // CHECK:           %[[BASE_BUFFER:.*]] = llvm.extractvalue %[[MEM]][0] : !llvm.struct<(ptr, ptr, i64,
 // CHECK:           %[[ALIGNED_BUFFER:.*]] = llvm.extractvalue %[[MEM]][1] : !llvm.struct<(ptr, ptr, i64,
diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
index 37999d6fc14a..baf9cfe610a5 100644
--- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
@@ -334,9 +334,9 @@ memref.global "private" @gv4 : memref<f32> = dense<1.0> {alignment = 64}
 // CHECK-LABEL: func @expand_shape_static(
 // CHECK-SAME:         %[[ARG:.*]]: memref<{{.*}}>)
 func.func @expand_shape_static(%arg0: memref<3x4x5xf32>) -> memref<1x3x4x1x5xf32> {
-  // CHECK: memref.expand_shape %[[ARG]] {{\[}}[0, 1], [2], [3, 4]]
+  // CHECK: memref.expand_shape %[[ARG]] {{\[}}[0, 1], [2], [3, 4]] output_shape [1, 3, 4, 1, 5]
   // Reshapes that expand a contiguous tensor with some 1's.
-  %0 = memref.expand_shape %arg0 [[0, 1], [2], [3, 4]]
+  %0 = memref.expand_shape %arg0 [[0, 1], [2], [3, 4]] output_shape [1, 3, 4, 1, 5]
       : memref<3x4x5xf32> into memref<1x3x4x1x5xf32>
   return %0 : memref<1x3x4x1x5xf32>
 }
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
index 3ec15221e299..45b39f79a2a7 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -348,7 +348,7 @@ func.func @test_add_2d_all_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32
 // CHECK-SAME: %[[ARG1:[0-9a-zA-Z_]*]]:
 func.func @test_add_2d_different_ranks(%arg0: tensor<3x4xf32>, %arg1: tensor<2x3x4xf32>) -> tensor<2x3x4xf32> {
 
-  // CHECK: %[[ARG0_EXPANDED:.*]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2]] : tensor<3x4xf32> into tensor<1x3x4xf32>
+  // CHECK: %[[ARG0_EXPANDED:.*]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2]] output_shape [1, 3, 4] : tensor<3x4xf32> into tensor<1x3x4xf32>
   // CHECK: %[[VAL_0:.*]] = tensor.empty() : tensor<2x3x4xf32>
   // CHECK: %[[RESULT:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel"]} ins(%[[ARG0_EXPANDED]], %[[ARG1]] : tensor<1x3x4xf32>, tensor<2x3x4xf32>) outs(%[[VAL_0]] : tensor<2x3x4xf32>) {
   // CHECK: ^bb0(%[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: f32, %[[VAL_3:.*]]: f32):
@@ -626,7 +626,7 @@ func.func @test_simple_i32(%arg0: tensor<1xi32>) -> () {
 
   // CHECK: linalg.generic
   // CHECK: arith.divsi
-  %4 = tosa.div %arg0, %arg0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32>
+  %40 = tosa.int_div %arg0, %arg0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32>
 
   // CHECK: linalg.generic
   // CHECK: ^bb0(%[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32):
@@ -871,7 +871,7 @@ func.func @reduce_float(%arg0: tensor<5x4xf32>) -> () {
   // CHECK:   [[RES:%.+]] = arith.addf %[[ARG1]], %[[ARG2]] : f32
   // CHECK:   linalg.yield [[RES]] : f32
   // CHECK:  }
-  // CHECK: tensor.expand_shape [[REDUCE]] {{\[}}[0, 1]] : tensor<4xf32> into tensor<1x4xf32>
+  // CHECK: tensor.expand_shape [[REDUCE]] {{\[}}[0, 1]] output_shape [1, 4] : tensor<4xf32> into tensor<1x4xf32>
   %0 = tosa.reduce_sum %arg0 {axis = 0 : i32} : (tensor<5x4xf32>) -> tensor<1x4xf32>
 
   // CHECK: [[INIT:%.+]] = tensor.empty() : tensor<5xf32>
@@ -882,7 +882,7 @@ func.func @reduce_float(%arg0: tensor<5x4xf32>) -> () {
   // CHECK:   [[RES:%.+]] = arith.addf %[[ARG1]], %[[ARG2]] : f32
   // CHECK:   linalg.yield [[RES]] : f32
   // CHECK:  }
-  // CHECK: tensor.expand_shape [[REDUCE]] {{\[}}[0, 1]] : tensor<5xf32> into tensor<5x1xf32>
+  // CHECK: tensor.expand_shape [[REDUCE]] {{\[}}[0, 1]] output_shape [5, 1] : tensor<5xf32> into tensor<5x1xf32>
   %1 = tosa.reduce_sum %arg0 {axis = 1 : i32} : (tensor<5x4xf32>) -> tensor<5x1xf32>
 
   // CHECK: arith.constant 1.0
@@ -920,7 +920,10 @@ func.func @reduce_float_dyn(%arg0: tensor<?x5x4xf32>) -> () {
   // CHECK:   %[[RES:.+]] = arith.addf %[[ARG1]], %[[ARG2]] : f32
   // CHECK:   linalg.yield %[[RES]] : f32
   // CHECK:  }
-  // CHECK: tensor.expand_shape %[[REDUCE]] {{\[}}[0], [1, 2]] : tensor<?x4xf32> into tensor<?x1x4xf32>
+  // CHECK: %[[C0_0:.+]] = arith.constant 0 : index
+  // CHECK: %[[DIM_1:.+]] = tensor.dim %[[REDUCE]], %[[C0_0]] : tensor<?x4xf32>
+  // CHECK: %[[C1:.+]] = arith.constant 1 : index
+  // CHECK: tensor.expand_shape %[[REDUCE]] {{\[}}[0], [1, 2]] output_shape [%[[DIM_1]], 1, 4] : tensor<?x4xf32> into tensor<?x1x4xf32>
   %0 = tosa.reduce_sum %arg0 {axis = 1 : i32} : (tensor<?x5x4xf32>) -> tensor<?x1x4xf32>
   return
 }
@@ -938,7 +941,7 @@ func.func @reduce_float_dyn_rank_1(%arg0: tensor<?xf32>) -> () {
   // CHECK:   %[[RES:.+]] = arith.addf %[[ARG1]], %[[ARG2]] : f32
   // CHECK:   linalg.yield %[[RES]] : f32
   // CHECK:  }
-  // CHECK: tensor.expand_shape %[[REDUCE]] {{\[}}] : tensor<f32> into tensor<1xf32>
+  // CHECK: tensor.expand_shape %[[REDUCE]] {{\[}}] output_shape [1] : tensor<f32> into tensor<1xf32>
   %0 = tosa.reduce_sum %arg0 {axis = 0 : i32} : (tensor<?xf32>) -> tensor<1xf32>
   return
 }
@@ -958,7 +961,10 @@ func.func @reduce_float_dyn_nonzero_batch(%arg0: tensor<5x?x4xf32>) -> () {
   // CHECK:   %[[RES:.+]] = arith.mulf %[[ARG1]], %[[ARG2]] : f32
   // CHECK:   linalg.yield %[[RES]] : f32
   // CHECK:  }
-  // CHECK: tensor.expand_shape %[[REDUCE]] {{\[}}[0], [1, 2]] : tensor<5x?xf32> into tensor<5x?x1xf32>
+  // CHECK: %[[C1_0:.+]] = arith.constant 1 : index
+  // CHECK: %[[DIM_1:.+]] = tensor.dim %[[REDUCE]], %[[C1_0]] : tensor<5x?xf32>
+  // CHECK: %[[C1_2:.+]] = arith.constant 1 : index
+  // CHECK: tensor.expand_shape %[[REDUCE]] {{\[}}[0], [1, 2]] output_shape [5, %[[DIM_1]], 1] : tensor<5x?xf32> into tensor<5x?x1xf32>
   %0 = tosa.reduce_prod %arg0 {axis = 2 : i32} : (tensor<5x?x4xf32>) -> tensor<5x?x1xf32>
   return
 }
@@ -978,7 +984,10 @@ func.func @reduce_float_dyn_multiple(%arg0: tensor<?x?xf32>) -> () {
   // CHECK:   %[[MAX:.+]] = arith.maximumf %[[ARG1]], %[[ARG2]] : f32
   // CHECK:   linalg.yield %[[MAX]] : f32
   // CHECK:  }
-  // CHECK: tensor.expand_shape %[[REDUCE]] {{\[}}[0, 1]] : tensor<?xf32> into tensor<?x1xf32>
+  // CHECK: %[[C0_0:.+]] = arith.constant 0 : index
+  // CHECK: %[[DIM_1:.+]] = tensor.dim %[[REDUCE]], %[[C0_0]] : tensor<?xf32>
+  // CHECK: %[[C1_2:.+]] = arith.constant 1 : index
+  // CHECK: tensor.expand_shape %[[REDUCE]] {{\[}}[0, 1]] output_shape [%[[DIM_1]], 1] : tensor<?xf32> into tensor<?x1xf32>
   %0 = tosa.reduce_max %arg0 {axis = 1 : i32} : (tensor<?x?xf32>) -> tensor<?x1xf32>
   return
 }
@@ -996,7 +1005,7 @@ func.func @reduce_int(%arg0: tensor<5x4xi32>) -> () {
   // CHECK:   [[RES:%.+]] = arith.addi %[[ARG1]], %[[ARG2]] : i32
   // CHECK:   linalg.yield [[RES]] : i32
   // CHECK:  }
-  // CHECK: tensor.expand_shape [[REDUCE]] {{\[}}[0, 1]] : tensor<4xi32> into tensor<1x4xi32>
+  // CHECK: tensor.expand_shape [[REDUCE]] {{\[}}[0, 1]] output_shape [1, 4] : tensor<4xi32> into tensor<1x4xi32>
   %0 = tosa.reduce_sum %arg0 {axis = 0 : i32} : (tensor<5x4xi32>) -> tensor<1x4xi32>
 
   // CHECK: [[INIT:%.+]] = tensor.empty()
@@ -1007,7 +1016,7 @@ func.func @reduce_int(%arg0: tensor<5x4xi32>) -> () {
   // CHECK:   [[RES:%.+]] = arith.addi %[[ARG1]], %[[ARG2]] : i32
   // CHECK:   linalg.yield [[RES]] : i32
   // CHECK:  }
-  // CHECK: tensor.expand_shape [[REDUCE]] {{\[}}[0, 1]] : tensor<5xi32> into tensor<5x1xi32>
+  // CHECK: tensor.expand_shape [[REDUCE]] {{\[}}[0, 1]] output_shape [5, 1] : tensor<5xi32> into tensor<5x1xi32>
   %1 = tosa.reduce_sum %arg0 {axis = 1 : i32} : (tensor<5x4xi32>) -> tensor<5x1xi32>
 
   // CHECK: arith.constant 1
@@ -1043,7 +1052,7 @@ func.func @reduce_bool(%arg0: tensor<5x4xi1>) -> () {
   // CHECK:   [[RES:%.+]] = arith.andi %[[ARG1]], %[[ARG2]] : i1
   // CHECK:   linalg.yield [[RES]] : i1
   // CHECK:  }
-  // CHECK: tensor.expand_shape [[REDUCE]] {{\[}}[0, 1]] : tensor<4xi1> into tensor<1x4xi1>
+  // CHECK: tensor.expand_shape [[REDUCE]] {{\[}}[0, 1]] output_shape [1, 4] : tensor<4xi1> into tensor<1x4xi1>
   %0 = tosa.reduce_all %arg0 {axis = 0 : i32} : (tensor<5x4xi1>) -> tensor<1x4xi1>
 
   // CHECK: arith.constant false
diff --git a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
index a8a3c42e1684..b8c3d56f21f1 100644
--- a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
+++ b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
@@ -14,7 +14,7 @@ func.func @test_reshape_0d_same_s2s_explicit(%arg0: tensor<f32>) -> tensor<f32>
 
 // CHECK-LABEL: test_reshape_0d_up_s2d_auto
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<f32>
-// CHECK: %[[VAL_0:.*]] = tensor.expand_shape %[[ARG_0]] [] : tensor<f32> into tensor<1xf32>
+// CHECK: %[[VAL_0:.*]] = tensor.expand_shape %[[ARG_0]] [] output_shape [1] : tensor<f32> into tensor<1xf32>
 // CHECK: %[[VAL_1:.*]] = tensor.cast %[[VAL_0]] : tensor<1xf32> to tensor<?xf32>
 // CHECK: return %[[VAL_1]] : tensor<?xf32>
 func.func @test_reshape_0d_up_s2d_auto(%arg0: tensor<f32>) -> tensor<?xf32> {
@@ -26,7 +26,7 @@ func.func @test_reshape_0d_up_s2d_auto(%arg0: tensor<f32>) -> tensor<?xf32> {
 
 // CHECK-LABEL: test_reshape_0d_up_s2d_explicit
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<f32>
-// CHECK: %[[VAL_0:.*]] = tensor.expand_shape %[[ARG_0]] [] : tensor<f32> into tensor<1xf32>
+// CHECK: %[[VAL_0:.*]] = tensor.expand_shape %[[ARG_0]] [] output_shape [1] : tensor<f32> into tensor<1xf32>
 // CHECK: %[[VAL_1:.*]] = tensor.cast %[[VAL_0]] : tensor<1xf32> to tensor<?xf32>
 // CHECK: return %[[VAL_1]] : tensor<?xf32>
 func.func @test_reshape_0d_up_s2d_explicit(%arg0: tensor<f32>) -> tensor<?xf32> {
@@ -38,7 +38,7 @@ func.func @test_reshape_0d_up_s2d_explicit(%arg0: tensor<f32>) -> tensor<?xf32>
 
 // CHECK-LABEL: test_reshape_0d_up_s2s_auto
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<f32>
-// CHECK: %[[VAL_0:.*]] = tensor.expand_shape %[[ARG_0]] [] : tensor<f32> into tensor<1xf32>
+// CHECK: %[[VAL_0:.*]] = tensor.expand_shape %[[ARG_0]] [] output_shape [1] : tensor<f32> into tensor<1xf32>
 // CHECK: return %[[VAL_0]] : tensor<1xf32>
 func.func @test_reshape_0d_up_s2s_auto(%arg0: tensor<f32>) -> tensor<1xf32> {
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: -1>} : (tensor<f32>) -> tensor<1xf32>
@@ -49,7 +49,7 @@ func.func @test_reshape_0d_up_s2s_auto(%arg0: tensor<f32>) -> tensor<1xf32> {
 
 // CHECK-LABEL: test_reshape_0d_up_s2s_explicit
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<f32>
-// CHECK: %[[VAL_0:.*]] = tensor.expand_shape %[[ARG_0]] [] : tensor<f32> into tensor<1xf32>
+// CHECK: %[[VAL_0:.*]] = tensor.expand_shape %[[ARG_0]] [] output_shape [1] : tensor<f32> into tensor<1xf32>
 // CHECK: return %[[VAL_0]] : tensor<1xf32>
 func.func @test_reshape_0d_up_s2s_explicit(%arg0: tensor<f32>) -> tensor<1xf32> {
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 1>} : (tensor<f32>) -> tensor<1xf32>
@@ -83,8 +83,12 @@ func.func @test_reshape_1d_down_s2s_explicit(%arg0: tensor<1xf32>) -> tensor<f32
 
 // CHECK-LABEL: test_reshape_1d_up_d2d_auto
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<?xf32>
-// CHECK: %[[VAL_0:.*]] = tensor.expand_shape %[[ARG_0]] {{\[\[}}0, 1]] : tensor<?xf32> into tensor<2x?xf32>
-// CHECK: return %[[VAL_0]] : tensor<2x?xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM:.*]] = tensor.dim %arg0, %[[C0]] : tensor<?xf32>
+// CHECK: %[[C2:.*]] = arith.constant 2 : index
+// CHECK: %[[VAL_0:.*]] = arith.divui %[[DIM]], %[[C2]] : index
+// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[ARG_0]] {{\[\[}}0, 1]] output_shape [2, %[[VAL_0]]] : tensor<?xf32> into tensor<2x?xf32>
+// CHECK: return %[[EXPANDED]] : tensor<2x?xf32>
 func.func @test_reshape_1d_up_d2d_auto(%arg0: tensor<?xf32>) -> tensor<2x?xf32> {
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 2, -1>} : (tensor<?xf32>) -> tensor<2x?xf32>
   return %0 : tensor<2x?xf32>
@@ -94,7 +98,7 @@ func.func @test_reshape_1d_up_d2d_auto(%arg0: tensor<?xf32>) -> tensor<2x?xf32>
 
 // CHECK-LABEL: test_reshape_1d_up_s2s_explicit
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<6xf32>
-// CHECK: %[[VAL_0:.*]] = tensor.expand_shape %[[ARG_0]] {{\[\[}}0, 1]] : tensor<6xf32> into tensor<2x3xf32>
+// CHECK: %[[VAL_0:.*]] = tensor.expand_shape %[[ARG_0]] {{\[\[}}0, 1]] output_shape [2, 3] : tensor<6xf32> into tensor<2x3xf32>
 // CHECK: return %[[VAL_0]] : tensor<2x3xf32>
 func.func @test_reshape_1d_up_s2s_explicit(%arg0: tensor<6xf32>) -> tensor<2x3xf32> {
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 2, 3>} : (tensor<6xf32>) -> tensor<2x3xf32>
@@ -128,8 +132,12 @@ func.func @test_reshape_2d_down_s2s_explicit(%arg0: tensor<2x3xf32>) -> tensor<6
 // CHECK-LABEL: test_reshape_2d_same_d2d_auto
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<?x2xf32>
 // CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1]] : tensor<?x2xf32> into tensor<?xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1]] : tensor<?xf32> into tensor<2x?xf32>
-// CHECK: return %[[VAL_1]] : tensor<2x?xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor<?xf32>
+// CHECK: %[[C2:.*]] = arith.constant 2 : index
+// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C2]] : index
+// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1]] output_shape [2, %[[DIV]]] : tensor<?xf32> into tensor<2x?xf32>
+// CHECK: return %[[EXPANDED]] : tensor<2x?xf32>
 func.func @test_reshape_2d_same_d2d_auto(%arg0: tensor<?x2xf32>) -> tensor<2x?xf32> {
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 2, -1>} : (tensor<?x2xf32>) -> tensor<2x?xf32>
   return %0 : tensor<2x?xf32>
@@ -140,7 +148,7 @@ func.func @test_reshape_2d_same_d2d_auto(%arg0: tensor<?x2xf32>) -> tensor<2x?xf
 // CHECK-LABEL: test_reshape_2d_same_s2d_auto
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<2x4xf32>
 // CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1]] : tensor<2x4xf32> into tensor<8xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1]] : tensor<8xf32> into tensor<4x2xf32>
+// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1]] output_shape [4, 2] : tensor<8xf32> into tensor<4x2xf32>
 // CHECK: %[[VAL_2:.*]] = tensor.cast %[[VAL_1]] : tensor<4x2xf32> to tensor<?x2xf32>
 // CHECK: return %[[VAL_2]] : tensor<?x2xf32>
 func.func @test_reshape_2d_same_s2d_auto(%arg0: tensor<2x4xf32>) -> tensor<?x2xf32> {
@@ -153,7 +161,7 @@ func.func @test_reshape_2d_same_s2d_auto(%arg0: tensor<2x4xf32>) -> tensor<?x2xf
 // CHECK-LABEL: test_reshape_2d_same_s2d_explicit
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<2x4xf32>
 // CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1]] : tensor<2x4xf32> into tensor<8xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1]] : tensor<8xf32> into tensor<4x2xf32>
+// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1]] output_shape [4, 2] : tensor<8xf32> into tensor<4x2xf32>
 // CHECK: %[[VAL_2:.*]] = tensor.cast %[[VAL_1]] : tensor<4x2xf32> to tensor<?x2xf32>
 // CHECK: return %[[VAL_2]] : tensor<?x2xf32>
 func.func @test_reshape_2d_same_s2d_explicit(%arg0: tensor<2x4xf32>) -> tensor<?x2xf32> {
@@ -166,7 +174,7 @@ func.func @test_reshape_2d_same_s2d_explicit(%arg0: tensor<2x4xf32>) -> tensor<?
 // CHECK-LABEL: test_reshape_2d_same_s2s_explicit
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<3x2xf32>
 // CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1]] : tensor<3x2xf32> into tensor<6xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1]] : tensor<6xf32> into tensor<2x3xf32>
+// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1]] output_shape [2, 3] : tensor<6xf32> into tensor<2x3xf32>
 // CHECK: return %[[VAL_1]] : tensor<2x3xf32>
 func.func @test_reshape_2d_same_s2s_explicit(%arg0: tensor<3x2xf32>) -> tensor<2x3xf32> {
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 2, 3>} : (tensor<3x2xf32>) -> tensor<2x3xf32>
@@ -178,7 +186,11 @@ func.func @test_reshape_2d_same_s2s_explicit(%arg0: tensor<3x2xf32>) -> tensor<2
 // CHECK-LABEL: test_reshape_3d_same_d2d_auto_empty
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<3x2x?xf32>
 // CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2]] : tensor<3x2x?xf32> into tensor<?xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] : tensor<?xf32> into tensor<0x3x?xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor<?xf32>
+// CHECK: %[[C0_0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C0_0]] : index
+// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [0, 3, %[[DIV]]] : tensor<?xf32> into tensor<0x3x?xf32>
 // CHECK: %[[VAL_2:.*]] = tensor.cast %[[VAL_1]] : tensor<0x3x?xf32> to tensor<?x?x?xf32>
 // CHECK: return %[[VAL_2]] : tensor<?x?x?xf32>
 func.func @test_reshape_3d_same_d2d_auto_empty(%arg0: tensor<3x2x?xf32>) -> tensor<?x?x?xf32> {
@@ -191,7 +203,11 @@ func.func @test_reshape_3d_same_d2d_auto_empty(%arg0: tensor<3x2x?xf32>) -> tens
 // CHECK-LABEL: test_reshape_3d_same_d2d_auto
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<2x?x?xf32>
 // CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2]] : tensor<2x?x?xf32> into tensor<?xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] : tensor<?xf32> into tensor<2x?x4xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor<?xf32>
+// CHECK: %[[C8:.*]] = arith.constant 8 : index
+// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C8]] : index
+// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [2, %[[DIV]], 4] : tensor<?xf32> into tensor<2x?x4xf32>
 // CHECK: %[[VAL_2:.*]] = tensor.cast %[[VAL_1]] : tensor<2x?x4xf32> to tensor<?x?x?xf32>
 // CHECK: return %[[VAL_2]] : tensor<?x?x?xf32>
 func.func @test_reshape_3d_same_d2d_auto(%arg0: tensor<2x?x?xf32>) -> tensor<?x?x?xf32> {
@@ -204,7 +220,11 @@ func.func @test_reshape_3d_same_d2d_auto(%arg0: tensor<2x?x?xf32>) -> tensor<?x?
 // CHECK-LABEL: test_reshape_3d_same_d2d_auto_identity
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<?x3x4xf32>
 // CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2]] : tensor<?x3x4xf32> into tensor<?xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] : tensor<?xf32> into tensor<2x3x?xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor<?xf32>
+// CHECK: %[[C6:.*]] = arith.constant 6 : index
+// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C6]] : index
+// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [2, 3, %[[DIV]]] : tensor<?xf32> into tensor<2x3x?xf32>
 // CHECK: return %[[VAL_1]] : tensor<2x3x?xf32>
 func.func @test_reshape_3d_same_d2d_auto_identity(%arg0: tensor<?x3x4xf32>) -> tensor<2x3x?xf32> {
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 2, 3, -1>} : (tensor<?x3x4xf32>) -> tensor<2x3x?xf32>
@@ -216,8 +236,12 @@ func.func @test_reshape_3d_same_d2d_auto_identity(%arg0: tensor<?x3x4xf32>) -> t
 // CHECK-LABEL: test_reshape_3d_same_d2d_explicit_empty
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<3x2x?xf32>
 // CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2]] : tensor<3x2x?xf32> into tensor<?xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] : tensor<?xf32> into tensor<?x3x2xf32>
-// CHECK: %[[VAL_2:.*]] = tensor.cast %[[VAL_1]] : tensor<?x3x2xf32> to tensor<?x?x?xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor<?xf32>
+// CHECK: %[[C6:.*]] = arith.constant 6 : index
+// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C6]] : index
+// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [%[[DIV]], 3, 2] : tensor<?xf32> into tensor<?x3x2xf32>
+// CHECK: %[[VAL_2:.*]] = tensor.cast %[[EXPANDED]] : tensor<?x3x2xf32> to tensor<?x?x?xf32>
 // CHECK: return %[[VAL_2]] : tensor<?x?x?xf32>
 func.func @test_reshape_3d_same_d2d_explicit_empty(%arg0: tensor<3x2x?xf32>) -> tensor<?x?x?xf32> {
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 0, 3, 2>} : (tensor<3x2x?xf32>) -> tensor<?x?x?xf32>
@@ -229,8 +253,12 @@ func.func @test_reshape_3d_same_d2d_explicit_empty(%arg0: tensor<3x2x?xf32>) ->
 // CHECK-LABEL: test_reshape_3d_same_d2d_explicit
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
 // CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2]] : tensor<?x?x?xf32> into tensor<?xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] : tensor<?xf32> into tensor<?x3x4xf32>
-// CHECK: %[[VAL_2:.*]] = tensor.cast %[[VAL_1]] : tensor<?x3x4xf32> to tensor<?x?x?xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor<?xf32>
+// CHECK: %[[C12:.*]] = arith.constant 12 : index
+// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C12]] : index
+// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [%[[DIV]], 3, 4] : tensor<?xf32> into tensor<?x3x4xf32>
+// CHECK: %[[VAL_2:.*]] = tensor.cast %[[EXPANDED]] : tensor<?x3x4xf32> to tensor<?x?x?xf32>
 // CHECK: return %[[VAL_2]] : tensor<?x?x?xf32>
 func.func @test_reshape_3d_same_d2d_explicit(%arg0: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 2, 3, 4>} : (tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
@@ -253,8 +281,12 @@ func.func @test_reshape_3d_same_d2d_explicit_identity(%arg0: tensor<?x3x4xf32>)
 // CHECK-LABEL: test_reshape_3d_same_d2s_auto
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
 // CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2]] : tensor<?x?x?xf32> into tensor<?xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] : tensor<?xf32> into tensor<2x?x4xf32>
-// CHECK: %[[VAL_2:.*]] = tensor.cast %[[VAL_1]] : tensor<2x?x4xf32> to tensor<2x3x4xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor<?xf32>
+// CHECK: %[[C8:.*]] = arith.constant 8 : index
+// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C8]] : index
+// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [2, %[[DIV]], 4] : tensor<?xf32> into tensor<2x?x4xf32>
+// CHECK: %[[VAL_2:.*]] = tensor.cast %[[EXPANDED]] : tensor<2x?x4xf32> to tensor<2x3x4xf32>
 // CHECK: return %[[VAL_2]] : tensor<2x3x4xf32>
 func.func @test_reshape_3d_same_d2s_auto(%arg0: tensor<?x?x?xf32>) -> tensor<2x3x4xf32> {
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 2, -1, 4>} : (tensor<?x?x?xf32>) -> tensor<2x3x4xf32>
@@ -266,8 +298,12 @@ func.func @test_reshape_3d_same_d2s_auto(%arg0: tensor<?x?x?xf32>) -> tensor<2x3
 // CHECK-LABEL: test_reshape_3d_same_d2s_explicit
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
 // CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2]] : tensor<?x?x?xf32> into tensor<?xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] : tensor<?xf32> into tensor<?x3x4xf32>
-// CHECK: %[[VAL_2:.*]] = tensor.cast %[[VAL_1]] : tensor<?x3x4xf32> to tensor<2x3x4xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor<?xf32>
+// CHECK: %[[C12:.*]] = arith.constant 12 : index
+// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C12]] : index
+// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [%[[DIV]], 3, 4] : tensor<?xf32> into tensor<?x3x4xf32>
+// CHECK: %[[VAL_2:.*]] = tensor.cast %[[EXPANDED]] : tensor<?x3x4xf32> to tensor<2x3x4xf32>
 // CHECK: return %[[VAL_2]] : tensor<2x3x4xf32>
 func.func @test_reshape_3d_same_d2s_explicit(%arg0: tensor<?x?x?xf32>) -> tensor<2x3x4xf32> {
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 2, 3, 4>} : (tensor<?x?x?xf32>) -> tensor<2x3x4xf32>
@@ -288,10 +324,14 @@ func.func @test_reshape_3d_same_s2s_explicit_identity(%arg0: tensor<2x3x4xf32>)
 
 // CHECK-LABEL: test_reshape_3d_up_d2s_explicit
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
-// CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2]] : tensor<?x?x?xf32> into tensor<?xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2, 3]] : tensor<?xf32> into tensor<?x3x2x1xf32>
-// CHECK: %[[VAL_2:.*]] = tensor.cast %[[VAL_1]] : tensor<?x3x2x1xf32> to tensor<1x3x2x1xf32>
-// CHECK: return %[[VAL_2]] : tensor<1x3x2x1xf32>
+// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2]] : tensor<?x?x?xf32> into tensor<?xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM:.*]] = tensor.dim %[[COLLAPSED]], %[[C0]] : tensor<?xf32>
+// CHECK: %[[C6:.*]] = arith.constant 6 : index
+// CHECK: %[[VAL_0:.*]] = arith.divui %[[DIM]], %[[C6]] : index
+// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[COLLAPSED]] {{\[\[}}0, 1, 2, 3]] output_shape [%[[VAL_0]], 3, 2, 1] : tensor<?xf32> into tensor<?x3x2x1xf32>
+// CHECK: %[[CAST:.*]] = tensor.cast %[[EXPANDED]] : tensor<?x3x2x1xf32> to tensor<1x3x2x1xf32>
+// CHECK: return %[[CAST]] : tensor<1x3x2x1xf32>
 func.func @test_reshape_3d_up_d2s_explicit(%input: tensor<?x?x?xf32>) -> tensor<1x3x2x1xf32> {
   %0 = tosa.reshape %input {new_shape = array<i64: 1, 3, 2, 1>} : (tensor<?x?x?xf32>) -> tensor<1x3x2x1xf32>
   return %0 : tensor<1x3x2x1xf32>
@@ -313,9 +353,13 @@ func.func @test_reshape_4d_down_d2s_explicit(%arg0: tensor<?x?x?x?xf32>) -> tens
 
 // CHECK-LABEL: test_reshape_5d_down_d2d_auto
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<?x?x?x2x3xf32>
-// CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2, 3, 4]] : tensor<?x?x?x2x3xf32> into tensor<?xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] : tensor<?xf32> into tensor<?x2x3xf32>
-// CHECK: return %[[VAL_1]] : tensor<?x2x3xf32>
+// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2, 3, 4]] : tensor<?x?x?x2x3xf32> into tensor<?xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM:.*]] = tensor.dim %[[COLLAPSED]], %[[C0]] : tensor<?xf32>
+// CHECK: %[[C6:.*]] = arith.constant 6 : index
+// CHECK: %[[VAL_0:.*]] = arith.divui %[[DIM]], %[[C6]] : index
+// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[COLLAPSED]] {{\[\[}}0, 1, 2]] output_shape [%[[VAL_0]], 2, 3] : tensor<?xf32> into tensor<?x2x3xf32>
+// CHECK: return %[[EXPANDED]] : tensor<?x2x3xf32>
 func.func @test_reshape_5d_down_d2d_auto(%arg0: tensor<?x?x?x2x3xf32>) -> tensor<?x2x3xf32> {
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: -1, 2, 3>} : (tensor<?x?x?x2x3xf32>) -> tensor<?x2x3xf32>
   return %0 : tensor<?x2x3xf32>
@@ -325,9 +369,13 @@ func.func @test_reshape_5d_down_d2d_auto(%arg0: tensor<?x?x?x2x3xf32>) -> tensor
 
 // CHECK-LABEL: test_reshape_6d_down_d2d_auto
 // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<1x2x?x5x7x11xf32>
-// CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2, 3, 4, 5]] : tensor<1x2x?x5x7x11xf32> into tensor<?xf32>
-// CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] : tensor<?xf32> into tensor<?x5x77xf32>
-// CHECK: return %[[VAL_1]] : tensor<?x5x77xf32>
+// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2, 3, 4, 5]] : tensor<1x2x?x5x7x11xf32> into tensor<?xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM:.*]] = tensor.dim %[[COLLAPSED]], %[[C0]] : tensor<?xf32>
+// CHECK: %[[C385:.*]] = arith.constant 385 : index
+// CHECK: %[[VAL_0:.*]] = arith.divui %[[DIM]], %[[C385]] : index
+// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[COLLAPSED]] {{\[\[}}0, 1, 2]] output_shape [%[[VAL_0]], 5, 77] : tensor<?xf32> into tensor<?x5x77xf32>
+// CHECK: return %[[EXPANDED]] : tensor<?x5x77xf32>
 func.func @test_reshape_6d_down_d2d_auto(%arg0: tensor<1x2x?x5x7x11xf32>) -> tensor<?x5x77xf32> {
   %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: -1, 5, 77>} : (tensor<1x2x?x5x7x11xf32>) -> tensor<?x5x77xf32>
   return %0 : tensor<?x5x77xf32>
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index 1712d3d745b7..439f1e920e39 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -2517,7 +2517,7 @@ func.func @vector_interleave_1d(%a: vector<8xf32>, %b: vector<8xf32>) -> vector<
 //  CHECK-SAME:     %[[LHS:.*]]: vector<[4]xi32>, %[[RHS:.*]]: vector<[4]xi32>)
 func.func @vector_interleave_1d_scalable(%a: vector<[4]xi32>, %b: vector<[4]xi32>) -> vector<[8]xi32>
 {
-  // CHECK: %[[ZIP:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[LHS]], %[[RHS]]) : (vector<[4]xi32>, vector<[4]xi32>) -> vector<[8]xi32>
+  // CHECK: %[[ZIP:.*]] = "llvm.intr.vector.interleave2"(%[[LHS]], %[[RHS]]) : (vector<[4]xi32>, vector<[4]xi32>) -> vector<[8]xi32>
   // CHECK: return %[[ZIP]]
   %0 = vector.interleave %a, %b : vector<[4]xi32>
   return %0 : vector<[8]xi32>
@@ -2541,7 +2541,7 @@ func.func @vector_interleave_2d(%a: vector<2x3xi8>, %b: vector<2x3xi8>) -> vecto
 //  CHECK-SAME:     %[[LHS:.*]]: vector<2x[8]xi16>, %[[RHS:.*]]: vector<2x[8]xi16>)
 func.func @vector_interleave_2d_scalable(%a: vector<2x[8]xi16>, %b: vector<2x[8]xi16>) -> vector<2x[16]xi16>
 {
-  // CHECK: llvm.intr.experimental.vector.interleave2
+  // CHECK: llvm.intr.vector.interleave2
   // CHECK-NOT: vector.interleave {{.*}} : vector<2x[8]xi16>
   %0 = vector.interleave %a, %b : vector<2x[8]xi16>
   return %0 : vector<2x[16]xi16>
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index 79a318565e98..f7ce2123a93c 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -1223,6 +1223,28 @@ func.func @mulsiExtendedOneRhsSplat(%arg0: vector<3xi32>) -> (vector<3xi32>, vec
   return %low, %high : vector<3xi32>, vector<3xi32>
 }
 
+// CHECK-LABEL: @mulsiExtendedOneRhsI1
+//  CHECK-SAME:   (%[[ARG:.+]]: i1) -> (i1, i1)
+//  CHECK-NEXT:   %[[T:.+]]  = arith.constant true
+//  CHECK-NEXT:   %[[LOW:.+]], %[[HIGH:.+]] = arith.mulsi_extended %[[ARG]], %[[T]] : i1
+//  CHECK-NEXT:   return %[[LOW]], %[[HIGH]] : i1, i1
+func.func @mulsiExtendedOneRhsI1(%arg0: i1) -> (i1, i1) {
+  %one = arith.constant true
+  %low, %high = arith.mulsi_extended %arg0, %one: i1
+  return %low, %high : i1, i1
+}
+
+// CHECK-LABEL: @mulsiExtendedOneRhsSplatI1
+//  CHECK-SAME:   (%[[ARG:.+]]: vector<3xi1>) -> (vector<3xi1>, vector<3xi1>)
+//  CHECK-NEXT:   %[[TS:.+]]  = arith.constant dense<true> : vector<3xi1>
+//  CHECK-NEXT:   %[[LOW:.+]], %[[HIGH:.+]] = arith.mulsi_extended %[[ARG]], %[[TS]] : vector<3xi1>
+//  CHECK-NEXT:   return %[[LOW]], %[[HIGH]] : vector<3xi1>, vector<3xi1>
+func.func @mulsiExtendedOneRhsSplatI1(%arg0: vector<3xi1>) -> (vector<3xi1>, vector<3xi1>) {
+  %one = arith.constant dense<true> : vector<3xi1>
+  %low, %high = arith.mulsi_extended %arg0, %one: vector<3xi1>
+  return %low, %high : vector<3xi1>, vector<3xi1>
+}
+
 // CHECK-LABEL: @mulsiExtendedUnusedHigh
 //  CHECK-SAME:   (%[[ARG:.+]]: i32) -> i32
 //  CHECK-NEXT:   %[[RES:.+]] = arith.muli %[[ARG]], %[[ARG]] : i32
@@ -2809,6 +2831,87 @@ func.func @unsignedExtendConstantResource() -> tensor<i16> {
   return %ext : tensor<i16>
 }
 
+// CHECK-LABEL: @extsi_i0
+//       CHECK:   %[[ZERO:.*]] = arith.constant 0 : i16
+//       CHECK:   return %[[ZERO]] : i16
+func.func @extsi_i0() -> i16 {
+  %c0 = arith.constant 0 : i0
+  %extsi = arith.extsi %c0 : i0 to i16
+  return %extsi : i16
+}
+
+// CHECK-LABEL: @extui_i0
+//       CHECK:   %[[ZERO:.*]] = arith.constant 0 : i16
+//       CHECK:   return %[[ZERO]] : i16
+func.func @extui_i0() -> i16 {
+  %c0 = arith.constant 0 : i0
+  %extui = arith.extui %c0 : i0 to i16
+  return %extui : i16
+}
+
+// CHECK-LABEL: @trunc_i0
+//       CHECK:   %[[ZERO:.*]] = arith.constant 0 : i0
+//       CHECK:   return %[[ZERO]] : i0
+func.func @trunc_i0() -> i0 {
+  %cFF = arith.constant 0xFF : i8
+  %trunc = arith.trunci %cFF : i8 to i0
+  return %trunc : i0
+}
+
+// CHECK-LABEL: @shli_i0
+//       CHECK:   %[[ZERO:.*]] = arith.constant 0 : i0
+//       CHECK:   return %[[ZERO]] : i0
+func.func @shli_i0() -> i0 {
+  %c0 = arith.constant 0 : i0
+  %shli = arith.shli %c0, %c0 : i0
+  return %shli : i0
+}
+
+// CHECK-LABEL: @shrsi_i0
+//       CHECK:   %[[ZERO:.*]] = arith.constant 0 : i0
+//       CHECK:   return %[[ZERO]] : i0
+func.func @shrsi_i0() -> i0 {
+  %c0 = arith.constant 0 : i0
+  %shrsi = arith.shrsi %c0, %c0 : i0
+  return %shrsi : i0
+}
+
+// CHECK-LABEL: @shrui_i0
+//       CHECK:   %[[ZERO:.*]] = arith.constant 0 : i0
+//       CHECK:   return %[[ZERO]] : i0
+func.func @shrui_i0() -> i0 {
+  %c0 = arith.constant 0 : i0
+  %shrui = arith.shrui %c0, %c0 : i0
+  return %shrui : i0
+}
+
+// CHECK-LABEL: @maxsi_i0
+//       CHECK:   %[[ZERO:.*]] = arith.constant 0 : i0
+//       CHECK:   return %[[ZERO]] : i0
+func.func @maxsi_i0() -> i0 {
+  %c0 = arith.constant 0 : i0
+  %maxsi = arith.maxsi %c0, %c0 : i0
+  return %maxsi : i0
+}
+
+// CHECK-LABEL: @minsi_i0
+//       CHECK:   %[[ZERO:.*]] = arith.constant 0 : i0
+//       CHECK:   return %[[ZERO]] : i0
+func.func @minsi_i0() -> i0 {
+  %c0 = arith.constant 0 : i0
+  %minsi = arith.minsi %c0, %c0 : i0
+  return %minsi : i0
+}
+
+// CHECK-LABEL: @mulsi_extended_i0
+//       CHECK:   %[[ZERO:.*]] = arith.constant 0 : i0
+//       CHECK:   return %[[ZERO]], %[[ZERO]] : i0
+func.func @mulsi_extended_i0() -> (i0, i0) {
+  %c0 = arith.constant 0 : i0
+  %mulsi_extended:2 = arith.mulsi_extended %c0, %c0 : i0
+  return %mulsi_extended#0, %mulsi_extended#1 : i0, i0
+}
+
 {-#
   dialect_resources: {
     builtin: {
diff --git a/mlir/test/Dialect/Arith/expand-ops.mlir b/mlir/test/Dialect/Arith/expand-ops.mlir
index 6bed93e4c969..174eb468cc00 100644
--- a/mlir/test/Dialect/Arith/expand-ops.mlir
+++ b/mlir/test/Dialect/Arith/expand-ops.mlir
@@ -262,3 +262,51 @@ func.func @truncf_vector_f32(%arg0 : vector<4xf32>) -> vector<4xbf16> {
 
 // CHECK-LABEL: @truncf_vector_f32
 // CHECK-NOT: arith.truncf
+
+// -----
+
+func.func @maxsi(%a: i32, %b: i32) -> i32 {
+  %result = arith.maxsi %a, %b : i32
+  return %result : i32
+}
+// CHECK-LABEL: func @maxsi
+// CHECK-SAME: %[[LHS:.*]]: i32, %[[RHS:.*]]: i32
+// CHECK-NEXT: %[[CMP:.*]] = arith.cmpi sgt, %[[LHS]], %[[RHS]] : i32
+// CHECK-NEXT: %[[RESULT:.*]] = arith.select %[[CMP]], %[[LHS]], %[[RHS]] : i32
+// CHECK-NEXT: return %[[RESULT]] : i32
+
+// -----
+
+func.func @minsi(%a: i32, %b: i32) -> i32 {
+  %result = arith.minsi %a, %b : i32
+  return %result : i32
+}
+// CHECK-LABEL: func @minsi
+// CHECK-SAME: %[[LHS:.*]]: i32, %[[RHS:.*]]: i32
+// CHECK-NEXT: %[[CMP:.*]] = arith.cmpi slt, %[[LHS]], %[[RHS]] : i32
+// CHECK-NEXT: %[[RESULT:.*]] = arith.select %[[CMP]], %[[LHS]], %[[RHS]] : i32
+// CHECK-NEXT: return %[[RESULT]] : i32
+
+// -----
+
+func.func @maxui(%a: i32, %b: i32) -> i32 {
+  %result = arith.maxui %a, %b : i32
+  return %result : i32
+}
+// CHECK-LABEL: func @maxui
+// CHECK-SAME: %[[LHS:.*]]: i32, %[[RHS:.*]]: i32
+// CHECK-NEXT: %[[CMP:.*]] = arith.cmpi ugt, %[[LHS]], %[[RHS]] : i32
+// CHECK-NEXT: %[[RESULT:.*]] = arith.select %[[CMP]], %[[LHS]], %[[RHS]] : i32
+// CHECK-NEXT: return %[[RESULT]] : i32
+
+// -----
+
+func.func @minui(%a: i32, %b: i32) -> i32 {
+  %result = arith.minui %a, %b : i32
+  return %result : i32
+}
+// CHECK-LABEL: func @minui
+// CHECK-SAME: %[[LHS:.*]]: i32, %[[RHS:.*]]: i32
+// CHECK-NEXT: %[[CMP:.*]] = arith.cmpi ult, %[[LHS]], %[[RHS]] : i32
+// CHECK-NEXT: %[[RESULT:.*]] = arith.select %[[CMP]], %[[LHS]], %[[RHS]] : i32
+// CHECK-NEXT: return %[[RESULT]] : i32
diff --git a/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir b/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir
index de9de86003e6..01f54a4cf186 100644
--- a/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir
+++ b/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir
@@ -4,10 +4,10 @@
 // CHECK-SAME:    %[[A0:.*]]: vector<[4]xf16>, %[[B0:.*]]: vector<[4]xf16>, %[[A1:.*]]: vector<[4]xf16>, %[[B1:.*]]: vector<[4]xf16>,
 // CHECK-SAME:    %[[A0_MASK:.*]]: vector<[4]xi1>, %[[B0_MASK:.*]]: vector<[4]xi1>, %[[A1_MASK:.*]]: vector<[4]xi1>, %[[B1_MASK:.*]]: vector<[4]xi1>
 // CHECK-DAG: %[[ACC:.*]] = arith.constant dense<0.000000e+00> : vector<[4]x[4]xf32>
-// CHECK-DAG: %[[LHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0]], %[[A1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
-// CHECK-DAG: %[[RHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0]], %[[B1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
-// CHECK-DAG: %[[LHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0_MASK]], %[[A1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[RHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0_MASK]], %[[B1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[LHS:.*]] = "llvm.intr.vector.interleave2"(%[[A0]], %[[A1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+// CHECK-DAG: %[[RHS:.*]] = "llvm.intr.vector.interleave2"(%[[B0]], %[[B1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+// CHECK-DAG: %[[LHS_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[A0_MASK]], %[[A1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[RHS_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[B0_MASK]], %[[B1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
 // CHECK-DAG: arm_sme.fmopa_2way %[[LHS]], %[[RHS]] acc(%[[ACC]]) masks(%[[LHS_MASK]], %[[RHS_MASK]]) : vector<[8]xf16>, vector<[8]xf16> into vector<[4]x[4]xf32>
 func.func @outerproduct_add_widening_2way_f16f16f32(
     %a0 : vector<[4]xf16>, %b0 : vector<[4]xf16>,
@@ -225,18 +225,18 @@ func.func @outerproduct_sub_widening_2way_unsigned_i16i16i32(
 // CHECK-SAME:    %[[A2_MASK:[a-z0-9]+]]: vector<[4]xi1>, %[[B2_MASK:[a-z0-9]+]]: vector<[4]xi1>,
 // CHECK-SAME:    %[[A3_MASK:[a-z0-9]+]]: vector<[4]xi1>, %[[B3_MASK:[a-z0-9]+]]: vector<[4]xi1>
 // CHECK-DAG: %[[ACC:.*]] = arith.constant dense<0> : vector<[4]x[4]xi32>
-// CHECK-DAG: %[[LHS0:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0]], %[[A2]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
-// CHECK-DAG: %[[LHS1:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A1]], %[[A3]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
-// CHECK-DAG: %[[RHS0:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0]], %[[B2]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
-// CHECK-DAG: %[[RHS1:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B1]], %[[B3]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
-// CHECK-DAG: %[[LHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[LHS0]], %[[LHS1]]) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>
-// CHECK-DAG: %[[RHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[RHS0]], %[[RHS1]]) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>
-// CHECK-DAG: %[[LHS0_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0_MASK]], %[[A2_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[LHS1_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A1_MASK]], %[[A3_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[RHS0_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0_MASK]], %[[B2_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[RHS1_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B1_MASK]], %[[B3_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[LHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[LHS0_MASK]], %[[LHS1_MASK]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[16]xi1>
-// CHECK-DAG: %[[RHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[RHS0_MASK]], %[[RHS1_MASK]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[16]xi1>
+// CHECK-DAG: %[[LHS0:.*]] = "llvm.intr.vector.interleave2"(%[[A0]], %[[A2]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
+// CHECK-DAG: %[[LHS1:.*]] = "llvm.intr.vector.interleave2"(%[[A1]], %[[A3]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
+// CHECK-DAG: %[[RHS0:.*]] = "llvm.intr.vector.interleave2"(%[[B0]], %[[B2]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
+// CHECK-DAG: %[[RHS1:.*]] = "llvm.intr.vector.interleave2"(%[[B1]], %[[B3]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
+// CHECK-DAG: %[[LHS:.*]] = "llvm.intr.vector.interleave2"(%[[LHS0]], %[[LHS1]]) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>
+// CHECK-DAG: %[[RHS:.*]] = "llvm.intr.vector.interleave2"(%[[RHS0]], %[[RHS1]]) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>
+// CHECK-DAG: %[[LHS0_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[A0_MASK]], %[[A2_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[LHS1_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[A1_MASK]], %[[A3_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[RHS0_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[B0_MASK]], %[[B2_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[RHS1_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[B1_MASK]], %[[B3_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[LHS_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[LHS0_MASK]], %[[LHS1_MASK]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[16]xi1>
+// CHECK-DAG: %[[RHS_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[RHS0_MASK]], %[[RHS1_MASK]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[16]xi1>
 // CHECK-DAG: arm_sme.smopa_4way %[[LHS]], %[[RHS]] acc(%[[ACC]]) masks(%[[LHS_MASK]], %[[RHS_MASK]]) : vector<[16]xi8>, vector<[16]xi8> into vector<[4]x[4]xi32>
 func.func @outerproduct_add_widening_4way_signed_i8i8i32(
     %a0 : vector<[4]xi8>, %b0 : vector<[4]xi8>,
diff --git a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir
new file mode 100644
index 000000000000..2dedcb2fbc24
--- /dev/null
+++ b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir
@@ -0,0 +1,183 @@
+// RUN: mlir-opt %s -allocate-arm-sme-tiles -split-input-file -verify-diagnostics | FileCheck %s --check-prefix=CHECK-BAD
+
+// This file tests some aspects of liveness issues in the SME tile allocator.
+// These tests were designed with a new liveness-based tile allocator in mind
+// (where the names of test cases make more sense), with the current tile
+// allocator these tests all give incorrect results (which is documented by
+// `CHECK-BAD`).
+
+// Incorrect result! The second `move_vector_to_tile_slice` overwrites the first (which is still live).
+//
+// CHECK-BAD-LABEL: @constant_with_multiple_users
+// CHECK-BAD: %[[ZERO_TILE:.*]] = arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32>
+// CHECK-BAD: %[[INSERT_TILE_1:.*]] = arm_sme.move_vector_to_tile_slice %{{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
+// CHECK-BAD: %[[INSERT_TILE_0:.*]] = arm_sme.move_vector_to_tile_slice %{{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
+func.func @constant_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) {
+  %zero = arm_sme.zero : vector<[4]x[4]xf32>
+  %tile_a = arm_sme.move_vector_to_tile_slice %a, %zero, %index : vector<[4]xf32> into vector<[4]x[4]xf32>
+  %tile_b = arm_sme.move_vector_to_tile_slice %b, %zero, %index : vector<[4]xf32> into vector<[4]x[4]xf32>
+  "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
+  "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
+  return
+}
+
+// -----
+
+// (No tile IDs -- the current tile allocator ignores this case)
+
+// CHECK-BAD-LABEL: @value_with_multiple_users
+// CHECK-BAD-NOT: tile_id
+func.func @value_with_multiple_users(%tile: vector<[4]x[4]xf32>, %a: vector<[4]xf32>, %b: vector<[4]xf32>, %index: index) {
+  // A future allocator should error here (as `%tile` would need to be copied).
+  %tile_a = arm_sme.move_vector_to_tile_slice %a, %tile, %index : vector<[4]xf32> into vector<[4]x[4]xf32>
+  %tile_b = arm_sme.move_vector_to_tile_slice %b, %tile, %index : vector<[4]xf32> into vector<[4]x[4]xf32>
+  "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
+  "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
+  return
+}
+
+// -----
+
+// CHECK-BAD-LABEL: @reuse_tiles_after_initial_use
+func.func @reuse_tiles_after_initial_use() {
+  // CHECK-BAD: arm_sme.get_tile {tile_id = 0 : i32}
+  // CHECK-BAD: arm_sme.get_tile {tile_id = 1 : i32}
+  // CHECK-BAD: arm_sme.get_tile {tile_id = 2 : i32}
+  // CHECK-BAD: arm_sme.get_tile {tile_id = 3 : i32}
+  %tile_a = arm_sme.get_tile : vector<[4]x[4]xf32>
+  %tile_b = arm_sme.get_tile : vector<[4]x[4]xf32>
+  %tile_c = arm_sme.get_tile : vector<[4]x[4]xf32>
+  %tile_d = arm_sme.get_tile : vector<[4]x[4]xf32>
+  "test.dummy"(): () -> ()
+  "test.dummy"(): () -> ()
+  "test.dummy"(): () -> ()
+  "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
+  "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
+  "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> ()
+  "test.some_use"(%tile_d) : (vector<[4]x[4]xf32>) -> ()
+  // -> Spills after the fourth tile (unnecessary):
+  // CHECK-BAD: arm_sme.zero {tile_id = 16 : i32}
+  // CHECK-BAD: arm_sme.zero {tile_id = 17 : i32}
+  // CHECK-BAD: arm_sme.zero {tile_id = 18 : i32}
+  // CHECK-BAD: arm_sme.zero {tile_id = 19 : i32}
+  // Unnecessary spills:
+  // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}}
+  %tile_1 = arm_sme.zero : vector<[4]x[4]xf32>
+  // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}}
+  %tile_2 = arm_sme.zero : vector<[4]x[4]xf32>
+  // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}}
+  %tile_3 = arm_sme.zero : vector<[4]x[4]xf32>
+  // expected-warning @below {{failed to allocate SME virtual tile to operation, all tile operations will go through memory, expect degraded performance}}
+  %tile_4 = arm_sme.zero : vector<[4]x[4]xf32>
+  "test.dummy"(): () -> ()
+  "test.dummy"(): () -> ()
+  "test.dummy"(): () -> ()
+  "test.some_use"(%tile_1) : (vector<[4]x[4]xf32>) -> ()
+  "test.some_use"(%tile_2) : (vector<[4]x[4]xf32>) -> ()
+  "test.some_use"(%tile_3) : (vector<[4]x[4]xf32>) -> ()
+  "test.some_use"(%tile_4) : (vector<[4]x[4]xf32>) -> ()
+  return
+}
+
+// -----
+
+// Incorrect result! Both branches should yield the result via the same tile.
+//
+// CHECK-BAD-LABEL: @non_overlapping_branches
+// CHECK-BAD: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32>
+// CHECK-BAD: arm_sme.get_tile {tile_id = 1 : i32} : vector<[4]x[4]xf32>
+func.func @non_overlapping_branches(%cond: i1) {
+  %tile = scf.if %cond -> vector<[4]x[4]xf32> {
+    %zero = arm_sme.zero : vector<[4]x[4]xf32>
+    scf.yield %zero : vector<[4]x[4]xf32>
+  } else {
+    %undef = arm_sme.get_tile : vector<[4]x[4]xf32>
+    scf.yield %undef : vector<[4]x[4]xf32>
+  }
+  "test.some_use"(%tile) : (vector<[4]x[4]xf32>) -> ()
+  return
+}
+
+// -----
+
+// Incorrect result! Everything assigned to tile 0 (which means values that are still live are overwritten).
+//
+// CHECK-BAD-LABEL: @constant_loop_init_with_multiple_users
+// CHECK-BAD: arm_sme.zero {tile_id = 0 : i32} : vector<[4]x[4]xf32>
+// CHECK-BAD: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
+// CHECK-BAD: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
+func.func @constant_loop_init_with_multiple_users(%a: vector<[4]xf32>, %b: vector<[4]xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %init = arm_sme.zero : vector<[4]x[4]xf32>
+  %tile_a = scf.for %i = %c0 to %c10 step %c1 iter_args(%iter = %init) -> vector<[4]x[4]xf32> {
+    %new_tile = arm_sme.move_vector_to_tile_slice %a, %iter, %i : vector<[4]xf32> into vector<[4]x[4]xf32>
+    scf.yield %new_tile : vector<[4]x[4]xf32>
+  }
+  %tile_b = scf.for %i = %c0 to %c10 step %c1 iter_args(%iter = %init) -> vector<[4]x[4]xf32> {
+    %new_tile = arm_sme.move_vector_to_tile_slice %a, %iter, %i : vector<[4]xf32> into vector<[4]x[4]xf32>
+    scf.yield %new_tile : vector<[4]x[4]xf32>
+  }
+  "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
+  "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
+  return
+}
+
+// -----
+
+// Incorrect result! Everything assigned to tile 0 (which means values that are still live are overwritten).
+//
+// CHECK-BAD-LABEL: @run_out_of_tiles_but_avoid_spill
+// CHECK-BAD: arm_sme.zero {tile_id = 0 : i32}
+// CHECK-BAD-COUNT-4: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 0 : i32} : vector<[4]xf32> into vector<[4]x[4]xf32>
+func.func @run_out_of_tiles_but_avoid_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<[4]xf32>, %d: vector<[4]xf32>) {
+  %init = arm_sme.zero : vector<[4]x[4]xf32>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  scf.for %i = %c0 to %c10 step %c1 {
+    %tile_a, %tile_b, %tile_c, %tile_d = scf.for %j = %c0 to %c10 step %c1
+      iter_args(%iter_a = %init, %iter_b = %init, %iter_c = %init, %iter_d = %init)
+        -> (vector<[4]x[4]xf32>, vector<[4]x[4]xf32> , vector<[4]x[4]xf32> , vector<[4]x[4]xf32>) {
+        %new_a = arm_sme.move_vector_to_tile_slice %a, %iter_a, %i : vector<[4]xf32> into vector<[4]x[4]xf32>
+        %new_b = arm_sme.move_vector_to_tile_slice %b, %iter_b, %i : vector<[4]xf32> into vector<[4]x[4]xf32>
+        %new_c = arm_sme.move_vector_to_tile_slice %c, %iter_c, %i : vector<[4]xf32> into vector<[4]x[4]xf32>
+        %new_d = arm_sme.move_vector_to_tile_slice %d, %iter_d, %i : vector<[4]xf32> into vector<[4]x[4]xf32>
+        scf.yield %new_a, %new_b, %new_c, %new_d : vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>, vector<[4]x[4]xf32>
+    }
+    "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
+    "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
+    "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> ()
+    "test.some_use"(%tile_d) : (vector<[4]x[4]xf32>) -> ()
+  }
+  return
+}
+
+// -----
+
+// Incorrect result! Everything other than zero assigned to tile 1 (which means values that are still live are overwritten).
+//
+// CHECK-BAD-LABEL: @avoidable_spill
+// CHECK-BAD: arm_sme.zero {tile_id = 0 : i32}
+// CHECK-BAD: arm_sme.get_tile {tile_id = 1 : i32}
+// CHECK-BAD-COUNT-4: arm_sme.move_vector_to_tile_slice {{.*}} {tile_id = 1 : i32}
+func.func @avoidable_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<[4]xf32>, %d: vector<[4]xf32>) {
+  %zero = arm_sme.zero : vector<[4]x[4]xf32>
+  %tile = arm_sme.get_tile : vector<[4]x[4]xf32>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  scf.for %i = %c0 to %c10 step %c1 {
+    "test.some_use"(%zero) : (vector<[4]x[4]xf32>) -> ()
+    %tile_a = arm_sme.move_vector_to_tile_slice %a, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32>
+    %tile_b = arm_sme.move_vector_to_tile_slice %b, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32>
+    %tile_c = arm_sme.move_vector_to_tile_slice %c, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32>
+    %tile_d = arm_sme.move_vector_to_tile_slice %d, %tile, %c0 : vector<[4]xf32> into vector<[4]x[4]xf32>
+    "test.some_use"(%tile_a) : (vector<[4]x[4]xf32>) -> ()
+    "test.some_use"(%tile_b) : (vector<[4]x[4]xf32>) -> ()
+    "test.some_use"(%tile_c) : (vector<[4]x[4]xf32>) -> ()
+    "test.some_use"(%tile_d) : (vector<[4]x[4]xf32>) -> ()
+  }
+  return
+}
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
index 9a3e14b6d391..efe59af97d96 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
@@ -132,7 +132,7 @@ func.func @shape_mismatch(%t: tensor<5x6x128xf32>) -> tensor<5x6x128xf32> {
   %cst = arith.constant 8.0 : f32
   %0 = tensor.empty() : tensor<128xf32>
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<128xf32>) -> tensor<128xf32>
-  %2 = tensor.expand_shape %1 [[0, 1, 2]]
+  %2 = tensor.expand_shape %1 [[0, 1, 2]] output_shape [1, 1, 128]
       : tensor<128xf32> into tensor<1x1x128xf32>
   %3 = tensor.insert_slice %2 into %t[2, 3, 0][1, 1, 128][1, 1, 1]
       : tensor<1x1x128xf32> into tensor<5x6x128xf32>
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 601add9a9f91..5e4724c9d309 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -54,14 +54,43 @@ func.func @launch() {
 // CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x
 // CHECK-NEXT: = gpu.block_dim y
 // CHECK-NEXT: = gpu.block_dim z
-// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
-// CHECK-NEXT: ^[[BLOCK]]:
 // CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
 // CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> ()
 // CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
 
 // -----
 
+// Verify that we can outline a CFG
+// CHECK-LABEL:  gpu.func @launchCFG_kernel(
+// CHECK: cf.br
+// CHECK: gpu.return
+func.func @launchCFG() {
+  %0 = "op"() : () -> (f32)
+  %1 = "op"() : () -> (memref<?xf32, 1>)
+  %gDimX = arith.constant 8 : index
+  %gDimY = arith.constant 12 : index
+  %gDimZ = arith.constant 16 : index
+  %bDimX = arith.constant 20 : index
+  %bDimY = arith.constant 24 : index
+  %bDimZ = arith.constant 28 : index
+
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
+                                       %grid_z = %gDimZ)
+             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
+                                        %block_z = %bDimZ) {
+    "use"(%0): (f32) -> ()
+    cf.br ^bb1
+  ^bb1:
+    "some_op"(%bx, %block_x) : (index, index) -> ()
+    %42 = memref.load %1[%tx] : memref<?xf32, 1>
+    gpu.terminator
+  }
+  return
+}
+
+
+// -----
+
 // This test checks gpu-out-lining can handle gpu.launch kernel from an llvm.func
 // CHECK-LABEL: @launch_from_llvm_func
 llvm.func @launch_from_llvm_func() {
@@ -475,8 +504,6 @@ func.func @launch_cluster() {
 // CHECK-NEXT: %[[CDIM:.*]] = gpu.cluster_dim x
 // CHECK-NEXT: = gpu.cluster_dim y
 // CHECK-NEXT: = gpu.cluster_dim z
-// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
-// CHECK-NEXT: ^[[BLOCK]]:
 // CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
 // CHECK-NEXT: "some_op"(%[[CID]], %[[BID]], %[[BDIM]]) : (index, index, index) -> ()
 // CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
diff --git a/mlir/test/Dialect/LLVMIR/constant-folding.mlir b/mlir/test/Dialect/LLVMIR/constant-folding.mlir
index f800f2690467..454126321eb9 100644
--- a/mlir/test/Dialect/LLVMIR/constant-folding.mlir
+++ b/mlir/test/Dialect/LLVMIR/constant-folding.mlir
@@ -51,3 +51,53 @@ llvm.func @or_basic() -> i32 {
   // CHECK: llvm.return %[[RES]] : i32
   llvm.return %2 : i32
 }
+
+// -----
+
+// CHECK-LABEL: llvm.func @addressof
+llvm.func @addressof() {
+  // CHECK-NEXT: %[[ADDRESSOF:.+]] = llvm.mlir.addressof @foo
+  %0 = llvm.mlir.addressof @foo : !llvm.ptr
+  %1 = llvm.mlir.addressof @foo : !llvm.ptr
+  // CHECK-NEXT: llvm.call @bar(%[[ADDRESSOF]], %[[ADDRESSOF]])
+  llvm.call @bar(%0, %1) : (!llvm.ptr, !llvm.ptr) -> ()
+  // CHECK-NEXT: llvm.return
+  llvm.return
+}
+
+llvm.mlir.global constant @foo() : i32
+
+llvm.func @bar(!llvm.ptr, !llvm.ptr)
+
+// -----
+
+// CHECK-LABEL: llvm.func @addressof_select
+llvm.func @addressof_select(%arg: i1) -> !llvm.ptr {
+  // CHECK-NEXT: %[[ADDRESSOF:.+]] = llvm.mlir.addressof @foo
+  %0 = llvm.mlir.addressof @foo : !llvm.ptr
+  %1 = llvm.mlir.addressof @foo : !llvm.ptr
+  %2 = arith.select %arg, %0, %1 : !llvm.ptr
+  // CHECK-NEXT: llvm.return %[[ADDRESSOF]]
+  llvm.return %2 : !llvm.ptr
+}
+
+llvm.mlir.global constant @foo() : i32
+
+llvm.func @bar(!llvm.ptr, !llvm.ptr)
+
+// -----
+
+// CHECK-LABEL: llvm.func @addressof_blocks
+llvm.func @addressof_blocks(%arg: i1) -> !llvm.ptr {
+  // CHECK-NEXT: %[[ADDRESSOF:.+]] = llvm.mlir.addressof @foo
+  llvm.cond_br %arg, ^bb1, ^bb2
+^bb1:
+  %0 = llvm.mlir.addressof @foo : !llvm.ptr
+  llvm.return %0 : !llvm.ptr
+^bb2:
+  %1 = llvm.mlir.addressof @foo : !llvm.ptr
+  // CHECK: return %[[ADDRESSOF]]
+  llvm.return %1 : !llvm.ptr
+}
+
+llvm.mlir.global constant @foo() : i32
diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir
index de1ab9db8e8d..0914f0023210 100644
--- a/mlir/test/Dialect/LLVMIR/invalid.mlir
+++ b/mlir/test/Dialect/LLVMIR/invalid.mlir
@@ -1221,17 +1221,17 @@ func.func @extract_scalable_from_fixed_length_vector(%arg0 : vector<16xf32>) {
 
 // -----
 
-func.func @experimental_vector_interleave2_bad_type0(%vec1: vector<[2]xf16>, %vec2 : vector<[4]xf16>) {
+func.func @vector_interleave2_bad_type0(%vec1: vector<[2]xf16>, %vec2 : vector<[4]xf16>) {
   // expected-error@+1 {{op failed to verify that all of {vec1, vec2} have same type}}
-  %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+  %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
   return
 }
 
 // -----
 
-func.func @experimental_vector_interleave2_bad_type1(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
+func.func @vector_interleave2_bad_type1(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
   // expected-error@+1 {{op failed to verify that result has twice as many elements as 'vec1'}}
-  %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<[8]xf16>
+  %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<[8]xf16>
   return
 }
 
@@ -1239,9 +1239,9 @@ func.func @experimental_vector_interleave2_bad_type1(%vec1: vector<[2]xf16>, %ve
 
 /// result vector type is not scalable.
 
-func.func @experimental_vector_interleave2_bad_type2(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
+func.func @vector_interleave2_bad_type2(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
   // expected-error@+1 {{op failed to verify that result has twice as many elements as 'vec1'}}
-  %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<4xf16>
+  %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<4xf16>
   return
 }
 
@@ -1250,9 +1250,9 @@ func.func @experimental_vector_interleave2_bad_type2(%vec1: vector<[2]xf16>, %ve
 
 /// element type doesn't match.
 
-func.func @experimental_vector_interleave2_bad_type3(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
+func.func @vector_interleave2_bad_type3(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
   // expected-error@+1 {{op failed to verify that result has twice as many elements as 'vec1'}}
-  %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<[4]xf32>
+  %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<[4]xf32>
   return
 }
 
diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
index 31acf2b95e46..3b94db389f54 100644
--- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir
+++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
@@ -342,10 +342,10 @@ func.func @mixed_vect(%arg0: vector<8xf32>, %arg1: vector<4xf32>, %arg2: vector<
   return
 }
 
-// CHECK-LABEL: @experimental_vector_interleave2
-func.func @experimental_vector_interleave2(%vec1: vector<[4]xf16>, %vec2 : vector<[4]xf16>) {
-  // CHECK: = "llvm.intr.experimental.vector.interleave2"({{.*}}) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
-  %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+// CHECK-LABEL: @vector_interleave2
+func.func @vector_interleave2(%vec1: vector<[4]xf16>, %vec2 : vector<[4]xf16>) {
+  // CHECK: = "llvm.intr.vector.interleave2"({{.*}}) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+  %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
   return
 }
 
diff --git a/mlir/test/Dialect/Linalg/bubble-up-extract-slice-op.mlir b/mlir/test/Dialect/Linalg/bubble-up-extract-slice-op.mlir
index 0e353a1fa43f..4bf81820f0e8 100644
--- a/mlir/test/Dialect/Linalg/bubble-up-extract-slice-op.mlir
+++ b/mlir/test/Dialect/Linalg/bubble-up-extract-slice-op.mlir
@@ -165,7 +165,9 @@ func.func @rank_reducing_slice(%width : index) -> tensor<1x1x1x?xf32> {
   %init = tensor.empty(%width) : tensor<1x?xf32>
   %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x?xf32>) -> tensor<1x?xf32>
   %slice = tensor.extract_slice %fill[0, 0] [1, %width] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
-  %expand = tensor.expand_shape %slice [[0, 1, 2, 3]] : tensor<?xf32> into tensor<1x1x1x?xf32>
+  %c0 = arith.constant 0 : index
+  %sz0 = tensor.dim %slice, %c0 : tensor<?xf32>
+  %expand = tensor.expand_shape %slice [[0, 1, 2, 3]] output_shape [1, 1, 1, %sz0] : tensor<?xf32> into tensor<1x1x1x?xf32>
   return %expand : tensor<1x1x1x?xf32>
 }
 
diff --git a/mlir/test/Dialect/Linalg/collapse-dim.mlir b/mlir/test/Dialect/Linalg/collapse-dim.mlir
index 547320f53387..61bedecbdca5 100644
--- a/mlir/test/Dialect/Linalg/collapse-dim.mlir
+++ b/mlir/test/Dialect/Linalg/collapse-dim.mlir
@@ -52,7 +52,7 @@ func.func @collapse_parallel(
 //  CHECK-SAME:     iterator_types = ["parallel", "parallel", "parallel"]}
 //  CHECK-SAME:     ins(%[[S]] : tensor<32x2x40960xf32>) outs(%[[D]] : tensor<2x32x40960xf32>) {
 //       CHECK:   } -> tensor<2x32x40960xf32>
-//       CHECK:  tensor.expand_shape %[[R]] {{\[}}[0], [1], [2, 3]] : tensor<2x32x40960xf32> into tensor<2x32x10x4096xf32>
+//       CHECK:  tensor.expand_shape %[[R]] {{\[}}[0], [1], [2, 3]] output_shape [2, 32, 10, 4096] : tensor<2x32x40960xf32> into tensor<2x32x10x4096xf32>
 
 // -----
 
@@ -127,8 +127,8 @@ func.func @uncollapsable_strided_memref(%arg0: memref<2x6x24x48xi32>, %arg1: mem
 // CHECK:           %[[VAL_4:.*]] = tensor.collapse_shape %[[VAL_2]] {{\[\[}}0], [1], [2, 3]] : tensor<1x2x12x5xf32> into tensor<1x2x60xf32>
 // CHECK:           %[[VAL_5:.*]] = tensor.collapse_shape %[[VAL_3]] {{\[\[}}0], [1], [2, 3]] : tensor<1x2x12x5xf32> into tensor<1x2x60xf32>
 // CHECK:           %[[VAL_6:.*]] = linalg.copy ins(%[[VAL_4]] : tensor<1x2x60xf32>) outs(%[[VAL_5]] : tensor<1x2x60xf32>) -> tensor<1x2x60xf32>
-// CHECK:           %[[VAL_7:.*]] = tensor.expand_shape %[[VAL_6]] {{\[\[}}0], [1], [2, 3]] : tensor<1x2x60xf32> into tensor<1x2x12x5xf32>
-// CHECK:           %[[VAL_8:.*]] = tensor.expand_shape %[[VAL_7]] {{\[\[}}0], [1], [2, 3], [4]] : tensor<1x2x12x5xf32> into tensor<1x2x3x4x5xf32, 3 : i64>
+// CHECK:           %[[VAL_7:.*]] = tensor.expand_shape %[[VAL_6]] {{\[\[}}0], [1], [2, 3]] output_shape [1, 2, 12, 5] : tensor<1x2x60xf32> into tensor<1x2x12x5xf32>
+// CHECK:           %[[VAL_8:.*]] = tensor.expand_shape %[[VAL_7]] {{\[\[}}0], [1], [2, 3], [4]] output_shape [1, 2, 3, 4, 5] : tensor<1x2x12x5xf32> into tensor<1x2x3x4x5xf32, 3 : i64>
 // CHECK:           return %[[VAL_8]] : tensor<1x2x3x4x5xf32, 3 : i64>
 // CHECK:         }
 
diff --git a/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir b/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir
index a64319963531..c7c846d7ecc9 100644
--- a/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir
+++ b/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir
@@ -50,7 +50,7 @@ module attributes {transform.with_named_sequence} {
 // CHECK: linalg.yield %[[EXTRACTED_INPUT]] : f32
 
 // CHECK: IR printer: transformed
-// CHECK: tensor.expand_shape %{{[^ ]*}} {{\[}}[0], [1, 2], [3]] : tensor<1x196x16xf32> into tensor<1x14x14x16xf32>
+// CHECK: tensor.expand_shape %{{[^ ]*}} {{\[}}[0], [1, 2], [3]] output_shape [1, 14, 14, 16] : tensor<1x196x16xf32> into tensor<1x14x14x16xf32>
 
 // CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
@@ -78,7 +78,7 @@ module attributes {transform.with_named_sequence} {
 //                CHECK:     %[[ADD:.+]] = arith.addf %[[MUL]], %[[ARG2]] : f32
 //                CHECK:     linalg.yield %[[ADD]] : f32
 //                CHECK: } -> tensor<1x196x16xf32>
-//      CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0], [1, 2], [3]] : tensor<1x196x16xf32> into tensor<1x14x14x16xf32>
+//      CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0], [1, 2], [3]] output_shape [1, 14, 14, 16] : tensor<1x196x16xf32> into tensor<1x14x14x16xf32>
 //      CHECK: return %[[RESULT]]
 
 func.func @conv_16433136(%arg0: tensor<1x16x16x4xf32>, %arg1: tensor<3x3x4x16xf32>, %arg2: tensor<1x14x14x16xf32>) -> tensor<1x14x14x16xf32> {
@@ -204,7 +204,7 @@ module attributes {transform.with_named_sequence} {
 //      CHECK:     %[[ADD:.+]] = arith.addf %[[MUL]], %[[ARG2]] : f32
 //      CHECK:     linalg.yield %[[ADD]] : f32
 //      CHECK:   } -> tensor<8x196x16xf32>
-//      CHECK:   %[[CS_FINAL:.+]] = tensor.expand_shape %[[MATMUL]] {{\[}}[0], [1, 2], [3]] : tensor<8x196x16xf32> into tensor<8x14x14x16xf32>
+//      CHECK:   %[[CS_FINAL:.+]] = tensor.expand_shape %[[MATMUL]] {{\[}}[0], [1, 2], [3]] output_shape [8, 14, 14, 16] : tensor<8x196x16xf32> into tensor<8x14x14x16xf32>
 //      CHECK:   return %[[CS_FINAL]]
 func.func @batch_nhwc_conv(%arg0: tensor<8x16x16x4xf32>, %arg1: tensor<3x3x4x16xf32>, %arg2: tensor<8x14x14x16xf32>) -> tensor<8x14x14x16xf32> {
     %0 = linalg.conv_2d_nhwc_hwcf
@@ -269,7 +269,7 @@ module attributes {transform.with_named_sequence} {
 //      CHECK:     %[[ADD:.+]] = arith.addf %[[MUL]], %[[ARG2]] : f32
 //      CHECK:     linalg.yield %[[ADD]] : f32
 //      CHECK:   } -> tensor<8x16x196xf32>
-//      CHECK:   %[[CS_FINAL:.+]] = tensor.expand_shape %[[MATMUL]] {{\[}}[0], [1], [2, 3]] : tensor<8x16x196xf32> into tensor<8x16x14x14xf32>
+//      CHECK:   %[[CS_FINAL:.+]] = tensor.expand_shape %[[MATMUL]] {{\[}}[0], [1], [2, 3]] output_shape [8, 16, 14, 14] : tensor<8x16x196xf32> into tensor<8x16x14x14xf32>
 //      CHECK:   return %[[CS_FINAL]]
 func.func @batch_nchw_conv(%arg0: tensor<8x4x16x16xf32>, %arg1: tensor<16x4x3x3xf32>, %arg2: tensor<8x16x14x14xf32>) -> tensor<8x16x14x14xf32> {
     %0 = linalg.conv_2d_nchw_fchw
@@ -310,7 +310,7 @@ module attributes {transform.with_named_sequence} {
 // CHECK: linalg.yield %[[EXTRACTED_INPUT]] : f32
 
 // CHECK: IR printer: transformed
-// CHECK: tensor.expand_shape %{{[^ ]*}} {{\[}}[0], [1, 2], [3]] : tensor<1x196x16xf32> into tensor<1x14x14x16xf32>
+// CHECK: tensor.expand_shape %{{[^ ]*}} {{\[}}[0], [1, 2], [3]] output_shape [1, 14, 14, 16] : tensor<1x196x16xf32> into tensor<1x14x14x16xf32>
 
 // CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
@@ -338,7 +338,7 @@ module attributes {transform.with_named_sequence} {
 //                CHECK:     %[[ADD:.+]] = arith.addf %[[MUL]], %[[ARG2]] : f32
 //                CHECK:     linalg.yield %[[ADD]] : f32
 //                CHECK: } -> tensor<1x196x16xf32>
-//      CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0], [1, 2], [3]] : tensor<1x196x16xf32> into tensor<1x14x14x16xf32>
+//      CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0], [1, 2], [3]] output_shape [1, 14, 14, 16] : tensor<1x196x16xf32> into tensor<1x14x14x16xf32>
 //      CHECK: return %[[RESULT]]
 
 func.func @conv_2d_nhwc_fhwc(%arg0: tensor<1x16x16x4xf32>, %arg1: tensor<16x3x3x4xf32>, %arg2: tensor<1x14x14x16xf32>) -> tensor<1x14x14x16xf32> {
@@ -378,7 +378,7 @@ module attributes {transform.with_named_sequence} {
 //                CHECK:     %[[ADD:.+]] = arith.addi %[[MUL]], %[[ARG2]] : i32
 //                CHECK:     linalg.yield %[[ADD]] : i32
 //                CHECK: } -> tensor<1x196x16xi32>
-//      CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0], [1, 2], [3]] : tensor<1x196x16xi32> into tensor<1x14x14x16xi32>
+//      CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0], [1, 2], [3]] output_shape [1, 14, 14, 16] : tensor<1x196x16xi32> into tensor<1x14x14x16xi32>
 //      CHECK: return %[[RESULT]]
 
 func.func @conv_integer_extend(%arg0: tensor<1x16x16x4xi8>, %arg1: tensor<3x3x4x16xi8>, %arg2: tensor<1x14x14x16xi32>) -> tensor<1x14x14x16xi32> {
@@ -416,7 +416,7 @@ module attributes {transform.with_named_sequence} {
 //                CHECK:     %[[ADD:.+]] = complex.add %[[MUL]], %[[ARG2]] : complex<f32>
 //                CHECK:     linalg.yield %[[ADD]] : complex<f32>
 //                CHECK: } -> tensor<1x196x16xcomplex<f32>>
-//      CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0], [1, 2], [3]] : tensor<1x196x16xcomplex<f32>> into tensor<1x14x14x16xcomplex<f32>>
+//      CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0], [1, 2], [3]] output_shape [1, 14, 14, 16] : tensor<1x196x16xcomplex<f32>> into tensor<1x14x14x16xcomplex<f32>>
 //      CHECK: return %[[RESULT]]
 
 func.func @conv_complex(%arg0: tensor<1x16x16x4xcomplex<f32>>, %arg1: tensor<3x3x4x16xcomplex<f32>>, %arg2: tensor<1x14x14x16xcomplex<f32>>) -> tensor<1x14x14x16xcomplex<f32>> {
@@ -459,7 +459,7 @@ module attributes {transform.with_named_sequence} {
 //                CHECK:     %[[ADD:.+]] = complex.add %[[MUL]], %[[ARG2]] : complex<f32>
 //                CHECK:     linalg.yield %[[ADD]] : complex<f32>
 //                CHECK: } -> tensor<1x196x16xcomplex<f32>>
-//      CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0], [1, 2], [3]] : tensor<1x196x16xcomplex<f32>> into tensor<1x14x14x16xcomplex<f32>>
+//      CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0], [1, 2], [3]] output_shape [1, 14, 14, 16] : tensor<1x196x16xcomplex<f32>> into tensor<1x14x14x16xcomplex<f32>>
 //      CHECK: return %[[RESULT]]
 
 func.func @conv_complex_extended(%arg0: tensor<1x16x16x4xcomplex<f32>>, %arg1: tensor<3x3x4x16xcomplex<f16>>, %arg2: tensor<1x14x14x16xcomplex<f32>>) -> tensor<1x14x14x16xcomplex<f32>> {
@@ -500,7 +500,7 @@ module attributes {transform.with_named_sequence} {
 //                CHECK:     %[[ADD:.+]] = complex.add %[[MUL]], %[[ARG2]] : complex<f32>
 //                CHECK:     linalg.yield %[[ADD]] : complex<f32>
 //                CHECK: } -> tensor<1x196x16xcomplex<f32>>
-//      CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0], [1, 2], [3]] : tensor<1x196x16xcomplex<f32>> into tensor<1x14x14x16xcomplex<f32>>
+//      CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0], [1, 2], [3]] output_shape [1, 14, 14, 16] : tensor<1x196x16xcomplex<f32>> into tensor<1x14x14x16xcomplex<f32>>
 //      CHECK: return %[[RESULT]]
 
 func.func @conv_complex_f16_extended(%arg0: tensor<1x16x16x4xcomplex<f32>>, %arg1: tensor<3x3x4x16xf16>, %arg2: tensor<1x14x14x16xcomplex<f32>>) -> tensor<1x14x14x16xcomplex<f32>> {
diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
index 79d61ab757e3..bee08503298f 100644
--- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
+++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
@@ -988,17 +988,20 @@ func.func @no_bubble_up_pack_through_non_divisible_collapse(%1: tensor<3072x64x4
 
 // -----
 
-func.func @push_down_unpack_through_expand(%5: tensor<?x32x8x8xf32>, %dim: index) -> tensor<?x256x256xf32> {
+func.func @push_down_unpack_through_expand(%5: tensor<?x32x8x8xf32>, %dim: index, %sz0: index) -> tensor<?x256x256xf32> {
   %6 = tensor.empty(%dim) : tensor<?x256xf32>
   %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor<?x32x8x8xf32> -> tensor<?x256xf32>
-  %expanded = tensor.expand_shape %unpack [[0, 1], [2]] : tensor<?x256xf32> into tensor<?x256x256xf32>
+  %expanded = tensor.expand_shape %unpack [[0, 1], [2]] output_shape [%sz0, 256, 256] : tensor<?x256xf32> into tensor<?x256x256xf32>
   func.return %expanded : tensor<?x256x256xf32>
 }
 // CHECK-LABEL: func.func @push_down_unpack_through_expand
 // CHECK-SAME:      %[[ARG0:[a-zA-Z0-9]+]]
 // CHECK-SAME:      %[[ARG1:[a-zA-Z0-9]+]]
+// CHECK:         %[[C32:.+]] = arith.constant 32 : index
 // CHECK:         %[[C0:.+]] = arith.constant 0 : index
-// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3], [4]] : tensor<?x32x8x8xf32> into tensor<?x32x32x8x8xf32>
+// CHECK:         %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x32x8x8xf32>
+// CHECK:         %[[SZ0:.+]] = arith.divui %[[DIM0]], %[[C32]] : index
+// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3], [4]] output_shape [%[[SZ0]], 32, 32, 8, 8] : tensor<?x32x8x8xf32> into tensor<?x32x32x8x8xf32>
 // CHECK:         %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor<?x32x32x8x8xf32>
 // CHECK:         %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor<?x256x256xf32>
 // CHECK:         %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED:.+]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor<?x32x32x8x8xf32> -> tensor<?x256x256xf32>
@@ -1009,12 +1012,12 @@ func.func @push_down_unpack_through_expand(%5: tensor<?x32x8x8xf32>, %dim: index
 func.func @push_down_permuted_unpack_through_expand(%5: tensor<4x32x384x8x8xf32>) -> tensor<4x12x256x256xf32> {
   %6 = tensor.empty() : tensor<4x3072x256xf32>
   %unpack = tensor.unpack %5 outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [8, 8] into %6 : tensor<4x32x384x8x8xf32> -> tensor<4x3072x256xf32>
-  %expanded = tensor.expand_shape %unpack [[0], [1, 2], [3]] : tensor<4x3072x256xf32> into tensor<4x12x256x256xf32>
+  %expanded = tensor.expand_shape %unpack [[0], [1, 2], [3]] output_shape [4, 12, 256, 256] : tensor<4x3072x256xf32> into tensor<4x12x256x256xf32>
   func.return %expanded : tensor<4x12x256x256xf32>
 }
 // CHECK-LABEL: @push_down_permuted_unpack_through_expand
 // CHECK-SAME:      %[[ARG0:[a-zA-Z0-9]+]]
-// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1], [2, 3], [4], [5]] : tensor<4x32x384x8x8xf32> into tensor<4x32x12x32x8x8xf32>
+// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1], [2, 3], [4], [5]] output_shape [4, 32, 12, 32, 8, 8] : tensor<4x32x384x8x8xf32> into tensor<4x32x12x32x8x8xf32>
 // CHECK:         %[[EMPTY:.+]] = tensor.empty() : tensor<4x12x256x256xf32>
 // CHECK:         %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor<4x32x12x32x8x8xf32> -> tensor<4x12x256x256xf32>
 // CHECK:         return %[[UNPACK]] : tensor<4x12x256x256xf32>
@@ -1024,29 +1027,32 @@ func.func @push_down_permuted_unpack_through_expand(%5: tensor<4x32x384x8x8xf32>
 func.func @push_down_unpack_through_unit_expand(%5: tensor<6x32x8x8xf32>) -> tensor<3x16x1x256xf32> {
   %6 = tensor.empty() : tensor<48x256xf32>
   %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor<6x32x8x8xf32> -> tensor<48x256xf32>
-  %expanded = tensor.expand_shape %unpack [[0, 1, 2], [3]] : tensor<48x256xf32> into tensor<3x16x1x256xf32>
+  %expanded = tensor.expand_shape %unpack [[0, 1, 2], [3]] output_shape [3, 16, 1, 256] : tensor<48x256xf32> into tensor<3x16x1x256xf32>
   func.return %expanded : tensor<3x16x1x256xf32>
 }
 // CHECK-LABEL: func.func @push_down_unpack_through_unit_expand
 // CHECK-SAME:      %[[ARG0:[a-zA-Z0-9]+]]
-// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1, 2], [3], [4], [5]] : tensor<6x32x8x8xf32> into tensor<3x2x1x32x8x8xf32>
+// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1, 2], [3], [4], [5]] output_shape [3, 2, 1, 32, 8, 8] : tensor<6x32x8x8xf32> into tensor<3x2x1x32x8x8xf32>
 // CHECK:         %[[EMPTY:.+]] = tensor.empty() : tensor<3x16x1x256xf32>
 // CHECK:         %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED]] outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 3] inner_tiles = [8, 8] into %[[EMPTY]] : tensor<3x2x1x32x8x8xf32> -> tensor<3x16x1x256xf32>
 // CHECK:         return %[[UNPACK]] : tensor<3x16x1x256xf32>
 
 // -----
 
-func.func @push_down_unpack_through_expand_on_outer_dims(%5: tensor<?x32x8xf32>, %dim: index) -> tensor<?x256x256xf32> {
+func.func @push_down_unpack_through_expand_on_outer_dims(%5: tensor<?x32x8xf32>, %dim: index, %sz0: index) -> tensor<?x256x256xf32> {
   %6 = tensor.empty(%dim) : tensor<?x256xf32>
   %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [8] into %6 : tensor<?x32x8xf32> -> tensor<?x256xf32>
-  %expanded = tensor.expand_shape %unpack [[0, 1], [2]] : tensor<?x256xf32> into tensor<?x256x256xf32>
+  %expanded = tensor.expand_shape %unpack [[0, 1], [2]] output_shape [%sz0, 256, 256] : tensor<?x256xf32> into tensor<?x256x256xf32>
   func.return %expanded : tensor<?x256x256xf32>
 }
 // CHECK-LABEL: func.func @push_down_unpack_through_expand_on_outer_dims
 // CHECK-SAME:      %[[ARG0:[a-zA-Z0-9]+]]
 // CHECK-SAME:      %[[ARG1:[a-zA-Z0-9]+]]
+// CHECK:         %[[C256:.+]] = arith.constant 256 : index
 // CHECK:         %[[C0:.+]] = arith.constant 0 : index
-// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3]] : tensor<?x32x8xf32> into tensor<?x256x32x8xf32>
+// CHECK:         %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x32x8xf32>
+// CHECK:         %[[SZ0:.+]] = arith.divui %[[DIM0]], %[[C256]] : index
+// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3]] output_shape [%[[SZ0]], 256, 32, 8] : tensor<?x32x8xf32> into tensor<?x256x32x8xf32>
 // CHECK:         %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor<?x256x32x8xf32>
 // CHECK:         %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor<?x256x256xf32>
 // CHECK:         %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED:.+]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [2] inner_tiles = [8] into %[[EMPTY]] : tensor<?x256x32x8xf32> -> tensor<?x256x256xf32>
@@ -1057,11 +1063,11 @@ func.func @push_down_unpack_through_expand_on_outer_dims(%5: tensor<?x32x8xf32>,
 func.func @no_push_down_unpack_through_non_divisible_expand(%5: tensor<384x32x8x8xf32>) -> tensor<256x12x256xf32> {
   %6 = tensor.empty() : tensor<3072x256xf32>
   %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor<384x32x8x8xf32> -> tensor<3072x256xf32>
-  %expanded = tensor.expand_shape %unpack [[0, 1], [2]] : tensor<3072x256xf32> into tensor<256x12x256xf32>
+  %expanded = tensor.expand_shape %unpack [[0, 1], [2]] output_shape [256, 12, 256] : tensor<3072x256xf32> into tensor<256x12x256xf32>
   func.return %expanded : tensor<256x12x256xf32>
 }
 // CHECK-LABEL: func.func @no_push_down_unpack_through_non_divisible_expand
 // CHECK-SAME:      %[[ARG0:[a-zA-Z0-9]+]]
 // CHECK:         %[[UNPACK:.+]] = tensor.unpack %[[ARG0]]
-// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[UNPACK]] {{\[}}[0, 1], [2]] : tensor<3072x256xf32> into tensor<256x12x256xf32>
+// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[UNPACK]] {{\[}}[0, 1], [2]] output_shape [256, 12, 256] : tensor<3072x256xf32> into tensor<256x12x256xf32>
 // CHECK:         return %[[EXPANDED]] : tensor<256x12x256xf32>
diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
index c140b6abcc37..a9cbaaf7fdc4 100644
--- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
+++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
@@ -25,13 +25,22 @@ func.func @drop_one_trip_loops(%arg0 : tensor<?x1x?xf32>, %arg1 : f32, %shape: t
 //   CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //   CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0, d1, d2) -> ()>
 //   CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+//   CHECK-DAG: #[[$MAP4:.*]] = affine_map<()[s0, s1] -> (s0 * s1)>
 // CHECK-LABEL: func @drop_one_trip_loops
-//       CHECK: tensor.collapse_shape %{{.*}} {{\[}}[0, 1], [2]]
-//       CHECK: tensor.collapse_shape %{{.*}} {{\[}}[0, 1], [2, 3], [4]]
+//       CHECK: %[[C2:.*]] = arith.constant 2 : index
+//       CHECK: %[[C1:.*]] = arith.constant 1 : index
+//       CHECK: %[[C0:.*]] = arith.constant 0 : index
+//       CHECK: tensor.collapse_shape %{{.*}} {{\[\[}}0, 1], [2]]
+//       CHECK: tensor.collapse_shape %{{.*}} {{\[\[}}0, 1], [2, 3], [4]]
 //       CHECK: linalg.generic
 //  CHECK-SAME:   indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP3]]]
 //  CHECK-SAME:   iterator_types = ["parallel", "parallel", "parallel"]
-//       CHECK: tensor.expand_shape %{{.*}} {{\[}}[0, 1], [2, 3], [4]]
+//       CHECK: %[[DIM:.*]] = tensor.dim %{{.*}}, %[[C0]]
+//       CHECK: %[[VAL_1:.*]] = affine.apply #[[$MAP4]]()[%[[DIM]], %[[C1]]]
+//       CHECK: %[[DIM_1:.*]] = tensor.dim %{{.*}}, %[[C2]]
+//       CHECK: %[[VAL_2:.*]] = affine.apply #[[$MAP4]]()[%[[DIM_1]], %[[C1]]]
+//       CHECK: %[[DIM_2:.*]] = tensor.dim %{{.*}}, %[[C2]]
+//       CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %{{.*}} {{\[\[}}0, 1], [2, 3], [4]] output_shape [%[[VAL_1]], 1, %[[VAL_2]], 1, %[[DIM_2]]] : tensor<?x?x?xf32> into tensor<?x1x?x1x?xf32>
 
 //   CHECK-SLICES-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //   CHECK-SLICES-DAG: #[[$MAP2:.*]] = affine_map<(d0, d1, d2) -> ()>
@@ -70,13 +79,18 @@ func.func @drop_one_trip_loops_all_ones(%arg0 : tensor<1x1x1xf32>, %arg1 : f32,
 }
 //   CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0) -> ()>
 //   CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0) -> (d0)>
+//   CHECK-DAG: #[[$MAP3:.*]] = affine_map<()[s0, s1, s2, s3, s4] -> ((((s0 * s1) * s2) * s3) * s4)>
 // CHECK-LABEL: func @drop_one_trip_loops_all_ones
+//       CHECK: %[[C2:.*]] = arith.constant 2 : index
+//       CHECK: %[[C1:.*]] = arith.constant 1 : index
 //       CHECK: tensor.collapse_shape %{{.*}} []
 //       CHECK: tensor.collapse_shape %{{.*}} {{\[}}[0, 1, 2, 3, 4]]
 //       CHECK: linalg.generic
 //  CHECK-SAME:   indexing_maps = [#[[$MAP1]], #[[$MAP1]], #[[$MAP2]]]
 //  CHECK-SAME:   iterator_types = ["parallel"]
-//       CHECK: tensor.expand_shape %{{.*}} {{\[}}[0, 1, 2, 3, 4]]
+//       CHECK: %[[DIM:.*]] = tensor.dim %{{.*}}, %[[C2]] : tensor<1x1x?x1x1xf32>
+//       CHECK: %[[SZ:.*]] = affine.apply #[[$MAP3]]()[%[[C1]], %[[C1]], %[[DIM]], %[[C1]], %[[C1]]]
+//       CHECK: %[[EXPAND:.*]] = tensor.expand_shape %{{.*}} {{\[\[}}0, 1, 2, 3, 4]] output_shape [1, 1, %[[SZ]], 1, 1] : tensor<?xf32> into tensor<1x1x?x1x1xf32>
 
 // -----
 
@@ -232,8 +246,8 @@ func.func @leading_dim_1_canonicalization(%arg0: tensor<1x5xf32>, %shape: tensor
 
 func.func @broadcast_test(%arg0 : tensor<5xf32>, %arg1 : tensor<5xf32>, %shape : tensor<5x5xf32>) -> tensor<5x5xf32>
 {
-  %0 = tensor.expand_shape %arg0 [[0, 1]] : tensor<5xf32> into tensor<1x5xf32>
-  %1 = tensor.expand_shape %arg1 [[0, 1]] : tensor<5xf32> into tensor<5x1xf32>
+  %0 = tensor.expand_shape %arg0 [[0, 1]] output_shape [1, 5] : tensor<5xf32> into tensor<1x5xf32>
+  %1 = tensor.expand_shape %arg1 [[0, 1]] output_shape [5, 1] : tensor<5xf32> into tensor<5x1xf32>
   %2 = linalg.generic #trait
      ins(%0, %1 : tensor<1x5xf32>, tensor<5x1xf32>)
     outs(%shape : tensor<5x5xf32>) {
@@ -331,7 +345,6 @@ func.func @fold_unit_dim_for_empty_tensor(%input: tensor<1x1000xf32>) -> tensor<
 
 //       CHECK: func @fold_unit_dim_for_empty_tensor
 
-
 //       CHECK: %[[INPUT_RESHAPE:.+]] = tensor.collapse_shape %{{.+}} {{\[}}[0, 1]] : tensor<1x1000xf32> into tensor<1000xf32>
 //       CHECK: %[[INIT:.+]] = tensor.empty() : tensor<f32>
 //       CHECK: %[[FILL:.+]] = linalg.fill ins(%cst : f32) outs(%[[INIT]] : tensor<f32>) -> tensor<f32>
@@ -340,7 +353,7 @@ func.func @fold_unit_dim_for_empty_tensor(%input: tensor<1x1000xf32>) -> tensor<
 //  CHECK-SAME:     iterator_types = ["reduction"]
 //  CHECK-SAME:   ins(%[[INPUT_RESHAPE]] : tensor<1000xf32>)
 //  CHECK-SAME:   outs(%[[FILL]] : tensor<f32>)
-//       CHECK: %[[GENERIC_RESHAPE:.+]] = tensor.expand_shape %[[GENERIC]] [] : tensor<f32> into tensor<1xf32>
+//       CHECK: %[[GENERIC_RESHAPE:.+]] = tensor.expand_shape %[[GENERIC]] [] output_shape [1] : tensor<f32> into tensor<1xf32>
 //       CHECK: return %[[GENERIC_RESHAPE:.+]] : tensor<1xf32>
 
 
@@ -364,11 +377,11 @@ func.func @fold_slice(
 //      CHECK:   %[[SLICE1:.+]] = tensor.extract_slice %[[ARG0]]
 // CHECK-SAME:       to tensor<?x?x?xf32>
 //      CHECK:   %[[RESULT1:.+]] = tensor.expand_shape %[[SLICE1]]
-// CHECK-SAME:       [0, 1], [2], [3, 4, 5, 6]
+// CHECK-SAME:       {{\[\[}}0, 1], [2], [3, 4, 5, 6]] output_shape [1, %arg5, %arg6, 1, %arg7, 1, 1] : tensor<?x?x?xf32> into tensor<1x?x?x1x?x1x1xf32>
 //      CHECK:   %[[SLICE2:.+]] = tensor.extract_slice %[[ARG1]]
 // CHECK-SAME:       to tensor<?x?x?xf32>
 //      CHECK:   %[[RESULT2:.+]] = tensor.expand_shape %[[SLICE2]]
-// CHECK-SAME:       [0, 1], [2], [3, 4, 5, 6]
+// CHECK-SAME:       {{\[\[}}0, 1], [2], [3, 4, 5, 6]] output_shape [1, %arg5, %arg6, 1, %arg7, 1, 1] : tensor<?x?x?xf32> into tensor<1x?x?x1x?x1x1xf32>
 //      CHECK:   return %[[RESULT1]], %[[RESULT2]]
 
 // -----
@@ -391,20 +404,27 @@ func.func @unit_dim_for_reduction(%arg0: tensor<1x?x1x?xf32>) -> tensor<1x?xf32>
   } -> tensor<1x?xf32>
   return %3 : tensor<1x?xf32>
 }
-//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1) -> (d0, d1)>
-//  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1) -> (d0)>
+//  CHECK-DAG: #[[MAP:.+]] = affine_map<(d0, d1) -> (d0, d1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1) -> (d0)>
+//  CHECK-DAG: #[[MAP3:.+]] = affine_map<()[s0, s1, s2] -> ((s0 * s1) * s2)>
 //      CHECK: func @unit_dim_for_reduction
 // CHECK-SAME:   %[[ARG0:.+]]: tensor<1x?x1x?xf32>
-//  CHECK-DAG:   %[[RESHAPE:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]]
-//      CHECK:   %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor<?xf32>
-//      CHECK:   %[[FILL:.+]] = linalg.fill ins(%{{.+}}{{.*}}outs(%[[INIT]]
-//      CHECK:   %[[RESULT:.+]] = linalg.generic
-// CHECK-SAME:     indexing_maps = [#[[MAP2]], #[[MAP3]]]
+//      CHECK: %[[C1:.+]] = arith.constant 1 : index
+//      CHECK: %[[CST:.+]] = arith.constant 1.000000e+00 : f32
+//      CHECK: %[[C3:.+]] = arith.constant 3 : index
+//      CHECK: %[[DIM:.+]] = tensor.dim %arg0, %[[C3]] : tensor<1x?x1x?xf32>
+//      CHECK: %[[RESHAPE:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]]
+//      CHECK: %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor<?xf32>
+//      CHECK: %[[FILL:.+]] = linalg.fill ins(%{{.+}}{{.*}}outs(%[[INIT]]
+//      CHECK: %[[RESULT:.+]] = linalg.generic
+// CHECK-SAME:     indexing_maps = [#[[MAP]], #[[MAP2]]]
 // CHECK-SAME:     iterator_types = ["parallel", "reduction"]
 // CHECK-SAME:     ins(%[[RESHAPE]] : tensor<?x?xf32>)
 // CHECK-SAME:     outs(%[[FILL]] : tensor<?xf32>)
-//      CHECK:   %[[RESULT_RESHAPE:.+]] = tensor.expand_shape %[[RESULT]] {{\[}}[0, 1]]
-//      CHECK:   return %[[RESULT_RESHAPE]]
+//      CHECK: %[[DIM_0:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<1x?x1x?xf32>
+//      CHECK: %[[VAL_3:.*]] = affine.apply #[[$MAP3]]()[%[[C1]], %[[DIM_0]], %[[C1]]]
+//      CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0, 1]] output_shape [1, %[[VAL_3]]] : tensor<?xf32> into tensor<1x?xf32>
+//      CHECK: return %[[EXPANDED]] : tensor<1x?xf32>
 
 // -----
 
@@ -437,7 +457,7 @@ func.func @unit_dim_for_both_reduction(%arg0: tensor<1x?x1x1xf32>) -> tensor<1x1
 // CHECK-SAME:     iterator_types = ["parallel"]
 // CHECK-SAME:     ins(%[[RESHAPE]], %[[FILL]] : tensor<?xf32>, tensor<1xf32>)
 // CHECK-SAME:     outs(%[[INIT2]] : tensor<1xf32>)
-//      CHECK:   %[[RESULT_RESHAPE:.+]] = tensor.expand_shape %[[RESULT]] {{\[}}[0, 1]]
+//      CHECK:   %[[RESULT_RESHAPE:.+]] = tensor.expand_shape %[[RESULT]] {{\[}}[0, 1]] output_shape [1, 1]
 //      CHECK:   return %[[RESULT_RESHAPE]]
 
 // -----
@@ -460,20 +480,28 @@ func.func @unit_dim_for_reduction_inner(%arg0: tensor<?x1x?x1xf32>) -> tensor<?x
   } -> tensor<?x1xf32>
   return %3 : tensor<?x1xf32>
 }
-//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1) -> (d0, d1)>
-//  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1) -> (d0)>
+//  CHECK-DAG: #[[MAP:.+]] = affine_map<(d0, d1) -> (d0, d1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1) -> (d0)>
+//  CHECK-DAG: #[[MAP3:.+]] = affine_map<()[s0, s1] -> (s0 * s1)>
 //      CHECK: func @unit_dim_for_reduction_inner
 // CHECK-SAME:   %[[ARG0:.+]]: tensor<?x1x?x1xf32>
-//  CHECK-DAG:   %[[RESHAPE:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0, 1], [2, 3]]
-//      CHECK:   %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor<?xf32>
-//      CHECK:   %[[FILL:.+]] = linalg.fill ins(%{{.+}}{{.*}}outs(%[[INIT]]
-//      CHECK:   %[[RESULT:.+]] = linalg.generic
-// CHECK-SAME:     indexing_maps = [#[[MAP2]], #[[MAP3]]]
+//      CHECK: %[[C1:.*]] = arith.constant 1 : index
+//      CHECK: %[[C0:.*]] = arith.constant 0 : index
+//      CHECK: %[[CST:.*]] = arith.constant 1.000000e+00 : f32
+//      CHECK: %[[C2:.*]] = arith.constant 2 : index
+//      CHECK: %[[DIM:.*]] = tensor.dim %arg0, %[[C2]] : tensor<?x1x?x1xf32>
+//      CHECK: %[[RESHAPE:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0, 1], [2, 3]]
+//      CHECK: %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor<?xf32>
+//      CHECK: %[[FILL:.+]] = linalg.fill ins(%{{.+}}{{.*}}outs(%[[INIT]]
+//      CHECK: %[[RESULT:.+]] = linalg.generic
+// CHECK-SAME:     indexing_maps = [#[[MAP]], #[[MAP2]]]
 // CHECK-SAME:     iterator_types = ["parallel", "reduction"]
 // CHECK-SAME:     ins(%[[RESHAPE]] : tensor<?x?xf32>)
 // CHECK-SAME:     outs(%[[FILL]] : tensor<?xf32>)
-//      CHECK:   %[[RESULT_RESHAPE:.+]] = tensor.expand_shape %[[RESULT]] {{\[}}[0, 1]]
-//      CHECK:   return %[[RESULT_RESHAPE]]
+//      CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x1x?x1xf32>
+//      CHECK: %[[VAL_3:.+]] = affine.apply #[[$MAP3]]()[%[[DIM_0]], %[[C1]]]
+//      CHECK: %[[RESULT_RESHAPE:.+]] = tensor.expand_shape %[[RESULT]] {{\[}}[0, 1]] output_shape [%[[VAL_3]], 1] : tensor<?xf32> into tensor<?x1xf32>
+//      CHECK: return %[[RESULT_RESHAPE]]
 
 // -----
 
@@ -484,7 +512,7 @@ func.func @slice_unit_dims(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> {
 // CHECK-LABEL: func @slice_unit_dims
 //       CHECK:   %[[SLICE:.+]] = tensor.extract_slice
 //  CHECK-SAME:     tensor<1x3xf32> to tensor<f32>
-//       CHECK:   %[[RESULT:.+]] = tensor.expand_shape %[[SLICE]] []
+//       CHECK:   %[[RESULT:.+]] = tensor.expand_shape %[[SLICE]] [] output_shape [1, 1]
 //       CHECK:   return %[[RESULT]]
 
 // -----
@@ -496,7 +524,7 @@ func.func @rank_reduced_extract_slice(%arg0: tensor<1x1x3x1x3xf32>) -> tensor<1x
 // CHECK-LABEL: func @rank_reduced_extract_slice
 //       CHECK:   %[[SLICE:.+]] = tensor.extract_slice
 //  CHECK-SAME:     tensor<1x1x3x1x3xf32> to tensor<3x3xf32>
-//       CHECK:   %[[RESULT:.+]] = tensor.expand_shape %[[SLICE]] {{\[}}[0, 1], [2]]
+//       CHECK:   %[[RESULT:.+]] = tensor.expand_shape %[[SLICE]] {{\[}}[0, 1], [2]] output_shape [1, 3, 3]
 //       CHECK:   return %[[RESULT]]
 
 // -----
@@ -709,8 +737,8 @@ func.func @leading_dim_1_canonicalization(%arg0: memref<1x5xf32>, %shape: memref
 
 func.func @broadcast_test(%arg0 : memref<5xf32>, %arg1 : memref<5xf32>, %shape : memref<5x5xf32>) -> memref<5x5xf32>
 {
-  %0 = memref.expand_shape %arg0 [[0, 1]] : memref<5xf32> into memref<1x5xf32>
-  %1 = memref.expand_shape %arg1 [[0, 1]] : memref<5xf32> into memref<5x1xf32>
+  %0 = memref.expand_shape %arg0 [[0, 1]] output_shape [1, 5] : memref<5xf32> into memref<1x5xf32>
+  %1 = memref.expand_shape %arg1 [[0, 1]] output_shape [5, 1] : memref<5xf32> into memref<5x1xf32>
   linalg.generic #trait
      ins(%0, %1 : memref<1x5xf32>, memref<5x1xf32>)
     outs(%shape : memref<5x5xf32>) {
@@ -966,7 +994,7 @@ func.func @drop_unit_pad_dims(%arg0: tensor<1x1x3x1x1xf32>) -> tensor<1x2x3x1x3x
 //       CHECK:   %[[PADDED:.+]] = tensor.pad %[[COLLAPSE]] low[1, 0, 0] high[0, 0, 2]
 //       CHECK:   } : tensor<1x3x1xf32> to tensor<2x3x3xf32>
 //       CHECK:   tensor.expand_shape %[[PADDED]]
-//  CHECK-SAME:     {{\[}}[0, 1], [2, 3], [4]{{\]}} : tensor<2x3x3xf32> into tensor<1x2x3x1x3xf32>
+//  CHECK-SAME:     {{\[}}[0, 1], [2, 3], [4]{{\]}} output_shape [1, 2, 3, 1, 3] : tensor<2x3x3xf32> into tensor<1x2x3x1x3xf32>
 
 // CHECK-SLICES-LABEL: func @drop_unit_pad_dims
 //       CHECK-SLICES:   %[[EXTRACT:.+]] = tensor.extract_slice
@@ -989,13 +1017,19 @@ func.func @drop_unit_pad_dynamic_dims(%arg0: tensor<1x?xf32>) -> tensor<1x?xf32>
   return %0 : tensor<1x?xf32>
 }
 
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 * s1)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 + 11)>
 // CHECK-LABEL: func @drop_unit_pad_dynamic_dims
+//       CHECK:   %[[C1:.*]] = arith.constant 1 : index
+//       CHECK:   %[[CST:.*]] = arith.constant 0.000000e+00 : f32
 //       CHECK:   %[[COLLAPSE:.+]] = tensor.collapse_shape
 //  CHECK-SAME:     {{\[}}[0, 1]{{\]}} : tensor<1x?xf32> into tensor<?xf32>
 //       CHECK:   %[[PADDED:.+]] = tensor.pad %[[COLLAPSE]] low[5] high[6]
 //       CHECK:   } : tensor<?xf32> to tensor<?xf32>
-//       CHECK:   tensor.expand_shape %[[PADDED]]
-//  CHECK-SAME:     {{\[}}[0, 1]{{\]}} : tensor<?xf32> into tensor<1x?xf32>
+//       CHECK:   %[[DIM:.+]] = tensor.dim %{{.*}}, %[[C1]] : tensor<1x?xf32>
+//       CHECK:   %[[VAL_0:.+]] = affine.apply #[[$MAP]]()[%[[C1]], %[[DIM]]]
+//       CHECK:   %[[VAL_1:.+]] = affine.apply #[[$MAP1]]()[%[[VAL_0]]]
+//       CHECK:   %[[EXPANDED:.+]] = tensor.expand_shape %[[PADDED]] {{\[\[}}0, 1]] output_shape [1, %[[VAL_1]]] : tensor<?xf32> into tensor<1x?xf32>
 
 // CHECK-SLICES: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 + 11)>
 
@@ -1052,4 +1086,4 @@ func.func @drop_known_unit_constant_low_high(%arg0: tensor<1x383x128xf32>) -> te
 //       CHECK:   %[[PADDED:.+]] = tensor.pad %[[COLLAPSE]] low[1, 0] high[0, 0]
 //       CHECK:   } : tensor<383x128xf32> to tensor<384x128xf32>
 //       CHECK:   tensor.expand_shape %[[PADDED]]
-//  CHECK-SAME:     {{\[}}[0, 1], [2]] : tensor<384x128xf32> into tensor<1x384x128xf32>
+//  CHECK-SAME:     {{\[}}[0, 1], [2]] output_shape [1, 384, 128] : tensor<384x128xf32> into tensor<1x384x128xf32>
diff --git a/mlir/test/Dialect/Linalg/flatten-elementwise.mlir b/mlir/test/Dialect/Linalg/flatten-elementwise.mlir
index 5a27fe76b134..9fe50a521d2d 100644
--- a/mlir/test/Dialect/Linalg/flatten-elementwise.mlir
+++ b/mlir/test/Dialect/Linalg/flatten-elementwise.mlir
@@ -26,7 +26,7 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:                         %[[ARG1:.*]]: tensor<32x7xf32>
 // CHECK-NEXT:    %[[FLATTENED:.*]] = tensor.collapse_shape %[[ARG1]] {{\[}}[0, 1]]
 // CHECK-NEXT:    %[[FLATTENED_RESULT:.*]] = linalg.fill ins(%[[ARG0]] : f32) outs(%[[FLATTENED]] : tensor<224xf32>)
-// CHECK-NEXT:    %[[RESULT:.*]] = tensor.expand_shape %[[FLATTENED_RESULT]] {{\[}}[0, 1]]
+// CHECK-NEXT:    %[[RESULT:.*]] = tensor.expand_shape %[[FLATTENED_RESULT]] {{\[}}[0, 1]] output_shape [32, 7] : tensor<224xf32> into tensor<32x7xf32>
 func.func @fill_tensor(%cst: f32, %arg: tensor<32x7xf32>) -> tensor<32x7xf32> {
     %0 = linalg.fill ins(%cst: f32) outs(%arg: tensor<32x7xf32>) ->  tensor<32x7xf32>
     return %0 :  tensor<32x7xf32>
diff --git a/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir b/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir
index 50d308b6a9fe..0d40df534a3b 100644
--- a/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir
+++ b/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir
@@ -9,8 +9,7 @@
 #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>
 func.func @fuse_by_collapsing(%arg0 : tensor<2x12x5x336x9xi32>,
     %arg1 : tensor<2x3x4xi32>, %arg2 : tensor<5x6x7x8xi32>) -> tensor<2x3x4x5x6x7x8x9xi32> {
-  %expand = tensor.expand_shape %arg0 [[0], [1, 2], [3], [4, 5, 6], [7]]
-      : tensor<2x12x5x336x9xi32> into tensor<2x3x4x5x6x7x8x9xi32>
+  %expand = tensor.expand_shape %arg0 [[0], [1, 2], [3], [4, 5, 6], [7]] output_shape [2, 3, 4, 5, 6, 7, 8, 9] : tensor<2x12x5x336x9xi32> into tensor<2x3x4x5x6x7x8x9xi32>
   %init = tensor.empty() : tensor<2x3x4x5x6x7x8x9xi32>
   %generic = linalg.generic {
     indexing_maps = [#map0, #map1, #map2, #map3],
@@ -40,7 +39,7 @@ func.func @fuse_by_collapsing(%arg0 : tensor<2x12x5x336x9xi32>,
 // CHECK-SAME:       iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]
 // CHECK-SAME:       ins(%[[ARG0]], %[[ARG1_RESHAPE]], %[[ARG2_RESHAPE]] :
 // CHECK-SAME:       outs(%[[INIT_RESHAPE]] :
-//      CHECK:   %[[RESULT_RESHAPE:.+]] = tensor.expand_shape %[[COLLAPSED_OP]] {{\[}}[0], [1, 2], [3], [4, 5, 6], [7]{{\]}}
+//      CHECK:   %[[RESULT_RESHAPE:.+]] = tensor.expand_shape %[[COLLAPSED_OP]] {{\[}}[0], [1, 2], [3], [4, 5, 6], [7]{{\]}} output_shape [2, 3, 4, 5, 6, 7, 8, 9]
 //      CHECK:   return %[[RESULT_RESHAPE]]
 
 //      CONTROL: func @fuse_by_collapsing(
@@ -60,8 +59,7 @@ func.func @fuse_by_collapsing(%arg0 : tensor<2x12x5x336x9xi32>,
 #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>
 func.func @fuse_by_collapsing_indexing_op(%arg0 : tensor<2x12x5x336x9xi32>,
     %arg1 : tensor<2x3x4xi32>, %arg2 : tensor<5x6x7x8xi32>) -> tensor<2x3x4x5x6x7x8x9xi32> {
-  %expand = tensor.expand_shape %arg0 [[0], [1, 2], [3], [4, 5, 6], [7]]
-      : tensor<2x12x5x336x9xi32> into tensor<2x3x4x5x6x7x8x9xi32>
+  %expand = tensor.expand_shape %arg0 [[0], [1, 2], [3], [4, 5, 6], [7]] output_shape [2, 3, 4, 5, 6, 7, 8, 9] : tensor<2x12x5x336x9xi32> into tensor<2x3x4x5x6x7x8x9xi32>
   %init = tensor.empty() : tensor<2x3x4x5x6x7x8x9xi32>
   %generic = linalg.generic {
     indexing_maps = [#map0, #map1, #map2, #map3],
@@ -122,8 +120,7 @@ func.func @fuse_by_collapsing_indexing_op(%arg0 : tensor<2x12x5x336x9xi32>,
 #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>
 func.func @fuse_by_collapsing_change_reshape_order(%arg0 : tensor<9x56x2x60x6xi32>,
     %arg1 : tensor<7x8x2xi32>, %arg2 : tensor<6x3x4x5xi32>) -> tensor<2x3x4x5x6x7x8x9xi32> {
-  %expand = tensor.expand_shape %arg0 [[0], [1, 2], [3], [4, 5, 6], [7]]
-      : tensor<9x56x2x60x6xi32> into tensor<9x7x8x2x3x4x5x6xi32>
+  %expand = tensor.expand_shape %arg0 [[0], [1, 2], [3], [4, 5, 6], [7]] output_shape [9, 7, 8, 2, 3, 4, 5, 6] : tensor<9x56x2x60x6xi32> into tensor<9x7x8x2x3x4x5x6xi32>
   %init = tensor.empty() : tensor<2x3x4x5x6x7x8x9xi32>
   %generic = linalg.generic {
     indexing_maps = [#map0, #map1, #map2, #map3],
@@ -154,7 +151,7 @@ func.func @fuse_by_collapsing_change_reshape_order(%arg0 : tensor<9x56x2x60x6xi3
 // CHECK-SAME:       iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]
 // CHECK-SAME:       ins(%[[ARG0]], %[[ARG1_RESHAPE]], %[[ARG2_RESHAPE]] :
 // CHECK-SAME:       outs(%[[INIT_RESHAPE]] :
-//      CHECK:   %[[RESULT_RESHAPE:.+]] = tensor.expand_shape %[[COLLAPSED_OP]] {{\[}}[0], [1, 2, 3], [4], [5, 6], [7]{{\]}}
+//      CHECK:   %[[RESULT_RESHAPE:.+]] = tensor.expand_shape %[[COLLAPSED_OP]] {{\[}}[0], [1, 2, 3], [4], [5, 6], [7]{{\]}} output_shape [2, 3, 4, 5, 6, 7, 8, 9]
 //      CHECK:   return %[[RESULT_RESHAPE]]
 
 // -----
@@ -165,11 +162,11 @@ func.func @fuse_by_collapsing_change_reshape_order(%arg0 : tensor<9x56x2x60x6xi3
 #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d4, d1, d2, d3)>
 #map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>
 func.func @fuse_by_collapsing_dynamic(%arg0 : tensor<?x?x?x?x?xi32>,
-    %arg1 : tensor<?x?x?xi32>, %arg2 : tensor<?x?x?x?xi32>) -> tensor<?x3x?x5x?x7x?x?xi32> {
+    %arg1 : tensor<?x?x?xi32>, %arg2 : tensor<?x?x?x?xi32>, %sz0: index, %sz1: index, %sz2: index, %sz3: index, %sz4: index) -> tensor<?x3x?x5x?x7x?x?xi32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
-  %expand = tensor.expand_shape %arg0 [[0], [1, 2], [3], [4, 5, 6], [7]]
+  %expand = tensor.expand_shape %arg0 [[0], [1, 2], [3], [4, 5, 6], [7]] output_shape [%sz0, 7, %sz1, %sz2, 3, %sz3, 5, %sz4]
       : tensor<?x?x?x?x?xi32> into tensor<?x7x?x?x3x?x5x?xi32>
   %d0 = tensor.dim %arg1, %c2 : tensor<?x?x?xi32>
   %d2 = tensor.dim %arg2, %c2 : tensor<?x?x?x?xi32>
@@ -203,8 +200,8 @@ func.func @fuse_by_collapsing_dynamic(%arg0 : tensor<?x?x?x?x?xi32>,
     } -> tensor<?x3x?x5x?x7x?x?xi32>
   return %generic : tensor<?x3x?x5x?x7x?x?xi32>
 }
-//      CHECK: func @fuse_by_collapsing_dynamic(
-// CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?x?x?xi32>
+//      CHECK: func @fuse_by_collapsing_dynamic
+// CHECK-SAME:     (%[[ARG0:.+]]: tensor<?x?x?x?x?xi32>, %[[SZ0:.+]]: index, %[[SZ1:.+]]: index, %[[SZ2:.+]]: index, %[[SZ3:.+]]: index, %[[SZ4:.+]]: index)
 //  CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 //  CHECK-DAG:   %[[C5:.+]] = arith.constant 5 : index
 //      CHECK:   %[[EXPAND:.+]] = tensor.expand_shape %[[ARG0]]
@@ -224,8 +221,8 @@ func.func @fuse_by_collapsing_dynamic(%arg0 : tensor<?x?x?x?x?xi32>,
 
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3)>
-func.func @fuse_reductions(%arg0 : tensor<2x?x5xf32>, %arg1 : tensor<2x5xf32>) -> tensor<2x5xf32> {
-  %0 = tensor.expand_shape %arg0 [[0], [1, 2], [3]] : tensor<2x?x5xf32> into tensor<2x6x?x5xf32>
+func.func @fuse_reductions(%arg0 : tensor<2x?x5xf32>, %arg1 : tensor<2x5xf32>, %sz0: index) -> tensor<2x5xf32> {
+  %0 = tensor.expand_shape %arg0 [[0], [1, 2], [3]] output_shape [2, 6, %sz0, 5] : tensor<2x?x5xf32> into tensor<2x6x?x5xf32>
   %1 = linalg.generic {
       indexing_maps = [#map0, #map1],
       iterator_types = ["parallel", "reduction", "reduction", "parallel"]}
@@ -240,7 +237,8 @@ func.func @fuse_reductions(%arg0 : tensor<2x?x5xf32>, %arg1 : tensor<2x5xf32>) -
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //      CHECK: func @fuse_reductions(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<2x?x5xf32>
-// CHECK-SAME:     %[[ARG1:.+]]: tensor<2x5xf32>) -> tensor<2x5xf32>
+// CHECK-SAME:     %[[ARG1:.+]]: tensor<2x5xf32>
+// CHECK-SAME:     %[[SZ0:.+]]: index) -> tensor<2x5xf32>
 //      CHECK:   %[[GENERIC:.+]] = linalg.generic
 // CHECK-SAME:       indexing_maps = [#[[MAP0]], #[[MAP1]]]
 // CHECK-SAME:       iterator_types = ["parallel", "reduction", "parallel"]
@@ -253,7 +251,7 @@ func.func @fuse_reductions(%arg0 : tensor<2x?x5xf32>, %arg1 : tensor<2x5xf32>) -
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
 func.func @no_fuse_unpreserved_folding(%arg0 : tensor<2x12x5xf32>, %arg1 : tensor<2x3xf32>) -> tensor<2x3x4x5xf32> {
-  %0 = tensor.expand_shape %arg0 [[0], [1, 2], [3]] : tensor<2x12x5xf32> into tensor<2x3x4x5xf32>
+  %0 = tensor.expand_shape %arg0 [[0], [1, 2], [3]] output_shape [2, 3, 4, 5] : tensor<2x12x5xf32> into tensor<2x3x4x5xf32>
   %init = tensor.empty(): tensor<2x3x4x5xf32>
   %1 = linalg.generic {
       indexing_maps = [#map0, #map1, #map0],
@@ -280,7 +278,7 @@ func.func @no_fuse_unpreserved_folding(%arg0 : tensor<2x12x5xf32>, %arg1 : tenso
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>
 func.func @no_fuse_unpreserved_folding_transpose(%arg0 : tensor<2x12x5xf32>, %arg1 : tensor<2xf32>) -> tensor<2x4x3x5xf32> {
-  %0 = tensor.expand_shape %arg0 [[0], [1, 2], [3]] : tensor<2x12x5xf32> into tensor<2x3x4x5xf32>
+  %0 = tensor.expand_shape %arg0 [[0], [1, 2], [3]] output_shape [2, 3, 4, 5] : tensor<2x12x5xf32> into tensor<2x3x4x5xf32>
   %init = tensor.empty() : tensor<2x4x3x5xf32>
   %1 = linalg.generic {
       indexing_maps = [#map0, #map1, #map2],
@@ -307,7 +305,7 @@ func.func @no_fuse_unpreserved_folding_transpose(%arg0 : tensor<2x12x5xf32>, %ar
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d3)>
 func.func @no_fuse_mismatched_iterator_types(%arg0 : tensor<2x12x5xf32>, %arg1 : tensor<2x3xf32>) -> tensor<2x5xf32> {
-  %0 = tensor.expand_shape %arg0 [[0], [1, 2], [3]] : tensor<2x12x5xf32> into tensor<2x3x4x5xf32>
+  %0 = tensor.expand_shape %arg0 [[0], [1, 2], [3]] output_shape [2, 3, 4, 5] : tensor<2x12x5xf32> into tensor<2x3x4x5xf32>
   %init = tensor.empty() : tensor<2x5xf32>
   %1 = linalg.generic {
       indexing_maps = [#map0, #map1, #map2],
@@ -335,8 +333,8 @@ func.func @no_fuse_mismatched_iterator_types(%arg0 : tensor<2x12x5xf32>, %arg1 :
 #map1 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 func.func @control_fusion(%arg0 : tensor<6xf32>, %arg1 : tensor<20xf32>) -> tensor<2x3x4x5xf32> {
-  %0 = tensor.expand_shape %arg0 [[0, 1]] : tensor<6xf32> into tensor<2x3xf32>
-  %1 = tensor.expand_shape %arg1 [[0, 1]] : tensor<20xf32> into tensor<4x5xf32>
+  %0 = tensor.expand_shape %arg0 [[0, 1]] output_shape [2, 3] : tensor<6xf32> into tensor<2x3xf32>
+  %1 = tensor.expand_shape %arg1 [[0, 1]] output_shape [4, 5] : tensor<20xf32> into tensor<4x5xf32>
     %init = tensor.empty() : tensor<2x3x4x5xf32>
   %2 = linalg.generic {
       indexing_maps = [#map0, #map1, #map2],
@@ -359,8 +357,8 @@ func.func @control_fusion(%arg0 : tensor<6xf32>, %arg1 : tensor<20xf32>) -> tens
 // CHECK-SAME:       iterator_types = ["parallel", "parallel"]
 // CHECK-SAME:       ins(%[[ARG0]], %[[ARG1]] :
 // CHECK-SAME:       outs(%{{.+}}: tensor<6x20xf32>)
-//      CHECK:   %[[RESHAPE1:.+]] = tensor.expand_shape %[[GENERIC]] {{\[}}[0], [1, 2]{{\]}}
-//      CHECK:   %[[RESHAPE2:.+]] = tensor.expand_shape %[[RESHAPE1]] {{\[}}[0, 1], [2], [3]{{\]}}
+//      CHECK:   %[[RESHAPE1:.+]] = tensor.expand_shape %[[GENERIC]] {{\[}}[0], [1, 2]{{\]}} output_shape [6, 4, 5]
+//      CHECK:   %[[RESHAPE2:.+]] = tensor.expand_shape %[[RESHAPE1]] {{\[}}[0, 1], [2], [3]{{\]}} output_shape [2, 3, 4, 5]
 //      CHECK:   return %[[RESHAPE2]]
 
 //  CONTROL-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
@@ -375,14 +373,14 @@ func.func @control_fusion(%arg0 : tensor<6xf32>, %arg1 : tensor<20xf32>) -> tens
 //      CONTROL:     %[[GENERIC:.+]] = linalg.generic
 // CONTROL-SAME:         ins(%[[EXPAND]], %[[ARG1]] :
 // CONTROL-SAME:         outs(%[[INIT_RESHAPE]] :
-//      CONTROL:     %[[RESULT:.+]] = tensor.expand_shape %[[GENERIC]] {{\[}}[0], [1], [2, 3]{{\]}}
+//      CONTROL:     %[[RESULT:.+]] = tensor.expand_shape %[[GENERIC]] {{\[}}[0], [1], [2, 3]{{\]}} output_shape [2, 3, 4, 5]
 
 // -----
 
 // Corner case that isnt handled currently.
 #map = affine_map<(d0) -> (d0)>
 func.func @zero_D_test(%arg0: tensor<f32>) -> tensor<1xf32> {
-  %0 = tensor.expand_shape %arg0 [] : tensor<f32> into tensor<1xf32>
+  %0 = tensor.expand_shape %arg0 [] output_shape [1] : tensor<f32> into tensor<1xf32>
   %init = tensor.empty() : tensor<1xf32>
   %1 = linalg.generic {
       indexing_maps = [#map, #map],
@@ -404,8 +402,8 @@ func.func @zero_D_test(%arg0: tensor<f32>) -> tensor<1xf32> {
 
 #map0 = affine_map<(d0, d1, d2, d3) -> (d1, d0, d2, d3)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-func.func @fuse_only_one_reassociation(%arg0 : tensor<?x?xf32>, %arg1 : tensor<4x?x?x8xf32>) -> tensor<4x?x?x8xf32> {
-  %0 = tensor.expand_shape %arg0 [[0, 1], [2, 3]] : tensor<?x?xf32> into tensor<?x4x?x8xf32>
+func.func @fuse_only_one_reassociation(%arg0 : tensor<?x?xf32>, %arg1 : tensor<4x?x?x8xf32>, %sz0: index, %sz1: index) -> tensor<4x?x?x8xf32> {
+  %0 = tensor.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [%sz0, 4, %sz1, 8] : tensor<?x?xf32> into tensor<?x4x?x8xf32>
   %1 = linalg.generic {
       indexing_maps = [#map0, #map1, #map1],
       iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
@@ -419,10 +417,12 @@ func.func @fuse_only_one_reassociation(%arg0 : tensor<?x?xf32>, %arg1 : tensor<4
 }
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-//      CHECK: func @fuse_only_one_reassociation(
-// CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xf32>
-// CHECK-SAME:     %[[ARG1:.+]]: tensor<4x?x?x8xf32>
-//  CHECK-DAG:   %[[EXPAND_ARG0:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2, 3]{{\]}}
+//      CHECK: func @fuse_only_one_reassociation
+// CHECK-SAME:     (%[[ARG0:.+]]: tensor<?x?xf32>, %[[ARG1:.+]]: tensor<4x?x?x8xf32>, %[[SZ0:.+]]: index, %[[SZ1:.+]]: index)
+//  CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
+//  CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//  CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+//  CHECK-DAG:   %[[EXPAND_ARG0:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2, 3]{{\]}} output_shape [%[[SZ0]], 4, %[[SZ1]], 8]
 //  CHECK-DAG:   %[[COLLAPSE_ARG0:.+]] = tensor.collapse_shape %[[EXPAND_ARG0]] {{\[}}[0], [1], [2, 3]{{\]}}
 //  CHECK-DAG:   %[[COLLAPSE_ARG1_0:.+]] = tensor.collapse_shape %[[ARG1]] {{\[}}[0], [1], [2, 3]{{\]}}
 //  CHECK-DAG:   %[[COLLAPSE_ARG1_1:.+]] = tensor.collapse_shape %[[ARG1]] {{\[}}[0], [1], [2, 3]{{\]}}
@@ -431,17 +431,20 @@ func.func @fuse_only_one_reassociation(%arg0 : tensor<?x?xf32>, %arg1 : tensor<4
 // CHECK-SAME:       iterator_types = ["parallel", "parallel", "parallel"]
 // CHECK-SAME:       ins(%[[COLLAPSE_ARG0]], %[[COLLAPSE_ARG1_0]] :
 // CHECK-SAME:       outs(%[[COLLAPSE_ARG1_1]] :
-//      CHECK:   %[[EXPAND_GENERIC:.+]] = tensor.expand_shape %[[GENERIC]] {{\[}}[0], [1], [2, 3]{{\]}}
-//      CHECK:   return %[[EXPAND_GENERIC]]
+//      CHECK:   %[[DIM:.+]] = tensor.dim %[[GENERIC]], %[[C1]] : tensor<4x?x?xf32>
+//      CHECK:   %[[DIM_2:.+]] = tensor.dim %[[GENERIC]], %[[C2]] : tensor<4x?x?xf32>
+//      CHECK:   %[[VAL_1:.+]] = arith.divui %[[DIM_2]], %[[C8]] : index
+//      CHECK:   %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0], [1], [2, 3]] output_shape [4, %[[DIM]], %[[VAL_1]], 8] : tensor<4x?x?xf32> into tensor<4x?x?x8xf32>
+//      CHECK:   return %[[EXPANDED_3]]
 
 // -----
 
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d3, d1, d0, d2)>
-func.func @fold_non_consecutive_dims(%arg0 : tensor<?x?xi32>) -> tensor<?x8x?x4xi32> {
+func.func @fold_non_consecutive_dims(%arg0 : tensor<?x?xi32>, %sz0: index, %sz1: index) -> tensor<?x8x?x4xi32> {
   %c0 = arith.constant 0 : index
   %c2 = arith.constant 2 : index
-  %0 = tensor.expand_shape %arg0 [[0, 1], [2, 3]] : tensor<?x?xi32> into tensor<?x4x?x8xi32>
+  %0 = tensor.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [%sz0, 4, %sz1, 8] : tensor<?x?xi32> into tensor<?x4x?x8xi32>
   %d0 = tensor.dim %0, %c0 : tensor<?x4x?x8xi32>
   %d1 = tensor.dim %0, %c2 : tensor<?x4x?x8xi32>
   %init = tensor.empty(%d1, %d0) : tensor<?x8x?x4xi32>
@@ -465,10 +468,16 @@ func.func @fold_non_consecutive_dims(%arg0 : tensor<?x?xi32>) -> tensor<?x8x?x4x
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1) -> (d0, d1)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1) -> (d1, d0)>
 //      CHECK: func @fold_non_consecutive_dims(
-// CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xi32>)
-//  CHECK-DAG:   %[[C4:.+]] = arith.constant 4 : index
-//  CHECK-DAG:   %[[C8:.+]] = arith.constant 8 : index
-//      CHECK:   %[[INIT:.+]] = tensor.empty
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xi32>, %[[SZ0:.+]]: index, %[[SZ1:.+]]: index)
+//      CHECK:   %[[C1:.+]] = arith.constant 1 : index
+//      CHECK:   %[[C4:.+]] = arith.constant 4 : index
+//      CHECK:   %[[C8:.+]] = arith.constant 8 : index
+//      CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//      CHECK:   %[[C2:.+]] = arith.constant 2 : index
+//      CHECK:   %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[SZ0]], 4, %[[SZ1]], 8] : tensor<?x?xi32> into tensor<?x4x?x8xi32>
+//      CHECK:   %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]]
+//      CHECK:   %[[DIM_0:.+]] = tensor.dim %[[EXPANDED]], %[[C2]]
+//      CHECK:   %[[INIT:.+]] = tensor.empty(%[[DIM_0]], %[[DIM]])
 //      CHECK:   %[[COLLAPSE_INIT:.+]] = tensor.collapse_shape %[[INIT]] {{\[}}[0, 1], [2, 3]{{\]}}
 //      CHECK:   %[[GENERIC:.+]] = linalg.generic
 // CHECK-SAME:       indexing_maps = [#[[MAP0]], #[[MAP1]]]
@@ -487,8 +496,12 @@ func.func @fold_non_consecutive_dims(%arg0 : tensor<?x?xi32>) -> tensor<?x8x?x4x
 //  CHECK-DAG:       %[[T6:.+]] = arith.addi %[[T5]], %[[T3]]
 //  CHECK-DAG:       %[[T7:.+]] = arith.index_cast %[[T6]]
 //      CHECK:       linalg.yield %[[T7]]
-//      CHECK:   %[[EXPAND_GENERIC:.+]] = tensor.expand_shape %[[GENERIC]] {{\[}}[0, 1], [2, 3]{{\]}}
-//      CHECK:   return %[[EXPAND_GENERIC]]
+//      CHECK:   %[[DIM_1:.+]] = tensor.dim %[[GENERIC]], %[[C0]] : tensor<?x?xi32>
+//      CHECK:   %[[DIM_2:.+]] = tensor.dim %[[GENERIC]], %[[C1]] : tensor<?x?xi32>
+//      CHECK:   %[[VAL_2:.+]] = arith.divui %[[DIM_1]], %[[C8]] : index
+//      CHECK:   %[[VAL_3:.+]] = arith.divui %[[DIM_2]], %[[C4]] : index
+//      CHECK:   %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_2]], 8, %[[VAL_3]], 4] : tensor<?x?xi32> into tensor<?x8x?x4xi32>
+//      CHECK:   return %[[EXPANDED_3]]
 
 // -----
 
@@ -496,10 +509,10 @@ func.func @fold_non_consecutive_dims(%arg0 : tensor<?x?xi32>) -> tensor<?x8x?x4x
 // So no change in the code.
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>
 #map1 = affine_map<(d0, d1, d2, d3) -> ()>
-func.func @no_fold_non_consecutive_reduction_dims(%arg0 : tensor<?x?xi32>) -> tensor<i32> {
+func.func @no_fold_non_consecutive_reduction_dims(%arg0 : tensor<?x?xi32>, %sz0: index, %sz1: index) -> tensor<i32> {
   %c0 = arith.constant 0 : index
   %c2 = arith.constant 2 : index
-  %0 = tensor.expand_shape %arg0 [[0, 1], [2, 3]] : tensor<?x?xi32> into tensor<?x4x?x8xi32>
+  %0 = tensor.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [%sz0, 4, %sz1, 8] : tensor<?x?xi32> into tensor<?x4x?x8xi32>
   %init = tensor.empty() : tensor<i32>
   %1 = linalg.generic {
       indexing_maps = [#map0, #map1],
@@ -519,8 +532,8 @@ func.func @no_fold_non_consecutive_reduction_dims(%arg0 : tensor<?x?xi32>) -> te
   return %1 : tensor<i32>
 }
 //      CHECK: func @no_fold_non_consecutive_reduction_dims(
-// CHECK-SAME:   %[[ARG0:.+]]: tensor<?x?xi32>)
-//      CHECK:   %[[EXPAND_ARG0:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2, 3]{{\]}}
+// CHECK-SAME:   %[[ARG0:.+]]: tensor<?x?xi32>, %[[SZ0:.+]]: index, %[[SZ1:.+]]: index)
+//      CHECK:   %[[EXPAND_ARG0:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2, 3]{{\]}} output_shape [%[[SZ0]], 4, %[[SZ1]], 8]
 //      CHECK:   %[[GENERIC:.+]] = linalg.generic
 // CHECK-SAME:       ins(%[[EXPAND_ARG0]] :
 //      CHECK:   return %[[GENERIC]]
diff --git a/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir b/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir
index f1c729ef963b..751ece37bc09 100644
--- a/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir
+++ b/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir
@@ -4,15 +4,19 @@
 // CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0, d1) -> (d1)>
 
 // CHECK-LABEL: func @reshape
-// CHECK-SAME: (%[[A:.*]]: tensor<?x16xf32>, %[[B:.*]]: tensor<16xf32>, %[[INIT:.*]]: tensor<?x112x16xf32>)
+// CHECK-SAME: (%[[A:.*]]: tensor<?x16xf32>, %[[B:.*]]: tensor<16xf32>, %[[INIT:.*]]: tensor<?x112x16xf32>, %[[SZ0:.*]]: index)
+//      CHECK: %[[C112:.*]] = arith.constant 112 : index
+//      CHECK: %[[C0:.*]] = arith.constant 0 : index
 //      CHECK: %[[RI:.*]] = tensor.collapse_shape %[[INIT]] {{\[}}[0, 1], [2]] : tensor<?x112x16xf32> into tensor<?x16xf32>
 //      CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP3]], #[[$MAP2]]],
 // CHECK-SAME: iterator_types = ["parallel", "parallel"]}
 // CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x16xf32>, tensor<16xf32>) outs(%[[RI]] : tensor<?x16xf32>)
-//      CHECK: %[[RR:.*]] = tensor.expand_shape %[[R]] {{\[}}[0, 1], [2]] : tensor<?x16xf32> into tensor<?x112x16xf32>
+//      CHECK: %[[DIM:.*]] = tensor.dim %[[R]], %[[C0]] : tensor<?x16xf32>
+//      CHECK: %[[VAL_1:.*]] = arith.divui %[[DIM]], %[[C112]] : index
+//      CHECK: %[[RR:.*]] = tensor.expand_shape %[[R]] {{\[\[}}0, 1], [2]] output_shape [%[[VAL_1]], 112, 16] : tensor<?x16xf32> into tensor<?x112x16xf32>
 //      CHECK: return %[[RR]] : tensor<?x112x16xf32>
-func.func @reshape(%A: tensor<?x16xf32>, %B: tensor<16xf32>, %init: tensor<?x112x16xf32>) -> tensor<?x112x16xf32> {
-  %0 = tensor.expand_shape %A [[0, 1], [2]]
+func.func @reshape(%A: tensor<?x16xf32>, %B: tensor<16xf32>, %init: tensor<?x112x16xf32>, %sz0: index) -> tensor<?x112x16xf32> {
+  %0 = tensor.expand_shape %A [[0, 1], [2]] output_shape [%sz0, 112, 16]
       : tensor<?x16xf32> into tensor<?x112x16xf32>
   %2 = linalg.generic {indexing_maps = [
     affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>,
@@ -39,13 +43,13 @@ func.func @reshape(%A: tensor<?x16xf32>, %B: tensor<16xf32>, %init: tensor<?x112
 //      CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP2]], #[[$MAP3]], #[[$MAP2]]],
 // CHECK-SAME: iterator_types = ["parallel", "parallel"]}
 // CHECK-SAME: ins(%[[A]], %[[B]], %[[C]] : tensor<12544x16xf32>, tensor<12544x16xf32>, tensor<16xf32>) outs(%[[RI]] : tensor<12544x16xf32>)
-//      CHECK: %[[RR:.*]] = tensor.expand_shape %[[R]] {{\[}}[0, 1], [2]] : tensor<12544x16xf32> into tensor<112x112x16xf32>
+//      CHECK: %[[RR:.*]] = tensor.expand_shape %[[R]] {{\[}}[0, 1], [2]] output_shape [112, 112, 16] : tensor<12544x16xf32> into tensor<112x112x16xf32>
 //      CHECK: return %[[RR]] : tensor<112x112x16xf32>
 func.func @reshape_multiple(%A: tensor<12544x16xf32>, %B: tensor<12544x16xf32>,
   %C: tensor<16xf32>) -> tensor<112x112x16xf32> {
-  %0 = tensor.expand_shape %A [[0, 1], [2]]
+  %0 = tensor.expand_shape %A [[0, 1], [2]] output_shape [112, 112, 16]
       : tensor<12544x16xf32> into tensor<112x112x16xf32>
-  %1 = tensor.expand_shape %B [[0, 1], [2]]
+  %1 = tensor.expand_shape %B [[0, 1], [2]] output_shape [112, 112, 16]
       : tensor<12544x16xf32> into tensor<112x112x16xf32>
   %2 = tensor.empty() : tensor<112x112x16xf32>
   %3 = linalg.generic {indexing_maps = [
@@ -69,11 +73,11 @@ func.func @reshape_multiple(%A: tensor<12544x16xf32>, %B: tensor<12544x16xf32>,
 // Negative test, since the second source is broadcasted from d1 we cannot merge
 // d0 and d1 dimensions
 // CHECK-LABEL: func @reshape_negative
-// CHECK: tensor.expand_shape {{.*}} : tensor<12544x16xf32> into tensor<112x112x16xf32>
+// CHECK: tensor.expand_shape {{.*}} {{\[\[}}0, 1], [2]] output_shape [112, 112, 16] : tensor<12544x16xf32> into tensor<112x112x16xf32>
 // CHECK: linalg.generic
 // CHECK: } -> tensor<112x112x16xf32>
 func.func @reshape_negative(%A: tensor<12544x16xf32>, %B: tensor<112xf32>) -> tensor<112x112x16xf32> {
-  %20 = tensor.expand_shape %A [[0, 1], [2]]
+  %20 = tensor.expand_shape %A [[0, 1], [2]] output_shape [112, 112, 16]
       : tensor<12544x16xf32> into tensor<112x112x16xf32>
   %21 = tensor.empty() : tensor<112x112x16xf32>
   %22 = linalg.generic {indexing_maps = [
@@ -96,7 +100,7 @@ func.func @type_correctness(%arg0 : tensor<6x5xi32>, %arg1 : tensor<5xf32>,
   %cst_6 = arith.constant 1.000000e+00 : f32
   %cst_7 = arith.constant 7.000000e+00 : f32
   %cst_8 = arith.constant 1.1920929E-7 : f32
-  %25 = tensor.expand_shape %arg0 [[0, 1], [2]]
+  %25 = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [2, 3, 5]
       : tensor<6x5xi32> into tensor<2x3x5xi32>
   %26 = tensor.empty() : tensor<2x3x5xf32>
   %28 = linalg.generic {
diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
index e852824cdb73..667ea3c18c8a 100644
--- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
@@ -565,6 +565,155 @@ func.func @generalize_negf(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>)
 
 // -----
 
+func.func @generalize_reciprocal(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+  linalg.reciprocal ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+  return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_reciprocal
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: %[[one:.+]] = arith.constant 1.000000e+00 : f32
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:  ins(%[[LHS]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK:         ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT:      %[[reciprocal:.+]] = arith.divf %[[one]], %[[BBARG0]] : f32
+// CHECK-NEXT:      linalg.yield %[[reciprocal]] : f32
+
+// -----
+
+func.func @generalize_round(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+  linalg.round ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+  return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_round
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:  ins(%[[LHS]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK:         ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT:      %[[round:.+]] = math.round %[[BBARG0]] : f32
+// CHECK-NEXT:      linalg.yield %[[round]] : f32
+
+// -----
+
+func.func @generalize_sqrt(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+  linalg.sqrt ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+  return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_sqrt
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:  ins(%[[LHS]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK:         ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT:      %[[sqrt:.+]] = math.sqrt %[[BBARG0]] : f32
+// CHECK-NEXT:      linalg.yield %[[sqrt]] : f32
+
+// -----
+
+func.func @generalize_rsqrt(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+  linalg.rsqrt ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+  return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_rsqrt
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:  ins(%[[LHS]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK:         ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT:      %[[rsqrt:.+]] = math.rsqrt %[[BBARG0]] : f32
+// CHECK-NEXT:      linalg.yield %[[rsqrt]] : f32
+
+// -----
+
+func.func @generalize_square(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+  linalg.square ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+  return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_square
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:  ins(%[[LHS]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK:         ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT:      %[[square:.+]] = arith.mulf %[[BBARG0]], %[[BBARG0]] : f32
+// CHECK-NEXT:      linalg.yield %[[square]] : f32
+
+// -----
+
+func.func @generalize_tanh(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+  linalg.tanh ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+  return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_tanh
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:  ins(%[[LHS]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK:         ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT:      %[[tanh:.+]] = math.tanh %[[BBARG0]] : f32
+// CHECK-NEXT:      linalg.yield %[[tanh]] : f32
+
+// -----
+
+func.func @generalize_erf(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+  linalg.erf ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+  return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_erf
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:  ins(%[[LHS]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK:         ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT:      %[[erf:.+]] = math.erf %[[BBARG0]] : f32
+// CHECK-NEXT:      linalg.yield %[[erf]] : f32
+
+// -----
+
 func.func @generalize_max(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>,
                           %out: memref<7x14x21xf32>) {
   linalg.max ins(%lhs, %rhs : memref<7x14x21xf32>, memref<7x14x21xf32>)
@@ -590,6 +739,58 @@ func.func @generalize_max(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>,
 
 // -----
 
+func.func @generalize_min(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>,
+                          %out: memref<7x14x21xf32>) {
+  linalg.min ins(%lhs, %rhs : memref<7x14x21xf32>, memref<7x14x21xf32>)
+             outs(%out : memref<7x14x21xf32>)
+  return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_min
+// CHECK-SAME: (%[[LHS:.+]]: memref<7x14x21xf32>, %[[RHS:.+]]: memref<7x14x21xf32>,
+// CHECK-SAME:  %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:  ins(%[[LHS]], %[[RHS]] : memref<7x14x21xf32>, memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK:         ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32, %[[BBARG2:.+]]: f32)
+// CHECK-NEXT:      %[[min:.+]] = arith.minimumf %[[BBARG0]], %[[BBARG1]] : f32
+// CHECK-NEXT:      linalg.yield %[[min]] : f32
+
+
+// -----
+
+func.func @generalize_powf(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>,
+                          %out: memref<7x14x21xf32>) {
+  linalg.powf ins(%lhs, %rhs : memref<7x14x21xf32>, memref<7x14x21xf32>)
+             outs(%out : memref<7x14x21xf32>)
+  return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_powf
+// CHECK-SAME: (%[[LHS:.+]]: memref<7x14x21xf32>, %[[RHS:.+]]: memref<7x14x21xf32>,
+// CHECK-SAME:  %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:  ins(%[[LHS]], %[[RHS]] : memref<7x14x21xf32>, memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK:         ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32, %[[BBARG2:.+]]: f32)
+// CHECK-NEXT:      %[[powf:.+]] = math.powf %[[BBARG0]], %[[BBARG1]] : f32
+// CHECK-NEXT:      linalg.yield %[[powf]] : f32
+
+
+// -----
+
 
 // CHECK-LABEL: func @fill_tensor
 func.func @fill_tensor(%f: f32, %v: vector<2x4xf32>) -> (tensor<f32>, tensor<vector<2x4xf32>>) {
diff --git a/mlir/test/Dialect/Linalg/match-ops-interpreter.mlir b/mlir/test/Dialect/Linalg/match-ops-interpreter.mlir
index 24c7bdd9e105..4bfed475d44f 100644
--- a/mlir/test/Dialect/Linalg/match-ops-interpreter.mlir
+++ b/mlir/test/Dialect/Linalg/match-ops-interpreter.mlir
@@ -1062,6 +1062,28 @@ module attributes { transform.target_tag = "start_here" } {
     return %result : tensor<10x18x15xf64>
   }
 
+  func.func @convolution_depthwise(%input: tensor<1x10x196x48xf32>, %filter: tensor<1x4x48xf32>) -> tensor<1x10x191x48xf32> {
+    %cst = arith.constant 0.0 : f32 
+    %empty = tensor.empty() : tensor<1x10x191x48xf32>
+    %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<1x10x191x48xf32>) -> tensor<1x10x191x48xf32>
+    // expected-remark @below {{convolution}}
+    // expected-remark @below {{batch dims 0}}
+    // expected-remark @below {{output image dims 1 : i64, 2 : i64}}
+    // expected-remark @below {{output channel dims}}
+    // expected-remark @below {{filter loop dims 4 : i64, 5 : i64}}
+    // expected-remark @below {{input channel dims}}
+    // expected-remark @below {{depth dims 3}}
+    // expected-remark @below {{strides 1 : i64, 1 : i64}}
+    // expected-remark @below {{dilations 1 : i64, 1 : i64}}
+    %result = linalg.depthwise_conv_2d_nhwc_hwc {
+      dilations = dense<1> : tensor<2xi64>,
+      strides = dense<1> : tensor<2xi64>}
+      ins(%input, %filter : tensor<1x10x196x48xf32>, tensor<1x4x48xf32>)
+      outs(%fill : tensor<1x10x191x48xf32>) -> tensor<1x10x191x48xf32>
+
+    return %result : tensor<1x10x191x48xf32>
+  }
+
   func.func @convolution_multi_channel(%input: tensor<2x34x68x16xf32>, %filter: tensor<8x2x3x5x16x16xf32>) -> tensor<8x32x32x16xf32> {
     %cst = arith.constant 0.0 : f32
     %empty = tensor.empty() : tensor<8x32x32x16xf32>
diff --git a/mlir/test/Dialect/Linalg/named-ops-fail.mlir b/mlir/test/Dialect/Linalg/named-ops-fail.mlir
index c351e139a97e..e92a77aa7ad0 100644
--- a/mlir/test/Dialect/Linalg/named-ops-fail.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops-fail.mlir
@@ -176,6 +176,118 @@ func.func @negf_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
 
 // -----
 
+func.func @reciprocal_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+  // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+  linalg.reciprocal ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @reciprocal_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+  // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+  linalg.reciprocal ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @round_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+  // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+  linalg.round ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @round_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+  // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+  linalg.round ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @sqrt_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+  // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+  linalg.sqrt ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @sqrt_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+  // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+  linalg.sqrt ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @rsqrt_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+  // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+  linalg.rsqrt ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @rsqrt_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+  // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+  linalg.rsqrt ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @square_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+  // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+  linalg.square ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @square_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+  // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+  linalg.square ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @tanh_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+  // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+  linalg.tanh ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @tanh_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+  // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+  linalg.tanh ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @erf_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+  // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+  linalg.erf ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @erf_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+  // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+  linalg.erf ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
 func.func @max_type_cast(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf16>, %arg2: memref<4x8x16xf32>) {
   // CHECK: op requires the same type for all operands and results
   linalg.max ins(%arg0, %arg1 : memref<4x8x16xf32>, memref<4x8x16xf16>) outs(%arg2: memref<4x8x16xf32>)
@@ -189,3 +301,36 @@ func.func @max_broadcast(%arg0: memref<8x16xf32>, %arg1: memref<4x8x16xf32>, %ar
   linalg.max ins(%arg0, %arg1 : memref<8x16xf32>, memref<4x8x16xf32>) outs(%arg2: memref<4x8x16xf32>)
   return
 }
+
+// -----
+
+func.func @min_type_cast(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf16>, %arg2: memref<4x8x16xf32>) {
+  // CHECK: op requires the same type for all operands and results
+  linalg.min ins(%arg0, %arg1 : memref<4x8x16xf32>, memref<4x8x16xf16>) outs(%arg2: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @min_broadcast(%arg0: memref<8x16xf32>, %arg1: memref<4x8x16xf32>, %arg2: memref<4x8x16xf32>) {
+  // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+  linalg.min ins(%arg0, %arg1 : memref<8x16xf32>, memref<4x8x16xf32>) outs(%arg2: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @powf_type_cast(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf16>, %arg2: memref<4x8x16xf32>) {
+  // CHECK: op requires the same type for all operands and results
+  linalg.powf ins(%arg0, %arg1 : memref<4x8x16xf32>, memref<4x8x16xf16>) outs(%arg2: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+func.func @powf_broadcast(%arg0: memref<8x16xf32>, %arg1: memref<4x8x16xf32>, %arg2: memref<4x8x16xf32>) {
+  // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+  linalg.powf ins(%arg0, %arg1 : memref<8x16xf32>, memref<4x8x16xf32>) outs(%arg2: memref<4x8x16xf32>)
+  return
+}
+
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
index 7064e1b3f9dc..fefe5578947f 100644
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -1597,6 +1597,223 @@ func.func @negf_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
 
 // -----
 
+// CHECK-LABEL: func @reciprocal_dynamic
+func.func @reciprocal_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+  // CHECK: linalg.reciprocal
+  // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+  linalg.reciprocal ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @reciprocal_static
+func.func @reciprocal_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+  // CHECK: linalg.reciprocal
+  // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+  linalg.reciprocal ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @reciprocal_tensor
+func.func @reciprocal_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+  %0 = tensor.empty() : tensor<4x8x16xf32>
+  // CHECK: linalg.reciprocal
+  // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+  %1 = linalg.reciprocal ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+  return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @round_dynamic
+func.func @round_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+  // CHECK: linalg.round
+  // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+  linalg.round ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @round_static
+func.func @round_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+  // CHECK: linalg.round
+  // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+  linalg.round ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @round_tensor
+func.func @round_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+  %0 = tensor.empty() : tensor<4x8x16xf32>
+  // CHECK: linalg.round
+  // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+  %1 = linalg.round ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+  return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @sqrt_dynamic
+func.func @sqrt_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+  // CHECK: linalg.sqrt
+  // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+  linalg.sqrt ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @sqrt_static
+func.func @sqrt_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+  // CHECK: linalg.sqrt
+  // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+  linalg.sqrt ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @sqrt_tensor
+func.func @sqrt_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+  %0 = tensor.empty() : tensor<4x8x16xf32>
+  // CHECK: linalg.sqrt
+  // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+  %1 = linalg.sqrt ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+  return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @rsqrt_dynamic
+func.func @rsqrt_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+  // CHECK: linalg.rsqrt
+  // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+  linalg.rsqrt ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @rsqrt_static
+func.func @rsqrt_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+  // CHECK: linalg.rsqrt
+  // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+  linalg.rsqrt ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @rsqrt_tensor
+func.func @rsqrt_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+  %0 = tensor.empty() : tensor<4x8x16xf32>
+  // CHECK: linalg.rsqrt
+  // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+  %1 = linalg.rsqrt ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+  return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @square_dynamic
+func.func @square_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+  // CHECK: linalg.square
+  // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+  linalg.square ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @square_static
+func.func @square_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+  // CHECK: linalg.square
+  // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+  linalg.square ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @square_tensor
+func.func @square_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+  %0 = tensor.empty() : tensor<4x8x16xf32>
+  // CHECK: linalg.square
+  // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+  %1 = linalg.square ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+  return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @tanh_dynamic
+func.func @tanh_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+  // CHECK: linalg.tanh
+  // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+  linalg.tanh ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @tanh_static
+func.func @tanh_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+  // CHECK: linalg.tanh
+  // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+  linalg.tanh ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @tanh_tensor
+func.func @tanh_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+  %0 = tensor.empty() : tensor<4x8x16xf32>
+  // CHECK: linalg.tanh
+  // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+  %1 = linalg.tanh ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+  return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @erf_dynamic
+func.func @erf_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+  // CHECK: linalg.erf
+  // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+  linalg.erf ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @erf_static
+func.func @erf_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+  // CHECK: linalg.erf
+  // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+  linalg.erf ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @erf_tensor
+func.func @erf_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+  %0 = tensor.empty() : tensor<4x8x16xf32>
+  // CHECK: linalg.erf
+  // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+  %1 = linalg.erf ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+  return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @max_dynamic
 func.func @max_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: memref<?x?x?xf32>) {
   // CHECK: linalg.max
@@ -1631,6 +1848,74 @@ func.func @max_tensor(%arg0: tensor<4x8x16xf32>, %arg1: tensor<4x8x16xf32>) -> t
 
 // -----
 
+// CHECK-LABEL: func @min_dynamic
+func.func @min_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: memref<?x?x?xf32>) {
+  // CHECK: linalg.min
+  // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<?x?x?xf32>, memref<?x?x?xf32>)
+  // CHECK-SAME: outs(%{{.+}} : memref<?x?x?xf32>)
+  linalg.min ins(%arg0, %arg1 : memref<?x?x?xf32>, memref<?x?x?xf32>) outs(%arg2: memref<?x?x?xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @min_static
+func.func @min_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>, %arg2: memref<4x8x16xf32>) {
+  // CHECK: linalg.min
+  // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<4x8x16xf32>, memref<4x8x16xf32>)
+  // CHECK-SAME: outs(%{{.+}} : memref<4x8x16xf32>)
+  linalg.min ins(%arg0, %arg1 : memref<4x8x16xf32>, memref<4x8x16xf32>) outs(%arg2: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @min_tensor
+func.func @min_tensor(%arg0: tensor<4x8x16xf32>, %arg1: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+  %0 = tensor.empty() : tensor<4x8x16xf32>
+  // CHECK: linalg.min
+  // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<4x8x16xf32>, tensor<4x8x16xf32>)
+  // CHECK-SAME: outs(%{{.+}} : tensor<4x8x16xf32>)
+  %1 = linalg.min ins(%arg0, %arg1 : tensor<4x8x16xf32>, tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+  return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @powf_dynamic
+func.func @powf_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: memref<?x?x?xf32>) {
+  // CHECK: linalg.powf
+  // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<?x?x?xf32>, memref<?x?x?xf32>)
+  // CHECK-SAME: outs(%{{.+}} : memref<?x?x?xf32>)
+  linalg.powf ins(%arg0, %arg1 : memref<?x?x?xf32>, memref<?x?x?xf32>) outs(%arg2: memref<?x?x?xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @powf_static
+func.func @powf_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>, %arg2: memref<4x8x16xf32>) {
+  // CHECK: linalg.powf
+  // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<4x8x16xf32>, memref<4x8x16xf32>)
+  // CHECK-SAME: outs(%{{.+}} : memref<4x8x16xf32>)
+  linalg.powf ins(%arg0, %arg1 : memref<4x8x16xf32>, memref<4x8x16xf32>) outs(%arg2: memref<4x8x16xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @powf_tensor
+func.func @powf_tensor(%arg0: tensor<4x8x16xf32>, %arg1: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+  %0 = tensor.empty() : tensor<4x8x16xf32>
+  // CHECK: linalg.powf
+  // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<4x8x16xf32>, tensor<4x8x16xf32>)
+  // CHECK-SAME: outs(%{{.+}} : tensor<4x8x16xf32>)
+  %1 = linalg.powf ins(%arg0, %arg1 : tensor<4x8x16xf32>, tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+  return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @fill_tensor
 func.func @fill_tensor(%f: f32, %v: vector<2x4xf32>) -> (tensor<f32>, tensor<vector<2x4xf32>>) {
   %e0 = tensor.empty() : tensor<f32>
diff --git a/mlir/test/Dialect/Linalg/reshape_control_fusion.mlir b/mlir/test/Dialect/Linalg/reshape_control_fusion.mlir
index ab948988b7b6..0f0337a3604e 100644
--- a/mlir/test/Dialect/Linalg/reshape_control_fusion.mlir
+++ b/mlir/test/Dialect/Linalg/reshape_control_fusion.mlir
@@ -48,7 +48,7 @@ func.func @control_consumer_reshape_fusion(%arg0 : tensor<1x?x?xf32>, %arg1 : te
       ^bb0(%arg2: f32):
         linalg.yield %cst : f32
       } -> tensor<?x?xf32>
-  %0 = tensor.expand_shape %fill [[0, 1], [2]] : tensor<?x?xf32> into tensor<1x?x?xf32>
+  %0 = tensor.expand_shape %fill [[0, 1], [2]] output_shape [1, %d0, %d1] : tensor<?x?xf32> into tensor<1x?x?xf32>
   %1 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x?x?xf32>, tensor<1x?x?xf32>)
       outs(%0 : tensor<1x?x?xf32>) -> tensor<1x?x?xf32>
   return %1 : tensor<1x?x?xf32>
diff --git a/mlir/test/Dialect/Linalg/reshape_fusion.mlir b/mlir/test/Dialect/Linalg/reshape_fusion.mlir
index 342c067b5c4b..f42666f81bba 100644
--- a/mlir/test/Dialect/Linalg/reshape_fusion.mlir
+++ b/mlir/test/Dialect/Linalg/reshape_fusion.mlir
@@ -30,10 +30,20 @@ func.func @generic_op_reshape_producer_fusion(%arg0 : tensor<?x?x4x?xf32>,
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x4x?xf32>
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
 // CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: f32
-//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]]
-// CHECK-SAME:     [0], [1], [2, 3]
-//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG1]]
-// CHECK-SAME:     [0], [1], [2, 3]
+//      CHECK:   %[[C4:.+]] = arith.constant 4 : index
+//      CHECK:   %[[C2:.+]] = arith.constant 2 : index
+//      CHECK:   %[[C1:.+]] = arith.constant 1 : index
+//      CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//      CHECK:   %[[DIM:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?x?x?xf32>
+//      CHECK:   %[[DIM_0:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor<?x?x?xf32>
+//      CHECK:   %[[DIM_1:.+]] = tensor.dim %[[ARG1]], %[[C2]] : tensor<?x?x?xf32>
+//      CHECK:   %[[VAL_0:.+]] = arith.divui %[[DIM_1]], %[[C4]] : index
+//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0], [1], [2, 3]] output_shape [%[[DIM]], %[[DIM_0]], %[[VAL_0]], 4] : tensor<?x?x?xf32> into tensor<?x?x?x4xf32>
+//      CHECK:   %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?x?x?xf32>
+//      CHECK:   %[[DIM_3:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor<?x?x?xf32>
+//      CHECK:   %[[DIM_4:.+]] = tensor.dim %[[ARG1]], %[[C2]] : tensor<?x?x?xf32>
+//      CHECK:   %[[VAL_1:.+]] = arith.divui %[[DIM_4]], %[[C4]] : index
+//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0], [1], [2, 3]] output_shape [%[[DIM_2]], %[[DIM_3]], %[[VAL_1]], 4] : tensor<?x?x?xf32> into tensor<?x?x?x4xf32>
 //      CHECK:   %[[T3:.+]] = linalg.generic
 // CHECK-SAME:     indexing_maps = [#[[MAP5]], #[[MAP6]], #[[MAP7]], #[[MAP6]]]
 // CHECK-SAME:     ["parallel", "parallel", "parallel", "parallel"]
@@ -50,7 +60,9 @@ func.func @generic_op_reshape_producer_fusion(%arg0 : tensor<?x?x4x?xf32>,
 #map1 = affine_map<(d0, d1) -> ()>
 func.func @generic_op_reshape_consumer_fusion(%arg0 : tensor<?x?xf32>,
                                          %arg1 : tensor<?x?xf32>,
-                                         %arg2 : f32) ->
+                                         %arg2 : f32,
+                                         %sz0: index,
+                                         %sz1: index) ->
                                          tensor<?x4x?x5xf32>
 {
   %0 = linalg.generic {
@@ -63,7 +75,7 @@ func.func @generic_op_reshape_consumer_fusion(%arg0 : tensor<?x?xf32>,
       %2 = arith.addf %1, %arg5 : f32
       linalg.yield %2 : f32
   } -> tensor<?x?xf32>
-  %1 = tensor.expand_shape %0 [[0], [1, 2, 3]] :
+  %1 = tensor.expand_shape %0 [[0], [1, 2, 3]] output_shape [%sz0, 4, %sz1, 5] :
     tensor<?x?xf32> into tensor<?x4x?x5xf32>
   return %1 : tensor<?x4x?x5xf32>
 }
@@ -75,14 +87,22 @@ func.func @generic_op_reshape_consumer_fusion(%arg0 : tensor<?x?xf32>,
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: f32
-//      CHECK:   %[[T0:.+]] = tensor.expand_shape %[[ARG0]]
-// CHECK-SAME:     [0], [1, 2, 3]
-// CHECK-SAME:     tensor<?x?xf32> into tensor<?x4x?x5xf32>
-//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]]
-// CHECK-SAME:     [0], [1, 2, 3]
-//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG0]]
-// CHECK-SAME:     [0], [1, 2, 3]
-// CHECK-SAME:     tensor<?x?xf32> into tensor<?x4x?x5xf32>
+// CHECK-SAME:   %[[SZ0:.+]]: index, %[[SZ1:.+]]: index
+//      CHECK:   %[[C20:.+]] = arith.constant 20 : index
+//      CHECK:   %[[C1:.+]] = arith.constant 1 : index
+//      CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//      CHECK:   %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_0:.+]] = arith.divui %[[DIM_0]], %[[C20]] : index
+//      CHECK:   %[[T0:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM]], 4, %[[VAL_0]], 5] : tensor<?x?xf32> into tensor<?x4x?x5xf32>
+//      CHECK:   %[[DIM_1:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_1:.+]] = arith.divui %[[DIM_2]], %[[C20]] : index
+//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_1]], 4, %[[VAL_1]], 5] : tensor<?x?xf32> into tensor<?x4x?x5xf32>
+//      CHECK:   %[[DIM_4:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_5:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_2:.+]] = arith.divui %[[DIM_5]], %[[C20]] : index
+//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_4]], 4, %[[VAL_2]], 5] : tensor<?x?xf32> into tensor<?x4x?x5xf32>
 //      CHECK:   %[[T3:.+]] = linalg.generic
 // CHECK-SAME:     indexing_maps = [#[[MAP2]], #[[MAP2]], #[[MAP3]], #[[MAP2]]]
 // CHECK-SAME:     ["parallel", "parallel", "parallel", "parallel"]
@@ -94,7 +114,7 @@ func.func @generic_op_reshape_consumer_fusion(%arg0 : tensor<?x?xf32>,
 // -----
 
 func.func @reshape_as_consumer_permutation
-  (%a : tensor<?x?x?xf32>, %b : tensor<?x?xf32>)
+  (%a : tensor<?x?x?xf32>, %b : tensor<?x?xf32>, %sz0: index, %sz1: index, %sz2: index)
     -> tensor<?x2x?x3x4x?xf32> {
   %c = linalg.generic {
          indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>,
@@ -107,8 +127,7 @@ func.func @reshape_as_consumer_permutation
          %1 = arith.addf %arg0, %arg1 : f32
          linalg.yield %1 : f32
        } -> tensor<?x?x?xf32>
-  %d = tensor.expand_shape %c [[0, 1], [2], [3, 4, 5]]
-       : tensor<?x?x?xf32> into tensor<?x2x?x3x4x?xf32>
+  %d = tensor.expand_shape %c [[0, 1], [2], [3, 4, 5]] output_shape [%sz0, 2, %sz1, 3, 4, %sz2] : tensor<?x?x?xf32> into tensor<?x2x?x3x4x?xf32>
   return %d : tensor<?x2x?x3x4x?xf32>
 }
 //  CHECK-DAG: #[[MAP8:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d3, d4, d0, d1, d5)>
@@ -117,15 +136,27 @@ func.func @reshape_as_consumer_permutation
 //      CHECK: func @reshape_as_consumer_permutation
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-//      CHECK:   %[[T0:.+]] = tensor.expand_shape %[[ARG0]]
-// CHECK-SAME:     [0, 1, 2], [3, 4], [5]
-// CHECK-SAME:     tensor<?x?x?xf32> into tensor<3x4x?x?x2x?xf32>
-//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]]
-// CHECK-SAME:     [0, 1, 2], [3]
-// CHECK-SAME:     tensor<?x?xf32> into tensor<3x4x?x?xf32>
-//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG0]]
-// CHECK-SAME:     [0, 1], [2], [3, 4, 5]]
-// CHECK-SAME:     tensor<?x?x?xf32> into tensor<?x2x?x3x4x?xf32>
+// CHECK-SAME:   %[[SZ0:.+]]: index, %[[SZ1:.+]]: index, %[[SZ2:.+]]: index
+//      CHECK:   %[[C12:.+]] = arith.constant 12 : index
+//      CHECK:   %[[C2:.+]] = arith.constant 2 : index
+//      CHECK:   %[[C1:.+]] = arith.constant 1 : index
+//      CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//      CHECK:   %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?x?xf32>
+//      CHECK:   %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?x?xf32>
+//      CHECK:   %[[DIM_1:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor<?x?x?xf32>
+//      CHECK:   %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C12]] : index
+//      CHECK:   %[[VAL_1:.+]] = arith.divui %[[DIM_0]], %[[C2]] : index
+//      CHECK:   %[[T0:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1, 2], [3, 4], [5]] output_shape [3, 4, %[[VAL_0]], %[[VAL_1]], 2, %[[DIM_1]]] : tensor<?x?x?xf32> into tensor<3x4x?x?x2x?xf32>
+//      CHECK:   %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_3:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_2:.+]] = arith.divui %[[DIM_2]], %[[C12]] : index
+//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1, 2], [3]] output_shape [3, 4, %[[VAL_2]], %[[DIM_3]]] : tensor<?x?xf32> into tensor<3x4x?x?xf32>
+//      CHECK:   %[[DIM_5:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?x?xf32>
+//      CHECK:   %[[DIM_6:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?x?xf32>
+//      CHECK:   %[[DIM_7:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor<?x?x?xf32>
+//      CHECK:   %[[VAL_3:.+]] = arith.divui %[[DIM_5]], %[[C2]] : index
+//      CHECK:   %[[VAL_4:.+]] = arith.divui %[[DIM_7]], %[[C12]] : index
+//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2], [3, 4, 5]] output_shape [%[[VAL_3]], 2, %[[DIM_6]], 3, 4, %[[VAL_4]]] : tensor<?x?x?xf32> into tensor<?x2x?x3x4x?xf32>
 //      CHECK:   %[[T3:.+]] = linalg.generic
 // CHECK-SAME:     indexing_maps = [#[[MAP8]], #[[MAP9]], #[[MAP10]]]
 // CHECK-SAME:     ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]
@@ -152,7 +183,7 @@ func.func @generic_op_reshape_consumer_static(%arg0: tensor<264x4xf32>)
       %2 = arith.mulf %arg1, %arg2 : f32
       linalg.yield %2 : f32
     } -> tensor<264x4xf32>
-  %2 = tensor.expand_shape %1 [[0, 1], [2]] :
+  %2 = tensor.expand_shape %1 [[0, 1], [2]] output_shape [8, 33, 4] :
     tensor<264x4xf32> into tensor<8x33x4xf32>
   return %2 : tensor<8x33x4xf32>
 }
@@ -163,12 +194,8 @@ func.func @generic_op_reshape_consumer_static(%arg0: tensor<264x4xf32>)
 //  CHECK-DAG:   %[[CST:.+]] = arith.constant
 // CHECK-SAME:     : tensor<8x33x4xf32>
 //  CHECK-DAG:   %[[INIT:.+]] = tensor.empty()
-//      CHECK:   %[[T0:.+]] = tensor.expand_shape %[[ARG0]]
-// CHECK-SAME:     [0, 1], [2]
-// CHECK-SAME:     tensor<264x4xf32> into tensor<8x33x4xf32>
-//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[INIT]]
-// CHECK-SAME:     [0, 1], [2]
-// CHECK-SAME:     : tensor<264x4xf32> into tensor<8x33x4xf32>
+//      CHECK:   %[[T0:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2]] output_shape [8, 33, 4] : tensor<264x4xf32> into tensor<8x33x4xf32>
+//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1], [2]] output_shape [8, 33, 4] : tensor<264x4xf32> into tensor<8x33x4xf32>
 //      CHECK:   %[[T2:.+]] = linalg.generic
 // CHECK-SAME:     indexing_maps = [#[[MAP2]], #[[MAP2]], #[[MAP2]]]
 // CHECK-SAME:     ["parallel", "parallel", "parallel"]
@@ -232,7 +259,8 @@ func.func @indexed_consumer_reshape_producer_fusion(%arg0 : tensor<?x?x4x?xi32>,
 
 #map0 = affine_map<(d0, d1) -> (d0, d1)>
 func.func @indexed_producer_reshape_consumer_fusion(%arg0 : tensor<?x?xi32>,
-                                         %arg1 : tensor<?x?xi32>) ->
+                                         %arg1 : tensor<?x?xi32>, 
+                                         %sz0: index, %sz1: index) ->
                                          tensor<?x?x4x5xi32>
 {
   %0 = linalg.generic {
@@ -250,7 +278,7 @@ func.func @indexed_producer_reshape_consumer_fusion(%arg0 : tensor<?x?xi32>,
       %5 = arith.addi %3, %4 : i32
       linalg.yield %5 : i32
   } -> tensor<?x?xi32>
-  %1 = tensor.expand_shape %0 [[0], [1, 2, 3]] :
+  %1 = tensor.expand_shape %0 [[0], [1, 2, 3]] output_shape [%sz0, %sz1, 4, 5] :
     tensor<?x?xi32> into tensor<?x?x4x5xi32>
   return %1 : tensor<?x?x4x5xi32>
 }
@@ -302,8 +330,7 @@ func.func @reshape_as_consumer_permutation
          %7 = arith.addi %5, %6 : i32
          linalg.yield %7 : i32
        } -> tensor<6x4x210xi32>
-  %d = tensor.expand_shape %c [[0, 1], [2], [3, 4, 5]]
-       : tensor<6x4x210xi32> into tensor<2x3x4x5x6x7xi32>
+  %d = tensor.expand_shape %c [[0, 1], [2], [3, 4, 5]] output_shape [2, 3, 4, 5, 6, 7] : tensor<6x4x210xi32> into tensor<2x3x4x5x6x7xi32>
   return %d : tensor<2x3x4x5x6x7xi32>
 }
 
@@ -319,13 +346,9 @@ func.func @reshape_as_consumer_permutation
 //  CHECK-SAME:   %[[ARG0:.+]]: tensor<210x6x4xi32>
 //  CHECK-SAME:   %[[ARG1:.+]]: tensor<210x4xi32>
 //   CHECK-DAG:   %[[INIT:.+]] = tensor.empty()
-//   CHECK-DAG:   %[[T1:.+]] = tensor.expand_shape %[[ARG0]]
-//  CHECK-SAME:     [0, 1, 2], [3, 4], [5]
-//   CHECK-DAG:   %[[T2:.+]] = tensor.expand_shape %[[ARG1]]
-//  CHECK-SAME:     [0, 1, 2], [3]
-//   CHECK-DAG:   %[[T3:.+]] = tensor.expand_shape %[[INIT]]
-//  CHECK-SAME:     [0, 1], [2], [3, 4, 5]
-//  CHECK-SAME:     : tensor<6x4x210xi32> into tensor<2x3x4x5x6x7xi32>
+//       CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1, 2], [3, 4], [5]] output_shape [5, 6, 7, 2, 3, 4] : tensor<210x6x4xi32> into tensor<5x6x7x2x3x4xi32>
+//       CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1, 2], [3]] output_shape [5, 6, 7, 4] : tensor<210x4xi32> into tensor<5x6x7x4xi32>
+//       CHECK:   %[[T3:.+]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1], [2], [3, 4, 5]] output_shape [2, 3, 4, 5, 6, 7] : tensor<6x4x210xi32> into tensor<2x3x4x5x6x7xi32>
 //       CHECK:   %[[T4:.+]] = linalg.generic
 //  CHECK-SAME:     indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]
 //  CHECK-SAME:     ins(%[[T1]], %[[T2]] : tensor<5x6x7x2x3x4xi32>, tensor<5x6x7x4xi32>)
@@ -411,7 +434,8 @@ func.func @reshape_as_producer_projected_permutation(
 #map0 = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d1, d0)>
 func.func @generic_op_reshape_consumer_fusion_projected(%arg0 : tensor<?x?xf32>,
-                                                   %arg1 : tensor<?x?xf32>) ->
+                                                   %arg1 : tensor<?x?xf32>,
+                                                   %sz0: index, %sz1: index) ->
                                                    tensor<?x?x4x5xf32>
 {
   %0 = linalg.generic {
@@ -423,7 +447,7 @@ func.func @generic_op_reshape_consumer_fusion_projected(%arg0 : tensor<?x?xf32>,
       %1 = arith.mulf %arg3, %arg4 : f32
       linalg.yield %1 : f32
   } -> tensor<?x?xf32>
-  %1 = tensor.expand_shape %0 [[0], [1, 2, 3]] :
+  %1 = tensor.expand_shape %0 [[0], [1, 2, 3]] output_shape [%sz0, %sz1, 4, 5] :
     tensor<?x?xf32> into tensor<?x?x4x5xf32>
   return %1 : tensor<?x?x4x5xf32>
 }
@@ -433,15 +457,22 @@ func.func @generic_op_reshape_consumer_fusion_projected(%arg0 : tensor<?x?xf32>,
 //      CHECK: func @generic_op_reshape_consumer_fusion_projected
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-//      CHECK:   %[[T0:.+]] = tensor.expand_shape %[[ARG0]]
-// CHECK-SAME:     [0, 1, 2], [3]
-// CHECK-SAME:     tensor<?x?xf32> into tensor<?x4x5x?xf32>
-//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]]
-// CHECK-SAME:     [0, 1, 2], [3]
-// CHECK-SAME:     tensor<?x?xf32> into tensor<?x4x5x?xf32>
-//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG0]]
-// CHECK-SAME:     [0], [1, 2, 3]
-// CHECK-SAME:     tensor<?x?xf32> into tensor<?x?x4x5xf32>
+// CHECK-SAME:   %[[SZ0:.+]]: index, %[[SZ1:.+]]: index
+//      CHECK:   %[[C20:.+]] = arith.constant 20 : index
+//      CHECK:   %[[C1:.+]] = arith.constant 1 : index
+//      CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//      CHECK:   %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C20]] : index
+//      CHECK:   %[[T0:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1, 2], [3]] output_shape [%[[VAL_0]], 4, 5, %[[DIM_0]]] : tensor<?x?xf32> into tensor<?x4x5x?xf32>
+//      CHECK:   %[[DIM_1:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_1:.+]] = arith.divui %[[DIM_1]], %[[C20]] : index
+//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1, 2], [3]] output_shape [%[[VAL_1]], 4, 5, %[[DIM_2]]] : tensor<?x?xf32> into tensor<?x4x5x?xf32>
+//      CHECK:   %[[DIM_4:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_5:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_2:.+]] = arith.divui %[[DIM_5]], %[[C20]] : index
+//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_4]], %[[VAL_2]], 4, 5] : tensor<?x?xf32> into tensor<?x?x4x5xf32>
 //      CHECK:   %[[T3:.+]] = linalg.generic
 // CHECK-SAME:     indexing_maps = [#[[MAP4]], #[[MAP4]], #[[MAP5]]]
 // CHECK-SAME:     ["parallel", "parallel", "parallel", "parallel"]
@@ -466,6 +497,7 @@ func.func @no_fuse_dynamic_dims(%arg0: tensor<?x?xf32>) -> tensor<?xf32> {
     } -> tensor<?xf32>
   return %3 : tensor<?xf32>
 }
+
 //      CHECK: func @no_fuse_dynamic_dims
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xf32>
 //      CHECK:   %[[RESHAPE:.+]] = tensor.collapse_shape %[[ARG0]]
@@ -503,7 +535,8 @@ func.func @no_fuse_mismatched_dynamism(%arg0: tensor<2x1xi64>, %arg1: tensor<?xi
 // -----
 
 func.func @reshape_as_consumer_permutation_with_multiple_results
-  (%a : tensor<?x?x?xf32>, %b : tensor<?x?xf32>)
+  (%a : tensor<?x?x?xf32>, %b : tensor<?x?xf32>, %sz0: index, 
+   %sz1: index, %sz2: index, %sz3: index, %sz4: index)
     -> (tensor<?x2x?x3x4x?xf32>, tensor<?x?x2x3x4x?xf32>) {
   %c:2 = linalg.generic {
          indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>,
@@ -517,10 +550,8 @@ func.func @reshape_as_consumer_permutation_with_multiple_results
          %1 = arith.addf %arg0, %arg1 : f32
          linalg.yield %1, %1 : f32, f32
        } -> (tensor<?x?x?xf32>, tensor<?x?x?xf32>)
-  %d = tensor.expand_shape %c#0 [[0, 1], [2], [3, 4, 5]]
-       : tensor<?x?x?xf32> into tensor<?x2x?x3x4x?xf32>
-  %e = tensor.expand_shape %c#1 [[0], [1, 2], [3, 4, 5]]
-       : tensor<?x?x?xf32> into tensor<?x?x2x3x4x?xf32>
+  %d = tensor.expand_shape %c#0 [[0, 1], [2], [3, 4, 5]] output_shape [%sz0, 2, %sz1, 3, 4, %sz2] : tensor<?x?x?xf32> into tensor<?x2x?x3x4x?xf32>
+  %e = tensor.expand_shape %c#1 [[0], [1, 2], [3, 4, 5]] output_shape [%sz3, %sz4, 2, 3, 4, %sz2] : tensor<?x?x?xf32> into tensor<?x?x2x3x4x?xf32>
   return %d, %e : tensor<?x2x?x3x4x?xf32>, tensor<?x?x2x3x4x?xf32>
 }
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d3, d4, d0, d1, d5)>
@@ -528,17 +559,40 @@ func.func @reshape_as_consumer_permutation_with_multiple_results
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d2, d3, d4)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d5, d0, d1, d2, d3, d4)>
 //      CHECK: func @reshape_as_consumer_permutation_with_multiple_results
-// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?x?xf32>
-// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
-//  CHECK-DAG:  %[[RESHAPE0:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1, 2], [3, 4], [5]{{\]}}
-//  CHECK-DAG:  %[[RESHAPE1:.+]] = tensor.expand_shape %[[ARG1]] {{\[}}[0, 1, 2], [3]{{\]}}
-//  CHECK-DAG:  %[[RESHAPE2:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3, 4, 5]{{\]}}
-//  CHECK-DAG:  %[[RESHAPE3:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2], [3, 4, 5]{{\]}}
-//      CHECK:  %[[GENERIC:.+]]:2 = linalg.generic
-// CHECK-SAME:      indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]], #[[MAP3]]]
-// CHECK-SAME:      ins(%[[RESHAPE0]], %[[RESHAPE1]] :
-// CHECK-SAME:      outs(%[[RESHAPE2]], %[[RESHAPE3]] :
-//      CHECK:  return %[[GENERIC]]#0, %[[GENERIC]]#1
+//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?x?xf32>
+//  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
+//  CHECK-SAME:   %[[SZ0:.+]]: index, %[[SZ1:.+]]: index, %[[SZ2:.+]]: index, %[[SZ3:.+]]: index, %[[SZ4:.+]]: index
+//       CHECK:   %[[C12:.+]] = arith.constant 12 : index
+//       CHECK:   %[[C2:.+]] = arith.constant 2 : index
+//       CHECK:   %[[C1:.+]] = arith.constant 1 : index
+//       CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//       CHECK:   %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?x?xf32>
+//       CHECK:   %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?x?xf32>
+//       CHECK:   %[[DIM_1:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor<?x?x?xf32>
+//       CHECK:   %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C12]] : index
+//       CHECK:   %[[VAL_1:.+]] = arith.divui %[[DIM_0]], %[[C2]] : index
+//       CHECK:   %[[RESHAPE0:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1, 2], [3, 4], [5]] output_shape [3, 4, %[[VAL_0]], %[[VAL_1]], 2, %[[DIM_1]]] : tensor<?x?x?xf32> into tensor<3x4x?x?x2x?xf32>
+//       CHECK:   %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?x?xf32>
+//       CHECK:   %[[DIM_3:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor<?x?xf32>
+//       CHECK:   %[[VAL_2:.+]] = arith.divui %[[DIM_2]], %[[C12]] : index
+//       CHECK:   %[[RESHAPE1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1, 2], [3]] output_shape [3, 4, %[[VAL_2]], %[[DIM_3]]] : tensor<?x?xf32> into tensor<3x4x?x?xf32>
+//       CHECK:   %[[DIM_5:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?x?xf32>
+//       CHECK:   %[[DIM_6:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?x?xf32>
+//       CHECK:   %[[DIM_7:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor<?x?x?xf32>
+//       CHECK:   %[[VAL_3:.+]] = arith.divui %[[DIM_5]], %[[C2]] : index
+//       CHECK:   %[[VAL_4:.+]] = arith.divui %[[DIM_7]], %[[C12]] : index
+//       CHECK:   %[[RESHAPE2:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2], [3, 4, 5]] output_shape [%[[VAL_3]], 2, %[[DIM_6]], 3, 4, %[[VAL_4]]] : tensor<?x?x?xf32> into tensor<?x2x?x3x4x?xf32>
+//       CHECK:   %[[DIM_9:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?x?xf32>
+//       CHECK:   %[[DIM_10:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?x?xf32>
+//       CHECK:   %[[DIM_11:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor<?x?x?xf32>
+//       CHECK:   %[[VAL_5:.+]] = arith.divui %[[DIM_10]], %[[C2]] : index
+//       CHECK:   %[[VAL_6:.+]] = arith.divui %[[DIM_11]], %[[C12]] : index
+//       CHECK:   %[[RESHAPE3:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2], [3, 4, 5]] output_shape [%[[DIM_9]], %[[VAL_5]], 2, 3, 4, %[[VAL_6]]] : tensor<?x?x?xf32> into tensor<?x?x2x3x4x?xf32>
+//       CHECK:   %[[GENERIC:.+]]:2 = linalg.generic
+//  CHECK-SAME:      indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]], #[[MAP3]]]
+//  CHECK-SAME:      ins(%[[RESHAPE0]], %[[RESHAPE1]] :
+//  CHECK-SAME:      outs(%[[RESHAPE2]], %[[RESHAPE3]] :
+//       CHECK:  return %[[GENERIC]]#0, %[[GENERIC]]#1
 
 // -----
 
@@ -556,7 +610,7 @@ module {
         %2 = arith.addf %arg4, %arg5 : f32
         linalg.yield %2, %2 : f32, f32
       } -> (tensor<512xf32>, tensor<200x512xf32>)
-    %1 = tensor.expand_shape %0#1 [[0, 1, 2], [3]] : tensor<200x512xf32> into tensor<25x8x1x512xf32>
+    %1 = tensor.expand_shape %0#1 [[0, 1, 2], [3]] output_shape [25, 8, 1, 512] : tensor<200x512xf32> into tensor<25x8x1x512xf32>
     return %1 : tensor<25x8x1x512xf32>
   }
 }
@@ -567,7 +621,7 @@ module {
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<512xf32>
 // CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<512xf32>
 // CHECK-SAME:     %[[ARG3:[a-zA-Z0-9]+]]: tensor<200x512xf32>
-//      CHECK:   %[[OUTS:.+]] = tensor.expand_shape %[[ARG3]] {{\[}}[0, 1, 2], [3]{{\]}}
+//      CHECK:     %[[OUTS:.+]] = tensor.expand_shape %[[ARG3]] {{\[\[}}0, 1, 2], [3]] output_shape [25, 8, 1, 512] : tensor<200x512xf32> into tensor<25x8x1x512xf32>
 //      CHECK:   %[[GENERIC:.+]]:2 = linalg.generic
 // CHECK-SAME:       indexing_maps = [#[[MAP0]], #[[MAP0]], #[[MAP0]], #[[MAP1]]]
 // CHECK-SAME:       ins(%[[ARG0]], %[[ARG1]] :
@@ -581,7 +635,9 @@ module {
 #map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
 func.func @generic_op_reshape_consumer_fusion_reduction(%arg0 : tensor<?x?xf32>,
                                                         %arg1 : tensor<?x?xf32>,
-                                                        %arg2 : tensor<?x?xf32>) ->
+                                                        %arg2 : tensor<?x?xf32>,
+                                                        %sz0: index,
+                                                        %sz1: index) ->
                                                         tensor<?x?x4x5xf32>
 {
   %0 = linalg.generic {
@@ -593,7 +649,7 @@ func.func @generic_op_reshape_consumer_fusion_reduction(%arg0 : tensor<?x?xf32>,
       %1 = arith.mulf %arg3, %arg4 : f32
       linalg.yield %1 : f32
   } -> tensor<?x?xf32>
-  %1 = tensor.expand_shape %0 [[0], [1, 2, 3]] :
+  %1 = tensor.expand_shape %0 [[0], [1, 2, 3]] output_shape [%sz0, %sz1, 4, 5] :
     tensor<?x?xf32> into tensor<?x?x4x5xf32>
   return %1 : tensor<?x?x4x5xf32>
 }
@@ -605,12 +661,18 @@ func.func @generic_op_reshape_consumer_fusion_reduction(%arg0 : tensor<?x?xf32>,
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]]
-// CHECK-SAME:     [0, 1, 2], [3]
-// CHECK-SAME:     tensor<?x?xf32> into tensor<?x4x5x?xf32>
-//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG2]]
-// CHECK-SAME:     [0], [1, 2, 3]
-// CHECK-SAME:     tensor<?x?xf32> into tensor<?x?x4x5xf32>
+// CHECK-SAME:   %[[SZ0:.+]]: index, %[[SZ1:.+]]: index
+//      CHECK:   %[[C20:.+]] = arith.constant 20 : index
+//      CHECK:   %[[C1:.+]] = arith.constant 1 : index
+//      CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//      CHECK:   %[[DIM:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_0:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C20]] : index
+//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1, 2], [3]] output_shape [%[[VAL_0]], 4, 5, %[[DIM_0]]] : tensor<?x?xf32> into tensor<?x4x5x?xf32>
+//      CHECK:   %[[DIM_1:.+]] = tensor.dim %[[ARG2]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_2:.+]] = tensor.dim %[[ARG2]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_1:.+]] = arith.divui %[[DIM_2]], %[[C20]] : index
+//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG2]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_1]], %[[VAL_1]], 4, 5] : tensor<?x?xf32> into tensor<?x?x4x5xf32>
 //      CHECK:   %[[T3:.+]] = linalg.generic
 // CHECK-SAME:     indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]
 // CHECK-SAME:     ["parallel", "parallel", "parallel", "parallel", "reduction"]
@@ -650,10 +712,21 @@ func.func @generic_op_reshape_producer_fusion_with_reduction(%arg0 : tensor<?x7x
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x7x?x8xf32>
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x4x?xf32>
 // CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]]
-// CHECK-SAME:     [0, 1], [2], [3, 4]
-//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG2]]
-// CHECK-SAME:     [0, 1], [2, 3]
+//      CHECK:   %[[C1:.+]] = arith.constant 1 : index
+//      CHECK:   %[[C7:.+]] = arith.constant 7 : index
+//      CHECK:   %[[C8:.+]] = arith.constant 8 : index
+//      CHECK:   %[[C2:.+]] = arith.constant 2 : index
+//      CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//      CHECK:   %[[DIM:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?x4x?xf32>
+//      CHECK:   %[[DIM_0:.+]] = tensor.dim %[[ARG1]], %[[C2]] : tensor<?x4x?xf32>
+//      CHECK:   %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C8]] : index
+//      CHECK:   %[[VAL_1:.+]] = arith.divui %[[DIM_0]], %[[C7]] : index
+//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1], [2], [3, 4]] output_shape [%[[VAL_0]], 8, 4, %[[VAL_1]], 7] : tensor<?x4x?xf32> into tensor<?x8x4x?x7xf32>
+//      CHECK:   %[[DIM_1:.+]] = tensor.dim %[[ARG2]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_2:.+]] = tensor.dim %[[ARG2]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_2:.+]] = arith.divui %[[DIM_1]], %[[C8]] : index
+//      CHECK:   %[[VAL_3:.+]] = arith.divui %[[DIM_2]], %[[C7]] : index
+//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG2]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_2]], 8, %[[VAL_3]], 7] : tensor<?x?xf32> into tensor<?x8x?x7xf32>
 //      CHECK:   %[[T3:.+]] = linalg.generic
 // CHECK-SAME:     indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]]
 // CHECK-SAME:     ["parallel", "parallel", "reduction", "parallel", "parallel"]
@@ -668,12 +741,14 @@ func.func @generic_op_reshape_producer_fusion_with_reduction(%arg0 : tensor<?x7x
 
 func.func @linalg_add_reshape_consumer_fusion(%arg0 : tensor<?x?xf32>,
                                               %arg1 : tensor<?x?xf32>,
-                                              %arg2 : tensor<?x?xf32>) ->
+                                              %arg2 : tensor<?x?xf32>,
+                                              %sz0: index,
+                                              %sz1: index) ->
                                               tensor<?x?x4x5xf32>
 {
   %0 = linalg.add ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
        outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  %1 = tensor.expand_shape %0 [[0], [1, 2, 3]] :
+  %1 = tensor.expand_shape %0 [[0], [1, 2, 3]] output_shape [%sz0, %sz1, 4, 5] :
     tensor<?x?xf32> into tensor<?x?x4x5xf32>
   return %1 : tensor<?x?x4x5xf32>
 }
@@ -683,15 +758,22 @@ func.func @linalg_add_reshape_consumer_fusion(%arg0 : tensor<?x?xf32>,
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG0]]
-// CHECK-SAME:     [0], [1, 2, 3]
-// CHECK-SAME:     tensor<?x?xf32> into tensor<?x?x4x5xf32>
-//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG1]]
-// CHECK-SAME:     [0], [1, 2, 3]
-// CHECK-SAME:     tensor<?x?xf32> into tensor<?x?x4x5xf32>
-//      CHECK:   %[[T3:.+]] = tensor.expand_shape %[[ARG2]]
-// CHECK-SAME:     [0], [1, 2, 3]
-// CHECK-SAME:     tensor<?x?xf32> into tensor<?x?x4x5xf32>
+// CHECK-SAME:   %[[SZ0:.+]]: index, %[[SZ1:.+]]: index
+//      CHECK:   %[[C20:.+]] = arith.constant 20 : index
+//      CHECK:   %[[C1:.+]] = arith.constant 1 : index
+//      CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//      CHECK:   %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_0:.+]] = arith.divui %[[DIM_0]], %[[C20]] : index
+//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM]], %[[VAL_0]], 4, 5] : tensor<?x?xf32> into tensor<?x?x4x5xf32>
+//      CHECK:   %[[DIM_1:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_1:.+]] = arith.divui %[[DIM_2]], %[[C20]] : index
+//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_1]], %[[VAL_1]], 4, 5] : tensor<?x?xf32> into tensor<?x?x4x5xf32>
+//      CHECK:   %[[DIM_4:.+]] = tensor.dim %[[ARG2]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_5:.+]] = tensor.dim %[[ARG2]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_2:.+]] = arith.divui %[[DIM_5]], %[[C20]] : index
+//      CHECK:   %[[T3:.+]] = tensor.expand_shape %[[ARG2]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_4]], %[[VAL_2]], 4, 5] : tensor<?x?xf32> into tensor<?x?x4x5xf32>
 //      CHECK:   %[[T4:.+]] = linalg.generic
 // CHECK-SAME:     indexing_maps = [#[[MAP]], #[[MAP]], #[[MAP]]]
 // CHECK-SAME:     ["parallel", "parallel", "parallel", "parallel"]
@@ -721,10 +803,20 @@ func.func @linalg_add_reshape_producer_fusion(%arg0 : tensor<?x7x?x8xf32>,
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x7x?x8xf32>
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]]
-// CHECK-SAME:     [0, 1], [2, 3]
-//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG2]]
-// CHECK-SAME:     [0, 1], [2, 3]
+//      CHECK:   %[[C8:.+]] = arith.constant 8 : index
+//      CHECK:   %[[C7:.+]] = arith.constant 7 : index
+//      CHECK:   %[[C1:.+]] = arith.constant 1 : index
+//      CHECK:   %[[C0:.+]] = arith.constant 0 : index
+//      CHECK:   %[[DIM:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_0:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C7]] : index
+//      CHECK:   %[[VAL_1:.+]] = arith.divui %[[DIM_0]], %[[C8]] : index
+//      CHECK:   %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_0]], 7, %[[VAL_1]], 8] : tensor<?x?xf32> into tensor<?x7x?x8xf32>
+//      CHECK:   %[[DIM_1:.+]] = tensor.dim %[[ARG2]], %[[C0]] : tensor<?x?xf32>
+//      CHECK:   %[[DIM_2:.+]] = tensor.dim %[[ARG2]], %[[C1]] : tensor<?x?xf32>
+//      CHECK:   %[[VAL_2:.+]] = arith.divui %[[DIM_1]], %[[C7]] : index
+//      CHECK:   %[[VAL_3:.+]] = arith.divui %[[DIM_2]], %[[C8]] : index
+//      CHECK:   %[[T2:.+]] = tensor.expand_shape %[[ARG2]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_2]], 7, %[[VAL_3]], 8] : tensor<?x?xf32> into tensor<?x7x?x8xf32>
 //      CHECK:   %[[T3:.+]] = linalg.generic
 // CHECK-SAME:     indexing_maps = [#[[$MAP]], #[[$MAP]], #[[$MAP]]]
 // CHECK-SAME:     ["parallel", "parallel", "parallel", "parallel"]
diff --git a/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir b/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir
index 4262cd23e746..8fb84248c961 100644
--- a/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir
+++ b/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir
@@ -199,13 +199,12 @@ func.func @empty_tensor_dim_of_linalg_result(%arg_0 : tensor<?xf32>,
 
 // -----
 
-func.func @dim_reshape_expansion(%arg0 : tensor<6x5x?xf32>) -> (index, index, index)
+func.func @dim_reshape_expansion(%arg0 : tensor<6x5x?xf32>, %sz0: index) -> (index, index, index)
 {
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
   %c4 = arith.constant 4 : index
-  %0 = tensor.expand_shape %arg0 [[0, 1], [2], [3, 4, 5]]
-      : tensor<6x5x?xf32> into tensor<2x3x5x4x?x7xf32>
+  %0 = tensor.expand_shape %arg0 [[0, 1], [2], [3, 4, 5]] output_shape [2, 3, 5, 4, %sz0, 7] : tensor<6x5x?xf32> into tensor<2x3x5x4x?x7xf32>
   %1 = tensor.dim %0, %c1 : tensor<2x3x5x4x?x7xf32>
   %2 = tensor.dim %0, %c3 : tensor<2x3x5x4x?x7xf32>
   %3 = tensor.dim %0, %c4 : tensor<2x3x5x4x?x7xf32>
diff --git a/mlir/test/Dialect/Linalg/runtime-verification.mlir b/mlir/test/Dialect/Linalg/runtime-verification.mlir
new file mode 100644
index 000000000000..a4f29d8457e5
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/runtime-verification.mlir
@@ -0,0 +1,43 @@
+// RUN: mlir-opt %s -generate-runtime-verification | FileCheck %s
+
+// Most of the tests for linalg runtime-verification are implemented as integration tests.
+
+#identity = affine_map<(d0) -> (d0)>
+
+// CHECK-LABEL: @static_dims
+func.func @static_dims(%arg0: tensor<5xf32>, %arg1: tensor<5xf32>) -> (tensor<5xf32>) {
+    // CHECK: %[[TRUE:.*]] = index.bool.constant true
+    // CHECK: cf.assert %[[TRUE]]
+    %result = tensor.empty() : tensor<5xf32> 
+    %0 = linalg.generic {
+      indexing_maps = [#identity, #identity, #identity],
+      iterator_types = ["parallel"]
+    } ins(%arg0, %arg1 : tensor<5xf32>, tensor<5xf32>)
+      outs(%result : tensor<5xf32>) {
+      ^bb0(%gen_arg1: f32, %gen_arg2: f32, %out: f32) :
+        %tmp1 = arith.addf %gen_arg1, %gen_arg2 : f32
+        linalg.yield %tmp1 : f32
+    } -> tensor<5xf32>
+    return %0 : tensor<5xf32>
+}
+
+// -----
+
+#map = affine_map<() -> ()>
+
+// CHECK-LABEL: @scalars
+func.func @scalars(%arg0: tensor<f32>, %arg1: tensor<f32>) -> (tensor<f32>) {
+    // No runtime checks are required if the operands are all scalars
+    // CHECK-NOT: cf.assert
+    %result = tensor.empty() : tensor<f32> 
+    %0 = linalg.generic {
+      indexing_maps = [#map, #map, #map],
+      iterator_types = []
+    } ins(%arg0, %arg1 : tensor<f32>, tensor<f32>)
+      outs(%result : tensor<f32>) {
+      ^bb0(%gen_arg1: f32, %gen_arg2: f32, %out: f32) :
+        %tmp1 = arith.addf %gen_arg1, %gen_arg2 : f32
+        linalg.yield %tmp1 : f32
+    } -> tensor<f32>
+    return %0 : tensor<f32>
+}
diff --git a/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir b/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir
new file mode 100644
index 000000000000..7f3997633a30
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir
@@ -0,0 +1,83 @@
+// RUN: mlir-opt %s --transform-interpreter --split-input-file -resolve-shaped-type-result-dims -canonicalize | FileCheck %s
+
+// Demonstrates what happens when peeling the 4th loop (that corresponds to the
+// "depth" dimension in depthwise convs) followed by vectorization in the
+// presence of _scalable_ vectors (these are introduced through scalable
+// tiling). The main goal is to verify that canonicalizations fold away the
+// masks in the main loop.
+
+func.func @conv(%arg0: tensor<1x1080x1962x48xi32>, %arg1: tensor<1x43x48xi32>) -> tensor<1x1080x1920x48xi32> {
+// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (-(48 mod s0) + 48)>
+
+// CHECK-LABEL:   func.func @conv(
+// CHECK-DAG:       %[[C_43:.*]] = arith.constant 43 : index
+// CHECK-DAG:       %[[C_48:.*]] = arith.constant 48 : index
+// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[C4:.*]] = arith.constant 4 : index
+// CHECK:           %[[VSCALE:.*]] = vector.vscale
+// CHECK:           %[[VSCALE_X_4:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
+
+// Loop over the channel/depth dim - the main part after vectorisation (vectorized, no masking)
+// CHECK:               %[[UB_DEPTH_LOOP:.*]] = affine.apply #[[$MAP]](){{\[}}%[[VSCALE_X_4]]]
+// CHECK-NEXT:          %[[VAL_21:.*]] = scf.for {{.*}} to %[[UB_DEPTH_LOOP]] step %[[VSCALE_X_4]]
+// Loop over the Filter width dim
+// CHECK:                 scf.for %{{.*}} = %[[C0]] to %[[C_43]] step %[[C1]] {{.*}} -> (tensor<1x1x4x?xi32>) {
+// CHECK-NOT:               vector.mask
+// CHECK:                   vector.broadcast {{.*}} : vector<[4]xi32> to vector<1x4x[4]xi32>
+// CHECK-NEXT:              arith.muli {{.*}} : vector<1x4x[4]xi32>
+// CHECK-NEXT:              arith.addi {{.*}} : vector<1x4x[4]xi32>
+// CHECK-NOT:               vector.mask
+// CHECK:                   scf.yield {{.*}} : tensor<1x1x4x?xi32>
+// CHECK:                 }
+// CHECK:                 tensor.insert_slice {{.*}}  tensor<1x1x4x?xi32> into tensor<1x1080x1920x48xi32>
+// CHECK:                 scf.yield {{.*}} : tensor<1x1080x1920x48xi32>
+
+// CHECK-NEXT:          }
+
+// Loop over the channel/depth dim - the remainder part (not vectorized)
+// CHECK:               scf.for {{.*}} to %[[C_48]] step %[[VSCALE_X_4]]
+// Loop over the Filter width dim
+// CHECK:                 scf.for %{{.*}} = %[[C0]] to %[[C_43]] step %[[C1]] {{.*}} -> (tensor<1x1x4x?xi32>) {
+// CHECK:                   linalg.depthwise_conv_1d_nwc_wc {{.*}} -> tensor<1x4x?xi32>
+// CHECK:                   scf.yield %{{.*}} : tensor<1x1x4x?xi32>
+// CHECK:                 }
+// CHECK:                 tensor.insert_slice {{.*}} tensor<1x1x4x?xi32> into tensor<1x1080x1920x48xi32>
+// CHECK-NEXT:            scf.yield %{{.*}} : tensor<1x1080x1920x48xi32>
+// CHECK-NEXT:          }
+
+
+  %0 = tensor.empty() : tensor<1x1080x1920x48xi32>
+  %c0_i32 = arith.constant 0 : i32
+  %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x1080x1920x48xi32>) -> tensor<1x1080x1920x48xi32>
+  %2 = linalg.depthwise_conv_2d_nhwc_hwc {
+    dilations = dense<1> : tensor<2xi64>,
+    strides = dense<1> : tensor<2xi64>}
+    ins(%arg0, %arg1 : tensor<1x1080x1962x48xi32>, tensor<1x43x48xi32>) outs(%1 : tensor<1x1080x1920x48xi32>) -> tensor<1x1080x1920x48xi32>
+  return %2 : tensor<1x1080x1920x48xi32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.consume}) {
+    // 1. Tile parallel dims
+    %1 = transform.structured.match ops{["linalg.depthwise_conv_2d_nhwc_hwc"]} in %root : (!transform.any_op) -> !transform.any_op
+    %tiled_linalg_op_0, %loops_1:4 = transform.structured.tile_using_for %1[1, 1, 4, [4], 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">, !transform.op<"scf.for">, !transform.op<"scf.for">, !transform.op<"scf.for">)
+
+    // 2. Tile reduction dims
+    %2 = transform.structured.match ops{["linalg.depthwise_conv_2d_nhwc_hwc"]} in %loops_1#3 : (!transform.op<"scf.for">) -> !transform.any_op
+    %tiled_linalg_op_1, %loops_2:2 = transform.structured.tile_using_for %2[0, 0, 0, 0, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    // 3. Decompose 2D conv into 2 x 1D conv
+    %3 = transform.structured.match ops{["linalg.depthwise_conv_2d_nhwc_hwc"]} in %loops_1#3 : (!transform.op<"scf.for">) -> !transform.any_op
+    %4 = transform.structured.decompose %3 : (!transform.any_op) -> !transform.any_op
+
+    // 4. Apply loop peeling - only the 4th loop
+    %main_loop, %remainder_loop = transform.loop.peel %loops_1#3 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">, !transform.op<"scf.for">)
+    %5 = transform.structured.match ops{["linalg.depthwise_conv_1d_nwc_wc"]} in %main_loop : (!transform.op<"scf.for">) -> !transform.any_op
+
+    // 5. Vectorize, but only the main loop
+    transform.structured.vectorize %5 vector_sizes [2, 4, [4], 16] : !transform.any_op
+
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir b/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir
index 006d6105677e..31e9fd00cffa 100644
--- a/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir
@@ -13,8 +13,8 @@ func.func @matmul_split(%A : tensor<16x256xf32>, %B: tensor<256x32xf32>, %C: ten
 //  CHECK-DAG: #[[$MAP4:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)>
 //  CHECK-LABEL: @matmul_split
 //  CHECK-DAG: %[[ID:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] : tensor<16x256xf32> into tensor<16x4x64xf32>
-//  CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] : tensor<256x32xf32> into tensor<4x64x32xf32>
+//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] output_shape [16, 4, 64] : tensor<16x256xf32> into tensor<16x4x64xf32>
+//  CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] output_shape [4, 64, 32] : tensor<256x32xf32> into tensor<4x64x32xf32>
 //  CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<16x32x4xf32>
 //      CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<16x32x4xf32>) -> tensor<16x32x4xf32>
 //      CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]]
@@ -65,7 +65,7 @@ func.func @generic_split_1d(%arg0: tensor<32xf32>, %arg1: tensor<f32>, %out: ten
 //  CHECK-DAG: #[[$MAP4:.*]] = affine_map<(d0) -> ()>
 //CHECK-LABEL: @generic_split_1d
 //  CHECK-DAG: %[[ID:.*]] = arith.constant 1.000000e+00 : f32
-//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1]] : tensor<32xf32> into tensor<4x8xf32>
+//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1]] output_shape [4, 8] : tensor<32xf32> into tensor<4x8xf32>
 //  CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<4xf32>
 //      CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>) -> tensor<4xf32>
 //      CHECK: %[[G:.*]] = linalg.generic
@@ -119,8 +119,8 @@ func.func @generic_split_3d(%input: tensor<32x2xf32>, %input_2: tensor<5x32xf32>
 //  CHECK-DAG: #[[$MAP4:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)>
 // CHECK-LABEL:  func @generic_split_3d
 //  CHECK-DAG: %[[ID:.*]] = arith.constant 0xFF800000 : f32
-//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] : tensor<32x2xf32> into tensor<4x8x2xf32>
-//  CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] : tensor<5x32xf32> into tensor<5x4x8xf32>
+//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] output_shape [4, 8, 2] : tensor<32x2xf32> into tensor<4x8x2xf32>
+//  CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] output_shape [5, 4, 8] : tensor<5x32xf32> into tensor<5x4x8xf32>
 //  CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<5x2x4xf32>
 //      CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<5x2x4xf32>) -> tensor<5x2x4xf32>
 //      CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "reduction", "parallel", "parallel"]}
@@ -177,8 +177,8 @@ func.func @generic_split_3d_ninf(%input: tensor<32x2xf32>, %input_2: tensor<5x32
 //  CHECK-DAG: #[[$MAP4:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)>
 // CHECK-LABEL:  func @generic_split_3d_ninf
 //  CHECK-DAG: %[[ID:.*]] = arith.constant -3.40282347E+38 : f32
-//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] : tensor<32x2xf32> into tensor<4x8x2xf32>
-//  CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] : tensor<5x32xf32> into tensor<5x4x8xf32>
+//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] output_shape [4, 8, 2] : tensor<32x2xf32> into tensor<4x8x2xf32>
+//  CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] output_shape [5, 4, 8] : tensor<5x32xf32> into tensor<5x4x8xf32>
 //  CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<5x2x4xf32>
 //      CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<5x2x4xf32>) -> tensor<5x2x4xf32>
 //      CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "reduction", "parallel", "parallel"]}
@@ -218,8 +218,8 @@ func.func @matmul_split(%A : tensor<16x256xf32>, %B: tensor<256x32xf32>, %C: ten
 //  CHECK-DAG: #[[$MAP4:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)>
 //  CHECK-LABEL: @matmul_split
 //  CHECK-DAG: %[[ID:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] : tensor<16x256xf32> into tensor<16x64x4xf32>
-//  CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] : tensor<256x32xf32> into tensor<64x4x32xf32>
+//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] output_shape [16, 64, 4] : tensor<16x256xf32> into tensor<16x64x4xf32>
+//  CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] output_shape [64, 4, 32] : tensor<256x32xf32> into tensor<64x4x32xf32>
 //  CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<16x32x4xf32>
 //      CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<16x32x4xf32>) -> tensor<16x32x4xf32>
 //      CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]]
@@ -270,7 +270,7 @@ func.func @generic_split_1d(%arg0: tensor<32xf32>, %arg1: tensor<f32>, %out: ten
 //  CHECK-DAG: #[[$MAP4:.*]] = affine_map<(d0) -> ()>
 //CHECK-LABEL: @generic_split_1d
 //  CHECK-DAG: %[[ID:.*]] = arith.constant 1.000000e+00 : f32
-//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1]] : tensor<32xf32> into tensor<8x4xf32>
+//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1]] output_shape [8, 4] : tensor<32xf32> into tensor<8x4xf32>
 //  CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<4xf32>
 //      CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>) -> tensor<4xf32>
 //      CHECK: %[[G:.*]] = linalg.generic
@@ -324,8 +324,8 @@ func.func @generic_split_3d(%input: tensor<32x2xf32>, %input_2: tensor<5x32xf32>
 //  CHECK-DAG: #[[$MAP4:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)>
 // CHECK-LABEL:  func @generic_split_3d
 //  CHECK-DAG: %[[ID:.*]] = arith.constant 0x7F800000 : f32
-//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] : tensor<32x2xf32> into tensor<8x4x2xf32>
-//  CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] : tensor<5x32xf32> into tensor<5x8x4xf32>
+//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] output_shape [8, 4, 2] : tensor<32x2xf32> into tensor<8x4x2xf32>
+//  CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] output_shape [5, 8, 4] : tensor<5x32xf32> into tensor<5x8x4xf32>
 //  CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<5x2x4xf32>
 //      CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<5x2x4xf32>) -> tensor<5x2x4xf32>
 //      CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "reduction", "parallel", "parallel"]}
@@ -382,8 +382,8 @@ func.func @generic_split_3d(%input: tensor<32x2xf32>, %input_2: tensor<5x32xf32>
 //  CHECK-DAG: #[[$MAP4:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)>
 // CHECK-LABEL:  func @generic_split_3d
 //  CHECK-DAG: %[[ID:.*]] = arith.constant 3.40282347E+38 : f32
-//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] : tensor<32x2xf32> into tensor<8x4x2xf32>
-//  CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] : tensor<5x32xf32> into tensor<5x8x4xf32>
+//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] output_shape [8, 4, 2] : tensor<32x2xf32> into tensor<8x4x2xf32>
+//  CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] output_shape [5, 8, 4] : tensor<5x32xf32> into tensor<5x8x4xf32>
 //  CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<5x2x4xf32>
 //      CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<5x2x4xf32>) -> tensor<5x2x4xf32>
 //      CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "reduction", "parallel", "parallel"]}
diff --git a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
index 58d4b21ea2dd..d7ff1ded9d93 100644
--- a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
@@ -1710,10 +1710,12 @@ module attributes {transform.with_named_sequence} {
 #map = affine_map<(d0) -> (d0)>
 // CHECK-LABEL:   @not_vectorizable
 func.func @not_vectorizable(%arg0: tensor<1x?xf32>, %arg1: index, %arg2: index, %arg3: index) -> tensor<1x128xf32> {
+  %c0 = arith.constant 0 : index
   %0 = tensor.empty() : tensor<1x128xf32>
   %1 = scf.for %arg5 = %arg2 to %arg1 step %arg3 iter_args(%arg6 = %0) -> (tensor<1x128xf32>) {
     %extracted_slice = tensor.extract_slice %arg6[0, 0] [1, %arg1] [1, 1] : tensor<1x128xf32> to tensor<?xf32>
-    %expanded = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<?xf32> into tensor<1x?xf32>
+    %sz0 = tensor.dim %extracted_slice, %c0 : tensor<?xf32>
+    %expanded = tensor.expand_shape %extracted_slice [[0, 1]] output_shape [1, %sz0] : tensor<?xf32> into tensor<1x?xf32>
     %extracted_slice_0 = tensor.extract_slice %arg0[0, %arg3] [1, %arg2] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
     %extracted_slice_1 = tensor.extract_slice %expanded[0, %arg3] [1, %arg2] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
     %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%extracted_slice_0 : tensor<?xf32>) outs(%extracted_slice_1 : tensor<?xf32>) {
diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir
index 506ed1f1c10b..f442a61dc31e 100644
--- a/mlir/test/Dialect/MemRef/canonicalize.mlir
+++ b/mlir/test/Dialect/MemRef/canonicalize.mlir
@@ -13,7 +13,7 @@ func.func @collapse_shape_identity_fold(%arg0 : memref<5xi8>) -> memref<5xi8> {
 // CHECK-LABEL: expand_shape_identity_fold
 // CHECK-NEXT: return
 func.func @expand_shape_identity_fold(%arg0 : memref<5x4xi8>) -> memref<5x4xi8> {
-  %0 = memref.expand_shape %arg0 [[0], [1]] : memref<5x4xi8> into memref<5x4xi8>
+  %0 = memref.expand_shape %arg0 [[0], [1]] output_shape [5, 4] : memref<5x4xi8> into memref<5x4xi8>
   return %0 : memref<5x4xi8>
 }
 
@@ -23,7 +23,7 @@ func.func @expand_shape_identity_fold(%arg0 : memref<5x4xi8>) -> memref<5x4xi8>
 // CHECK-NEXT: return
 func.func @collapse_expand_rank0_cancel(%arg0 : memref<1x1xi8>) -> memref<1x1xi8> {
   %0 = memref.collapse_shape %arg0 [] : memref<1x1xi8> into memref<i8>
-  %1 = memref.expand_shape %0 [] : memref<i8> into memref<1x1xi8>
+  %1 = memref.expand_shape %0 [] output_shape [1, 1] : memref<i8> into memref<1x1xi8>
   return %1 : memref<1x1xi8>
 }
 
@@ -455,9 +455,9 @@ func.func @compose_collapse_of_collapse(%arg0 : memref<?x?x?x?x?xf32>)
 // -----
 
 func.func @do_not_compose_collapse_of_expand_non_identity_layout(
-    %arg0: memref<?x?xf32, strided<[?, 1], offset: 0>>)
+    %arg0: memref<?x?xf32, strided<[?, 1], offset: 0>>, %sz0: index, %sz1: index)
     -> memref<?xf32, strided<[?], offset: 0>> {
-  %1 = memref.expand_shape %arg0 [[0, 1], [2]] :
+  %1 = memref.expand_shape %arg0 [[0, 1], [2]] output_shape [%sz0, 4, %sz1] :
     memref<?x?xf32, strided<[?, 1], offset: 0>> into
     memref<?x4x?xf32, strided<[?, ?, 1], offset: 0>>
   %2 = memref.collapse_shape %1 [[0, 1, 2]] :
@@ -471,35 +471,34 @@ func.func @do_not_compose_collapse_of_expand_non_identity_layout(
 
 // -----
 
-func.func @compose_expand_of_expand(%arg0 : memref<?x?xf32>)
+func.func @compose_expand_of_expand(%arg0 : memref<?x?xf32>, %sz0: index, %sz1: index, %sz2: index, %sz3: index)
     -> memref<?x6x4x5x?xf32> {
-  %0 = memref.expand_shape %arg0 [[0, 1], [2]]
+  %0 = memref.expand_shape %arg0 [[0, 1], [2]] output_shape [%sz0, 4, %sz1]
       : memref<?x?xf32> into memref<?x4x?xf32>
-  %1 = memref.expand_shape %0 [[0, 1], [2], [3, 4]]
-      : memref<?x4x?xf32> into memref<?x6x4x5x?xf32>
+  %1 = memref.expand_shape %0 [[0, 1], [2], [3, 4]] output_shape [%sz2, 6, 4, 5, %sz3] : memref<?x4x?xf32> into memref<?x6x4x5x?xf32>
   return %1 : memref<?x6x4x5x?xf32>
 }
 // CHECK-LABEL: func @compose_expand_of_expand
-//       CHECK:   memref.expand_shape %{{.*}} {{\[}}[0, 1, 2], [3, 4]]
+//       CHECK:   memref.expand_shape %{{.*}} {{\[}}[0, 1, 2], [3, 4]] output_shape [%{{.*}}, 6, 4, 5, %{{.*}}]
 //   CHECK-NOT:   memref.expand_shape
 
 // -----
 
 func.func @compose_expand_of_expand_of_zero_dim(%arg0 : memref<f32>)
     -> memref<1x1x1xf32> {
-  %0 = memref.expand_shape %arg0 [] : memref<f32> into memref<1xf32>
-  %1 = memref.expand_shape %0 [[0, 1, 2]]
+  %0 = memref.expand_shape %arg0 [] output_shape [1] : memref<f32> into memref<1xf32>
+  %1 = memref.expand_shape %0 [[0, 1, 2]] output_shape [1, 1, 1]
       : memref<1xf32> into memref<1x1x1xf32>
   return %1 : memref<1x1x1xf32>
 }
 // CHECK-LABEL: func @compose_expand_of_expand_of_zero_dim
-//       CHECK:   memref.expand_shape %{{.*}} []
+//       CHECK:   memref.expand_shape %{{.*}} [] output_shape [1, 1, 1]
 //  CHECK-SAME:     memref<f32> into memref<1x1x1xf32>
 
 // -----
 
 func.func @fold_collapse_of_expand(%arg0 : memref<12x4xf32>) -> memref<12x4xf32> {
-  %0 = memref.expand_shape %arg0 [[0, 1], [2]]
+  %0 = memref.expand_shape %arg0 [[0, 1], [2]] output_shape [3, 4, 4]
       : memref<12x4xf32> into memref<3x4x4xf32>
   %1 = memref.collapse_shape %0 [[0, 1], [2]]
       : memref<3x4x4xf32> into memref<12x4xf32>
@@ -510,9 +509,9 @@ func.func @fold_collapse_of_expand(%arg0 : memref<12x4xf32>) -> memref<12x4xf32>
 
 // -----
 
-func.func @fold_collapse_collapse_of_expand(%arg0 : memref<?x?xf32>)
+func.func @fold_collapse_collapse_of_expand(%arg0 : memref<?x?xf32>, %sz0: index, %sz1: index)
     -> memref<?x?xf32> {
-  %0 = memref.expand_shape %arg0 [[0, 1], [2]]
+  %0 = memref.expand_shape %arg0 [[0, 1], [2]] output_shape [%sz0, 4, %sz1]
       : memref<?x?xf32> into memref<?x4x?xf32>
   %1 = memref.collapse_shape %0 [[0, 1], [2]]
       : memref<?x4x?xf32> into memref<?x?xf32>
@@ -525,7 +524,7 @@ func.func @fold_collapse_collapse_of_expand(%arg0 : memref<?x?xf32>)
 
 func.func @fold_memref_expand_cast(%arg0 : memref<?x?xf32>) -> memref<2x4x4xf32> {
   %0 = memref.cast %arg0 : memref<?x?xf32> to memref<8x4xf32>
-  %1 = memref.expand_shape %0 [[0, 1], [2]]
+  %1 = memref.expand_shape %0 [[0, 1], [2]] output_shape [2, 4, 4]
       : memref<8x4xf32> into memref<2x4x4xf32>
   return %1 : memref<2x4x4xf32>
 }
@@ -981,10 +980,10 @@ func.func @memref_realloc_dead(%src : memref<2xf32>, %v : f32) -> memref<2xf32>{
 //  CHECK-SAME:     %[[m:.*]]: memref<?xf32, strided<[1]>, 3>
 //       CHECK:   %[[casted:.*]] = memref.cast %[[m]] : memref<?xf32, strided<[1]>, 3> to memref<?xf32, 3
 //       CHECK:   return %[[casted]]
-func.func @collapse_expand_fold_to_cast(%m: memref<?xf32, strided<[1]>, 3>)
+func.func @collapse_expand_fold_to_cast(%m: memref<?xf32, strided<[1]>, 3>, %sz0: index)
     -> (memref<?xf32, 3>)
 {
-  %0 = memref.expand_shape %m [[0, 1]]
+  %0 = memref.expand_shape %m [[0, 1]] output_shape [1, %sz0]
       : memref<?xf32, strided<[1]>, 3> into memref<1x?xf32, 3>
   %1 = memref.collapse_shape %0 [[0, 1]]
       : memref<1x?xf32, 3> into memref<?xf32, 3>
diff --git a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
index fd37b7ff0a27..435dcc944778 100644
--- a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
+++ b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
@@ -430,3 +430,23 @@ func.func @rank_zero_memref_store(%arg0: i4) -> () {
 //       CHECK32:   %[[EXTUI:.+]] = arith.extui %[[ARG0]] : i4 to i32
 //       CHECK32:   %[[WRITE_RMW:.+]] = memref.atomic_rmw assign %[[EXTUI]], %[[ALLOC]][] : (i32, memref<i32>) -> i32
 //       CHECK32:   return
+
+// -----
+
+func.func @memref_collapse_shape_i4(%idx0 : index, %idx1 : index) -> i4 {
+  %arr = memref.alloc() : memref<32x8x128xi4>
+  %collapse = memref.collapse_shape %arr[[0, 1], [2]] : memref<32x8x128xi4> into memref<256x128xi4>
+  %1 = memref.load %collapse[%idx0, %idx1] : memref<256x128xi4>
+  return %1 : i4
+}
+
+// CHECK-LABEL:   func.func @memref_collapse_shape_i4(
+//       CHECK:     %[[ALLOC:.*]] = memref.alloc() : memref<16384xi8>
+//   CHECK-NOT:     memref.collapse_shape
+//       CHECK:     memref.load %[[ALLOC]][%{{.*}}] : memref<16384xi8>
+
+// CHECK32-LABEL:   func.func @memref_collapse_shape_i4(
+//       CHECK32:     %[[ALLOC:.*]] = memref.alloc() : memref<4096xi32>
+//   CHECK32-NOT:     memref.collapse_shape
+//       CHECK32:     memref.load %[[ALLOC]][%{{.*}}] : memref<4096xi32>
+
diff --git a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir
index 28b700430059..3bd6b7c1fd79 100644
--- a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir
+++ b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir
@@ -421,10 +421,11 @@ func.func @simplify_expand_shape(
     %base: memref<?x?xf32, strided<[?,?], offset:?>>,
     %offset0: index, %offset1: index, %offset2: index,
     %size0: index, %size1: index, %size2: index,
-    %stride0: index, %stride1: index, %stride2: index)
+    %stride0: index, %stride1: index, %stride2: index,
+    %sz0: index, %sz1: index)
     -> memref<?x7x8x9x10x2x?x3xf32, strided<[?, ?, ?, ?, ?, ?, ?, ?], offset: ?>> {
 
-  %subview = memref.expand_shape %base[[0, 1, 2, 3],[4, 5, 6, 7]] :
+  %subview = memref.expand_shape %base [[0, 1, 2, 3],[4, 5, 6, 7]] output_shape [%sz0, 7, 8, 9, 10, 2, %sz1, 3] :
     memref<?x?xf32, strided<[?,?], offset: ?>> into
       memref<?x7x8x9x10x2x?x3xf32, strided<[?, ?, ?, ?, ?, ?, ?, ?], offset: ?>>
 
@@ -491,7 +492,7 @@ func.func @extract_strided_metadata_of_expand_shape_all_static(
        index, index, index, index, index,
        index, index, index, index, index) {
 
-  %expand_shape = memref.expand_shape %arg[[0, 1, 2], [3, 4]] :
+  %expand_shape = memref.expand_shape %arg[[0, 1, 2], [3, 4]] output_shape [3, 5, 2, 2, 2] :
     memref<30x4xi16> into memref<3x5x2x2x2xi16>
 
   %base, %offset, %sizes:5, %strides:5 = memref.extract_strided_metadata %expand_shape :
@@ -595,12 +596,13 @@ func.func @extract_strided_metadata_of_expand_shape_all_dynamic(
     %base: memref<?x?xf32, strided<[?,?], offset:?>>,
     %offset0: index, %offset1: index, %offset2: index,
     %size0: index, %size1: index, %size2: index,
-    %stride0: index, %stride1: index, %stride2: index)
+    %stride0: index, %stride1: index, %stride2: index,
+    %sz0: index, %sz1: index)
     -> (memref<f32>, index,
        index, index, index, index, index, index, index, index,
        index, index, index, index, index, index, index, index) {
 
-  %subview = memref.expand_shape %base[[0, 1, 2, 3],[4, 5, 6, 7]] :
+  %subview = memref.expand_shape %base[[0, 1, 2, 3],[4, 5, 6, 7]] output_shape [%sz0, 7, 8, 9, 10, 2, %sz1, 3] :
     memref<?x?xf32, strided<[?,?], offset: ?>> into
       memref<?x7x8x9x10x2x?x3xf32, strided<[?, ?, ?, ?, ?, ?, ?, ?], offset: ?>>
 
@@ -643,7 +645,7 @@ func.func @extract_strided_metadata_of_expand_shape_all_static_0_rank(
        index, index, index, index, index,
        index, index, index, index, index) {
 
-  %expand_shape = memref.expand_shape %arg[] :
+  %expand_shape = memref.expand_shape %arg[] output_shape [1, 1, 1, 1, 1] :
     memref<i16, strided<[], offset: ?>> into memref<1x1x1x1x1xi16, strided<[1,1,1,1,1], offset: ?>>
 
   %base, %offset, %sizes:5, %strides:5 = memref.extract_strided_metadata %expand_shape :
@@ -1456,6 +1458,7 @@ func.func @extract_strided_metadata_of_cast_w_csts(
       index, index,
       index, index
 }
+
 // -----
 
 // Check that we don't simplify extract_strided_metadata of
@@ -1497,6 +1500,7 @@ func.func @extract_strided_metadata_of_cast_unranked(
 
 
 // -----
+
 memref.global "private" @dynamicShmem : memref<0xf16,3>
 
 // CHECK-LABEL: func @zero_sized_memred
@@ -1513,4 +1517,26 @@ func.func @zero_sized_memred(%arg0: f32) -> (memref<f16, 3>, index,index,index)
     %sizes, %strides :
       memref<f16,3>, index,
       index, index
-}
-\ No newline at end of file
+}
+
+// -----
+
+func.func @extract_strided_metadata_of_collapse_shape(%base: memref<5x4xf32>)
+    -> (memref<f32>, index, index, index) {
+
+  %collapse = memref.collapse_shape %base[[0, 1]] :
+    memref<5x4xf32> into memref<20xf32>
+
+  %base_buffer, %offset, %size, %stride = memref.extract_strided_metadata %collapse :
+    memref<20xf32> -> memref<f32>, index, index, index
+
+  return %base_buffer, %offset, %size, %stride :
+    memref<f32>, index, index, index
+}
+
+// CHECK-LABEL:  func @extract_strided_metadata_of_collapse_shape
+//   CHECK-DAG:    %[[OFFSET:.*]] = arith.constant 0 : index
+//   CHECK-DAG:    %[[SIZE:.*]] = arith.constant 20 : index
+//   CHECK-DAG:    %[[STEP:.*]] = arith.constant 1 : index
+//       CHECK:    %[[BASE:.*]], %{{.*}}, %{{.*}}, %{{.*}} = memref.extract_strided_metadata
+//       CHECK:    return %[[BASE]], %[[OFFSET]], %[[SIZE]], %[[STEP]] : memref<f32>, index, index, index
diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
index 5b853a6cc5a3..254cd4015eed 100644
--- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
+++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
@@ -412,7 +412,7 @@ func.func @fold_static_stride_subview_with_affine_load_store(%arg0 : memref<12x3
 // CHECK-LABEL: fold_static_stride_subview_with_affine_load_store_expand_shape
 // CHECK-SAME: (%[[ARG0:.*]]: memref<12x32xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index, %[[ARG3:.*]]: index) -> f32 {
 func.func @fold_static_stride_subview_with_affine_load_store_expand_shape(%arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3 : index) -> f32 {
-  %0 = memref.expand_shape %arg0 [[0, 1], [2]] : memref<12x32xf32> into memref<2x6x32xf32>
+  %0 = memref.expand_shape %arg0 [[0, 1], [2]] output_shape [2, 6, 32] : memref<12x32xf32> into memref<2x6x32xf32>
   %1 = affine.load %0[%arg1, %arg2, %arg3] : memref<2x6x32xf32>
   return %1 : f32
 }
@@ -458,7 +458,7 @@ func.func @fold_dynamic_size_collapse_shape_with_affine_load(%arg0 : memref<?x6x
 // CHECK-LABEL: fold_static_stride_subview_with_affine_load_store_expand_shape_3d
 // CHECK-SAME: (%[[ARG0:.*]]: memref<12x32xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index, %[[ARG3:.*]]: index, %[[ARG4:.*]]: index) -> f32 {
 func.func @fold_static_stride_subview_with_affine_load_store_expand_shape_3d(%arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4: index) -> f32 {
-  %0 = memref.expand_shape %arg0 [[0, 1, 2], [3]] : memref<12x32xf32> into memref<2x2x3x32xf32>
+  %0 = memref.expand_shape %arg0 [[0, 1, 2], [3]] output_shape [2, 2, 3, 32] : memref<12x32xf32> into memref<2x2x3x32xf32>
   %1 = affine.load %0[%arg1, %arg2, %arg3, %arg4] : memref<2x2x3x32xf32>
   return %1 : f32
 }
@@ -469,15 +469,17 @@ func.func @fold_static_stride_subview_with_affine_load_store_expand_shape_3d(%ar
 // -----
 
 // CHECK-LABEL: fold_dynamic_subview_with_memref_load_store_expand_shape
-func.func @fold_dynamic_subview_with_memref_load_store_expand_shape(%arg0 : memref<16x?xf32, strided<[16, 1]>>, %arg1 : index, %arg2 : index) -> f32 {
+// CHECK-SAME: (%[[ARG0:.*]]: memref<16x?xf32, strided<[16, 1]>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index, %[[SZ0:.*]]: index)
+func.func @fold_dynamic_subview_with_memref_load_store_expand_shape(%arg0 : memref<16x?xf32, strided<[16, 1]>>, %arg1 : index, %arg2 : index, %sz0: index) -> f32 {
   %c0 = arith.constant 0 : index
-  %expand_shape = memref.expand_shape %arg0 [[0, 1], [2, 3]] : memref<16x?xf32, strided<[16, 1]>> into memref<1x16x?x1xf32, strided<[256, 16, 1, 1]>>
+  %expand_shape = memref.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [1, 16, %sz0, 1] : memref<16x?xf32, strided<[16, 1]>> into memref<1x16x?x1xf32, strided<[256, 16, 1, 1]>>
   %0 = memref.load %expand_shape[%c0, %arg1, %arg2, %c0] : memref<1x16x?x1xf32, strided<[256, 16, 1, 1]>>
   return %0 : f32
 }
-// CHECK: %[[EXPAND_SHAPE:.+]] = memref.expand_shape {{.+}} : memref<16x?xf32, strided<[16, 1]>> into memref<1x16x?x1xf32, strided<[256, 16, 1, 1]>>
-// CHECK: %[[LOAD:.+]] = memref.load %[[EXPAND_SHAPE]]
-// CHECK: return %[[LOAD]]
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[EXPAND_SHAPE:.*]] = memref.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2, 3]] output_shape [1, 16, %[[SZ0]], 1] : memref<16x?xf32, strided<[16, 1]>> into memref<1x16x?x1xf32, strided<[256, 16, 1, 1]>>
+// CHECK: %[[VAL_0:.*]] = memref.load %[[EXPAND_SHAPE]][%[[C0]], %[[ARG1]], %[[ARG2]], %[[C0]]] : memref<1x16x?x1xf32, strided<[256, 16, 1, 1]>>
+// CHECK: return %[[VAL_0]] : f32
 
 // -----
 
@@ -486,7 +488,7 @@ func.func @fold_dynamic_subview_with_memref_load_store_expand_shape(%arg0 : memr
 // CHECK-LABEL: fold_static_stride_subview_with_affine_load_store_expand_shape
 // CHECK-SAME: (%[[ARG0:.*]]: memref<1024x1024xf32>, %[[ARG1:.*]]: memref<1xf32>, %[[ARG2:.*]]: index)
 func.func @fold_static_stride_subview_with_affine_load_store_expand_shape(%arg0: memref<1024x1024xf32>, %arg1: memref<1xf32>, %arg2: index) -> f32 {
-  %0 = memref.expand_shape %arg0 [[0, 1], [2, 3]] : memref<1024x1024xf32> into memref<1x1024x1024x1xf32>
+  %0 = memref.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [1, 1024, 1024, 1] : memref<1024x1024xf32> into memref<1x1024x1024x1xf32>
   affine.for %arg3 = 0 to 1 {
     affine.for %arg4 = 0 to 1024 {
       affine.for %arg5 = 0 to 1020 {
@@ -515,7 +517,7 @@ func.func @fold_static_stride_subview_with_affine_load_store_expand_shape(%arg0:
 // CHECK-LABEL: fold_static_stride_subview_with_affine_load_store_expand_shape_when_access_index_is_an_expression
 // CHECK-SAME: (%[[ARG0:.*]]: memref<1024x1024xf32>, %[[ARG1:.*]]: memref<1xf32>, %[[ARG2:.*]]: index)
 func.func @fold_static_stride_subview_with_affine_load_store_expand_shape_when_access_index_is_an_expression(%arg0: memref<1024x1024xf32>, %arg1: memref<1xf32>, %arg2: index) -> f32 {
-  %0 = memref.expand_shape %arg0 [[0, 1], [2, 3]] : memref<1024x1024xf32> into memref<1x1024x1024x1xf32>
+  %0 = memref.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [1, 1024, 1024, 1] : memref<1024x1024xf32> into memref<1x1024x1024x1xf32>
   affine.for %arg3 = 0 to 1 {
     affine.for %arg4 = 0 to 1024 {
       affine.for %arg5 = 0 to 1020 {
@@ -544,7 +546,7 @@ func.func @fold_static_stride_subview_with_affine_load_store_expand_shape_when_a
 // CHECK-LABEL: fold_static_stride_subview_with_affine_load_store_expand_shape_with_constant_access_index
 // CHECK-SAME: (%[[ARG0:.*]]: memref<1024x1024xf32>, %[[ARG1:.*]]: memref<1xf32>, %[[ARG2:.*]]: index)
 func.func @fold_static_stride_subview_with_affine_load_store_expand_shape_with_constant_access_index(%arg0: memref<1024x1024xf32>, %arg1: memref<1xf32>, %arg2: index) -> f32 {
-  %0 = memref.expand_shape %arg0 [[0, 1], [2, 3]] : memref<1024x1024xf32> into memref<1x1024x1024x1xf32>
+  %0 = memref.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [1, 1024, 1024, 1] : memref<1024x1024xf32> into memref<1x1024x1024x1xf32>
   %cst = arith.constant 0 : index
   affine.for %arg3 = 0 to 1 {
     affine.for %arg4 = 0 to 1024 {
diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir
index 1aef417549d9..70c96aad9555 100644
--- a/mlir/test/Dialect/MemRef/invalid.mlir
+++ b/mlir/test/Dialect/MemRef/invalid.mlir
@@ -392,9 +392,9 @@ func.func @copy_different_eltype(%arg0: memref<2xf32>, %arg1: memref<2xf16>) {
 
 // -----
 
-func.func @expand_shape(%arg0: memref<?x?xf32>) {
+func.func @expand_shape(%arg0: memref<?x?xf32>, %sz0: index, %sz1: index) {
   // expected-error @+1 {{invalid number of reassociation groups: found 1, expected 2}}
-  %0 = memref.expand_shape %arg0 [[0, 1]] : memref<?x?xf32> into memref<?x5x?xf32>
+  %0 = memref.expand_shape %arg0 [[0, 1]] output_shape [%sz0, 5, %sz1] : memref<?x?xf32> into memref<?x5x?xf32>
   return
 }
 
@@ -402,7 +402,15 @@ func.func @expand_shape(%arg0: memref<?x?xf32>) {
 
 func.func @expand_shape(%arg0: memref<f32>) {
   // expected-error @+1 {{rank 0 memrefs can only be extended/collapsed with/from ones}}
-  %0 = memref.expand_shape %arg0 [] : memref<f32> into memref<1x2xf32>
+  %0 = memref.expand_shape %arg0 [] output_shape [1, 2] : memref<f32> into memref<1x2xf32>
+  return
+}
+
+// -----
+
+func.func @expand_shape_illegal_output_shape(%arg0: memref<2xf32>) {
+  // expected-error @+1 {{expected number of static shape bounds to be equal to the output rank (3) but found 2 inputs instead}}
+  %0 = memref.expand_shape %arg0 [[0, 1, 2]] output_shape [1, 2] : memref<2xf32> into memref<1x1x2xf32>
   return
 }
 
@@ -415,9 +423,9 @@ func.func @collapse_shape_out_of_bounds(%arg0: memref<?x?xf32>) {
 
 // -----
 
-func.func @expand_shape_out_of_bounds(%arg0: memref<?xf32>) {
+func.func @expand_shape_out_of_bounds(%arg0: memref<?xf32>, %sz0: index) {
   // expected-error @+1 {{op reassociation index 2 is out of bounds}}
-  %0 = memref.expand_shape %arg0 [[0, 1, 2]] : memref<?xf32> into memref<4x?xf32>
+  %0 = memref.expand_shape %arg0 [[0, 1, 2]] output_shape [4, %sz0] : memref<?xf32> into memref<4x?xf32>
 }
 
 // -----
@@ -425,7 +433,7 @@ func.func @expand_shape_out_of_bounds(%arg0: memref<?xf32>) {
 func.func @expand_shape_invalid_result_layout(
     %arg0: memref<30x20xf32, strided<[4000, 2], offset: 100>>) {
   // expected-error @+1 {{expected expanded type to be 'memref<2x15x20xf32, strided<[60000, 4000, 2], offset: 100>>' but found 'memref<2x15x20xf32, strided<[5000, 4000, 2], offset: 100>>'}}
-  %0 = memref.expand_shape %arg0 [[0, 1], [2]] :
+  %0 = memref.expand_shape %arg0 [[0, 1], [2]] output_shape [2, 15, 20] :
       memref<30x20xf32, strided<[4000, 2], offset: 100>>
       into memref<2x15x20xf32, strided<[5000, 4000, 2], offset: 100>>
 }
@@ -462,7 +470,7 @@ func.func @collapse_shape_invalid_reassociation_expansion(%arg0: memref<?xf32>)
 // like this. Verify that a sensible error is emitted in this case.
 func.func @expand_shape_invalid_reassociation(%arg0: memref<2x3x1xf32>) {
   // expected-error @+1 {{'memref.expand_shape' op has source rank 3 and result rank 2. This is not an expansion (3 > 2)}}
-  %0 = memref.expand_shape %arg0 [[0], [1], [1]] :
+  %0 = memref.expand_shape %arg0 [[0], [1], [1]] output_shape [2, 3] :
     memref<2x3x1xf32> into memref<2x3xf32>
 }
 
@@ -495,20 +503,10 @@ func.func @collapse_shape_wrong_collapsed_type(%arg0: memref<?x?x?xf32>) {
 
 // -----
 
-func.func @expand_shape_illegal_dynamic_memref
-  (%arg0: memref<?x?x?xf32>) -> memref<?x?x?x4x?xf32> {
-  // expected-error @+1 {{at most one dimension in a reassociation group may be dynamic}}
-  %0 = memref.expand_shape %arg0 [[0], [1], [2, 3, 4]]
-      : memref<?x?x?xf32> into memref<?x?x?x4x?xf32>
-  return %0 : memref<?x?x?x4x?xf32>
-}
-
-// -----
-
 func.func @expand_shape_illegal_static_memref
   (%arg0: memref<2x3x20xf32>) -> memref<2x3x2x4x5xf32> {
   // expected-error @+1 {{collapsed dim size (20) must equal reassociation group size (40)}}
-  %0 = memref.expand_shape %arg0 [[0], [1], [2, 3, 4]]
+  %0 = memref.expand_shape %arg0 [[0], [1], [2, 3, 4]] output_shape [2, 3, 2, 4, 5]
       : memref<2x3x20xf32> into memref<2x3x2x4x5xf32>
   return %0 : memref<2x3x2x4x5xf32>
 }
@@ -525,30 +523,30 @@ func.func @collapse_shape_illegal_static_memref
 
 // -----
 
-func.func @expand_shape_illegal_mixed_memref(%arg0 : memref<?x?xf32>)
+func.func @expand_shape_illegal_mixed_memref(%arg0 : memref<?x?xf32>, %sz0: index)
     -> memref<?x4x5xf32> {
   // expected-error @+1 {{collapsed dim (1) must be dynamic if and only if reassociation group is dynamic}}
-  %0 = memref.expand_shape %arg0 [[0, 1], [2]]
+  %0 = memref.expand_shape %arg0 [[0, 1], [2]] output_shape [%sz0, 4, 5]
       : memref<?x?xf32> into memref<?x4x5xf32>
   return %0 : memref<?x4x5xf32>
 }
 
 // -----
 
-func.func @expand_shape_illegal_mixed_memref_2(%arg0 : memref<?x?xf32>)
+func.func @expand_shape_illegal_mixed_memref_2(%arg0 : memref<?x?xf32>, %sz0: index)
     -> memref<?x4x5xf32> {
   // expected-error @+1 {{collapsed dim (1) must be dynamic if and only if reassociation group is dynamic}}
-  %0 = memref.expand_shape %arg0 [[0], [1, 2]]
+  %0 = memref.expand_shape %arg0 [[0], [1, 2]] output_shape [%sz0, 4, 5]
       : memref<?x?xf32> into memref<?x4x5xf32>
   return %0 : memref<?x4x5xf32>
 }
 
 // -----
 
-func.func @expand_shape_invalid_static_dim_size(%arg0 : memref<?x21xf32>)
+func.func @expand_shape_invalid_static_dim_size(%arg0 : memref<?x21xf32>, %sz0: index)
     -> memref<?x4x5xf32> {
   // expected-error @+1 {{collapsed dim size (21) must equal reassociation group size (20)}}
-  %0 = memref.expand_shape %arg0 [[0], [1, 2]]
+  %0 = memref.expand_shape %arg0 [[0], [1, 2]] output_shape [%sz0, 4, 5]
       : memref<?x21xf32> into memref<?x4x5xf32>
   return %0 : memref<?x4x5xf32>
 }
diff --git a/mlir/test/Dialect/MemRef/normalize-memrefs-ops.mlir b/mlir/test/Dialect/MemRef/normalize-memrefs-ops.mlir
index 34420c50a51a..3bede131325a 100644
--- a/mlir/test/Dialect/MemRef/normalize-memrefs-ops.mlir
+++ b/mlir/test/Dialect/MemRef/normalize-memrefs-ops.mlir
@@ -149,3 +149,19 @@ func.func @test_norm_reinterpret_cast(%arg0 : memref<3xf32, #map_1d_tile>) -> (m
     // CHECK: memref.reinterpret_cast %[[v0]] to offset: [0], sizes: [3, 1, 1], strides: [1, 1, 1] : memref<3xf32> to memref<3x1x1xf32>
     return %1 : memref<3x1x1xf32>
 }
+
+
+// -----
+
+// Test normalization of memrefs for prefetch.affine
+
+// CHECK-LABEL: func.func @prefetch_normalize
+// CHECK-SAME:   ([[PARAM_0_:%.+]]: memref<16x32xf32>) {
+func.func @prefetch_normalize(%arg0: memref<512xf32, affine_map<(d0) -> (d0 floordiv 32, d0 mod 32)>>) -> () {
+  // CHECK: affine.for [[I_0_:%.+]] = 0 to 8 {
+  affine.for %arg3 = 0 to 8  {
+    // CHECK: affine.prefetch [[PARAM_0_]]{{.}}[[I_0_]] floordiv 32, [[I_0_]] mod 32], read, locality<3>, data : memref<16x32xf32>
+    affine.prefetch %arg0[%arg3], read, locality<3>, data : memref<512xf32, affine_map<(d0) -> (d0 floordiv 32, d0 mod 32)>>
+  }
+  return
+}
diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir
index 2d69904f27db..60fb0ffeee24 100644
--- a/mlir/test/Dialect/MemRef/ops.mlir
+++ b/mlir/test/Dialect/MemRef/ops.mlir
@@ -106,9 +106,9 @@ func.func @expand_collapse_shape_static(
   %0 = memref.collapse_shape %arg0 [[0, 1], [2]] :
     memref<3x4x5xf32> into memref<12x5xf32>
 
-//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1], [2]]
+//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1], [2]] output_shape [3, 4, 5]
 //  CHECK-SAME:     memref<12x5xf32> into memref<3x4x5xf32>
-  %r0 = memref.expand_shape %0 [[0, 1], [2]] :
+  %r0 = memref.expand_shape %0 [[0, 1], [2]] output_shape [3, 4, 5] :
     memref<12x5xf32> into memref<3x4x5xf32>
 
 //       CHECK:   memref.collapse_shape {{.*}} {{\[}}[0], [1, 2]]
@@ -116,9 +116,9 @@ func.func @expand_collapse_shape_static(
   %1 = memref.collapse_shape %arg0 [[0], [1, 2]] :
     memref<3x4x5xf32> into memref<3x20xf32>
 
-//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0], [1, 2]]
+//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0], [1, 2]] output_shape [3, 4, 5]
 //  CHECK-SAME:     memref<3x20xf32> into memref<3x4x5xf32>
-  %r1 = memref.expand_shape %1 [[0], [1, 2]] :
+  %r1 = memref.expand_shape %1 [[0], [1, 2]] output_shape [3, 4, 5] :
     memref<3x20xf32> into memref<3x4x5xf32>
 
 //       CHECK:   memref.collapse_shape {{.*}} {{\[}}[0, 1, 2]]
@@ -126,29 +126,29 @@ func.func @expand_collapse_shape_static(
   %2 = memref.collapse_shape %arg0 [[0, 1, 2]] :
     memref<3x4x5xf32> into memref<60xf32>
 
-//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1, 2]]
+//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1, 2]] output_shape [3, 4, 5]
 //  CHECK-SAME:     memref<60xf32> into memref<3x4x5xf32>
-  %r2 = memref.expand_shape %2 [[0, 1, 2]] :
+  %r2 = memref.expand_shape %2 [[0, 1, 2]] output_shape [3, 4, 5] :
       memref<60xf32> into memref<3x4x5xf32>
 
-//       CHECK:   memref.expand_shape {{.*}} []
+//       CHECK:   memref.expand_shape {{.*}} [] output_shape [1, 1]
 //  CHECK-SAME:     memref<f32> into memref<1x1xf32>
-  %r5 = memref.expand_shape %arg5 [] :
+  %r5 = memref.expand_shape %arg5 [] output_shape [1, 1] :
       memref<f32> into memref<1x1xf32>
 
 // Reshapes with a custom layout map.
-//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0], [1, 2]]
-  %l0 = memref.expand_shape %arg3 [[0], [1, 2]] :
+//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0], [1, 2]] output_shape [30, 4, 5]
+  %l0 = memref.expand_shape %arg3 [[0], [1, 2]] output_shape [30, 4, 5] :
       memref<30x20xf32, strided<[4000, 2], offset: 100>>
       into memref<30x4x5xf32, strided<[4000, 10, 2], offset: 100>>
 
-//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1], [2]]
-  %l1 = memref.expand_shape %arg3 [[0, 1], [2]] :
+//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1], [2]] output_shape [2, 15, 20]
+  %l1 = memref.expand_shape %arg3 [[0, 1], [2]] output_shape [2, 15, 20] :
       memref<30x20xf32, strided<[4000, 2], offset: 100>>
       into memref<2x15x20xf32, strided<[60000, 4000, 2], offset: 100>>
 
-//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0], [1, 2]]
-  %r4 = memref.expand_shape %arg4 [[0], [1, 2]] :
+//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0], [1, 2]] output_shape [1, 1, 5]
+  %r4 = memref.expand_shape %arg4 [[0], [1, 2]] output_shape [1, 1, 5] :
       memref<1x5xf32, strided<[5, 1], offset: ?>> into
       memref<1x1x5xf32, strided<[5, 5, 1], offset: ?>>
 
@@ -164,9 +164,9 @@ func.func @expand_collapse_shape_static(
       memref<2049xi64, strided<[?], offset: ?>>
 
   // Reshapes that expand and collapse back a contiguous buffer with some 1's.
-//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1], [2], [3, 4]]
+//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1], [2], [3, 4]] output_shape [1, 3, 4, 1, 5]
 //  CHECK-SAME:     memref<3x4x5xf32> into memref<1x3x4x1x5xf32>
-  %3 = memref.expand_shape %arg0 [[0, 1], [2], [3, 4]] :
+  %3 = memref.expand_shape %arg0 [[0, 1], [2], [3, 4]] output_shape [1, 3, 4, 1, 5]:
     memref<3x4x5xf32> into memref<1x3x4x1x5xf32>
 
 //       CHECK:   memref.collapse_shape {{.*}} {{\[}}[0, 1], [2], [3, 4]]
@@ -176,15 +176,18 @@ func.func @expand_collapse_shape_static(
 
   // Reshapes on tensors.
 //       CHECK:   tensor.expand_shape {{.*}}: tensor<3x4x5xf32> into tensor<1x3x4x1x5xf32>
-  %t0 = tensor.expand_shape %arg1 [[0, 1], [2], [3, 4]] :
+  %t0 = tensor.expand_shape %arg1 [[0, 1], [2], [3, 4]] output_shape [1, 3, 4, 1, 5] :
     tensor<3x4x5xf32> into tensor<1x3x4x1x5xf32>
 
 //       CHECK:   tensor.collapse_shape {{.*}}: tensor<1x3x4x1x5xf32> into tensor<3x4x5xf32>
   %rt0 = tensor.collapse_shape %t0 [[0, 1], [2], [3, 4]] :
     tensor<1x3x4x1x5xf32> into tensor<3x4x5xf32>
 
+//       CHECK:   tensor.dim %arg2, {{.*}} : tensor<3x?x5xf32>
 //       CHECK:   tensor.expand_shape {{.*}}: tensor<3x?x5xf32> into tensor<1x3x?x1x5xf32>
-  %t1 = tensor.expand_shape %arg2 [[0, 1], [2], [3, 4]] :
+  %c1 = arith.constant 1 : index
+  %sz1 = tensor.dim %arg2, %c1 : tensor<3x?x5xf32>
+  %t1 = tensor.expand_shape %arg2 [[0, 1], [2], [3, 4]] output_shape [1, 3, %sz1, 1, 5] :
     tensor<3x?x5xf32> into tensor<1x3x?x1x5xf32>
 
 //       CHECK:   tensor.collapse_shape {{.*}}: tensor<1x3x?x1x5xf32> into tensor<1x?x5xf32>
@@ -197,15 +200,18 @@ func.func @expand_collapse_shape_static(
 func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>,
          %arg1: memref<?x?x?xf32, strided<[?, ?, 1], offset: 0>>,
          %arg2: memref<?x?x?xf32, strided<[?, ?, 1], offset: ?>>,
-         %arg3: memref<?x42xf32, strided<[42, 1], offset: 0>>) {
+         %arg3: memref<?x42xf32, strided<[42, 1], offset: 0>>,
+         %arg4: index,
+         %arg5: index,
+         %arg6: index) {
 //       CHECK:   memref.collapse_shape {{.*}} {{\[}}[0, 1], [2]]
 //  CHECK-SAME:     memref<?x?x?xf32> into memref<?x?xf32>
   %0 = memref.collapse_shape %arg0 [[0, 1], [2]] :
     memref<?x?x?xf32> into memref<?x?xf32>
 
-//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1], [2]]
+//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1], [2]] output_shape [%arg4, 4, %arg5]
 //  CHECK-SAME:     memref<?x?xf32> into memref<?x4x?xf32>
-  %r0 = memref.expand_shape %0 [[0, 1], [2]] :
+  %r0 = memref.expand_shape %0 [[0, 1], [2]] output_shape [%arg4, 4, %arg5] :
     memref<?x?xf32> into memref<?x4x?xf32>
 
 //       CHECK:   memref.collapse_shape {{.*}} {{\[}}[0, 1], [2]]
@@ -214,9 +220,9 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>,
     memref<?x?x?xf32, strided<[?, ?, 1], offset: 0>> into
     memref<?x?xf32, strided<[?, 1], offset: 0>>
 
-//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1], [2]]
+//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1], [2]] output_shape [%arg4, 4, %arg5]
 //  CHECK-SAME:     memref<?x?xf32, strided<[?, 1]>> into memref<?x4x?xf32, strided<[?, ?, 1]>>
-  %r1 = memref.expand_shape %1 [[0, 1], [2]] :
+  %r1 = memref.expand_shape %1 [[0, 1], [2]] output_shape [%arg4, 4, %arg5] :
     memref<?x?xf32, strided<[?, 1], offset: 0>> into
     memref<?x4x?xf32, strided<[?, ?, 1], offset: 0>>
 
@@ -226,9 +232,9 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>,
     memref<?x?x?xf32, strided<[?, ?, 1], offset: ?>> into
     memref<?x?xf32, strided<[?, 1], offset: ?>>
 
-//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1], [2]]
+//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1], [2]] output_shape [%arg4, 4, %arg5]
 //  CHECK-SAME:     memref<?x?xf32, strided<[?, 1], offset: ?>> into memref<?x4x?xf32, strided<[?, ?, 1], offset: ?>>
-  %r2 = memref.expand_shape %2 [[0, 1], [2]] :
+  %r2 = memref.expand_shape %2 [[0, 1], [2]] output_shape [%arg4, 4, %arg5] :
     memref<?x?xf32, strided<[?, 1], offset: ?>> into
     memref<?x4x?xf32, strided<[?, ?, 1], offset: ?>>
 
@@ -238,9 +244,9 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>,
     memref<?x42xf32, strided<[42, 1], offset: 0>> into
     memref<?xf32, strided<[1]>>
 
-//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1]]
+//       CHECK:   memref.expand_shape {{.*}} {{\[}}[0, 1]] output_shape [%arg6, 42]
 //  CHECK-SAME:     memref<?xf32, strided<[1]>> into memref<?x42xf32>
-  %r3 = memref.expand_shape %3 [[0, 1]] :
+  %r3 = memref.expand_shape %3 [[0, 1]] output_shape [%arg6, 42] :
     memref<?xf32, strided<[1]>> into memref<?x42xf32>
   return
 }
@@ -248,12 +254,12 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>,
 func.func @expand_collapse_shape_zero_dim(%arg0 : memref<1x1xf32>, %arg1 : memref<f32>)
     -> (memref<f32>, memref<1x1xf32>) {
   %0 = memref.collapse_shape %arg0 [] : memref<1x1xf32> into memref<f32>
-  %1 = memref.expand_shape %0 [] : memref<f32> into memref<1x1xf32>
+  %1 = memref.expand_shape %0 [] output_shape [1, 1] : memref<f32> into memref<1x1xf32>
   return %0, %1 : memref<f32>, memref<1x1xf32>
 }
 // CHECK-LABEL: func @expand_collapse_shape_zero_dim
 //       CHECK:   memref.collapse_shape %{{.*}} [] : memref<1x1xf32> into memref<f32>
-//       CHECK:   memref.expand_shape %{{.*}} [] : memref<f32> into memref<1x1xf32>
+//       CHECK:   memref.expand_shape %{{.*}} [] output_shape [1, 1] : memref<f32> into memref<1x1xf32>
 
 func.func @collapse_shape_to_dynamic
   (%arg0: memref<?x?x?x4x?xf32>) -> memref<?x?x?xf32> {
@@ -270,16 +276,18 @@ func.func @collapse_shape_to_dynamic
 // CHECK-LABEL: func @expand_collapse_shape_transposed_layout
 func.func @expand_collapse_shape_transposed_layout(
     %m0: memref<?x?xf32, strided<[1, 10], offset: 0>>,
-    %m1: memref<4x5x6xf32, strided<[1, ?, 1000], offset: 0>>) {
+    %m1: memref<4x5x6xf32, strided<[1, ?, 1000], offset: 0>>,
+    %sz0: index,
+    %sz1: index) {
 
-  %r0 = memref.expand_shape %m0 [[0], [1, 2]] :
+  %r0 = memref.expand_shape %m0 [[0], [1, 2]] output_shape [%sz0, %sz1, 5] :
     memref<?x?xf32, strided<[1, 10], offset: 0>> into
     memref<?x?x5xf32, strided<[1, 50, 10], offset: 0>>
   %rr0 = memref.collapse_shape %r0 [[0], [1, 2]] :
     memref<?x?x5xf32, strided<[1, 50, 10], offset: 0>> into
     memref<?x?xf32, strided<[1, 10], offset: 0>>
 
-  %r1 = memref.expand_shape %m1 [[0, 1], [2], [3, 4]] :
+  %r1 = memref.expand_shape %m1 [[0, 1], [2], [3, 4]] output_shape [2, 2, 5, 2, 3] :
     memref<4x5x6xf32, strided<[1, ?, 1000], offset: 0>> into
     memref<2x2x5x2x3xf32, strided<[2, 1, ?, 3000, 1000], offset: 0>>
   %rr1 = memref.collapse_shape %r1 [[0, 1], [2], [3, 4]] :
diff --git a/mlir/test/Dialect/MemRef/resolve-dim-ops.mlir b/mlir/test/Dialect/MemRef/resolve-dim-ops.mlir
index 18e9a9d02e10..40f88de01b8b 100644
--- a/mlir/test/Dialect/MemRef/resolve-dim-ops.mlir
+++ b/mlir/test/Dialect/MemRef/resolve-dim-ops.mlir
@@ -25,3 +25,31 @@ func.func @dim_out_of_bounds_2(%idx1 : index, %idx2 : index) -> index {
   %0 = tensor.dim %alloc, %idx : tensor<?x?xf32>
   return %0 : index
 }
+
+// -----
+
+// CHECK-LABEL:   func.func @dynamic_dim_of_transpose_op(
+//  CHECK-SAME:                                   %[[arg:.*]]: tensor<1x2x?x8xi8>) -> index {
+//  CHECK-NEXT:           %[[c2:.*]] = arith.constant 2
+//  CHECK-NEXT:           tensor.dim %[[arg]], %[[c2]]
+//  CHECK-NEXT:           return
+func.func @dynamic_dim_of_transpose_op(%arg0: tensor<1x2x?x8xi8>) -> index {
+  %0 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
+  %1 = tosa.transpose %arg0, %0 : (tensor<1x2x?x8xi8>, tensor<4xi32>) -> tensor<1x8x2x?xi8>
+  %c3 = arith.constant 3 : index
+  %dim = tensor.dim %1, %c3 : tensor<1x8x2x?xi8>
+  return %dim : index
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @static_dim_of_transpose_op(
+//  CHECK:           arith.constant 100 : index
+//  CHECK:           return
+func.func @static_dim_of_transpose_op(%arg0: tensor<1x100x?x8xi8>) -> index {
+  %0 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
+  %1 = tosa.transpose %arg0, %0 : (tensor<1x100x?x8xi8>, tensor<4xi32>) -> tensor<1x8x100x?xi8>
+  %c2 = arith.constant 2 : index
+  %dim = tensor.dim %1, %c2 : tensor<1x8x100x?xi8>
+  return %dim : index
+}
diff --git a/mlir/test/Dialect/MemRef/runtime-verification.mlir b/mlir/test/Dialect/MemRef/runtime-verification.mlir
index 4d7fcf6ac7cb..28777a3e8867 100644
--- a/mlir/test/Dialect/MemRef/runtime-verification.mlir
+++ b/mlir/test/Dialect/MemRef/runtime-verification.mlir
@@ -2,13 +2,14 @@
 
 // CHECK-LABEL: func @expand_shape(
 //  CHECK-SAME:     %[[m:.*]]: memref<?xf32>
+//  CHECK-SAME:     %[[sz0:.*]]: index 
 //   CHECK-DAG:   %[[c0:.*]] = arith.constant 0 : index
 //   CHECK-DAG:   %[[c5:.*]] = arith.constant 5 : index
 //   CHECK-DAG:   %[[dim:.*]] = memref.dim %[[m]], %[[c0]]
 //       CHECK:   %[[mod:.*]] = arith.remsi %[[dim]], %[[c5]]
 //       CHECK:   %[[cmpi:.*]] = arith.cmpi eq, %[[mod]], %[[c0]]
 //       CHECK:   cf.assert %[[cmpi]], "ERROR: Runtime op verification failed
-func.func @expand_shape(%m: memref<?xf32>) -> memref<?x5xf32> {
-  %0 = memref.expand_shape %m [[0, 1]] : memref<?xf32> into memref<?x5xf32>
+func.func @expand_shape(%m: memref<?xf32>, %sz0: index) -> memref<?x5xf32> {
+  %0 = memref.expand_shape %m [[0, 1]] output_shape [%sz0, 5] : memref<?xf32> into memref<?x5xf32>
   return %0 : memref<?x5xf32>
 }
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index e329b3010017..511e7d396c68 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -2151,6 +2151,17 @@ omp.private {type = private} @x.privatizer : i32 alloc {
 
 // -----
 
+omp.private {type = private} @x.privatizer : f32 alloc {
+^bb0(%arg0: f32):
+  omp.yield(%arg0 : f32)
+} dealloc {
+^bb0(%arg0: f32):
+  // expected-error @below {{Did not expect any values to be yielded.}}
+  omp.yield(%arg0 : f32)
+}
+
+// -----
+
 omp.private {type = private} @x.privatizer : i32 alloc {
 ^bb0(%arg0: i32):
   // expected-error @below {{expected exit block terminator to be an `omp.yield` op.}}
@@ -2178,6 +2189,17 @@ omp.private {type = firstprivate} @x.privatizer : f32 alloc {
 
 // -----
 
+// expected-error @below {{`dealloc`: expected 1 region arguments, got: 2}}
+omp.private {type = private} @x.privatizer : f32 alloc {
+^bb0(%arg0: f32):
+  omp.yield(%arg0 : f32)
+} dealloc {
+^bb0(%arg0: f32, %arg1: f32):
+  omp.yield
+}
+
+// -----
+
 // expected-error @below {{`private` clauses require only an `alloc` region.}}
 omp.private {type = private} @x.privatizer : f32 alloc {
 ^bb0(%arg0: f32):
@@ -2249,4 +2271,4 @@ func.func @undefined_privatizer(%arg0: !llvm.ptr) {
       omp.terminator
     }) : (!llvm.ptr) -> ()
   return
-}
-\ No newline at end of file
+}
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index a012588f0b55..60fc10f9d64b 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -2492,11 +2492,22 @@ func.func @parallel_op_privatizers(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
   return
 }
 
+// CHECK-LABEL: omp.private {type = private} @a.privatizer : !llvm.ptr alloc {
+omp.private {type = private} @a.privatizer : !llvm.ptr alloc {
+// CHECK: ^bb0(%{{.*}}: {{.*}}):
+^bb0(%arg0: !llvm.ptr):
+  omp.yield(%arg0 : !llvm.ptr)
+}
+
 // CHECK-LABEL: omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
 omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
 // CHECK: ^bb0(%{{.*}}: {{.*}}):
 ^bb0(%arg0: !llvm.ptr):
   omp.yield(%arg0 : !llvm.ptr)
+} dealloc {
+// CHECK: ^bb0(%{{.*}}: {{.*}}):
+^bb0(%arg0: !llvm.ptr):
+  omp.yield
 }
 
 // CHECK-LABEL: omp.private {type = firstprivate} @y.privatizer : !llvm.ptr alloc {
@@ -2509,6 +2520,10 @@ omp.private {type = firstprivate} @y.privatizer : !llvm.ptr alloc {
 // CHECK: ^bb0(%{{.*}}: {{.*}}, %{{.*}}: {{.*}}):
 ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
   omp.yield(%arg0 : !llvm.ptr)
+} dealloc {
+// CHECK: ^bb0(%{{.*}}: {{.*}}):
+^bb0(%arg0: !llvm.ptr):
+  omp.yield
 }
 
 // CHECK-LABEL: parallel_op_reduction_and_private
diff --git a/mlir/test/Dialect/Polynomial/ops.mlir b/mlir/test/Dialect/Polynomial/ops.mlir
new file mode 100644
index 000000000000..ea1b279fa1ff
--- /dev/null
+++ b/mlir/test/Dialect/Polynomial/ops.mlir
@@ -0,0 +1,82 @@
+// RUN: mlir-opt %s | FileCheck %s
+
+// This simply tests for syntax.
+
+#my_poly = #polynomial.polynomial<1 + x**1024>
+#my_poly_2 = #polynomial.polynomial<2>
+#my_poly_3 = #polynomial.polynomial<3x>
+#my_poly_4 = #polynomial.polynomial<t**3 + 4t + 2>
+#ring1 = #polynomial.ring<coefficientType=i32, coefficientModulus=2837465, polynomialModulus=#my_poly>
+#one_plus_x_squared = #polynomial.polynomial<1 + x**2>
+
+#ideal = #polynomial.polynomial<-1 + x**1024>
+#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=18, polynomialModulus=#ideal>
+!poly_ty = !polynomial.polynomial<#ring>
+
+module {
+  func.func @test_multiply() -> !polynomial.polynomial<#ring1> {
+    %c0 = arith.constant 0 : index
+    %two = arith.constant 2 : i16
+    %five = arith.constant 5 : i16
+    %coeffs1 = tensor.from_elements %two, %two, %five : tensor<3xi16>
+    %coeffs2 = tensor.from_elements %five, %five, %two : tensor<3xi16>
+
+    %poly1 = polynomial.from_tensor %coeffs1 : tensor<3xi16> -> !polynomial.polynomial<#ring1>
+    %poly2 = polynomial.from_tensor %coeffs2 : tensor<3xi16> -> !polynomial.polynomial<#ring1>
+
+    %3 = polynomial.mul %poly1, %poly2 : !polynomial.polynomial<#ring1>
+
+    return %3 : !polynomial.polynomial<#ring1>
+  }
+
+  func.func @test_elementwise(%p0 : !polynomial.polynomial<#ring1>, %p1: !polynomial.polynomial<#ring1>) {
+    %tp0 = tensor.from_elements %p0, %p1 : tensor<2x!polynomial.polynomial<#ring1>>
+    %tp1 = tensor.from_elements %p1, %p0 : tensor<2x!polynomial.polynomial<#ring1>>
+
+    %c = arith.constant 2 : i32
+    %mul_const_sclr = polynomial.mul_scalar %tp0, %c : tensor<2x!polynomial.polynomial<#ring1>>, i32
+
+    %add = polynomial.add %tp0, %tp1 : tensor<2x!polynomial.polynomial<#ring1>>
+    %sub = polynomial.sub %tp0, %tp1 : tensor<2x!polynomial.polynomial<#ring1>>
+    %mul = polynomial.mul %tp0, %tp1 : tensor<2x!polynomial.polynomial<#ring1>>
+
+    return
+  }
+
+  func.func @test_to_from_tensor(%p0 : !polynomial.polynomial<#ring1>) {
+    %c0 = arith.constant 0 : index
+    %two = arith.constant 2 : i16
+    %coeffs1 = tensor.from_elements %two, %two : tensor<2xi16>
+    // CHECK: from_tensor
+    %poly = polynomial.from_tensor %coeffs1 : tensor<2xi16> -> !polynomial.polynomial<#ring1>
+    // CHECK: to_tensor
+    %tensor = polynomial.to_tensor %poly : !polynomial.polynomial<#ring1> -> tensor<1024xi16>
+
+    return
+  }
+
+  func.func @test_degree(%p0 : !polynomial.polynomial<#ring1>) {
+    %0, %1 = polynomial.leading_term %p0 : !polynomial.polynomial<#ring1> -> (index, i32)
+    return
+  }
+
+  func.func @test_monomial() {
+    %deg = arith.constant 1023 : index
+    %five = arith.constant 5 : i16
+    %0 = polynomial.monomial %five, %deg : (i16, index) -> !polynomial.polynomial<#ring1>
+    return
+  }
+
+  func.func @test_monic_monomial_mul() {
+    %five = arith.constant 5 : index
+    %0 = polynomial.constant #one_plus_x_squared : !polynomial.polynomial<#ring1>
+    %1 = polynomial.monic_monomial_mul %0, %five : (!polynomial.polynomial<#ring1>, index) -> !polynomial.polynomial<#ring1>
+    return
+  }
+
+  func.func @test_constant() {
+    %0 = polynomial.constant #one_plus_x_squared : !polynomial.polynomial<#ring1>
+    %1 = polynomial.constant <1 + x**2> : !polynomial.polynomial<#ring1>
+    return
+  }
+}
diff --git a/mlir/test/Dialect/Polynomial/ops_errors.mlir b/mlir/test/Dialect/Polynomial/ops_errors.mlir
new file mode 100644
index 000000000000..c34a7de30e5f
--- /dev/null
+++ b/mlir/test/Dialect/Polynomial/ops_errors.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-opt --split-input-file --verify-diagnostics %s
+
+#my_poly = #polynomial.polynomial<1 + x**1024>
+#ring = #polynomial.ring<coefficientType=i16, coefficientModulus=256, polynomialModulus=#my_poly>
+!ty = !polynomial.polynomial<#ring>
+
+func.func @test_from_tensor_too_large_coeffs() {
+  %two = arith.constant 2 : i32
+  %coeffs1 = tensor.from_elements %two, %two : tensor<2xi32>
+  // expected-error@below {{is too large to fit in the coefficients}}
+  // expected-note@below {{rescaled to fit}}
+  %poly = polynomial.from_tensor %coeffs1 : tensor<2xi32> -> !ty
+  return
+}
+
+// -----
+
+#my_poly = #polynomial.polynomial<1 + x**4>
+#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=256, polynomialModulus=#my_poly>
+!ty = !polynomial.polynomial<#ring>
+func.func @test_from_tensor_wrong_tensor_type() {
+  %two = arith.constant 2 : i32
+  %coeffs1 = tensor.from_elements %two, %two, %two, %two, %two : tensor<5xi32>
+  // expected-error@below {{input type 'tensor<5xi32>' does not match output type '!polynomial.polynomial<#polynomial.ring<coefficientType=i32, coefficientModulus=256 : i32, polynomialModulus=#polynomial.polynomial<1 + x**4>>>'}}
+  // expected-note@below {{at most the degree of the polynomialModulus of the output type's ring attribute}}
+  %poly = polynomial.from_tensor %coeffs1 : tensor<5xi32> -> !ty
+  return
+}
+
+// -----
+
+#my_poly = #polynomial.polynomial<1 + x**4>
+#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=256, polynomialModulus=#my_poly>
+!ty = !polynomial.polynomial<#ring>
+func.func @test_to_tensor_wrong_output_tensor_type(%arg0 : !ty) {
+  // expected-error@below {{input type '!polynomial.polynomial<#polynomial.ring<coefficientType=i32, coefficientModulus=256 : i32, polynomialModulus=#polynomial.polynomial<1 + x**4>>>' does not match output type 'tensor<5xi32>'}}
+  // expected-note@below {{at most the degree of the polynomialModulus of the input type's ring attribute}}
+  %tensor = polynomial.to_tensor %arg0 : !ty -> tensor<5xi32>
+  return
+}
+
+// -----
+
+#my_poly = #polynomial.polynomial<1 + x**1024>
+#ring = #polynomial.ring<coefficientType=i16, coefficientModulus=256, polynomialModulus=#my_poly>
+!ty = !polynomial.polynomial<#ring>
+
+func.func @test_mul_scalar_wrong_type(%arg0: !ty) -> !ty {
+  %scalar = arith.constant 2 : i32  // should be i16
+  // expected-error@below {{polynomial coefficient type 'i16' does not match scalar type 'i32'}}
+  %poly = polynomial.mul_scalar %arg0, %scalar : !ty, i32
+  return %poly : !ty
+}
diff --git a/mlir/test/Dialect/Polynomial/types.mlir b/mlir/test/Dialect/Polynomial/types.mlir
index 64b74d9d36bb..00296a36e890 100644
--- a/mlir/test/Dialect/Polynomial/types.mlir
+++ b/mlir/test/Dialect/Polynomial/types.mlir
@@ -40,3 +40,17 @@ func.func @test_non_x_variable_64_bit(%0: !ty2) -> !ty2 {
 func.func @test_linear_poly(%0: !ty3) -> !ty3 {
   return %0 : !ty3
 }
+
+// CHECK-LABEL: func @test_negative_leading_1
+// CHECK-SAME:  !polynomial.polynomial<
+// CHECK-SAME:    #polynomial.ring<
+// CHECK-SAME:       coefficientType=i32,
+// CHECK-SAME:       coefficientModulus=2837465 : i32,
+// CHECK-SAME:       polynomialModulus=#polynomial.polynomial<-1 + x**1024>>>
+#my_poly_4 = #polynomial.polynomial<-1 + x**1024>
+#ring4 = #polynomial.ring<coefficientType=i32, coefficientModulus=2837465, polynomialModulus=#my_poly_4>
+!ty4 = !polynomial.polynomial<#ring4>
+func.func @test_negative_leading_1(%0: !ty4) -> !ty4 {
+  return %0 : !ty4
+}
+
diff --git a/mlir/test/Dialect/SparseTensor/fuse_sparse_concat_with_extract_slice.mlir b/mlir/test/Dialect/SparseTensor/fuse_sparse_concat_with_extract_slice.mlir
new file mode 100644
index 000000000000..5d93301bc8ca
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/fuse_sparse_concat_with_extract_slice.mlir
@@ -0,0 +1,23 @@
+// RUN: mlir-opt %s --pre-sparsification-rewrite | FileCheck %s
+
+#CCCD = #sparse_tensor.encoding<{ map = (d0, d1, d2, d3) -> (d0 : compressed, d1 : compressed, d2 : compressed, d3 : dense) }>
+
+
+
+// CHECK-LABEL: func.func @fuse_concat_with_extract(
+// CHECK-SAME:    %[[VAL_0:.*0]]: tensor<128x32x32x1xf32, #sparse{{[0-9]*}}>,
+// CHECK-SAME:    %[[VAL_1:.*1]]: tensor<128x32x32x1xf32, #sparse{{[0-9]*}}>,
+// CHECK-SAME:    %[[VAL_2:.*2]]: tensor<128x32x32x1xf32, #sparse{{[0-9]*}}>)
+// CHECK-NOT:     tensor.concat
+// CHECK-NOT:     tensor.extract_slice
+// CHECK:         return %[[VAL_0]], %[[VAL_1]], %[[VAL_2]]
+// CHECK:       }
+func.func @fuse_concat_with_extract(%t0 : tensor<128x32x32x1xf32, #CCCD>,
+                                    %t1 : tensor<128x32x32x1xf32, #CCCD>,
+                                    %t2 : tensor<128x32x32x1xf32, #CCCD>) -> (tensor<128x32x32x1xf32, #CCCD>, tensor<128x32x32x1xf32, #CCCD>, tensor<128x32x32x1xf32, #CCCD>) {
+  %concat = tensor.concat dim(3) %t0, %t1, %t2 : (tensor<128x32x32x1xf32, #CCCD>, tensor<128x32x32x1xf32, #CCCD>, tensor<128x32x32x1xf32, #CCCD>) -> tensor<128x32x32x3xf32, #CCCD>
+  %r0 = tensor.extract_slice %concat[0, 0, 0, 0] [128, 32, 32, 1] [1, 1, 1, 1] : tensor<128x32x32x3xf32, #CCCD> to tensor<128x32x32x1xf32, #CCCD>
+  %r1 = tensor.extract_slice %concat[0, 0, 0, 1] [128, 32, 32, 1] [1, 1, 1, 1] : tensor<128x32x32x3xf32, #CCCD> to tensor<128x32x32x1xf32, #CCCD>
+  %r2 = tensor.extract_slice %concat[0, 0, 0, 2] [128, 32, 32, 1] [1, 1, 1, 1] : tensor<128x32x32x3xf32, #CCCD> to tensor<128x32x32x1xf32, #CCCD>
+  return %r0, %r1, %r2 : tensor<128x32x32x1xf32, #CCCD>, tensor<128x32x32x1xf32, #CCCD>, tensor<128x32x32x1xf32, #CCCD>
+}
diff --git a/mlir/test/Dialect/SparseTensor/fuse_sparse_convert_into_producer.mlir b/mlir/test/Dialect/SparseTensor/fuse_sparse_convert_into_producer.mlir
new file mode 100644
index 000000000000..efa92e565ba5
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/fuse_sparse_convert_into_producer.mlir
@@ -0,0 +1,78 @@
+// RUN: mlir-opt %s --pre-sparsification-rewrite --sparse-reinterpret-map  | FileCheck %s --check-prefix=CHECK-FOLD
+// RUN: mlir-opt %s --pre-sparsification-rewrite --sparse-reinterpret-map --sparsification | FileCheck %s
+
+#trait = {
+  indexing_maps = [
+      affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
+      affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
+      affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
+      affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+  ],
+  iterator_types = ["parallel", "parallel", "parallel", "parallel"]
+}
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+#COO = #sparse_tensor.encoding<{map = (d0, d1, d2) -> (d0 : compressed(nonunique), d1 : singleton(nonunique, soa), d2 : singleton(soa))}>
+#CCCD = #sparse_tensor.encoding<{ map = (d0, d1, d2, d3) -> (d0 : compressed, d1 : compressed, d2 : compressed, d3 : dense) }>
+
+// CHECK-LABEL:   func.func @fold_convert(
+// CHECK:           scf.for
+// CHECK:             scf.for
+// CHECK:               scf.for
+// CHECK:                 scf.if
+// CHECK-NEXT:               tensor.insert
+// CHECK-NEXT:               scf.yield
+// CHECK-NEXT:             else
+// CHECK-NEXT:               scf.yield
+// CHECK:                 scf.yield
+// CHECK:               scf.yield
+// CHECK:             scf.yield
+// CHECK:           sparse_tensor.load
+
+// CHECK-FOLD-LABEL:   func.func @fold_convert(
+// CHECK-FOLD-NOT:     sparse_tensor.convert
+func.func @fold_convert(%arg0: tensor<128x32x32x1xf32>, %arg1: tensor<128x32x32x1xf32>, %arg2: tensor<128x32x32x1xf32>) -> tensor<128x32x32x1xf32, #CCCD> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %cst_0 = arith.constant 1.000000e+00 : f32
+  %cst_1 = arith.constant 1.000000e+00 : f32
+  %0 = tensor.empty() : tensor<128x32x32x1xf32>
+  %1 = linalg.generic #trait
+  ins(%arg0, %arg1, %arg2 : tensor<128x32x32x1xf32>, tensor<128x32x32x1xf32>, tensor<128x32x32x1xf32>)
+  outs(%0 : tensor<128x32x32x1xf32>) {
+    ^bb0(%in: f32, %in_2: f32, %in_3: f32, %out: f32):
+      %3 = arith.subf %cst_0, %in_2 : f32
+      %4 = arith.mulf %in, %3 : f32
+      %5 = arith.mulf %4, %cst_1 : f32
+      %6 = arith.addf %5, %in_3 : f32
+      %7 = arith.subf %6, %cst_0 : f32
+      %8 = arith.cmpf uge, %7, %cst : f32
+      %9 = arith.uitofp %8 : i1 to f32
+      linalg.yield %9 : f32
+    } -> tensor<128x32x32x1xf32>
+  %2 = sparse_tensor.convert %1 : tensor<128x32x32x1xf32> to tensor<128x32x32x1xf32, #CCCD>
+  return %2 : tensor<128x32x32x1xf32, #CCCD>
+}
+
+
+// FIXME: The following kernel is not sparsifiable because `arith.select`
+// operations is not handled by the sparse compiler at the moment.
+//
+// CHECK-FOLD-LABEL:   func.func @fold_cast(
+// CHECK-FOLD-NOT:     sparse_tensor.convert
+func.func @fold_cast(%0: tensor<10x20x30xf64, #COO>) -> tensor<10x20x30xf64, #COO> {
+  %cst = arith.constant 0.000000e+00 : f64
+  %1 = tensor.empty() : tensor<10x20x30xf64>
+  %2 = linalg.generic { indexing_maps = [#map, #map],
+                        iterator_types = ["parallel", "parallel", "parallel"]
+                      }
+  ins (%0 : tensor<10x20x30xf64, #COO>)
+  outs(%1 : tensor<10x20x30xf64>) {
+      ^bb0(%in: f64, %out: f64):
+        %4 = arith.cmpf ugt, %in, %cst : f64
+        %5 = arith.select %4, %in, %cst : f64
+        linalg.yield %5 : f64
+  } -> tensor<10x20x30xf64>
+  %cast = tensor.cast %2 : tensor<10x20x30xf64> to tensor<10x20x30xf64, #COO>
+  return %cast : tensor<10x20x30xf64, #COO>
+}
diff --git a/mlir/test/Dialect/SparseTensor/fuse_sparse_pad_with_consumer.mlir b/mlir/test/Dialect/SparseTensor/fuse_sparse_pad_with_consumer.mlir
new file mode 100644
index 000000000000..4f509bf747ab
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/fuse_sparse_pad_with_consumer.mlir
@@ -0,0 +1,79 @@
+// RUN: mlir-opt %s --sparse-reinterpret-map -sparsification -canonicalize | FileCheck %s
+
+#CSR = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed)
+}>
+
+#elemwise = {
+  indexing_maps = [
+    affine_map<(i,j) -> (i,j)>,  // A
+    affine_map<(i,j) -> (i,j)>,  // B
+    affine_map<(i,j) -> (i,j)>   // X (out)
+  ],
+  iterator_types = ["parallel", "parallel"],
+  doc = "X(i,j) = A(i,j) OP B(i,j)"
+}
+
+
+// CHECK-LABEL:   func.func @padded_mul(
+// CHECK-SAME:                          %[[VAL_0:.*]]: tensor<4x4xf32, #sparse>,
+// CHECK-SAME:                          %[[VAL_1:.*]]: tensor<8x8xf32>) -> tensor<8x8xf32> {
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant -1 : index
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 6 : index
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 8 : index
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 2 : index
+// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[VAL_9:.*]] = tensor.empty() : tensor<8x8xf32>
+// CHECK:           %[[VAL_10:.*]] = linalg.fill ins(%[[VAL_8]] : f32) outs(%[[VAL_9]] : tensor<8x8xf32>) -> tensor<8x8xf32>
+// CHECK:           %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<4x4xf32, #sparse> to memref<?xindex>
+// CHECK:           %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<4x4xf32, #sparse> to memref<?xindex>
+// CHECK:           %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<4x4xf32, #sparse> to memref<?xf32>
+// CHECK:           %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_10]] : memref<8x8xf32>
+// CHECK:           linalg.fill ins(%[[VAL_8]] : f32) outs(%[[VAL_14]] : memref<8x8xf32>)
+// CHECK:           scf.for %[[VAL_15:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_5]] {
+// CHECK:             %[[VAL_16:.*]] = arith.subi %[[VAL_15]], %[[VAL_7]] : index
+// CHECK:             %[[VAL_17:.*]] = arith.cmpi ult, %[[VAL_15]], %[[VAL_7]] : index
+// CHECK:             %[[VAL_18:.*]] = arith.cmpi uge, %[[VAL_15]], %[[VAL_3]] : index
+// CHECK:             %[[VAL_19:.*]] = arith.ori %[[VAL_17]], %[[VAL_18]] : i1
+// CHECK:             %[[VAL_20:.*]]:2 = scf.if %[[VAL_19]] -> (index, index) {
+// CHECK:               scf.yield %[[VAL_6]], %[[VAL_6]] : index, index
+// CHECK:             } else {
+// CHECK:               %[[VAL_21:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_16]]] : memref<?xindex>
+// CHECK:               %[[VAL_22:.*]] = arith.addi %[[VAL_15]], %[[VAL_2]] : index
+// CHECK:               %[[VAL_23:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_22]]] : memref<?xindex>
+// CHECK:               scf.yield %[[VAL_21]], %[[VAL_23]] : index, index
+// CHECK:             }
+// CHECK:             scf.for %[[VAL_24:.*]] = %[[VAL_20]]#0 to %[[VAL_20]]#1 step %[[VAL_5]] {
+// CHECK:               %[[VAL_26:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_24]]] : memref<?xindex>
+// CHECK:               %[[VAL_27:.*]] = arith.addi %[[VAL_26]], %[[VAL_7]] : index
+// CHECK:               %[[VAL_28:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_24]]] : memref<?xf32>
+// CHECK:               %[[VAL_29:.*]] = tensor.extract %[[VAL_1]]{{\[}}%[[VAL_15]], %[[VAL_27]]] : tensor<8x8xf32>
+// CHECK:               %[[VAL_30:.*]] = arith.mulf %[[VAL_28]], %[[VAL_29]] : f32
+// CHECK:               memref.store %[[VAL_30]], %[[VAL_14]]{{\[}}%[[VAL_15]], %[[VAL_27]]] : memref<8x8xf32>
+// CHECK:             } {"Emitted from" = "linalg.generic"}
+// CHECK:           } {"Emitted from" = "linalg.generic"}
+// CHECK:           %[[VAL_31:.*]] = bufferization.to_tensor %[[VAL_14]] : memref<8x8xf32>
+// CHECK:           return %[[VAL_31]] : tensor<8x8xf32>
+// CHECK:         }
+func.func @padded_mul(%arg0: tensor<4x4xf32, #CSR>, %arg1: tensor<8x8xf32>) -> tensor<8x8xf32> {
+  %cst_0 = arith.constant 0.00000e+00 : f32
+  %buf = tensor.empty() : tensor<8x8xf32>
+  %s = linalg.fill ins(%cst_0 : f32) outs(%buf : tensor<8x8xf32>) -> tensor<8x8xf32>
+
+  %padded = tensor.pad %arg0 low[2, 2] high[2, 2] {
+  ^bb0(%arg75: index, %arg76: index):
+    tensor.yield %cst_0 : f32
+  } : tensor<4x4xf32, #CSR> to tensor<8x8xf32, #CSR>
+
+  %0 = linalg.generic #elemwise
+     ins(%padded, %arg1: tensor<8x8xf32, #CSR>, tensor<8x8xf32>)
+    outs(%s: tensor<8x8xf32>) {
+      ^bb(%a: f32, %b: f32, %x: f32):
+        %0 = arith.mulf %a, %b : f32
+        linalg.yield %0 : f32
+  } -> tensor<8x8xf32>
+
+  return %0 : tensor<8x8xf32>
+}
diff --git a/mlir/test/Dialect/SparseTensor/no_fold_into_consumer.mlir b/mlir/test/Dialect/SparseTensor/no_fold_into_consumer.mlir
deleted file mode 100644
index bbc7f397e793..000000000000
--- a/mlir/test/Dialect/SparseTensor/no_fold_into_consumer.mlir
+++ /dev/null
@@ -1,47 +0,0 @@
-// RUN: mlir-opt %s --canonicalize --pre-sparsification-rewrite | FileCheck %s
-
-#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-
-#sparse = #sparse_tensor.encoding<{
-  map = (d0, d1, d2) ->
-          (d0 : compressed(nonunique),
-	   d1 : singleton(nonunique, soa),
-	   d2 : singleton(soa)),
-  posWidth = 64,
-  crdWidth = 64
-}>
-
-
-module {
-  //
-  // This IR should not end up in an infinite loop trying to fold
-  // the linalg producer into the tensor cast consumer (even though
-  // static sizes can fold, the different encodings cannot). The
-  // cast was sloppy to begin with (but it has been observed by
-  // external sources) and can be easily repaired by the sparsifier.
-  //
-  // CHECK-LABEL: func @avoid_fold
-  // CHECK:       arith.constant
-  // CHECK:       tensor.empty()
-  // CHECK:       linalg.generic
-  // CHECK:       sparse_tensor.convert
-  // CHECK:       return
-  //
-  func.func @avoid_fold(%0: tensor<10x20x30xf64, #sparse>) -> tensor<10x20x30xf64, #sparse> {
-    %1 = tensor.empty() : tensor<10x20x30xf64>
-    %2 = linalg.generic { indexing_maps = [#map, #map],
-                          iterator_types = ["parallel", "parallel", "parallel"]
-                        }
-    ins (%0 : tensor<10x20x30xf64, #sparse>)
-    outs(%1 : tensor<10x20x30xf64>) {
-        ^bb0(%in: f64, %out: f64):
-          %cst = arith.constant 0.000000e+00 : f64
-          %4 = arith.cmpf ugt, %in, %cst : f64
-          %5 = arith.select %4, %in, %cst : f64
-          linalg.yield %5 : f64
-    } -> tensor<10x20x30xf64>
-    %cast = tensor.cast %2 : tensor<10x20x30xf64> to tensor<10x20x30xf64, #sparse>
-    return %cast : tensor<10x20x30xf64, #sparse>
-  }
-}
-
diff --git a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
index 66e61afd897d..7eeda9a98802 100644
--- a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
+++ b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
@@ -22,6 +22,64 @@ func.func private @sparse_csr(tensor<?x?xf32, #CSR>)
 
 // -----
 
+#CSR_OnlyOnes = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  posWidth = 64,
+  crdWidth = 64,
+  explicitVal = 1.0 : f32,
+  implicitVal = 0.0 : f32
+}>
+
+// CHECK: #[[$CSR_OnlyOnes:.*]] = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed), posWidth = 64, crdWidth = 64, explicitVal = 1.000000e+00 : f32, implicitVal = 0.000000e+00 : f32 }>
+// CHECK-LABEL: func private @sparse_csr(
+// CHECK-SAME: tensor<?x?xf32, #[[$CSR_OnlyOnes]]>)
+func.func private @sparse_csr(tensor<?x?xf32, #CSR_OnlyOnes>)
+
+// -----
+
+#CSR_OnlyOnes = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  explicitVal = 1.0 : f64,
+  implicitVal = 0.0 : f64
+}>
+
+// CHECK: #[[$CSR_OnlyOnes:.*]] = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed), explicitVal = 1.000000e+00 : f64, implicitVal = 0.000000e+00 : f64 }>
+// CHECK-LABEL: func private @sparse_csr(
+// CHECK-SAME: tensor<?x?xf64, #[[$CSR_OnlyOnes]]>)
+func.func private @sparse_csr(tensor<?x?xf64, #CSR_OnlyOnes>)
+
+// -----
+
+#CSR_OnlyOnes = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  posWidth = 64,
+  crdWidth = 64,
+  explicitVal = 1 : i32,
+  implicitVal = 0 : i32
+}>
+
+// CHECK: #[[$CSR_OnlyOnes:.*]] = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed), posWidth = 64, crdWidth = 64, explicitVal = 1 : i32, implicitVal = 0 : i32 }>
+// CHECK-LABEL: func private @sparse_csr(
+// CHECK-SAME: tensor<?x?xi32, #[[$CSR_OnlyOnes]]>)
+func.func private @sparse_csr(tensor<?x?xi32, #CSR_OnlyOnes>)
+
+// -----
+
+#CSR_OnlyOnes = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  posWidth = 64,
+  crdWidth = 64,
+  explicitVal = 1 : i64,
+  implicitVal = 0 : i64
+}>
+
+// CHECK: #[[$CSR_OnlyOnes:.*]] = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed), posWidth = 64, crdWidth = 64, explicitVal = 1 : i64, implicitVal = 0 : i64 }>
+// CHECK-LABEL: func private @sparse_csr(
+// CHECK-SAME: tensor<?x?xi64, #[[$CSR_OnlyOnes]]>)
+func.func private @sparse_csr(tensor<?x?xi64, #CSR_OnlyOnes>)
+
+// -----
+
 #BCSR = #sparse_tensor.encoding<{
   map = (d0, d1, d2) -> (d0 : batch, d1: dense, d2 : compressed),
 }>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_matmul_one.mlir b/mlir/test/Dialect/SparseTensor/sparse_matmul_one.mlir
new file mode 100755
index 000000000000..82f3147d3206
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/sparse_matmul_one.mlir
@@ -0,0 +1,75 @@
+// RUN: mlir-opt %s --linalg-generalize-named-ops \
+// RUN:             --sparsification-and-bufferization | FileCheck %s
+
+#CSR_ones_complex = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed)
+// explicitVal = (1.0, 0.0) : complex<f32>,
+// implicitVal = (0.0, 0.0) : complex<f32>
+}>
+
+#CSR_ones_fp = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  explicitVal = 1.0 : f32,
+  implicitVal = 0.0 : f32
+}>
+
+#CSR_ones_int = #sparse_tensor.encoding<{
+  map = (d0, d1) -> (d0 : dense, d1 : compressed),
+  explicitVal = 1 : i32,
+  implicitVal = 0 : i32
+}>
+
+// CHECK-LABEL:   func.func @matmul_complex
+//
+// TODO: make this work
+//
+func.func @matmul_complex(%a: tensor<10x20xcomplex<f32>>,
+                          %b: tensor<20x30xcomplex<f32>, #CSR_ones_complex>,
+                          %c: tensor<10x30xcomplex<f32>>) -> tensor<10x30xcomplex<f32>> {
+  %0 = linalg.matmul
+    ins(%a, %b: tensor<10x20xcomplex<f32>>, tensor<20x30xcomplex<f32>,#CSR_ones_complex>)
+    outs(%c: tensor<10x30xcomplex<f32>>) -> tensor<10x30xcomplex<f32>>
+  return %0 : tensor<10x30xcomplex<f32>>
+}
+
+// CHECK-LABEL:   func.func @matmul_fp
+// CHECK:         scf.for
+// CHECK:           scf.for
+// CHECK:             %[[X:.*]] = memref.load
+// CHECK:             scf.for
+// CHECK:               %[[I:.*]] = memref.load
+// CHECK:               %[[Y:.*]] = memref.load
+// CHECK:               %[[M:.*]] = arith.addf %[[Y]], %[[X]] : f32
+// CHECK:               memref.store %[[M]]
+// CHECK:             }
+// CHECK:           }
+// CHECK:         }
+func.func @matmul_fp(%a: tensor<10x20xf32>,
+                     %b: tensor<20x30xf32, #CSR_ones_fp>,
+                     %c: tensor<10x30xf32>) -> tensor<10x30xf32> {
+  %0 = linalg.matmul
+    ins(%a, %b: tensor<10x20xf32>, tensor<20x30xf32,#CSR_ones_fp>)
+    outs(%c: tensor<10x30xf32>) -> tensor<10x30xf32>
+  return %0 : tensor<10x30xf32>
+}
+
+// CHECK-LABEL:   func.func @matmul_int
+// CHECK:         scf.for
+// CHECK:           scf.for
+// CHECK:             %[[X:.*]] = memref.load
+// CHECK:             scf.for
+// CHECK:               %[[I:.*]] = memref.load
+// CHECK:               %[[Y:.*]] = memref.load
+// CHECK:               %[[M:.*]] = arith.addi %[[Y]], %[[X]] : i32
+// CHECK:               memref.store %[[M]]
+// CHECK:             }
+// CHECK:           }
+// CHECK:         }
+func.func @matmul_int(%a: tensor<10x20xi32>,
+                      %b: tensor<20x30xi32, #CSR_ones_int>,
+                      %c: tensor<10x30xi32>) -> tensor<10x30xi32> {
+  %0 = linalg.matmul
+    ins(%a, %b: tensor<10x20xi32>, tensor<20x30xi32,#CSR_ones_int>)
+    outs(%c: tensor<10x30xi32>) -> tensor<10x30xi32>
+  return %0 : tensor<10x30xi32>
+}
diff --git a/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir b/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir
index edb53fa024c2..c96f9c31443d 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_reshape.mlir
@@ -12,7 +12,7 @@
 //
 // CHECK-ROUND-LABEL: func.func @sparse_expand(
 // CHECK-ROUND-SAME:  %[[A:.*]]: tensor<100xf64, #sparse{{[0-9]*}}>) -> tensor<10x10xf64, #sparse{{[0-9]*}}>
-//      CHECK-ROUND:  %[[E:.*]] = tensor.expand_shape %[[A]] {{\[\[}}0, 1]] : tensor<100xf64, #sparse{{[0-9]*}}> into tensor<10x10xf64, #sparse{{[0-9]*}}>
+//      CHECK-ROUND:  %[[E:.*]] = tensor.expand_shape %[[A]] {{\[\[}}0, 1]] output_shape [10, 10] : tensor<100xf64, #sparse{{[0-9]*}}> into tensor<10x10xf64, #sparse{{[0-9]*}}>
 //      CHECK-ROUND:  return %[[E]] : tensor<10x10xf64, #sparse{{[0-9]*}}>
 //
 // CHECK-LABEL:   func.func @sparse_expand(
@@ -39,7 +39,7 @@
 // CHECK:         return %[[NT1]] : tensor<10x10xf64, #sparse{{[0-9]*}}>
 //
 func.func @sparse_expand(%arg0: tensor<100xf64, #SparseVector>) -> tensor<10x10xf64, #SparseMatrix> {
-  %0 = tensor.expand_shape %arg0 [[0, 1]] :
+  %0 = tensor.expand_shape %arg0 [[0, 1]] output_shape [10, 10] :
     tensor<100xf64, #SparseVector> into tensor<10x10xf64, #SparseMatrix>
   return %0 : tensor<10x10xf64, #SparseMatrix>
 }
@@ -94,8 +94,8 @@ func.func @sparse_collapse(%arg0: tensor<10x10xf64, #SparseMatrix>) -> tensor<10
 // roundtrip:
 //
 // CHECK-ROUND-LABEL: func.func @dynamic_sparse_expand(
-// CHECK-ROUND-SAME:  %[[A:.*]]: tensor<?xf64, #sparse{{[0-9]*}}>) -> tensor<?x10xf64, #sparse{{[0-9]*}}>
-//      CHECK-ROUND:  %[[E:.*]] = tensor.expand_shape %[[A]] {{\[\[}}0, 1]] : tensor<?xf64, #sparse{{[0-9]*}}> into tensor<?x10xf64, #sparse{{[0-9]*}}>
+// CHECK-ROUND-SAME:  %[[A:.*]]: tensor<?xf64, #sparse{{[0-9]*}}>, %[[SZ0:.*]]: index) -> tensor<?x10xf64, #sparse{{[0-9]*}}>
+//      CHECK-ROUND:  %[[E:.*]] = tensor.expand_shape %[[A]] {{\[\[}}0, 1]] output_shape [%[[SZ0]], 10] : tensor<?xf64, #sparse{{[0-9]*}}> into tensor<?x10xf64, #sparse{{[0-9]*}}>
 //      CHECK-ROUND:  return %[[E]] : tensor<?x10xf64, #sparse{{[0-9]*}}>
 //
 // CHECK-LABEL:   func.func @dynamic_sparse_expand(
@@ -127,8 +127,8 @@ func.func @sparse_collapse(%arg0: tensor<10x10xf64, #SparseMatrix>) -> tensor<10
 // CHECK-NOT:     sparse_tensor.convert
 // CHECK:         return %[[NT1]] : tensor<?x10xf64, #sparse{{[0-9]*}}>
 //
-func.func @dynamic_sparse_expand(%arg0: tensor<?xf64, #SparseVector>) -> tensor<?x10xf64, #SparseMatrix> {
-  %0 = tensor.expand_shape %arg0 [[0, 1]] :
+func.func @dynamic_sparse_expand(%arg0: tensor<?xf64, #SparseVector>, %sz0: index) -> tensor<?x10xf64, #SparseMatrix> {
+  %0 = tensor.expand_shape %arg0 [[0, 1]] output_shape [%sz0, 10] :
     tensor<?xf64, #SparseVector> into tensor<?x10xf64, #SparseMatrix>
   return %0 : tensor<?x10xf64, #SparseMatrix>
 }
diff --git a/mlir/test/Dialect/Tensor/bufferize.mlir b/mlir/test/Dialect/Tensor/bufferize.mlir
index 815bc383af95..4f553adcc500 100644
--- a/mlir/test/Dialect/Tensor/bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/bufferize.mlir
@@ -367,11 +367,14 @@ func.func @tensor.insert(%t1: tensor<5xf32>, %idx1: index, %f: f32) -> tensor<5x
 
 // CHECK-LABEL: func @tensor.expand_shape(
 //  CHECK-SAME:     %[[t1:.*]]: tensor<?x10xf32>
-func.func @tensor.expand_shape(%t1: tensor<?x10xf32>) -> tensor<2x?x10xf32> {
+func.func @tensor.expand_shape(%t1: tensor<?x10xf32>, %sz0: index) -> tensor<2x?x10xf32> {
   // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<?x10xf32>
-  // CHECK: %[[expanded:.*]] = memref.expand_shape %[[m1]] [
-  // CHECK-SAME: [0, 1], [2]] : memref<?x10xf32> into memref<2x?x10xf32>
-  %0 = tensor.expand_shape %t1 [[0, 1], [2]]
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[DIM:.*]] = memref.dim %[[m1]], %[[C0]] : memref<?x10xf32>
+  // CHECK: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK: %[[VAL_1:.*]] = arith.divui %[[DIM]], %[[C2]] : index
+  // CHECK: %[[expanded:.*]] = memref.expand_shape %[[m1]] {{\[\[}}0, 1], [2]] output_shape [2, %[[VAL_1]], 10] : memref<?x10xf32> into memref<2x?x10xf32>
+  %0 = tensor.expand_shape %t1 [[0, 1], [2]] output_shape [2, %sz0, 10]
       : tensor<?x10xf32> into tensor<2x?x10xf32>
 
   // CHECK: %[[r:.*]] = bufferization.to_tensor %[[expanded]]
@@ -384,14 +387,15 @@ func.func @tensor.expand_shape(%t1: tensor<?x10xf32>) -> tensor<2x?x10xf32> {
 // CHECK-LABEL: func @tensor.expand_shape_of_slice(
 //  CHECK-SAME:     %[[t1:.*]]: tensor<?x20xf32>
 func.func @tensor.expand_shape_of_slice(
-    %t1: tensor<?x20xf32>, %o1: index, %s1: index) -> tensor<?x7x2x5xf32> {
+    %t1: tensor<?x20xf32>, %o1: index, %s1: index, %sz0: index) -> tensor<?x7x2x5xf32> {
   // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<?x20xf32>
   // CHECK: %[[subview:.*]] = memref.subview %[[m1]][%{{.*}}, 5] [%{{.*}}, 10] [1, 1] : memref<?x20xf32> to memref<?x10xf32, strided<[20, 1], offset: ?>>
   %0 = tensor.extract_slice %t1[%o1, 5][%s1, 10][1, 1] :
       tensor<?x20xf32> to tensor<?x10xf32>
-  // CHECK: %[[expanded:.*]] = memref.expand_shape %[[subview]] [
-  // CHECK-SAME: [0, 1], [2, 3]] : memref<?x10xf32, strided<[20, 1], offset: ?>> into memref<?x7x2x5xf32, strided<[140, 20, 5, 1], offset: ?>>
-  %1 = tensor.expand_shape %0 [[0, 1], [2, 3]] :
+  // CHECK: %[[C7:.*]] = arith.constant 7 : index
+  // CHECK: %[[VAL_1:.*]] = arith.divui %{{.*}}, %[[C7]] : index
+  // CHECK: %[[expanded:.*]] = memref.expand_shape %[[subview]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_1]], 7, 2, 5] : memref<?x10xf32, strided<[20, 1], offset: ?>> into memref<?x7x2x5xf32, strided<[140, 20, 5, 1], offset: ?>>
+  %1 = tensor.expand_shape %0 [[0, 1], [2, 3]] output_shape [%sz0, 7, 2, 5] :
       tensor<?x10xf32> into tensor<?x7x2x5xf32>
   // CHECK: %[[r:.*]] = bufferization.to_tensor %[[expanded]]
   // CHECK: return %[[r]]
@@ -407,8 +411,8 @@ func.func @tensor.expand_shape_of_scalar_slice(
   // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<?xf32>
   // CHECK: %[[subview:.*]] = memref.subview %[[m1]][%{{.*}}] [1] [1] :  memref<?xf32> to memref<f32, strided<[], offset: ?>>
   %0 = tensor.extract_slice %t1[%o1][1][1] : tensor<?xf32> to tensor<f32>
-  // CHECK: %[[expanded:.*]] = memref.expand_shape %[[subview]] [] : memref<f32, strided{{.*}}> into memref<1xf32, strided<[1], offset: ?>>
-  %1 = tensor.expand_shape %0 [] : tensor<f32> into tensor<1xf32>
+  // CHECK: %[[expanded:.*]] = memref.expand_shape %[[subview]] [] output_shape [1] : memref<f32, strided{{.*}}> into memref<1xf32, strided<[1], offset: ?>>
+  %1 = tensor.expand_shape %0 [] output_shape [1] : tensor<f32> into tensor<1xf32>
   // CHECK: %[[r:.*]] = bufferization.to_tensor %[[expanded]]
   // CHECK: return %[[r]]
   return %1 : tensor<1xf32>
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 751c57eacd7a..6177fe3c752c 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -4,7 +4,7 @@
 // CHECK-LABEL: expand_shape_identity_fold
 // CHECK-NEXT: return
 func.func @expand_shape_identity_fold(%arg0 : tensor<5xf32>) -> tensor<5xf32> {
-  %0 = tensor.expand_shape %arg0 [[0]] : tensor<5xf32> into tensor<5xf32>
+  %0 = tensor.expand_shape %arg0 [[0]] output_shape [5] : tensor<5xf32> into tensor<5xf32>
   return %0 : tensor<5xf32>
 }
 
@@ -13,7 +13,7 @@ func.func @expand_shape_identity_fold(%arg0 : tensor<5xf32>) -> tensor<5xf32> {
 // CHECK-LABEL: expand_shape_rank0_identity_fold
 // CHECK-NEXT: return
 func.func @expand_shape_rank0_identity_fold(%arg0 : tensor<f32>) -> tensor<f32> {
-  %0 = tensor.expand_shape %arg0 [] : tensor<f32> into tensor<f32>
+  %0 = tensor.expand_shape %arg0 [] output_shape [] : tensor<f32> into tensor<f32>
   return %0 : tensor<f32>
 }
 
@@ -1051,29 +1051,28 @@ func.func @fold_overlapping_insert(%input : tensor<?x?x?xf32>, %slice1: tensor<4
 
 // -----
 
-func.func @compose_expand_of_expand(%arg0 : tensor<?x?xf32>)
+func.func @compose_expand_of_expand(%arg0 : tensor<?x?xf32>, %arg1: index, %arg2: index, %arg3: index, %arg4: index)
     -> tensor<?x6x4x?x5xf32> {
-  %0 = tensor.expand_shape %arg0 [[0, 1], [2]]
+  %0 = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [%arg1, 4, %arg2]
       : tensor<?x?xf32> into tensor<?x4x?xf32>
-  %1 = tensor.expand_shape %0 [[0, 1], [2], [3, 4]]
-      : tensor<?x4x?xf32> into tensor<?x6x4x?x5xf32>
+  %1 = tensor.expand_shape %0 [[0, 1], [2], [3, 4]] output_shape [%arg3, 6, 4, %arg4, 5] : tensor<?x4x?xf32> into tensor<?x6x4x?x5xf32>
   return %1 : tensor<?x6x4x?x5xf32>
 }
 // CHECK-LABEL: compose_expand_of_expand
-//       CHECK:   tensor.expand_shape %{{.*}} {{\[}}[0, 1, 2], [3, 4]]
+//       CHECK:   tensor.expand_shape %{{.*}} {{\[}}[0, 1, 2], [3, 4]] output_shape [%arg3, 6, 4, %arg4, 5]
 //   CHECK-NOT:   tensor.expand_shape
 
 // -----
 
 func.func @compose_expand_of_expand_of_zero_dim(%arg0 : tensor<f32>)
     -> tensor<1x1x1xf32> {
-  %0 = tensor.expand_shape %arg0 [] : tensor<f32> into tensor<1xf32>
-  %1 = tensor.expand_shape %0 [[0, 1, 2]]
+  %0 = tensor.expand_shape %arg0 [] output_shape [1] : tensor<f32> into tensor<1xf32>
+  %1 = tensor.expand_shape %0 [[0, 1, 2]] output_shape [1, 1, 1]
       : tensor<1xf32> into tensor<1x1x1xf32>
   return %1 : tensor<1x1x1xf32>
 }
 // CHECK-LABEL: compose_expand_of_expand_of_zero_dim
-//       CHECK:   tensor.expand_shape %{{.*}} []
+//       CHECK:   tensor.expand_shape %{{.*}} [] output_shape [1, 1, 1]
 //  CHECK-SAME:     tensor<f32> into tensor<1x1x1xf32>
 
 // -----
@@ -1093,7 +1092,7 @@ func.func @collapse_of_cast(%t: tensor<8x12x32xf32>) -> tensor<?x32xf32> {
 // -----
 
 func.func @fold_collapse_of_expand(%arg0 : tensor<12x4xf32>) -> tensor<12x4xf32> {
-  %0 = tensor.expand_shape %arg0 [[0, 1], [2]]
+  %0 = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [3, 4, 4]
       : tensor<12x4xf32> into tensor<3x4x4xf32>
   %1 = tensor.collapse_shape %0 [[0, 1], [2]]
       : tensor<3x4x4xf32> into tensor<12x4xf32>
@@ -1104,9 +1103,9 @@ func.func @fold_collapse_of_expand(%arg0 : tensor<12x4xf32>) -> tensor<12x4xf32>
 
 // -----
 
-func.func @fold_collapse_of_expand_dynamic(%arg0 : tensor<?x?xf32>)
+func.func @fold_collapse_of_expand_dynamic(%arg0 : tensor<?x?xf32>, %arg1: index, %arg2: index)
     -> tensor<?x?xf32> {
-  %0 = tensor.expand_shape %arg0 [[0, 1], [2]]
+  %0 = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [%arg1, 4, %arg2]
       : tensor<?x?xf32> into tensor<?x4x?xf32>
   %1 = tensor.collapse_shape %0 [[0, 1], [2]]
       : tensor<?x4x?xf32> into tensor<?x?xf32>
@@ -1121,7 +1120,7 @@ func.func @compose_expand_of_collapse(%arg0 : tensor<2x3x4x5x6x7x8xf32>)
     -> tensor<24x5x42x8xf32> {
   %0 = tensor.collapse_shape %arg0 [[0, 1, 2, 3, 4, 5, 6]]
       : tensor<2x3x4x5x6x7x8xf32> into tensor<40320xf32>
-  %1 = tensor.expand_shape %0 [[0, 1, 2, 3]]
+  %1 = tensor.expand_shape %0 [[0, 1, 2, 3]] output_shape [24, 5, 42, 8]
       : tensor<40320xf32> into tensor<24x5x42x8xf32>
   return %1 : tensor<24x5x42x8xf32>
 }
@@ -1137,7 +1136,7 @@ func.func @compose_expand_of_collapse_7D(%arg0 : tensor<24x5x42x8xf32>)
     -> tensor<2x3x4x5x6x7x8xf32> {
   %0 = tensor.collapse_shape %arg0 [[0, 1, 2, 3]]
       : tensor<24x5x42x8xf32> into tensor<40320xf32>
-  %1 = tensor.expand_shape %0 [[0, 1, 2, 3, 4, 5, 6]]
+  %1 = tensor.expand_shape %0 [[0, 1, 2, 3, 4, 5, 6]] output_shape [2, 3, 4, 5, 6, 7, 8]
       : tensor<40320xf32> into tensor<2x3x4x5x6x7x8xf32>
   return %1 : tensor<2x3x4x5x6x7x8xf32>
 }
@@ -1149,16 +1148,16 @@ func.func @compose_expand_of_collapse_7D(%arg0 : tensor<24x5x42x8xf32>)
 
 // -----
 
-func.func @compose_collapse_of_expand(%arg : tensor<?x?x?xi64>)
+func.func @compose_collapse_of_expand(%arg : tensor<?x?x?xi64>, %arg1: index, %arg2: index, %arg3: index)
     -> tensor<?x?xi64> {
-  %0 = tensor.expand_shape %arg [[0], [1], [2, 3]]
+  %0 = tensor.expand_shape %arg [[0], [1], [2, 3]] output_shape [%arg1, %arg2, %arg3, 1]
     : tensor<?x?x?xi64> into tensor<?x?x?x1xi64>
   %1 = tensor.collapse_shape %0 [[0, 1], [2, 3]]
     : tensor<?x?x?x1xi64> into tensor<?x?xi64>
   return %1 : tensor<?x?xi64>
 }
 // CHECK-LABEL: func @compose_collapse_of_expand
-//       CHECK:   (%[[ARG:.*]]: tensor<?x?x?xi64>)
+//       CHECK:   (%[[ARG:.*]]: tensor<?x?x?xi64>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index, %[[ARG3:.*]]: index)
 //  CHECK-NEXT: tensor.collapse_shape %[[ARG]]
 //  CHECK-SAME:   [0, 1], [2]
 //  CHECK-SAME:   : tensor<?x?x?xi64> into tensor<?x?xi64>
@@ -1167,14 +1166,14 @@ func.func @compose_collapse_of_expand(%arg : tensor<?x?x?xi64>)
 
 func.func @compose_collapse_of_expand_1D(%arg0 : tensor<2048xf32>)
     -> tensor<4x512xf32> {
-  %0 = tensor.expand_shape %arg0 [[0, 1, 2, 3]]
+  %0 = tensor.expand_shape %arg0 [[0, 1, 2, 3]] output_shape [1, 4, 1, 512]
     : tensor<2048xf32> into tensor<1x4x1x512xf32>
   %1 = tensor.collapse_shape %0 [[0, 1, 2], [3]]
     : tensor<1x4x1x512xf32> into tensor<4x512xf32>
   return %1 : tensor<4x512xf32>
 }
 //       CHECK: func @compose_collapse_of_expand_1D
-//       CHECK: tensor.expand_shape %{{.*}} {{\[}}[0, 1]]
+//       CHECK: tensor.expand_shape %{{.*}} {{\[}}[0, 1]] output_shape [4, 512]
 //  CHECK-SAME:   tensor<2048xf32> into tensor<4x512xf32>
 
 // -----
@@ -1183,14 +1182,14 @@ func.func @compose_expand_of_collapse_0_rank_to_expand(%arg0 : tensor<1x1x1xf32>
     -> tensor<1x1x1x1xf32> {
   %0 = tensor.collapse_shape %arg0 []
       : tensor<1x1x1xf32> into tensor<f32>
-  %1 = tensor.expand_shape %0 []
+  %1 = tensor.expand_shape %0 [] output_shape [1, 1, 1, 1]
       : tensor<f32> into tensor<1x1x1x1xf32>
   return %1 : tensor<1x1x1x1xf32>
 }
 //      CHECK: func @compose_expand_of_collapse_0_rank_to_expand
 // CHECK-SAME:   %[[ARG0:.+]]: tensor<1x1x1xf32>
 //      CHECK:   %[[RESULT:.+]] = tensor.expand_shape %[[ARG0]]
-// CHECK-SAME:     [0], [1], [2, 3]
+// CHECK-SAME:     {{\[}}[0], [1], [2, 3]] output_shape [1, 1, 1, 1]
 //      CHECK:   return %[[RESULT]]
 
 // -----
@@ -1199,7 +1198,7 @@ func.func @compose_expand_of_collapse_0_rank_to_collapse(%arg0 : tensor<1x1x1x1x
     -> tensor<1x1x1xf32> {
   %0 = tensor.collapse_shape %arg0 []
       : tensor<1x1x1x1xf32> into tensor<f32>
-  %1 = tensor.expand_shape %0 []
+  %1 = tensor.expand_shape %0 [] output_shape [1, 1, 1]
       : tensor<f32> into tensor<1x1x1xf32>
   return %1 : tensor<1x1x1xf32>
 }
@@ -1214,8 +1213,8 @@ func.func @compose_expand_of_collapse_0_rank_to_collapse(%arg0 : tensor<1x1x1x1x
 // CHECK-LABEL: func @zero_rank_reshape_multi
 func.func @zero_rank_reshape_multi(%arg0: tensor<f32>) -> tensor<f32> {
   // CHECK: return %arg0
-  %0 = tensor.expand_shape %arg0 [] : tensor<f32> into tensor<1xf32>
-  %1 = tensor.expand_shape %0 [[0, 1]] : tensor<1xf32> into tensor<1x1xf32>
+  %0 = tensor.expand_shape %arg0 [] output_shape [1] : tensor<f32> into tensor<1xf32>
+  %1 = tensor.expand_shape %0 [[0, 1]] output_shape [1, 1] : tensor<1xf32> into tensor<1x1xf32>
   %2 = tensor.collapse_shape %1 [] : tensor<1x1xf32> into tensor<f32>
   return %2 : tensor<f32>
 }
@@ -1250,7 +1249,7 @@ func.func @compose_collapse_of_collapse_zero_dim(%arg0 : tensor<1x1x1xf32>)
 // -----
 
 func.func @fold_collapse_of_expand_1D(%arg0 : tensor<4x512xf32>) -> tensor<2048xf32> {
-  %0 = tensor.expand_shape %arg0 [[0, 1, 2], [3]]
+  %0 = tensor.expand_shape %arg0 [[0, 1, 2], [3]] output_shape [1, 4, 1, 512]
     : tensor<4x512xf32> into tensor<1x4x1x512xf32>
   %1 = tensor.collapse_shape %0 [[0, 1, 2, 3]]
     : tensor<1x4x1x512xf32> into tensor<2048xf32>
@@ -1264,42 +1263,40 @@ func.func @fold_collapse_of_expand_1D(%arg0 : tensor<4x512xf32>) -> tensor<2048x
 
 func.func @fold_collapse_of_expand_unit_dims(%arg0 : tensor<2048x1x1xf32>)
     -> tensor<4x512x1x1xf32> {
-  %0 = tensor.expand_shape %arg0 [[0, 1, 2, 3], [4], [5]]
-    : tensor<2048x1x1xf32> into tensor<1x4x1x512x1x1xf32>
+  %0 = tensor.expand_shape %arg0 [[0, 1, 2, 3], [4], [5]] output_shape [1, 4, 1, 512, 1, 1] : tensor<2048x1x1xf32> into tensor<1x4x1x512x1x1xf32>
   %1 = tensor.collapse_shape %0 [[0, 1, 2], [3], [4], [5]]
     : tensor<1x4x1x512x1x1xf32> into tensor<4x512x1x1xf32>
   return %1 : tensor<4x512x1x1xf32>
 }
 //       CHECK: func @fold_collapse_of_expand_unit_dims
-//       CHECK: tensor.expand_shape %{{.*}} {{\[}}[0, 1], [2], [3]]
+//       CHECK: tensor.expand_shape %{{.*}} {{\[}}[0, 1], [2], [3]] output_shape [4, 512, 1, 1]
 //  CHECK-SAME:   tensor<2048x1x1xf32> into tensor<4x512x1x1xf32>
 
 // -----
 
 func.func @compose_collapse_of_expand_unit_dims(%arg0 : tensor<2048x1x2048xf32>)
     -> tensor<4x512x1x512x4xf32> {
-  %0 = tensor.expand_shape %arg0 [[0, 1, 2, 3, 4], [5], [6, 7, 8]]
-    : tensor<2048x1x2048xf32> into tensor<1x4x1x512x1x1x512x1x4xf32>
+  %0 = tensor.expand_shape %arg0 [[0, 1, 2, 3, 4], [5], [6, 7, 8]] output_shape [1, 4, 1, 512, 1, 1, 512, 1, 4] : tensor<2048x1x2048xf32> into tensor<1x4x1x512x1x1x512x1x4xf32>
   %1 = tensor.collapse_shape %0 [[0, 1, 2], [3, 4], [5], [6, 7], [8]]
     : tensor<1x4x1x512x1x1x512x1x4xf32> into tensor<4x512x1x512x4xf32>
   return %1 : tensor<4x512x1x512x4xf32>
 }
 //       CHECK: func @compose_collapse_of_expand_unit_dims
-//       CHECK: tensor.expand_shape %{{.*}} {{\[}}[0, 1], [2], [3, 4]]
+//       CHECK: tensor.expand_shape %{{.*}} {{\[}}[0, 1], [2], [3, 4]] output_shape [4, 512, 1, 512, 4]
 //  CHECK-SAME:   tensor<2048x1x2048xf32> into tensor<4x512x1x512x4xf32>
 
 // -----
 
 func.func @compose_collapse_of_expand_trailing_unit_dims(%arg0: tensor<2xf32>)
     -> tensor<2x1xf32> {
-  %0 = tensor.expand_shape %arg0 [[0, 1, 2]]
+  %0 = tensor.expand_shape %arg0 [[0, 1, 2]] output_shape [2, 1, 1]
       : tensor<2xf32> into tensor<2x1x1xf32>
   %1 = tensor.collapse_shape %0 [[0], [1, 2]]
       : tensor<2x1x1xf32> into tensor<2x1xf32>
   return %1 : tensor<2x1xf32>
 }
 //       CHECK: func @compose_collapse_of_expand_trailing_unit_dims
-//       CHECK: tensor.expand_shape %{{.*}} {{\[}}[0, 1]]
+//       CHECK: tensor.expand_shape %{{.*}} {{\[}}[0, 1]] output_shape [2, 1]
 //  CHECK-SAME:   tensor<2xf32> into tensor<2x1xf32>
 
 // -----
@@ -1321,14 +1318,13 @@ func.func @compose_collapse_of_collapse_unit_dims_dynamic(
 
 func.func @fold_collapse_of_expand_trailing_unit_dims(%arg0: tensor<2xf32>)
     -> tensor<2x1xf32> {
-  %0 = tensor.expand_shape %arg0 [[0, 1, 2]]
-      : tensor<2xf32> into tensor<2x1x1xf32>
+  %0 = tensor.expand_shape %arg0 [[0, 1, 2]] output_shape [2, 1, 1] : tensor<2xf32> into tensor<2x1x1xf32>
   %1 = tensor.collapse_shape %0 [[0], [1, 2]]
       : tensor<2x1x1xf32> into tensor<2x1xf32>
   return %1 : tensor<2x1xf32>
 }
 //       CHECK: func @fold_collapse_of_expand_trailing_unit_dims
-//       CHECK: tensor.expand_shape %{{.*}} {{\[}}[0, 1]]
+//       CHECK: tensor.expand_shape %{{.*}} {{\[}}[0, 1]] output_shape [2, 1]
 //  CHECK-SAME:   tensor<2xf32> into tensor<2x1xf32>
 
 // -----
@@ -1349,8 +1345,7 @@ func.func @fold_collapse_of_collapse_trailing_unit_dims_dynamic(
 
 func.func @fold_collapse_of_expand_trailing_unit_dims(%arg0: tensor<12x42x1x1xf32>)
     -> tensor<12x42xf32> {
-  %0 = tensor.expand_shape %arg0 [[0], [1], [2], [3, 4]]
-      : tensor<12x42x1x1xf32> into tensor<12x42x1x1x1xf32>
+  %0 = tensor.expand_shape %arg0 [[0], [1], [2], [3, 4]] output_shape [12, 42, 1, 1, 1] : tensor<12x42x1x1xf32> into tensor<12x42x1x1x1xf32>
   %1 = tensor.collapse_shape %0 [[0], [1, 2, 3, 4]]
       : tensor<12x42x1x1x1xf32> into tensor<12x42xf32>
   return %1 : tensor<12x42xf32>
@@ -1361,9 +1356,9 @@ func.func @fold_collapse_of_expand_trailing_unit_dims(%arg0: tensor<12x42x1x1xf3
 
 // -----
 
-func.func @fold_collapse_of_expand_unit_dims_in_middle(%arg0 : tensor<?x?x?xf32>)
+func.func @fold_collapse_of_expand_unit_dims_in_middle(%arg0 : tensor<?x?x?xf32>, %sz0: index, %sz1: index, %sz2: index)
     -> tensor<?x?xf32> {
-  %0 = tensor.expand_shape %arg0 [[0], [1], [2, 3]]
+  %0 = tensor.expand_shape %arg0 [[0], [1], [2, 3]] output_shape [%sz0, %sz1, 1, %sz2]
       : tensor<?x?x?xf32> into tensor<?x?x1x?xf32>
   %1 = tensor.collapse_shape %0 [[0], [1, 2, 3]]
       : tensor<?x?x1x?xf32> into tensor<?x?xf32>
@@ -1378,7 +1373,7 @@ func.func @fold_collapse_of_expand_unit_dims_in_middle(%arg0 : tensor<?x?x?xf32>
 
 func.func @no_fold_collapse_of_expand_incompatible(%arg0 : tensor<4x6x8xf32>)
     -> tensor<2x6x16xf32> {
-  %0 = tensor.expand_shape %arg0 [[0, 1], [2, 3], [4]]
+  %0 = tensor.expand_shape %arg0 [[0, 1], [2, 3], [4]] output_shape [2, 2, 3, 2, 8]
       : tensor<4x6x8xf32> into tensor<2x2x3x2x8xf32>
   %1 = tensor.collapse_shape %0 [[0], [1, 2], [3, 4]]
       : tensor<2x2x3x2x8xf32> into tensor<2x6x16xf32>
@@ -1392,7 +1387,7 @@ func.func @no_fold_collapse_of_expand_incompatible(%arg0 : tensor<4x6x8xf32>)
 
 func.func @no_fold_collapse_of_expand_empty_expr(%arg0: tensor<3x2x2xf32>)
     -> tensor<12x1xf32> {
-  %0 = tensor.expand_shape %arg0 [[0], [1], [2, 3]]
+  %0 = tensor.expand_shape %arg0 [[0], [1], [2, 3]] output_shape [3, 2, 2, 1]
       : tensor<3x2x2xf32> into tensor<3x2x2x1xf32>
   %1 = tensor.collapse_shape %0 [[0, 1, 2], [3]]
       : tensor<3x2x2x1xf32> into tensor<12x1xf32>
@@ -1401,7 +1396,7 @@ func.func @no_fold_collapse_of_expand_empty_expr(%arg0: tensor<3x2x2xf32>)
 //      CHECK: func @no_fold_collapse_of_expand_empty_expr
 // CHECK-SAME:    %[[ARG0:.+]]: tensor<3x2x2xf32>
 //      CHECK:    %[[RARG0:.+]] = tensor.expand_shape %[[ARG0]]
-// CHECK-SAME:      [0], [1], [2, 3]
+// CHECK-SAME:      {{\[}}[0], [1], [2, 3]] output_shape [3, 2, 2, 1]
 //      CHECK:    %[[RES:.+]] = tensor.collapse_shape %[[RARG0]]
 // CHECK-SAME:      [0, 1, 2], [3]
 //      CHECK:    return %[[RES:.+]] : tensor<12x1xf32>
@@ -1410,7 +1405,7 @@ func.func @no_fold_collapse_of_expand_empty_expr(%arg0: tensor<3x2x2xf32>)
 
 func.func @reshape_splat_constant_int32() -> tensor<2x4x2xi32> {
   %c0 = arith.constant dense<42> : tensor<2x8xi32>
-  %0 = tensor.expand_shape %c0 [[0], [1, 2]]
+  %0 = tensor.expand_shape %c0 [[0], [1, 2]] output_shape [2, 4, 2]
       : tensor<2x8xi32> into tensor<2x4x2xi32>
   return %0 : tensor<2x4x2xi32>
 }
@@ -1421,7 +1416,7 @@ func.func @reshape_splat_constant_int32() -> tensor<2x4x2xi32> {
 // -----
 func.func @expand_shape_splat(%arg : f32) -> tensor<2x2x2xf32> {
   %c0 = tensor.splat %arg : tensor<2x4xf32>
-  %0 = tensor.expand_shape %c0 [[0], [1, 2]]
+  %0 = tensor.expand_shape %c0 [[0], [1, 2]] output_shape [2, 2, 2]
       : tensor<2x4xf32> into tensor<2x2x2xf32>
   return %0 : tensor<2x2x2xf32>
 }
@@ -1434,13 +1429,12 @@ func.func @expand_shape_splat(%arg : f32) -> tensor<2x2x2xf32> {
 // -----
 
 // CHECK-LABEL: @expand_shape_splat_dynamic_no_fold
-// CHECK-SAME: %[[F:.+]]: f32
-// CHECK-SAME: %[[M:.+]]: index
-func.func @expand_shape_splat_dynamic_no_fold(%arg: f32, %m: index) -> tensor<2x2x?xf32> {
-  // CHECK: %[[SPLAT:.+]] = tensor.splat %[[F]][%[[M]]]
+// CHECK-SAME: (%[[F:.+]]: f32, %[[M:.+]]: index, %[[SZ0:.+]]: index)
+func.func @expand_shape_splat_dynamic_no_fold(%arg: f32, %m: index, %sz0: index) -> tensor<2x2x?xf32> {
+  // CHECK: %[[SPLAT:.+]] = tensor.splat %[[F]][%[[M]]] : tensor<2x?xf32>
   // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[SPLAT]]
   %c0 = tensor.splat %arg[%m] : tensor<2x?xf32>
-  %0 = tensor.expand_shape %c0 [[0], [1, 2]] : tensor<2x?xf32> into tensor<2x2x?xf32>
+  %0 = tensor.expand_shape %c0 [[0], [1, 2]] output_shape [2, 2, %sz0] : tensor<2x?xf32> into tensor<2x2x?xf32>
   return %0 : tensor<2x2x?xf32>
 }
 
@@ -1475,7 +1469,7 @@ func.func @collapse_shape_splat_dynamic_no_fold(%f: f32, %m: index) -> tensor<2x
 
 func.func @reshape_splat_constant_int16() -> tensor<2x4x2xi16> {
   %c0 = arith.constant dense<42> : tensor<2x8xi16>
-  %0 = tensor.expand_shape %c0 [[0], [1, 2]]
+  %0 = tensor.expand_shape %c0 [[0], [1, 2]] output_shape [2, 4, 2]
       : tensor<2x8xi16> into tensor<2x4x2xi16>
   return %0 : tensor<2x4x2xi16>
 }
@@ -1488,7 +1482,7 @@ func.func @reshape_splat_constant_int16() -> tensor<2x4x2xi16> {
 
 func.func @reshape_splat_constant_float32() -> tensor<2x4x2xf32> {
   %c0 = arith.constant dense<42.0> : tensor<2x8xf32>
-  %0 = tensor.expand_shape %c0 [[0], [1, 2]]
+  %0 = tensor.expand_shape %c0 [[0], [1, 2]] output_shape [2, 4, 2]
       : tensor<2x8xf32> into tensor<2x4x2xf32>
   return %0 : tensor<2x4x2xf32>
 }
@@ -1501,7 +1495,7 @@ func.func @reshape_splat_constant_float32() -> tensor<2x4x2xf32> {
 
 func.func @reshape_splat_constant_float64() -> tensor<2x4x2xf64> {
   %c0 = arith.constant dense<42.0> : tensor<2x8xf64>
-  %0 = tensor.expand_shape %c0 [[0], [1, 2]]
+  %0 = tensor.expand_shape %c0 [[0], [1, 2]] output_shape [2, 4, 2]
       : tensor<2x8xf64> into tensor<2x4x2xf64>
   return %0 : tensor<2x4x2xf64>
 }
@@ -1851,7 +1845,7 @@ func.func @fold_expand_shape_from_elements(%arg0: i32) -> tensor<1xi32> {
   // CHECK: %[[FROM:.+]] = tensor.from_elements %arg0 : tensor<1xi32>
   // CHECK: return %[[FROM]] : tensor<1xi32>
   %0 = tensor.from_elements %arg0 : tensor<i32>
-  %1 = tensor.expand_shape %0 [] : tensor<i32> into tensor<1xi32>
+  %1 = tensor.expand_shape %0 [] output_shape [1] : tensor<i32> into tensor<1xi32>
   return %1 : tensor<1xi32>
 }
 
@@ -2073,9 +2067,9 @@ func.func @empty_tensor_canonicalize(%i : index) {
 //       CHECK:   %[[dim:.*]] = tensor.dim %[[t]], %[[c1]] : tensor<?x?xf32>
 //       CHECK:   %[[apply:.*]] = affine.apply #[[$map]]()[%[[dim]]]
 //       CHECK:   return %[[apply]]
-func.func @dim_of_expand_shape(%t: tensor<?x?xf32>) -> index {
+func.func @dim_of_expand_shape(%t: tensor<?x?xf32>, %sz0: index, %sz1: index) -> index {
   %c2 = arith.constant 2 : index
-  %0 = tensor.expand_shape %t [[0], [1, 2, 3, 4, 5]]
+  %0 = tensor.expand_shape %t [[0], [1, 2, 3, 4, 5]] output_shape [%sz0, 1, %sz1, 5, 1, 8]
       : tensor<?x?xf32> into tensor<?x1x?x5x1x8xf32>
   %1 = tensor.dim %0, %c2 : tensor<?x1x?x5x1x8xf32>
   return %1 : index
@@ -2107,9 +2101,9 @@ func.func @dim_of_collapse_shape(%t: tensor<?x?x?x7x?xf32>) -> index {
 // CHECK-LABEL: func @collapse_expand_fold_to_cast(
 //  CHECK-SAME:     %[[t:.*]]: tensor<?xf32>
 //       CHECK:   return %[[t]]
-func.func @collapse_expand_fold_to_cast(%t: tensor<?xf32>) -> (tensor<?xf32>)
+func.func @collapse_expand_fold_to_cast(%t: tensor<?xf32>, %sz0: index) -> (tensor<?xf32>)
 {
-  %0 = tensor.expand_shape %t [[0, 1]] : tensor<?xf32> into tensor<1x?xf32>
+  %0 = tensor.expand_shape %t [[0, 1]] output_shape [1, %sz0] : tensor<?xf32> into tensor<1x?xf32>
   %1 = tensor.collapse_shape %0 [[0, 1]] : tensor<1x?xf32> into tensor<?xf32>
   return %1 : tensor<?xf32>
 }
@@ -2431,6 +2425,15 @@ func.func @reshape_nofold_2d(%arg0 : tensor<?x?xi32>) -> tensor<?x?xi32> {
   return %reshape : tensor<?x?xi32>
 }
 
+// -----
+
+// CHECK-LABEL: @reshape_nofold_2d_ins
+func.func @reshape_nofold_2d_ins(%arg0 : tensor<?x?xi32>, %arg1: index, %arg2: index) -> tensor<?x?xi32> {
+  %ds = tensor.from_elements %arg1, %arg2 : tensor<2xindex>
+  // CHECK: tensor.reshape
+  %reshape = tensor.reshape %arg0(%ds) : (tensor<?x?xi32>, tensor<2xindex>) -> tensor<?x?xi32>
+  return %reshape : tensor<?x?xi32>
+}
 
 // -----
 
diff --git a/mlir/test/Dialect/Tensor/fold-empty-op.mlir b/mlir/test/Dialect/Tensor/fold-empty-op.mlir
index 15f841f2128e..e200a4f89261 100644
--- a/mlir/test/Dialect/Tensor/fold-empty-op.mlir
+++ b/mlir/test/Dialect/Tensor/fold-empty-op.mlir
@@ -13,10 +13,9 @@ module attributes {transform.with_named_sequence} {
 // CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 floordiv 28)>
 // CHECK: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 * 28)>
 
-func.func @empty_reshape_expansion(%arg0 : index) -> tensor<2x3x5x4x?x7xf32> {
+func.func @empty_reshape_expansion(%arg0 : index, %sz0: index) -> tensor<2x3x5x4x?x7xf32> {
   %0 = tensor.empty(%arg0) : tensor<6x5x?xf32>
-  %1 = tensor.expand_shape %0 [[0, 1], [2], [3, 4, 5]]
-      : tensor<6x5x?xf32> into tensor<2x3x5x4x?x7xf32>
+  %1 = tensor.expand_shape %0 [[0, 1], [2], [3, 4, 5]] output_shape [2, 3, 5, 4, %sz0, 7] : tensor<6x5x?xf32> into tensor<2x3x5x4x?x7xf32>
   return %1 : tensor<2x3x5x4x?x7xf32>
 }
 // CHECK-LABEL: func @empty_reshape_expansion
diff --git a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir
index aa860e33cf35..9f486f9146ad 100644
--- a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir
+++ b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir
@@ -137,7 +137,7 @@ func.func @tensor_pack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> t
 //      CHECK:   %[[INIT:.+]] = tensor.empty() : tensor<1x57x56x2x32xf32>
 //      CHECK:   %[[PACK:.+]] = tensor.pack %[[ARG0]]
 // CHECK-SAME:      outer_dims_perm = [2, 1, 0, 3]
-// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32] 
+// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32]
 // CHECK-SAME:       into %[[INIT]]
 //      CHECK:   return %[[PACK]]
 
@@ -163,7 +163,7 @@ func.func @tensor_pack_linalg_transpose_fold_with_padding(%arg0: tensor<56x57x1x
 //      CHECK:   %[[INIT:.+]] = tensor.empty() : tensor<1x57x56x2x32xf32>
 //      CHECK:   %[[PACK:.+]] = tensor.pack %[[ARG0]] padding_value(%[[PADDING]] : f32)
 // CHECK-SAME:      outer_dims_perm = [2, 1, 0, 3]
-// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32] 
+// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32]
 // CHECK-SAME:       into %[[INIT]]
 //      CHECK:   return %[[PACK]]
 
@@ -188,7 +188,7 @@ func.func @tensor_pack_linalg_transpose_fold_no_outer_dims_perm(%arg0: tensor<56
 //      CHECK:   %[[INIT:.+]] = tensor.empty() : tensor<1x2x56x57x32xf32>
 //      CHECK:   %[[PACK:.+]] = tensor.pack %[[ARG0]]
 // CHECK-SAME:      outer_dims_perm = [2, 3, 0, 1]
-// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32] 
+// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32]
 // CHECK-SAME:       into %[[INIT]]
 //      CHECK:   return %[[PACK]]
 
@@ -214,7 +214,7 @@ func.func @tensor_pack_linalg_transpose_fold_tile_dims_transpose(%arg0: tensor<5
 //      CHECK:   %[[INIT:.+]] = tensor.empty() : tensor<12x56x4x9x32x8x2xf32>
 //      CHECK:   %[[PACK:.+]] = tensor.pack %[[ARG0]]
 // CHECK-SAME:      outer_dims_perm = [2, 0, 3, 1]
-// CHECK-SAME:      inner_dims_pos = [3, 1, 2] inner_tiles = [32, 8, 2] 
+// CHECK-SAME:      inner_dims_pos = [3, 1, 2] inner_tiles = [32, 8, 2]
 // CHECK-SAME:       into %[[INIT]]
 //      CHECK:   return %[[PACK]]
 
@@ -256,7 +256,7 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims(%arg0: tensor<56
     outs(%1 : tensor<1x57x56x2x32xf32>)
     permutation = [2, 3, 0, 1, 4]
 
-  %return_value = tensor.cast %transposed : tensor<1x57x56x2x32xf32> to tensor<?x?x56x2x32xf32>  
+  %return_value = tensor.cast %transposed : tensor<1x57x56x2x32xf32> to tensor<?x?x56x2x32xf32>
   return %return_value : tensor<?x?x56x2x32xf32>
 }
 //      CHECK: func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims(
@@ -268,7 +268,7 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims(%arg0: tensor<56
 //      CHECK:   %[[INIT:.+]] = tensor.empty(%[[dim_0]], %[[dim]]) : tensor<?x?x56x2x32xf32>
 //      CHECK:   %[[PACK:.+]] = tensor.pack %[[ARG0]]
 // CHECK-SAME:      outer_dims_perm = [2, 1, 0, 3]
-// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32] 
+// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32]
 // CHECK-SAME:       into %[[INIT]]
 //      CHECK:   return %[[PACK]]
 
@@ -286,14 +286,14 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_and_tile_dims(%arg0:
     ins(%pack : tensor<56x9x12x4x8x2x32xf32>)
     outs(%1 : tensor<12x4x56x9x32x8x2xf32>)
     permutation = [2, 3, 0, 1, 6, 4, 5]
-  
-  %return_value = tensor.cast %transposed : tensor<12x4x56x9x32x8x2xf32> to tensor<?x?x56x9x32x8x2xf32> 
+
+  %return_value = tensor.cast %transposed : tensor<12x4x56x9x32x8x2xf32> to tensor<?x?x56x9x32x8x2xf32>
   return %return_value : tensor<?x?x56x9x32x8x2xf32>
 }
 //   CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)>
 //   CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)>
 // CHECK-LABEL:   func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_and_tile_dims(
-//  CHECK-SAME:   %[[ARG0:.+]]: tensor<56x?x?x128xf32>) 
+//  CHECK-SAME:   %[[ARG0:.+]]: tensor<56x?x?x128xf32>)
 //   CHECK-DAG:     %[[c1:.+]] = arith.constant 1 : index
 //   CHECK-DAG:     %[[c2:.+]] = arith.constant 2 : index
 //       CHECK:     %[[dim:.+]] = tensor.dim %[[ARG0]], %[[c1]] : tensor<56x?x?x128xf32>
@@ -328,7 +328,7 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_s
 // CHECK-SAME:   %[[ARG0:.+]]: tensor<?x?x?x?xf32>,
 // CHECK-SAME:   %[[PACK_DEST:.+]]: tensor<?x?x?x?x?x?x?xf32>, %[[TRANSPOSE_DEST:.+]]: tensor<?x?x?x?x?x?x?xf32>,
 // CHECK-SAME:   %[[ARG1:.+]]: index, %[[ARG2:.+]]: index,
-// CHECK-SAME:   %[[ARG3:.+]]: index) 
+// CHECK-SAME:   %[[ARG3:.+]]: index)
 //  CHECK-DAG:     %[[c0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:     %[[c1:.+]] = arith.constant 1 : index
 //  CHECK-DAG:     %[[c2:.+]] = arith.constant 2 : index
@@ -367,7 +367,7 @@ func.func @linalg_transpose_tensor_pack_fold(%arg0: tensor<56x57x1x64xf32>) -> t
 //      CHECK:   %[[INIT:.+]] = tensor.empty() : tensor<1x57x56x2x32xf32>
 //      CHECK:   %[[PACK:.+]] = tensor.pack %[[ARG0]]
 // CHECK-SAME:      outer_dims_perm = [2, 1, 0, 3]
-// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32] 
+// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32]
 // CHECK-SAME:       into %[[INIT]]
 //      CHECK:   return %[[PACK]]
 
@@ -379,7 +379,7 @@ func.func @linalg_transpose_tensor_pack_fold_with_padding(%arg0: tensor<56x57x1x
     ins(%arg0 : tensor<56x57x1x55xf32>)
     outs(%0 : tensor<1x56x57x55xf32>)
     permutation = [2, 0, 1, 3]
-  
+
   %1 = tensor.empty() : tensor<1x57x56x2x32xf32>
   %pack = tensor.pack %transpose padding_value(%padding : f32)
     outer_dims_perm = [0, 2, 1, 3]
@@ -393,7 +393,7 @@ func.func @linalg_transpose_tensor_pack_fold_with_padding(%arg0: tensor<56x57x1x
 //      CHECK:   %[[INIT:.+]] = tensor.empty() : tensor<1x57x56x2x32xf32>
 //      CHECK:   %[[PACK:.+]] = tensor.pack %[[ARG0]] padding_value(%[[PADDING]] : f32)
 // CHECK-SAME:      outer_dims_perm = [2, 1, 0, 3]
-// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32] 
+// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32]
 // CHECK-SAME:       into %[[INIT]]
 //      CHECK:   return %[[PACK]]
 
@@ -405,7 +405,7 @@ func.func @linalg_transpose_tensor_pack_fold_no_outer_dims_perm(%arg0: tensor<56
     ins(%arg0 : tensor<56x57x1x64xf32>)
     outs(%0 : tensor<1x56x57x64xf32>)
     permutation = [2, 0, 1, 3]
-  
+
   %1 = tensor.empty() : tensor<1x56x57x2x32xf32>
   %pack = tensor.pack %transposed
     inner_dims_pos = [3]
@@ -418,7 +418,7 @@ func.func @linalg_transpose_tensor_pack_fold_no_outer_dims_perm(%arg0: tensor<56
 //      CHECK:   %[[INIT:.+]] = tensor.empty() : tensor<1x56x57x2x32xf32>
 //      CHECK:   %[[PACK:.+]] = tensor.pack %[[ARG0]]
 // CHECK-SAME:      outer_dims_perm = [2, 0, 1, 3]
-// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32] 
+// CHECK-SAME:      inner_dims_pos = [3] inner_tiles = [32]
 // CHECK-SAME:       into %[[INIT]]
 //      CHECK:   return %[[PACK]]
 
@@ -429,7 +429,7 @@ func.func @linalg_transpose_tensor_pack_fold_complex_inner_dims_change(%arg0: te
     ins(%arg0 : tensor<25x30x35x40xf32>)
     outs(%transpose_dest : tensor<35x40x25x30xf32>)
     permutation = [2, 3, 0, 1]
-  
+
   %pack = tensor.pack %transposed
     outer_dims_perm = [3, 0, 2, 1]
     inner_dims_pos = [1, 3, 2]
@@ -438,15 +438,15 @@ func.func @linalg_transpose_tensor_pack_fold_complex_inner_dims_change(%arg0: te
   return %pack : tensor<3x35x5x8x5x10x5xf32>
 }
 //CHECK-LABEL:   func.func @linalg_transpose_tensor_pack_fold_complex_inner_dims_change(
-// CHECK-SAME:     %[[ARG0:.+]]: tensor<25x30x35x40xf32>, 
-// CHECK-SAME:     %[[ARG1:.+]]: tensor<35x40x25x30xf32>, 
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<25x30x35x40xf32>,
+// CHECK-SAME:     %[[ARG1:.+]]: tensor<35x40x25x30xf32>,
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<3x35x5x8x5x10x5xf32>) -> tensor<3x35x5x8x5x10x5xf32> {
 //      CHECK:     %[[VAL0:.+]] = tensor.empty() : tensor<3x35x5x8x5x10x5xf32>
-//      CHECK:     %[[PACK:.+]] = tensor.pack %[[ARG0]] 
-// CHECK-SAME:        outer_dims_perm = [1, 2, 0, 3] 
-// CHECK-SAME:        inner_dims_pos = [3, 1, 0] 
-// CHECK-SAME:        inner_tiles = [5, 10, 5] 
-// CHECK-SAME:         into %[[VAL0]] 
+//      CHECK:     %[[PACK:.+]] = tensor.pack %[[ARG0]]
+// CHECK-SAME:        outer_dims_perm = [1, 2, 0, 3]
+// CHECK-SAME:        inner_dims_pos = [3, 1, 0]
+// CHECK-SAME:        inner_tiles = [5, 10, 5]
+// CHECK-SAME:         into %[[VAL0]]
 //      CHECK:     return %[[PACK]]
 
 // -----
@@ -456,7 +456,7 @@ func.func @linalg_transpose_tensor_pack_fold_dynamic_outer_dims_tile_dims_tile_s
     ins(%arg0 : tensor<?x?x?x?xf32>)
     outs(%transpose_dest : tensor<?x?x?x?xf32>)
     permutation = [2, 3, 0, 1]
-  
+
   %pack = tensor.pack %transposed
     outer_dims_perm = [3, 0, 2, 1]
     inner_dims_pos = [1, 3, 2]
@@ -466,7 +466,7 @@ func.func @linalg_transpose_tensor_pack_fold_dynamic_outer_dims_tile_dims_tile_s
 }
 //      CHECK:   #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
 //CHECK-LABEL:   func.func @linalg_transpose_tensor_pack_fold_dynamic_outer_dims_tile_dims_tile_sizes(
-// CHECK-SAME:   %[[ARG0:.+]]: tensor<?x?x?x?xf32>, %[[ARG1:.+]]: tensor<?x?x?x?xf32>, 
+// CHECK-SAME:   %[[ARG0:.+]]: tensor<?x?x?x?xf32>, %[[ARG1:.+]]: tensor<?x?x?x?xf32>,
 // CHECK-SAME:   %[[ARG2:.+]]: tensor<?x?x?x?x?x?x?xf32>, %[[ARG3:.+]]: index, %[[ARG4:.+]]: index, %[[ARG5:.+]]: index) -> tensor<?x?x?x?x?x?x?xf32> {
 //      CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
 //      CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
@@ -491,17 +491,17 @@ func.func @linalg_transpose_tensor_pack_multiple_tiles(%arg0: tensor<?x32x128xbf
   %dim = tensor.dim %arg0, %c0 : tensor<?x32x128xbf16>
 
   %0 = tensor.empty(%dim) : tensor<32x128x?xbf16>
-  %transposed = linalg.transpose 
-    ins(%arg0 : tensor<?x32x128xbf16>) 
-    outs(%0 : tensor<32x128x?xbf16>) 
+  %transposed = linalg.transpose
+    ins(%arg0 : tensor<?x32x128xbf16>)
+    outs(%0 : tensor<32x128x?xbf16>)
     permutation = [1, 2, 0]
 
   %2 = tensor.empty(%dim) : tensor<32x?x64x16x2xbf16>
-  %pack = tensor.pack %transposed 
-    padding_value(%cst : bf16) 
-    outer_dims_perm = [0, 2, 1] 
-    inner_dims_pos = [2, 1] 
-    inner_tiles = [16, 2] 
+  %pack = tensor.pack %transposed
+    padding_value(%cst : bf16)
+    outer_dims_perm = [0, 2, 1]
+    inner_dims_pos = [2, 1]
+    inner_tiles = [16, 2]
     into %2 : tensor<32x128x?xbf16> -> tensor<32x?x64x16x2xbf16>
   return %pack : tensor<32x?x64x16x2xbf16>
 }
@@ -513,11 +513,11 @@ func.func @linalg_transpose_tensor_pack_multiple_tiles(%arg0: tensor<?x32x128xbf
 //      CHECK:   %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x32x128xbf16>
 //      CHECK:   %[[VAL0:.+]] = affine.apply #[[$MAP]]()[%[[DIM]]]
 //      CHECK:   %[[VAL1:.+]] = tensor.empty(%[[VAL0]]) : tensor<32x?x64x16x2xbf16>
-//      CHECK:   %[[PACK:.+]] = tensor.pack %[[ARG0]] 
-// CHECK-SAME:      padding_value(%[[CST]] : bf16) 
-// CHECK-SAME:      outer_dims_perm = [1, 0, 2] 
-// CHECK-SAME:      inner_dims_pos = [0, 2] 
-// CHECK-SAME:      inner_tiles = [16, 2] 
+//      CHECK:   %[[PACK:.+]] = tensor.pack %[[ARG0]]
+// CHECK-SAME:      padding_value(%[[CST]] : bf16)
+// CHECK-SAME:      outer_dims_perm = [1, 0, 2]
+// CHECK-SAME:      inner_dims_pos = [0, 2]
+// CHECK-SAME:      inner_tiles = [16, 2]
 // CHECK-SAME:      into %[[VAL1]] : tensor<?x32x128xbf16> -> tensor<32x?x64x16x2xbf16>
 //      CHECK:   return %[[PACK]] : tensor<32x?x64x16x2xbf16>
 //      CHECK:  }
@@ -526,24 +526,24 @@ func.func @linalg_transpose_tensor_pack_multiple_tiles(%arg0: tensor<?x32x128xbf
 
 func.func @linalg_transpose_tensor_unpack_fold(%arg0: tensor<1x1x4x16xi32>) -> tensor<16x4xi32> {
   %0 = tensor.empty() : tensor<1x1x16x4xi32>
-  %transposed = linalg.transpose ins(%arg0 : tensor<1x1x4x16xi32>) 
-                outs(%0 : tensor<1x1x16x4xi32>) 
+  %transposed = linalg.transpose ins(%arg0 : tensor<1x1x4x16xi32>)
+                outs(%0 : tensor<1x1x16x4xi32>)
                 permutation = [1, 0, 3, 2]
   %1 = tensor.empty() : tensor<16x4xi32>
-  %unpack = tensor.unpack %transposed 
-            outer_dims_perm = [0, 1] 
-            inner_dims_pos = [0, 1] 
-            inner_tiles = [16, 4] into 
+  %unpack = tensor.unpack %transposed
+            outer_dims_perm = [0, 1]
+            inner_dims_pos = [0, 1]
+            inner_tiles = [16, 4] into
             %1 : tensor<1x1x16x4xi32> -> tensor<16x4xi32>
   return %unpack : tensor<16x4xi32>
 }
 //CHECK-LABEL:  func.func @linalg_transpose_tensor_unpack_fold(
 // CHECK-SAME:   %[[ARG0:.+]]: tensor<1x1x4x16xi32>) -> tensor<16x4xi32> {
 //      CHECK:     %[[OUT:.+]] = tensor.empty() : tensor<16x4xi32>
-//      CHECK:     %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] 
-// CHECK-SAME:        outer_dims_perm = [1, 0] 
-// CHECK-SAME:        inner_dims_pos = [1, 0] 
-// CHECK-SAME:        inner_tiles = [4, 16] 
+//      CHECK:     %[[UNPACK:.+]] = tensor.unpack %[[ARG0]]
+// CHECK-SAME:        outer_dims_perm = [1, 0]
+// CHECK-SAME:        inner_dims_pos = [1, 0]
+// CHECK-SAME:        inner_tiles = [4, 16]
 // CHEKC-SAME:        into %[[OUT]] : tensor<1x1x4x16xi32> -> tensor<16x4xi32>
 //      CHECK:     return %[[UNPACK]] : tensor<16x4xi32>
 //      CHECK:   }
@@ -555,7 +555,7 @@ func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile
     ins(%arg0 : tensor<?x?x?x?xf32>)
     outs(%transpose_dest : tensor<?x?x?x?xf32>)
     permutation = [1, 0, 3, 2]
-  
+
   %unpack = tensor.unpack %transposed
     outer_dims_perm = [1, 0]
     inner_dims_pos = [0, 1]
@@ -565,7 +565,7 @@ func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile
 }
 //       CHECK:    #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 * s1)>
 // CHECK-LABEL:   func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile_sizes(
-//  CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?x?xf32>, %[[ARG1:.+]]: tensor<?x?x?x?xf32>, %[[ARG2:.+]]: tensor<?x?xf32>, 
+//  CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?x?xf32>, %[[ARG1:.+]]: tensor<?x?x?x?xf32>, %[[ARG2:.+]]: tensor<?x?xf32>,
 //  CHECK-SAME:     %[[IDX1:.+]]: index, %[[IDX2:.+]]: index) -> tensor<?x?xf32> {
 //   CHECK-DAG:       %[[CST1:.+]] = arith.constant 1 : index
 //   CHECK-DAG:       %[[CST0:.+]] = arith.constant 0 : index
@@ -574,10 +574,10 @@ func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile
 //   CHECK-DAG:       %[[AMAP0:.+]] = affine.apply #[[$MAP]]()[%[[DIM1]], %[[IDX2]]]
 //   CHECK-DAG:       %[[AMAP1:.+]] = affine.apply #[[$MAP]]()[%[[DIM0]], %[[IDX1]]]
 //       CHECK:       %[[OUT:.+]] = tensor.empty(%[[AMAP1]], %[[AMAP0]]) : tensor<?x?xf32>
-//       CHECK:       %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] 
-//  CHECK-SAME:         outer_dims_perm = [0, 1] 
-//  CHECK-SAME:         inner_dims_pos = [1, 0] 
-//  CHECK-SAME:         inner_tiles = [%[[IDX2]], %[[IDX1]]] 
+//       CHECK:       %[[UNPACK:.+]] = tensor.unpack %[[ARG0]]
+//  CHECK-SAME:         outer_dims_perm = [0, 1]
+//  CHECK-SAME:         inner_dims_pos = [1, 0]
+//  CHECK-SAME:         inner_tiles = [%[[IDX2]], %[[IDX1]]]
 //  CHECK-SAME:         into %[[OUT]] : tensor<?x?x?x?xf32> -> tensor<?x?xf32>
 //       CHECK:       return %[[UNPACK]] : tensor<?x?xf32>
 //       CHECK:   }
@@ -602,10 +602,37 @@ func.func @tensor_unpack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) ->
 // CHECK-LABEL:  func.func @tensor_unpack_linalg_transpose_fold(
 //  CHECK-SAME:    %[[ARG0:.+]]: tensor<56x57x1x64xf32>) -> tensor<3648x56xf32> {
 //       CHECK:        %[[OUT:.+]] = tensor.empty() : tensor<3648x56xf32>
-//       CHECK:        %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] 
-//  CHECK-SAME:        outer_dims_perm = [1, 0] 
-//  CHECK-SAME:        inner_dims_pos = [1, 0] 
-//  CHECK-SAME:        inner_tiles = [1, 64] 
+//       CHECK:        %[[UNPACK:.+]] = tensor.unpack %[[ARG0]]
+//  CHECK-SAME:        outer_dims_perm = [1, 0]
+//  CHECK-SAME:        inner_dims_pos = [1, 0]
+//  CHECK-SAME:        inner_tiles = [1, 64]
 //  CHECK-SAME:        into %[[OUT:.+]] : tensor<56x57x1x64xf32> -> tensor<3648x56xf32>
 //       CHECK:       return %[[UNPACK]] : tensor<3648x56xf32>
 //       CHECK:    }
+
+// -----
+
+func.func @tensor_padded_unpack_linalg_transpose_fold(%arg0: tensor<71x7x4x16x16xf32>) -> tensor<100x71x64xf32> {
+  %0 = tensor.empty() : tensor<71x100x64xf32>
+  %pack = tensor.unpack %arg0
+    inner_dims_pos = [1, 2]
+    inner_tiles = [16, 16]
+    into %0 : tensor<71x7x4x16x16xf32> -> tensor<71x100x64xf32>
+
+  %1 = tensor.empty() : tensor<100x71x64xf32>
+  %transposed = linalg.transpose
+    ins(%pack : tensor<71x100x64xf32>)
+    outs(%1 : tensor<100x71x64xf32>)
+    permutation = [1, 0, 2]
+  return %transposed : tensor<100x71x64xf32>
+}
+// CHECK-LABEL:  func.func @tensor_padded_unpack_linalg_transpose_fold(
+//  CHECK-SAME:    %[[ARG0:.+]]: tensor<71x7x4x16x16xf32>) -> tensor<100x71x64xf32> {
+//       CHECK:        %[[OUT:.+]] = tensor.empty() : tensor<100x71x64xf32>
+//       CHECK:        %[[UNPACK:.+]] = tensor.unpack %[[ARG0]]
+//  CHECK-SAME:        outer_dims_perm = [1, 0, 2]
+//  CHECK-SAME:        inner_dims_pos = [0, 2]
+//  CHECK-SAME:        inner_tiles = [16, 16]
+//  CHECK-SAME:        into %[[OUT:.+]] : tensor<71x7x4x16x16xf32> -> tensor<100x71x64xf32>
+//       CHECK:       return %[[UNPACK]] : tensor<100x71x64xf32>
+//       CHECK:    }
diff --git a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
index 625408dfefe2..d3ac6ce792f3 100644
--- a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
+++ b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
@@ -11,9 +11,11 @@ func.func @expand_shape_of_rank_reducing_extract(
 {
   %0 = tensor.extract_slice %t[0, 0, 0, 0][%idx, 1, 1, 5][1, 1, 1, 1]
       : tensor<?x?x?x?xf32> to tensor<?x1x5xf32>
-  %1 = tensor.expand_shape %0 [[0], [1, 2], [3]]
+  %c0 = arith.constant 0 : index
+  %sz0 = tensor.dim %0, %c0 : tensor<?x1x5xf32>
+  %1 = tensor.expand_shape %0 [[0], [1, 2], [3]] output_shape [%sz0, 1, 1, 5]
       : tensor<?x1x5xf32> into tensor<?x1x1x5xf32>
-  %2 = tensor.expand_shape %0 [[0, 1], [2], [3]]
+  %2 = tensor.expand_shape %0 [[0, 1], [2], [3]] output_shape [%sz0, 1, 1, 5]
       : tensor<?x1x5xf32> into tensor<?x1x1x5xf32>
   return %1, %2 : tensor<?x1x1x5xf32>, tensor<?x1x1x5xf32>
 }
diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir
index 79ca0de68a1e..41b6529f64af 100644
--- a/mlir/test/Dialect/Tensor/invalid.mlir
+++ b/mlir/test/Dialect/Tensor/invalid.mlir
@@ -273,21 +273,10 @@ func.func @insert_slice_wrong_dynamic_type(%t1: tensor<?x4x4xf32>, %t2: tensor<8
 
 // -----
 
-func.func @illegal_expanding_reshape_dynamic_tensor
-  (%arg0: tensor<?x?x?xf32>) -> tensor<?x?x?x4x?xf32> {
-  // expected-error @+1 {{invalid to have a single dimension (2) expanded into multiple dynamic dims (2,4)}}
-  %0 = tensor.expand_shape %arg0 [[0], [1], [2, 3, 4]]
-      : tensor<?x?x?xf32> into tensor<?x?x?x4x?xf32>
-  return %0 : tensor<?x?x?x4x?xf32>
-}
-
-// -----
-
-
 func.func @illegal_expanding_reshape_static_tensor
     (%arg0: tensor<2x3x20xf32>) -> tensor<2x3x2x4x5xf32> {
   // expected-error @+1 {{expected dimension 2 of collapsed type to be static value of 40}}
-  %0 = tensor.expand_shape %arg0 [[0], [1], [2, 3, 4]]
+  %0 = tensor.expand_shape %arg0 [[0], [1], [2, 3, 4]] output_shape [2, 3, 2, 4, 5]
       : tensor<2x3x20xf32> into tensor<2x3x2x4x5xf32>
   return %0 : tensor<2x3x2x4x5xf32>
 }
@@ -304,26 +293,35 @@ func.func @illegal_collapsing_reshape_static_tensor
 
 // -----
 
-func.func @illegal_expanding_reshape_mixed_tensor(%arg0 : tensor<?x?xf32>)
+func.func @illegal_expanding_reshape_mixed_tensor(%arg0 : tensor<?x?xf32>, %sz0: index)
     -> tensor<?x4x5xf32> {
   // expected-error @+1 {{expected dimension 1 of collapsed type to be static value of 5}}
-  %0 = tensor.expand_shape %arg0 [[0, 1], [2]]
+  %0 = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [%sz0, 4, 5]
       : tensor<?x?xf32> into tensor<?x4x5xf32>
   return %0 : tensor<?x4x5xf32>
 }
 
 // -----
 
-func.func @illegal_expanding_reshape_mixed_tensor_2(%arg0 : tensor<?x?xf32>)
+func.func @illegal_expanding_reshape_mixed_tensor_2(%arg0 : tensor<?x?xf32>, %sz0: index)
     -> tensor<?x4x5xf32> {
   // expected-error @+1 {{expected dimension 1 of collapsed type to be static value of 20}}
-  %0 = tensor.expand_shape %arg0 [[0], [1, 2]]
+  %0 = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [%sz0, 4, 5]
       : tensor<?x?xf32> into tensor<?x4x5xf32>
   return %0 : tensor<?x4x5xf32>
 }
 
 // -----
 
+func.func @expand_shape_illegal_output_shape(%arg0: tensor<2xf32>) {
+  // expected-error @+1 {{expected number of static shape dims to be equal to the output rank (3) but found 2 inputs instead}}
+  %0 = tensor.expand_shape %arg0 [[0, 1, 2]] output_shape [1, 2] : tensor<2xf32> into tensor<1x1x2xf32>
+  return
+}
+
+
+// -----
+
 func.func @illegal_collapsing_reshape_mixed_tensor(%arg0 : tensor<?x4x5xf32>) -> tensor<?x?xf32> {
   // expected-error @+1 {{expected dimension 1 of collapsed type to be static value of 5}}
   %0 = tensor.collapse_shape %arg0 [[0, 1], [2]]
diff --git a/mlir/test/Dialect/Tensor/ops.mlir b/mlir/test/Dialect/Tensor/ops.mlir
index 2b0a74acce08..378137a14b59 100644
--- a/mlir/test/Dialect/Tensor/ops.mlir
+++ b/mlir/test/Dialect/Tensor/ops.mlir
@@ -194,12 +194,26 @@ func.func @insert_slice(
 func.func @tensor_reshape_zero_dim(%arg0 : tensor<1x1xf32>, %arg1 : tensor<f32>)
     -> (tensor<f32>, tensor<1x1xf32>) {
   %0 = tensor.collapse_shape %arg0 [] : tensor<1x1xf32> into tensor<f32>
-  %1 = tensor.expand_shape %0 [] : tensor<f32> into tensor<1x1xf32>
+  %1 = tensor.expand_shape %0 [] output_shape [1, 1] : tensor<f32> into tensor<1x1xf32>
   return %0, %1 : tensor<f32>, tensor<1x1xf32>
 }
 // CHECK-LABEL: func @tensor_reshape_zero_dim
 //       CHECK:   tensor.collapse_shape %{{.*}} [] : tensor<1x1xf32> into tensor<f32>
-//       CHECK:   tensor.expand_shape %{{.*}} [] : tensor<f32> into tensor<1x1xf32>
+//       CHECK:   tensor.expand_shape %{{.*}} [] output_shape [1, 1] : tensor<f32> into tensor<1x1xf32>
+
+// -----
+
+func.func @tensor_expand_shape_dynamic_dim(%arg0 : tensor<?x?xf32>, %sz0 : index, %sz1 : index, %sz2 : index)
+    -> (tensor<5x?x?x?xf32>) {
+  %1 = tensor.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [5, %sz0, %sz1, %sz2] : tensor<?x?xf32> into tensor<5x?x?x?xf32>
+  return %1 : tensor<5x?x?x?xf32>
+}
+
+// CHECK-LABEL:  func.func @tensor_expand_shape_dynamic_dim(%arg0: tensor<?x?xf32>, %arg1: index, %arg2: index, %arg3: index) -> tensor<5x?x?x?xf32> {
+//       CHECK:    %expanded = tensor.expand_shape %arg0 {{\[\[}}0, 1], [2, 3{{\]\]}} output_shape [5, %arg1, %arg2, %arg3] : tensor<?x?xf32> into tensor<5x?x?x?xf32>
+//       CHECK:    return %expanded : tensor<5x?x?x?xf32>
+//       CHECK:  }
+
 
 // -----
 
diff --git a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir b/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir
index 9948c0246e6e..5a2eade0eccc 100644
--- a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir
+++ b/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir
@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: func.func @single_dim_packing(
 // CHECK-SAME:    %[[ARG0:.+]]: tensor<256xf32>)
-// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1]] : tensor<256xf32> into tensor<8x32xf32>
+// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1]] output_shape [8, 32] : tensor<256xf32> into tensor<8x32xf32>
 // CHECK:         return %[[EXPANDED]] : tensor<8x32xf32>
 func.func @single_dim_packing(%arg0: tensor<256xf32>) -> tensor<8x32xf32> {
   %empty = tensor.empty() : tensor<8x32xf32>
@@ -27,7 +27,7 @@ func.func @single_dim_packing_with_padding(%arg0: tensor<255xf32>) -> tensor<8x3
 
 // CHECK-LABEL: func.func @single_last_inner_dim_packing(
 // CHECK-SAME:    %[[ARG0:.+]]: tensor<5x256xf32>)
-// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2]] : tensor<5x256xf32> into tensor<5x8x32xf32>
+// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2]] output_shape [5, 8, 32] : tensor<5x256xf32> into tensor<5x8x32xf32>
 // CHECK:         return %[[EXPANDED]] : tensor<5x8x32xf32>
 func.func @single_last_inner_dim_packing(%arg0: tensor<5x256xf32>) -> tensor<5x8x32xf32> {
   %empty = tensor.empty() : tensor<5x8x32xf32>
@@ -39,7 +39,7 @@ func.func @single_last_inner_dim_packing(%arg0: tensor<5x256xf32>) -> tensor<5x8
 
 // CHECK-LABEL: func.func @pack_1d_with_outer_dims_perm(
 // CHECK-SAME:    %[[ARG0:.+]]: tensor<64xf32>)
-// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1]] : tensor<64xf32> into tensor<2x32xf32>
+// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1]] output_shape [2, 32] : tensor<64xf32> into tensor<2x32xf32>
 // CHECK:         return %[[EXPANDED]] : tensor<2x32xf32>
 func.func @pack_1d_with_outer_dims_perm(%arg0: tensor<64xf32>) -> tensor<2x32xf32> {
   %empty = tensor.empty() :  tensor<2x32xf32>
@@ -51,7 +51,7 @@ func.func @pack_1d_with_outer_dims_perm(%arg0: tensor<64xf32>) -> tensor<2x32xf3
 
 // CHECK-LABEL: func.func @single_last_inner_dim_packing_with_identity_outer_dims_perm(
 // CHECK-SAME:    %[[ARG0:.+]]: tensor<5x256xf32>)
-// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2]] : tensor<5x256xf32> into tensor<5x8x32xf32>
+// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2]] output_shape [5, 8, 32] : tensor<5x256xf32> into tensor<5x8x32xf32>
 // CHECK:         return %[[EXPANDED]] : tensor<5x8x32xf32>
 func.func @single_last_inner_dim_packing_with_identity_outer_dims_perm(%arg0: tensor<5x256xf32>) -> tensor<5x8x32xf32> {
   %empty = tensor.empty() : tensor<5x8x32xf32>
@@ -85,7 +85,7 @@ func.func @single_first_inner_dim_packing(%arg0: tensor<256x5xf32>) -> tensor<8x
 
 // CHECK-LABEL: func.func @pack_1x32_to_1x32x1x1
 // CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]+]]
-// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2, 3]]
+// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2, 3]] output_shape [1, 32, 1, 1]
 // CHECK:         return %[[EXPANDED]]
 func.func @pack_1x32_to_1x32x1x1(%arg0 : tensor<1x32xf32>) -> tensor<1x32x1x1xf32> {
   %empty = tensor.empty() : tensor<1x32x1x1xf32>
@@ -98,7 +98,7 @@ func.func @pack_1x32_to_1x32x1x1(%arg0 : tensor<1x32xf32>) -> tensor<1x32x1x1xf3
 
 // CHECK-LABEL: func.func @pack_1x32_to_1x16x1x2
 // CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]+]]
-// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2, 3]]
+// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2, 3]] output_shape [1, 16, 1, 2]
 // CHECK:         return %[[EXPANDED]]
 func.func @pack_1x32_to_1x16x1x2(%arg0 : tensor<1x32xf32>) -> tensor<1x16x1x2xf32> {
   %empty = tensor.empty() : tensor<1x16x1x2xf32>
@@ -111,7 +111,7 @@ func.func @pack_1x32_to_1x16x1x2(%arg0 : tensor<1x32xf32>) -> tensor<1x16x1x2xf3
 
 // CHECK-LABEL: func.func @pack_32x1_to_16x1x2x1
 // CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]+]]
-// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]]
+// CHECK:         %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]] output_shape [1, 16, 2, 1]
 // CHECK:         return %[[EXPANDED]]
 func.func @pack_32x1_to_16x1x2x1(%arg0 : tensor<32x1xf32>) -> tensor<1x16x2x1xf32> {
   %empty = tensor.empty() : tensor<1x16x2x1xf32>
diff --git a/mlir/test/Dialect/Tosa/constant-op-fold.mlir b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
index c9c60a94bf9e..ad82c9f8858e 100644
--- a/mlir/test/Dialect/Tosa/constant-op-fold.mlir
+++ b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
@@ -213,7 +213,7 @@ func.func @fold_add_splat_f32() -> tensor<10xf32> {
 func.func @fold_div_zero_lhs_i32(%arg0: tensor<i32>) -> tensor<i32> {
   %zero = "tosa.const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
   // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0>
-  %div = tosa.div %zero, %arg0 : (tensor<i32>, tensor<i32>) -> tensor<i32>
+  %div = tosa.int_div %zero, %arg0 : (tensor<i32>, tensor<i32>) -> tensor<i32>
   // CHECK: return %[[ZERO]]
   return %div : tensor<i32>
 }
@@ -223,7 +223,7 @@ func.func @fold_div_zero_lhs_i32(%arg0: tensor<i32>) -> tensor<i32> {
 // CHECK-LABEL: @fold_div_one_rhs_i32
 func.func @fold_div_one_rhs_i32(%arg0: tensor<i32>) -> tensor<i32> {
   %one = "tosa.const"() {value = dense<1> : tensor<i32>} : () -> tensor<i32>
-  %div = tosa.div %arg0, %one : (tensor<i32>, tensor<i32>) -> tensor<i32>
+  %div = tosa.int_div %arg0, %one : (tensor<i32>, tensor<i32>) -> tensor<i32>
   // CHECK: return %arg0
   return %div : tensor<i32>
 }
@@ -235,7 +235,7 @@ func.func @fold_div_splat_i32() -> tensor<i32> {
   %lhs = "tosa.const"() {value = dense<10> : tensor<i32>} : () -> tensor<i32>
   %rhs = "tosa.const"() {value = dense<-3> : tensor<i32>} : () -> tensor<i32>
   // CHECK: %[[SPLAT:.+]] = "tosa.const"() <{value = dense<-3>
-  %div = tosa.div %lhs, %rhs : (tensor<i32>, tensor<i32>) -> tensor<i32>
+  %div = tosa.int_div %lhs, %rhs : (tensor<i32>, tensor<i32>) -> tensor<i32>
   // CHECK: return %[[SPLAT]]
   return %div : tensor<i32>
 }
@@ -1092,7 +1092,7 @@ func.func @reduce_sum_constant_aggressive() -> tensor<1x3xi32> {
   // AGGRESIVE-LABEL: func.func @reduce_sum_constant_aggressive() -> tensor<1x3xi32> {
   // AGGRESIVE:       %[[VAL_0:.*]] = "tosa.const"() <{value = dense<4> : tensor<1x3xi32>}> : () -> tensor<1x3xi32>
   // AGGRESIVE:       return %[[VAL_0:.*]] : tensor<1x3xi32>
-  
+
   // CHECK-LABEL:     func.func @reduce_sum_constant_aggressive() -> tensor<1x3xi32> {
   // CHECK:           %[[VAL_0:.*]] = "tosa.const"() <{value = dense<1> : tensor<2x3xi32>}> : () -> tensor<2x3xi32>
   // CHECK:           %[[VAL_1:.*]] = tosa.reduce_sum %[[VAL_0]] {axis = 0 : i32} : (tensor<2x3xi32>) -> tensor<1x3xi32>
diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir
index 01b27072a4b6..525ee917ccd9 100644
--- a/mlir/test/Dialect/Tosa/ops.mlir
+++ b/mlir/test/Dialect/Tosa/ops.mlir
@@ -257,9 +257,9 @@ func.func @test_bitwise_xor(%arg0: tensor<13x21x1xi32>, %arg1: tensor<13x21x3xi3
 }
 
 // -----
-// CHECK-LABEL: div
-func.func @test_div(%arg0: tensor<13x21x1xi32>, %arg1: tensor<13x21x3xi32>) -> tensor<13x21x3xi32> {
-  %0 = tosa.div %arg0, %arg1 : (tensor<13x21x1xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32>
+// CHECK-LABEL: int_div
+func.func @test_int_div(%arg0: tensor<13x21x1xi32>, %arg1: tensor<13x21x3xi32>) -> tensor<13x21x3xi32> {
+  %0 = tosa.int_div %arg0, %arg1 : (tensor<13x21x1xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32>
   return %0 : tensor<13x21x3xi32>
 }
 
diff --git a/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir b/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
index 22808aa7d6ac..f70d23a19322 100644
--- a/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
+++ b/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
@@ -281,6 +281,23 @@ func.func private @scalable_dims(%A : vector<8x[4]x2xf32>, %B: vector<8x[4]xf32>
 // CHECK:           %[[VAL_163:.*]] = vector.shape_cast %[[VAL_162]] : vector<[32]xf32> to vector<8x[4]xf32>
 // CHECK:           return %[[VAL_163]] : vector<8x[4]xf32>
 
+// Check that OneDimMultiReductionToTwoDim handles scalable dim
+func.func @scalable_dim_1d(%A: vector<[4]xf32>, %B: f32, %C: vector<[4]xi1>) -> f32 {
+    %0 = vector.mask %C { vector.multi_reduction <add>, %A, %B [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
+    return %0 : f32
+}
+
+// CHECK-LABEL:  func.func @scalable_dim_1d(
+// CHECK-SAME:                                      %[[ARG_0:.*]]: vector<[4]xf32>,
+// CHECK-SAME:                                      %[[ARG_1:.*]]: f32,
+// CHECK-SAME:                                      %[[ARG_2:.*]]: vector<[4]xi1>) -> f32 {
+// CHECK-DAG:      %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK-DAG:      %[[VAL_1:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
+// CHECK:          %[[VAL_2:.*]] = vector.mask %[[ARG_2]] { vector.reduction <add>, %[[ARG_0]], %[[ARG_1]] : vector<[4]xf32> into f32 } : vector<[4]xi1> -> f32
+// CHECK:          %[[VAL_3:.*]] = vector.insertelement %[[VAL_2]], %[[VAL_1]][%[[VAL_0]] : index] : vector<1xf32>
+// CHECK:          %[[VAL_4:.*]] = vector.extract %[[VAL_3]][0] : f32 from vector<1xf32>
+// CHECK:          return %[[VAL_4]] : f32
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
     %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
diff --git a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
index 8f0148119806..614b2d494534 100644
--- a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
+++ b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
@@ -324,6 +324,47 @@ func.func @i7_transpose(%a: vector<8x16xi7>) -> vector<16x8xi7> {
   return %0 : vector<16x8xi7>
 }
 
+// CHECK-LABEL: func.func @aligned_extui(
+func.func @aligned_extui(%a: vector<8xi4>) -> vector<8xi32> {
+// CHECK-SAME:                             %[[IN:.*]]: vector<8xi4>) -> vector<8xi32> {
+// CHECK:           %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8>
+// CHECK:           %[[LOWBITS_MASK:.*]] = arith.constant dense<15> : vector<4xi8>
+// CHECK:           %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8>
+// CHECK:           %[[LOW:.*]] = arith.andi %[[BITCAST]], %[[LOWBITS_MASK]] : vector<4xi8>
+// CHECK:           %[[HIGH:.*]] = arith.shrui %[[BITCAST]], %[[I4_BITS]] : vector<4xi8>
+// CHECK:           %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<4xi8>
+// CHECK:           %[[I32:.*]] = arith.extui %[[INTERLEAVE]] : vector<8xi8> to vector<8xi32>
+  %0 = arith.extui %a : vector<8xi4> to vector<8xi32>
+  return %0 : vector<8xi32>
+}
+
+// CHECK-LABEL: func.func @aligned_extui_2d(
+func.func @aligned_extui_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> {
+// CHECK-SAME:                                %[[VAL_0:.*]]: vector<8x32xi4>) -> vector<8x32xi32> {
+// CHECK:           %[[I4_BITS:.*]] = arith.constant dense<4> : vector<8x16xi8>
+// CHECK:           %[[LOWBITS_MASK:.*]] = arith.constant dense<15> : vector<8x16xi8>
+// CHECK:           %[[BITCAST:.*]] = vector.bitcast %[[VAL_0]] : vector<8x32xi4> to vector<8x16xi8>
+// CHECK:           %[[LOW:.*]] = arith.andi %[[BITCAST]], %[[LOWBITS_MASK]] : vector<8x16xi8>
+// CHECK:           %[[HIGH:.*]] = arith.shrui %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8>
+// CHECK:           %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<8x16xi8>
+// CHECK:           %[[I32:.*]] = arith.extui %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xi32>
+  %0 = arith.extui %a : vector<8x32xi4> to vector<8x32xi32>
+  return %0 : vector<8x32xi32>
+}
+
+// CHECK-LABEL: func.func @aligned_extui_base_case(
+func.func @aligned_extui_base_case(%a: vector<8xi4>) -> vector<8xi8> {
+// CHECK-SAME:                             %[[IN:.*]]: vector<8xi4>) -> vector<8xi8> {
+// CHECK:           %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8>
+// CHECK:           %[[LOWBITS_MASK:.*]] = arith.constant dense<15> : vector<4xi8>
+// CHECK:           %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8>
+// CHECK:           %[[LOW:.*]] = arith.andi %[[BITCAST]], %[[LOWBITS_MASK]] : vector<4xi8>
+// CHECK:           %[[HIGH:.*]] = arith.shrui %[[BITCAST]], %[[I4_BITS]] : vector<4xi8>
+// CHECK:           %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<4xi8>
+  %0 = arith.extui %a : vector<8xi4> to vector<8xi8>
+  return %0 : vector<8xi8>
+}
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
     %f = transform.structured.match ops{["func.func"]} in %module_op
@@ -335,4 +376,3 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-
diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
index f0945c79a94a..00d32d2a2ee9 100644
--- a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
+++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
@@ -80,7 +80,7 @@ gpu.func @test_prefetch_vc(%src: ui64) {
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
   %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64  -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
   // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
-  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>> 
+  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
   gpu.return
 }
 
@@ -121,4 +121,59 @@ gpu.func @test_create_update_tdesc_vc(%src: ui64) {
   gpu.return
 }
 
-}
-\ No newline at end of file
+// CHECK: gpu.func @test_dpas_vc(%[[arg0:.*]]: vector<8x8x2xf16>, %[[arg1:.*]]: vector<8x16x2xf16>)
+gpu.func @test_dpas_vc(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) {
+  // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
+  %1 = xegpu.dpas %a, %b: vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
+gpu.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>
+  %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]: ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>
+  //CHECK: %[[R1:.*]] = xegpu.atomic_rmw addf %[[R0]], %[[arg2]], %[[arg1]] : <16xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
+  xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
+  gpu.return
+}
+
+// CHECK: gpu.func @alloc_nbarrier({{.*}}) {
+gpu.func @alloc_nbarrier() {
+  // CHECK: xegpu.alloc_nbarrier
+  xegpu.alloc_nbarrier 8
+  gpu.return
+}
+
+// CHECK: gpu.func @init_nbarrier({{.*}}) {
+gpu.func @init_nbarrier() {
+  //CHECK: %[[c1:.*]] = arith.constant 1 : i8
+  //CHECK: %[[c16:.*]] = arith.constant 16 : i8
+  %nbarrier_id = arith.constant 1 : i8
+  %threads_count = arith.constant 16 : i8
+  //CHECK: xegpu.init_nbarrier %[[c1]], %[[c16]] : i8, i8 -> !xegpu.nbarrier
+  %nbarrier = xegpu.init_nbarrier %nbarrier_id, %threads_count : i8, i8 -> !xegpu.nbarrier
+  gpu.return
+}
+
+// CHECK: gpu.func @nbarrier_arrive(%[[arg0:.*]]: !xegpu.nbarrier) {
+gpu.func @nbarrier_arrive(%nbarrier : !xegpu.nbarrier) {
+  //CHECK: xegpu.nbarrier_arrive %[[arg0]] : !xegpu.nbarrier
+  xegpu.nbarrier_arrive %nbarrier : !xegpu.nbarrier
+  gpu.return
+}
+
+// CHECK: gpu.func @nbarrier_wait(%[[arg0:.*]]: !xegpu.nbarrier) {
+gpu.func @nbarrier_wait(%nbarrier : !xegpu.nbarrier) {
+  //CHECK: xegpu.nbarrier_wait %[[arg0]] : !xegpu.nbarrier
+  xegpu.nbarrier_wait %nbarrier : !xegpu.nbarrier
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @fence({{.*}}) {
+gpu.func @fence() {
+  //CHECK: xegpu.fence memory_kind = global, fence_scope = workgroup
+  xegpu.fence memory_kind = global, fence_scope = workgroup
+  gpu.return
+}
+
+}
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 5e29361ec690..7819ad60b97d 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -156,4 +156,32 @@ func.func @test_store_scatter_vc_2(%src: ui64) {
   xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint<streaming>}> : vector<4x2xf32>,
           !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>, vector<4xi1>
   return
+}
+
+// -----
+func.func @test_dpas_vc_1(%a : vector<8x4x2xf16>, %b: vector<8x16x2xf16>) {
+  // expected-error@+1 {{K-dimension or vnni-factor mismatch}}
+  %1 = xegpu.dpas %a, %b : vector<8x4x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
+  return
+}
+
+// -----
+func.func @test_dpas_vc_2(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>) {
+  // expected-error@+1 {{lhs and rhs rank does not match for dpas op, or their rank is not 3}}
+  %1 = xegpu.dpas %a, %b : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
+  return
+}
+
+// -----
+func.func @test_dpas_vc_3(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
+  // expected-error@+1 {{lhs and rhs rank does not match for dpas op, or their rank is not 3}}
+  %1 = xegpu.dpas %a, %b : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+  return
+}
+
+// -----
+func.func @test_dpas_vc_4(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>, %c : vector<8x16xf16>) {
+  // expected-error@+1 {{Accumulator and Result for dpas op should have the same type}}
+  %1 = xegpu.dpas %a, %b, %c : vector<8x8x2xf16>, vector<8x16x2xf16>, vector<8x16xf16> -> vector<8x16xf32>
+  return
 }
 \ No newline at end of file
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir
new file mode 100644
index 000000000000..b05ef9422e59
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir
@@ -0,0 +1,298 @@
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN: -one-shot-bufferize="bufferize-function-boundaries" \
+// RUN: -convert-linalg-to-loops \
+// RUN: -expand-strided-metadata \
+// RUN: -lower-affine \
+// RUN: -convert-scf-to-cf \
+// RUN: -test-cf-assert \
+// RUN: -convert-index-to-llvm \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils \
+// RUN:     -shared-libs=%mlir_c_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
+func.func @main() {
+  %c5x = arith.constant dense<0.0> : tensor<5xf32>
+  %c4x = arith.constant dense<0.0> : tensor<4xf32>
+  %d5x = tensor.cast %c5x : tensor<5xf32> to tensor<?xf32>
+  %d4x = tensor.cast %c4x : tensor<4xf32> to tensor<?xf32>
+
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @simple_add(%d5x, %d5x) : (tensor<?xf32>, tensor<?xf32>) -> (tensor<?xf32>)
+
+  // CHECK: ERROR: Runtime op verification failed
+  // CHECK: linalg.generic
+  // CHECK: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
+  func.call @simple_add(%d5x, %d4x) : (tensor<?xf32>, tensor<?xf32>) -> (tensor<?xf32>)
+
+  // CHECK: ERROR: Runtime op verification failed
+  // CHECK: linalg.generic
+  // CHECK: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
+  func.call @simple_add(%d4x, %d5x) : (tensor<?xf32>, tensor<?xf32>) -> (tensor<?xf32>)
+
+  %c1x1 = arith.constant dense<0.0> : tensor<1x1xf32>
+  %c1x4 = arith.constant dense<0.0> : tensor<1x4xf32>
+  %c4x4 = arith.constant dense<0.0> : tensor<4x4xf32>
+  %c4x5 = arith.constant dense<0.0> : tensor<4x5xf32>
+  %c5x4 = arith.constant dense<0.0> : tensor<5x4xf32>
+  %d1x1 = tensor.cast %c1x1 : tensor<1x1xf32> to tensor<?x?xf32>
+  %d1x4 = tensor.cast %c1x4 : tensor<1x4xf32> to tensor<?x?xf32>
+  %d4x4 = tensor.cast %c4x4 : tensor<4x4xf32> to tensor<?x?xf32>
+  %d4x5 = tensor.cast %c4x5 : tensor<4x5xf32> to tensor<?x?xf32>
+  %d5x4 = tensor.cast %c5x4 : tensor<5x4xf32> to tensor<?x?xf32>
+
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @broadcast_add(%d1x1, %d1x1) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
+
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @broadcast_add(%d1x1, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
+
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @broadcast_add(%d4x4, %d1x4) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
+
+  // CHECK: ERROR: Runtime op verification failed
+  // CHECK: linalg.generic
+  // CHECK: ^ dimension #1 of input/output operand #1 is incompatible with inferred dimension size
+  func.call @broadcast_add(%d1x4, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
+
+  // CHECK: ERROR: Runtime op verification failed
+  // CHECK: linalg.generic
+  // CHECK: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
+  // CHECK: ERROR: Runtime op verification failed
+  // CHECK: linalg.generic
+  // CHECK: ^ dimension #1 of input/output operand #1 is incompatible with inferred dimension size
+  // CHECK: ERROR: Runtime op verification failed
+  // CHECK: linalg.generic
+  // CHECK: ^ dimension #1 of input/output operand #2 is incompatible with inferred dimension size
+  func.call @broadcast_add(%d5x4, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
+
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @matmul_generic(%d5x4, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
+
+  // CHECK: ERROR: Runtime op verification failed
+  // CHECK: linalg.generic
+  // CHECK: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
+  func.call @matmul_generic(%d4x5, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
+
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @matmul_named(%d5x4, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
+
+  // CHECK: ERROR: Runtime op verification failed
+  // CHECK: linalg.matmul
+  // CHECK: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
+  func.call @matmul_named(%d4x5, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
+
+  %c64x57 = arith.constant dense<0.0> : tensor<16x29xf32>
+  %c3x4 = arith.constant dense<0.0> : tensor<3x4xf32>
+
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @conv(%c64x57, %c3x4) : (tensor<16x29xf32>, tensor<3x4xf32>) -> (tensor<5x7xf32>)
+
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @reverse_from_3(%d4x) : (tensor<?xf32>) -> (tensor<?xf32>)
+
+  // CHECK: ERROR: Runtime op verification failed
+  // CHECK: linalg.generic
+  // CHECK: unexpected negative result on dimension #0 of input/output operand #0
+  func.call @reverse_from_3(%d5x) : (tensor<?xf32>) -> (tensor<?xf32>)
+
+  return
+}
+
+
+#identity1D = affine_map<(d0) -> (d0)>
+
+func.func @simple_add(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> (tensor<?xf32>) {
+    %c0 = arith.constant 0 : index
+    %dim = tensor.dim %arg0, %c0 : tensor<?xf32>
+    %result = tensor.empty(%dim) : tensor<?xf32> 
+    %0 = linalg.generic {
+      indexing_maps = [#identity1D, #identity1D, #identity1D],
+      iterator_types = ["parallel"]
+    } ins(%arg0, %arg1 : tensor<?xf32>, tensor<?xf32>)
+      outs(%result : tensor<?xf32>) {
+      ^bb0(%gen_arg1: f32, %gen_arg2: f32, %out: f32) :
+        %tmp1 = arith.addf %gen_arg1, %gen_arg2 : f32
+        linalg.yield %tmp1 : f32
+    } -> tensor<?xf32>
+    return %0 : tensor<?xf32>
+}
+
+#broadcastD0 = affine_map<(d0, d1) -> (0, d1)>
+#broadcastD1 = affine_map<(d0, d1) -> (d0, 0)>
+#identity2D = affine_map<(d0, d1) -> (d0, d1)>
+
+func.func @broadcast_add(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
+ // Calculate maximum dimension 0
+  %c0 = arith.constant 0 : index
+  %dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
+  %dim_0 = tensor.dim %arg1, %c0 : tensor<?x?xf32>
+  %0 = arith.maxui %dim, %dim_0 : index
+
+  // Calculate maximum dimension 1
+  %c1 = arith.constant 1 : index
+  %dim_1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
+  %dim_2 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
+  %1 = arith.maxui %dim_1, %dim_2 : index
+
+  // Broadcast dimension 0 of %arg0
+  %dim_3 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
+  %2 = arith.cmpi eq, %dim_3, %c1 : index
+  %3 = scf.if %2 -> (tensor<?x?xf32>) {
+    %dim_7 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
+    %12 = tensor.empty(%0, %dim_7) : tensor<?x?xf32>
+    %13 = linalg.generic {
+      indexing_maps = [#broadcastD0, #identity2D],
+      iterator_types = ["parallel", "parallel"]
+    } ins(%arg0 : tensor<?x?xf32>) outs(%12 : tensor<?x?xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<?x?xf32>
+    scf.yield %13 : tensor<?x?xf32>
+  } else {
+    scf.yield %arg0 : tensor<?x?xf32>
+  }
+
+  // Broadcast dimension 1 of %arg0
+  %dim_4 = tensor.dim %3, %c1 : tensor<?x?xf32>
+  %4 = arith.cmpi eq, %dim_4, %c1 : index
+  %5 = scf.if %4 -> (tensor<?x?xf32>) {
+    %dim_7 = tensor.dim %3, %c0 : tensor<?x?xf32>
+    %12 = tensor.empty(%dim_7, %1) : tensor<?x?xf32>
+    %13 = linalg.generic {
+      indexing_maps = [#broadcastD1, #identity2D],
+      iterator_types = ["parallel", "parallel"]
+    } ins(%3 : tensor<?x?xf32>) outs(%12 : tensor<?x?xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<?x?xf32>
+    scf.yield %13 : tensor<?x?xf32>
+  } else {
+    scf.yield %3 : tensor<?x?xf32>
+  }
+
+  // Broadcast dimension 0 of %arg1
+  %dim_5 = tensor.dim %arg1, %c0 : tensor<?x?xf32>
+  %6 = arith.cmpi eq, %dim_5, %c1 : index
+  %7 = scf.if %6 -> (tensor<?x?xf32>) {
+    %dim_7 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
+    %12 = tensor.empty(%0, %dim_7) : tensor<?x?xf32>
+    %13 = linalg.generic {
+      indexing_maps = [#broadcastD0, #identity2D],
+      iterator_types = ["parallel", "parallel"]
+    } ins(%arg1 : tensor<?x?xf32>) outs(%12 : tensor<?x?xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<?x?xf32>
+    scf.yield %13 : tensor<?x?xf32>
+  } else {
+    scf.yield %arg1 : tensor<?x?xf32>
+  }
+
+  // Broadcast dimension 1 of %arg1
+  %dim_6 = tensor.dim %7, %c1 : tensor<?x?xf32>
+  %8 = arith.cmpi eq, %dim_6, %c1 : index
+  %9 = scf.if %8 -> (tensor<?x?xf32>) {
+    %dim_7 = tensor.dim %7, %c0 : tensor<?x?xf32>
+    %12 = tensor.empty(%dim_7, %1) : tensor<?x?xf32>
+    %13 = linalg.generic {
+      indexing_maps = [#broadcastD1, #identity2D],
+      iterator_types = ["parallel", "parallel"]
+    } ins(%7 : tensor<?x?xf32>) outs(%12 : tensor<?x?xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      linalg.yield %in : f32
+    } -> tensor<?x?xf32>
+    scf.yield %13 : tensor<?x?xf32>
+  } else {
+    scf.yield %7 : tensor<?x?xf32>
+  }
+
+  // Perform element-wise computation
+  %10 = tensor.empty(%0, %1) : tensor<?x?xf32>
+  %11 = linalg.generic {
+    indexing_maps = [#identity2D, #identity2D, #identity2D],
+    iterator_types = ["parallel", "parallel"]
+  } ins(%5, %9 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_7: f32, %out: f32):
+    %12 = arith.addf %in, %in_7 : f32
+    linalg.yield %12 : f32
+  } -> tensor<?x?xf32>
+  return %11 : tensor<?x?xf32>
+}
+
+#matmul_accesses = [
+  affine_map<(m, n, k) -> (m, k)>,
+  affine_map<(m, n, k) -> (k, n)>,
+  affine_map<(m, n, k) -> (m, n)>
+]
+#matmul_trait = {
+  iterator_types = ["parallel", "parallel", "reduction"],
+  indexing_maps = #matmul_accesses
+}
+
+func.func @matmul_generic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %cf0 = arith.constant 0.0 : f32 
+  %ci0 = arith.constant 0 : index 
+  %ci1 = arith.constant 1 : index 
+  %d0 = tensor.dim %arg0, %ci0 : tensor<?x?xf32>
+  %d1 = tensor.dim %arg1, %ci1 : tensor<?x?xf32>
+  %splat = tensor.splat %cf0[%d0, %d1] : tensor<?x?xf32>
+  %0 = linalg.generic #matmul_trait ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%splat : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+func.func @matmul_named(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %cf0 = arith.constant 0.0 : f32 
+  %ci0 = arith.constant 0 : index 
+  %ci1 = arith.constant 1 : index 
+  %d0 = tensor.dim %arg0, %ci0 : tensor<?x?xf32>
+  %d1 = tensor.dim %arg1, %ci1 : tensor<?x?xf32>
+  %splat = tensor.splat %cf0[%d0, %d1] : tensor<?x?xf32>
+  %0 = linalg.matmul  ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%splat : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+#conv_trait = {
+  indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0 * 3 + d2, d1 * 4 + d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>],
+  iterator_types = ["parallel", "parallel", "reduction", "reduction"]
+}
+
+func.func @conv(%arg0: tensor<16x29xf32>, %arg1: tensor<3x4xf32>) -> (tensor<5x7xf32>) {
+  %c0 = arith.constant 0.0 : f32 
+  %splat = tensor.splat %c0 : tensor<5x7xf32>
+  %result = linalg.generic #conv_trait ins(%arg0, %arg1 : tensor<16x29xf32>, tensor<3x4xf32>) outs(%splat : tensor<5x7xf32>) {
+  ^bb0(%in: f32, %in_64: f32, %out: f32):
+    %5 = arith.mulf %in, %in_64 : f32
+    %6 = arith.addf %out, %5 : f32
+    linalg.yield %6 : f32
+  } -> tensor<5x7xf32>
+  return %result : tensor<5x7xf32>
+}
+
+#reverse_trait = {
+  indexing_maps = [
+          affine_map<(i) -> (3 - i)>,
+          affine_map<(i) -> (i)>
+  ],
+  iterator_types = ["parallel"]
+}
+
+func.func @reverse_from_3(%arg0: tensor<?xf32>) -> (tensor<?xf32>) {
+  %cf0 = arith.constant 0.0 : f32 
+  %ci0 = arith.constant 0 : index 
+  %d0 = tensor.dim %arg0, %ci0 : tensor<?xf32>
+  %splat = tensor.splat %cf0[%d0] : tensor<?xf32>
+  %result = linalg.generic #reverse_trait ins(%arg0: tensor<?xf32>) outs(%splat: tensor<?xf32>) {
+    ^bb0(%a: f32, %b: f32):
+    linalg.yield %a : f32
+  } -> tensor<?xf32>
+  return %result : tensor<?xf32>
+}
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir
index a101b76ef186..db882f7a54d3 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir
@@ -35,6 +35,12 @@ func.func @main() {
 func.func private @printMemrefF32(%ptr : tensor<*xf32>)
 
 func.func @expand_dynamic_shape(%arg0 : tensor<2x?x?xf32>) -> tensor<2x2x?x1x?xf32> {
-  %0 = tensor.expand_shape %arg0 [[0], [1, 2, 3], [4]]: tensor<2x?x?xf32> into tensor<2x2x?x1x?xf32>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %d1 = tensor.dim %arg0, %c1 : tensor<2x?x?xf32>
+  %d2 = tensor.dim %arg0, %c2 : tensor<2x?x?xf32>
+  %sz1 = arith.divui %d1, %c2 : index
+  %0 = tensor.expand_shape %arg0 [[0], [1, 2, 3], [4]] output_shape [2, 2, %sz1, 1, %d2] : tensor<2x?x?xf32> into tensor<2x2x?x1x?xf32>
   return %0 : tensor<2x2x?x1x?xf32>
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/padded_sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/padded_sparse_conv_2d.mlir
new file mode 100644
index 000000000000..50dd989416e2
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/padded_sparse_conv_2d.mlir
@@ -0,0 +1,169 @@
+//--------------------------------------------------------------------------------------------------
+// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
+//
+// Set-up that's shared across all tests in this directory. In principle, this
+// config could be moved to lit.local.cfg. However, there are downstream users that
+//  do not use these LIT config files. Hence why this is kept inline.
+//
+// DEFINE: %{sparsifier_opts} = enable-runtime-library=true
+// DEFINE: %{sparsifier_opts_sve} = enable-arm-sve=true %{sparsifier_opts}
+// DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
+// DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
+// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
+// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
+// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
+//
+// DEFINE: %{env} =
+//--------------------------------------------------------------------------------------------------
+
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation.
+// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and VLA vectorization.
+// RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}
+
+#CDCC_NHWC = #sparse_tensor.encoding<{
+  map = (d0, d1, d2, d3) -> (d0 : compressed, d1 : dense, d2 : compressed, d3 : compressed)
+}>
+
+// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f
+func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor<?x?x?x?xf32> {
+  %buf = tensor.empty(%s1, %s2, %s3, %s4) : tensor<?x?x?x?xf32>
+  %ret = linalg.fill ins(%f : f32) outs(%buf : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @conv_2d_nhwc_hwcf(%arg0: tensor<3x8x8x3xf32>, %arg1: tensor<5x5x3x1xf32>, %arg2: tensor<3x8x8x1xf32>) -> tensor<3x8x8x1xf32> {
+  %cst_0 = arith.constant 0.00000e+00 : f32
+
+  %padded = tensor.pad %arg0 low[0, 2, 2, 0] high[0, 2, 2, 0] {
+   ^bb0(%arg75: index, %arg76: index, %arg77: index, %arg78: index):
+     tensor.yield %cst_0 : f32
+   } : tensor<3x8x8x3xf32> to tensor<3x12x12x3xf32>
+
+  %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
+                                     strides = dense<1> : tensor<2xi64>}
+     ins (%padded, %arg1: tensor<3x12x12x3xf32>, tensor<5x5x3x1xf32>)
+    outs (%arg2: tensor<3x8x8x1xf32>) -> tensor<3x8x8x1xf32>
+
+  bufferization.dealloc_tensor %padded : tensor<3x12x12x3xf32>
+  return %ret : tensor<3x8x8x1xf32>
+}
+
+func.func @conv_2d_nhwc_hwcf_CDCC_NHWC(%arg0: tensor<3x8x8x3xf32, #CDCC_NHWC>, %arg1: tensor<5x5x3x1xf32>) -> tensor<3x8x8x1xf32> {
+  %cst_0 = arith.constant 0.00000e+00 : f32
+  %buf = tensor.empty() : tensor<3x8x8x1xf32>
+  %s = linalg.fill ins(%cst_0 : f32) outs(%buf : tensor<3x8x8x1xf32>) -> tensor<3x8x8x1xf32>
+
+  %padded = tensor.pad %arg0 low[0, 2, 2, 0] high[0, 2, 2, 0] {
+   ^bb0(%arg75: index, %arg76: index, %arg77: index, %arg78: index):
+     tensor.yield %cst_0 : f32
+   } : tensor<3x8x8x3xf32, #CDCC_NHWC> to tensor<3x12x12x3xf32, #CDCC_NHWC>
+
+  %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
+                                     strides = dense<1> : tensor<2xi64>}
+     ins (%padded, %arg1: tensor<3x12x12x3xf32, #CDCC_NHWC>, tensor<5x5x3x1xf32>)
+    outs (%s: tensor<3x8x8x1xf32>) -> tensor<3x8x8x1xf32>
+  return %ret : tensor<3x8x8x1xf32>
+}
+
+func.func @main() {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %c5 = arith.constant 5 : index
+  %c6 = arith.constant 6 : index
+  %c8 = arith.constant 8 : index
+  %f10 = arith.constant 10.00000e+00 : f32
+  %val = arith.constant 2.00000e+00 : f32
+  %zero = arith.constant 0.00000e+00 : f32
+
+  %filter2D_nhwc = call @alloc_4d_filled_f32(%c5, %c5, %c3, %c1, %val) :(index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+  %in2D_tmp = call @alloc_4d_filled_f32(%c3, %c8, %c8, %c3, %val) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+  %in2D_nhwc = tensor.insert %f10 into %in2D_tmp[%c0, %c0, %c3, %c0] : tensor<?x?x?x?xf32>
+  %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c8, %c8, %c1, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+
+  %static_filter = tensor.cast %filter2D_nhwc : tensor<?x?x?x?xf32> to tensor<5x5x3x1xf32>
+  %static_input = tensor.cast %in2D_nhwc : tensor<?x?x?x?xf32> to tensor<3x8x8x3xf32>
+  %static_output = tensor.cast %out2D_nhwc : tensor<?x?x?x?xf32> to tensor<3x8x8x1xf32>
+
+  %dense_ret = call @conv_2d_nhwc_hwcf(%static_input, %static_filter, %static_output) : (tensor<3x8x8x3xf32>, tensor<5x5x3x1xf32>, tensor<3x8x8x1xf32>) -> (tensor<3x8x8x1xf32>)
+
+  %in2D_nhwc_CDCC_NHWC = sparse_tensor.convert %static_input : tensor<3x8x8x3xf32> to tensor<3x8x8x3xf32, #CDCC_NHWC>
+  %CDCC_NHWC_ret = call @conv_2d_nhwc_hwcf_CDCC_NHWC(%in2D_nhwc_CDCC_NHWC, %static_filter) : (tensor<3x8x8x3xf32, #CDCC_NHWC>, tensor<5x5x3x1xf32>) -> (tensor<3x8x8x1xf32>)
+
+
+  // CHECK:   ( ( ( ( 108 ), ( 160 ), ( 196 ), ( 196 ), ( 196 ), ( 196 ), ( 144 ), ( 108 ) ),
+  // CHECK-SAME:  ( ( 144 ), ( 208 ), ( 256 ), ( 256 ), ( 256 ), ( 256 ), ( 192 ), ( 144 ) ),
+  // CHECK-SAME:  ( ( 180 ), ( 256 ), ( 316 ), ( 316 ), ( 316 ), ( 316 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:  ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:  ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:  ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:  ( ( 144 ), ( 192 ), ( 240 ), ( 240 ), ( 240 ), ( 240 ), ( 192 ), ( 144 ) ),
+  // CHECK-SAME:  ( ( 108 ), ( 144 ), ( 180 ), ( 180 ), ( 180 ), ( 180 ), ( 144 ), ( 108 ) ) ),
+  // CHECK-SAME:( ( ( 108 ), ( 144 ), ( 180 ), ( 180 ), ( 180 ), ( 180 ), ( 144 ), ( 108 ) ),
+  // CHECK-SAME:  ( ( 144 ), ( 192 ), ( 240 ), ( 240 ), ( 240 ), ( 240 ), ( 192 ), ( 144 ) ),
+  // CHECK-SAME:  ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:  ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:  ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:  ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:  ( ( 144 ), ( 192 ), ( 240 ), ( 240 ), ( 240 ), ( 240 ), ( 192 ), ( 144 ) ),
+  // CHECK-SAME:  ( ( 108 ), ( 144 ), ( 180 ), ( 180 ), ( 180 ), ( 180 ), ( 144 ), ( 108 ) ) ),
+  // CHECK-SAME:( ( ( 108 ), ( 144 ), ( 180 ), ( 180 ), ( 180 ), ( 180 ), ( 144 ), ( 108 ) ),
+  // CHECK-SAME:  ( ( 144 ), ( 192 ), ( 240 ), ( 240 ), ( 240 ), ( 240 ), ( 192 ), ( 144 ) ),
+  // CHECK-SAME:  ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:  ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:  ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:  ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:  ( ( 144 ), ( 192 ), ( 240 ), ( 240 ), ( 240 ), ( 240 ), ( 192 ), ( 144 ) ),
+  // CHECK-SAME:  ( ( 108 ), ( 144 ), ( 180 ), ( 180 ), ( 180 ), ( 180 ), ( 144 ), ( 108 ) ) ) )
+  %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero
+      : tensor<3x8x8x1xf32>, vector<3x8x8x1xf32>
+  vector.print %dense_v : vector<3x8x8x1xf32>
+
+  // CHECK-NEXT: ( ( ( ( 108 ), ( 160 ), ( 196 ), ( 196 ), ( 196 ), ( 196 ), ( 144 ), ( 108 ) ),
+  // CHECK-SAME:     ( ( 144 ), ( 208 ), ( 256 ), ( 256 ), ( 256 ), ( 256 ), ( 192 ), ( 144 ) ),
+  // CHECK-SAME:     ( ( 180 ), ( 256 ), ( 316 ), ( 316 ), ( 316 ), ( 316 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:     ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:     ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:     ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:     ( ( 144 ), ( 192 ), ( 240 ), ( 240 ), ( 240 ), ( 240 ), ( 192 ), ( 144 ) ),
+  // CHECK-SAME:     ( ( 108 ), ( 144 ), ( 180 ), ( 180 ), ( 180 ), ( 180 ), ( 144 ), ( 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108 ), ( 144 ), ( 180 ), ( 180 ), ( 180 ), ( 180 ), ( 144 ), ( 108 ) ),
+  // CHECK-SAME:     ( ( 144 ), ( 192 ), ( 240 ), ( 240 ), ( 240 ), ( 240 ), ( 192 ), ( 144 ) ),
+  // CHECK-SAME:     ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:     ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:     ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:     ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:     ( ( 144 ), ( 192 ), ( 240 ), ( 240 ), ( 240 ), ( 240 ), ( 192 ), ( 144 ) ),
+  // CHECK-SAME:     ( ( 108 ), ( 144 ), ( 180 ), ( 180 ), ( 180 ), ( 180 ), ( 144 ), ( 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108 ), ( 144 ), ( 180 ), ( 180 ), ( 180 ), ( 180 ), ( 144 ), ( 108 ) ),
+  // CHECK-SAME:     ( ( 144 ), ( 192 ), ( 240 ), ( 240 ), ( 240 ), ( 240 ), ( 192 ), ( 144 ) ),
+  // CHECK-SAME:     ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:     ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:     ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:     ( ( 180 ), ( 240 ), ( 300 ), ( 300 ), ( 300 ), ( 300 ), ( 240 ), ( 180 ) ),
+  // CHECK-SAME:     ( ( 144 ), ( 192 ), ( 240 ), ( 240 ), ( 240 ), ( 240 ), ( 192 ), ( 144 ) ),
+  // CHECK-SAME:     ( ( 108 ), ( 144 ), ( 180 ), ( 180 ), ( 180 ), ( 180 ), ( 144 ), ( 108 ) ) ) )
+  %CDCC_NHWC_v = vector.transfer_read %CDCC_NHWC_ret[%c0, %c0, %c0, %c0], %zero
+      : tensor<3x8x8x1xf32>, vector<3x8x8x1xf32>
+  vector.print %CDCC_NHWC_v : vector<3x8x8x1xf32>
+
+  bufferization.dealloc_tensor %static_filter : tensor<5x5x3x1xf32>
+  bufferization.dealloc_tensor %static_input  : tensor<3x8x8x3xf32>
+  bufferization.dealloc_tensor %static_output : tensor<3x8x8x1xf32>
+
+  bufferization.dealloc_tensor %CDCC_NHWC_ret : tensor<3x8x8x1xf32>
+
+  bufferization.dealloc_tensor %in2D_nhwc_CDCC_NHWC : tensor<3x8x8x3xf32, #CDCC_NHWC>
+
+  return
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
index f7975e0738fa..73dddefb0e4a 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
@@ -44,7 +44,7 @@ module {
     %cst = arith.constant 0.000000e+00 : f32
     %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<5x6xf32>) -> tensor<5x6xf32>
     %2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32>, tensor<6x6xf32, #COO_2D>) outs(%1 : tensor<5x6xf32>) -> tensor<5x6xf32>
-    %expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
+    %expanded = tensor.expand_shape %2 [[0], [1, 2]] output_shape [5,2,3]: tensor<5x6xf32> into tensor<5x2x3xf32>
     %ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
 
     // Note: tensor.collapse_shape is a metadata-only operation on dense tensors
@@ -60,7 +60,7 @@ module {
     %cst = arith.constant 0.000000e+00 : f32
     %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<5x6xf32>) -> tensor<5x6xf32>
     %2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32, #COO_2D>, tensor<6x6xf32, #COO_2D>) outs(%1 : tensor<5x6xf32>) -> tensor<5x6xf32>
-    %expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
+    %expanded = tensor.expand_shape %2 [[0], [1, 2]] output_shape [5,2,3]: tensor<5x6xf32> into tensor<5x2x3xf32>
     %ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
 
     // Note: tensor.collapse_shape is a metadata-only operation on dense tensors
@@ -76,7 +76,7 @@ module {
     %cst = arith.constant 0.000000e+00 : f32
     %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<5x6xf32>) -> tensor<5x6xf32>
     %2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32>, tensor<6x6xf32>) outs(%1 : tensor<5x6xf32>) -> tensor<5x6xf32>
-    %expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
+    %expanded = tensor.expand_shape %2 [[0], [1, 2]] output_shape [5,2,3]: tensor<5x6xf32> into tensor<5x2x3xf32>
     %ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
     return %ret1 :  tensor<?x?x?xf32>
   }
@@ -88,7 +88,7 @@ module {
     %cst = arith.constant 0.000000e+00 : f32
     %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<5x6xf32>) -> tensor<5x6xf32>
     %2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32, #COO_2D>, tensor<6x6xf32, #COO_2D>) outs(%1 : tensor<5x6xf32>) -> tensor<5x6xf32>
-    %expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
+    %expanded = tensor.expand_shape %2 [[0], [1, 2]] output_shape [5,2,3]: tensor<5x6xf32> into tensor<5x2x3xf32>
     %ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
 
     // Note: tensor.collapse_shape is a metadata-only operation on dense tensors
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand_shape.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand_shape.mlir
index 6679a81c7408..393242484576 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand_shape.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand_shape.mlir
@@ -53,62 +53,86 @@
 module {
 
   func.func @expand_dense(%arg0: tensor<12xf64>) -> tensor<3x4xf64> {
-    %0 = tensor.expand_shape %arg0 [[0, 1]] : tensor<12xf64> into tensor<3x4xf64>
+    %0 = tensor.expand_shape %arg0 [[0, 1]] output_shape [3, 4] : tensor<12xf64> into tensor<3x4xf64>
     return %0 : tensor<3x4xf64>
   }
 
   func.func @expand_from_sparse(%arg0: tensor<12xf64, #SparseVector>) -> tensor<3x4xf64> {
-    %0 = tensor.expand_shape %arg0 [[0, 1]] : tensor<12xf64, #SparseVector> into tensor<3x4xf64>
+    %0 = tensor.expand_shape %arg0 [[0, 1]] output_shape [3, 4] : tensor<12xf64, #SparseVector> into tensor<3x4xf64>
     return %0 : tensor<3x4xf64>
   }
 
   func.func @expand_to_sparse(%arg0: tensor<12xf64>) -> tensor<3x4xf64, #SparseMatrix> {
-    %0 = tensor.expand_shape %arg0 [[0, 1]] : tensor<12xf64> into tensor<3x4xf64, #SparseMatrix>
+    %0 = tensor.expand_shape %arg0 [[0, 1]] output_shape [3, 4] : tensor<12xf64> into tensor<3x4xf64, #SparseMatrix>
     return %0 : tensor<3x4xf64, #SparseMatrix>
   }
 
   func.func @expand_sparse2sparse(%arg0: tensor<12xf64, #SparseVector>) -> tensor<3x4xf64, #SparseMatrix> {
-    %0 = tensor.expand_shape %arg0 [[0, 1]] : tensor<12xf64, #SparseVector> into tensor<3x4xf64, #SparseMatrix>
+    %0 = tensor.expand_shape %arg0 [[0, 1]] output_shape [3, 4] : tensor<12xf64, #SparseVector> into tensor<3x4xf64, #SparseMatrix>
     return %0 : tensor<3x4xf64, #SparseMatrix>
   }
 
   func.func @expand_dense_3x2x2(%arg0: tensor<3x4xf64>) -> tensor<3x2x2xf64> {
-    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] : tensor<3x4xf64> into tensor<3x2x2xf64>
+    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [3, 2, 2] : tensor<3x4xf64> into tensor<3x2x2xf64>
     return %0 : tensor<3x2x2xf64>
   }
 
   func.func @expand_from_sparse_3x2x2(%arg0: tensor<3x4xf64, #SparseMatrix>) -> tensor<3x2x2xf64> {
-    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] : tensor<3x4xf64, #SparseMatrix> into tensor<3x2x2xf64>
+    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [3, 2, 2] : tensor<3x4xf64, #SparseMatrix> into tensor<3x2x2xf64>
     return %0 : tensor<3x2x2xf64>
   }
 
   func.func @expand_to_sparse_3x2x2(%arg0: tensor<3x4xf64>) -> tensor<3x2x2xf64, #Sparse3dTensor> {
-    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] : tensor<3x4xf64> into tensor<3x2x2xf64, #Sparse3dTensor>
+    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [3, 2, 2] : tensor<3x4xf64> into tensor<3x2x2xf64, #Sparse3dTensor>
     return %0 : tensor<3x2x2xf64, #Sparse3dTensor>
   }
 
   func.func @expand_sparse2sparse_3x2x2(%arg0: tensor<3x4xf64, #SparseMatrix>) -> tensor<3x2x2xf64, #Sparse3dTensor> {
-    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] : tensor<3x4xf64, #SparseMatrix> into tensor<3x2x2xf64, #Sparse3dTensor>
+    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [3, 2, 2] : tensor<3x4xf64, #SparseMatrix> into tensor<3x2x2xf64, #Sparse3dTensor>
     return %0 : tensor<3x2x2xf64, #Sparse3dTensor>
   }
 
   func.func @expand_dense_dyn(%arg0: tensor<?x?xf64>) -> tensor<?x2x?xf64> {
-    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] : tensor<?x?xf64> into tensor<?x2x?xf64>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf64>
+    %d1 = tensor.dim %arg0, %c1 : tensor<?x?xf64>
+    %d2 = arith.divui %d1, %c2 : index
+    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [%d0, 2, %d2]  : tensor<?x?xf64> into tensor<?x2x?xf64>
     return %0 : tensor<?x2x?xf64>
   }
 
   func.func @expand_from_sparse_dyn(%arg0: tensor<?x?xf64, #SparseMatrix>) -> tensor<?x2x?xf64> {
-    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] : tensor<?x?xf64, #SparseMatrix> into tensor<?x2x?xf64>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf64, #SparseMatrix>
+    %d1 = tensor.dim %arg0, %c1 : tensor<?x?xf64, #SparseMatrix>
+    %d2 = arith.divui %d1, %c2 : index
+    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [%d0, 2, %d2]  : tensor<?x?xf64, #SparseMatrix> into tensor<?x2x?xf64>
     return %0 : tensor<?x2x?xf64>
   }
 
   func.func @expand_to_sparse_dyn(%arg0: tensor<?x?xf64>) -> tensor<?x2x?xf64, #Sparse3dTensor> {
-    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] : tensor<?x?xf64> into tensor<?x2x?xf64, #Sparse3dTensor>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf64>
+    %d1 = tensor.dim %arg0, %c1 : tensor<?x?xf64>
+    %d2 = arith.divui %d1, %c2 : index
+    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [%d0, 2, %d2]  : tensor<?x?xf64> into tensor<?x2x?xf64, #Sparse3dTensor>
     return %0 : tensor<?x2x?xf64, #Sparse3dTensor>
   }
 
   func.func @expand_sparse2sparse_dyn(%arg0: tensor<?x?xf64, #SparseMatrix>) -> tensor<?x2x?xf64, #Sparse3dTensor> {
-    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] : tensor<?x?xf64, #SparseMatrix> into tensor<?x2x?xf64, #Sparse3dTensor>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf64, #SparseMatrix>
+    %d1 = tensor.dim %arg0, %c1 : tensor<?x?xf64, #SparseMatrix>
+    %d2 = arith.divui %d1, %c2 : index
+    %0 = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [%d0, 2, %d2]  : tensor<?x?xf64, #SparseMatrix> into tensor<?x2x?xf64, #Sparse3dTensor>
     return %0 : tensor<?x2x?xf64, #Sparse3dTensor>
   }
 
diff --git a/mlir/test/lib/Dialect/Test/CMakeLists.txt b/mlir/test/lib/Dialect/Test/CMakeLists.txt
index f63e4d330e6a..fab893780933 100644
--- a/mlir/test/lib/Dialect/Test/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/Test/CMakeLists.txt
@@ -31,8 +31,6 @@ mlir_tablegen(TestOpEnums.cpp.inc -gen-enum-defs)
 add_public_tablegen_target(MLIRTestEnumDefIncGen)
 
 set(LLVM_TARGET_DEFINITIONS TestOps.td)
-mlir_tablegen(TestOps.h.inc -gen-op-decls)
-mlir_tablegen(TestOps.cpp.inc -gen-op-defs)
 mlir_tablegen(TestOpsDialect.h.inc -gen-dialect-decls -dialect=test)
 mlir_tablegen(TestOpsDialect.cpp.inc -gen-dialect-defs -dialect=test)
 mlir_tablegen(TestPatterns.inc -gen-rewriters)
@@ -43,6 +41,8 @@ mlir_tablegen(TestOpsSyntax.h.inc -gen-op-decls)
 mlir_tablegen(TestOpsSyntax.cpp.inc -gen-op-defs)
 add_public_tablegen_target(MLIRTestOpsSyntaxIncGen)
 
+add_sharded_ops(TestOps 20)
+
 # Exclude tests from libMLIR.so
 add_mlir_library(MLIRTestDialect
   TestAttributes.cpp
@@ -56,6 +56,7 @@ add_mlir_library(MLIRTestDialect
   TestTypes.cpp
   TestOpsSyntax.cpp
   TestDialectInterfaces.cpp
+  ${SHARDED_SRCS}
 
   EXCLUDE_FROM_LIBMLIR
 
@@ -66,6 +67,7 @@ add_mlir_library(MLIRTestDialect
   MLIRTestTypeDefIncGen
   MLIRTestOpsIncGen
   MLIRTestOpsSyntaxIncGen
+  MLIRTestOpsShardGen
 
   LINK_LIBS PUBLIC
   MLIRControlFlowInterfaces
diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp
index 77fd7e61bd3a..bfb9592e6382 100644
--- a/mlir/test/lib/Dialect/Test/TestDialect.cpp
+++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp
@@ -326,12 +326,9 @@ struct TestOpEffectInterfaceFallback
 void TestDialect::initialize() {
   registerAttributes();
   registerTypes();
-  addOperations<
-#define GET_OP_LIST
-#include "TestOps.cpp.inc"
-      >();
   registerOpsSyntax();
   addOperations<ManualCppOpWithFold>();
+  registerTestDialectOperations(this);
   registerDynamicOp(getDynamicGenericOp(this));
   registerDynamicOp(getDynamicOneOperandTwoResultsOp(this));
   registerDynamicOp(getDynamicCustomParserPrinterOp(this));
diff --git a/mlir/test/lib/Dialect/Test/TestOps.cpp b/mlir/test/lib/Dialect/Test/TestOps.cpp
index ce7e476be74e..47d5b1b19121 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.cpp
+++ b/mlir/test/lib/Dialect/Test/TestOps.cpp
@@ -14,5 +14,4 @@
 using namespace mlir;
 using namespace test;
 
-#define GET_OP_CLASSES
 #include "TestOps.cpp.inc"
diff --git a/mlir/test/mlir-tblgen/shard-op-defs.td b/mlir/test/mlir-tblgen/shard-op-defs.td
new file mode 100644
index 000000000000..84ac6b0fbe9e
--- /dev/null
+++ b/mlir/test/mlir-tblgen/shard-op-defs.td
@@ -0,0 +1,33 @@
+// RUN: mlir-tblgen -gen-op-defs -op-shard-count=2 -I %S/../../include %s | FileCheck %s --check-prefix=DEFS
+// RUN: mlir-tblgen -gen-op-decls -op-shard-count=2 -I %S/../../include %s | FileCheck %s --check-prefix=DECLS
+
+include "mlir/IR/OpBase.td"
+
+def Test_Dialect : Dialect {
+  let name = "test";
+  let cppNamespace = "test";
+}
+
+class Test_Op<string mnemonic, list<Trait> traits = []> 
+    : Op<Test_Dialect, mnemonic, traits>;
+
+def OpA : Test_Op<"a">;
+def OpB : Test_Op<"b">;
+def OpC : Test_Op<"c">;
+
+// DECLS: OpA
+// DECLS: OpB
+// DECLS: OpC
+// DECLS: registerTestDialectOperations(
+// DECLS: registerTestDialectOperations0(
+// DECLS: registerTestDialectOperations1(
+
+// DEFS-LABEL: GET_OP_DEFS_0
+// DEFS: void test::registerTestDialectOperations(
+// DEFS: void test::registerTestDialectOperations0(
+// DEFS: OpAAdaptor
+// DEFS: OpBAdaptor
+
+// DEFS-LABEL: GET_OP_DEFS_1
+// DEFS: void test::registerTestDialectOperations1(
+// DEFS: OpCAdaptor
diff --git a/mlir/test/python/dialects/sparse_tensor/dialect.py b/mlir/test/python/dialects/sparse_tensor/dialect.py
index 5666d090c3d5..3cc4575eb3e2 100644
--- a/mlir/test/python/dialects/sparse_tensor/dialect.py
+++ b/mlir/test/python/dialects/sparse_tensor/dialect.py
@@ -2,6 +2,7 @@
 
 from mlir.ir import *
 from mlir.dialects import sparse_tensor as st
+import textwrap
 
 
 def run(f):
@@ -15,13 +16,18 @@ def run(f):
 def testEncodingAttr1D():
     with Context() as ctx:
         parsed = Attribute.parse(
-            "#sparse_tensor.encoding<{"
-            "  map = (d0) -> (d0 : compressed),"
-            "  posWidth = 16,"
-            "  crdWidth = 32"
-            "}>"
+            textwrap.dedent(
+                """\
+                #sparse_tensor.encoding<{
+                    map = (d0) -> (d0 : compressed),
+                    posWidth = 16,
+                    crdWidth = 32,
+                    explicitVal = 1.0 : f64
+                }>\
+            """
+            )
         )
-        # CHECK: #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed), posWidth = 16, crdWidth = 32 }>
+        # CHECK: #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed), posWidth = 16, crdWidth = 32, explicitVal = 1.000000e+00 : f64 }>
         print(parsed)
 
         casted = st.EncodingAttr(parsed)
@@ -38,9 +44,16 @@ def testEncodingAttr1D():
         print(f"pos_width: {casted.pos_width}")
         # CHECK: crd_width: 32
         print(f"crd_width: {casted.crd_width}")
+        # CHECK: explicit_val: 1.000000e+00
+        print(f"explicit_val: {casted.explicit_val}")
+        # CHECK: implicit_val: None
+        print(f"implicit_val: {casted.implicit_val}")
 
-        created = st.EncodingAttr.get(casted.lvl_types, None, None, 0, 0)
-        # CHECK: #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>
+        new_explicit_val = FloatAttr.get_f64(1.0)
+        created = st.EncodingAttr.get(
+            casted.lvl_types, None, None, 0, 0, new_explicit_val
+        )
+        # CHECK: #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed), explicitVal = 1.000000e+00 : f64 }>
         print(created)
         # CHECK: created_equal: False
         print(f"created_equal: {created == casted}")
@@ -57,12 +70,16 @@ def testEncodingAttr1D():
 def testEncodingAttrStructure():
     with Context() as ctx:
         parsed = Attribute.parse(
-            "#sparse_tensor.encoding<{"
-            "  map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense,"
-            "  d1 mod 4 : structured[2, 4]),"
-            "  posWidth = 16,"
-            "  crdWidth = 32"
-            "}>"
+            textwrap.dedent(
+                """\
+                #sparse_tensor.encoding<{
+                    map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense,
+                    d1 mod 4 : structured[2, 4]),
+                    posWidth = 16,
+                    crdWidth = 32,
+                }>\
+            """
+            )
         )
         # CHECK: #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 floordiv 4 : dense, d1 mod 4 : structured[2, 4]), posWidth = 16, crdWidth = 32 }>
         print(parsed)
@@ -144,11 +161,15 @@ def testEncodingAttrStructure():
 def testEncodingAttr2D():
     with Context() as ctx:
         parsed = Attribute.parse(
-            "#sparse_tensor.encoding<{"
-            "  map = (d0, d1) -> (d1 : dense, d0 : compressed),"
-            "  posWidth = 8,"
-            "  crdWidth = 32"
-            "}>"
+            textwrap.dedent(
+                """\
+                #sparse_tensor.encoding<{
+                    map = (d0, d1) -> (d1 : dense, d0 : compressed),
+                    posWidth = 8,
+                    crdWidth = 32,
+                }>\
+            """
+            )
         )
         # CHECK: #sparse_tensor.encoding<{ map = (d0, d1) -> (d1 : dense, d0 : compressed), posWidth = 8, crdWidth = 32 }>
         print(parsed)
@@ -187,11 +208,15 @@ def testEncodingAttrOnTensorType():
     with Context() as ctx, Location.unknown():
         encoding = st.EncodingAttr(
             Attribute.parse(
-                "#sparse_tensor.encoding<{"
-                "  map = (d0) -> (d0 : compressed), "
-                "  posWidth = 64,"
-                "  crdWidth = 32"
-                "}>"
+                textwrap.dedent(
+                    """\
+                    #sparse_tensor.encoding<{
+                        map = (d0) -> (d0 : compressed),
+                        posWidth = 64,
+                        crdWidth = 32,
+                    }>\
+                """
+                )
             )
         )
         tt = RankedTensorType.get((1024,), F32Type.get(), encoding=encoding)
diff --git a/mlir/tools/mlir-src-sharder/CMakeLists.txt b/mlir/tools/mlir-src-sharder/CMakeLists.txt
new file mode 100644
index 000000000000..4ef870b61124
--- /dev/null
+++ b/mlir/tools/mlir-src-sharder/CMakeLists.txt
@@ -0,0 +1,14 @@
+set(LLVM_LINK_COMPONENTS Support)
+set(LIBS MLIRSupport)
+
+add_tablegen(mlir-src-sharder MLIR_SRC_SHARDER
+  mlir-src-sharder.cpp
+
+  DEPENDS
+  ${LIBS}
+  )
+
+set_target_properties(mlir-src-sharder PROPERTIES FOLDER "Tablegenning")
+target_link_libraries(mlir-src-sharder PRIVATE ${LIBS})
+
+mlir_check_all_link_libraries(mlir-src-sharder)
diff --git a/mlir/tools/mlir-src-sharder/mlir-src-sharder.cpp b/mlir/tools/mlir-src-sharder/mlir-src-sharder.cpp
new file mode 100644
index 000000000000..dc1e2939c7d2
--- /dev/null
+++ b/mlir/tools/mlir-src-sharder/mlir-src-sharder.cpp
@@ -0,0 +1,114 @@
+//===- mlir-src-sharder.cpp - A tool for sharder generated source files ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Support/FileUtilities.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/ToolOutputFile.h"
+
+using namespace mlir;
+
+/// Create a dependency file for `-d` option.
+///
+/// This functionality is generally only for the benefit of the build system,
+/// and is modeled after the same option in TableGen.
+static LogicalResult createDependencyFile(StringRef outputFilename,
+                                          StringRef dependencyFile) {
+  if (outputFilename == "-") {
+    llvm::errs() << "error: the option -d must be used together with -o\n";
+    return failure();
+  }
+
+  std::string errorMessage;
+  std::unique_ptr<llvm::ToolOutputFile> outputFile =
+      openOutputFile(dependencyFile, &errorMessage);
+  if (!outputFile) {
+    llvm::errs() << errorMessage << "\n";
+    return failure();
+  }
+
+  outputFile->os() << outputFilename << ":\n";
+  outputFile->keep();
+  return success();
+}
+
+int main(int argc, char **argv) {
+  // FIXME: This is necessary because we link in TableGen, which defines its
+  // options as static variables.. some of which overlap with our options.
+  llvm::cl::ResetCommandLineParser();
+
+  llvm::cl::opt<unsigned> opShardIndex(
+      "op-shard-index", llvm::cl::desc("The current shard index"));
+  llvm::cl::opt<std::string> inputFilename(llvm::cl::Positional,
+                                           llvm::cl::desc("<input file>"),
+                                           llvm::cl::init("-"));
+  llvm::cl::opt<std::string> outputFilename(
+      "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"),
+      llvm::cl::init("-"));
+  llvm::cl::list<std::string> includeDirs(
+      "I", llvm::cl::desc("Directory of include files"),
+      llvm::cl::value_desc("directory"), llvm::cl::Prefix);
+  llvm::cl::opt<std::string> dependencyFilename(
+      "d", llvm::cl::desc("Dependency filename"),
+      llvm::cl::value_desc("filename"), llvm::cl::init(""));
+  llvm::cl::opt<bool> writeIfChanged(
+      "write-if-changed",
+      llvm::cl::desc("Only write to the output file if it changed"));
+
+  llvm::InitLLVM y(argc, argv);
+  llvm::cl::ParseCommandLineOptions(argc, argv);
+
+  // Open the input file.
+  std::string errorMessage;
+  std::unique_ptr<llvm::MemoryBuffer> inputFile =
+      openInputFile(inputFilename, &errorMessage);
+  if (!inputFile) {
+    llvm::errs() << errorMessage << "\n";
+    return 1;
+  }
+
+  // Write the output to a buffer.
+  std::string outputStr;
+  llvm::raw_string_ostream os(outputStr);
+  os << "#define GET_OP_DEFS_" << opShardIndex << "\n"
+     << inputFile->getBuffer();
+
+  // Determine whether we need to write the output file.
+  bool shouldWriteOutput = true;
+  if (writeIfChanged) {
+    // Only update the real output file if there are any differences. This
+    // prevents recompilation of all the files depending on it if there aren't
+    // any.
+    if (auto existingOrErr =
+            llvm::MemoryBuffer::getFile(outputFilename, /*IsText=*/true))
+      if (std::move(existingOrErr.get())->getBuffer() == os.str())
+        shouldWriteOutput = false;
+  }
+
+  // Populate the output file if necessary.
+  if (shouldWriteOutput) {
+    std::unique_ptr<llvm::ToolOutputFile> outputFile =
+        openOutputFile(outputFilename, &errorMessage);
+    if (!outputFile) {
+      llvm::errs() << errorMessage << "\n";
+      return 1;
+    }
+    outputFile->os() << os.str();
+    outputFile->keep();
+  }
+
+  // Always write the depfile, even if the main output hasn't changed. If it's
+  // missing, Ninja considers the output dirty.
+  if (!dependencyFilename.empty())
+    if (failed(createDependencyFile(outputFilename, dependencyFilename)))
+      return 1;
+
+  return 0;
+}
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 53ed5cb7c043..63fe5a809907 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -4303,32 +4303,15 @@ void OpOperandAdaptorEmitter::emitDef(
   emitter.adaptor.writeDefTo(os);
 }
 
-// Emits the opcode enum and op classes.
-static void emitOpClasses(const RecordKeeper &recordKeeper,
-                          const std::vector<Record *> &defs, raw_ostream &os,
-                          bool emitDecl) {
-  // First emit forward declaration for each class, this allows them to refer
-  // to each others in traits for example.
-  if (emitDecl) {
-    os << "#if defined(GET_OP_CLASSES) || defined(GET_OP_FWD_DEFINES)\n";
-    os << "#undef GET_OP_FWD_DEFINES\n";
-    for (auto *def : defs) {
-      Operator op(*def);
-      NamespaceEmitter emitter(os, op.getCppNamespace());
-      os << "class " << op.getCppClassName() << ";\n";
-    }
-    os << "#endif\n\n";
-  }
-
-  IfDefScope scope("GET_OP_CLASSES", os);
+/// Emit the class declarations or definitions for the given op defs.
+static void
+emitOpClasses(const RecordKeeper &recordKeeper,
+              const std::vector<Record *> &defs, raw_ostream &os,
+              const StaticVerifierFunctionEmitter &staticVerifierEmitter,
+              bool emitDecl) {
   if (defs.empty())
     return;
 
-  // Generate all of the locally instantiated methods first.
-  StaticVerifierFunctionEmitter staticVerifierEmitter(os, recordKeeper);
-  os << formatv(opCommentHeader, "Local Utility Method", "Definitions");
-  staticVerifierEmitter.emitOpConstraints(defs, emitDecl);
-
   for (auto *def : defs) {
     Operator op(*def);
     if (emitDecl) {
@@ -4358,34 +4341,145 @@ static void emitOpClasses(const RecordKeeper &recordKeeper,
   }
 }
 
-// Emits a comma-separated list of the ops.
-static void emitOpList(const std::vector<Record *> &defs, raw_ostream &os) {
-  IfDefScope scope("GET_OP_LIST", os);
+/// Emit the declarations for the provided op classes.
+static void emitOpClassDecls(const RecordKeeper &recordKeeper,
+                             const std::vector<Record *> &defs,
+                             raw_ostream &os) {
+  // First emit forward declaration for each class, this allows them to refer
+  // to each others in traits for example.
+  for (auto *def : defs) {
+    Operator op(*def);
+    NamespaceEmitter emitter(os, op.getCppNamespace());
+    os << "class " << op.getCppClassName() << ";\n";
+  }
+
+  // Emit the op class declarations.
+  IfDefScope scope("GET_OP_CLASSES", os);
+  if (defs.empty())
+    return;
+  StaticVerifierFunctionEmitter staticVerifierEmitter(os, recordKeeper);
+  staticVerifierEmitter.collectOpConstraints(defs);
+  emitOpClasses(recordKeeper, defs, os, staticVerifierEmitter,
+                /*emitDecl=*/true);
+}
+
+/// Emit the definitions for the provided op classes.
+static void emitOpClassDefs(const RecordKeeper &recordKeeper,
+                            ArrayRef<Record *> defs, raw_ostream &os,
+                            StringRef constraintPrefix = "") {
+  if (defs.empty())
+    return;
+
+  // Generate all of the locally instantiated methods first.
+  StaticVerifierFunctionEmitter staticVerifierEmitter(os, recordKeeper,
+                                                      constraintPrefix);
+  os << formatv(opCommentHeader, "Local Utility Method", "Definitions");
+  staticVerifierEmitter.collectOpConstraints(defs);
+  staticVerifierEmitter.emitOpConstraints(defs);
 
-  interleave(
-      // TODO: We are constructing the Operator wrapper instance just for
-      // getting it's qualified class name here. Reduce the overhead by having a
-      // lightweight version of Operator class just for that purpose.
-      defs, [&os](Record *def) { os << Operator(def).getQualCppClassName(); },
-      [&os]() { os << ",\n"; });
+  // Emit the classes.
+  emitOpClasses(recordKeeper, defs, os, staticVerifierEmitter,
+                /*emitDecl=*/false);
 }
 
+/// Emit op declarations for all op records.
 static bool emitOpDecls(const RecordKeeper &recordKeeper, raw_ostream &os) {
   emitSourceFileHeader("Op Declarations", os, recordKeeper);
 
   std::vector<Record *> defs = getRequestedOpDefinitions(recordKeeper);
-  emitOpClasses(recordKeeper, defs, os, /*emitDecl=*/true);
+  emitOpClassDecls(recordKeeper, defs, os);
+
+  // If we are generating sharded op definitions, emit the sharded op
+  // registration hooks.
+  SmallVector<ArrayRef<Record *>, 4> shardedDefs;
+  shardOpDefinitions(defs, shardedDefs);
+  if (defs.empty() || shardedDefs.size() <= 1)
+    return false;
+
+  Dialect dialect = Operator(defs.front()).getDialect();
+  NamespaceEmitter ns(os, dialect);
+
+  const char *const opRegistrationHook =
+      "void register{0}Operations{1}({2}::{0} *dialect);\n";
+  os << formatv(opRegistrationHook, dialect.getCppClassName(), "",
+                dialect.getCppNamespace());
+  for (unsigned i = 0; i < shardedDefs.size(); ++i) {
+    os << formatv(opRegistrationHook, dialect.getCppClassName(), i,
+                  dialect.getCppNamespace());
+  }
 
   return false;
 }
 
+/// Generate the dialect op registration hook and the op class definitions for a
+/// shard of ops.
+static void emitOpDefShard(const RecordKeeper &recordKeeper,
+                           ArrayRef<Record *> defs, const Dialect &dialect,
+                           unsigned shardIndex, unsigned shardCount,
+                           raw_ostream &os) {
+  std::string shardGuard = "GET_OP_DEFS_";
+  std::string indexStr = std::to_string(shardIndex);
+  shardGuard += indexStr;
+  IfDefScope scope(shardGuard, os);
+
+  // Emit the op registration hook in the first shard.
+  const char *const opRegistrationHook =
+      "void {0}::register{1}Operations{2}({0}::{1} *dialect) {{\n";
+  if (shardIndex == 0) {
+    os << formatv(opRegistrationHook, dialect.getCppNamespace(),
+                  dialect.getCppClassName(), "");
+    for (unsigned i = 0; i < shardCount; ++i) {
+      os << formatv("  {0}::register{1}Operations{2}(dialect);\n",
+                    dialect.getCppNamespace(), dialect.getCppClassName(), i);
+    }
+    os << "}\n";
+  }
+
+  // Generate the per-shard op registration hook.
+  os << formatv(opCommentHeader, dialect.getCppClassName(),
+                "Op Registration Hook")
+     << formatv(opRegistrationHook, dialect.getCppNamespace(),
+                dialect.getCppClassName(), shardIndex);
+  for (Record *def : defs) {
+    os << formatv("  ::mlir::RegisteredOperationName::insert<{0}>(*dialect);\n",
+                  Operator(def).getQualCppClassName());
+  }
+  os << "}\n";
+
+  // Generate the per-shard op definitions.
+  emitOpClassDefs(recordKeeper, defs, os, indexStr);
+}
+
+/// Emit op definitions for all op records.
 static bool emitOpDefs(const RecordKeeper &recordKeeper, raw_ostream &os) {
   emitSourceFileHeader("Op Definitions", os, recordKeeper);
 
   std::vector<Record *> defs = getRequestedOpDefinitions(recordKeeper);
-  emitOpList(defs, os);
-  emitOpClasses(recordKeeper, defs, os, /*emitDecl=*/false);
+  SmallVector<ArrayRef<Record *>, 4> shardedDefs;
+  shardOpDefinitions(defs, shardedDefs);
+
+  // If no shard was requested, emit the regular op list and class definitions.
+  if (shardedDefs.size() == 1) {
+    {
+      IfDefScope scope("GET_OP_LIST", os);
+      interleave(
+          defs, os,
+          [&](Record *def) { os << Operator(def).getQualCppClassName(); },
+          ",\n");
+    }
+    {
+      IfDefScope scope("GET_OP_CLASSES", os);
+      emitOpClassDefs(recordKeeper, defs, os);
+    }
+    return false;
+  }
 
+  if (defs.empty())
+    return false;
+  Dialect dialect = Operator(defs.front()).getDialect();
+  for (auto [idx, value] : llvm::enumerate(shardedDefs)) {
+    emitOpDefShard(recordKeeper, value, dialect, idx, shardedDefs.size(), os);
+  }
   return false;
 }
 
diff --git a/mlir/tools/mlir-tblgen/OpGenHelpers.cpp b/mlir/tools/mlir-tblgen/OpGenHelpers.cpp
index 7fd34df8460d..c2a2423a2402 100644
--- a/mlir/tools/mlir-tblgen/OpGenHelpers.cpp
+++ b/mlir/tools/mlir-tblgen/OpGenHelpers.cpp
@@ -31,6 +31,10 @@ static cl::opt<std::string> opExcFilter(
     "op-exclude-regex",
     cl::desc("Regex of name of op's to exclude (no filter if empty)"),
     cl::cat(opDefGenCat));
+static cl::opt<unsigned> opShardCount(
+    "op-shard-count",
+    cl::desc("The number of shards into which the op classes will be divided"),
+    cl::cat(opDefGenCat), cl::init(1));
 
 static std::string getOperationName(const Record &def) {
   auto prefix = def.getValueAsDef("opDialect")->getValueAsString("name");
@@ -79,4 +83,23 @@ bool mlir::tblgen::isPythonReserved(StringRef str) {
   reserved.insert("issubclass");
   reserved.insert("type");
   return reserved.contains(str);
-}
-\ No newline at end of file
+}
+
+void mlir::tblgen::shardOpDefinitions(
+    ArrayRef<llvm::Record *> defs,
+    SmallVectorImpl<ArrayRef<llvm::Record *>> &shardedDefs) {
+  assert(opShardCount > 0 && "expected a positive shard count");
+  if (opShardCount == 1) {
+    shardedDefs.push_back(defs);
+    return;
+  }
+
+  unsigned minShardSize = defs.size() / opShardCount;
+  unsigned numMissing = defs.size() - minShardSize * opShardCount;
+  shardedDefs.reserve(opShardCount);
+  for (unsigned i = 0, start = 0; i < opShardCount; ++i) {
+    unsigned size = minShardSize + (i < numMissing);
+    shardedDefs.push_back(defs.slice(start, size));
+    start += size;
+  }
+}
diff --git a/mlir/tools/mlir-tblgen/OpGenHelpers.h b/mlir/tools/mlir-tblgen/OpGenHelpers.h
index 3dcff14d1221..1b43d5d3ce3a 100644
--- a/mlir/tools/mlir-tblgen/OpGenHelpers.h
+++ b/mlir/tools/mlir-tblgen/OpGenHelpers.h
@@ -13,6 +13,7 @@
 #ifndef MLIR_TOOLS_MLIRTBLGEN_OPGENHELPERS_H_
 #define MLIR_TOOLS_MLIRTBLGEN_OPGENHELPERS_H_
 
+#include "mlir/Support/LLVM.h"
 #include "llvm/TableGen/Record.h"
 #include <vector>
 
@@ -28,6 +29,10 @@ getRequestedOpDefinitions(const llvm::RecordKeeper &recordKeeper);
 /// Regenerate using python -c"print(set(sorted(__import__('keyword').kwlist)))"
 bool isPythonReserved(llvm::StringRef str);
 
+/// Shard the op defintions into the number of shards set by "op-shard-count".
+void shardOpDefinitions(ArrayRef<llvm::Record *> defs,
+                        SmallVectorImpl<ArrayRef<llvm::Record *>> &shardedDefs);
+
 } // namespace tblgen
 } // namespace mlir
 
diff --git a/mlir/unittests/Tools/lsp-server-support/Transport.cpp b/mlir/unittests/Tools/lsp-server-support/Transport.cpp
index 9877c12c3695..fee218405952 100644
--- a/mlir/unittests/Tools/lsp-server-support/Transport.cpp
+++ b/mlir/unittests/Tools/lsp-server-support/Transport.cpp
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Tools/lsp-server-support/Transport.h"
-#include "llvm/ADT/ScopeExit.h"
+#include "mlir/Tools/lsp-server-support/Logging.h"
+#include "mlir/Tools/lsp-server-support/Protocol.h"
 #include "llvm/Support/FileSystem.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
@@ -29,37 +30,144 @@ TEST(TransportTest, SendReply) {
   EXPECT_THAT(out, HasSubstr("\"result\":null"));
 }
 
-TEST(TransportTest, MethodNotFound) {
-  auto tempOr = llvm::sys::fs::TempFile::create("lsp-unittest-%%%%%%.json");
-  ASSERT_TRUE((bool)tempOr);
-  auto discardTemp =
-      llvm::make_scope_exit([&]() { ASSERT_FALSE((bool)tempOr->discard()); });
+class TransportInputTest : public Test {
+  llvm::SmallVector<char> inputPath;
+  std::FILE *in = nullptr;
+  std::string output = "";
+  llvm::raw_string_ostream os;
+  std::optional<JSONTransport> transport = std::nullopt;
+  std::optional<MessageHandler> messageHandler = std::nullopt;
 
-  {
+protected:
+  TransportInputTest() : os(output) {}
+
+  void SetUp() override {
+    std::error_code ec =
+        llvm::sys::fs::createTemporaryFile("lsp-unittest", "json", inputPath);
+    ASSERT_FALSE(ec) << "Could not create temporary file: " << ec.message();
+
+    in = std::fopen(inputPath.data(), "r");
+    ASSERT_TRUE(in) << "Could not open temporary file: "
+                    << std::strerror(errno);
+    transport.emplace(in, os, JSONStreamStyle::Delimited);
+    messageHandler.emplace(*transport);
+  }
+
+  void TearDown() override {
+    EXPECT_EQ(std::fclose(in), 0)
+        << "Could not close temporary file FD: " << std::strerror(errno);
+    std::error_code ec =
+        llvm::sys::fs::remove(inputPath, /*IgnoreNonExisting=*/false);
+    EXPECT_FALSE(ec) << "Could not remove temporary file '" << inputPath.data()
+                     << "': " << ec.message();
+  }
+
+  void writeInput(StringRef buffer) {
     std::error_code ec;
-    llvm::raw_fd_ostream os(tempOr->TmpName, ec);
-    ASSERT_FALSE(ec);
-    os << "{\"jsonrpc\":\"2.0\",\"id\":29,\"method\":\"ack\"}\n";
+    llvm::raw_fd_ostream os(inputPath.data(), ec);
+    ASSERT_FALSE(ec) << "Could not write to '" << inputPath.data()
+                     << "': " << ec.message();
+    os << buffer;
     os.close();
   }
 
-  std::string out;
-  llvm::raw_string_ostream os(out);
-  std::FILE *in = std::fopen(tempOr->TmpName.c_str(), "r");
-  auto closeIn = llvm::make_scope_exit([&]() { std::fclose(in); });
+  StringRef getOutput() const { return output; }
+  MessageHandler &getMessageHandler() { return *messageHandler; }
 
-  JSONTransport transport(in, os, JSONStreamStyle::Delimited);
-  MessageHandler handler(transport);
+  void runTransport() {
+    bool gotEOF = false;
+    llvm::Error err = llvm::handleErrors(
+        transport->run(*messageHandler), [&](const llvm::ECError &ecErr) {
+          gotEOF = ecErr.convertToErrorCode() == std::errc::io_error;
+        });
+    llvm::consumeError(std::move(err));
+    EXPECT_TRUE(gotEOF);
+  }
+};
+
+TEST_F(TransportInputTest, RequestWithInvalidParams) {
+  struct Handler {
+    void onMethod(const TextDocumentItem &params,
+                  mlir::lsp::Callback<TextDocumentIdentifier> callback) {}
+  } handler;
+  getMessageHandler().method("invalid-params-request", &handler,
+                             &Handler::onMethod);
+
+  writeInput("{\"jsonrpc\":\"2.0\",\"id\":92,"
+             "\"method\":\"invalid-params-request\",\"params\":{}}\n");
+  runTransport();
+  EXPECT_THAT(getOutput(), HasSubstr("error"));
+  EXPECT_THAT(getOutput(), HasSubstr("missing value at (root).uri"));
+}
+
+TEST_F(TransportInputTest, NotificationWithInvalidParams) {
+  // JSON parsing errors are only reported via error logging. As a result, this
+  // test can't make any expectations -- but it prints the output anyway, by way
+  // of demonstration.
+  Logger::setLogLevel(Logger::Level::Error);
+
+  struct Handler {
+    void onNotification(const TextDocumentItem &params) {}
+  } handler;
+  getMessageHandler().notification("invalid-params-notification", &handler,
+                                   &Handler::onNotification);
+
+  writeInput("{\"jsonrpc\":\"2.0\",\"method\":\"invalid-params-notification\","
+             "\"params\":{}}\n");
+  runTransport();
+}
+
+TEST_F(TransportInputTest, MethodNotFound) {
+  writeInput("{\"jsonrpc\":\"2.0\",\"id\":29,\"method\":\"ack\"}\n");
+  runTransport();
+  EXPECT_THAT(getOutput(), HasSubstr("\"id\":29"));
+  EXPECT_THAT(getOutput(), HasSubstr("\"error\""));
+  EXPECT_THAT(getOutput(), HasSubstr("\"message\":\"method not found: ack\""));
+}
+
+TEST_F(TransportInputTest, OutgoingNotification) {
+  auto notifyFn = getMessageHandler().outgoingNotification<CompletionList>(
+      "outgoing-notification");
+  notifyFn(CompletionList{});
+  EXPECT_THAT(getOutput(), HasSubstr("\"method\":\"outgoing-notification\""));
+}
 
-  bool gotEOF = false;
-  llvm::Error err = llvm::handleErrors(
-      transport.run(handler), [&](const llvm::ECError &ecErr) {
-        gotEOF = ecErr.convertToErrorCode() == std::errc::io_error;
+TEST_F(TransportInputTest, ResponseHandlerNotFound) {
+  // Unhandled responses are only reported via error logging. As a result, this
+  // test can't make any expectations -- but it prints the output anyway, by way
+  // of demonstration.
+  Logger::setLogLevel(Logger::Level::Error);
+  writeInput("{\"jsonrpc\":\"2.0\",\"id\":81,\"result\":null}\n");
+  runTransport();
+}
+
+TEST_F(TransportInputTest, OutgoingRequest) {
+  // Make some outgoing requests.
+  int responseCallbackInvoked = 0;
+  auto callFn = getMessageHandler().outgoingRequest<CompletionList>(
+      "outgoing-request",
+      [&responseCallbackInvoked](llvm::json::Value id,
+                                 llvm::Expected<llvm::json::Value> value) {
+        // Make expectations on the expected response.
+        EXPECT_EQ(id, 83);
+        ASSERT_TRUE((bool)value);
+        EXPECT_EQ(debugString(*value), "{\"foo\":6}");
+        responseCallbackInvoked += 1;
+        llvm::outs() << "here!!!\n";
       });
-  llvm::consumeError(std::move(err));
-  EXPECT_TRUE(gotEOF);
-  EXPECT_THAT(out, HasSubstr("\"id\":29"));
-  EXPECT_THAT(out, HasSubstr("\"error\""));
-  EXPECT_THAT(out, HasSubstr("\"message\":\"method not found: ack\""));
+  callFn({}, 82);
+  callFn({}, 83);
+  callFn({}, 84);
+  EXPECT_THAT(getOutput(), HasSubstr("\"method\":\"outgoing-request\""));
+  EXPECT_EQ(responseCallbackInvoked, 0);
+
+  // One of the requests receives a response. The message handler handles this
+  // response by invoking the callback from above. Subsequent responses with the
+  // same ID are ignored.
+  writeInput("{\"jsonrpc\":\"2.0\",\"id\":83,\"result\":{\"foo\":6}}\n"
+             "// -----\n"
+             "{\"jsonrpc\":\"2.0\",\"id\":83,\"result\":{\"bar\":8}}\n");
+  runTransport();
+  EXPECT_EQ(responseCallbackInvoked, 1);
 }
 } // namespace
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index abc8baa0805f..42e0f5740f11 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -151,6 +151,25 @@ if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
   message(FATAL_ERROR "Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
 endif()
 
+set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host)
+set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING
+    "Semicolon-separated list of plugins to use: cuda, amdgpu, host or \"all\".")
+
+if(LIBOMPTARGET_PLUGINS_TO_BUILD STREQUAL "all")
+  set(LIBOMPTARGET_PLUGINS_TO_BUILD ${LIBOMPTARGET_ALL_PLUGIN_TARGETS})
+endif()
+
+set(LIBOMPTARGET_ENUM_PLUGIN_TARGETS "")
+foreach(plugin IN LISTS LIBOMPTARGET_PLUGINS_TO_BUILD)
+  set(LIBOMPTARGET_ENUM_PLUGIN_TARGETS
+      "${LIBOMPTARGET_ENUM_PLUGIN_TARGETS}PLUGIN_TARGET(${plugin})\n")
+endforeach()
+string(STRIP ${LIBOMPTARGET_ENUM_PLUGIN_TARGETS} LIBOMPTARGET_ENUM_PLUGIN_TARGETS)
+configure_file(
+  ${CMAKE_CURRENT_SOURCE_DIR}/include/Shared/Targets.def.in
+  ${CMAKE_CURRENT_BINARY_DIR}/include/Shared/Targets.def
+)
+
 include_directories(${LIBOMPTARGET_LLVM_INCLUDE_DIRS})
 
 # This is a list of all the targets that are supported/tested right now.
@@ -283,11 +302,17 @@ endif()
 
 pythonize_bool(LIBOMPTARGET_OMPT_SUPPORT)
 
-set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${LLVM_LIBC_GPU_BUILD} CACHE BOOL
+if(${LLVM_LIBC_GPU_BUILD})
+  set(LIBOMPTARGET_HAS_LIBC TRUE)
+else()
+  set(LIBOMPTARGET_HAS_LIBC FALSE)
+endif()
+set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${LIBOMPTARGET_HAS_LIBC} CACHE BOOL
     "Libomptarget support for the GPU libc")
 pythonize_bool(LIBOMPTARGET_GPU_LIBC_SUPPORT)
 
 set(LIBOMPTARGET_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
+set(LIBOMPTARGET_BINARY_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
 message(STATUS "OpenMP tools dir in libomptarget: ${LIBOMP_OMP_TOOLS_INCLUDE_DIR}")
 if(LIBOMP_OMP_TOOLS_INCLUDE_DIR)
   include_directories(${LIBOMP_OMP_TOOLS_INCLUDE_DIR})
diff --git a/offload/include/Shared/Targets.def.in b/offload/include/Shared/Targets.def.in
new file mode 100644
index 000000000000..f34b523b4542
--- /dev/null
+++ b/offload/include/Shared/Targets.def.in
@@ -0,0 +1,20 @@
+//===-- Shared/Targets.def - Target plugin enumerator -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Enumerates over all of the supported target plugins that are available to
+// the offloading library.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PLUGIN_TARGET
+#  error Please define the macro PLUGIN_TARGET(TargetName)
+#endif
+
+@LIBOMPTARGET_ENUM_PLUGIN_TARGETS@
+
+#undef PLUGIN_TARGET
diff --git a/offload/plugins-nextgen/CMakeLists.txt b/offload/plugins-nextgen/CMakeLists.txt
index dbd82ac94517..df625e97c7eb 100644
--- a/offload/plugins-nextgen/CMakeLists.txt
+++ b/offload/plugins-nextgen/CMakeLists.txt
@@ -69,9 +69,12 @@ function(add_target_library target_name lib_name)
   set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET protected)
 endfunction()
 
-add_subdirectory(amdgpu)
-add_subdirectory(cuda)
-add_subdirectory(host)
+foreach(plugin IN LISTS LIBOMPTARGET_PLUGINS_TO_BUILD)
+  if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${plugin})
+    message(FATAL_ERROR "Unknown plugin target '${plugin}'")
+  endif()
+  add_subdirectory(${plugin})
+endforeach()
 
 # Make sure the parent scope can see the plugins that will be created.
 set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt
index a7350e662a7c..acf0af63f050 100644
--- a/offload/plugins-nextgen/common/CMakeLists.txt
+++ b/offload/plugins-nextgen/common/CMakeLists.txt
@@ -62,6 +62,7 @@ target_link_options(PluginCommon PUBLIC ${offload_link_flags})
 target_include_directories(PluginCommon PUBLIC 
   ${CMAKE_CURRENT_SOURCE_DIR}/include
   ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+  ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
   ${LIBOMPTARGET_INCLUDE_DIR}
 )
 
diff --git a/offload/plugins-nextgen/host/CMakeLists.txt b/offload/plugins-nextgen/host/CMakeLists.txt
index 7da18ee278d4..6407f72e8db0 100644
--- a/offload/plugins-nextgen/host/CMakeLists.txt
+++ b/offload/plugins-nextgen/host/CMakeLists.txt
@@ -14,36 +14,36 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le$")
 endif()
 
 # Create the library and add the default arguments.
-add_target_library(omptarget.rtl.${machine} ${machine})
+add_target_library(omptarget.rtl.host ${machine})
 
-target_sources(omptarget.rtl.${machine} PRIVATE src/rtl.cpp)
+target_sources(omptarget.rtl.host PRIVATE src/rtl.cpp)
 
 if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
   libomptarget_say("Building ${machine} plugin linked with libffi")
   if(FFI_STATIC_LIBRARIES)
-    target_link_libraries(omptarget.rtl.${machine} PRIVATE FFI::ffi_static)
+    target_link_libraries(omptarget.rtl.host PRIVATE FFI::ffi_static)
   else()
-    target_link_libraries(omptarget.rtl.${machine} PRIVATE FFI::ffi)
+    target_link_libraries(omptarget.rtl.host PRIVATE FFI::ffi)
   endif()
 else()
   libomptarget_say("Building ${machine} plugin for dlopened libffi")
-  target_sources(omptarget.rtl.${machine} PRIVATE dynamic_ffi/ffi.cpp)
-  target_include_directories(omptarget.rtl.${machine} PRIVATE dynamic_ffi)
+  target_sources(omptarget.rtl.host PRIVATE dynamic_ffi/ffi.cpp)
+  target_include_directories(omptarget.rtl.host PRIVATE dynamic_ffi)
 endif()
 
 # Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.${machine}
+install(TARGETS omptarget.rtl.host
         LIBRARY DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.${machine} PROPERTIES
+set_target_properties(omptarget.rtl.host PROPERTIES
   INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
   POSITION_INDEPENDENT_CODE ON
   CXX_VISIBILITY_PRESET protected)
 
-target_include_directories(omptarget.rtl.${machine} PRIVATE
+target_include_directories(omptarget.rtl.host PRIVATE
                            ${LIBOMPTARGET_INCLUDE_DIR})
 
 if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
-  list(APPEND LIBOMPTARGET_TESTED_PLUGINS omptarget.rtl.${machine})
+  list(APPEND LIBOMPTARGET_TESTED_PLUGINS omptarget.rtl.host)
   set(LIBOMPTARGET_TESTED_PLUGINS
       "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
 else()
@@ -52,36 +52,36 @@ endif()
 
 # Define the target specific triples and ELF machine values.
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le$")
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_PPC64)
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+  target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_PPC64)
+  target_compile_definitions(omptarget.rtl.host PRIVATE
       LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="powerpc64le-ibm-linux-gnu")
   list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
        "powerpc64le-ibm-linux-gnu" "powerpc64le-ibm-linux-gnu-LTO")
   set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64$")
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_PPC64)
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+  target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_PPC64)
+  target_compile_definitions(omptarget.rtl.host PRIVATE
       LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="powerpc64-ibm-linux-gnu")
   list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
        "powerpc64-ibm-linux-gnu" "powerpc64-ibm-linux-gnu-LTO")
   set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64$")
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_X86_64)
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+  target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_X86_64)
+  target_compile_definitions(omptarget.rtl.host PRIVATE
       LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="x86_64-pc-linux-gnu")
   list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
        "x86_64-pc-linux-gnu" "x86_64-pc-linux-gnu-LTO")
   set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64$")
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_AARCH64)
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+  target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_AARCH64)
+  target_compile_definitions(omptarget.rtl.host PRIVATE
       LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="aarch64-unknown-linux-gnu")
   list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
        "aarch64-unknown-linux-gnu" "aarch64-unknown-linux-gnu-LTO")
   set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x$")
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_S390)
-  target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+  target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_S390)
+  target_compile_definitions(omptarget.rtl.host PRIVATE
       LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="s390x-ibm-linux-gnu")
   list(APPEND LIBOMPTARGET_SYSTEM_TARGETS 
        "s390x-ibm-linux-gnu" "s390x-ibm-linux-gnu-LTO")
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index fb1ad3d7ae70..eda5a85ff1ab 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -37,6 +37,7 @@ add_llvm_library(omptarget
 
   ADDITIONAL_HEADER_DIRS
   ${LIBOMPTARGET_INCLUDE_DIR}
+  ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
 
   LINK_COMPONENTS
   Support
@@ -49,7 +50,9 @@ add_llvm_library(omptarget
   NO_INSTALL_RPATH
   BUILDTREE_ONLY
 )
-target_include_directories(omptarget PRIVATE ${LIBOMPTARGET_INCLUDE_DIR})
+target_include_directories(omptarget PRIVATE 
+  ${LIBOMPTARGET_INCLUDE_DIR} ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
+)
 
 if (LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
   target_link_libraries(omptarget PRIVATE
@@ -65,23 +68,6 @@ target_compile_definitions(omptarget PRIVATE
 target_compile_options(omptarget PUBLIC ${offload_compile_flags})
 target_link_options(omptarget PUBLIC ${offload_link_flags})
 
-macro(check_plugin_target target)
-if (TARGET omptarget.rtl.${target})
-	list(APPEND LIBOMPTARGET_PLUGINS_TO_LOAD ${target})
-endif()
-endmacro()
-
-set(LIBOMPTARGET_PLUGINS_TO_LOAD "" CACHE STRING
-  "Comma separated list of plugin names to look for at runtime")
-if (NOT LIBOMPTARGET_PLUGINS_TO_LOAD)
-	check_plugin_target(ppc64)
-	check_plugin_target(x86_64)
-	check_plugin_target(cuda)
-	check_plugin_target(aarch64)
-	check_plugin_target(amdgpu)
-	check_plugin_target(s390x)
-endif()
-
 list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD PREPEND "\"libomptarget.rtl.")
 list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD APPEND "\"")
 list(JOIN LIBOMPTARGET_PLUGINS_TO_LOAD "," ENABLED_OFFLOAD_PLUGINS)
diff --git a/offload/src/PluginManager.cpp b/offload/src/PluginManager.cpp
index 792cae3e3dd5..dbb556c179e5 100644
--- a/offload/src/PluginManager.cpp
+++ b/offload/src/PluginManager.cpp
@@ -23,9 +23,6 @@ using namespace llvm::sys;
 
 PluginManager *PM = nullptr;
 
-// List of all plugins that can support offloading.
-static const char *RTLNames[] = {ENABLED_OFFLOAD_PLUGINS};
-
 Expected<std::unique_ptr<PluginAdaptorTy>>
 PluginAdaptorTy::create(const std::string &Name) {
   DP("Attempting to load library '%s'...\n", Name.c_str());
@@ -95,17 +92,19 @@ void PluginManager::init() {
 
   // Attempt to open all the plugins and, if they exist, check if the interface
   // is correct and if they are supporting any devices.
-  for (const char *Name : RTLNames) {
-    auto PluginAdaptorOrErr =
-        PluginAdaptorTy::create(std::string(Name) + ".so");
-    if (!PluginAdaptorOrErr) {
-      [[maybe_unused]] std::string InfoMsg =
-          toString(PluginAdaptorOrErr.takeError());
-      DP("%s", InfoMsg.c_str());
-    } else {
-      PluginAdaptors.push_back(std::move(*PluginAdaptorOrErr));
-    }
-  }
+#define PLUGIN_TARGET(Name)                                                    \
+  do {                                                                         \
+    auto PluginAdaptorOrErr =                                                  \
+        PluginAdaptorTy::create("libomptarget.rtl." #Name ".so");              \
+    if (!PluginAdaptorOrErr) {                                                 \
+      [[maybe_unused]] std::string InfoMsg =                                   \
+          toString(PluginAdaptorOrErr.takeError());                            \
+      DP("%s", InfoMsg.c_str());                                               \
+    } else {                                                                   \
+      PluginAdaptors.push_back(std::move(*PluginAdaptorOrErr));                \
+    }                                                                          \
+  } while (false);
+#include "Shared/Targets.def"
 
   DP("RTLs loaded!\n");
 }
diff --git a/offload/test/unified_shared_memory/api.c b/offload/test/unified_shared_memory/api.c
index c7ab055abb51..b938971b4b03 100644
--- a/offload/test/unified_shared_memory/api.c
+++ b/offload/test/unified_shared_memory/api.c
@@ -9,11 +9,6 @@
 #include <omp.h>
 #include <stdio.h>
 
-// ---------------------------------------------------------------------------
-// Various definitions copied from OpenMP RTL
-
-extern void __tgt_register_requires(int64_t);
-
 // End of definitions copied from OpenMP RTL.
 // ---------------------------------------------------------------------------
 
@@ -32,10 +27,6 @@ void init(int A[], int B[], int C[]) {
 int main(int argc, char *argv[]) {
   const int device = omp_get_default_device();
 
-  // Manual registration of requires flags for Clang versions
-  // that do not support requires.
-  __tgt_register_requires(8);
-
   // CHECK: Initial device: [[INITIAL_DEVICE:[0-9]+]]
   printf("Initial device: %d\n", omp_get_initial_device());
   // CHECK: Num devices: [[INITIAL_DEVICE]]
diff --git a/offload/test/unified_shared_memory/close_manual.c b/offload/test/unified_shared_memory/close_manual.c
index 9985e822c05d..c588cb1c403a 100644
--- a/offload/test/unified_shared_memory/close_manual.c
+++ b/offload/test/unified_shared_memory/close_manual.c
@@ -8,8 +8,6 @@
 // ---------------------------------------------------------------------------
 // Various definitions copied from OpenMP RTL
 
-extern void __tgt_register_requires(int64_t);
-
 extern void __tgt_target_data_begin(int64_t device_id, int32_t arg_num,
                                     void **args_base, void **args,
                                     int64_t *arg_sizes, int64_t *arg_types);
@@ -30,10 +28,6 @@ int main(int argc, char *argv[]) {
   void *host_alloc = 0, *device_alloc = 0;
   int *a = (int *)malloc(N * sizeof(int));
 
-  // Manual registration of requires flags for Clang versions
-  // that do not support requires.
-  __tgt_register_requires(8);
-
   // Init
   for (int i = 0; i < N; ++i) {
     a[i] = 10;
diff --git a/offload/test/unified_shared_memory/shared_update.c b/offload/test/unified_shared_memory/shared_update.c
index 65db9e4f6bdc..f8eb11d56a6c 100644
--- a/offload/test/unified_shared_memory/shared_update.c
+++ b/offload/test/unified_shared_memory/shared_update.c
@@ -11,11 +11,6 @@
 #include <omp.h>
 #include <stdio.h>
 
-// ---------------------------------------------------------------------------
-// Various definitions copied from OpenMP RTL
-
-extern void __tgt_register_requires(int64_t);
-
 // End of definitions copied from OpenMP RTL.
 // ---------------------------------------------------------------------------
 
@@ -30,10 +25,6 @@ int main(int argc, char *argv[]) {
   int *alloc = (int *)malloc(N * sizeof(int));
   int data[N];
 
-  // Manual registration of requires flags for Clang versions
-  // that do not support requires.
-  __tgt_register_requires(8);
-
   for (int i = 0; i < N; ++i) {
     alloc[i] = 10;
     data[i] = 1;
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index 378e5aa296c4..f34e55555545 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -3038,15 +3038,7 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
     KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "system info for topology");
 
   // Get the number of SMT threads per core.
-  int retval =
-      lpar_get_info(LPAR_INFO_FORMAT1, &cpuinfo, sizeof(lpar_info_format1_t));
-  if (!retval)
-    smt_threads = cpuinfo.smt_threads;
-  else {
-    CLEANUP_THREAD_INFO;
-    *msg_id = kmp_i18n_str_UnknownTopology;
-    return false;
-  }
+  smt_threads = syssmt(GET_NUMBER_SMT_SETS, 0, 0, NULL);
 
   // Allocate a resource set containing available system resourses.
   rsethandle_t sys_rset = rs_alloc(RS_SYSTEM);
diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h
index 8e9e7667eb90..3dc2c84d53f7 100644
--- a/openmp/runtime/src/kmp_affinity.h
+++ b/openmp/runtime/src/kmp_affinity.h
@@ -322,6 +322,8 @@ public:
 #include <sys/dr.h>
 #include <sys/rset.h>
 #define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
+#define GET_NUMBER_SMT_SETS 0x0004
+extern "C" int syssmt(int flags, int, int, int *);
 #endif
 class KMPNativeAffinity : public KMPAffinity {
   class Mask : public KMPAffinity::Mask {
diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
index 11ce083f4801..affb577a5393 100644
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -29,7 +29,9 @@
 #include <semaphore.h>
 #endif // KMP_OS_LINUX
 #include <sys/resource.h>
-#if !KMP_OS_AIX
+#if KMP_OS_AIX
+#include <sys/ldr.h>
+#else
 #include <sys/syscall.h>
 #endif
 #include <sys/time.h>
@@ -2338,9 +2340,48 @@ int __kmp_is_address_mapped(void *addr) {
   found = (int)addr < (__builtin_wasm_memory_size(0) * PAGESIZE);
 #elif KMP_OS_AIX
 
-  (void)rc;
-  // FIXME(AIX): Implement this
-  found = 1;
+  uint32_t loadQueryBufSize = 4096u; // Default loadquery buffer size.
+  char *loadQueryBuf;
+
+  for (;;) {
+    loadQueryBuf = (char *)KMP_INTERNAL_MALLOC(loadQueryBufSize);
+    if (loadQueryBuf == NULL) {
+      return 0;
+    }
+
+    rc = loadquery(L_GETXINFO | L_IGNOREUNLOAD, loadQueryBuf, loadQueryBufSize);
+    if (rc < 0) {
+      KMP_INTERNAL_FREE(loadQueryBuf);
+      if (errno != ENOMEM) {
+        return 0;
+      }
+      // errno == ENOMEM; double the size.
+      loadQueryBufSize <<= 1;
+      continue;
+    }
+    // Obtained the load info successfully.
+    break;
+  }
+
+  struct ld_xinfo *curLdInfo = (struct ld_xinfo *)loadQueryBuf;
+
+  // Loop through the load info to find if there is a match.
+  for (;;) {
+    uintptr_t curDataStart = (uintptr_t)curLdInfo->ldinfo_dataorg;
+    uintptr_t curDataEnd = curDataStart + curLdInfo->ldinfo_datasize;
+
+    // The data segment is readable and writable.
+    if (curDataStart <= (uintptr_t)addr && (uintptr_t)addr < curDataEnd) {
+      found = 1;
+      break;
+    }
+    if (curLdInfo->ldinfo_next == 0u) {
+      // Reached the end of load info.
+      break;
+    }
+    curLdInfo = (struct ld_xinfo *)((char *)curLdInfo + curLdInfo->ldinfo_next);
+  }
+  KMP_INTERNAL_FREE(loadQueryBuf);
 
 #else
 
diff --git a/utils/bazel/llvm-project-overlay/clang/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/unittests/BUILD.bazel
index 9823027b766c..884a6055cf4e 100644
--- a/utils/bazel/llvm-project-overlay/clang/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/unittests/BUILD.bazel
@@ -110,6 +110,7 @@ cc_test(
         "//clang:ast",
         "//clang:ast_matchers",
         "//clang:basic",
+        "//clang:frontend",
         "//clang:lex",
         "//clang:parse",
         "//clang:tooling",
diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
index 7958c6024875..b7b52f3ef59c 100644
--- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
@@ -183,6 +183,20 @@ cc_binary(
     ],
 )
 
+gentbl_cc_library(
+    name = "lldb-sbapi-dwarf-enums",
+    strip_include_prefix = "include",
+    tbl_outs = [
+        (
+            ["-gen-lldb-sbapi-dwarf-enum"],
+            "include/lldb/API/SBLanguages.h",
+        ),
+    ],
+    tblgen = ":lldb-tblgen",
+    td_file = "//llvm:include/llvm/BinaryFormat/Dwarf.def",
+    deps = [],
+)
+
 cc_library(
     name = "API",
     srcs = glob([
@@ -192,6 +206,7 @@ cc_library(
     hdrs = glob(["include/lldb/API/**/*.h"]),
     strip_include_prefix = "include",
     deps = [
+        ":lldb-sbapi-dwarf-enums",
         ":Breakpoint",
         ":Commands",
         ":Core",
@@ -272,6 +287,7 @@ cc_library(
     hdrs = glob(["include/lldb/Expression/**/*.h"]),
     strip_include_prefix = "include",
     deps = [
+        ":lldb-sbapi-dwarf-enums",
         ":Core",
         ":Headers",
         ":Host",
@@ -280,6 +296,7 @@ cc_library(
         ":TargetHeaders",
         ":Utility",
         "//lldb/source/Plugins:PluginSymbolFileDWARFHeaders",
+        "//llvm:BinaryFormat",
         "//llvm:Core",
         "//llvm:DebugInfoDWARF",
         "//llvm:ExecutionEngine",
@@ -346,7 +363,10 @@ cc_library(
     name = "ExpressionHeaders",
     hdrs = glob(["include/lldb/Expression/**/*.h"]),
     strip_include_prefix = "include",
-    deps = ["//llvm:ExecutionEngine"],
+    deps = [
+        ":lldb-sbapi-dwarf-enums",
+        "//llvm:ExecutionEngine"
+    ],
 )
 
 cc_library(
@@ -673,6 +693,7 @@ cc_library(
         ":TargetProperties",
         ":Utility",
         "//lldb/source/Plugins:PluginProcessUtility",
+        "//llvm:BinaryFormat",
         "//llvm:MC",
         "//llvm:Support",
     ],
diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
index 13fec77fe567..d705af9167d8 100644
--- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
@@ -202,6 +202,7 @@ cc_library(
         "//lldb:SymbolHeaders",
         "//lldb:TargetHeaders",
         "//lldb:Utility",
+        "//llvm:BinaryFormat",
         "//llvm:Core",
         "//llvm:ExecutionEngine",
         "//llvm:IPO",
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 3223eb92d869..c159204cede7 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -22,6 +22,7 @@ licenses(["notice"])
 exports_files([
     "LICENSE.TXT",
     "cmake/modules/llvm-driver-template.cpp.in",
+    "include/llvm/BinaryFormat/Dwarf.def",
     "include/llvm/CodeGen/SDNodeProperties.td",
     "include/llvm/CodeGen/ValueTypes.td",
     "include/llvm/Frontend/Directive/DirectiveBase.td",
@@ -3111,12 +3112,26 @@ cc_library(
 )
 
 cc_library(
-    name = "llvm-mca-headers",
+    name = "MCAApplication",
+    srcs = glob([
+        "tools/llvm-mca/Views/*.cpp",
+    ]) + [
+        mca_source
+        for mca_source in glob(["tools/llvm-mca/*.cpp"])
+        if mca_source != "tools/llvm-mca/llvm-mca.cpp"
+    ],
     hdrs = glob([
         "tools/llvm-mca/*.h",
         "tools/llvm-mca/Views/*.h",
     ]),
     strip_include_prefix = "tools/llvm-mca",
+    deps = [
+        ":MC",
+        ":MCA",
+        ":MCParser",
+        ":Support",
+        ":TargetParser",
+    ],
 )
 
 cc_library(
@@ -4034,12 +4049,9 @@ cc_binary(
 
 cc_binary(
     name = "llvm-mca",
-    srcs = glob([
-        "tools/llvm-mca/*.cpp",
-        "tools/llvm-mca/*.h",
-        "tools/llvm-mca/Views/*.cpp",
-        "tools/llvm-mca/Views/*.h",
-    ]),
+    srcs =[
+        "tools/llvm-mca/llvm-mca.cpp",
+    ],
     copts = llvm_copts,
     stamp = 0,
     deps = [
@@ -4049,10 +4061,10 @@ cc_binary(
         ":AllTargetsMCAs",
         ":MC",
         ":MCA",
+        ":MCAApplication",
         ":MCParser",
         ":Support",
         ":TargetParser",
-        ":llvm-mca-headers",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/llvm/driver.bzl b/utils/bazel/llvm-project-overlay/llvm/driver.bzl
index 888626d7cf84..10796d919834 100644
--- a/utils/bazel/llvm-project-overlay/llvm/driver.bzl
+++ b/utils/bazel/llvm-project-overlay/llvm/driver.bzl
@@ -37,7 +37,7 @@ _TOOLS = {
 # aliases for a given tool.
 _EXTRA_ALIASES = {
     "clang": ["clang++", "clang-cl", "clang-cpp"],
-    "lld": ["lld-link", "ld.lld", "ld64.lld", "wasm-ld"],
+    "lld": ["ld", "lld-link", "ld.lld", "ld64.lld", "wasm-ld"],
     "llvm-ar": ["ranlib", "lib", "dlltool"],
     "llvm-objcopy": ["bitcode-strip", "install-name-tool", "strip"],
     "llvm-objdump": ["otool"],
diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
index dd42f84d16dc..9be26ab551b0 100644
--- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
@@ -811,3 +811,29 @@ cc_test(
         "//third-party/unittest:gtest_main",
     ],
 )
+
+cc_test(
+    name = "llvm_mca_tests",
+    size = "small",
+    srcs = glob(
+        [
+            "tools/llvm-mca/*.cpp",
+            "tools/llvm-mca/*.h",
+            "tools/llvm-mca/X86/*.cpp",
+            "tools/llvm-mca/X86/*.h",
+        ],
+        allow_empty = False,
+    ),
+    includes = ["tools/llvm-mca"],
+    deps = [
+        "//llvm:MC",
+        "//llvm:MCA",
+        "//llvm:MCAApplication",
+        "//llvm:Support",
+        "//llvm:TargetParser",
+        "//llvm:X86CodeGen",
+        "//llvm:X86UtilsAndDesc",
+        "//third-party/unittest:gtest",
+        "//third-party/unittest:gtest_main",
+    ],
+)
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 63b24b9f46b2..acd2d3a14d74 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -5,8 +5,8 @@
 # Description:
 #   The MLIR "Multi-Level Intermediate Representation" Compiler Infrastructure
 
-load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 load(
     ":build_defs.bzl",
     "cc_headers_only",
@@ -3653,14 +3653,6 @@ gentbl_cc_library(
             "include/mlir/Dialect/XeGPU/IR/XeGPUTypes.cpp.inc",
         ),
         (
-            ["-gen-enum-decls"],
-            "include/mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc",
-        ),
-        (
-            ["-gen-enum-defs"],
-            "include/mlir/Dialect/XeGPU/IR/XeGPUEnums.cpp.inc",
-        ),
-        (
             [
                 "-gen-attrdef-decls",
                 "-attrdefs-dialect=xegpu",
@@ -3677,7 +3669,44 @@ gentbl_cc_library(
     ],
     tblgen = ":mlir-tblgen",
     td_file = "include/mlir/Dialect/XeGPU/IR/XeGPU.td",
-    deps = [":XeGPUTdFiles"],
+    deps = [
+        ":ArithOpsTdFiles",
+        ":XeGPUTdFiles",
+    ],
+)
+
+td_library(
+    name = "XeGPUAttrTdFiles",
+    srcs = [
+        "include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td",
+        "include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td",
+    ],
+    includes = ["include"],
+    deps = [
+        ":BuiltinDialectTdFiles",
+        ":OpBaseTdFiles",
+        ":ShapedOpInterfacesTdFiles",
+        ":ViewLikeInterfaceTdFiles",
+    ],
+)
+
+# Separated from the XeGPUIncGen target because the enum declaration causes
+# duplicate declarations with the Arith enums.
+gentbl_cc_library(
+    name = "XeGPUEnumsIncGen",
+    tbl_outs = [
+        (
+            ["-gen-enum-decls"],
+            "include/mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc",
+        ),
+        (
+            ["-gen-enum-defs"],
+            "include/mlir/Dialect/XeGPU/IR/XeGPUEnums.cpp.inc",
+        ),
+    ],
+    tblgen = ":mlir-tblgen",
+    td_file = "include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td",
+    deps = [":XeGPUAttrTdFiles"],
 )
 
 cc_library(
@@ -3689,12 +3718,14 @@ cc_library(
     hdrs = ["include/mlir/Dialect/XeGPU/IR/XeGPU.h"],
     includes = ["include"],
     deps = [
+        ":ArithDialect",
         ":BytecodeOpInterface",
         ":DialectUtils",
         ":IR",
         ":ShapedOpInterfaces",
         ":SideEffectInterfaces",
         ":ViewLikeInterface",
+        ":XeGPUEnumsIncGen",
         ":XeGPUIncGen",
         "//llvm:Support",
     ],
@@ -6666,6 +6697,7 @@ cc_library(
     ]),
     includes = ["include"],
     deps = [
+        ":ArithDialect",
         ":IR",
         ":InferTypeOpInterface",
         ":PolynomialAttributesIncGen",
@@ -9741,6 +9773,15 @@ cc_binary(
 )
 
 cc_binary(
+    name = "mlir-src-sharder",
+    srcs = ["tools/mlir-src-sharder/mlir-src-sharder.cpp"],
+    deps = [
+        ":Support",
+        "//llvm:Support",
+    ],
+)
+
+cc_binary(
     name = "mlir-linalg-ods-yaml-gen",
     srcs = [
         "tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp",
@@ -11064,6 +11105,7 @@ cc_library(
         ":FuncTransforms",
         ":GPUDialect",
         ":IR",
+        ":IndexDialect",
         ":LinalgDialect",
         ":LinalgPassIncGen",
         ":LinalgStructuredOpsIncGen",
@@ -11075,6 +11117,7 @@ cc_library(
         ":MeshShardingInterface",
         ":MeshTransforms",
         ":Pass",
+        ":RuntimeVerifiableOpInterface",
         ":SCFDialect",
         ":SCFTransforms",
         ":SCFUtils",
@@ -12592,6 +12635,7 @@ cc_library(
     deps = [
         ":ArithDialect",
         ":ComplexDialect",
+        ":DialectUtils",
         ":IR",
         "//llvm:Support",
     ],
diff --git a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
index fdf6a57107ac..e45ba1fe0ef7 100644
--- a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
+++ b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
@@ -432,3 +432,136 @@ def gentbl_cc_library(
         copts = copts,
         **kwargs
     )
+
+def _gentbl_shard_impl(ctx):
+    args = ctx.actions.args()
+    args.add(ctx.file.src_file)
+    args.add("-op-shard-index", ctx.attr.index)
+    args.add("-o", ctx.outputs.out.path)
+    ctx.actions.run(
+        outputs = [ctx.outputs.out],
+        inputs = [ctx.file.src_file],
+        executable = ctx.executable.sharder,
+        arguments = [args],
+        use_default_shell_env = True,
+        mnemonic = "ShardGenerate",
+    )
+
+gentbl_shard_rule = rule(
+    _gentbl_shard_impl,
+    doc = "",
+    output_to_genfiles = True,
+    attrs = {
+        "index": attr.int(mandatory = True, doc = ""),
+        "sharder": attr.label(
+            doc = "",
+            executable = True,
+            cfg = "exec",
+        ),
+        "src_file": attr.label(
+            doc = "",
+            allow_single_file = True,
+            mandatory = True,
+        ),
+        "out": attr.output(
+            doc = "",
+            mandatory = True,
+        ),
+    },
+)
+
+def gentbl_sharded_ops(
+        name,
+        tblgen,
+        sharder,
+        td_file,
+        shard_count,
+        src_file,
+        src_out,
+        hdr_out,
+        test = False,
+        includes = [],
+        strip_include_prefix = None,
+        deps = []):
+    """Generate sharded op declarations and definitions.
+
+    This special build rule shards op definitions in a TableGen file and generates multiple copies
+    of a template source file for including and compiling each shard. The rule defines a filegroup
+    consisting of the source shards, the generated source file, and the generated header file.
+
+    Args:
+      name: The name of the filegroup.
+      tblgen: The binary used to produce the output.
+      sharder: The source file sharder to use.
+      td_file: The primary table definitions file.
+      shard_count: The number of op definition shards to produce.
+      src_file: The source file template.
+      src_out: The generated source file.
+      hdr_out: The generated header file.
+      test: Whether this is a test target.
+      includes: See gentbl_rule.includes
+      deps: See gentbl_rule.deps
+      strip_include_prefix: Attribute to pass through to cc_library.
+    """
+    cc_lib_name = name + "__gentbl_cc_lib"
+    gentbl_cc_library(
+        name = cc_lib_name,
+        strip_include_prefix = strip_include_prefix,
+        includes = includes,
+        tbl_outs = [
+            (
+                [
+                    "-gen-op-defs",
+                    "-op-shard-count=" + str(shard_count),
+                ],
+                src_out,
+            ),
+            (
+                [
+                    "-gen-op-decls",
+                    "-op-shard-count=" + str(shard_count),
+                ],
+                hdr_out,
+            ),
+        ],
+        tblgen = tblgen,
+        td_file = td_file,
+        test = test,
+        deps = deps,
+    )
+    all_files = [hdr_out, src_out]
+    for i in range(0, shard_count):
+        out_file = "shard_copy_" + str(i) + "_" + src_file
+        gentbl_shard_rule(
+            index = i,
+            name = name + "__src_shard" + str(i),
+            testonly = test,
+            out = out_file,
+            sharder = sharder,
+            src_file = src_file,
+        )
+        all_files.append(out_file)
+    native.filegroup(name = name, srcs = all_files)
+
+def gentbl_sharded_op_defs(name, source_file, shard_count):
+    """Generates multiple copies of a source file that includes sharded op definitions.
+
+    Args:
+      name: The name of the rule.
+      source_file: The source to copy.
+      shard_count: The number of shards.
+
+    Returns:
+      A list of the copied filenames to be included in the dialect library.
+    """
+    copies = []
+    for i in range(0, shard_count):
+        out_file = "shard_copy_" + str(i) + "_" + source_file
+        copies.append(out_file)
+        native.genrule(
+            name = name + "_shard_" + str(i),
+            srcs = [source_file],
+            outs = [out_file],
+            cmd = "echo -e \"#define GET_OP_DEFS_" + str(i) + "\n$$(cat $(SRCS))\" > $(OUTS)",
+        )
+    return copies
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index dc5f4047c286..0ebfcbe284bd 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -4,7 +4,7 @@
 
 load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 load("//llvm:lit_test.bzl", "package_path")
-load("//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
+load("//mlir:tblgen.bzl", "gentbl_cc_library", "gentbl_sharded_ops", "td_library")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -152,14 +152,6 @@ gentbl_cc_library(
     strip_include_prefix = "lib/Dialect/Test",
     tbl_outs = [
         (
-            ["-gen-op-decls"],
-            "lib/Dialect/Test/TestOps.h.inc",
-        ),
-        (
-            ["-gen-op-defs"],
-            "lib/Dialect/Test/TestOps.cpp.inc",
-        ),
-        (
             [
                 "-gen-dialect-decls",
                 "-dialect=test",
@@ -370,12 +362,25 @@ cc_library(
     ],
 )
 
+gentbl_sharded_ops(
+    name = "TestDialectOpSrcs",
+    hdr_out = "lib/Dialect/Test/TestOps.h.inc",
+    shard_count = 20,
+    sharder = "//mlir:mlir-src-sharder",
+    src_file = "lib/Dialect/Test/TestOps.cpp",
+    src_out = "lib/Dialect/Test/TestOps.cpp.inc",
+    tblgen = "//mlir:mlir-tblgen",
+    td_file = "lib/Dialect/Test/TestOps.td",
+    test = True,
+    deps = [":TestOpTdFiles"],
+)
+
 cc_library(
     name = "TestDialect",
     srcs = glob(
         ["lib/Dialect/Test/*.cpp"],
         exclude = ["lib/Dialect/Test/TestToLLVMIRTranslation.cpp"],
-    ),
+    ) + [":TestDialectOpSrcs"],
     hdrs = glob(["lib/Dialect/Test/*.h"]),
     includes = [
         "lib/Dialect/Test",
@@ -418,9 +423,9 @@ cc_library(
         "//mlir:SideEffectInterfaces",
         "//mlir:Support",
         "//mlir:TensorDialect",
-        "//mlir:TranslateLib",
         "//mlir:TransformUtils",
         "//mlir:Transforms",
+        "//mlir:TranslateLib",
         "//mlir:ValueBoundsOpInterface",
         "//mlir:ViewLikeInterface",
     ],
author	Necip Fazil Yildiran <necip@google.com>	2024-05-01 15:41:58 -0700
committer	prabhukr <prabhukr@google.com>	2024-05-01 15:41:58 -0700
commit	c2f27b33d29096f69ebb3a6ab8db60ac71651865 (patch)
tree	59fd099b1987744e0b00a0c57e46353cca3138c1
parent	3065b9b275511f4bd1840187af7de8934078d9ed (diff)
parent	56e4111f9da499072e5119ccf92693202430f894 (diff)